In [None]:
from itertools import combinations
from pathlib import Path
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
# Location of the experiment results and of the figures exported by later cells.
# NOTE(review): this is an absolute, user-specific path -- adjust it when
# running the notebook on another machine.
proj_folder = Path('/home/yhuang2/PROJs/RealTimeAlignment/notebooks/frontier_results/')
plot_folder = proj_folder/'plots'

# One row per evaluated model configuration.
df = pd.read_csv(proj_folder/'performance.csv')

# `fig.write_image(plot_folder/...)` is called further down and fails when the
# target directory is missing, so create it up front. This happens only after
# the CSV was read successfully, to avoid creating folders under a wrong path.
plot_folder.mkdir(exist_ok=True)

df.head()

In [None]:
# Hyper-parameters that identify one model configuration.
parameters = ['num_features', 'num_subset_solvers', 'subset_size', 'subset_solver_depth']

# One label per row of `df`, in row order, of the form
# "<param>-<value>|<param>-<value>|..."; used as hover text in the scatter plots.
model_str = [
    '|'.join(f'{name}-{record[name]}' for name in parameters)
    for _, record in df.iterrows()
]

## studies
1. performance as function of model_size
2. fix _one_ parameter in ("num_features", "num_subset_solvers", "subset_size", "subset_solver_depth") and average over all other parameters to show the influence of that one parameter on "diff_pc"
3. fix _two_ parameters and do the same study

In [None]:
# Metric to plot: the column `valid_{metric}_pc` is read from `df`.
metric = 'diff'
# Human-readable metric name used in titles and axis labels.
metric_str = 'MSE' if metric == 'diff' else 'Residual'

# Keep only the non-rounded runs. `== False` (rather than `~`) is deliberate so
# the comparison also behaves if the column is not of boolean dtype.
cont_mask = (df['rounded'] == False).to_numpy()
df_cont = df[cont_mask]

# `model_str` holds one label per row of the *full* `df`, while the points
# below come from the filtered `df_cont`. Plotly pairs hover text with points
# by position, so the labels must be filtered with the same mask; passing
# `model_str` directly attaches labels of other rows to the plotted points.
hover_text = [label for label, keep in zip(model_str, cont_mask) if keep]

# One figure per hyper-parameter: metric vs. model size, coloured by that parameter.
for parameter in parameters:
    fig = go.Figure()
    trace = go.Scatter(
        x=df_cont['model_size_mb'],
        y=df_cont[f'valid_{metric}_pc'],
        mode='markers',
        marker=dict(
            size=5,
            color=df_cont[parameter],
            colorscale='Viridis',
            # Without a colorbar the colour encoding cannot be read off the figure.
            showscale=True,
            colorbar=dict(title=dict(text=parameter, side='right')),
        ),
        text=hover_text,   # what shows up on hover
        hoverinfo='text',  # show only the text, not (x, y)
    )
    fig.add_trace(trace)
    fig.update_layout(
        width=600,
        height=500,
        title=f'{metric_str} as function of model_size (color={parameter})',
        xaxis_title='model size (MB)',
        yaxis_title=f'{metric_str}'
    )

    # Show plot
    fig.show()

In [None]:
# Keep only the non-rounded runs. `== False` (rather than `~`) is deliberate so
# the comparison also behaves if the column is not of boolean dtype.
cont_mask = (df['rounded'] == False).to_numpy()
df_cont = df[cont_mask]

# `model_str` holds one label per row of the *full* `df`; Plotly pairs hover
# text with points by position, so filter the labels with the same mask used
# for the data. Passing `model_str` directly mislabels the points.
hover_text = [label for label, keep in zip(model_str, cont_mask) if keep]

# One figure per pair of hyper-parameters: metric vs. model size, coloured by
# the product of the two parameter values. `metric` and `metric_str` are taken
# from the cell that defines them earlier in the notebook.
for pair in combinations(parameters, 2):
    fig = go.Figure()
    trace = go.Scatter(
        x=df_cont['model_size_mb'],
        y=df_cont[f'valid_{metric}_pc'],
        mode='markers',
        marker=dict(
            size=5,
            color=df_cont[pair[0]] * df_cont[pair[1]],
            colorscale='Viridis',
            # Without a colorbar the colour encoding cannot be read off the figure.
            showscale=True,
            colorbar=dict(title=dict(text=' * '.join(pair), side='right')),
        ),
        text=hover_text,   # what shows up on hover
        hoverinfo='text',  # show only the text, not (x, y)
    )
    fig.add_trace(trace)
    fig.update_layout(
        width=750,
        height=500,
        title=f'{metric_str} as function of model_size<br>color={pair}',
        xaxis_title='model size (MB)',
        yaxis_title=f'{metric_str}'
    )

    # Show plot
    fig.show()

In [None]:
# Keep only the non-rounded runs. `== False` (rather than `~`) is deliberate so
# the comparison also behaves if the column is not of boolean dtype.
cont_mask = (df['rounded'] == False).to_numpy()
df_cont = df[cont_mask]

# `model_str` holds one label per row of the *full* `df`; Plotly pairs hover
# text with points by position, so filter the labels with the same mask used
# for the data. Passing `model_str` directly mislabels the points.
hover_text = [label for label, keep in zip(model_str, cont_mask) if keep]

# One figure per triple of hyper-parameters: metric vs. model size, coloured by
# the product of the three parameter values. `metric` and `metric_str` are
# taken from the cell that defines them earlier in the notebook.
for triple in combinations(parameters, 3):
    fig = go.Figure()
    trace = go.Scatter(
        x=df_cont['model_size_mb'],
        y=df_cont[f'valid_{metric}_pc'],
        mode='markers',
        marker=dict(
            size=5,
            color=df_cont[triple[0]] * df_cont[triple[1]] * df_cont[triple[2]],
            colorscale='Viridis',
            # Without a colorbar the colour encoding cannot be read off the figure.
            showscale=True,
            colorbar=dict(title=dict(text=' * '.join(triple), side='right')),
        ),
        text=hover_text,   # what shows up on hover
        hoverinfo='text',  # show only the text, not (x, y)
    )
    fig.add_trace(trace)
    fig.update_layout(
        width=750,
        height=500,
        title=f'{metric_str} as function of model_size<br>color={triple}',
        xaxis_title='model size (MB)',
        yaxis_title=f'{metric_str}'
    )

    # Show plot
    fig.show()

In [None]:
# Hyper-parameters that identify one model configuration.
parameters = ['num_features', 'num_subset_solvers', 'subset_size', 'subset_solver_depth']

# Metric to plot: the column `valid_{metric}_pc` is read from `df`.
metric = 'diff'
# metric = 'res'
metric_col = f'valid_{metric}_pc'

# Human-readable metric name. The title, y-axis label and output file name are
# all derived from `metric`; they used to be hard-coded to "Residual", which
# mislabelled the figures whenever metric == 'diff'.
cbar_title = 'MSE' if metric == 'diff' else 'Residual'
# For metric == 'res' the file name is unchanged (`residual_vs_<parameter>.png`).
file_stem = 'mse' if metric == 'diff' else 'residual'

# Shared y-range across all figures so the bar heights are comparable.
zmin = df[metric_col].min()
zmax = df[metric_col].max()

# One grouped bar chart per hyper-parameter: mean metric for each value of the
# parameter, with rounded and non-rounded runs side by side.
for parameter in parameters:
    fig = go.Figure()

    for is_rounded, color in zip([True, False], ['lightskyblue', 'lightcoral']):
        # Select the metric column before averaging so that unrelated
        # (possibly non-numeric) columns never take part in the mean.
        means = (
            df[df['rounded'] == is_rounded]
            .groupby(by=parameter)[metric_col]
            .mean()
        )

        trace = go.Bar(x=means.index.astype(str).tolist(),  # categorical x axis
                       y=means.to_numpy(),
                       name=f'rounded: {is_rounded}',
                       marker_color=color)
        fig.add_trace(trace)

    fig.update_layout(
        width=750,
        height=500,
        barmode='group',
        font=dict(family='Lucida Console',
                  size=13,),
        title=f'{cbar_title} as function of {parameter}',
        xaxis=dict(title=parameter,),
        yaxis=dict(title=f'validation {cbar_title}',
                   range=[zmin, zmax]),
        # Legend inside the plotting area, top-right, on a translucent background.
        legend=dict(x=.98,
                    y=.98,
                    xanchor='right',
                    yanchor='top',
                    bgcolor='rgba(255,255,255,0.75)',)
    )
    fig.show()
    fig.write_image(plot_folder/f'{file_stem}_vs_{parameter}.png', scale=2)

In [None]:
# Rounding modes to compare; one subplot per entry.
rounded = [True, False]

# Metric to plot: the column `valid_{metric}_pc` is read from `df`.
metric = 'res'
# metric = 'diff'
metric_col = f'valid_{metric}_pc'
cbar_title = 'MSE' if metric == 'diff' else 'Residual'

# Shared colour range so both subplots (and all figures) use the same scale.
zmin = df[metric_col].min()
zmax = df[metric_col].max()

# One figure per pair of hyper-parameters: a heatmap of the mean metric over
# the grid (pair[0] on y, pair[1] on x), for each rounding mode.
for pair in combinations(parameters, 2):
    print(pair)
    row_param, col_param = pair

    fig = make_subplots(rows=1, cols=len(rounded),
                        subplot_titles=tuple(f"rounded = {rd}" for rd in rounded))

    for i, rd in enumerate(rounded, start=1):
        # Mean metric per (row_param, col_param) combination, reshaped into a
        # 2-D grid: index = row_param values, columns = col_param values.
        grid = (
            df[df['rounded'] == rd]
            .groupby(by=[row_param, col_param])[metric_col]
            .mean()
            .unstack(col_param)
        )

        # String tick labels so the axes are treated as categorical.
        y_ticklabels = [str(value) for value in grid.index]
        x_ticklabels = [str(value) for value in grid.columns]

        # Every heatmap shares zmin/zmax and the colorscale, so one colorbar
        # is enough; draw it only for the last subplot instead of stacking
        # one colorbar per subplot at the same position.
        trace = go.Heatmap(z=grid.values,
                           colorscale='RdBu_r',
                           showscale=(i == len(rounded)),
                           zmin=zmin,
                           zmax=zmax,
                           x=x_ticklabels,  # x-axis tick labels
                           y=y_ticklabels,  # y-axis tick labels
                           text=grid.values,  # values printed inside the cells
                           texttemplate="%{text:.5f}",  # show text directly in each cell
                           textfont={"size":11, "color":"black"},
                           colorbar=dict(title=f"{cbar_title}"))
        fig.add_trace(trace, row=1, col=i)
        fig.update_xaxes(title_text=col_param, row=1, col=i)
        fig.update_yaxes(title_text=row_param, row=1, col=i)

    # Update layout
    fig.update_layout(
        title_text=f"{cbar_title} {pair[0]} vs {pair[1]}",
        height=500,
        width=900,
    )
    fig.show()
    fig.write_image(plot_folder/f'{cbar_title}-{pair[0]}-vs-{pair[1]}.png', scale=2)

## Get winning models

In [None]:
# Number of best-ranked configurations to keep per metric.
top = 10

# For each rounding mode, report the configurations that appear in the top
# `top` rows of BOTH metrics (lowest values first).
for rounded in (False, True):
    subset = df[df.rounded == rounded]
    diff_winners = subset.sort_values('valid_diff_pc').head(top)
    res_winners = subset.sort_values('valid_res_pc').head(top)

    # Inner join on the hyper-parameters keeps only configurations present in
    # both rankings; all other columns come back with `_x` / `_y` suffixes.
    winner = diff_winners.merge(res_winners, on=parameters, how='inner')

    # Collapse the suffixed metric columns back into a single column each.
    for col in ('valid_diff_pc', 'valid_res_pc'):
        winner[col] = (winner[f'{col}_x'] + winner[f'{col}_y']) / 2

    report_cols = [*parameters, 'valid_diff_pc', 'valid_res_pc', 'valid_res_sc_y']
    print(winner[report_cols])