In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import statsmodels.api as sm
from statsmodels.formula.api import ols

from analysis.analysis_utils import add_tt_mals_runtime_cols, line_plot_padding_tile_size_tt_mals_runtime_per_matrix, get_percentage_change_per_category, get_percentage_change_per_double_category, line_plot_tile_size_rank_percentage_per_matrix


In [2]:
df1 = pd.read_csv('../../data/sweep_0_10_data1.csv')
df2 = pd.read_csv('../../data/sweep_0_10_data2.csv')
df = pd.concat([df1, df2], axis=0)
df.drop(columns=["run_id", "run_name", "_runtime", "_step", "_timestamp", "gauss_threshold"], inplace=True)
df.shape

(28604, 11)

In [3]:
df.tail()

Unnamed: 0,amd,rcm,padding,matrix_name,partial_gauss,n,rank,z_reduced,z_full,max_mode_size,tile_size
9403,True,True,2,Pres_Poisson,5,14819.0,1625.0,718845.0,623417.0,73.0,73.0
9404,True,True,2,Pres_Poisson,5,14819.0,325.0,718845.0,623417.0,203.0,203.0
9405,True,True,2,Pres_Poisson,5,14819.0,85.0,718845.0,623417.0,511.0,511.0
9406,True,True,2,Pres_Poisson,5,14819.0,19.0,718845.0,623417.0,2117.0,2117.0
9407,True,True,2,Pres_Poisson,5,14819.0,1.0,718845.0,623417.0,14819.0,14819.0


In [4]:
df = add_tt_mals_runtime_cols(df)

In [5]:
# no module is applied - baseline
baseline_df = df[(df['amd'] == False) & (df['rcm'] == False) & (df['partial_gauss'] == 0) & (df['padding'] == 0)]

# different levels of padding applied - padding_df
padding_df = df[(df['amd'] == False) & (df['rcm'] == False) & (df['partial_gauss'] == 0)]

# keep amd on and off - amd_df
amd_df = df[(df['rcm'] == False) & (df['partial_gauss'] == 0) & (df['padding'] == 0)]

# keep rcm on and off - rcm_df
rcm_df = df[(df['amd'] == False) & (df['partial_gauss'] == 0) & (df['padding'] == 0)]

# different levels of partial gauss - partial_gauss_df
partial_gauss_df = df[(df['amd'] == False) & (df['rcm'] == False) & (df['padding'] == 0)]

rcm_padding_df = df[(df['amd'] == False) & (df['partial_gauss'] == 0)]

In [6]:
df.tail()

Unnamed: 0,amd,rcm,padding,matrix_name,partial_gauss,n,rank,z_reduced,z_full,max_mode_size,tile_size,log_obj_func,obj_func
9403,True,True,2,Pres_Poisson,5,14819.0,1625.0,718845.0,623417.0,73.0,73.0,25.835484,166038300000.0
9404,True,True,2,Pres_Poisson,5,14819.0,325.0,718845.0,623417.0,203.0,203.0,31.879337,69987440000000.0
9405,True,True,2,Pres_Poisson,5,14819.0,85.0,718845.0,623417.0,511.0,511.0,37.418218,1.780433e+16
9406,True,True,2,Pres_Poisson,5,14819.0,19.0,718845.0,623417.0,2117.0,2117.0,45.946532,9.001713e+19
9407,True,True,2,Pres_Poisson,5,14819.0,1.0,718845.0,623417.0,14819.0,14819.0,57.621993,1.059042e+25


In [7]:
# setup color map for each matrix
matrix_names = df["matrix_name"].unique().tolist()
num_matrices = len(matrix_names)

colorscale = px.colors.sequential.Viridis
colors = px.colors.sample_colorscale(colorscale, num_matrices)

matrix_color_map = {matrix:color for matrix, color in zip(matrix_names, colors)}

In [8]:
# get best tile size choices
idx = rcm_padding_df.groupby(by=['matrix_name', 'rcm'])['log_obj_func'].idxmin()
rcm_padding_tile_agg_df = rcm_padding_df.loc[idx].reset_index(drop=True)
rcm_padding_tile_agg_df

Unnamed: 0,amd,rcm,padding,matrix_name,partial_gauss,n,rank,z_reduced,z_full,max_mode_size,tile_size,log_obj_func,obj_func
0,False,False,1,Pres_Poisson,0,14823.0,2599.0,715804.0,715804.0,61.0,61.0,25.070247,77244910000.0
1,False,True,1,Pres_Poisson,0,14823.0,2105.0,715804.0,715804.0,61.0,61.0,24.949895,68486010000.0
2,False,False,6,bcsstk13,0,2009.0,569.0,83883.0,83883.0,41.0,41.0,22.397285,5333563000.0
3,False,True,6,bcsstk13,0,2009.0,781.0,83883.0,83883.0,41.0,41.0,22.486159,5829276000.0
4,False,False,10,ex10,0,2420.0,1031.0,54840.0,54840.0,20.0,20.0,20.02497,497432400.0
5,False,True,10,ex10,0,2420.0,907.0,54840.0,54840.0,20.0,20.0,19.807764,400315600.0
6,False,False,2,ex10hs,0,2550.0,1410.0,57308.0,57308.0,17.0,17.0,20.221773,605625800.0
7,False,True,2,ex10hs,0,2550.0,1202.0,57308.0,57308.0,17.0,17.0,19.919391,447591400.0
8,False,False,6,ex13,0,2574.0,1209.0,75628.0,75628.0,18.0,18.0,20.058993,514647800.0
9,False,True,6,ex13,0,2574.0,1235.0,75628.0,75628.0,18.0,18.0,20.098502,535387600.0


Explore effects of rcm followed by padding

In [None]:
# need to assess how much is the combined better than individual components, compared to baseline


In [30]:
# statistical significance of individual components
df_lm = ols('log_obj_func ~ tile_size + C(rcm) + C(padding) + C(amd) + C(partial_gauss)', data=df).fit() #Specify C for Categorical
print(sm.stats.anova_lm(df_lm, typ=2))

                        sum_sq       df             F    PR(>F)
C(rcm)            2.401508e+01      1.0      0.524414  0.468971
C(padding)        1.193318e+03     10.0      2.605832  0.003673
C(amd)            2.187177e+00      1.0      0.047761  0.827008
C(partial_gauss)  1.208393e+03     10.0      2.638751  0.003263
tile_size         1.203326e+06      1.0  26276.851520  0.000000
Residual          1.308796e+06  28580.0           NaN       NaN


In [31]:
# statistical significance of pairwise interactions of components
df_lm = ols('log_obj_func ~ tile_size + C(rcm) + C(padding) + C(amd) + C(partial_gauss)' \
            '+ tile_size * C(rcm)' \
            '+ tile_size * C(padding)' \
            '+ tile_size * C(amd)' \
            '+ tile_size * C(partial_gauss)' \
            '+ C(rcm) * C(padding)' \
            '+ C(rcm) * C(amd)' \
            '+ C(rcm) * C(partial_gauss)' \
            '+ C(padding) * C(amd)' \
            '+ C(padding) * C(partial_gauss)' \
            '+ C(amd) * C(partial_gauss)' \
            , data=df).fit() #Specify C for Categorical
print(sm.stats.anova_lm(df_lm, typ=2))


                                   sum_sq       df             F  \
C(rcm)                       2.143032e+01      1.0      0.482793   
C(padding)                   1.195405e+03     10.0      2.693069   
C(amd)                       1.688573e+00      1.0      0.038041   
C(partial_gauss)             1.211275e+03     10.0      2.728820   
C(rcm):C(padding)            4.560327e+00     10.0      0.010274   
C(rcm):C(amd)                2.174646e+00      1.0      0.048992   
C(rcm):C(partial_gauss)      4.754519e+00     10.0      0.010711   
C(padding):C(amd)            5.466051e+00     10.0      0.012314   
C(padding):C(partial_gauss)  4.690339e+04    100.0     10.566629   
C(amd):C(partial_gauss)      4.178454e+00     10.0      0.009413   
tile_size                    1.163236e+06      1.0  26205.954163   
tile_size:C(rcm)             3.222184e+00      1.0      0.072591   
tile_size:C(padding)         1.975243e+02     10.0      0.444993   
tile_size:C(amd)             1.215483e-04      1