In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import statsmodels.api as sm
from statsmodels.formula.api import ols

from analysis.analysis_utils import add_tt_mals_runtime_cols, line_plot_padding_tile_size_tt_mals_runtime_per_matrix, get_percentage_change_per_category, get_percentage_change_per_double_category, line_plot_tile_size_rank_percentage_per_matrix
# Raise pandas' display limits (row count, column count, line width) for
# every table printed later in the notebook.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

In [6]:
# Read the two sweep CSV exports; `df1` / `df2` stay available as the raw halves.
sweep_csv_paths = (
    '../../data/sweep_0_10_data1.csv',
    '../../data/sweep_0_10_data2.csv',
)
df1, df2 = (pd.read_csv(csv_path) for csv_path in sweep_csv_paths)

# Columns removed before the analysis (run bookkeeping fields and `gauss_threshold`).
dropped_columns = ["run_id", "run_name", "_runtime", "_step", "_timestamp", "gauss_threshold"]

# Stack the halves row-wise, drop the unused columns, then let the project
# helper add its runtime columns (see analysis.analysis_utils for details).
df = add_tt_mals_runtime_cols(
    pd.concat([df1, df2], axis=0).drop(columns=dropped_columns)
)
df.shape

(28604, 13)

# First, check the assumptions for ANOVA
There are three primary assumptions in ANOVA:

1. Within each factor level, the responses come from a normally distributed population.
2. These distributions have the same variance across all factor levels.
3. The observations are independent of one another.

In [7]:
# Statistical significance of the individual components (main effects only).
# Terms wrapped in C(...) are treated as categorical; tile_size is passed
# as a plain (unwrapped) term.
main_effects_formula = 'log_obj_func ~ tile_size + C(rcm) + C(padding) + C(amd) + C(partial_gauss)'
df_lm = ols(main_effects_formula, data=df).fit()

# typ=2 requests the Type II ANOVA table for the fitted model.
main_effects_anova = sm.stats.anova_lm(df_lm, typ=2)
print(main_effects_anova)

                        sum_sq       df             F    PR(>F)
C(rcm)            2.401508e+01      1.0      0.524414  0.468971
C(padding)        1.193318e+03     10.0      2.605832  0.003673
C(amd)            2.187177e+00      1.0      0.047761  0.827008
C(partial_gauss)  1.208393e+03     10.0      2.638751  0.003263
tile_size         1.203326e+06      1.0  26276.851520  0.000000
Residual          1.308796e+06  28580.0           NaN       NaN


In [8]:
# Statistical significance of the pairwise interactions between components.
# The formula lists all five main effects followed by every pair written as
# `a * b`; in the ANOVA table each main effect still appears only once,
# alongside the ten `a:b` interaction rows. C(...) marks categorical terms.
pairwise_formula = (
    'log_obj_func ~ tile_size + C(rcm) + C(padding) + C(amd) + C(partial_gauss)'
    '+ tile_size * C(rcm)'
    '+ tile_size * C(padding)'
    '+ tile_size * C(amd)'
    '+ tile_size * C(partial_gauss)'
    '+ C(rcm) * C(padding)'
    '+ C(rcm) * C(amd)'
    '+ C(rcm) * C(partial_gauss)'
    '+ C(padding) * C(amd)'
    '+ C(padding) * C(partial_gauss)'
    '+ C(amd) * C(partial_gauss)'
)

# Note: this rebinds `df_lm`, replacing the main-effects-only fit.
df_lm = ols(pairwise_formula, data=df).fit()

# typ=2 requests the Type II ANOVA table for the fitted model.
pairwise_anova = sm.stats.anova_lm(df_lm, typ=2)
print(pairwise_anova)

                                   sum_sq       df             F         PR(>F)
C(rcm)                       2.143032e+01      1.0      0.482793   4.871656e-01
C(padding)                   1.195405e+03     10.0      2.693069   2.679633e-03
C(amd)                       1.688573e+00      1.0      0.038041   8.453622e-01
C(partial_gauss)             1.211275e+03     10.0      2.728820   2.352303e-03
C(rcm):C(padding)            4.560327e+00     10.0      0.010274   1.000000e+00
C(rcm):C(amd)                2.174646e+00      1.0      0.048992   8.248291e-01
C(rcm):C(partial_gauss)      4.754519e+00     10.0      0.010711   1.000000e+00
C(padding):C(amd)            5.466051e+00     10.0      0.012314   1.000000e+00
C(padding):C(partial_gauss)  4.690339e+04    100.0     10.566629  4.493827e-156
C(amd):C(partial_gauss)      4.178454e+00     10.0      0.009413   1.000000e+00
tile_size                    1.163236e+06      1.0  26205.954163   0.000000e+00
tile_size:C(rcm)             3.222184e+0