In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

from analysis.analysis_utils import add_tt_mals_runtime_cols, line_plot_padding_tile_size_tt_mals_runtime_per_matrix, get_percentage_change_per_category, get_percentage_change_per_double_category, line_plot_tile_size_rank_percentage_per_matrix


In [26]:
df1 = pd.read_csv('../../data/sweep_0_10_data1.csv')  # 0-10, small matrices
df2 = pd.read_csv('../../data/sweep_0_10_data2.csv')  # 0-10, big matrices
# df3 = pd.read_csv('../../data/sweep_0_10_data3.csv')  # powers of 2, all matrices
# df4 = pd.read_csv('../../data/sweep_0_10_data4.csv')  # around powers of 2, small matrices
df5 = pd.read_csv('../../data/sweep_0_10_data5.csv')  # 10 fractional matrix sizes, small matrices
df5 = df5[df5['partial_gauss']<1.0]  # don't consider full row reductions
# df5 has fractional partial_gauss params - convert it to absolute terms
df5["partial_gauss"] = (df5["partial_gauss"]* (df5["n"] / (1 - df5["partial_gauss"]))).astype("int")

df = pd.concat([df1, df2, df5], axis=0)  # df3, df4
df.drop(columns=["run_id", "run_name", "_runtime", "_step", "_timestamp", "gauss_threshold"], inplace=True)
df.shape

(41944, 11)

In [27]:
df.tail()

Unnamed: 0,amd,rcm,padding,matrix_name,partial_gauss,n,rank,z_reduced,z_full,max_mode_size,tile_size
13694,True,True,0,ex3,1093,729.0,985.0,28293.0,48657.0,9.0,9.0
13695,True,True,0,ex3,1093,729.0,225.0,28293.0,48657.0,27.0,27.0
13696,True,True,0,ex3,1093,729.0,39.0,28293.0,48657.0,81.0,81.0
13697,True,True,0,ex3,1093,729.0,7.0,28293.0,48657.0,243.0,243.0
13698,True,True,0,ex3,1093,729.0,1.0,28293.0,48657.0,729.0,729.0


In [28]:
df = add_tt_mals_runtime_cols(df)

In [29]:
# no module is applied - baseline
baseline_df = df[(df['amd'] == False) & (df['rcm'] == False) & (df['partial_gauss'] == 0) & (df['padding'] == 0)]

# different levels of padding applied - padding_df
padding_df = df[(df['amd'] == False) & (df['rcm'] == False) & (df['partial_gauss'] == 0)]

# keep amd on and off - amd_df
amd_df = df[(df['rcm'] == False) & (df['partial_gauss'] == 0) & (df['padding'] == 0)]

# keep rcm on and off - rcm_df
rcm_df = df[(df['amd'] == False) & (df['partial_gauss'] == 0) & (df['padding'] == 0)]

# different levels of partial gauss - partial_gauss_df
partial_gauss_df = df[(df['amd'] == False) & (df['rcm'] == False) & (df['padding'] == 0)]

In [30]:
df

Unnamed: 0,amd,rcm,padding,matrix_name,partial_gauss,n,rank,z_reduced,z_full,max_mode_size,tile_size,log_obj_func,obj_func
0,False,True,8,ex10hs,5,2551.0,1.0,57237.0,42957.0,2551.0,2551.0,47.065444,2.755896e+20
1,False,True,8,ex10hs,8,2548.0,16886.0,57244.0,43009.0,13.0,2.0,24.599249,4.823008e+10
2,False,True,8,ex10hs,8,2548.0,6453.0,57244.0,43009.0,13.0,4.0,22.677196,7.056368e+09
3,False,True,8,ex10hs,8,2548.0,4009.0,57244.0,43009.0,13.0,7.0,21.727500,2.729816e+09
4,False,True,8,ex10hs,8,2548.0,1752.0,57244.0,43009.0,13.0,13.0,20.083512,5.274221e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13694,True,True,0,ex3,1093,729.0,985.0,28293.0,48657.0,9.0,9.0,18.195507,7.983773e+07
13695,True,True,0,ex3,1093,729.0,225.0,28293.0,48657.0,27.0,27.0,19.876396,4.287548e+08
13696,True,True,0,ex3,1093,729.0,39.0,28293.0,48657.0,81.0,81.0,26.366804,2.824602e+11
13697,True,True,0,ex3,1093,729.0,7.0,28293.0,48657.0,243.0,243.0,32.958369,2.058912e+14


In [31]:
# setup color map for each matrix
matrix_names = df["matrix_name"].unique().tolist()
num_matrices = len(matrix_names)

# sample continuous color scale
# colorscale = px.colors.sequential.Viridis
# colors = px.colors.sample_colorscale(colorscale, num_matrices)

# sample discrete color scale
colorscale = px.colors.qualitative.Plotly
colors = [colorscale[i % len(colorscale)] for i in range(num_matrices)]

matrix_color_map = {matrix:color for matrix, color in zip(matrix_names, colors)}

Tile size choice effects (baseline)

In [32]:
# Tile size choice influence on TT-MALS runtime
fig = px.line(baseline_df, x="tile_size", y="log_obj_func", color='matrix_name', symbol="matrix_name", markers=True, log_x=True,
              color_discrete_map=matrix_color_map,
              labels={
                     "tile_size": "Tile size",
                     "matrix_name": "Matrix name",
                 }
              )
fig.update_layout(
    title={
        'text': "Influence of tile size choice on TT-MALS runtime",
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    plot_bgcolor='white',   # Plot area background color
    paper_bgcolor='white',  # Entire figure background color
    font=dict(color='black'), # Font color
    yaxis_title=r'$\log(I^6 + rI^3 + r^2I^2)$'
)
fig.show()
fig.write_image("plots/baseline_tile_size_vs_log_obj_func.pdf")

In [33]:
# trade-off between rank and max mode size - baseline
fig = px.line(baseline_df, x="rank", y="max_mode_size", color="matrix_name", symbol="matrix_name", log_x=True, log_y=True, color_discrete_map=matrix_color_map,
                 labels={
                     "rank": "Rank (r)",
                     "matrix_name": "Matrix name",
                     "max_mode_size": "Maximum mode size (I)",
                 })
fig.update_layout(
    title={
        'text': "Trade-off between rank and maximum mode size",
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    plot_bgcolor='white',   # Plot area background color
    paper_bgcolor='white',  # Entire figure background color
    font=dict(color='black') # Font color
)
fig.show()

fig.write_image("plots/baseline_max_mode_size_vs_rank.pdf")

Explore effects of padding

In [34]:
# show different cases: when it helps and when it cannot improve the situation
line_plot_padding_tile_size_tt_mals_runtime_per_matrix(padding_df, "ex3")



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [35]:
line_plot_padding_tile_size_tt_mals_runtime_per_matrix(padding_df, "ex10hs")



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [36]:
line_plot_padding_tile_size_tt_mals_runtime_per_matrix(padding_df, "bcsstk13")



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [37]:
line_plot_padding_tile_size_tt_mals_runtime_per_matrix(padding_df, "Pres_Poisson")



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Effects of RCM

In [38]:
rcm_df = get_percentage_change_per_double_category(data_frame=rcm_df, result_column="rank_percentage", variable="rank", baseline_col="rcm", baseline_value=False, category1="matrix_name", category2="tile_size")
rcm_df.sort_values(by=["matrix_name", "tile_size"], inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [39]:
line_plot_tile_size_rank_percentage_per_matrix(rcm_df[rcm_df["rcm"] == True], "rcm", matrix_color_map=matrix_color_map)

Assess AMD effects

In [40]:
amd_df = get_percentage_change_per_double_category(data_frame=amd_df, result_column="rank_percentage", variable="rank", baseline_col="amd", baseline_value=False, category1="matrix_name", category2="tile_size")
amd_df.sort_values(by=["matrix_name", "tile_size"], inplace=True)
line_plot_tile_size_rank_percentage_per_matrix(amd_df[amd_df["amd"] == True], "amd", matrix_color_map=matrix_color_map)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [41]:
# do some preprocessing to be able to show nonzero entry change for different matrices together - do it in terms of percentage

partial_gauss_df = get_percentage_change_per_category(data_frame=partial_gauss_df, result_column="z_reduced_percentage", variable="z_reduced", baseline_col="partial_gauss", baseline_value=0, category="matrix_name")
partial_gauss_df = get_percentage_change_per_category(data_frame=partial_gauss_df, result_column="z_full_percentage", variable="z_full", baseline_col="partial_gauss", baseline_value=0, category="matrix_name")
partial_gauss_df.sort_values(by="partial_gauss", inplace=True)
# n holds the updated matrix size, hence we have to add the number of reduced variables to get original n
partial_gauss_df["reduced_variable_percentage"] = partial_gauss_df["partial_gauss"] / (partial_gauss_df["n"]+ partial_gauss_df["partial_gauss"])
partial_gauss_df.sort_values(by=["matrix_name", "partial_gauss"], inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a

In [42]:
partial_gauss_df

Unnamed: 0,amd,rcm,padding,matrix_name,partial_gauss,n,rank,z_reduced,z_full,max_mode_size,tile_size,log_obj_func,obj_func,z_reduced_percentage,z_full_percentage,reduced_variable_percentage
4159,False,False,0,Pres_Poisson,0,14822.0,4.0,715804.0,715804.0,7411.0,7411.0,53.464324,1.656765e+23,1.000000,1.000000,0.000000
4158,False,False,0,Pres_Poisson,0,14822.0,241977.0,715804.0,715804.0,7411.0,2.0,53.464344,1.656798e+23,1.000000,1.000000,0.000000
4160,False,False,0,Pres_Poisson,0,14822.0,1.0,715804.0,715804.0,14822.0,14822.0,57.623207,1.060330e+25,1.000000,1.000000,0.000000
4483,False,False,0,Pres_Poisson,1,14821.0,1.0,716761.0,623075.0,14821.0,14821.0,57.622802,1.059900e+25,1.001337,0.870455,0.000067
4888,False,False,0,Pres_Poisson,2,14820.0,52822.0,716664.0,623107.0,19.0,6.0,27.638650,1.007658e+12,1.001201,0.870499,0.000135
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6473,False,False,0,ex3,1460,365.0,1.0,12897.0,68521.0,365.0,365.0,35.399384,2.364597e+15,0.244795,1.300579,0.800000
6471,False,False,0,ex3,1460,365.0,1177.0,12897.0,68521.0,73.0,5.0,25.793267,1.591745e+11,0.244795,1.300579,0.800000
6522,False,False,0,ex3,1647,183.0,1023.0,5969.0,69183.0,61.0,3.0,24.742289,5.564671e+10,0.113296,1.313144,0.900000
6523,False,False,0,ex3,1647,183.0,7.0,5969.0,69183.0,61.0,61.0,24.665278,5.152215e+10,0.113296,1.313144,0.900000


In [43]:
# skip big matrices as we didn't compute partial gauss for those at these scales
fig = px.line(partial_gauss_df[(partial_gauss_df["matrix_name"] != "Pres_Poisson") & (partial_gauss_df["matrix_name"] != "ex15")], x="reduced_variable_percentage", y="z_full_percentage", color="matrix_name", symbol="matrix_name", log_y=False, color_discrete_map=matrix_color_map, # log_x=True,
                 labels={
                     "partial_gauss": "Number of eliminated variables",
                     "z_full_percentage": "Change in nonzero entries",
                     "matrix_name": "Matrix name",
                     "reduced_variable_percentage": "Ratio of reduced variables",
                 })
fig.update_layout(
    title={
        'text': "Effect of variable elimination on nonzero entry count for complete matrix",
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    plot_bgcolor='white',   # Plot area background color
    paper_bgcolor='white',  # Entire figure background color
    font=dict(color='black'), # Font color
)
fig.show()

fig.write_image("plots/partial_gauss_nonzero_entries_full_matrix.pdf")

In [44]:
fig = px.line(partial_gauss_df[(partial_gauss_df["matrix_name"] != "Pres_Poisson") & (partial_gauss_df["matrix_name"] != "ex15")], x="reduced_variable_percentage", y="z_reduced_percentage", color="matrix_name", symbol="matrix_name", log_y=False, color_discrete_map=matrix_color_map, # log_x=True,
                 labels={
                     "partial_gauss": "Number of eliminated variables",
                     "z_reduced_percentage": "Change in nonzero entries",
                     "matrix_name": "Matrix name",
                     "reduced_variable_percentage": "Ratio of reduced variables",
                 })
fig.update_layout(
    title={
        'text': "Effect of variable elimination on nonzero entry count for submatrix",
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    plot_bgcolor='white',   # Plot area background color
    paper_bgcolor='white',  # Entire figure background color
    font=dict(color='black'), # Font color
)
fig.show()

fig.write_image("plots/partial_gauss_nonzero_entries_submatrix.pdf")

In [45]:
# todos: 
# 1) run experiments for partial Gauss with much higher number of variable eliminations too
# 2) create functions to aggregate data across different tile sizes and only keep the best runtime rows
# 3) using data of 2) plot how solo modules can improve the runtime
# 4) make nice plot for partial-Gauss that shows results for different matrices together
# 5) add individual component results and plots to report
# 6) do combined modules' effect analysis

In [46]:
partial_gauss_df[(partial_gauss_df["matrix_name"] == "ex3") & (partial_gauss_df["partial_gauss"] == 1)]

Unnamed: 0,amd,rcm,padding,matrix_name,partial_gauss,n,rank,z_reduced,z_full,max_mode_size,tile_size,log_obj_func,obj_func,z_reduced_percentage,z_full_percentage,reduced_variable_percentage
11171,False,False,0,ex3,1,1820.0,37.0,52666.0,43791.0,140.0,140.0,29.649872,7529664000000.0,0.999639,0.831185,0.000549
11165,False,False,0,ex3,1,1820.0,372.0,52666.0,43791.0,35.0,35.0,21.428211,2023736000.0,0.999639,0.831185,0.000549
11167,False,False,0,ex3,1,1820.0,134.0,52666.0,43791.0,65.0,65.0,25.047816,75531550000.0,0.999639,0.831185,0.000549
11155,False,False,0,ex3,1,1820.0,19284.0,52666.0,43791.0,13.0,2.0,24.864711,62893670000.0,0.999639,0.831185,0.000549
11164,False,False,0,ex3,1,1820.0,521.0,52666.0,43791.0,28.0,28.0,20.37532,706137000.0,0.999639,0.831185,0.000549
11172,False,False,0,ex3,1,1820.0,28.0,52666.0,43791.0,182.0,182.0,31.224045,36343830000000.0,0.999639,0.831185,0.000549
11173,False,False,0,ex3,1,1820.0,19.0,52666.0,43791.0,260.0,260.0,33.364091,308916100000000.0,0.999639,0.831185,0.000549
11174,False,False,0,ex3,1,1820.0,13.0,52666.0,43791.0,364.0,364.0,35.382923,2325993000000000.0,0.999639,0.831185,0.000549
11175,False,False,0,ex3,1,1820.0,10.0,52666.0,43791.0,455.0,455.0,36.721785,8872958000000000.0,0.999639,0.831185,0.000549
11166,False,False,0,ex3,1,1820.0,197.0,52666.0,43791.0,52.0,52.0,23.714149,19903250000.0,0.999639,0.831185,0.000549


In [47]:
# get best log_obj_func value per matrix for a given partial gauss number
# skip large matrices

idx = partial_gauss_df[(partial_gauss_df["matrix_name"] != "Pres_Poisson") & (partial_gauss_df["matrix_name"] != "ex15")].groupby(['matrix_name', 'partial_gauss'])['log_obj_func'].idxmin()

# Use these indices to filter the original DataFrame
partial_gauss_tile_agg_df = partial_gauss_df[(partial_gauss_df["matrix_name"] != "Pres_Poisson") & (partial_gauss_df["matrix_name"] != "ex15")].loc[idx].reset_index(drop=True)
partial_gauss_tile_agg_df

Unnamed: 0,amd,rcm,padding,matrix_name,partial_gauss,n,rank,z_reduced,z_full,max_mode_size,tile_size,log_obj_func,obj_func,z_reduced_percentage,z_full_percentage,reduced_variable_percentage
0,False,False,0,bcsstk13,0,2003.0,1.0,83883.0,83883.0,2003.0,2003.0,45.614408,6.457816e+19,1.000000,1.000000,0.000000
1,False,False,0,bcsstk13,1,2002.0,1337.0,84136.0,70264.0,22.0,22.0,20.716039,9.927997e+08,1.003016,0.837643,0.000499
2,False,False,0,bcsstk13,2,2001.0,907.0,84271.0,70440.0,29.0,29.0,20.992370,1.308792e+09,1.004625,0.839741,0.000999
3,False,False,0,bcsstk13,3,2000.0,1500.0,84264.0,70468.0,20.0,20.0,20.698973,9.760000e+08,1.004542,0.840075,0.001498
4,False,False,0,bcsstk13,4,1999.0,1.0,84201.0,70437.0,1999.0,1999.0,45.602414,6.380824e+19,1.003791,0.839705,0.001997
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,False,False,0,ex3,911,911.0,1.0,29139.0,61987.0,911.0,911.0,40.887257,5.716237e+17,0.553080,1.176559,0.500000
96,False,False,0,ex3,1093,729.0,925.0,22669.0,64182.0,9.0,9.0,18.071285,7.051139e+07,0.430274,1.218222,0.599890
97,False,False,0,ex3,1276,547.0,1.0,19589.0,66500.0,547.0,547.0,37.826693,2.678699e+16,0.371814,1.262219,0.699945
98,False,False,0,ex3,1460,365.0,17.0,12897.0,68521.0,73.0,73.0,25.742811,1.513424e+11,0.244795,1.300579,0.800000


In [48]:
# scatter plot between partial gauss and runtime
fig = px.scatter(partial_gauss_tile_agg_df, x="reduced_variable_percentage", y="log_obj_func", color='matrix_name', color_discrete_sequence=colors, 
                 trendline="ols", trendline_scope="overall", symbol="matrix_name", color_discrete_map=matrix_color_map, trendline_color_override="lime", trendline_options=dict(log_y=True),
              labels={
                     "tile_size": "Tile size",
                     "matrix_name": "Matrix name",
                  "reduced_variable_percentage": "Ratio of reduced variables",
                 }
              )
fig.update_layout(
    title={
        'text': "Influence of variable elimination on TT-MALS runtime",
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    plot_bgcolor='white',   # Plot area background color
    paper_bgcolor='white',  # Entire figure background color
    font=dict(color='black'), # Font color
    yaxis_title=r'$\log(I^6 + rI^3 + r^2I^2)$'
)
fig.show()

In [49]:
# scatter plot between partial gauss and runtime
fig = px.scatter(partial_gauss_tile_agg_df, x="rank", y="max_mode_size", color='reduced_variable_percentage', color_discrete_sequence=colors, 
                log_y=True, log_x=True, symbol="matrix_name",
              labels={
                     "tile_size": "Tile size",
                     "matrix_name": "Matrix name",
                  "reduced_variable_percentage": "Ratio of reduced<br>variables",
                  "rank": "Rank"
                 }
              )
fig.update_layout(
    title={
        'text': "Influence of variable elimination on TT-ranks",
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    plot_bgcolor='white',   # Plot area background color
    paper_bgcolor='white',  # Entire figure background color
    font=dict(color='black'), # Font color
    yaxis_title="Maximum mode size",
    coloraxis_colorbar=dict(yanchor="top", y=1, x=-0.2,
                                          ticks="outside")
)
fig.show()

In [32]:
# idea: plot best tile size distribution for each matrix - can make it percentage of original size (n) to make it easy to compare across different matrices

In [None]:
# could compare how much partial gauss helps wrt best of first 10 reductions - should also do a window of 10 around fractions - otherwise we make it too dependent on factorization we get at specific instance