In [22]:
import pandas as pd
import plotly.express as px

from analysis.analysis_utils import add_tt_mals_runtime_cols, line_plot_padding_tile_size_tt_mals_runtime_per_matrix, get_percentage_change_per_category, get_percentage_change_per_double_category, line_plot_tile_size_rank_percentage_per_matrix, get_largest_prime_factor, update_marker_size

from optimizers.tile_size import prime_factors, possible_tile_sizes_from_factors


In [23]:
df2 = pd.read_csv('../../data/sweep_inc_data2_new_pad.csv')  #  big matrices
df1 = pd.read_csv('../../data/sweep_inc_data1_new_pad.csv')  # small matrices
df = pd.concat([df1, df2], axis=0)
df = df[df['sparsity_ratio'] > 0.8]  # drop rows that are incomplete due to too low sparsity
df.drop(columns=['partial_gauss'], inplace=True)
df.rename(columns={"num_reduced_variables": "partial_gauss"}, inplace=True)
df.drop(columns=["run_id", "run_name", "_runtime", "_step", "_timestamp", "gauss_threshold", "min_sparsity", "partial_gauss_increments"], inplace=True)
df.shape

(2339158, 12)

In [24]:
df = add_tt_mals_runtime_cols(df)

df["largest_prime_factor"] = df.apply(get_largest_prime_factor, axis=1)

df["is_baseline_tile"] = df["tile_size"] == df["largest_prime_factor"]

In [25]:
# no module is applied - baseline
baseline_df = df[(df['amd'] == False) & (df['rcm'] == False) & (df['partial_gauss'] == 0) & (df['padding'] == 0)]

# different levels of padding applied - padding_df
padding_df = df[(df['amd'] == False) & (df['rcm'] == False) & (df['partial_gauss'] == 0)]

# keep amd on and off - amd_df
amd_df = df[(df['rcm'] == False) & (df['partial_gauss'] == 0) & (df['padding'] == 0)]

# keep rcm on and off - rcm_df
rcm_df = df[(df['amd'] == False) & (df['partial_gauss'] == 0) & (df['padding'] == 0)]

# different levels of partial gauss - partial_gauss_df
partial_gauss_df = df[(df['amd'] == False) & (df['rcm'] == False) & (df['padding'] == 0)]

In [26]:
# setup color map for each matrix
matrix_names = df["matrix_name"].unique().tolist()
num_matrices = len(matrix_names)

# sample continuous color scale
# colorscale = px.colors.sequential.Viridis
# colors = px.colors.sample_colorscale(colorscale, num_matrices)

# sample discrete color scale
colorscale = px.colors.qualitative.Plotly
colors = [colorscale[i % len(colorscale)] for i in range(num_matrices)]

matrix_color_map = {matrix:color for matrix, color in zip(matrix_names, colors)}

Tile size choice effects (baseline)

In [27]:
baseline_df.sort_values(by=["matrix_name", "tile_size"], inplace=True)
baseline_df



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,amd,rcm,padding,matrix_name,max_mode_size,sparsity_ratio,z_full,n,tile_size,z_reduced,partial_gauss,rank,log_obj_func,obj_func,full_runtime,log_full_runtime,largest_prime_factor,is_baseline_tile
12760,False,False,0,Pres_Poisson,7411.0,0.997164,623075.0,14822.0,2.0,623075.0,0.0,216333.0,53.46434,1.656791e+23,1.1990310000000002e+31,71.561651,7411.0,False
12761,False,False,0,Pres_Poisson,7411.0,0.997164,623075.0,14822.0,7411.0,623075.0,0.0,4.0,53.464324,1.656765e+23,1.1990310000000002e+31,71.561651,7411.0,True
12762,False,False,0,Pres_Poisson,14822.0,0.997164,623075.0,14822.0,14822.0,623075.0,0.0,1.0,57.623207,1.06033e+25,7.673796000000001e+32,75.720534,7411.0,False
9917,False,False,0,bcsstk13,2003.0,0.982487,70264.0,2003.0,2003.0,70264.0,0.0,1.0,45.614408,6.457816e+19,7.205963e+26,61.842121,2003.0,True
19836,False,False,0,ex10,241.0,0.992511,43497.0,2410.0,2.0,43497.0,0.0,26020.0,33.093233,235618000000000.0,2.651065e+21,49.329248,241.0,False
19837,False,False,0,ex10,241.0,0.992511,43497.0,2410.0,5.0,43497.0,0.0,6641.0,32.922239,198585100000000.0,2.650004e+21,49.328848,241.0,False
19838,False,False,0,ex10,241.0,0.992511,43497.0,2410.0,10.0,43497.0,0.0,2524.0,32.910848,196335900000000.0,2.64994e+21,49.328824,241.0,False
19839,False,False,0,ex10,241.0,0.992511,43497.0,2410.0,241.0,43497.0,0.0,28.0,32.908784,195931000000000.0,2.649928e+21,49.328819,241.0,True
19840,False,False,0,ex10,482.0,0.992511,43497.0,2410.0,482.0,43497.0,0.0,13.0,37.067665,1.253956e+16,1.695954e+23,53.487703,241.0,False
19841,False,False,0,ex10,1205.0,0.992511,43497.0,2410.0,1205.0,43497.0,0.0,4.0,42.565409,3.061416e+18,4.140513e+25,58.985447,241.0,False


In [28]:
# Tile size choice influence on TT-MALS runtime
fig = px.line(baseline_df, x="tile_size", y="log_obj_func", color='matrix_name', symbol="matrix_name", markers=True, log_x=True,
              color_discrete_map=matrix_color_map,
              labels={
                     "tile_size": "Tile size",
                     "matrix_name": "Matrix name",
                 }
              )

# enlarge baseline tile size markers
fig.for_each_trace(lambda trace: update_marker_size(baseline_df, trace, trace.name, "matrix_name"))

fig.update_layout(
    title={
        'text': "Influence of tile size choice on TT-MALS runtime",
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    plot_bgcolor='white',   # Plot area background color
    paper_bgcolor='white',  # Entire figure background color
    font=dict(color='black'), # Font color
    yaxis_title=r'$\log(I^6 + rI^3 + r^2I^2)$'
)
fig.show()
fig.write_image("plots/baseline_tile_size_vs_log_obj_func.pdf")

In [29]:
# trade-off between rank and max mode size - baseline
fig = px.line(baseline_df, x="rank", y="max_mode_size", color="matrix_name", symbol="matrix_name", log_x=True, log_y=True, color_discrete_map=matrix_color_map,
                 labels={
                     "rank": "Rank (r)",
                     "matrix_name": "Matrix name",
                     "max_mode_size": "Maximum mode size (I)",
                 })
fig.update_layout(
    title={
        'text': "Trade-off between rank and maximum mode size",
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    plot_bgcolor='white',   # Plot area background color
    paper_bgcolor='white',  # Entire figure background color
    font=dict(color='black') # Font color
)
# enlarge baseline tile size markers
fig.for_each_trace(lambda trace: update_marker_size(baseline_df, trace, trace.name, "matrix_name"))
fig.show()

fig.write_image("plots/baseline_max_mode_size_vs_rank.pdf")

Explore effects of padding

In [9]:
# show different cases: when it helps and when it cannot improve the situation
line_plot_padding_tile_size_tt_mals_runtime_per_matrix(padding_df, "ex3")



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [10]:
line_plot_padding_tile_size_tt_mals_runtime_per_matrix(padding_df, "ex10hs")



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [11]:
line_plot_padding_tile_size_tt_mals_runtime_per_matrix(padding_df, "bcsstk13")



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [12]:
line_plot_padding_tile_size_tt_mals_runtime_per_matrix(padding_df, "Pres_Poisson")



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Effects of RCM

In [13]:
rcm_df = get_percentage_change_per_double_category(data_frame=rcm_df, result_column="rank_percentage", variable="rank", baseline_col="rcm", baseline_value=False, category1="matrix_name", category2="tile_size")
rcm_df.sort_values(by=["matrix_name", "tile_size"], inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [14]:
line_plot_tile_size_rank_percentage_per_matrix(rcm_df[rcm_df["rcm"] == True], "rcm", matrix_color_map=matrix_color_map)

Assess AMD effects

In [15]:
amd_df = get_percentage_change_per_double_category(data_frame=amd_df, result_column="rank_percentage", variable="rank", baseline_col="amd", baseline_value=False, category1="matrix_name", category2="tile_size")
amd_df.sort_values(by=["matrix_name", "tile_size"], inplace=True)
line_plot_tile_size_rank_percentage_per_matrix(amd_df[amd_df["amd"] == True], "amd", matrix_color_map=matrix_color_map)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [16]:
# do some preprocessing to be able to show nonzero entry change for different matrices together - do it in terms of percentage

partial_gauss_df = get_percentage_change_per_category(data_frame=partial_gauss_df, result_column="z_reduced_percentage", variable="z_reduced", baseline_col="partial_gauss", baseline_value=0, category="matrix_name")
partial_gauss_df = get_percentage_change_per_category(data_frame=partial_gauss_df, result_column="z_full_percentage", variable="z_full", baseline_col="partial_gauss", baseline_value=0, category="matrix_name")
partial_gauss_df.sort_values(by="partial_gauss", inplace=True)
# n holds the updated matrix size, hence we have to add the number of reduced variables to get original n
partial_gauss_df["reduced_variable_percentage"] = partial_gauss_df["partial_gauss"] / (partial_gauss_df["n"]+ partial_gauss_df["partial_gauss"])
partial_gauss_df.sort_values(by=["matrix_name", "partial_gauss"], inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a

In [17]:
partial_gauss_df

Unnamed: 0,amd,rcm,padding,matrix_name,max_mode_size,sparsity_ratio,z_full,n,tile_size,z_reduced,...,rank,log_obj_func,obj_func,full_runtime,log_full_runtime,largest_prime_factor,is_baseline_tile,z_reduced_percentage,z_full_percentage,reduced_variable_percentage
12760,False,False,0,Pres_Poisson,7411.0,0.997164,623075.0,14822.0,2.0,623075.0,...,216333.0,53.464340,1.656791e+23,1.199031e+31,71.561651,7411.0,False,1.000000,1.000000,0.000000
12761,False,False,0,Pres_Poisson,7411.0,0.997164,623075.0,14822.0,7411.0,623075.0,...,4.0,53.464324,1.656765e+23,1.199031e+31,71.561651,7411.0,True,1.000000,1.000000,0.000000
12762,False,False,0,Pres_Poisson,14822.0,0.997164,623075.0,14822.0,14822.0,623075.0,...,1.0,57.623207,1.060330e+25,7.673796e+32,75.720534,7411.0,False,1.000000,1.000000,0.000000
12763,False,False,0,Pres_Poisson,14821.0,0.997164,623019.0,14821.0,14821.0,622969.0,...,1.0,57.622802,1.059900e+25,7.670259e+32,75.720073,14821.0,True,0.999830,0.999910,0.000067
12774,False,False,0,Pres_Poisson,20.0,0.997163,623168.0,14820.0,20.0,623071.0,...,10221.0,24.459346,4.193330e+10,8.052230e+15,36.624725,19.0,False,0.999994,1.000149,0.000135
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277531,False,False,0,ex3,184.0,0.875591,71682.0,184.0,184.0,4212.0,...,1.0,31.289615,3.880673e+13,2.122749e+19,44.501829,23.0,False,0.096184,1.636912,0.898957
277530,False,False,0,ex3,92.0,0.875591,71682.0,184.0,92.0,4212.0,...,4.0,27.130737,6.063583e+11,3.316795e+17,40.342946,23.0,False,0.096184,1.636912,0.898957
277529,False,False,0,ex3,46.0,0.875591,71682.0,184.0,46.0,4212.0,...,12.0,22.972004,9.475770e+09,5.182499e+15,36.184064,23.0,False,0.096184,1.636912,0.898957
277528,False,False,0,ex3,23.0,0.875591,71682.0,184.0,23.0,4212.0,...,40.0,18.821930,1.493690e+08,8.098164e+13,32.025244,23.0,True,0.096184,1.636912,0.898957


In [18]:
# get best log_obj_func value per matrix for a given partial gauss number
# skip large matrices

idx = partial_gauss_df[(partial_gauss_df["matrix_name"] != "Pres_Poisson") & (partial_gauss_df["matrix_name"] != "ex15")].groupby(['matrix_name', 'partial_gauss'])['log_obj_func'].idxmin()

# Use these indices to filter the original DataFrame
partial_gauss_tile_agg_df = partial_gauss_df[(partial_gauss_df["matrix_name"] != "Pres_Poisson") & (partial_gauss_df["matrix_name"] != "ex15")].loc[idx].reset_index(drop=True)
partial_gauss_tile_agg_df

Unnamed: 0,amd,rcm,padding,matrix_name,max_mode_size,sparsity_ratio,z_full,n,tile_size,z_reduced,...,rank,log_obj_func,obj_func,full_runtime,log_full_runtime,largest_prime_factor,is_baseline_tile,z_reduced_percentage,z_full_percentage,reduced_variable_percentage
0,False,False,0,bcsstk13,2003.0,0.982487,70264.0,2003.0,2003.0,70264.0,...,1.0,45.614408,6.457816e+19,7.205963e+26,61.842121,2003.0,True,1.000000,1.000000,0.000000
1,False,False,0,bcsstk13,22.0,0.982433,70440.0,2002.0,22.0,70410.0,...,1335.0,20.713409,9.901919e+08,1.286753e+15,34.790898,13.0,False,1.002078,1.002505,0.000499
2,False,False,0,bcsstk13,29.0,0.982416,70468.0,2001.0,29.0,70405.0,...,907.0,20.992370,1.308792e+09,6.648495e+15,36.433167,29.0,True,1.002007,1.002903,0.000999
3,False,False,0,bcsstk13,20.0,0.982415,70437.0,2000.0,20.0,70341.0,...,1500.0,20.698973,9.760000e+08,7.361641e+14,34.232474,5.0,False,1.001096,1.002462,0.001498
4,False,False,0,bcsstk13,1999.0,0.982413,70407.0,1999.0,1999.0,70279.0,...,1.0,45.602414,6.380824e+19,7.105087e+26,61.828023,1999.0,True,1.000213,1.002035,0.001997
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7518,False,False,0,ex3,47.0,0.873670,71814.0,188.0,47.0,4465.0,...,12.0,23.101031,1.078078e+10,6.093657e+15,36.346025,47.0,True,0.101962,1.639926,0.896760
7519,False,False,0,ex3,17.0,0.874317,71772.0,187.0,17.0,4395.0,...,69.0,17.067918,2.585249e+07,1.354128e+13,30.236764,17.0,True,0.100363,1.638967,0.897309
7520,False,False,0,ex3,31.0,0.871922,71837.0,186.0,31.0,4431.0,...,24.0,20.605352,8.887722e+08,4.935844e+14,33.832715,31.0,True,0.101185,1.640451,0.897858
7521,False,False,0,ex3,37.0,0.873543,71768.0,185.0,37.0,4328.0,...,19.0,21.666075,2.567183e+09,1.415184e+15,34.886036,37.0,True,0.098833,1.638876,0.898407


In [19]:
# for next plots nice to see the number of factors we can choose from for certain results

def get_num_possible_tiles(row):
    return len(possible_tile_sizes_from_factors(prime_factors(row["n"])))

partial_gauss_tile_agg_df["num_possible_tiles"] = partial_gauss_tile_agg_df.apply(get_num_possible_tiles, axis=1)

In [20]:
# scatter plot between partial gauss and runtime
matrix_name = "ex3"
fig = px.scatter(partial_gauss_tile_agg_df[partial_gauss_tile_agg_df["matrix_name"] == matrix_name], x="reduced_variable_percentage", y="log_obj_func", color='num_possible_tiles',
              labels={
                     "tile_size": "Tile size",
                     "matrix_name": "Matrix name",
                  "reduced_variable_percentage": "Ratio of reduced variables",
                  "num_possible_tiles": "Number of possible<br>tile size choices"
                 }
              )
fig.update_layout(
    title={
        'text': "Influence of variable elimination on TT-MALS runtime for {}".format(matrix_name),
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    plot_bgcolor='white',   # Plot area background color
    paper_bgcolor='white',  # Entire figure background color
    font=dict(color='black'), # Font color
    yaxis_title=r'$\log(I^6 + rI^3 + r^2I^2)$'
)
fig.show()
fig.write_image("plots/{}_pg_ratio_tile_size_log_obj_func.pdf".format(matrix_name))

In [21]:
# scatter plot for partial gauss effects on ranks and max mode size
fig = px.scatter(partial_gauss_tile_agg_df[partial_gauss_tile_agg_df["matrix_name"] == matrix_name], x="rank", y="max_mode_size", color='reduced_variable_percentage', color_discrete_sequence=colors, 
                log_y=True, log_x=True,  # symbol="matrix_name",
              labels={
                     "tile_size": "Tile size",
                     "matrix_name": "Matrix name",
                  "reduced_variable_percentage": "Ratio of reduced<br>variables",
                  "rank": "Rank"
                 }
              )
fig.update_layout(
    title={
        'text': "Influence of variable elimination on TT-ranks and max mode size for {}".format(matrix_name),
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    plot_bgcolor='white',   # Plot area background color
    paper_bgcolor='white',  # Entire figure background color
    font=dict(color='black'), # Font color
    yaxis_title="Maximum mode size",
    # coloraxis_colorbar=dict(yanchor="top", y=1, x=-0.2, ticks="outside")
)
fig.show()
fig.write_image("plots/{}_pg_ratio_max_mode_size_rank.pdf".format(matrix_name))