In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

from analysis.analysis_utils import add_tt_mals_runtime_cols, line_plot_padding_tile_size_tt_mals_runtime_per_matrix, get_percentage_change_per_category, get_percentage_change_per_double_category, line_plot_tile_size_rank_percentage_per_matrix

from optimizers.tile_size import prime_factors, possible_tile_sizes_from_factors


In [2]:
# old data
df1 = pd.read_csv('../../data/sweep_0_10_data1.csv')  # 0-10, small matrices
df2 = pd.read_csv('../../data/sweep_0_10_data2.csv')  # 0-10, big matrices
old_df = pd.concat([df1, df2], axis=0)
old_df.drop(columns=["run_id", "run_name", "_runtime", "_step", "_timestamp", "gauss_threshold",], inplace=True) 
old_df.rename(columns={"num_reduced_variables": "partial_gauss"}, inplace=True)
# concat 0-10 sweeps for larger matrices
old_df.shape

(28604, 11)

In [3]:
# new data
df2 = pd.read_csv('../../data/sweep_0_10_data2.csv')  # 0-10, big matrices
df = pd.read_csv('../../data/sweep_inc_data1.csv')
df = df[df['sparsity_ratio'] > 0.8]  # drop rows that are incomplete due to too low sparsity
df.drop(columns=['partial_gauss'], inplace=True)
df.rename(columns={"num_reduced_variables": "partial_gauss"}, inplace=True)
# concat 0-10 sweeps for larger matrices
df = pd.concat([df, df2], axis=0)  # df3, df4
df.drop(columns=["run_id", "run_name", "_runtime", "_step", "_timestamp", "gauss_threshold", "min_sparsity", "partial_gauss_increments"], inplace=True)
df.shape

(2331080, 12)

In [14]:
# setup color map for each matrix
matrix_names = df["matrix_name"].unique().tolist()
num_matrices = len(matrix_names)

# sample continuous color scale
# colorscale = px.colors.sequential.Viridis
# colors = px.colors.sample_colorscale(colorscale, num_matrices)

# sample discrete color scale
colorscale = px.colors.qualitative.Plotly
colors = [colorscale[i % len(colorscale)] for i in range(num_matrices)]

matrix_color_map = {matrix:color for matrix, color in zip(matrix_names, colors)}

In [4]:
df = add_tt_mals_runtime_cols(df)
old_df = add_tt_mals_runtime_cols(old_df)

In [5]:
rcm_df = df[(df['amd'] == False) & (df['partial_gauss'] == 0) & (df['padding'] == 0)]
rcm_old_df = old_df[(old_df['amd'] == False) & (old_df['partial_gauss'] == 0) & (old_df['padding'] == 0)]

In [6]:
rcm_df

Unnamed: 0,amd,rcm,padding,matrix_name,z_full,partial_gauss,rank,tile_size,z_reduced,max_mode_size,sparsity_ratio,n,log_obj_func,obj_func
73048,False,True,0,bcsstk13,70264.0,0.0,1.0,2003.0,70264.0,2003.0,0.982487,2003.0,45.614408,6.457816e+19
124454,False,True,0,ex10hs,42944.0,0.0,20178.0,2.0,42944.0,13.0,0.993385,2548.0,24.955309,6.885779e+10
124455,False,True,0,ex10hs,42944.0,0.0,8201.0,4.0,42944.0,13.0,0.993385,2548.0,23.155929,1.138918e+10
124456,False,True,0,ex10hs,42944.0,0.0,4374.0,7.0,42944.0,13.0,0.993385,2548.0,21.901220,3.247724e+09
124457,False,True,0,ex10hs,42944.0,0.0,1770.0,13.0,42944.0,13.0,0.993385,2548.0,20.103695,5.381756e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4076,False,True,0,Pres_Poisson,715804.0,0.0,4.0,7411.0,715804.0,7411.0,,14822.0,53.464324,1.656765e+23
4077,False,True,0,Pres_Poisson,715804.0,0.0,1.0,14822.0,715804.0,14822.0,,14822.0,57.623207,1.060330e+25
4158,False,False,0,Pres_Poisson,715804.0,0.0,241977.0,2.0,715804.0,7411.0,,14822.0,53.464344,1.656798e+23
4159,False,False,0,Pres_Poisson,715804.0,0.0,4.0,7411.0,715804.0,7411.0,,14822.0,53.464324,1.656765e+23


In [7]:
rcm_old_df

Unnamed: 0,amd,rcm,padding,matrix_name,partial_gauss,n,rank,z_reduced,z_full,max_mode_size,tile_size,log_obj_func,obj_func
3326,False,False,0,bcsstk13,0,2003.0,1.0,83883.0,83883.0,2003.0,2003.0,45.614408,6.457816e+19
4493,False,False,0,ex10hs,0,2548.0,29964.0,57308.0,57308.0,13.0,2.0,25.745868,1.518058e+11
4494,False,False,0,ex10hs,0,2548.0,11353.0,57308.0,57308.0,13.0,4.0,23.805739,2.181228e+10
4495,False,False,0,ex10hs,0,2548.0,4830.0,57308.0,57308.0,13.0,7.0,22.099010,3.958022e+09
4496,False,False,0,ex10hs,0,2548.0,1974.0,57308.0,57308.0,13.0,13.0,20.319352,6.677019e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4076,False,True,0,Pres_Poisson,0,14822.0,4.0,715804.0,715804.0,7411.0,7411.0,53.464324,1.656765e+23
4077,False,True,0,Pres_Poisson,0,14822.0,1.0,715804.0,715804.0,14822.0,14822.0,57.623207,1.060330e+25
4158,False,False,0,Pres_Poisson,0,14822.0,241977.0,715804.0,715804.0,7411.0,2.0,53.464344,1.656798e+23
4159,False,False,0,Pres_Poisson,0,14822.0,4.0,715804.0,715804.0,7411.0,7411.0,53.464324,1.656765e+23


In [8]:
rcm_df = get_percentage_change_per_double_category(data_frame=rcm_df, result_column="rank_percentage", variable="rank", baseline_col="rcm", baseline_value=False, category1="matrix_name", category2="tile_size")
rcm_df.sort_values(by=["matrix_name", "tile_size"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_frame[result_column] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_frame[result_column] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rcm_df.sort_values(by=["matrix_name", "tile_size"], inplace=True)


In [9]:
rcm_old_df = get_percentage_change_per_double_category(data_frame=rcm_old_df, result_column="rank_percentage", variable="rank", baseline_col="rcm", baseline_value=False, category1="matrix_name", category2="tile_size")
rcm_old_df.sort_values(by=["matrix_name", "tile_size"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_frame[result_column] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_frame[result_column] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rcm_old_df.sort_values(by=["matrix_name", "tile_size"], inplace=True)


In [15]:
line_plot_tile_size_rank_percentage_per_matrix(rcm_df[rcm_df["rcm"] == True], "rcm", matrix_color_map=matrix_color_map)

In [16]:
line_plot_tile_size_rank_percentage_per_matrix(rcm_old_df[rcm_old_df["rcm"] == True], "rcm", matrix_color_map=matrix_color_map)

In [17]:
rcm_old_df[(rcm_old_df["rcm"] == True) & (rcm_old_df["matrix_name"] == "ex10hs")]

Unnamed: 0,amd,rcm,padding,matrix_name,partial_gauss,n,rank,z_reduced,z_full,max_mode_size,tile_size,log_obj_func,obj_func,rank_percentage
4867,False,True,0,ex10hs,0,2548.0,16878.0,57308.0,57308.0,13.0,2.0,24.598301,48184410000.0,0.563276
4868,False,True,0,ex10hs,0,2548.0,7673.0,57308.0,57308.0,13.0,4.0,23.023002,9971547000.0,0.675857
4869,False,True,0,ex10hs,0,2548.0,3922.0,57308.0,57308.0,13.0,7.0,21.683771,2613016000.0,0.812008
4870,False,True,0,ex10hs,0,2548.0,1762.0,57308.0,57308.0,13.0,13.0,20.09475,533382800.0,0.892604
4871,False,True,0,ex10hs,0,2548.0,1494.0,57308.0,57308.0,14.0,14.0,19.922774,449108100.0,0.837444
4872,False,True,0,ex10hs,0,2548.0,660.0,57308.0,57308.0,26.0,26.0,20.237103,614981500.0,0.846154
4873,False,True,0,ex10hs,0,2548.0,599.0,57308.0,57308.0,28.0,28.0,20.470101,776339500.0,0.869376
4874,False,True,0,ex10hs,0,2548.0,266.0,57308.0,57308.0,49.0,49.0,23.365352,14042470000.0,0.875
4875,False,True,0,ex10hs,0,2548.0,243.0,57308.0,57308.0,52.0,52.0,23.717219,19964450000.0,0.916981
4876,False,True,0,ex10hs,0,2548.0,104.0,57308.0,57308.0,91.0,91.0,27.065453,568037200000.0,0.928571


In [18]:
rcm_df[(rcm_df["rcm"] == True) & (rcm_df["matrix_name"] == "ex10hs")]

Unnamed: 0,amd,rcm,padding,matrix_name,z_full,partial_gauss,rank,tile_size,z_reduced,max_mode_size,sparsity_ratio,n,log_obj_func,obj_func,rank_percentage
124454,False,True,0,ex10hs,42944.0,0.0,20178.0,2.0,42944.0,13.0,0.993385,2548.0,24.955309,68857790000.0,0.721132
124455,False,True,0,ex10hs,42944.0,0.0,8201.0,4.0,42944.0,13.0,0.993385,2548.0,23.155929,11389180000.0,0.754392
124456,False,True,0,ex10hs,42944.0,0.0,4374.0,7.0,42944.0,13.0,0.993385,2548.0,21.90122,3247724000.0,0.93302
124457,False,True,0,ex10hs,42944.0,0.0,1770.0,13.0,42944.0,13.0,0.993385,2548.0,20.103695,538175600.0,0.906762
124458,False,True,0,ex10hs,42944.0,0.0,1570.0,14.0,42944.0,14.0,0.993385,2548.0,20.019984,494958000.0,0.887006
124459,False,True,0,ex10hs,42944.0,0.0,592.0,26.0,42944.0,26.0,0.993385,2548.0,20.1367,556234400.0,0.758974
124460,False,True,0,ex10hs,42944.0,0.0,523.0,28.0,42944.0,28.0,0.993385,2548.0,20.377697,707817900.0,0.763504
124461,False,True,0,ex10hs,42944.0,0.0,206.0,49.0,42944.0,49.0,0.993385,2548.0,23.359993,13967410000.0,0.682119
124462,False,True,0,ex10hs,42944.0,0.0,189.0,52.0,42944.0,52.0,0.993385,2548.0,23.713673,19893770000.0,0.713208
124463,False,True,0,ex10hs,42944.0,0.0,92.0,91.0,42944.0,91.0,0.993385,2548.0,27.065403,568008700000.0,0.836364


Conclusion: difference is due to rounding in partial Gauss step which in the new code already gets applied when PG is 0. Apparently for some matrices there are nonzero values so close to zero that they get rounded, leading to different RCM results.