In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from scipy import stats

from analysis.analysis_utils import add_tt_mals_runtime_cols, line_plot_padding_tile_size_tt_mals_runtime_per_matrix, get_percentage_change_per_matrix


In [19]:
df1 = pd.read_csv('../../data/sweep_0_10_data1.csv')
df2 = pd.read_csv('../../data/sweep_0_10_data2.csv')
df = pd.concat([df1, df2], axis=0)
df.drop(columns=["run_id", "run_name", "_runtime", "_step", "_timestamp"], inplace=True)
df.shape

(28569, 11)

In [20]:
df.tail()

Unnamed: 0,amd,rcm,padding,matrix_name,partial_gauss,gauss_threshold,rank,tile_size,max_mode_size,n,z
8590,True,True,0,Pres_Poisson,2,1e-07,1.0,14820.0,14820.0,14820.0,716664.0
8591,True,False,0,Pres_Poisson,1,1e-07,1.0,14821.0,14821.0,14821.0,716761.0
8592,True,True,0,Pres_Poisson,0,1e-07,178951.0,2.0,7411.0,14822.0,715804.0
8593,True,True,0,Pres_Poisson,0,1e-07,4.0,7411.0,7411.0,14822.0,715804.0
8594,True,True,0,Pres_Poisson,0,1e-07,1.0,14822.0,14822.0,14822.0,715804.0


In [21]:
df = add_tt_mals_runtime_cols(df)

In [22]:
# no module is applied - baseline
baseline_df = df[(df['amd'] == False) & (df['rcm'] == False) & (df['partial_gauss'] == 0) & (df['padding'] == 0)]

# different levels of padding applied - padding_df
padding_df = df[(df['amd'] == False) & (df['rcm'] == False) & (df['partial_gauss'] == 0)]

# keep amd on and off - amd_df
amd_df = df[(df['rcm'] == False) & (df['partial_gauss'] == 0) & (df['padding'] == 0)]

# keep rcm on and off - rcm_df
rcm_df = df[(df['amd'] == False) & (df['partial_gauss'] == 0) & (df['padding'] == 0)]

# different levels of partial gauss - partial_gauss_df
partial_gauss_df = df[(df['amd'] == False) & (df['rcm'] == False) & (df['padding'] == 0)]

# 2-modules combined
amd_partial_gauss_df = df[(df['amd'] == True) & (df['rcm'] == False) & (df['padding'] == 0)]

In [23]:
df.head()

Unnamed: 0,amd,rcm,padding,matrix_name,partial_gauss,gauss_threshold,rank,tile_size,max_mode_size,n,z,log_obj_func,obj_func
0,False,True,10,ex15,10,1e-07,20642.0,3.0,109.0,6867.0,98477.0,29.542966,6766233000000.0
1,False,True,10,ex15,10,1e-07,7546.0,7.0,109.0,6867.0,98477.0,28.491123,2363402000000.0
2,False,True,10,ex15,10,1e-07,5480.0,9.0,109.0,6867.0,98477.0,28.344455,2040988000000.0
3,False,True,10,ex15,10,1e-07,2043.0,21.0,109.0,6867.0,98477.0,28.178758,1729335000000.0
4,False,True,10,ex15,10,1e-07,339.0,63.0,109.0,6867.0,98477.0,28.149163,1678905000000.0


Tile size choice effects (baseline)

In [24]:
# Tile size choice influence on TT-MALS runtime
fig = px.line(baseline_df, x="tile_size", y="log_obj_func", color='matrix_name', symbol="matrix_name", markers=True, log_x=True,
              labels={
                     "tile_size": "Tile size",
                     "matrix_name": "Matrix name",
                 }
              )
fig.update_layout(
    title={
        'text': "Influence of tile size choice on TT-MALS runtime",
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    plot_bgcolor='white',   # Plot area background color
    paper_bgcolor='white',  # Entire figure background color
    font=dict(color='black'), # Font color
    yaxis_title=r'$\log(I^6 + rI^3 + r^2I^2)$'
)
fig.show()
fig.write_image("plots/baseline_tile_size_vs_log_obj_func.pdf")

In [25]:
# trade-off between rank and max mode size - baseline
fig = px.line(baseline_df, x="rank", y="max_mode_size", color="matrix_name", symbol="matrix_name", log_x=True, log_y=True, 
                 labels={
                     "rank": "Rank (r)",
                     "matrix_name": "Matrix name",
                     "max_mode_size": "Maximum mode size (I)",
                 })
fig.update_layout(
    title={
        'text': "Trade-off between rank and maximum mode size",
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    plot_bgcolor='white',   # Plot area background color
    paper_bgcolor='white',  # Entire figure background color
    font=dict(color='black') # Font color
)
fig.show()

fig.write_image("plots/baseline_max_mode_size_vs_rank.pdf")

Explore effects of padding

In [26]:
# show different cases: when it helps and when it cannot improve the situation
line_plot_padding_tile_size_tt_mals_runtime_per_matrix(padding_df, "ex3")

In [27]:
line_plot_padding_tile_size_tt_mals_runtime_per_matrix(padding_df, "ex10hs")

In [28]:
line_plot_padding_tile_size_tt_mals_runtime_per_matrix(padding_df, "bcsstk13")

In [29]:
line_plot_padding_tile_size_tt_mals_runtime_per_matrix(padding_df, "Pres_Poisson")

Effects of RCM

In [30]:
rcm_df.sort_values(by=["tile_size"], inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [31]:
rcm_df

Unnamed: 0,amd,rcm,padding,matrix_name,partial_gauss,gauss_threshold,rank,tile_size,max_mode_size,n,z,log_obj_func,obj_func
4029,False,True,0,ex13,0,1.000000e-07,18980.0,2.0,107.0,2568.0,75628.0,29.362389,5.648374e+12
6136,False,False,0,ex10hs,0,1.000000e-07,29964.0,2.0,13.0,2548.0,57308.0,25.745868,1.518058e+11
4248,False,False,0,Pres_Poisson,0,1.000000e-07,241977.0,2.0,7411.0,14822.0,715804.0,53.464344,1.656798e+23
4245,False,True,0,Pres_Poisson,0,1.000000e-07,178951.0,2.0,7411.0,14822.0,715804.0,53.464335,1.656783e+23
4014,False,False,0,ex13,0,1.000000e-07,22354.0,2.0,107.0,2568.0,75628.0,29.611912,7.249195e+12
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2209,False,False,0,ex15,0,1.000000e-07,1.0,6867.0,6867.0,6867.0,98671.0,53.006896,1.048582e+23
4246,False,True,0,Pres_Poisson,0,1.000000e-07,4.0,7411.0,7411.0,14822.0,715804.0,53.464324,1.656765e+23
4249,False,False,0,Pres_Poisson,0,1.000000e-07,4.0,7411.0,7411.0,14822.0,715804.0,53.464324,1.656765e+23
4247,False,True,0,Pres_Poisson,0,1.000000e-07,1.0,14822.0,14822.0,14822.0,715804.0,57.623207,1.060330e+25


In [15]:
fig = px.bar(rcm_df, x="matrix_name", y="obj_func", color="rcm", barmode='group', text_auto=True, log_y=True,
                 labels={
                     "rank": "Rank (r)",
                     "matrix_name": "Matrix Name",
                     "max_mode_size": "Maximum Mode Size (I)",
                 })
fig.update_layout(
    # title={
    #     'text': "Influence of tile size choice and RCM on TT-MALS runtime",
    #     'x':0.5,
    #     'xanchor': 'center',
    #     'yanchor': 'top'
    # },
    plot_bgcolor='white',   # Plot area background color
    paper_bgcolor='white',  # Entire figure background color
    font=dict(color='black'), # Font color
    # yaxis_title=r'$\log(I^6 + rI^3 + r^2I^2)$'
)
fig.show()

# fig.write_image("plots/rcm_max_mode_size_vs_max_rank.pdf")

In [16]:
rcm_df = get_percentage_change_per_matrix(data_frame=rcm_df, result_column="rank_percentage", variable="rank", baseline_col="rcm", baseline_value=False)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



AssertionError: there should be only one unique, baseline value per matrix

In [23]:
# do some preprocessing to be able to show nonzero entry change for different matrices together - do it in terms of percentage

partial_gauss_df = get_percentage_change_per_matrix(data_frame=partial_gauss_df, result_column="z_percentage", variable="z", baseline_col="partial_gauss", baseline_value=0)
partial_gauss_df.sort_values(by="partial_gauss", inplace=True)

amd_partial_gauss_df = get_percentage_change_per_matrix(data_frame=amd_partial_gauss_df, result_column="z_percentage", variable="z", baseline_col="partial_gauss", baseline_value=0)
amd_partial_gauss_df.sort_values(by="partial_gauss", inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [24]:
fig = px.line(partial_gauss_df, x="partial_gauss", y="z_percentage", color="matrix_name", symbol="matrix_name", log_y=True,
                 labels={
                     "partial_gauss": "Number of eliminated variables",
                     "z_percentage": "Change in nonzero entries",
                     "matrix_name": "Matrix name",
                 })
fig.update_layout(
    title={
        'text': "Effect of variable elimination on nonzero entry count",
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    plot_bgcolor='white',   # Plot area background color
    paper_bgcolor='white',  # Entire figure background color
    font=dict(color='black'), # Font color
)
fig.show()

fig.write_image("plots/partial_gauss_nonzero_entries.pdf")

In [25]:
fig = px.line(amd_partial_gauss_df, x="partial_gauss", y="z_percentage", color="matrix_name", symbol="matrix_name", log_y=True,
                 labels={
                     "partial_gauss": "Number of eliminated variables",
                     "z_percentage": "Change in nonzero entries",
                     "matrix_name": "Matrix name",
                 })
fig.update_layout(
    title={
        'text': "Effect of variable elimination on nonzero entry count",
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    plot_bgcolor='white',   # Plot area background color
    paper_bgcolor='white',  # Entire figure background color
    font=dict(color='black'), # Font color
)
fig.show()

fig.write_image("plots/amd_partial_gauss_nonzero_entries.pdf")

In [91]:
# explore effects of padding: 
def line_plot_partial_gauss_tile_size_tt_mals_runtime(matrix_str: str, pg_df: pd.DataFrame):
    default_colorscale = px.colors.sequential.Jet
    colors = px.colors.sample_colorscale(default_colorscale, 11)
    fig = px.line(pg_df[pg_df["matrix_name"] == matrix_str], x="tile_size", y="log_obj_func", color="partial_gauss", symbol="partial_gauss", log_x=True, color_discrete_sequence=colors,
                     labels={
                         "tile_size": "Tile size",
                     })
    fig.update_layout(
        title={
            'text': "Influence of tile size choice and partial Gauss on TT-MALS runtime ({})".format(matrix_str),
            'x':0.5,
            'xanchor': 'center',
            'yanchor': 'top'
        },
        plot_bgcolor='white',   # Plot area background color
        paper_bgcolor='white',  # Entire figure background color
        font=dict(color='black'), # Font color
        yaxis_title=r'$\log(I^6 + rI^3 + r^2I^2)$'
    )
    fig.show()
    
    fig.write_image("plots/{}_partial_gauss_tile_size_vs_log_obj_func.pdf".format(matrix_str))