In [None]:
#import required libraries for analysis
import pandas as pd
import os

import plotly.express as px
import plotly.io as pio

pio.templates.default="simple_white"

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import multilabel_confusion_matrix

# Training Score Calculations

In [None]:
#collect one record per (results file, prediction column); the dataframe is
#built once after the loops because DataFrame.append was removed in pandas 2.0
#and copied the whole frame on every call
training_records = []

#loop through each training results file, sorted so the row order is reproducible
#NOTE(review): any directory entry whose name contains "training" is read as a
#csv - confirm nothing else in the working directory matches
for i in sorted(x for x in os.listdir() if "training" in x):
    working_df = pd.read_csv(i,index_col=0)

    #every column other than the ground truth holds one model's predictions
    columns_list = list(working_df.columns)
    columns_list.remove("y_train")

    #the ground truth is the same for every prediction column in this file
    y_true = working_df[["y_train"]]

    #loop through each column in the results file
    for j in columns_list:
        #the first two "_"-separated tokens of the column name are read as the
        #algorithm and the training set size (the size is kept as text)
        details = j.split("_")
        algo = details[0]
        size = details[1]

        y_pred = working_df[[j]]

        #calculate the accuracy (fraction of correctly classified rows)
        accuracy = accuracy_score(y_true, y_pred, normalize=True)

        training_records.append({
            "algo":algo,
            "dataset_size":size,
            "train_score":accuracy
        })

#create the dataframe holding the training performance calculations
training_results_df = pd.DataFrame(training_records,
                                   columns=["algo","dataset_size","train_score"])

# Test Score Calculations

In [None]:
#load the consolidated test predictions; the first csv column becomes the index
raw_results = pd.read_csv(
    "test_results_consolidated_15Feb22.csv",
    index_col=0,
)

In [None]:
#every column of the predictions matrix, as a plain python list
columns_list = raw_results.columns.tolist()

#the ground truth is not a prediction column, so take it out of the list
columns_list.remove("y_test")

In [None]:
#collect one record per prediction column; the dataframe is built once after
#the loop because DataFrame.append was removed in pandas 2.0 and copied the
#whole frame on every call
test_records = []

#the ground truth is the same for every prediction column
y_true = raw_results[["y_test"]]

#loop through all columns of algo and training sample size
for i in columns_list:
    #the first two "_"-separated tokens of the column name are read as the
    #algorithm and the training set size (the size is kept as text)
    details = i.split("_")
    algo = details[0]
    size = details[1]

    #create the prediction series
    y_pred = raw_results[[i]]

    #calculate the key metrics
    f_score = f1_score(y_true,y_pred,average="weighted")
    accuracy = accuracy_score(y_true, y_pred, normalize=True)
    bal_acc = balanced_accuracy_score(y_true, y_pred)

    test_records.append({
        "algo":algo,
        "dataset_size":size,
        "test_score":accuracy,
        "balanced_accuracy_score":bal_acc,
        "f1_score":f_score
    })

#create the dataframe holding the test performance calculations
test_results_df = pd.DataFrame(test_records,
                               columns=["algo","dataset_size","test_score",
                                        "balanced_accuracy_score","f1_score"])

In [None]:
#show the test metrics table (one row per algo / training set size column)
test_results_df

# Join DataFrames

In [None]:
#left join: keep every training row and attach the test metrics that share the
#same algo and training sample size, then add the "overfit" metric
#(training score minus test score)
results_df = (
    training_results_df
    .merge(test_results_df, how="left", on=["algo", "dataset_size"])
    .assign(overfit=lambda df: df["train_score"] - df["test_score"])
)

In [None]:
#show the joined training / test results including the "overfit" column
results_df

In [None]:
#persist the joined results table as a csv in the working directory
results_df.to_csv(path_or_buf="results_df_17Feb22.csv")

# Create Plots

In [None]:
#metric columns to plot, in display order
labels = ["train_score","test_score","balanced_accuracy_score","f1_score","overfit"]

#axis caption for each metric column
titles = dict(
    train_score="Training Score [-]",
    test_score="Testing Score [-]",
    balanced_accuracy_score="Balanced Accuracy Score [-]",
    f1_score="F1 Score [-]",
    overfit="Train-Test Delta [-]",
)

#one scatter plot per metric: training set size (log axis) against the metric
for metric in labels:
    #caption for the y axis of this metric
    axis_title = titles.get(metric)

    fig = px.scatter(
        results_df,
        x="dataset_size",
        y=metric,
        color="algo",
        log_x=True,
        range_x=[10,100000],
        width=640,
        height=640,
    )

    #boxed axes with gridlines on both directions
    fig.update_xaxes(
        title="Training set size [observations]",
        showgrid=True,
        showline=True,
        linewidth=1,
        mirror=True,
    )
    fig.update_yaxes(
        title=axis_title,
        showgrid=True,
        showline=True,
        linewidth=1,
        mirror=True,
    )

    #large markers with a heavy outline
    fig.update_traces(marker=dict(size=15, symbol=134, line_width=3))

    #horizontal, bordered legend centred just above the plotting area
    fig.update_layout(
        legend={
            "orientation": "h",
            "yanchor": "bottom",
            "y": 1.02,
            "xanchor": "center",
            "x": 0.5,
            "title": "Algorithm:",
            "bordercolor": "black",
            "borderwidth": 1,
        }
    )

    fig.show()

In [None]:
#training score against test score per algorithm, to examine overfitting
fig = px.scatter(
    results_df,
    x="train_score",
    y="test_score",
    color="algo",
    range_x=[0.4,1],
    range_y=[0.4,1],
    width=640,
    height=640,
)

#dashed diagonal where train and test scores are equal; points under it have a
#lower test score than training score
fig.add_shape(
    type="line",
    x0=0.4,
    y0=0.4,
    x1=1,
    y1=1,
    line={"dash": "dash", "color": "black", "width": 3},
)

#boxed axes with gridlines on both directions
fig.update_xaxes(title="Training Score [-]",
                 showgrid=True, showline=True, linewidth=1, mirror=True)
fig.update_yaxes(title="Testing Score [-]",
                 showgrid=True, showline=True, linewidth=1, mirror=True)

#large markers with a heavy outline
fig.update_traces(marker=dict(size=15, symbol=134, line_width=3))

#vertical, bordered legend pinned inside the top-left corner; this call is the
#last expression of the cell, so the figure it returns is what gets displayed
fig.update_layout(
    legend={
        "orientation": "v",
        "yanchor": "top",
        "y": 0.99,
        "xanchor": "left",
        "x": 0.01,
        "title": "Algorithm:",
        "bordercolor": "black",
        "borderwidth": 1,
    }
)

In [None]:
#create a series of boxplots to examine the distribution of each metric per
#algorithm; every column except the two identifier columns is treated as a metric
for i in results_df.columns.drop(["algo","dataset_size"]):
    #fall back to the raw column name when no caption is registered, so the axis
    #is never labelled with the literal text "None"
    title=titles.get(i, i)

    fig = px.box(results_df,
                 x="algo",
                 y=i,
                 height=760,
                 width=760)

    #boxed axes with gridlines on both directions
    fig.update_xaxes(
        showgrid=True,
        showline=True,
        linewidth=1,
        mirror=True,
        title="Algorithm"
        )

    fig.update_yaxes(
        showgrid=True,
        showline=True,
        linewidth=1,
        mirror=True,
        title=f"{title}"
        )

    fig.show()