In [None]:
import os
import re
import pickle
from IPython.display import display_html
import pandas as pd
import altair as alt
from scipy import stats

In [None]:
# Restore the base paths under which files are stored.
# They were pickled as a 3-item sequence in Step 0.

with open('pickledHomeScratchShared.pickle', mode="rb") as pathsFile:
    basePaths = pickle.load(pathsFile)
baseHomePath, baseScratchPath, baseSharedPath = basePaths

## Section 1 : Analysis between `all_model_scores...` pickle files in `model_scores`

In [None]:
# Print the score files whose names start with "all" so one can be chosen below.

fileNameList = sorted(os.listdir(baseSharedPath+'/model_scores'))
for candidateName in fileNameList:
    if re.match('^all.+',candidateName):
        print(candidateName)


In [None]:
# Pick one of the files listed above and load its contents.
fileName = 'all_model_scores_Pre-trained_5fold_seed696_20230225_032515.pickle'
n_splits = 5  # number of K-Fold iterations expected in the score files
viewChoices = ['Transverse','Coronal','Sagittal']
allModelsScoresPath = "{}/{}".format(baseSharedPath+'/model_scores',fileName)
with open(allModelsScoresPath, "rb") as f:
    allModelsScores = pickle.load(f)

In [None]:
# Map each experiment label ("scores_<dataMode>_<seed>") to the unpickled
# contents of its score file.
scoresDict = dict()

scoresVarList = ["scores_16-stitched_14","scores_16-stitched_42","scores_16-stitched_696"
                 ,"scores_Pre-trained_14","scores_Pre-trained_42","scores_Pre-trained_696"
                 ,"scores_Processed_14","scores_Processed_42","scores_Processed_696"]

# Only the "all..." files belong to these labels. The directory also contains
# other result files (best..., cnn..., lear...), so filter before pairing.
allScoreFileNames = sorted(name for name in fileNameList if re.match('^all.+',name))

# Labels and files are paired purely by sorted position, so a different number
# of files would silently attach scores to the wrong label. Fail loudly instead.
if len(allScoreFileNames) != len(scoresVarList):
    raise ValueError(
        "Expected {} 'all...' score files to match scoresVarList but found {}: {}".format(
            len(scoresVarList), len(allScoreFileNames), allScoreFileNames))

# Load pickled file contents into scoresDict.
# A dedicated loop name is used so the `fileName` chosen in an earlier cell is not overwritten.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
for scores,scoreFileName in zip(scoresVarList,allScoreFileNames):
    with open("{}/{}".format(baseSharedPath+'/model_scores',scoreFileName), "rb") as f:
        scoresDict[scores] = pickle.load(f)
        


In [None]:
# Display the confusion matrices stored in `allModelsScores` (the file loaded above):
# one output row per K-Fold iteration, with one inline table per view.
# NOTE(review): captions assume the dict keys are ordered like `viewChoices`
# (Transverse, Coronal, Sagittal) — confirm against the code that wrote the pickle.

# The last item of each per-fold record is used as the confusion matrix. Keys and
# values are both read from `allModelsScores` so they cannot get out of step.
conf_mat_list = [[allModelsScores[key][i][-1] for key in allModelsScores] for i in range(n_splits)]
space = "\xa0" * 10  # non-breaking spaces used as a gap between the inline tables
for i, iteration in enumerate(conf_mat_list, start=1):
    confT,confC,confS = iteration  # exactly three views are expected
    stylers = [
        confMat.style.set_table_attributes("style='display:inline'").set_caption('Iteration {}-{}'.format(i,view))
        for view, confMat in zip(viewChoices, (confT,confC,confS))
    ]
    display_html(space.join(styler._repr_html_() for styler in stylers), raw=True)

In [None]:
# Tabulate the per-fold scores of the "scores_16-stitched_14" experiment and
# display one table per view side by side.
# NOTE(review): captions assume the dict keys are ordered Transverse, Coronal,
# Sagittal — confirm against the code that wrote the pickle.
selectedScores = scoresDict["scores_16-stitched_14"]
scoreColumns = ["ActualNegativesInTrain" ,"ActualPositivesInTrain"
                ,"ActualNegativesInTest", "ActualPositivesInTest"
                ,"f1Score","precision","recall","accuracy"]

print('Below are results for a {} K-Fold validation training'.format(n_splits))
# Every item of a fold record except the last becomes a column. Keys and values
# are both taken from `selectedScores` so they always belong together.
modelDfT,modelDfC,modelDfS = [pd.DataFrame([item[0:-1] for item in selectedScores[key]]
             ,columns=scoreColumns) for key in selectedScores]

modelDfT_styler = modelDfT.style.set_table_attributes("style='display:inline'").set_caption('Transverse')
modelDfC_styler = modelDfC.style.set_table_attributes("style='display:inline'").set_caption('Coronal')
modelDfS_styler = modelDfS.style.set_table_attributes("style='display:inline'").set_caption('Sagittal')

space = "\xa0" * 10  # non-breaking spaces used as a gap between the inline tables
display_html(modelDfT_styler._repr_html_()+space+modelDfC_styler._repr_html_()+space+modelDfS_styler._repr_html_()
             , raw=True)

In [None]:
# Create a master DataFrame that has the scores from all iterations of every
# experiment in scoresDict, tagged with plane, dataMode and seedNum.
scoreColumns = ["ActualNegativesInTrain" ,"ActualPositivesInTrain"
                ,"ActualNegativesInTest", "ActualPositivesInTest"
                ,"f1Score","precision","recall","accuracy"]

experimentFrames = []
for dictKey in scoresDict:
    viewScores = scoresDict[dictKey]
    viewFrames = []
    for key in viewScores:
        # Every item of a fold record except the last becomes a column.
        rows = [item[0:-1] for item in viewScores[key]]
        # Number the iterations from the data itself instead of assuming 5 folds.
        viewFrames.append(pd.DataFrame(rows, columns=scoreColumns, index=range(1, len(rows)+1)))
    # Exactly three views are expected.
    # NOTE(review): assumes key order Transverse, Coronal, Sagittal — confirm.
    modelDfT,modelDfC,modelDfS = viewFrames
    # Labels have the form "scores_<dataMode>_<seed>".
    _,dataMode,seedNum = dictKey.split("_")
    modelDfT['plane'] = 'Transverse'
    modelDfC['plane'] = 'Coronal'
    modelDfS['plane'] = 'Sagittal'
    # Turn the 1-based index into an explicit 'Iteration' column.
    concatenated = pd.concat([modelDfT, modelDfC, modelDfS]).rename_axis('Iteration').reset_index()
    concatenated['dataMode'] = dataMode
    concatenated['seedNum'] = seedNum
    experimentFrames.append(concatenated)

# Concatenate once at the end rather than growing the frame inside the loop.
# An empty scoresDict yields None, as before.
masterDf = pd.concat(experimentFrames) if experimentFrames else None

masterDf

In [None]:
# Recall per plane: orange bar = standard deviation, black dot = mean,
# faceted by data representation (columns) and seed (rows).
recallAxis = alt.Axis(tickMinStep=0.5, tickCount=20)
stdDev = alt.Chart(masterDf).mark_errorbar(extent='stdev',color='darkorange').encode(
    x=alt.X('recall:Q', axis=recallAxis, title='Recall'),
    y=alt.Y('plane:N', title='Plane'),
)

means = alt.Chart(masterDf).mark_circle(color='black').encode(
    x=alt.X('mean(recall):Q', title='Recall'),
    y=alt.Y('plane:N', title='Plane'),
)

dataModeHeader = alt.Header(title='Data representation used in model',titleFontSize=16)
seedHeader = alt.Header(title='Seed Number/Random State of experiment',titleFontSize=16)

recallLayers = alt.layer(stdDev, means).properties(width = 200, height = 200)
recallFacets = recallLayers.facet(
    column = alt.Column('dataMode:N', header=dataModeHeader),
    row= alt.Row('seedNum:N', header=seedHeader),
)
recallFacets.resolve_axis(x='independent', y='independent').configure_header(labelFontSize=16)

## Section 2 : Sensitivity analysis of 6 parameters using pickle files in `sen_analysis`

Based on choices of kernel size, number of filters, number of neurons in the fully connected layer, learning rate, L2 regularization and dropout fraction.

In [None]:
# Print the sensitivity-analysis files to choose from.
# These files have been generated from Step 4.1
senAnalysisFiles = sorted(os.listdir(baseSharedPath+'/sen_analysis'))
for candidateName in senAnalysisFiles:
    print(candidateName)

In [None]:
# Load all the scores back into memory from the pickle files shown above.

def _loadSenScores(pickleName):
    """Return the unpickled contents of one file in the `sen_analysis` folder.

    pickleName: file name (not a path) inside `<baseSharedPath>/sen_analysis`.
    """
    # NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
    with open("{}/{}".format(baseSharedPath+'/sen_analysis',pickleName), "rb") as f:
        return pickle.load(f)


kernelSenScores = _loadSenScores('kernelSenScores_5fold_seed42_20230225_050448.pickle')
numFiltersSenScores = _loadSenScores('numFiltersSenScores_5fold_seed42_20230225_051611.pickle')
denseNeuronSenScores = _loadSenScores('denseNeuronSenScores_5fold_seed42_20230225_052744.pickle')
learnRateSenScores = _loadSenScores('learnRateSenScores_5fold_seed42_20230225_054503.pickle')
l2RegSenScores = _loadSenScores('l2RegSenScores_5fold_seed42_20230225_055811.pickle')
dropOutSenScores = _loadSenScores('dropOutSenScores_5fold_seed42_20230225_060637.pickle')
    

In [None]:
# Helper functions to generate accChart, recallChart and precisionChart from a DataFrame with xVar and 3 yVars

def _metricChart(df, xVar, yVar, title, domain):
    """Return one layered chart: a 'ci' error band with the mean line drawn on top."""
    xEncoding = '{}:N'.format(xVar)
    band = alt.Chart(df).mark_errorband(extent='ci',color='lightblue').encode(
        x=xEncoding,
        y=alt.Y('{}'.format(yVar), title=title, scale=alt.Scale(domain=list(domain))),
    )
    # Same title on both layers so the shared axis shows a single label.
    line = alt.Chart(df).mark_line(color='darkorange').encode(
        x=xEncoding,
        y=alt.Y('mean({})'.format(yVar), title=title, scale=alt.Scale(domain=list(domain))),
    )
    # Band first, line second: later layers are drawn on top.
    return (band + line).properties(width = 250)


def generateChart(df,xVar,yVar1,yVar2,yVar3,accYScale=[0.82, 0.92]
                  ,recallYScale=(0.0, 0.92),precisionYScale=(0.0, 0.92)):
    """Build the accuracy, recall and precision charts for one swept parameter.

    Each chart plots the mean of a metric (orange line) over an error band
    (light blue, `extent='ci'`) against `xVar`, which is encoded as nominal.

    Parameters
    ----------
    df : DataFrame holding the column `xVar` and the three metric columns.
    xVar : name of the column with the swept parameter values.
    yVar1, yVar2, yVar3 : column names charted under the axis titles
        'Accuracy', 'Recall' and 'Precision' respectively.
    accYScale : [min, max] y-domain of the accuracy chart.
    recallYScale, precisionYScale : [min, max] y-domains of the recall and
        precision charts; the defaults keep the previous fixed range.

    Returns
    -------
    tuple (accChart, recallChart, precisionChart) of layered charts, each 250 wide.
    """
    accChart = _metricChart(df, xVar, yVar1, 'Accuracy', accYScale)
    recallChart = _metricChart(df, xVar, yVar2, 'Recall', recallYScale)
    precisionChart = _metricChart(df, xVar, yVar3, 'Precision', precisionYScale)

    return (accChart,recallChart,precisionChart)


In [None]:
# 1 - Kernel size: tabulate the scores, then chart the three metrics side by side
kernelSenColumns = ['Kernel_Size','Iteration',"f1Score","precision","recall","accuracy"]
kernelSenScoresDf = pd.DataFrame(kernelSenScores,columns = kernelSenColumns)

kernelCharts = generateChart(kernelSenScoresDf,'Kernel_Size','accuracy','recall','precision')
accChartKernel, recallChartKernel, precisionChartKernel = kernelCharts

kernelSubtitle = [
    'This data has been created using 5-fold cross validation using the "Coronal" view on the "Preprocessed dataset"',
    'The blue space represents the confidence interval from gathering data over 5-fold using seed as 42.',
    'The orange line represents the mean value of the metric (i.e. Accuracy, recall and Precision respectively)',
]
kernelTitle = {
    'text': 'Effect of Accuracy, Recall and Precision with increasing kernel size of 1st CNN layer from 3 through 11',
    'subtitle' : kernelSubtitle,
    'fontSize':16,
    'subtitleFontSize':12,
}

(accChartKernel | recallChartKernel | precisionChartKernel).properties(title=kernelTitle)

In [None]:
# 2 - Number of filters: tabulate the scores, then chart the three metrics side by side
numFiltersSenColumns = ['Num_Filters','Iteration',"f1Score","precision","recall","accuracy"]
numFiltersSenScoresDf = pd.DataFrame(numFiltersSenScores,columns = numFiltersSenColumns)

numFiltersCharts = generateChart(numFiltersSenScoresDf,'Num_Filters','accuracy','recall','precision')
accChartNumFilters, recallChartNumFilters, precisionChartNumFilters = numFiltersCharts

numFiltersSubtitle = [
    'This data has been created using 5-fold cross validation using the "Coronal" view on the "Preprocessed dataset"',
    'The blue space represents the confidence interval from gathering data over 5-fold using seed as 42.',
    'The orange line represents the mean value of the metric (i.e. Accuracy, recall and Precision respectively)',
]
numFiltersTitle = {
    'text': 'Effect of Accuracy, Recall and Precision with increasing number of filters of 1st CNN layer from 8 through 128',
    'subtitle' : numFiltersSubtitle,
    'fontSize':16,
    'subtitleFontSize':12,
}

(accChartNumFilters | recallChartNumFilters | precisionChartNumFilters).properties(title=numFiltersTitle)

In [None]:
# 3 - denseNeuronSenScores DataFrame: scores for each tested number of dense-layer neurons
denseNeuronSenScoresDf = pd.DataFrame(denseNeuronSenScores,columns = ['Dense_Neurons','Iteration',"f1Score","precision","recall","accuracy"])

accChartDenseNeurons, recallChartDenseNeurons, precisionChartDenseNeurons = generateChart(denseNeuronSenScoresDf
                                                                        ,'Dense_Neurons','accuracy','recall','precision')

# The title names "neurons": this chart sweeps the Dense layer, not the CNN filters
# (the previous wording was copied over from the filters chart).
(accChartDenseNeurons | recallChartDenseNeurons | precisionChartDenseNeurons).properties(
title={
        'text': 'Effect of Accuracy, Recall and Precision with increasing number of neurons of first Dense layer from 8 through 128',
        'subtitle' : ['This data has been created using 5-fold cross validation using the "Coronal" view on the "Preprocessed dataset"'
                      ,'The blue space represents the confidence interval from gathering data over 5-fold using seed as 42.'
                     ,'The orange line represents the mean value of the metric (i.e. Accuracy, recall and Precision respectively)'],
        'fontSize':16,
        'subtitleFontSize':12,
    }
)

In [None]:
# 4 - Learning rate: tabulate the scores, then chart the three metrics side by side
learnRateSenColumns = ['Learn_Rate','Iteration',"f1Score","precision","recall","accuracy"]
learnRateSenScoresDf = pd.DataFrame(learnRateSenScores,columns = learnRateSenColumns)

# Accuracy axis uses a wider range here than the helper's default.
learnRateCharts = generateChart(learnRateSenScoresDf,'Learn_Rate','accuracy','recall','precision',accYScale=[0.7,1.0])
accChartLearnRate, recallChartLearnRate, precisionChartLearnRate = learnRateCharts

learnRateSubtitle = [
    'This data has been created using 5-fold cross validation using the "Coronal" view on the "Preprocessed dataset"',
    'The blue space represents the confidence interval from gathering data over 5-fold using seed as 42.',
    'The orange line represents the mean value of the metric (i.e. Accuracy, recall and Precision respectively)',
]
learnRateTitle = {
    'text': 'Effect of Accuracy, Recall and Precision with increasing learning rate on "Adam" optimizer from 0.0001 through 1',
    'subtitle' : learnRateSubtitle,
    'fontSize':16,
    'subtitleFontSize':12,
}

(accChartLearnRate | recallChartLearnRate | precisionChartLearnRate).properties(title=learnRateTitle)

In [None]:
# 5 - L2 regularization: tabulate the scores, then chart the three metrics side by side
l2RegSenColumns = ['L2_Regularizer','Iteration',"f1Score","precision","recall","accuracy"]
l2RegSenScoresDf = pd.DataFrame(l2RegSenScores,columns = l2RegSenColumns)

# Accuracy axis uses a wider range here than the helper's default.
l2RegCharts = generateChart(l2RegSenScoresDf,'L2_Regularizer','accuracy','recall','precision',accYScale=[0.7,1.0])
accChartL2Reg, recallChartL2Reg, precisionChartL2Reg = l2RegCharts

l2RegSubtitle = [
    'This data has been created using 5-fold cross validation using the "Coronal" view on the "Preprocessed dataset"',
    'The blue space represents the confidence interval from gathering data over 5-fold using seed as 42.',
    'The orange line represents the mean value of the metric (i.e. Accuracy, recall and Precision respectively)',
]
l2RegTitle = {
    'text': 'Effect of Accuracy, Recall and Precision with increasing L2 regularization on applicable layers from 0.001 through 0.5',
    'subtitle' : l2RegSubtitle,
    'fontSize':16,
    'subtitleFontSize':12,
}

(accChartL2Reg | recallChartL2Reg | precisionChartL2Reg).properties(title=l2RegTitle)

In [None]:
# 6 - Drop out: tabulate the scores, then chart the three metrics side by side
dropOutSenColumns = ['Drop_Out','Iteration',"f1Score","precision","recall","accuracy"]
dropOutSenScoresDf = pd.DataFrame(dropOutSenScores,columns = dropOutSenColumns)

# Accuracy axis uses a wider range here than the helper's default.
dropOutCharts = generateChart(dropOutSenScoresDf,'Drop_Out','accuracy','recall','precision',accYScale=[0.7,1.0])
accChartDropOut, recallChartDropOut, precisionChartDropOut = dropOutCharts

dropOutSubtitle = [
    'This data has been created using 5-fold cross validation using the "Coronal" view on the "Preprocessed dataset"',
    'The blue space represents the confidence interval from gathering data over 5-fold using seed as 42.',
    'The orange line represents the mean value of the metric (i.e. Accuracy, recall and Precision respectively)',
]
dropOutTitle = {
    'text': 'Effect of Accuracy, Recall and Precision with increasing Drop out fraction on applicable layers from 0.05 through 0.5',
    'subtitle' : dropOutSubtitle,
    'fontSize':16,
    'subtitleFontSize':12,
}

(accChartDropOut | recallChartDropOut | precisionChartDropOut).properties(title=dropOutTitle)

## Section 3 : Analysis of `best_model...` pickle file in `model_scores`

In [None]:
# Print the score files whose names start with "best" so one can be chosen below.

fileNameList = sorted(os.listdir(baseSharedPath+'/model_scores'))
for candidateName in fileNameList:
    if re.match('^best.+',candidateName):
        print(candidateName)


In [None]:
# Load the cross-validation scores of the selected best-model file.
fileName = 'best_model_scores_Coronal_Processed_5fold_seed42_20230226_042907.pickle'
bestModelPath = "{}/{}".format(baseSharedPath+'/model_scores',fileName)
with open(bestModelPath, "rb") as f:
    modelCVScores = pickle.load(f)

In [None]:
# Display the confusion matrix of every cross-validation iteration of the
# best model. The last item of each record is used as the matrix.
conf_mat_list = [modelCVScore[-1] for modelCVScore in modelCVScores]
for i, conf_matx in enumerate(conf_mat_list, start=1):
    conf_matx_styler = conf_matx.style.set_table_attributes("style='display:inline'").set_caption('Iteration {}-{}'.format(i,'Coronal'))
    display_html(conf_matx_styler._repr_html_()
             , raw=True)

In [None]:
# Display the scores from each cross-validation iteration of the best model.

# Take the fold count from the loaded data so the row labels and caption always
# match the file, rather than assuming 5 iterations.
numFolds = len(modelCVScores)

# Every item of a record except the last becomes a column.
modelDfC = pd.DataFrame([item[0:-1] for item in modelCVScores]
            ,columns= ["ActualNegativesInTrain" ,"ActualPositivesInTrain"
                              ,"ActualNegativesInTest", "ActualPositivesInTest"
                        ,"f1Score","precision","recall","accuracy"]
            ,index=['Iteration {}'.format(i) for i in range(1,numFolds+1)])
modelDfC_styler = modelDfC.style.set_table_attributes("style='display:inline'").set_caption(
    '{}-fold cross validation results in each iteration for best "Coronal"+"Processed" model using seed=42'.format(numFolds)
    ).set_table_styles([{
                            'selector': 'caption',
                            'props': [('font-size', '20px')]
                        }])
display_html(modelDfC_styler._repr_html_()
         , raw=True)

In [None]:
# Average of every score column across the cross-validation iterations.
modelDfC.mean(axis=0)

## Section 4 : Compare each of the scores from CNN vs baseline models

In [None]:
# Print the score files whose names start with "cnn" so one can be chosen below.

fileNameList = sorted(os.listdir(baseSharedPath+'/model_scores'))
for candidateName in fileNameList:
    if re.match('^cnn.+',candidateName):
        print(candidateName)


In [None]:
# Load the CNN-versus-baseline scores from the selected file.
fileName = 'cnn_vs_rest_5fold_seed42_20230226_061528.pickle'
cnnVsRestPath = "{}/{}".format(baseSharedPath+'/model_scores',fileName)
with open(cnnVsRestPath, "rb") as f:
    cnnVsRest = pickle.load(f)

In [None]:
# Flatten the {model type: [records]} dictionary into one row per record and
# reshape it to long form (one row per model/metric/value) for altair.
# Items 4 up to, but not including, the last one of a record are the four metrics.
cnnVsRestList = [
    [modelType, *foldResult[4:-1]]
    for modelType, foldResults in cnnVsRest.items()
    for foldResult in foldResults
]
metricColumns = ["Model Type","f1Score","precision","recall","accuracy"]
cnnVsRestListDf = pd.DataFrame(cnnVsRestList,columns=metricColumns)
meltedDf = cnnVsRestListDf.melt(id_vars=["Model Type"], var_name='Metric')
meltedDf.head()

In [None]:
# Bar chart of the mean value of each metric, one column of bars per model type
meanBars = alt.Chart(meltedDf).mark_bar()
meanChart = meanBars.encode(
    x=alt.X('Metric:N'),
    y=alt.Y('mean(value):Q'),
    color=alt.Color('Model Type:N'),
    column=alt.Column('Model Type:N'),
).properties(width = 120)

meanChart


In [None]:
# Statistical significance of results of CNN vs others:
# an independent two-sample t-test per (metric, baseline model) pair.

listOfModels = ['DummyClassifier', 'GaussianNaiveBayes', 'RandomForest']
listOfMetrics = list(meltedDf['Metric'].unique())

# Collect one record per comparison and build the DataFrame once at the end,
# instead of growing it cell by cell with .loc.
comparisonRecords = []
for metric in listOfMetrics:
    metricRows = meltedDf[meltedDf['Metric'] == metric]
    # The CNN values depend only on the metric, so select them once per metric.
    CNNValue = metricRows[metricRows['Model Type'] == 'CNN']['value']
    for model in listOfModels:
        CompareValue = metricRows[metricRows['Model Type'] == model]['value']
        # Baseline is the first sample, so a negative statistic means the
        # baseline's mean is below the CNN's.
        result = stats.ttest_ind(CompareValue,CNNValue)
        comparisonRecords.append({
            'Model Compared With': model,
            'Metric': metric,
            't-statistic': round(result.statistic,2),
            'p-value': round(result.pvalue,2),
        })

compare_df = pd.DataFrame(comparisonRecords
                          ,columns=['Model Compared With', 'Metric', 't-statistic', 'p-value'])

# Display compare_df
compare_df


## Section 5 : Learning curve with standard deviation

In [None]:
# Print the score files whose names start with "lear" so one can be chosen below.

fileNameList = sorted(os.listdir(baseSharedPath+'/model_scores'))
for candidateName in fileNameList:
    if re.match('^lear.+',candidateName):
        print(candidateName)


In [None]:
# Load the learning-curve records from the selected file.
fileName = 'learning_curve_5fold_seed42_20230227_130500.pickle'
learnCurvePath = "{}/{}".format(baseSharedPath+'/model_scores',fileName)
with open(learnCurvePath, "rb") as f:
    learnCurve = pickle.load(f)

In [None]:
# Tabulate the learning-curve records, then keep only Iteration, Size and the four metrics.
learnCurveColumns = ["Iteration","Size","ActualNegativesInTrain" ,"ActualPositivesInTrain"
                     ,"ActualNegativesInTest", "ActualPositivesInTest","f1Score","precision","recall","accuracy","conf_matrix"]
unusedColumns = ["ActualNegativesInTrain" ,"ActualPositivesInTrain"
                 ,"ActualNegativesInTest", "ActualPositivesInTest","conf_matrix"]
learnCurveDf = pd.DataFrame(learnCurve,columns=learnCurveColumns).drop(columns=unusedColumns)
learnCurveDf.head()

In [None]:
# Learning curve: recall against dataset size, with standard-deviation bars (orange),
# mean points (black) and a line joining the means (blue).
learnCurveBase = alt.Chart(learnCurveDf)

stdDev = learnCurveBase.mark_errorbar(extent='stdev',color='darkorange').encode(
    x=alt.X('Size:O',title=''),
    y=alt.Y('recall:Q',title=''),
)

means = learnCurveBase.mark_circle(color='black').encode(
    x=alt.X('Size:O',title=''),
    y=alt.Y('mean(recall):Q',title=''),
)

# Only this layer carries the axis titles; the other two use empty titles.
line = learnCurveBase.mark_line(color='blue').encode(
    x=alt.X('Size:O', title='Size of total dataset (80% split to train and 20% to validate)'),
    y=alt.Y('mean(recall):Q', title='Recall score on validation data'),
)

learnCurveTitle = {
    'text': 'Effect of Training data size on recall of the model ',
    'fontSize':16,
    'subtitleFontSize':12,
}

alt.layer(stdDev, means, line).properties(
    width = 400, height = 300
).properties(title=learnCurveTitle)