# Sensitivity Analysis of Starting Threshold Value

### Threshold value specified in the hierarchical clustering algorithm: the metric used to determine whether a new cluster should be formed based on already existing clusters.
### Feature trajectories were clustered using *scipy.cluster.hierarchy.fclusterdata* 
### based on:
### - Euclidean distances between trajectories
### - *weighted* linkage method
### - *distance* criterion
###
### Here we explored the impact of varying the starting t-val on the number of features within clusters and the number of clusters per patient

### After this experiment, a starting t-val of 1.75 was employed


In [None]:
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
root_dir = '../Output/Sensitivity/'


def _t_val_sort_key(run_name):
    """Sort key ordering run folders by the numeric t-val in their name.

    Names that do not parse as a number after removing 'Run-' are placed
    last, in alphabetical order.
    """
    try:
        return (0, float(run_name.replace('Run-', '')), run_name)
    except ValueError:
        return (1, 0.0, run_name)


# os.listdir returns entries in arbitrary order; sort them so the resulting
# table (and anything plotted from it row by row) is ordered by t-val.
runs = sorted(os.listdir(root_dir), key=_t_val_sort_key)
print(runs)

# loop through all runs and read the cluster summary file
frames = []
for run in runs:
    file_path = f'{root_dir}{run}/Clustering/ClusterSummary.txt'
    if not os.path.isfile(file_path):
        # Stray entries in the output folder (or incomplete runs) have no
        # summary file; report and skip them instead of aborting the loop.
        print(f'Skipping {run}: no cluster summary at {file_path}')
        continue

    # The separator is a regular expression, so it is written as a raw string
    # and the python engine (the only one supporting regex separators) is
    # requested explicitly to avoid the fallback ParserWarning.
    df = pd.read_csv(file_path, sep=r': ', engine='python')

    # read_csv consumes the first line of the file as the header. After the
    # transpose, row 0 holds the keys of the remaining lines and row 1 their
    # values, while the row labels are the two fields of that first line.
    # Using row 0 as column names and keeping only row 1 therefore leaves a
    # single row whose index label is the value from the file's first line;
    # a later cell recovers it with reset_index.
    df = df.T
    df.columns = df.iloc[0]
    df = df.iloc[1:].copy()  # copy so the column insert below does not act on a slice

    # t-val is taken from the folder name and placed as the first column
    df.insert(0, 'T-val', run.replace('Run-', ''))
    frames.append(df)

# Concatenate once at the end rather than growing the frame inside the loop.
df_res = pd.concat(frames) if frames else pd.DataFrame()

df_res

In [None]:
# Each "Range of ..." entry is split on whitespace into two parts, which
# become the minimum and maximum columns.
cluster_bounds = df_res['Range of clusters'].str.split(expand=True)
df_res[['Min clusters', 'Max clusters']] = cluster_bounds
for bound_col in ('Min clusters', 'Max clusters'):
    df_res[bound_col] = df_res[bound_col].astype(int)

# Same treatment for the range of features per cluster, stored as floats.
feature_bounds = df_res['Range of features per cluster'].str.split(expand=True)
df_res[['Min features', 'Max features']] = feature_bounds.astype(float)

# Shorten the mean column's name and make it numeric.
df_res = df_res.rename(
    columns={'Mean number of clusters per patient': 'Mean clusters per patient'}
)
df_res['Mean clusters per patient'] = df_res['Mean clusters per patient'].astype(float)

# The original range strings are no longer needed once split.
df_res = df_res.drop(columns=['Range of clusters', 'Range of features per cluster'])

In [None]:
# Cast every column (T-val included) to float in one go.
df_res = df_res.astype(float)

# reset_index moves the row labels into a new column called 'index'. This
# notebook treats those labels as the mean number of stable clusters.
# NOTE(review): presumably the labels come from the first line of each
# ClusterSummary.txt being read as the CSV header in the loading cell - confirm.
df_res = df_res.reset_index()
df_res['index'] = df_res['index'].astype(float)

# Both the long summary name (if such a column is present) and the recovered
# 'index' column are renamed to the short plotting label.
df_res = df_res.rename(
    columns={
        'Mean number of stable clusters per patient': 'Mean stable clusters',
        'index': 'Mean stable clusters',
    }
)

### Plot of number of clusters per patient against starting t-val

In [None]:
# plot number of clusters per patient against starting t-val
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(figsize=(10, 6))

# sns.lineplot sorts by x before drawing, but fill_between connects points in
# row order. Sort once here so the shaded band lines up with the mean line
# even when the runs were loaded in arbitrary order.
plot_df = df_res.sort_values('T-val')

# dashed lines for the min and max number of clusters (one shared legend entry)
sns.lineplot(data=plot_df, x='T-val', y='Min clusters', linestyle='--', color='black', ax=ax)
sns.lineplot(data=plot_df, x='T-val', y='Max clusters', linestyle='--', color='black', label='Min/Max', ax=ax)

# mean with +/- one standard deviation as a shaded band
sns.lineplot(data=plot_df, x='T-val', y='Mean clusters per patient', color='blue', label='Mean', ax=ax)
mean_clusters = plot_df['Mean clusters per patient']
std_clusters = plot_df['Std of clusters']
ax.fill_between(plot_df['T-val'], mean_clusters - std_clusters, mean_clusters + std_clusters,
                color='blue', alpha=0.2, label='Std')

sns.lineplot(data=plot_df, x='T-val', y='Mean stable clusters', color='red', label='Stable clusters', ax=ax)
ax.legend(fontsize='large')

ax.set_title('Number clusters per patient', fontsize='large')
ax.set_xlabel('T-val', fontsize='large')
ax.set_ylabel('Number of clusters', fontsize='large')
fig.tight_layout()

# create the output folder if needed so savefig does not fail on a fresh checkout
out_path = '../SubmissionVisuals/t-valSM/clusters-per-patient.png'
os.makedirs(os.path.dirname(out_path), exist_ok=True)
fig.savefig(out_path)
plt.show()

### Plot of mean number of features per cluster per patient against starting t-val

In [None]:
# repeat for number of features per cluster
fig, ax = plt.subplots(figsize=(10, 6))

# sns.lineplot sorts by x before drawing, but fill_between connects points in
# row order. Sort once here so the shaded band lines up with the mean line
# even when the runs were loaded in arbitrary order.
plot_df = df_res.sort_values('T-val')

# dashed lines for the min and max number of features (one shared legend entry)
sns.lineplot(data=plot_df, x='T-val', y='Min features', linestyle='--', color='black', ax=ax)
sns.lineplot(data=plot_df, x='T-val', y='Max features', linestyle='--', color='black', label='Min/Max', ax=ax)

# mean with +/- one standard deviation as a shaded band
sns.lineplot(data=plot_df, x='T-val', y='Mean features per cluster per patient', color='blue', label='Mean', ax=ax)
mean_features = plot_df['Mean features per cluster per patient']
std_features = plot_df['Std features per cluster per patient']
ax.fill_between(plot_df['T-val'], mean_features - std_features, mean_features + std_features,
                color='blue', alpha=0.2, label='Std')
ax.legend(fontsize='large')

ax.set_title('Mean number features per cluster per patient', fontsize='large')
ax.set_xlabel('T-val', fontsize='large')
ax.set_ylabel('Number of features', fontsize='large')
fig.tight_layout()

# create the output folder if needed so savefig does not fail on a fresh checkout
out_path = '../SubmissionVisuals/t-valSM/features-per-cluster.png'
os.makedirs(os.path.dirname(out_path), exist_ok=True)
fig.savefig(out_path)
plt.show()