In [None]:
import pandas as pd # standard
import numpy as np # standard
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score # for accuracy calculation
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import roc_auc_score

import matplotlib.pyplot as plt
import seaborn as sns

import thermogram_utilities

import warnings
# Silence every warning for the rest of the session. This keeps cell output
# short, but it also hides deprecation and convergence warnings raised by
# pandas / scikit-learn in the cells below.
warnings.filterwarnings("ignore")

In [None]:
# Load the lung-cancer TLB workbook.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file location exists.
df = pd.read_excel("/Users/avery/OneDrive/Documents/GitHub/Clinical_TLB_2023-2024/lung_cancer_tlb.xlsx")

# Collapse the label to two classes: rows whose CancerType is missing or is
# already "Control" become 'Control'; every other value becomes 'Other'.
is_control = df['CancerType'].isna() | (df['CancerType'] == "Control")
df['CancerType'] = np.where(is_control, 'Control', 'Other')

# Display the resulting class balance.
df['CancerType'].value_counts()

In [None]:

# Locate the feature window: columns from "T51" up to, but not including,
# "T83.1" (np.arange excludes its stop value).
lower_column_index = df.columns.get_loc("T51")
upper_column_index = df.columns.get_loc("T83.1")
label_column_index = df.columns.get_loc("CancerType")

# Column order of the reduced frame: the feature window, then the first two
# columns of the sheet, then the label.
# NOTE(review): columns 0 and 1 are presumably 'sample_id' and 'pub_id' (the
# names dropped before model fitting) -- confirm against the workbook.
column_indices = np.append(
    np.arange(lower_column_index, upper_column_index),
    [0, 1, label_column_index],
)

# Keep only the selected columns. `df` itself is overwritten here, so this
# cell cannot be re-run without first re-running the load cell.
df = df.iloc[:, column_indices]

# Frame used for modelling, with a fresh 0..n-1 index.
df_tree = df.reset_index(drop=True)

In [None]:
# Bootstrap sensitivity analysis of random-forest hyperparameters.
#
# For every bootstrap replicate a training set is drawn with replacement from
# df_tree, and the rows that were never drawn form the test set. Each
# combination of n_estimators x max_depth x max_features is fitted on that
# same split and scored with balanced accuracy and ROC AUC. Results are
# checkpointed to Excel after every replicate.
metric_columns = ['Weighted Accuracy', 'AUC', 'n_estimators', "max_depth", "max_features"]
performance_metrics = pd.DataFrame(columns=metric_columns)

# Result rows are collected in a plain list and converted to a DataFrame once
# per checkpoint; enlarging a DataFrame one row at a time copies it on every
# append.
metric_rows = []

# Seed NumPy's global generator so the bootstrap draws are repeatable. The
# forests below are built without an explicit random_state; per the
# scikit-learn documentation they then draw from this same global generator,
# so the seed is expected to fix the forest randomness as well.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# set number of bootstraps
total_bootstraps = 1000

# length of df
num_rows = df_tree.shape[0]

# create array of all indices in full data set
all_indices = np.arange(num_rows)

# identifier and label columns that must not be used as features
drop_cols = ['sample_id', 'pub_id', 'CancerType']


# loop for specified iterations
for i in range(total_bootstraps):

    # randomly select indices (with replacement) to use as train set
    train_indices = np.random.choice(num_rows, num_rows, replace = True)

    # get the train set using the indices
    train_set = df_tree.iloc[train_indices, : ]

    # rows never drawn into the train set become the test set
    test_indices = np.setdiff1d(all_indices, train_indices)

    # get test set using test indices
    test_set = df_tree.iloc[test_indices, :]

    # The split is identical for every hyperparameter combination, so the
    # feature matrices and labels are built once per bootstrap instead of
    # once per fit.
    X_train = train_set.drop(drop_cols, axis = 1)
    y_train = train_set['CancerType']
    X_test = test_set.drop(drop_cols, axis = 1)
    y_test = test_set['CancerType']
    y_test_is_other = y_test == 'Other'

    for trees in [100, 250, 500, 1000]:

        for depth in [num_rows//2, None]:

            for features in ["sqrt", "log2", None]:

                # initialize random forest with the current hyperparameters
                clf = RandomForestClassifier(n_estimators=trees, max_depth=depth, max_features=features, n_jobs = -1)

                # train forest
                clf = clf.fit(X_train, y_train)

                # class probabilities for the test rows
                test_probabilities = clf.predict_proba(X_test)

                # predicted labels for the test rows
                test_predictions = clf.predict(X_test)

                # calculate weighted (balanced) accuracy
                balanced_acc = balanced_accuracy_score(y_test, test_predictions)

                # Look up which probability column belongs to 'Other' rather
                # than assuming it is column 1.
                # NOTE(review): .index raises ValueError if a bootstrap draw
                # contains no 'Other' rows, and AUC is undefined if the test
                # rows contain a single class.
                other_column = list(clf.classes_).index('Other')

                # calculate AUC with 'Other' as the positive class
                auc = roc_auc_score(y_test_is_other, test_probabilities[:, other_column])

                # record accuracy, auc and the hyperparameters that produced them
                metric_rows.append([balanced_acc, auc, trees, depth, features])

    # checkpoint everything collected so far after each bootstrap
    performance_metrics = pd.DataFrame(metric_rows, columns=metric_columns)
    performance_metrics.to_excel('sensitivity_analysis_control_all.xlsx', index=False)

    print(i)



In [None]:
# Show the class labels of `clf`, i.e. the last forest fitted in the bootstrap
# loop. Per the scikit-learn documentation the columns of predict_proba follow
# this order, so this shows which label each probability column refers to.
clf.classes_

In [11]:
# Load the saved sensitivity-analysis results.
sensitivity_analysis_df = pd.read_excel("sensitivity_analysis_control_all.xlsx")

# Missing hyperparameter values (None was written to the file as an empty
# cell) are replaced by the string 'None' so that they form their own group
# instead of being left out of the groupby below.
for hyper_parameter in ("max_features", "max_depth"):
    sensitivity_analysis_df[hyper_parameter] = sensitivity_analysis_df[hyper_parameter].fillna('None')

# Mean of every metric for each hyperparameter combination.
hyper_parameter_keys = ['n_estimators', 'max_depth', 'max_features']
result = sensitivity_analysis_df.groupby(hyper_parameter_keys).mean()

In [None]:
# Tag every result row with the bootstrap replicate it came from.
#
# Each replicate contributes one row per hyperparameter combination
# (4 n_estimators x 2 max_depth x 3 max_features = 24), written in order, so
# integer-dividing the row position by 24 recovers the replicate number.
# Deriving the labels from the actual row count keeps them aligned when the
# results file holds fewer than 1000 replicates (e.g. an interrupted run);
# for a complete run this equals np.repeat(np.arange(0, 1000), 24).
combinations_per_bootstrap = 4 * 2 * 3
boot_strap_number = np.arange(len(sensitivity_analysis_df)) // combinations_per_bootstrap
boot_strap_number = pd.Series(boot_strap_number)

# The Series is unnamed, so it becomes column 0 of the combined frame.
result = pd.concat([boot_strap_number, sensitivity_analysis_df], axis=1)

# Replace missing hyperparameter values with the string 'None' in BOTH key
# columns. groupby drops rows whose key is NaN by default, so leaving
# max_depth unfilled would silently discard every max_depth=None row if
# sensitivity_analysis_df was reloaded from disk without the earlier fill.
for hyper_parameter in ("max_depth", "max_features"):
    result[hyper_parameter] = result[hyper_parameter].fillna('None')

# Mean of every remaining column for each hyperparameter combination.
result_1 = result.groupby(['n_estimators', 'max_depth', 'max_features']).mean()

We use sort_values() to sort the DataFrame by 'Weighted Accuracy' in descending order.
We then use groupby(0) to group the sorted rows by bootstrap number, which is stored in the column labelled 0.
Finally, we use .head(1) to select the first row within each group, which corresponds to the hyperparameter combination with the highest 'Weighted Accuracy' for that bootstrap.

In [None]:
# Rank all rows from best to worst balanced accuracy ...
ranked_by_accuracy = result.sort_values(by='Weighted Accuracy', ascending=False)

# ... then keep the top-ranked row of each bootstrap (column 0), i.e. the
# winning hyperparameter combination for that replicate.
hyper_parameter_combinations = ranked_by_accuracy.groupby(0).head(1)

hyper_parameter_combinations

In [None]:
# Spread of balanced accuracy for each forest size, over every row of `result`.
sns.boxplot(
    x='n_estimators',
    y='Weighted Accuracy',
    data=result,
)

In [None]:
# Spread of balanced accuracy for each max_depth setting, over every row of `result`.
sns.boxplot(
    x='max_depth',
    y='Weighted Accuracy',
    data=result,
)


In [None]:
# Spread of balanced accuracy for each max_features setting, over every row of `result`.
sns.boxplot(
    x='max_features',
    y='Weighted Accuracy',
    data=result,
)


In [None]:
# How often each forest size was the best-scoring choice across bootstraps.
winning_forest_sizes = hyper_parameter_combinations['n_estimators']
winning_forest_sizes.value_counts()

In [None]:
# Re-read the saved results from disk. This replaces the in-memory frame, so
# any 'None' placeholders filled in by earlier cells are gone and empty cells
# in the file show up as missing values again.
results_workbook = "sensitivity_analysis_control_all.xlsx"
sensitivity_analysis_df = pd.read_excel(results_workbook)
