In [1]:
import pandas as pd # standard
import numpy as np # standard
from sklearn import tree # package to make decision tree
from sklearn.metrics import accuracy_score # for accuracy calculation
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import roc_auc_score

import matplotlib.pyplot as plt

In [2]:
df = pd.read_excel("/Users/avery/OneDrive/Documents/GitHub/Clinical_TLB_2023-2024/lung_cancer_tlb.xlsx")

# replace NA with control
df['CancerType'] = np.where(df['CancerType'].isna(), 'Control', df['CancerType'])

# keep only Control and Adenocarcinoma for analysis
df_tree = df[(df['CancerType'] == 'Control') | (df['CancerType'] == 'Adenocarcinoma')]
df_tree = df_tree.reset_index(drop=True)

In [3]:
# length of df
num_rows = df_tree.shape[0]

# number of bootstraps
total_bootstraps = 1

# create results df
hyperparameter_tuning_df = pd.DataFrame(columns=["max_depth", "max_features", 'min_leaves', "balanced_accuracy", 'auc'])

# create array of all indices in full data set
all_indices = np.arange(num_rows)

# columns to drop
drop_cols = ['sample_id', 'pub_id', 'CancerType']

# loop to bootstrap and validate many times
for i in range(total_bootstraps):

    # sample indices with replacement of df
    train_indices = np.random.choice(num_rows, num_rows, replace = True)
    # get the train set using the indices
    train_set = df_tree.iloc[train_indices, : ]
    # get the indices not selected
    test_indices = np.setdiff1d(all_indices, train_indices)
    # use not selected indices as the train set
    test_set = df_tree.iloc[test_indices, : ]

    for depth in range(1, 16):
        for features in range(1, 452, 5):
            for leaves in range(1,6):

                # initialize decision tree
                clf = tree.DecisionTreeClassifier(max_depth= depth, max_features=features, min_samples_leaf= leaves)

                # train and test tree
                clf = clf.fit( train_set.drop(drop_cols, axis = 1), train_set['CancerType'])
                test_predictions = clf.predict(test_set.drop(drop_cols, axis = 1))

                # calculate balanced accuracy
                balanced_acc = balanced_accuracy_score(test_set['CancerType'], test_predictions)

                # get probabilities
                test_probabilities = clf.predict_proba(test_set.drop(drop_cols, axis = 1))

                # test decision tree
                test_predictions = clf.predict(test_set.drop(drop_cols, axis = 1))

                # calculate weighted accuracy
                balanced_acc = balanced_accuracy_score(test_set['CancerType'], test_predictions)

                # calculate AUC
                auc = roc_auc_score(test_set['CancerType'] == 'Control', test_probabilities[:, 1])

                # append to results df
                hyperparameter_tuning_df.loc[len(hyperparameter_tuning_df)] = [depth, features, leaves, balanced_acc, auc]




In [5]:
hyperparameter_tuning_df.sort_values('balanced_accuracy', ascending=False).head(3)

Unnamed: 0,max_depth,max_features,min_leaves,balanced_accuracy,auc
1867,5.0,46.0,3.0,0.752415,0.740338
3833,9.0,191.0,4.0,0.736715,0.706522
3262,8.0,76.0,3.0,0.736715,0.724638


In [6]:
hyperparameter_tuning_df.sort_values('auc', ascending=False).head(3)

Unnamed: 0,max_depth,max_features,min_leaves,balanced_accuracy,auc
3198,8.0,11.0,4.0,0.675121,0.7657
2331,6.0,56.0,2.0,0.724638,0.746377
3682,9.0,41.0,3.0,0.708937,0.742754


In [None]:
# assume train/test split is already done

'''hyperparameter_tuning_df = pd.DataFrame(columns=["max_depth", "max_features", 'min_leaves', "balanced_accuracy"])

for depth in range(1, 16):
    for features in range(1, 452, 5):
        for leaves in range(1,6):

            # initialize decision tree
            clf = tree.DecisionTreeClassifier(max_depth= depth, max_features=features, min_samples_leaf= leaves)

            # train and test tree
            clf = clf.fit( train_set.drop(drop_cols, axis = 1), train_set['CancerType'])
            test_predictions = clf.predict(test_set.drop(drop_cols, axis = 1))

            # calculate balanced accuracy
            balanced_acc = balanced_accuracy_score(test_set['CancerType'], test_predictions)

            # append to df
            hyperparameter_tuning_df.loc[len(hyperparameter_tuning_df)] = [depth, features, leaves, balanced_acc]
'''

In [18]:
hyperparameter_tuning_df.sort_values('balanced_accuracy', ascending=False).head(1).iloc[0, 0:3]

clf = tree.DecisionTreeClassifier(max_depth= 5, max_features=46, min_samples_leaf= 3)

# train and test tree
clf = clf.fit( df_tree.drop(drop_cols, axis = 1), df_tree['CancerType'])

feature_importance1 = clf.feature_importances_

temps = df_tree.drop(['CancerType', 'sample_id', 'pub_id'], axis = 1).columns.str.replace('T', '')
temps = temps.astype(float)
feature_importance = pd.DataFrame({"Temperature":temps, "Importance": feature_importance1})