In [1]:
import pandas as pd # standard
import numpy as np # standard
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score # for accuracy calculation
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import roc_auc_score

import matplotlib.pyplot as plt
import seaborn as sns

import thermogram_utilities

import warnings
# Silence every warning for the rest of the kernel session. No category or
# message filter is given, so this is a blanket "ignore": warnings raised by
# any imported library are hidden along with the ones from this notebook.
warnings.filterwarnings("ignore")

In [2]:
# Load the study workbook into a DataFrame.
workbook_path = "/Users/avery/OneDrive/Documents/GitHub/Clinical_TLB_2023-2024/lung_cancer_tlb.xlsx"
df = pd.read_excel(workbook_path)

# Rows with no recorded CancerType are labelled 'Control'; every other row
# keeps the value it already has.
cancer_type_missing = df['CancerType'].isna()
df['CancerType'] = np.where(cancer_type_missing, 'Control', df['CancerType'])

In [3]:
# Positions of the columns that bound the feature window, and of the label.
column_positions = df.columns
lower_column_index = column_positions.get_loc("T51")
upper_column_index = column_positions.get_loc("T83.1")
label_column_index = column_positions.get_loc("CancerType")

# Selection order: the feature window first (half-open, so the "T83.1" column
# itself is excluded), then the columns at positions 0 and 1, then the label.
# NOTE(review): positions 0 and 1 are presumably the pub_id / sample_id
# identifier columns -- confirm against the workbook layout.
column_indices = np.concatenate([
    np.arange(lower_column_index, upper_column_index),
    [0, 1, label_column_index],
])

df = df.iloc[:, column_indices]

# Restrict the analysis to the two classes being compared and renumber the
# rows from zero.
classes_of_interest = ['Control', 'Adenocarcinoma']
df_tree = df[df['CancerType'].isin(classes_of_interest)].reset_index(drop=True)

In [4]:
# Encode the target as binary: Adenocarcinoma -> 1, Control -> 0.
# `isin` also matches the already-encoded value 1, so re-running this cell
# leaves the labels untouched. Comparing only against the string would turn
# every row into 0 on a second run, because the column then holds integers.
df_tree['CancerType'] = df_tree['CancerType'].isin(["Adenocarcinoma", 1]).astype(int)

# Remove the identifier columns so that only the feature columns and the
# target remain. errors="ignore" makes a re-run a no-op instead of a KeyError
# once the columns are already gone.
df_tree = df_tree.drop(columns=["pub_id", "sample_id"], errors="ignore")

In [5]:
# Out-of-bag bootstrap evaluation of a random-forest hyper-parameter grid.
#
# For each bootstrap replicate the rows of df_tree are resampled with
# replacement to form the training set; the rows that were never drawn form
# the test set. Every grid combination is fitted on the training set and
# scored on the test set (balanced accuracy and ROC AUC). One result row is
# recorded per (replicate, combination).

metric_columns = ['Weighted Accuracy', 'AUC', 'n_estimators', "max_depth", "max_features"]
performance_metrics = pd.DataFrame(columns=metric_columns)

# Result rows are collected in a plain list and turned into a DataFrame once
# per replicate; enlarging a DataFrame one row at a time copies it on every
# insertion.
metric_rows = []

# Set number of bootstraps
total_bootstraps = 1000

# Fixed seed so that the resamples and the forests are reproducible
bootstrap_seed = 42
rng = np.random.default_rng(bootstrap_seed)

# Length of df
num_rows = df_tree.shape[0]

# Create array of all indices in the full data set
all_indices = np.arange(num_rows)

# Hyper-parameter grid: number of trees x depth cap x features tried per split
# (max_depth=None leaves the trees unbounded; max_features=None uses every
# feature at each split)
param_grid = [
    (trees, depth, features)
    for trees in [100, 250, 500, 1000]
    for depth in [num_rows//2, None]
    for features in ["sqrt", "log2", None]
]

# Loop for specified iterations
for i in range(total_bootstraps):

    # Randomly select indices (with replacement) to use as the train set
    train_indices = rng.choice(num_rows, size=num_rows, replace=True)

    # Get the train set using the indices
    train_set = df_tree.iloc[train_indices, :]

    # Indices never drawn for training form the out-of-bag test set
    test_indices = np.setdiff1d(all_indices, train_indices)

    # Get the test set using test indices
    test_set = df_tree.iloc[test_indices, :]

    train_labels = train_set['CancerType']
    test_labels = test_set['CancerType']

    # Both classes are needed on both sides: a one-class training set yields a
    # single predict_proba column, and roc_auc_score raises ValueError when the
    # test labels contain only one class. Skip such replicates rather than
    # abort the whole run.
    if train_labels.nunique() < 2 or test_labels.nunique() < 2:
        print(f"{i}: skipped (resample is missing a class)")
        continue

    # Drop 'CancerType' column from train and test sets
    train_set_no_target = train_set.drop('CancerType', axis=1)
    test_set_no_target = test_set.drop('CancerType', axis=1)

    for trees, depth, features in param_grid:

        # Initialize the random forest for this grid point; the per-fit seed
        # is drawn from the seeded generator so every forest is reproducible
        clf = RandomForestClassifier(
            n_estimators=trees,
            max_depth=depth,
            max_features=features,
            n_jobs=-1,
            random_state=int(rng.integers(2**32 - 1)),
        )

        # Train the random forest
        clf = clf.fit(train_set_no_target, train_labels)

        # Class probabilities on the out-of-bag rows (column 1 is class 1)
        test_probabilities = clf.predict_proba(test_set_no_target)

        # Hard class predictions on the out-of-bag rows
        test_predictions = clf.predict(test_set_no_target)

        # Calculate weighted (balanced) accuracy
        balanced_acc = balanced_accuracy_score(test_labels, test_predictions)

        # Calculate AUC
        auc = roc_auc_score(test_labels, test_probabilities[:, 1])

        # Record accuracy, AUC and the grid point that produced them
        metric_rows.append([balanced_acc, auc, trees, depth, features])

    # Checkpoint after every replicate so an interrupted run keeps its results
    performance_metrics = pd.DataFrame(metric_rows, columns=metric_columns)
    performance_metrics.to_excel('Adeno_Control.xlsx', index=False)

    print(i)


KeyboardInterrupt: 