In [1]:
#import libraries
import pandas as pd
from sklearn import tree # package to make decision tree/forest
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from sklearn.model_selection import StratifiedKFold


#from sklearn.tree import export_graphviz
import numpy as np
#import graphviz

import warnings
# Silence every warning for the remainder of the session.
# NOTE(review): this is global, so warnings raised by later cells are hidden too.
warnings.filterwarnings(action="ignore")

# Load the dataset from the Excel workbook into `df`.
# NOTE(review): absolute, user-specific path; it only resolves on a machine
# with this exact directory layout.
data_path = "/Users/avery/OneDrive/Documents/GitHub/Clinical_TLB_2023-2024/lung_cancer_tlb.xlsx"
df = pd.read_excel(data_path)

In [2]:
# SET UP
# Rows whose CancerType is missing are labelled 'Control'.
cancer_type = df['CancerType']
df['CancerType'] = np.where(cancer_type.isna(), 'Control', cancer_type)

# Keep only the two classes being compared: Control and Adenocarcinoma.
keep_mask = df['CancerType'].isin(['Control', 'Adenocarcinoma'])
df_tree = df[keep_mask]

In [3]:
# Separate frames for labels and features. pub_id is present in both frames;
# CancerType and sample_id are removed from the features.
labels = df_tree[['pub_id', 'CancerType']]
features = df_tree.drop(columns=['CancerType', 'sample_id'])

Here is a simple train/test split

In [4]:
# Split into training and testing sets (80/20), stratified on CancerType.
# random_state is fixed so the split, and therefore the reported accuracy,
# is the same on every Restart & Run All.
train_set_features, test_set_features, train_set_labels, test_set_labels = train_test_split(
    features,
    labels,
    test_size=0.2,
    stratify=labels['CancerType'],
    random_state=42,
)

# Fit a decision tree on the training rows. pub_id is dropped so it is not used
# as a predictor, and the target is passed as a 1-D Series. The tree is seeded
# as well because scikit-learn permutes features randomly at each split.
clf = tree.DecisionTreeClassifier(random_state=42)
clf = clf.fit(train_set_features.drop('pub_id', axis=1), train_set_labels['CancerType'])

# Predict the held-out rows and score them; accuracy_score takes (y_true, y_pred).
test_set_predictions = clf.predict(test_set_features.drop('pub_id', axis=1))
accuracy = accuracy_score(test_set_labels['CancerType'], test_set_predictions)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.68


Here is a stratified k-fold cross-validation approach

In [4]:
# Five shuffled, stratified folds with a fixed seed
stratified_kfold = StratifiedKFold(shuffle=True, random_state=42, n_splits=5)

# Inputs handed to split(): both frames with the pub_id column removed
fold_inputs = features.drop(columns='pub_id')
fold_targets = labels.drop(columns='pub_id')
fold_splits = stratified_kfold.split(fold_inputs, fold_targets)

# Print the train and test row positions of every fold (fold numbers shown 1-based)
for fold, (train_indices, test_indices) in enumerate(fold_splits):
    print(f"Fold {fold + 1} - Train Indices:", train_indices)
    print(f"Fold {fold + 1} - Test Indices:", test_indices)
    print("\n")

Fold 1 - Train Indices: [  1   2   4   5   7   8   9  10  11  13  15  17  18  19  21  23  24  25
  26  27  28  29  30  31  32  34  35  36  37  38  39  40  41  42  44  46
  47  48  49  50  51  52  53  55  56  57  58  59  60  62  64  65  66  67
  68  69  70  71  72  73  74  75  77  78  79  80  81  82  84  86  88  89
  90  92  93  94  95  96  97  99 100 103 105 106 107 108 109 110 112 113
 114 116 117 118 119 120 121 122]
Fold 1 - Test Indices: [  0   3   6  12  14  16  20  22  33  43  45  54  61  63  76  83  85  87
  91  98 101 102 104 111 115]


Fold 2 - Train Indices: [  0   1   3   5   6   7   8   9  11  12  13  14  15  16  17  18  19  20
  21  22  24  25  26  27  28  30  31  32  33  34  35  36  37  38  40  42
  43  44  45  47  48  49  50  53  54  56  57  58  59  60  61  62  63  64
  65  66  67  70  72  74  75  76  77  78  79  80  81  83  84  85  86  87
  88  89  90  91  92  96  97  98  99 100 101 102 103 104 106 107 108 109
 111 114 115 116 117 118 120 122]
Fold 2 - Test Indices: [  

In [5]:
# Collect the train/test row positions of every fold from the StratifiedKFold
# object. Rows are gathered in a list and the DataFrame is built once, rather
# than enlarging an empty frame with .loc on each iteration.
cv_rows = []

# loop through each fold in the SKF object; the target is passed as the 1-D
# CancerType column, which is the shape split() expects for y
for fold, (train_indices, test_indices) in enumerate(
    stratified_kfold.split(features.drop('pub_id', axis=1), labels['CancerType'])
):
    cv_rows.append({'Fold': fold, 'Train': train_indices, 'Test': test_indices})

cv_indices_df = pd.DataFrame(cv_rows, columns=['Fold', 'Train', 'Test'])

# Positional lookup: row = fold, column 1 = 'Train', column 2 = 'Test'.
# The stored positions can be used to slice the feature frame like this:
#features.iloc[cv_indices_df.iloc[1, 1]]

# Training positions of the first fold
cv_indices_df.iloc[0,1]


array([  1,   2,   4,   5,   7,   8,   9,  10,  11,  13,  15,  17,  18,
        19,  21,  23,  24,  25,  26,  27,  28,  29,  30,  31,  32,  34,
        35,  36,  37,  38,  39,  40,  41,  42,  44,  46,  47,  48,  49,
        50,  51,  52,  53,  55,  56,  57,  58,  59,  60,  62,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  77,  78,  79,
        80,  81,  82,  84,  86,  88,  89,  90,  92,  93,  94,  95,  96,
        97,  99, 100, 103, 105, 106, 107, 108, 109, 110, 112, 113, 114,
       116, 117, 118, 119, 120, 121, 122])

In [18]:
# DISABLED DRAFT: everything between the triple quotes below is a string
# literal, not executed code. As the last expression of the cell, the string is
# only echoed back as the cell output.
# NOTE(review): inside the draft, train_set_labels is indexed with
# `train_indices` while the features use `train_set_indicies`; fix that before
# re-enabling this code.
# split into separate dfs for features and labels
'''features = df_tree.drop(['CancerType', 'sample_id'], axis=1)
labels = df_tree[['pub_id', 'CancerType']]

folds = 5
fold = 0
# loop through number of folds
#for fold in range(folds):

    # get train set indices
train_set_indicies = cv_indices_df.iloc[fold, 1]

# get test set indices
test_set_indicies = cv_indices_df.iloc[fold, 2]

# get train and test dfs
train_set_features = features.iloc[train_set_indicies]
train_set_labels = labels.iloc[train_indices]

test_set_features = features.iloc[test_set_indicies]
test_set_labels = labels.iloc[test_set_indicies]

# train decision tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit( train_set_features.drop('pub_id', axis=1), train_set_labels.drop('pub_id', axis=1) )

# predict decision tree
test_set_predictions = clf.predict(test_set_features.drop('pub_id', axis=1))

# calculate Accuracy, AUC
accuracy = accuracy_score(test_set_predictions, test_set_labels['CancerType'])

print(accuracy)'''



"test_set_features = features.iloc[test_set_indicies]\ntest_set_labels = labels.iloc[test_set_indicies]\n\n# train decision tree\nclf = tree.DecisionTreeClassifier()\nclf = clf.fit( train_set_features.drop('pub_id', axis=1), train_set_labels.drop('pub_id', axis=1) )\n\n# predict decision tree\ntest_set_predictions = clf.predict(test_set_features.drop('pub_id', axis=1))\n\n# calculate Accuracy, AUC\naccuracy = accuracy_score(test_set_predictions, test_set_labels['CancerType'])\n\nprint(accuracy)"

In [14]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold

# Stratified k-fold evaluation of a decision tree on df_tree.
# Define the number of splits and the random seed. The same seed drives the
# fold assignment and the decision trees, so re-running the cell reproduces
# the same accuracies.
n_splits = 5
random_seed = 42
skf = StratifiedKFold(n_splits=n_splits, random_state=random_seed, shuffle=True)

# Train and test row positions of each fold, in fold order
train_indices_list = []
test_indices_list = []

# One record per fold; turned into performance_metrics after the loop instead
# of enlarging an empty DataFrame row by row
metric_rows = []

# loop through the splits: record the indices, then train and score one tree per fold
for fold, (train_indices, test_indices) in enumerate(skf.split(df_tree, df_tree['CancerType'])):

    # append train and test indices to their lists
    train_indices_list.append(train_indices)
    test_indices_list.append(test_indices)

    # get training and testing dataframes (sample_id and pub_id are not used as predictors)
    train_df = df_tree.iloc[train_indices].drop(['sample_id', 'pub_id'], axis=1)
    test_df = df_tree.iloc[test_indices].drop(['sample_id', 'pub_id'], axis=1)

    # train the decision tree using the train set; seeded because scikit-learn
    # permutes features randomly at each split
    clf = tree.DecisionTreeClassifier(random_state=random_seed)
    clf = clf.fit(train_df.drop('CancerType', axis=1), train_df['CancerType'])

    # predict the test set
    test_predictions = clf.predict(test_df.drop('CancerType', axis=1))

    # compare predictions to labels; accuracy_score takes (y_true, y_pred)
    accuracy = accuracy_score(test_df['CancerType'], test_predictions)

    # store train/test indices and accuracy
    metric_rows.append({'Train': train_indices, 'Test': test_indices, 'Test Accuracy': accuracy})

# Per-fold results: the index arrays used and the accuracy on the held-out fold
performance_metrics = pd.DataFrame(metric_rows, columns=['Train', 'Test', 'Test Accuracy'])