In [6]:
# data processing and utils
import pandas as pd
import itertools
import os
from tqdm import tqdm
import numpy as np
from collections import Counter
import re
import csv
from collections import defaultdict

# math and ml
from scipy.spatial import distance
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, accuracy_score, classification_report, f1_score
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
import scipy.stats as ss
import math
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import LeaveOneOut
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import math
from sklearn.model_selection import train_test_split
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import StandardScaler

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.tree import export_graphviz
import pydot

In [2]:
# Load the comprehensive biomarkers table (tab-separated file).
df = pd.read_csv(
    'files/IGTP NANOSTRING AND GLOBAL TABLE.tsv',
    sep='\t',
)

# Biomarker group 7; 'DKK1' is deliberately left out of this group.
bgroup7_BX = [
    'C1QB',
    'DLK',
    'C1QB_10',
    'DLK_5',
    'DLK_1',
]

In [3]:
# Prospective cohort: stratified 70/30 split for the EFS study.
#
# The outputs that matter downstream are EFS_idx_train / EFS_idx_test (the
# DataFrame row labels of each partition). X_train, X_test, y_train and y_test
# produced here are rebuilt from the biomarker columns in the next cell.
features = ['CHIC_CAT','AGE_8','GENDER_M1','PRETEXT','AFP','Vascular Invasion','Multifocality','D_METASTASIS','EFS','Diagnosis_DEF_3cat']

# Rows with 'Retrospective1/Prospective2' == 2 and 'FU_2yrorEFS1' == 1.
# A single .loc selection replaces the chained df[mask][cols] indexing.
prospective_efs_rows = (df['Retrospective1/Prospective2'] == 2) & (df['FU_2yrorEFS1'] == 1)
a = df.loc[prospective_efs_rows, features]
y = a['EFS']  # stratification target
a = a.drop(columns=['EFS', 'Diagnosis_DEF_3cat'])

# Standardise every remaining column. The previous call passed a stray
# positional `4`, which DataFrameMapper forwards as `y`; StandardScaler ignores
# `y`, so dropping it leaves the scaled values unchanged.
# NOTE(review): the scaler is fit on the full cohort *before* the split. That
# is harmless only while the X_train / X_test created below are not used for
# modelling; if they ever are, fit the scaler on the training rows only.
mapper = DataFrameMapper([(a.columns, StandardScaler())])
scaled_features = mapper.fit_transform(a.copy())
a = pd.DataFrame(scaled_features, index=a.index, columns=a.columns)
X = a.to_numpy()
idx = a.index

# Splitting `idx` alongside X and y yields the row labels of each partition.
X_train, X_test, y_train, y_test, EFS_idx_train, EFS_idx_test = train_test_split(
    X, y, idx, test_size=0.3, random_state=42, stratify=y
)

In [4]:
# Build the biomarker train / test tables for the selected configuration.
configs_BX = {'7': bgroup7_BX}

config = '7'
bgroups_BX = configs_BX[config]

# Columns kept in both tables: the biomarkers of the group plus the label.
bx_columns = bgroups_BX + ['EFS']

# Row filters, named once and reused below.
fu_mask = df['FU_2yrorEFS1'] == 1
retro_mask = df['Retrospective1/Prospective2'] == 1
split_train_mask = df.index.isin(EFS_idx_train)
split_test_mask = df.index.isin(EFS_idx_test)

# Training table = rows flagged 1 in 'Retrospective1/Prospective2' + the
# training rows of the earlier split; rows with any missing value are dropped.
EFS_train_BX = pd.concat(
    [
        df.loc[fu_mask & retro_mask, bx_columns],
        df.loc[fu_mask & split_train_mask, bx_columns],
    ]
).dropna()

# Test table = the held-out rows of the earlier split, complete cases only.
EFS_test_BX = df.loc[fu_mask & split_test_mask, bx_columns].dropna()

X_train, y_train = EFS_train_BX.drop(columns=['EFS']), EFS_train_BX['EFS']
X_test, y_test = EFS_test_BX.drop(columns=['EFS']), EFS_test_BX['EFS']

print('Train set', len(y_train))
print('Test set', len(y_test))

Train set 47
Test set 14


In [5]:
# Exhaustive search over every non-empty subset of the group-7 biomarkers.
# For each subset a balanced random forest is fit on EFS_train_BX and scored on
# EFS_test_BX; per-subset artefacts (set sizes, classification report, figure)
# are written to OUTPUT_DIR and the accuracy / F1 scores are printed.

# The 'ESF' spelling is kept so previously generated outputs stay in place.
OUTPUT_DIR = 'output_ML/ESF_BX_plasma_RF/'
TOP_N = 10  # maximum number of bars shown in the feature-importance panel

# Create the output folder up front so a fresh checkout does not fail on the
# first open() / to_csv() / savefig() call.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# The labels do not depend on the biomarker subset, so select them once.
y_train = EFS_train_BX['EFS']
y_test = EFS_test_BX['EFS']

for subset_size in range(1, len(bgroup7_BX) + 1):
    for subset in itertools.combinations(bgroup7_BX, subset_size):

        features = list(subset)
        tag = 'EFS_BX_7_' + ','.join(features)

        X_train = EFS_train_BX[features]
        X_test = EFS_test_BX[features]

        # Record the sizes of both sets next to the other artefacts.
        with open(OUTPUT_DIR + tag + '_sets.txt', 'w') as f:
            f.write('Train set ' + str(len(y_train)) + '\n')
            f.write('Test set ' + str(len(y_test)) + '\n')

        # Fixed random_state keeps each run reproducible.
        classifier = BalancedRandomForestClassifier(max_depth=2, random_state=0, class_weight="balanced_subsample")
        classifier.fit(X_train, y_train)

        # Training-set scores (optimistic; shown only for comparison).
        y_pred_train = classifier.predict(X_train)
        print(tag,'Train accuracy: ' + str(accuracy_score(y_train, y_pred_train)))
        print(tag,'Train f1-score:' + str(f1_score(y_train, y_pred_train)))

        # Held-out scores; y_pred keeps holding the test-set predictions.
        y_pred = classifier.predict(X_test)
        print(tag,'Test accuracy: ' + str(accuracy_score(y_test, y_pred)))
        print(tag,'Test f1-score:' + str(f1_score(y_test, y_pred)))
        print('\n')

        report = classification_report(y_test, y_pred, output_dict=True)
        report_df = pd.DataFrame(report)
        report_df.to_csv(OUTPUT_DIR + tag + '_RF_report.tsv', sep='\t')

        # Not consumed by the figure (plot_confusion_matrix recomputes it from
        # the estimator); kept available for interactive inspection.
        cm = confusion_matrix(y_test, y_pred)

        importances = classifier.feature_importances_
        indices = np.argsort(importances)  # ascending: most important last

        # plots: confusion matrix (left) and feature importances (right)
        fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

        # axes[0] already is the Axes object; wrapping it in plt.subplot() is
        # redundant and may be rejected by newer Matplotlib releases.
        ax1 = axes[0]
        # NOTE(review): plot_confusion_matrix was deprecated in scikit-learn
        # 1.0 and removed in 1.2; ConfusionMatrixDisplay.from_estimator is the
        # replacement once the environment is upgraded.
        plot_confusion_matrix(classifier, X_test, y_test, ax=ax1)
        ax1.grid(False)

        n_features = len(features)
        axes[1].barh(range(n_features), importances[indices], color='g', align='center')
        axes[1].set_yticks(range(n_features))
        axes[1].set_yticklabels([features[k] for k in indices])
        axes[1].set_xlabel('Relative Importance')
        if n_features >= TOP_N:
            # Bars sit at 0..n-1 in ascending importance, so the TOP_N most
            # important occupy n-TOP_N..n-1. Half-unit margins show exactly
            # those bars in full (the old limits cut the top bar in half).
            axes[1].set_ylim(n_features - TOP_N - 0.5, n_features - 0.5)
            axes[1].set_title('Feature Importances (top 10)')
        else:
            axes[1].set_title('Feature Importances')

        fig.savefig(OUTPUT_DIR + tag + '_RF.png', bbox_inches='tight')
        plt.close(fig)  # release the figure; many are created in this loop

EFS_BX_7_C1QB Train accuracy: 0.7872340425531915
EFS_BX_7_C1QB Train f1-score:0.7058823529411765
EFS_BX_7_C1QB Test accuracy: 0.42857142857142855
EFS_BX_7_C1QB Test f1-score:0.20000000000000004


EFS_BX_7_DLK Train accuracy: 0.6170212765957447
EFS_BX_7_DLK Train f1-score:0.64
EFS_BX_7_DLK Test accuracy: 0.21428571428571427
EFS_BX_7_DLK Test f1-score:0.0


EFS_BX_7_C1QB_10 Train accuracy: 0.5106382978723404
EFS_BX_7_C1QB_10 Train f1-score:0.4390243902439025
EFS_BX_7_C1QB_10 Test accuracy: 0.42857142857142855
EFS_BX_7_C1QB_10 Test f1-score:0.42857142857142855


EFS_BX_7_DLK_5 Train accuracy: 0.574468085106383
EFS_BX_7_DLK_5 Train f1-score:0.5833333333333333
EFS_BX_7_DLK_5 Test accuracy: 0.42857142857142855
EFS_BX_7_DLK_5 Test f1-score:0.5000000000000001


EFS_BX_7_DLK_1 Train accuracy: 0.5319148936170213
EFS_BX_7_DLK_1 Train f1-score:0.5925925925925927
EFS_BX_7_DLK_1 Test accuracy: 0.35714285714285715
EFS_BX_7_DLK_1 Test f1-score:0.47058823529411764


EFS_BX_7_C1QB,DLK Train accuracy: 0.