In [0]:
import numpy as np
import pandas as pd
import inspect
import sklearn
from sklearn.model_selection import cross_val_score
from sklearn import decomposition
from sklearn.linear_model import LogisticRegression as LR
from sklearn.naive_bayes import GaussianNB as NB
from sklearn.neighbors import KNeighborsClassifier as kNN
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.ensemble import AdaBoostClassifier as AB
from sklearn.neural_network import MLPClassifier as MLP
from sklearn.svm import SVC
from sklearn import metrics

import warnings
# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

# --- Run configuration ---
# Treatment arm to analyse; the loader recognises "Lapatinib", "Trastuzumab"
# and "Combination".
arm = "Combination"
# Token inserted into the input and output file names.
er_status = 'ERNEGATIVE'
# Classifier classes to evaluate, keyed by the label used in the results.
methods_dict = dict(
    logreg=LR,
    k_nearest_neighbour=kNN,
    naive_bayes=NB,
    random_forest=RF,
    svc=SVC,
)

In [0]:
input_path = 'data/'

# Read the expression/response table for the selected treatment arm.
# Each arm maps to the prefix of its CSV file name; the full name is
# <prefix><er_status>_expression_response.csv inside input_path.
arm_file_prefixes = {
  "Lapatinib": 'LAPATINIB_ALONE_',
  "Trastuzumab": 'TRASTUZUMAB_ALONE_',
  "Combination": 'LAPATINIB_IN_COMBINATION_WITH_TRASTUZUMAB_',
}

# Fail here with a clear message instead of leaving `data` undefined and
# hitting a NameError in a later cell.
if arm not in arm_file_prefixes:
  raise ValueError("Unknown arm %r; expected one of %s"
                   % (arm, sorted(arm_file_prefixes)))

# The first CSV column is used as the row index.
data = pd.read_csv(input_path + arm_file_prefixes[arm] + er_status + '_expression_response.csv', index_col=0)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# Separate the table into predictors and labels:
#   input_features - every column except 'response'
#   output_var     - the 'response' column as a plain Python list
input_features = data.drop(columns = 'response')
output_var = data['response'].values.tolist()

In [0]:
# Column positions ordered from highest to lowest variance across rows.
# NOTE(review): the variance ranking and the PCA below are computed on all
# rows, including the row later held out in each leave-one-out fold - confirm
# this is acceptable for the reported performance.
var_sortedind = np.argsort(-input_features.var().values)
# Feature frames restricted to the N highest-variance columns
input_features_top10 = input_features.iloc[:,var_sortedind[0:10]]
input_features_top30 = input_features.iloc[:,var_sortedind[0:30]]
input_features_top100 = input_features.iloc[:,var_sortedind[0:100]]
input_features_top500 = input_features.iloc[:,var_sortedind[0:500]]
input_features_top2000 = input_features.iloc[:,var_sortedind[0:2000]]

# Create the PCA object. The number of components cannot exceed
# min(n_rows, n_columns); using the row count alone fails when there are
# more rows than columns.
pca = decomposition.PCA(n_components=min(input_features.shape), whiten=True, random_state = 42)
# Fit only: the projection is produced by the transform call below, so a
# fit_transform here would compute a result that is thrown away.
pca.fit(input_features)
# Principal components of the data (columns are 0..n_components-1)
input_features_pca = pd.DataFrame(pca.transform(input_features))

# Every feature representation evaluated downstream, keyed by the label
# written to the results table.
feature_dictionary = {'top10genes': input_features_top10,
                      'top30genes': input_features_top30,
                      'top100genes': input_features_top100,
                      'top500genes': input_features_top500,
                      'top2000genes': input_features_top2000,
                      'PCs': input_features_pca,
                      'allgenes': input_features}

In [0]:
def LOO(method, input_df, output):
  """Return leave-one-out predictions for every row of ``input_df``.

  For each row position i, the model is fitted on all other rows and then
  asked to predict row i.

  Parameters
  ----------
  method : callable
    Estimator class (or factory) producing an object with ``fit(X, y)`` and
    ``predict(X)``. It is instantiated once; if its signature accepts a
    ``random_state`` parameter, ``random_state=42`` is passed.
  input_df : pandas.DataFrame
    One row per sample, in the same order as ``output``.
  output : sequence
    One label per row of ``input_df``; any iterable is accepted and copied
    into a list.

  Returns
  -------
  list
    ``preds[i]`` is the prediction for row i from the model fitted without
    row i.

  Raises
  ------
  ValueError
    If ``input_df`` and ``output`` differ in length.
  """
  # Copy into a list so the `+` below concatenates (it would add element-wise
  # for a numpy array).
  output = list(output)
  n_samples = len(output)
  if len(input_df) != n_samples:
    raise ValueError("input_df has %d rows but output has %d labels"
                     % (len(input_df), n_samples))

  # inspect.signature lists keyword-only parameters too; getfullargspec(...)[0]
  # only lists positional ones and so misses a keyword-only random_state.
  if 'random_state' in inspect.signature(method).parameters:
    model = method(random_state = 42)
  else:
    model = method()

  preds = []
  for point_iter in range(n_samples):
    # Select training rows by position: dropping by index label would remove
    # every row sharing that label when the index has duplicates.
    train_pos = [*range(point_iter), *range(point_iter + 1, n_samples)]
    input_tmp = input_df.iloc[train_pos, :]
    output_tmp = output[:point_iter] + output[point_iter + 1:]
    model.fit(input_tmp, output_tmp)
    # Double brackets keep the held-out row as a one-row frame for predict.
    preds.append(model.predict(input_df.iloc[[point_iter], :])[0])

  return preds

      

# Leave-one-out predictions for every classifier on every feature set.
# predictions_dict maps a method name to a DataFrame with one column per
# feature set and one row per sample.
predictions_dict = {}

for method_iter, method_class in methods_dict.items():
  print(method_iter)
  predictions = pd.DataFrame(
      {feat_name: LOO(method = method_class,
                      input_df = feat_frame,
                      output = output_var)
       for feat_name, feat_frame in feature_dictionary.items()},
      columns=[*feature_dictionary])
  predictions_dict[method_iter] = predictions

logreg
k_nearest_neighbour
naive_bayes
random_forest
svc


In [0]:
# Score the leave-one-out predictions of each (method, feature set) pair.
# NOTE(review): 'banalced-accuracy' is a misspelling of 'balanced-accuracy';
# it is kept as is because it becomes a header in the exported CSV - rename
# together with any downstream consumers.
performance_columns = ['method','features','f1','AUC-ROC', 'banalced-accuracy']

# Collect the rows in a list and build the frame once: DataFrame.append was
# removed in pandas 2.0 and re-copied the whole frame on every call.
performance_rows = []

for method_iter in methods_dict:
  print(method_iter)

  for feat_iter in feature_dictionary:
    predicted = predictions_dict[method_iter][feat_iter]

    # NOTE(review): roc_auc_score is given hard class labels (the output of
    # model.predict), not scores or probabilities; for a binary response this
    # should coincide with balanced accuracy - confirm whether probability
    # outputs were intended.
    performance_rows.append([method_iter,
                             feat_iter,
                             metrics.f1_score(output_var, predicted),
                             metrics.roc_auc_score(output_var, predicted),
                             metrics.balanced_accuracy_score(output_var, predicted)])

performance = pd.DataFrame(performance_rows, columns=performance_columns)


logreg
k_nearest_neighbour
naive_bayes
random_forest
svc


In [0]:
# Write the metric table next to the input data, without the row index.
# NOTE(review): the file is named '..._predictions.csv' although it holds the
# performance metrics rather than the per-sample predictions.
performance_csv_path = ''.join([input_path, arm, '_', er_status, '_', 'predictions.csv'])
performance.to_csv(performance_csv_path, index = False)