## Setup

In [1]:
from google.colab import drive
drive.mount('/content/drive');

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import NearMiss
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from imblearn.under_sampling import RandomUnderSampler
from random import randint as r

In [3]:
#data = pd.read_excel(io='/content/drive/MyDrive/Biology /Biology Research Project/sclc labelled data.xlsx',sheet_name="Sheet1")
data = pd.read_excel(io='/content/drive/MyDrive/Biology Research Project/sclc labelled data.xlsx',sheet_name="Sheet1")

## Data Preprocess

In [4]:
data['NSCLC'] = data['NSCLC'].fillna(0)
data['SCLC'] = data['SCLC'].fillna(0)
x = data.iloc[:,[3,4,6]]
y_sclc = data.iloc[:,8]
y_nsclc = data.iloc[:,9]

In [5]:
nm = NearMiss()
print('SCLC Original dataset shape:', Counter(y_sclc))
x_nm_sclc, y_nm_sclc = nm.fit_resample(x, y_sclc)
print('SCLC Resample dataset shape:', Counter(y_nm_sclc))
print('NSCLC Original dataset shape:', Counter(y_nsclc))
x_nm_nsclc, y_nm_nsclc = nm.fit_resample(x, y_nsclc)
print('NSCLC Resample dataset shape:', Counter(y_nm_nsclc))

SCLC Original dataset shape: Counter({0.0: 18857, 1.0: 921})
SCLC Resample dataset shape: Counter({0.0: 921, 1.0: 921})
NSCLC Original dataset shape: Counter({0.0: 19087, 1.0: 691})
NSCLC Resample dataset shape: Counter({0.0: 691, 1.0: 691})


In [6]:
x_sclc = x_nm_sclc
x_nsclc = x_nm_nsclc
y_sclc = y_nm_sclc
y_nsclc = y_nm_nsclc

### Dividing data into Train and Test

In [7]:
xtrain_sclc, xtest_sclc, ytrain_sclc, ytest_sclc = train_test_split(x_sclc, y_sclc, test_size = 0.20)
xtrain_nsclc, xtest_nsclc, ytrain_nsclc, ytest_nsclc = train_test_split(x_nsclc, y_nsclc, test_size = 0.20)

### Scaling Data

In [8]:
sc_x = StandardScaler()
xtrain_sclc = sc_x.fit_transform(xtrain_sclc)
xtest_sclc = sc_x.transform(xtest_sclc)
sc_x = StandardScaler()
xtrain_nsclc = sc_x.fit_transform(xtrain_nsclc) 
xtest_nsclc = sc_x.transform(xtest_nsclc)

## Fine Tuning

In [9]:
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt','log2']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [10]:
rfc = RandomForestClassifier()
rf_random_sclc = RandomizedSearchCV(estimator = rfc, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, n_jobs = -1)
rf_random_sclc.fit(xtrain_sclc, ytrain_sclc)
params_sclc = rf_random_sclc.best_params_
params_sclc

Fitting 3 folds for each of 100 candidates, totalling 300 fits


{'bootstrap': True,
 'max_depth': 30,
 'max_features': 'auto',
 'min_samples_leaf': 4,
 'min_samples_split': 10,
 'n_estimators': 1000}

In [11]:
rfc = RandomForestClassifier()
rf_random_nsclc = RandomizedSearchCV(estimator = rfc, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, n_jobs = -1)
rf_random_nsclc.fit(xtrain_nsclc, ytrain_nsclc)
params_nsclc = rf_random_nsclc.best_params_
params_nsclc

Fitting 3 folds for each of 100 candidates, totalling 300 fits


{'bootstrap': True,
 'max_depth': None,
 'max_features': 'auto',
 'min_samples_leaf': 4,
 'min_samples_split': 10,
 'n_estimators': 1800}

# Training and Testing

In [None]:
rfc_sclc = RandomForestClassifier(**params_sclc)
rfc_sclc.fit(xtrain_sclc,ytrain_sclc)
rfc_predict_sclc = rfc_sclc.predict(xtest_sclc)
print("=== Confusion Matrix ===")
print(confusion_matrix(ytest_sclc, rfc_predict_sclc))
print('\n')
print("=== Classification Report ===")
print(classification_report(ytest_sclc, rfc_predict_sclc))

=== Confusion Matrix ===
[[150  25]
 [ 43 151]]


=== Classification Report ===
              precision    recall  f1-score   support

         0.0       0.78      0.86      0.82       175
         1.0       0.86      0.78      0.82       194

    accuracy                           0.82       369
   macro avg       0.82      0.82      0.82       369
weighted avg       0.82      0.82      0.82       369



In [None]:
rfc_nsclc = RandomForestClassifier(**params_nsclc)
rfc_nsclc.fit(xtrain_nsclc,ytrain_nsclc)
rfc_predict_nsclc = rfc_nsclc.predict(xtest_nsclc)
print("=== Confusion Matrix ===")
print(confusion_matrix(ytest_nsclc, rfc_predict_nsclc))
print('\n')
print("=== Classification Report ===")
print(classification_report(ytest_nsclc, rfc_predict_nsclc))

=== Confusion Matrix ===
[[134  13]
 [ 25 105]]


=== Classification Report ===
              precision    recall  f1-score   support

         0.0       0.84      0.91      0.88       147
         1.0       0.89      0.81      0.85       130

    accuracy                           0.86       277
   macro avg       0.87      0.86      0.86       277
weighted avg       0.86      0.86      0.86       277



# Precision Graph Plotting

In [None]:
rfc_sclc = RandomForestClassifier(**params_sclc)
rfc_nsclc = RandomForestClassifier(**params_nsclc)
n = []
sclc = []
nsclc = []
for i in range(5,45):
  n.append(i)
  rfc_cv_score_sclc = cross_val_score(rfc_sclc, x_sclc, y_sclc, cv=i, scoring='average_precision')
  sclc.append(np.mean(rfc_cv_score_sclc))
  rfc_cv_score_nsclc = cross_val_score(rfc_nsclc, x_nsclc, y_nsclc, cv=i, scoring='average_precision')
  nsclc.append(np.mean(rfc_cv_score_nsclc))

n = np.array(n)
nsclc = np.array(nsclc)
sclc = np.array(sclc) 

In [None]:
# figure
fig, ax = plt.subplots(1, figsize=(20,10), facecolor='#FFFFFF')
ax.set_facecolor('#FFFFFF')
# data
price = nsclc
rate = sclc
# plots
plt.plot(n, price, marker='o', markersize=4.5, color='#7a7777', linewidth=1)
plt.plot(n, rate, marker='*', markersize=6, color='#7a7777', linewidth=1)
# ticks n title
# plt.title('Price and Exchange Rate\n', loc='left', color='black', fontsize=16)
ax.tick_params(axis='both', colors='black')
plt.xticks(n[::3])
# # spines
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['left'].set_color('black')
ax.spines['bottom'].set_color('black')
# grid
ax.set_axisbelow(True)
ax.yaxis.grid(color='#7a7777', linestyle='dashed', alpha=0.5)
plt.ylim(0.86,)
plt.xlabel('Number of Folds',fontsize=18) 
plt.ylabel('Average Precision',fontsize=18) 
# legend
legend = plt.legend(['NSCLC', 'SCLC'], frameon=True, ncol=2, loc = "lower right",fontsize = '20')
plt.setp(legend.get_texts(), color='black')
plt.savefig("results.png")

# Functions to run above code



### Helper Functions

In [None]:
def preprocess(data):
  data['NSCLC'] = data['NSCLC'].fillna(0)
  data['SCLC'] = data['SCLC'].fillna(0)
  x = data.iloc[:,[3,4,6]]
  y_sclc = data.iloc[:,8]
  y_nsclc = data.iloc[:,9]
  nm = NearMiss()
  print('SCLC Original dataset shape:', Counter(y_sclc))
  x_nm_sclc, y_nm_sclc = nm.fit_resample(x, y_sclc)
  print('SCLC Resample dataset shape:', Counter(y_nm_sclc))
  print('NSCLC Original dataset shape:', Counter(y_nsclc))
  x_nm_nsclc, y_nm_nsclc = nm.fit_resample(x, y_nsclc)
  print('NSCLC Resample dataset shape:', Counter(y_nm_nsclc))
  x_sclc = x_nm_sclc
  x_nsclc = x_nm_nsclc
  y_sclc = y_nm_sclc
  y_nsclc = y_nm_nsclc
  xtrain_sclc, xtest_sclc, ytrain_sclc, ytest_sclc = train_test_split(x_sclc, y_sclc, test_size = 0.20)
  xtrain_nsclc, xtest_nsclc, ytrain_nsclc, ytest_nsclc = train_test_split(x_nsclc, y_nsclc, test_size = 0.20)
  sc_x = StandardScaler()
  xtrain_sclc = sc_x.fit_transform(xtrain_sclc)
  xtest_sclc = sc_x.transform(xtest_sclc)
  sc_x = StandardScaler()
  xtrain_nsclc = sc_x.fit_transform(xtrain_nsclc) 
  xtest_nsclc = sc_x.transform(xtest_nsclc)
  return (x_sclc, y_sclc, x_nsclc, y_nsclc, xtrain_sclc, xtest_sclc, ytrain_sclc, ytest_sclc, xtrain_nsclc, xtest_nsclc, ytrain_nsclc, ytest_nsclc)


def finetune(xtrain_sclc, ytrain_sclc, xtrain_nsclc, ytrain_nsclc):
  n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
  max_features = ['auto', 'sqrt','log2']
  max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
  max_depth.append(None)
  min_samples_split = [2, 5, 10]
  min_samples_leaf = [1, 2, 4]
  bootstrap = [True, False]
  random_grid = {'n_estimators': n_estimators,
                'max_features': max_features,
                'max_depth': max_depth,
                'min_samples_split': min_samples_split,
                'min_samples_leaf': min_samples_leaf,
                'bootstrap': bootstrap}
  rfc = RandomForestClassifier()
  rf_random_sclc = RandomizedSearchCV(estimator = rfc, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, n_jobs = -1)
  rf_random_sclc.fit(xtrain_sclc, ytrain_sclc)
  params_sclc = rf_random_sclc.best_params_
  rfc = RandomForestClassifier()
  rf_random_nsclc = RandomizedSearchCV(estimator = rfc, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, n_jobs = -1)
  rf_random_nsclc.fit(xtrain_nsclc, ytrain_nsclc)
  params_nsclc = rf_random_nsclc.best_params_
  return(params_sclc, params_nsclc)

def train_test(params_sclc, params_nsclc, xtrain_sclc, ytrain_sclc, xtest_sclc, ytest_sclc, xtrain_nsclc, ytrain_nsclc, xtest_nsclc, ytest_nsclc):
  rfc_sclc = RandomForestClassifier(**params_sclc)
  rfc_sclc.fit(xtrain_sclc, ytrain_sclc)
  rfc_predict_sclc = rfc_sclc.predict(xtest_sclc)
  print("=== Confusion Matrix ===")
  print(confusion_matrix(ytest_sclc, rfc_predict_sclc))
  print('\n')
  print("=== Classification Report ===")
  print(classification_report(ytest_sclc, rfc_predict_sclc))
  rfc_nsclc = RandomForestClassifier(**params_nsclc)
  rfc_nsclc.fit(xtrain_nsclc, ytrain_nsclc)
  rfc_predict_nsclc = rfc_nsclc.predict(xtest_nsclc)
  print("=== Confusion Matrix ===")
  print(confusion_matrix(ytest_nsclc, rfc_predict_nsclc))
  print('\n')
  print("=== Classification Report ===")
  print(classification_report(ytest_nsclc, rfc_predict_nsclc))

def plot(params_sclc, params_nsclc, x_sclc, y_sclc, x_nsclc, y_nsclc):
  rfc_sclc = RandomForestClassifier(**params_sclc)
  rfc_nsclc = RandomForestClassifier(**params_nsclc)
  n = []
  sclc = []
  nsclc = []
  for i in range(5,45):
    n.append(i)
    rfc_cv_score_sclc = cross_val_score(rfc_sclc, x_sclc, y_sclc, cv=i, scoring='average_precision')
    sclc.append(np.mean(rfc_cv_score_sclc))
    rfc_cv_score_nsclc = cross_val_score(rfc_nsclc, x_nsclc, y_nsclc, cv=i, scoring='average_precision')
    nsclc.append(np.mean(rfc_cv_score_nsclc))

  n = np.array(n)
  nsclc = np.array(nsclc)
  sclc = np.array(sclc)
  # figure
  fig, ax = plt.subplots(1, figsize=(20,10), facecolor='#FFFFFF')
  ax.set_facecolor('#FFFFFF')
  # data
  price = nsclc
  rate = sclc
  # plots
  plt.plot(n, price, marker='o', markersize=4.5, color='#7a7777', linewidth=1)
  plt.plot(n, rate, marker='*', markersize=6, color='#7a7777', linewidth=1)
  # ticks n title
  # plt.title('Price and Exchange Rate\n', loc='left', color='black', fontsize=16)
  ax.tick_params(axis='both', colors='black')
  plt.xticks(n[::3])
  # # spines
  ax.spines['right'].set_visible(False)
  ax.spines['top'].set_visible(False)
  ax.spines['left'].set_color('black')
  ax.spines['bottom'].set_color('black')
  # grid
  ax.set_axisbelow(True)
  ax.yaxis.grid(color='#7a7777', linestyle='dashed', alpha=0.5)
  plt.ylim(0.86,)
  plt.xlabel('Number of Folds',fontsize=18) 
  plt.ylabel('Average Precision',fontsize=18) 
  # legend
  legend = plt.legend(['NSCLC', 'SCLC'], frameon=True, ncol=2, loc = "lower right",fontsize = '20')
  plt.setp(legend.get_texts(), color='black')
  plt.savefig("results.png")

### Main Function to run the code and plot the precision for n folds

In [None]:
def main():
  data = pd.read_excel(io='/content/drive/MyDrive/Biology /Biology Research Project/sclc labelled data.xlsx',sheet_name="Sheet1")
  (x_sclc, y_sclc, x_nsclc, y_nsclc, xtrain_sclc, xtest_sclc, ytrain_sclc, ytest_sclc, xtrain_nsclc, xtest_nsclc, ytrain_nsclc, ytest_nsclc) = preprocess(data)
  (params_sclc, params_nsclc) = finetune(xtrain_sclc, ytrain_sclc, xtrain_nsclc, ytrain_nsclc)
  train_test(params_sclc, params_nsclc, xtrain_sclc, ytrain_sclc, xtest_sclc, ytest_sclc, xtrain_nsclc, ytrain_nsclc, xtest_nsclc, ytest_nsclc)
  plot(params_sclc, params_nsclc, x_sclc, y_sclc, x_nsclc, y_nsclc)

main()

### Test function to run the code and test it on the given input.

In [None]:
test_input = data.iloc[30:40,:]

# index = r(0,19900)
# test_input = data.iloc[index:index+20,:]

Unnamed: 0.1,Unnamed: 0,Tags,Name,FC,logFC,logCPM,P-Value,FDR,SCLC,NSCLC
12238,12238.0,[],FAR2,-1.036364,-0.05153,6.047611,0.768052,0.889167,,
12239,12239.0,[],ATF6,-1.074289,-0.103382,8.15467,0.260387,0.490329,1.0,
12240,12240.0,[],ATF3,-1.197653,-0.260209,6.107613,0.301848,0.533602,1.0,
12241,12241.0,[],LOC441087,1.365118,0.449026,-1.056675,0.084938,0.258926,,
12242,12242.0,[],GATA6,-1.275932,-0.351551,4.885739,0.114056,0.305799,,
12243,12243.0,[],TPRG1L,1.004909,0.007065,4.665519,0.97644,0.994635,,
12244,12244.0,[],PDGFA,-1.24327,-0.314139,4.715532,0.088348,0.265474,1.0,1.0
12245,12245.0,[],GATA2,1.032892,0.046689,2.803374,0.842767,0.931684,,
12246,12246.0,[],CIAO3,-1.148757,-0.200073,3.933105,0.075367,0.242021,,
12247,12247.0,[],CIAO1,1.123722,0.168285,5.935793,0.131505,0.332089,,


In [None]:
def train_and_test(test_input):
  data = pd.read_excel(io='/content/drive/MyDrive/Biology /Biology Research Project/sclc labelled data.xlsx',sheet_name="Sheet1")
  (x_sclc, y_sclc, x_nsclc, y_nsclc, xtrain_sclc, xtest_sclc, ytrain_sclc, ytest_sclc, xtrain_nsclc, xtest_nsclc, ytrain_nsclc, ytest_nsclc) = preprocess(data)
  (params_sclc, params_nsclc) = finetune(xtrain_sclc, ytrain_sclc, xtrain_nsclc, ytrain_nsclc)
  rfc_sclc = RandomForestClassifier(**params_sclc)
  rfc_sclc.fit(xtrain_sclc, ytrain_sclc)
  rfc_nsclc = RandomForestClassifier(**params_nsclc)
  rfc_nsclc.fit(xtrain_nsclc, ytrain_nsclc)

  test_input['NSCLC'] = test_input['NSCLC'].fillna(0)
  test_input['SCLC'] = test_input['SCLC'].fillna(0)
  text_x = test_input.iloc[:,[3,4,6]]
  test_y_sclc = test_input.iloc[:,8]
  test_y_nsclc = test_input.iloc[:,9]
  sc_x = StandardScaler()
  test_x = sc_x.fit_transform(text_x)

  rfc_predict_sclc = np.array(rfc_sclc.predict(text_x))
  rfc_predict_nsclc = np.array(rfc_nsclc.predict(text_x))
  common_biomarkers = (rfc_predict_sclc + rfc_predict_nsclc)//2
  return (rfc_predict_sclc, rfc_predict_nsclc, common_biomarkers)



(rfc_predict_sclc, rfc_predict_nsclc, common_biomarkers) = train_and_test(test_input)

SCLC Original dataset shape: Counter({0.0: 18857, 1.0: 921})
SCLC Resample dataset shape: Counter({0.0: 921, 1.0: 921})
NSCLC Original dataset shape: Counter({0.0: 19087, 1.0: 691})
NSCLC Resample dataset shape: Counter({0.0: 691, 1.0: 691})
Fitting 3 folds for each of 100 candidates, totalling 300 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"


In [None]:
test_input

Unnamed: 0.1,Unnamed: 0,Tags,Name,FC,logFC,logCPM,P-Value,FDR,SCLC,NSCLC
12238,12238.0,[],FAR2,-1.036364,-0.05153,6.047611,0.768052,0.889167,,
12239,12239.0,[],ATF6,-1.074289,-0.103382,8.15467,0.260387,0.490329,1.0,
12240,12240.0,[],ATF3,-1.197653,-0.260209,6.107613,0.301848,0.533602,1.0,
12241,12241.0,[],LOC441087,1.365118,0.449026,-1.056675,0.084938,0.258926,,
12242,12242.0,[],GATA6,-1.275932,-0.351551,4.885739,0.114056,0.305799,,
12243,12243.0,[],TPRG1L,1.004909,0.007065,4.665519,0.97644,0.994635,,
12244,12244.0,[],PDGFA,-1.24327,-0.314139,4.715532,0.088348,0.265474,1.0,1.0
12245,12245.0,[],GATA2,1.032892,0.046689,2.803374,0.842767,0.931684,,
12246,12246.0,[],CIAO3,-1.148757,-0.200073,3.933105,0.075367,0.242021,,
12247,12247.0,[],CIAO1,1.123722,0.168285,5.935793,0.131505,0.332089,,


In [None]:
print("SCLC Biomarkers")
for (i, val) in enumerate(rfc_predict_sclc):
  if val == 1:
    print(test_input.iloc[i,:].Name)
print("\nNSCLC Biomarkers")
for (i, val) in enumerate(rfc_predict_nsclc):
  if val == 1:
    print(test_input.iloc[i,:].Name)
print("\nCommon Biomarkers")
for (i, val) in enumerate(common_biomarkers):
  if val == 1:
    print(test_input.iloc[i,:].Name)

SCLC Biomarkers
ATF6
ATF3
LOC441087
PDGFA
PDGFD
PDGFC
PIP5K1C

NSCLC Biomarkers
TPRG1L
PDGFA
PDGFD
PDGFC
PIP5K1A

Common Biomarkers
PDGFA
PDGFD
PDGFC
