In [None]:
import pandas as pd
import numpy as np
import math

import warnings
warnings.filterwarnings("ignore")

pd.set_option('max_rows', 99999)

import os
os.chdir('/content/drive/MyDrive/Master Thesis')

In [None]:
from __future__ import print_function
import sys,tempfile, urllib, os
from sklearn.model_selection import train_test_split

from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OrdinalEncoder

In [None]:
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB, CategoricalNB
from sklearn.cluster import KMeans


from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

from sklearn import svm

import seaborn as sn
import matplotlib.pyplot as plt

from xgboost import XGBClassifier

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report

In [None]:
#Get verified data for prediction model

def get_verified_data(df):

  df_1 = pd.DataFrame()

  caseid_list = []
  sub_list = []
  res_list = []
  endo_list = []
  hpv_list = []
  proc_list = []
  bio_list = []
  treat_list = []
  hist_list = []
  inf_list = []

  res_ver_list  = []

  for row, val in df.iterrows():
    res = val['result']
    sub = val['subsequent_smear']
    hpv = val['HPV_test']
    hist = val['Histology']
    bio = val['Biopsy Result']
    proc = val['Procedure']
    endo = val['Endocervical']
    caseid = val['case_id']
    treat = val['treat_course']
    inf = val['Infection']

    if (bio != 'Not Applicable' and bio != 'Undefined' and bio!= 'Other') or sub == 'select':
      caseid_list.append(caseid)
      sub_list.append(sub)
      res_list.append(res)
      endo_list.append(endo)
      hpv_list.append(hpv)
      proc_list.append(proc)
      bio_list.append(bio)
      treat_list.append(treat)
      hist_list.append(hist)
      inf_list.append(inf)

  
  df_1['case_id'] = np.array(caseid_list)
  df_1['result'] = np.array(res_list)
  df_1['Endocervical'] = np.array(endo_list)
  df_1['Infection'] = np.array(inf_list)
  df_1['HPV_test'] = np.array(hpv_list)
  df_1['treat_course'] = np.array(treat_list)

  extra_sample_list = [73,159,273,312,660,701,755,691,645,59,69,823,858]
  df_2 = pd.DataFrame()

  df_2['case_id'] = np.array(extra_sample_list)
  df_2['result'] = np.array(df['result'][df['case_id'].isin(extra_sample_list)].tolist())
  df_2['Endocervical'] = np.array(df['Endocervical'][df['case_id'].isin(extra_sample_list)].tolist())
  df_2['Infection'] = np.array(df['Infection'][df['case_id'].isin(extra_sample_list)].tolist())
  df_2['HPV_test'] = np.array(df['HPV_test'][df['case_id'].isin(extra_sample_list)].tolist())
  df_2['treat_course'] = np.array(df['treat_course'][df['case_id'].isin(extra_sample_list)].tolist())

  df_1 = pd.concat([df_1, df_2])


  return df_1

In [None]:
#convert to categorical variables to numbers
def encode(df):
  df['treat_course'] = df['treat_course'].replace(['normal', 'low-grade', 'high-grade'],[0,1,2])
  df['result'] = df['result'].replace(['Negative', 'BNA', 'Low Grade', 'High Grade (Mod)', 'High Grade (Sev)','Invasive','Glandular'],[0,1,2,3,4,5,6])
  df['Endocervical'] = df['Endocervical'].replace(['YES','NO'],[1,0])
  df['Infection'] = df['Infection'].replace(['None','Fungal', 'Monilia', 'Koilocytes', 'ALOS'],[0,1,2,3,4])
  df['HPV_test'] = df['HPV_test'].replace(['No','Yes - NEG', 'Yes - POS'],[0,1,2])

  df['result'] = df['result'].astype(int)
  df['Endocervical'] = df['Endocervical'].astype(int)
  df['Infection'] = df['Infection'].astype(int)
  df['HPV_test'] = df['HPV_test'].astype(int)
  
  cat_list = ['result', 'Endocervical', 'Infection', 'HPV_test']
  column_trans = make_column_transformer((OrdinalEncoder(), cat_list))
  tis_transformed = column_trans.fit_transform(df)

  df_trans = df.copy()
  df_trans = pd.DataFrame(tis_transformed, columns=cat_list).astype(int)
  df.update(df_trans)

  return df

In [None]:
#K fold Cross validation

def cross_val(model, df_x, df_y, n_split = 10):
  folds = KFold(n_splits= n_split)
  accuracy_list = []
  precision_list = []
  recall_list = []
  f1_list = []
  for train_index, test_index in folds.split(df_x.index, df_y.index):
    X_train, X_test, y_train, y_test = df_x.iloc[train_index], df_x.iloc[test_index], df_y.iloc[train_index], df_y.iloc[test_index]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test,y_pred)
    precision = precision_score(y_test,y_pred, average=None)
    recall = recall_score(y_test,y_pred, average=None)
    f1 = f1_score(y_test,y_pred, average=None)
    
    accuracy_list.append(accuracy)
    precision_list.append(np.mean(precision))
    recall_list.append(np.mean(recall))
    f1_list.append(np.mean(f1))
    
  return accuracy_list, precision_list, recall_list, f1_list

In [None]:
tis_ds = pd.read_csv('TIS_Data_processed.csv')
gen_ds = pd.read_csv('GEN_Data_processed.csv')

In [None]:
tis_ds.head()

Unnamed: 0.1,Unnamed: 0,case_id,result,Endocervical,Infection,subsequent_smear,HPV_test,Histology,Procedure,Biopsy Result,treat_course
0,0,1,Glandular,YES,,Negs x 4,No,YES,LLETZ,Neg,high-grade
1,1,2,Negative,NO,,No follow up,No,NO,Not Applicable,Not Applicable,normal
2,2,3,Negative,NO,,No follow up,Yes - NEG,NO,Not Applicable,Not Applicable,normal
3,3,4,Negative,NO,,Neg x 2,No,NO,Not Applicable,Not Applicable,normal
4,4,5,Negative,NO,,No follow up,No,NO,Not Applicable,Not Applicable,normal


Creating Dataset for prediction model

In [91]:
#Using only data that would be available before confirming diagnosis of patient
#treat course is the dependent variable
tis_ds_n_verified_case = tis_ds[['case_id', 'result', 'Endocervical', 'Infection', 'HPV_test', 'treat_course']]
tis_ds_n_verified = tis_ds[['result', 'Endocervical', 'Infection', 'HPV_test', 'treat_course']]
tis_ds_n_verified = tis_ds_n_verified[tis_ds_n_verified['treat_course'] != 'Undefined'][tis_ds_n_verified['treat_course'] != 'Not Applicable']
tis_ds_verified = get_verified_data(tis_ds)
tis_ds_verified = tis_ds_verified[['result', 'Endocervical', 'Infection', 'HPV_test', 'treat_course']]
tis_ds_verified = tis_ds_verified[tis_ds_verified['treat_course'] != 'Undefined'][tis_ds_verified['treat_course'] != 'Not Applicable']
tis_ds = tis_ds[tis_ds['treat_course'] != 'Undefined'][tis_ds['treat_course'] != 'Not Applicable']

In [92]:
tis_ds_nv_trans = encode(tis_ds_n_verified)
tis_ds_v_trans = encode(tis_ds_verified)

In [93]:
tis_ds_nv_x = tis_ds_nv_trans.copy()
tis_ds_nv_y =  tis_ds_nv_x.pop('treat_course')
# tis_ds_nv_y =  tis_ds_nv_trans.pop('treat_course')
tis_ds_v_x = tis_ds_v_trans.copy()
tis_ds_v_y =  tis_ds_v_x.pop('treat_course')
# tis_ds_v_y =  tis_ds_v_trans.pop('treat_course')

In [94]:
tis_ds_v_x.head()

Unnamed: 0,result,Endocervical,Infection,HPV_test
0,6.0,1.0,0.0,0.0
1,4.0,1.0,0.0,0.0
2,0.0,1.0,0.0,1.0
3,3.0,1.0,0.0,0.0
4,2.0,1.0,0.0,0.0


Creating Dataset for genius system

In [95]:
gen_ds = gen_ds[['result', 'Endocervical', 'Infection', 'HPV_test', 'treat_course']]
gen_ds = gen_ds[gen_ds['treat_course'] != 'Undefined'][gen_ds['treat_course'] != 'Not Applicable']

In [96]:
gen_ds_trans = encode(gen_ds)
gen_ds_x = gen_ds_trans.copy()
gen_ds_y =  gen_ds_x.pop('treat_course')

In [97]:
tis_ds['treat_course'].value_counts()

normal        622
high-grade    164
low-grade      76
Name: treat_course, dtype: int64

In [98]:
tis_ds_nv_x.head()

Unnamed: 0,result,Endocervical,Infection,HPV_test
0,6.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0


Random Forest

In [99]:
print('Non-verified Data')

rand_for_nv_model = RandomForestClassifier(n_estimators = 100)
rand_forest_nv_acc, rand_forest_nv_prec, rand_forest_nv_rec, rand_forest_nv_f1 = cross_val(rand_for_nv_model, tis_ds_nv_x, tis_ds_nv_y)
print(np.mean(rand_forest_nv_acc))
print(np.mean(rand_forest_nv_prec))
print(np.mean(rand_forest_nv_rec))
print(np.mean(rand_forest_nv_f1))

rand_for_nv_model.fit(tis_ds_nv_x, tis_ds_nv_y)

print('Verified Data')

rand_for_v_model = RandomForestClassifier(n_estimators = 100)
rand_forest_v_acc, rand_forest_v_prec, rand_forest_v_rec, rand_forest_v_f1 = cross_val(rand_for_v_model, tis_ds_v_x, tis_ds_v_y)
print(np.mean(rand_forest_v_acc))
print(np.mean(rand_forest_v_prec))
print(np.mean(rand_forest_v_rec))
print(np.mean(rand_forest_v_f1))

rand_for_v_model.fit(tis_ds_v_x, tis_ds_v_y)

Non-verified Data
0.7007885592087677
0.28313114579000914
0.3317668556856474
0.28933917118587515
Verified Data
0.8076315789473684
0.5466196537094369
0.6085057926679444
0.5530484848352495


RandomForestClassifier()

In [100]:
y_pred_rand = rand_for_nv_model.predict(gen_ds_x)
y_pred_rand_df = pd.DataFrame()
y_pred_rand_df['y_pred'] = y_pred_rand
y_pred_rand_df['y_pred'].value_counts()

0    710
1     52
2     20
Name: y_pred, dtype: int64

Logistic regression

In [101]:
#regressor_nv_model = LogisticRegression()  
print('Non-verified Data')
regressor_nv_model = LogisticRegression(solver='liblinear', max_iter=1000) 
regressor_nv_acc, regressor_nv_prec, regressor_nv_rec, regressor_nv_f1 = cross_val(regressor_nv_model, tis_ds_nv_x, tis_ds_nv_y)
print(np.mean(regressor_nv_acc))
print(np.mean(regressor_nv_prec))
print(np.mean(regressor_nv_rec))
print(np.mean(regressor_nv_f1))

regressor_nv_model.fit(tis_ds_nv_x, tis_ds_nv_y)

print('Verified Data')
#regressor_v_model = LogisticRegression()
regressor_v_model = LogisticRegression(solver='liblinear', max_iter=1000) 
regressor_v_acc, regressor_v_prec, regressor_v_rec, regressor_v_f1 = cross_val(regressor_v_model, tis_ds_v_x, tis_ds_v_y)
print(np.mean(regressor_v_acc))
print(np.mean(regressor_v_prec))
print(np.mean(regressor_v_rec))
print(np.mean(regressor_v_f1))

regressor_v_model.fit(tis_ds_v_x, tis_ds_v_y)

Non-verified Data
0.7216786955359529
0.7216786955359529
0.24055956517865096
0.33333333333333337
0.27929078186489065
Verified Data
0.8181578947368422
0.564244352711845
0.5467148262813897
0.5241923613835378


LogisticRegression(max_iter=1000, solver='liblinear')

In [102]:
y_pred_log = regressor_nv_model.predict(gen_ds_x)
y_pred_log_df = pd.DataFrame()
y_pred_log_df['y_pred'] = y_pred_log
y_pred_log_df['y_pred'].value_counts()

0    782
Name: y_pred, dtype: int64

SVM model

In [103]:
print('Non-verified Data')
svm_nv_model = svm.SVC(gamma = 'auto')  # solver='liblinear' for one-versus-rest 
svm_nv_acc, svm_nv_prec, svm_nv_rec, svm_nv_f1 = cross_val(svm_nv_model, tis_ds_nv_x, tis_ds_nv_y)
print(np.mean(svm_nv_acc))
print(np.mean(svm_nv_prec))
print(np.mean(svm_nv_rec))
print(np.mean(svm_nv_f1))

svm_nv_model.fit(tis_ds_nv_x, tis_ds_nv_y)
print('Verified Data')
svm_v_model = svm.SVC(gamma = 'auto')  # solver='liblinear' for one-versus-rest 
svm_v_acc, svm_v_prec, svm_v_rec, svm_v_f1 = cross_val(svm_v_model, tis_ds_v_x, tis_ds_v_y)
print(np.mean(svm_v_acc))
print(np.mean(svm_v_prec))
print(np.mean(svm_v_rec))
print(np.mean(svm_v_f1))

svm_nv_model.fit(tis_ds_v_x, tis_ds_v_y)

Non-verified Data
0.7216786955359529
0.24055956517865096
0.33333333333333337
0.27929078186489065
Verified Data
0.8026315789473685
0.5032674005274625
0.6102769177846576
0.5300028295101824


SVC(gamma='auto')

In [104]:
y_pred_snm = svm_nv_model.predict(gen_ds_x)
y_pred_svm_df = pd.DataFrame()
y_pred_svm_df['y_pred'] = y_pred_snm
y_pred_svm_df['y_pred'].value_counts()

2    396
1    386
Name: y_pred, dtype: int64

Naive Bayes (Gaussian)

In [105]:
print('Non-verified Data')
nb_nv_model  = GaussianNB()
nb_nv_acc, nb_nv_prec, nb_nv_rec, nb_nv_f1 = cross_val(nb_nv_model, tis_ds_nv_x, tis_ds_nv_y)
print(np.mean(nb_nv_acc))
print(np.mean(nb_nv_prec))
print(np.mean(nb_nv_rec))
print(np.mean(nb_nv_f1))

nb_nv_model.fit(tis_ds_nv_x, tis_ds_nv_y)
print('Verified Data')
nb_v_model  = GaussianNB()
nb_v_acc, nb_v_prec, nb_v_rec, nb_v_f1 = cross_val(nb_v_model, tis_ds_v_x, tis_ds_v_y)
print(np.mean(nb_v_acc))
print(np.mean(nb_v_prec))
print(np.mean(nb_v_rec))
print(np.mean(nb_v_f1))

nb_v_model.fit(tis_ds_v_x, tis_ds_v_y)

Non-verified Data
0.6479149959903768
0.27300575291924345
0.3333083444902315
0.2776469098902693
Verified Data
0.7934210526315788
0.6512620399036809
0.6194133208839091
0.5830464589876354


GaussianNB()

In [106]:
y_pred_nb = nb_nv_model.predict(gen_ds_x)
y_pred_nb_df = pd.DataFrame()
y_pred_nb_df['y_pred'] = y_pred_nb
y_pred_nb_df['y_pred'].value_counts()

0    760
2     22
Name: y_pred, dtype: int64

XGBoost

In [137]:
print('Non-verified Data')
xgbclf_nv_model = XGBClassifier()
xg_nv_acc, xg_nv_prec, xg_nv_rec, xg_nv_f1 = cross_val(xgbclf_nv_model, tis_ds_nv_x, tis_ds_nv_y)
print(np.mean(xg_nv_acc))
print(np.mean(xg_nv_prec))
print(np.mean(xg_nv_rec))
print(np.mean(xg_nv_f1))

xgbclf_nv_model.fit(tis_ds_nv_x, tis_ds_nv_y)

# print('Verified Data')
# xgbclf_v_model  = XGBClassifier()
# xg_v_acc, xg_v_prec, xg_v_rec, xg_v_f1 = cross_val(xgbclf_v_model, tis_ds_v_x, tis_ds_v_y)
# print(np.mean(xg_v_acc))
# print(np.mean(xg_v_prec))
# print(np.mean(xg_v_rec))
# print(np.mean(xg_v_f1))

# xg_v_model.fit(tis_ds_v_x, tis_ds_v_y)

Non-verified Data
0.7077920342154504
0.24512905234393761
0.32808727224120393
0.2788968971726881


XGBClassifier(objective='multi:softprob')

In [108]:
y_pred_xg = xgbclf_nv_model.predict(gen_ds_x)
y_pred_xg_df = pd.DataFrame()
y_pred_xg_df['y_pred'] = y_pred_xg
y_pred_xg_df['y_pred'].value_counts()

0    755
2     16
1     11
Name: y_pred, dtype: int64

Using Grid search CV

In [109]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 80, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [2,4]
# Minimum number of samples required to split a node
min_samples_split = [2, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]
# Method of selecting samples for training each tree
bootstrap = [True, False]

In [110]:
# Create the param grid
param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(param_grid)

{'n_estimators': [10, 17, 25, 33, 41, 48, 56, 64, 72, 80], 'max_features': ['auto', 'sqrt'], 'max_depth': [2, 4], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2], 'bootstrap': [True, False]}


In [111]:
rf_Model = RandomForestClassifier()

In [112]:
from sklearn.model_selection import GridSearchCV
rf_Grid = GridSearchCV(estimator = rf_Model, param_grid = param_grid, cv = 3, verbose=2, n_jobs = 4)

In [113]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(tis_ds_nv_x, tis_ds_nv_y, test_size=0.20, random_state=101)

In [114]:
print(f'X_train : {X_train.shape}')
print(f'y_train : {y_train.shape}')
print(f'X_test : {X_test.shape}')
print(f'y_test : {y_test.shape}')

X_train : (689, 4)
y_train : (689,)
X_test : (173, 4)
y_test : (173,)


In [115]:
rf_Grid.fit(X_train, y_train)

Fitting 3 folds for each of 320 candidates, totalling 960 fits


GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=4,
             param_grid={'bootstrap': [True, False], 'max_depth': [2, 4],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [1, 2],
                         'min_samples_split': [2, 5],
                         'n_estimators': [10, 17, 25, 33, 41, 48, 56, 64, 72,
                                          80]},
             verbose=2)

In [116]:
rf_Grid.best_params_

{'bootstrap': True,
 'max_depth': 2,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 10}

In [117]:
print (f'Train Accuracy - : {rf_Grid.score(X_train,y_train):.3f}')
print (f'Test Accuracy - : {rf_Grid.score(X_test,y_test):.3f}')

Train Accuracy - : 0.717
Test Accuracy - : 0.746


In [118]:
y_pred_nb = rf_Grid.predict(gen_ds_x)
y_pred_nb_df = pd.DataFrame()
y_pred_nb_df['y_pred'] = y_pred_nb
y_pred_nb_df['y_pred'].value_counts()

0    779
1      3
Name: y_pred, dtype: int64