In [13]:
#Import necessary libraries
import numpy as np #This works with numbers
import pandas as pd #This allows us to work with a dataset
import matplotlib.pyplot as plt #This allows us to plot data
from sklearn.model_selection import train_test_split #For splitting data for training and test
from sklearn.preprocessing import StandardScaler #For scaling features
from sklearn import datasets, svm 
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score #For metrics and evaluation algorithm
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier #For KNN
from sklearn.preprocessing import KBinsDiscretizer
from sklearn import tree
pd.set_option('display.max_columns', None)

In [14]:
#--- Loading Data and dropping features
#load data into dataframe
data = pd.read_csv('HCV-Egy-Data.csv')

# Remove Outliers 
X = data
col = list(X.drop('Baselinehistological staging',axis=1).columns)
for i in col:
    y = data[i]
    removed_outliers = y.between(y.quantile(.02), y.quantile(.98))
    index_names = data[~removed_outliers].index 
    data.drop(index_names, inplace=True)
    
#dropping histoligical baseline due to duplicate representation of data
data = data.drop('Baseline histological Grading', axis = 1)

In [15]:

#--- Descritization (Robert)

#list of columns for binning
weeks_list = ['AST 1', 'ALT 1', 'ALT4', 'ALT 12', 'ALT 24', 'ALT 36', 'ALT 48', 'ALT after 24 w']

#descritizes the columns listed in weeks_list #sources: https://dfrieds.com/data-analysis/bin-values-python-pandas.html , https://stackoverflow.com/questions/48248731/pandas-cut-multiple-columns
for feature in weeks_list:
    data[feature] = pd.cut(x=data[feature], bins=[0,20,40,128], labels=[1,2,3]) #discrete values    

#customized binning for the following features    
#data['Age '] = pd.cut(x=data['Age '], bins=[0,32,37,42,47,52,57,62], labels=[1,2,3,4,5,6,7])
data['BMI'] = pd.cut(x=data['BMI'], bins=[0,18.5,25,30,35,40], labels=[1,2,3,4,5])
data['WBC'] = pd.cut(x=data['WBC'], bins=[0,4000,11000,12101], labels=[1,2,3])
data['RBC'] = pd.cut(x=data['RBC'], bins=[0,3000000,5000000,5018451], labels=[1,2,3])
#data['Plat'] = pd.cut(x=data['Plat'], bins=[93013,100000,226465], labels=[1,2]) #removed a typo discrete value of 255000
data['RNA Base'] = pd.cut(x=data['RNA Base'], bins=[0,5,1201086], labels=[1,2])                                        
data['RNA 4'] = pd.cut(x=data['RNA 4'], bins=[0,5,1201715], labels=[1,2])    
data['RNA 12'] = pd.cut(x=data['RNA 12'], bins=[0,5,3731527], labels=[1,2])    
data['RNA EOT'] = pd.cut(x=data['RNA EOT'], bins=[0,5,808450], labels=[1,2])    
data['RNA EF'] = pd.cut(x=data['RNA EF'], bins=[0,5,810333], labels=[1,2])    # corrected maximum from 808450 in discretization description

# conditional binning for HGB by Gender. Each gender row is placed in a gender conditional dataframe
male_df = data[data['Gender']==1]
male_df['HGB'] = pd.cut(x=male_df['HGB'], bins=[2,14,17.5,20], labels=[1,2,3])
#print(male_df)
female_df = data[data['Gender']==2]
female_df['HGB'] = pd.cut(x=female_df['HGB'], bins=[2,12.3,15.3,20], labels=[1,2,3])
#print(female_df)
#merges male and female selections into a new dataframe
data = pd.merge(male_df, female_df, how='outer')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [16]:
# converting all datatypes to float (Robert)
# source: https://stackoverflow.com/questions/44369504/how-to-convert-entire-dataframe-values-to-float-in-pandas
data = data.apply(pd.to_numeric, errors='coerce')
data.dtypes

Age                                   int64
Gender                                int64
BMI                                   int64
Fever                                 int64
Nausea/Vomting                        int64
Headache                              int64
Diarrhea                              int64
Fatigue & generalized bone ache       int64
Jaundice                              int64
Epigastric pain                       int64
WBC                                   int64
RBC                                   int64
HGB                                   int64
Plat                                float64
AST 1                                 int64
ALT 1                                 int64
ALT4                                  int64
ALT 12                                int64
ALT 24                                int64
ALT 36                                int64
ALT 48                                int64
ALT after 24 w                        int64
RNA Base                        

In [17]:

    
#seperate X and y
X = data.iloc[:,:-1]
y = data.iloc[:,-1]

#Calculate biomarker scores
fib4 = (X['Age ']*X['AST 1'])/(X['Plat']*np.sqrt(X['ALT 1']))
modapri = (X['Age ']*X['AST 1'])/(X['Plat']*100)
liverDam = X['AST 1']/X['ALT 1']

#Feature selection Age, Gender, BMI
ModX = X.iloc[:,0:3]

#Select ALT scores over time
ModX['ALT 1'] = X['ALT 1']
ModX['ALT4'] = X['ALT4']
ModX['ALT 24'] = X['ALT 24']
ModX['ALT 48'] = X['ALT 48']

ModX['AST 1'] = X['AST 1']

ModX['fib4'] = fib4
ModX['modapri'] = modapri
ModX['liverDam'] = liverDam

ModX

Unnamed: 0,Age,Gender,BMI,ALT 1,ALT4,ALT 24,ALT 48,AST 1,fib4,modapri,liverDam
0,45,1,3,3,3,3,3,3,0.000794,0.000014,1.0
1,34,1,2,3,3,3,3,3,0.000417,0.000007,1.0
2,58,1,4,3,3,3,3,3,0.000672,0.000012,1.0
3,61,1,4,3,3,3,3,3,0.000535,0.000009,1.0
4,56,1,3,3,3,3,3,3,0.000509,0.000009,1.0
...,...,...,...,...,...,...,...,...,...,...,...
850,59,2,4,3,3,3,3,3,0.000959,0.000017,1.0
851,43,2,2,3,3,3,3,3,0.000716,0.000012,1.0
852,36,2,4,3,3,3,3,3,0.000388,0.000007,1.0
853,47,2,4,3,3,3,3,3,0.000498,0.000009,1.0


In [18]:
#Split data into training and test data
X_train, X_test, y_train, y_test = train_test_split(ModX, y, test_size = 0.20, random_state = 0)

#Feature scale so that one feature doesn't have more influence than another
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [19]:
#Here we are going to make a simple SVM classifier
classifier = svm.NuSVC(nu = 0.3, kernel='rbf')
#Carry out cross validation 10 fold and find the mean of the score to get overall accuracy for train data
scores = cross_val_score(classifier, X_train, y_train, cv=10)
print(scores)
print(scores.mean())



[0.27536232 0.1884058  0.24637681 0.26086957 0.27536232 0.36764706
 0.27941176 0.25       0.25       0.23880597]
0.2632241605272868




In [20]:
#train the classifier on train set and then print out its overall accuracy for the train data
classifier.fit(X_train, y_train)

print(classifier.score(X_train,y_train))

0.34210526315789475




In [21]:
#Determine the overall accuracy score of the test data set
print(classifier.score(X_test,y_test))

0.24561403508771928


In [22]:
#Making predictions
y_pred = classifier.predict(X_test)
#To evaluate algorithm we will print the confusion matrix and other metrics
print("The confusion matrix is:")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

The confusion matrix is:
[[16 13  7  8]
 [10 10 11  4]
 [21 11 10  4]
 [14 14 12  6]]
              precision    recall  f1-score   support

           1       0.26      0.36      0.30        44
           2       0.21      0.29      0.24        35
           3       0.25      0.22      0.23        46
           4       0.27      0.13      0.18        46

    accuracy                           0.25       171
   macro avg       0.25      0.25      0.24       171
weighted avg       0.25      0.25      0.24       171



In [23]:
#Training with k-nearest neighbor (KNN)
classifier2 = KNeighborsClassifier(n_neighbors = 100)
classifier2.fit(X_train, y_train)
#Making predictions
y_pred = classifier2.predict(X_test)
#To evaluate algorithm we will print the confusion matrix and other metrics
print("The confusion matrix is:")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

The confusion matrix is:
[[ 6  7 17 14]
 [ 8  6 15  6]
 [12  8 13 13]
 [12  9 16  9]]
              precision    recall  f1-score   support

           1       0.16      0.14      0.15        44
           2       0.20      0.17      0.18        35
           3       0.21      0.28      0.24        46
           4       0.21      0.20      0.20        46

    accuracy                           0.20       171
   macro avg       0.20      0.20      0.19       171
weighted avg       0.20      0.20      0.20       171



In [24]:
#Decision trees
#Here we are going to make a simple decision classifier
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
print(clf.score(X_train,y_train))
#Determine the overall accuracy score of the test data set
print(clf.score(X_test,y_test))
#Making predictions
y_pred = clf.predict(X_test)
#To evaluate algorithm we will print the confusion matrix and other metrics
print("The confusion matrix is:")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

1.0
0.24561403508771928
The confusion matrix is:
[[11 15  7 11]
 [ 8  5  7 15]
 [14  6 12 14]
 [15 10  7 14]]
              precision    recall  f1-score   support

           1       0.23      0.25      0.24        44
           2       0.14      0.14      0.14        35
           3       0.36      0.26      0.30        46
           4       0.26      0.30      0.28        46

    accuracy                           0.25       171
   macro avg       0.25      0.24      0.24       171
weighted avg       0.25      0.25      0.25       171

