In [208]:
#Import necessary libraries
import numpy as np #This works with numbers
import pandas as pd #This allows us to work with a dataset
import matplotlib.pyplot as plt #This allows us to plot data
from sklearn.model_selection import train_test_split #For splitting data for training and test
from sklearn.preprocessing import StandardScaler #For scaling features
from sklearn import datasets, svm 
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score #For metrics and evaluation algorithm
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier #For KNN
from sklearn.preprocessing import KBinsDiscretizer
from sklearn import tree

In [209]:
#load data into dataframe
data = pd.read_csv('HCV-Egy-Data.csv')

#dropping histoligical baseline due to duplicate representation of data  ## I don't want to drop the outliers (Robert)
#data = data.drop('Baseline histological Grading', axis = 1)

# Remove Outliers ## I don't want to drop the outliers (Robert)


X = data
col = list(X.drop('Baselinehistological staging',axis=1).columns)
for i in col:
    y = data[i]
    removed_outliers = y.between(y.quantile(.02), y.quantile(.98))
    index_names = data[~removed_outliers].index 
    data.drop(index_names, inplace=True)


#dropping histoligical baseline due to duplicate representation of data, and it seems to decrease accuracy if included
data = data.drop('Baseline histological Grading', axis = 1)

In [210]:
#--- Descritization (Robert)

#list of columns for binning
weeks_list = ['AST 1', 'ALT 1', 'ALT4', 'ALT 12', 'ALT 24', 'ALT 36', 'ALT 48', 'ALT after 24 w']

#descritizes the columns listed in weeks_list #sources: https://dfrieds.com/data-analysis/bin-values-python-pandas.html , https://stackoverflow.com/questions/48248731/pandas-cut-multiple-columns
for feature in weeks_list:
    data[feature] = pd.cut(x=data[feature], bins=[0,20,40,128], labels=[1,2,3]) #discrete values    

#customized binning for the following features    

#data['Age '] = pd.cut(x=data['Age '], bins=[0,32,37,42,47,52,57,62], labels=[1,2,3,4,5,6,7])
data['BMI'] = pd.cut(x=data['BMI'], bins=[0,18.5,25,30,35,40], labels=[1,2,3,4,5])
data['WBC'] = pd.cut(x=data['WBC'], bins=[0,4000,11000,12101], labels=[1,2,3])
data['RBC'] = pd.cut(x=data['RBC'], bins=[0,3000000,5000000,5018451], labels=[1,2,3])
#data['Plat'] = pd.cut(x=data['Plat'], bins=[93013,100000,226465], labels=[1,2]) #removed a typo discrete value of 255000
data['RNA Base'] = pd.cut(x=data['RNA Base'], bins=[0,5,1201086], labels=[1,2])                                        
data['RNA 4'] = pd.cut(x=data['RNA 4'], bins=[0,5,1201715], labels=[1,2])    
data['RNA 12'] = pd.cut(x=data['RNA 12'], bins=[0,5,3731527], labels=[1,2])    
data['RNA EOT'] = pd.cut(x=data['RNA EOT'], bins=[0,5,808450], labels=[1,2])    
data['RNA EF'] = pd.cut(x=data['RNA EF'], bins=[0,5,810333], labels=[1,2])    # corrected maximum from 808450 in discretization description

# conditional binning for HGB by Gender. Each gender row is placed in a gender conditional dataframe
male_df = data[data['Gender']==1]
male_df['HGB'] = pd.cut(x=male_df['HGB'], bins=[2,14,17.5,20], labels=[1,2,3])
#print(male_df)
female_df = data[data['Gender']==2]
female_df['HGB'] = pd.cut(x=female_df['HGB'], bins=[2,12.3,15.3,20], labels=[1,2,3])
#print(female_df)
#merges male and female selections into a new dataframe
data = pd.merge(male_df, female_df, how='outer')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [211]:
# converting all datatypes to float because most are of a 'category' type (Robert)
# source: https://stackoverflow.com/questions/44369504/how-to-convert-entire-dataframe-values-to-float-in-pandas
data = data.apply(pd.to_numeric, errors='coerce')
print(data)

     Age   Gender  BMI  Fever  Nausea/Vomting  Headache   Diarrhea   \
0      45       1    3      2               1          2          2   
1      34       1    2      1               2          1          1   
2      58       1    4      2               1          1          1   
3      61       1    4      1               2          2          2   
4      56       1    3      1               2          2          2   
..    ...     ...  ...    ...             ...        ...        ...   
850    59       2    4      2               2          2          1   
851    43       2    2      1               2          2          1   
852    36       2    4      1               2          2          1   
853    47       2    4      2               2          2          2   
854    52       2    2      1               2          1          1   

     Fatigue & generalized bone ache   Jaundice   Epigastric pain   ...  \
0                                   1          1                 2  ... 

In [212]:
#seperate X and y
X = data.iloc[:,:-1]
y = data.iloc[:,-1]

#Calculate biomarker scores
fib4 = (X['Age ']*X['AST 1'])/(X['Plat']*np.sqrt(X['ALT 1']))
modapri = (X['Age ']*X['AST 1'])/(X['Plat']*100)
liverDam = X['AST 1']/X['ALT 1']

''' 
#Feature selection Age, Gender, BMI  # These are not enough features in this scope, so removing it (Robert)
ModX = X.iloc[:,0:3]
'''
ModX = X.iloc[:,:-1]

#Select ALT scores over time
ModX['ALT 1'] = X['ALT 1']
ModX['ALT4'] = X['ALT4']
ModX['ALT 24'] = X['ALT 24']
ModX['ALT 48'] = X['ALT 48']

ModX['AST 1'] = X['AST 1']

ModX['fib4'] = fib4
ModX['modapri'] = modapri
ModX['liverDam'] = liverDam

In [213]:
#Transform y to only focus on F4 (Alexander) ## Binary targeting (Robert)

for i in range(len(y)):
    current = y.iloc[i]
    if current != 1:
        y.iloc[i] = 0
        
print(y.head(5))


0    0
1    0
2    1
3    0
4    0
Name: Baselinehistological staging, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [214]:
##--- Descritization found in literature (Alexander) 
#Apply discretization to calculated ratios

#Modify apri ratio
for i in range(len(X)):
    current = ModX.iloc[i,-2]
    if current > 0.5 :   # literature says 0.7,  mathematical model says 0.5 (Selina)
        ModX.iloc[i,-2] = 1
    else:
        ModX.iloc[i,-2] = 0

#Modify liverdam ratio
for i in range(len(X)):
    current = ModX.iloc[i,-1]
    if current >= 1 :
        ModX.iloc[i,-1] = 1
    else:
        ModX.iloc[i,-1] = 0

#Modify fib4  ratio
for i in range(len(X)):
    current = ModX.iloc[i,-3]
    if current < 1.45 :
        ModX.iloc[i,-3] = 0
    else:
        ModX.iloc[i,-3] = 1
        
print(ModX.head(5))


   Age   Gender  BMI  Fever  Nausea/Vomting  Headache   Diarrhea   \
0    45       1    3      2               1          2          2   
1    34       1    2      1               2          1          1   
2    58       1    4      2               1          1          1   
3    61       1    4      1               2          2          2   
4    56       1    3      1               2          2          2   

   Fatigue & generalized bone ache   Jaundice   Epigastric pain   ...  ALT 36  \
0                                 1          1                 2  ...       3   
1                                 2          2                 1  ...       3   
2                                 2          1                 1  ...       3   
3                                 1          1                 2  ...       3   
4                                 2          2                 2  ...       3   

   ALT 48  ALT after 24 w  RNA Base  RNA 4  RNA 12  RNA EOT  fib4  modapri  \
0       3           

In [215]:
#Split data into training and test data # switching from 0.2 to 0.5 test size (Robert)
X_train, X_test, y_train, y_test = train_test_split(ModX, y, test_size = 0.60, random_state = 0) 

#Feature scale so that one feature doesn't have more influence than another
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [216]:
X_train

array([[-0.18082818,  0.9883717 ,  0.02168284, ...,  0.        ,
         0.        ,  0.10878566],
       [ 1.11001741,  0.9883717 , -1.21423929, ...,  0.        ,
         0.        ,  0.10878566],
       [ 0.52326941,  0.9883717 ,  0.02168284, ...,  0.        ,
         0.        ,  0.10878566],
       ...,
       [ 0.87531821,  0.9883717 ,  1.25760498, ...,  0.        ,
         0.        ,  0.10878566],
       [-1.70637297,  0.9883717 , -1.21423929, ...,  0.        ,
         0.        ,  0.10878566],
       [-0.41552738,  0.9883717 ,  0.02168284, ...,  0.        ,
         0.        ,  0.10878566]])

In [217]:
# Use Bayesian Optimization?
#Here we are going to make a simple SVM classifier
classifier = svm.NuSVC(nu = 0.3, kernel='rbf')
#Carry out cross validation 10 fold and find the mean of the score to get overall accuracy for train data
scores = cross_val_score(classifier, X_train, y_train, cv=5) # set 'cv' to 5 to increase sample size
print(scores)
print(scores.mean())

[0.64285714 0.63235294 0.67647059 0.63235294 0.63235294]
0.6432773109243697




In [218]:
#train the classifier on train set and then print out its overall accuracy for the train data
classifier.fit(X_train, y_train)

print(classifier.score(X_train,y_train))

0.9649122807017544




In [219]:
#Determine the overall accuracy score of the test data set
print(classifier.score(X_test,y_test))

0.6432748538011696


In [220]:
#Making predictions
y_pred = classifier.predict(X_test)
#To evaluate algorithm we will print the confusion matrix and other metrics
print("The confusion matrix is:")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

The confusion matrix is:
[[306  79]
 [104  24]]
              precision    recall  f1-score   support

           0       0.75      0.79      0.77       385
           1       0.23      0.19      0.21       128

    accuracy                           0.64       513
   macro avg       0.49      0.49      0.49       513
weighted avg       0.62      0.64      0.63       513



In [221]:
#Training with k-nearest neighbor (KNN)
classifier2 = KNeighborsClassifier(n_neighbors = 4 )
classifier2.fit(X_train, y_train)
#Making predictions
y_pred = classifier2.predict(X_test)
#To evaluate algorithm we will print the confusion matrix and other metrics
print("The confusion matrix is:")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

The confusion matrix is:
[[367  18]
 [125   3]]
              precision    recall  f1-score   support

           0       0.75      0.95      0.84       385
           1       0.14      0.02      0.04       128

    accuracy                           0.72       513
   macro avg       0.44      0.49      0.44       513
weighted avg       0.60      0.72      0.64       513



In [222]:
#Decision trees
#Here we are going to make a simple decision classifier
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
print(clf.score(X_train,y_train))
#Determine the overall accuracy score of the test data set
print(clf.score(X_test,y_test))
#Making predictions
y_pred = clf.predict(X_test)
#To evaluate algorithm we will print the confusion matrix and other metrics
print("The confusion matrix is:")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

1.0
0.6237816764132553
The confusion matrix is:
[[291  94]
 [ 99  29]]
              precision    recall  f1-score   support

           0       0.75      0.76      0.75       385
           1       0.24      0.23      0.23       128

    accuracy                           0.62       513
   macro avg       0.49      0.49      0.49       513
weighted avg       0.62      0.62      0.62       513

