# Applying Machine Learning to a Hepatitis C Egyptian Cohort Dataset for Predicting the Disease Stage - Experiment 6

## Two-class prediction
* [F1 vs. F4](#first-bullet) &nbsp;(<font color="red">F2=F3=0, F1=1 and F4=4</font>)
* [F1 vs. F3+F4](#second-bullet) &nbsp;(<font color="red">F2=0, F1=1 and F3+F4=4</font>)
* [F1 vs. F2+F3+F4](#third-bullet) &nbsp;(<font color="red">F1=1 and F2+F3+F4=4</font>)
* [F1 vs. F2+F3](#fourth-bullet) &nbsp;(<font color="red">F4=0, F1=1 and F2+F3=2</font>)

In [None]:
import import_ipynb
import warnings
import pandas as pd
warnings.filterwarnings("ignore")

In [None]:
from CommonUtilsHCV import *

In [None]:
# Smoke-test call. test_connection is not defined in this notebook; it is
# presumably supplied by the CommonUtilsHCV star-import — TODO confirm.
test_connection(conn="Connected")

#### Dataset downloaded from https://archive.ics.uci.edu/ml/machine-learning-databases/00503/

In [None]:
# Load the cohort spreadsheet into a DataFrame. The path is relative, so it is
# resolved against the notebook's current working directory.
data = pd.read_excel(r"HCV_Egy_data_for_loading.xlsx")

In [None]:
# data.shape is a (rows, columns) tuple, so this prints both counts rather than
# only the number of observations that the label mentions.
print("number of observations in data:", " ", data.shape)

In [None]:
# Row count per BHS value, listed without ordering by frequency.
data["BHS"].value_counts(sort=False)

## F1 vs. F4  <a class="anchor" id="first-bullet"></a>

In [None]:
# F1 vs. F4 subset: remove the stage-3 rows first, then the stage-2 rows.
data_tmp1 = data[data["BHS"].ne(3)]
data_tmp = data_tmp1[data_tmp1["BHS"].ne(2)]

In [None]:
# Remaining rows per BHS value after the filter, in unsorted order.
data_tmp["BHS"].value_counts(sort=False)

In [None]:
# Split data for training and testing
# Start from an empty list; it is rebound below once the split exists.
data_lists = []
# standard_scaler is defined outside this notebook; its result is unpacked as a
# (features, target) pair.
# NOTE(review): if it fits a scaler on the whole frame before the split,
# test-set statistics leak into training — confirm.
X,y = standard_scaler(dataframe=data_tmp)
# 70/30 split; the fixed random_state keeps the partition the same across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=20)

# Order matters to later cells: train features, train labels, test features,
# test labels (indices 0-3).
data_lists = [X_train, y_train, X_test, y_test]
# Distinct class labels present in the training target.
label_names = y_train.unique()

In [None]:
# get_all_the_best_values is defined outside this notebook (presumably in
# CommonUtilsHCV). Later cells index its result by position and read the
# 'best_param' entry of each element.
models_result = get_all_the_best_values(data_lists=data_lists, experiment_name="Experiment_61")

In [None]:
from sklearn.ensemble import VotingClassifier

# (label, position in models_result) for every model that takes part in the
# vote. Left out of this ensemble: lr (0), nb (1), svm (6) and nn (7).
# NOTE(review): four voters can tie under hard voting — confirm that leaving
# svm out of this run is intended.
voters = (("dt", 2), ("rf", 3), ("xgb", 4), ("knn", 5))
estimators = [(label, models_result[pos]["best_param"]) for label, pos in voters]

# Majority ("hard") vote over the selected estimators.
ensemble = VotingClassifier(estimators, voting="hard")

In [None]:
# Fit the voting ensemble on the training features (index 0) and training
# labels (index 1) of data_lists.
ensemble.fit(data_lists[0], data_lists[1])

In [None]:
# Predict class labels for the held-out test features (index 2 of data_lists).
pred = ensemble.predict(data_lists[2])

In [None]:
# Compare the predictions with the test labels (index 3 of data_lists).
# evaluate_metrics_2d is defined outside this notebook.
evaluate_metrics_2d(y_pred=pred, y_test=data_lists[3])

## F1 vs. F3+F4   <a class="anchor" id="second-bullet"></a>

In [None]:
# F1 vs. F3+F4 subset: drop the stage-2 rows, then relabel stage 1 as 1 and the
# remaining stages (3 and 4) as 4.
# The explicit copy makes data_tmp an independent frame, so the column
# assignment below is not a write into a filtered slice of `data`.
data_tmp = data.loc[data["BHS"] != 2].copy()
# With stage 2 removed and integer stages 1-4, "== 1" selects the same rows as
# the former "<= 2" test and states the intent directly.
data_tmp["BHS"] = np.where(data_tmp["BHS"] == 1, 1, 4)

In [None]:
# Rows per relabelled class (1 vs. 4), in unsorted order.
data_tmp["BHS"].value_counts(sort=False)

In [None]:
# Balance the two classes by UPSAMPLING the minority class (label 1): its rows
# are drawn with replacement until there are as many as in the majority class
# (label 4). No majority rows are discarded.
df_majority = data_tmp[data_tmp["BHS"] == 4]
df_minority = data_tmp[data_tmp["BHS"] == 1]

# The target size is taken from the data instead of a hard-coded 717, so the
# classes stay balanced if the input file changes.
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority),
    random_state=123,
)

# Majority rows plus the upsampled minority rows. The name df_downsampled is
# kept because a later cell reads it, even though nothing is downsampled.
# NOTE(review): the train/test split happens after this step, so duplicated
# minority rows can end up on both sides of the split.
df_downsampled = pd.concat([df_majority, df_minority_upsampled])

# Display new class counts
df_downsampled.BHS.value_counts()

In [None]:
# Continue with an independent copy of the class-balanced frame.
data_tmp = df_downsampled.copy()

In [None]:
# Class sizes after balancing, in unsorted order.
data_tmp["BHS"].value_counts(sort=False)

In [None]:
# Split data for training and testing
# Start from an empty list; it is rebound below once the split exists.
data_lists = []
# standard_scaler is defined outside this notebook; its result is unpacked as a
# (features, target) pair.
# NOTE(review): when the cells run top to bottom, data_tmp here is the frame
# that was balanced by resampling with replacement, so identical rows can land
# in both the training and the test partition — confirm this is acceptable.
X,y = standard_scaler(dataframe=data_tmp)
# 70/30 split; the fixed random_state keeps the partition the same across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=20)

# Order matters to later cells: train features, train labels, test features,
# test labels (indices 0-3).
data_lists = [X_train, y_train, X_test, y_test]
# Distinct class labels present in the training target.
label_names = y_train.unique()

In [None]:
# get_all_the_best_values is defined outside this notebook (presumably in
# CommonUtilsHCV). Later cells index its result by position and read the
# 'best_param' entry of each element.
models_result = get_all_the_best_values(data_lists=data_lists, experiment_name="Experiment_62")

In [None]:
from sklearn.ensemble import VotingClassifier

# (label, position in models_result) for every model that takes part in the
# vote. Left out of this ensemble: lr (0), nb (1) and nn (7).
voters = (("dt", 2), ("rf", 3), ("xgb", 4), ("knn", 5), ("svm", 6))
estimators = [(label, models_result[pos]["best_param"]) for label, pos in voters]

# Majority ("hard") vote over the selected estimators.
ensemble = VotingClassifier(estimators, voting="hard")

In [None]:
# Fit the voting ensemble on the training features (index 0) and training
# labels (index 1) of data_lists.
ensemble.fit(data_lists[0], data_lists[1])

In [None]:
# Predict class labels for the held-out test features (index 2 of data_lists).
pred = ensemble.predict(data_lists[2])

In [None]:
# Compare the predictions with the test labels (index 3 of data_lists).
# evaluate_metrics_2d is defined outside this notebook.
evaluate_metrics_2d(y_pred=pred, y_test=data_lists[3])

## F1 vs. F2+F3+F4  <a class="anchor" id="third-bullet"></a>

In [None]:
# F1 vs. F2+F3+F4: keep every row; stage 1 stays 1 and every other stage
# becomes 4. assign() returns a new frame, so `data` itself is left untouched.
data_tmp = data.assign(BHS=np.where(data["BHS"].eq(1), 1, 4))

In [None]:
# Rows per relabelled class (1 vs. 4), in unsorted order.
data_tmp["BHS"].value_counts(sort=False)

In [None]:
# Balance the two classes by UPSAMPLING the minority class (label 1): its rows
# are drawn with replacement until there are as many as in the majority class
# (label 4). No majority rows are discarded.
df_majority = data_tmp[data_tmp["BHS"] == 4]
df_minority = data_tmp[data_tmp["BHS"] == 1]

# The target size is taken from the data instead of a hard-coded 1049, so the
# classes stay balanced if the input file changes.
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority),
    random_state=123,
)

# Majority rows plus the upsampled minority rows. The name df_downsampled is
# kept because a later cell reads it, even though nothing is downsampled.
# NOTE(review): the train/test split happens after this step, so duplicated
# minority rows can end up on both sides of the split.
df_downsampled = pd.concat([df_majority, df_minority_upsampled])

# Display new class counts
df_downsampled.BHS.value_counts()

In [None]:
# Continue with an independent copy of the class-balanced frame.
data_tmp = df_downsampled.copy()

In [None]:
# Class sizes after balancing, in unsorted order.
data_tmp["BHS"].value_counts(sort=False)

In [None]:
# Split data for training and testing
# Start from an empty list; it is rebound below once the split exists.
data_lists = []
# standard_scaler is defined outside this notebook; its result is unpacked as a
# (features, target) pair.
# NOTE(review): when the cells run top to bottom, data_tmp here is the frame
# that was balanced by resampling with replacement, so identical rows can land
# in both the training and the test partition — confirm this is acceptable.
X,y = standard_scaler(dataframe=data_tmp)
# 70/30 split; the fixed random_state keeps the partition the same across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=20)

# Order matters to later cells: train features, train labels, test features,
# test labels (indices 0-3).
data_lists = [X_train, y_train, X_test, y_test]
# Distinct class labels present in the training target.
label_names = y_train.unique()

In [None]:
# get_all_the_best_values is defined outside this notebook (presumably in
# CommonUtilsHCV). Later cells index its result by position and read the
# 'best_param' entry of each element.
# NOTE(review): the two earlier runs are named "Experiment_61" and
# "Experiment_62"; confirm that "Experiment_6c" (not "Experiment_63") is meant.
models_result = get_all_the_best_values(data_lists=data_lists, experiment_name="Experiment_6c")
# get_the_best_values_KNN(data_lists)

In [None]:
# KNeighborsClassifier(algorithm='auto', leaf_size=1, metric='minkowski',
#                       metric_params=None, n_jobs=None, n_neighbors=1, p=1,
#                       weights='uniform')

In [None]:
# XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
#                colsample_bynode=1, colsample_bytree=1, gamma=0,
#                learning_rate=0.075, max_delta_step=0, max_depth=9,
#                min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
#                nthread=1, number_of_estimators=1, objective='binary:logistic',
#                random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
#                seed=None, silent=None, subsample=1, verbosity=1)

In [None]:
# RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
#                         max_depth=8, max_features='auto', max_leaf_nodes=None,
#                         min_impurity_decrease=0.0, min_impurity_split=None,
#                         min_samples_leaf=1, min_samples_split=2,
#                         min_weight_fraction_leaf=0.0, n_estimators=130,
#                         n_jobs=None, oob_score=True, random_state=None,
#                         verbose=0, warm_start=False)

In [None]:
from sklearn.ensemble import VotingClassifier

# (label, position in models_result) for every model that takes part in the
# vote. Left out of this ensemble: lr (0), nb (1) and nn (7).
voters = (("dt", 2), ("rf", 3), ("xgb", 4), ("knn", 5), ("svm", 6))
estimators = [(label, models_result[pos]["best_param"]) for label, pos in voters]

# Majority ("hard") vote over the selected estimators.
ensemble = VotingClassifier(estimators, voting="hard")

In [None]:
# Fit the voting ensemble on the training features (index 0) and training
# labels (index 1) of data_lists.
ensemble.fit(data_lists[0], data_lists[1])

In [None]:
# Predict class labels for the held-out test features (index 2 of data_lists).
pred = ensemble.predict(data_lists[2])

In [None]:
# Compare the predictions with the test labels (index 3 of data_lists).
# evaluate_metrics_2d is defined outside this notebook.
evaluate_metrics_2d(y_pred=pred, y_test=data_lists[3])

### LIME

In [None]:
#!pip install lime

In [None]:
import lime
import lime.lime_tabular
from lime.lime_tabular import LimeTabularExplainer
from matplotlib import pyplot as plt

In [None]:
# Columns of data_tmp that are used as features for the LIME explanation.
feature_names = ["Age","AST1", "ALT1", "Plat", "BMI"]

In [None]:
# Import the submodule explicitly: a bare `import sklearn` does not by itself
# guarantee that sklearn.model_selection has been loaded.
import sklearn.model_selection

# Separate 70/30 split for the LIME section. The feature columns are taken
# straight from data_tmp rather than from the standard_scaler output.
# random_state is fixed (same seed as the other splits in this notebook) so
# that the positional row numbers explained further down refer to the same
# test instances on every run.
train, test, labels_train, labels_test = sklearn.model_selection.train_test_split(
    data_tmp[feature_names],
    data_tmp["BHS"],
    train_size=0.70,
    test_size=0.30,
    random_state=20,
)

In [None]:
# rf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
#                         max_depth=8, max_features='auto', max_leaf_nodes=None,
#                         min_impurity_decrease=0.0, min_impurity_split=None,
#                         min_samples_leaf=1, min_samples_split=2,
#                         min_weight_fraction_leaf=0.0, n_estimators=130,
#                         n_jobs=None, oob_score=True, random_state=None,
#                         verbose=0, warm_start=False)
# # rf = RandomForestClassifier()
# rf.fit(train, labels_train)

In [None]:
# XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
#               colsample_bynode=1, colsample_bytree=1, gamma=0,
#               learning_rate=0.075, max_delta_step=0, max_depth=9,
#               min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
#               nthread=1, number_of_estimators=1, objective='binary:logistic',
#               random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
#               seed=None, silent=None, subsample=1, verbosity=1)

In [None]:
# Single-nearest-neighbour classifier using the Minkowski metric with p=1,
# fitted on the LIME training split. The settings mirror the commented-out
# KNeighborsClassifier parameters recorded earlier in this notebook.
knn_params = {
    "algorithm": "auto",
    "leaf_size": 1,
    "metric": "minkowski",
    "metric_params": None,
    "n_jobs": None,
    "n_neighbors": 1,
    "p": 1,
    "weights": "uniform",
}
knn = KNeighborsClassifier(**knn_params)
knn.fit(train, labels_train)

In [None]:
# def prob(data1):
#     return rf.predict_proba(data1)

In [None]:
# Tabular LIME explainer built from the raw training matrix; labels are cast
# to int and the two classes are named after their label values (1 and 4).
explainer = LimeTabularExplainer(
    training_data=train.values,
    mode="classification",
    training_labels=labels_train.astype(int).values,
    feature_names=feature_names,
    class_names=[1, 4],
)

In [None]:
# Positional indices into `test` of the instances to explain.
# Other selections used before: [20, 1, 3, 300, 109, 25, 69],
# [575, 36, 52, 227, 29, 122, 38, 1, 21, 4] and np.arange(600, 620, 1).tolist().
rows = [1, 300, 20, 3]
for row in rows:
    print(row)
    # explain_instance is documented to take a 1-D numpy array, so hand it the
    # row's values instead of the pandas Series returned by iloc.
    instance = test.iloc[row].values
    # Swap in predict_fn_xgb for knn.predict_proba to explain the XGBoost model.
    exp_all = explainer.explain_instance(instance, knn.predict_proba, num_features=5)
    exp_all.show_in_notebook(show_table=True)
    print(exp_all.as_list())

In [None]:
# Display the labels of the LIME test split for inspection.
labels_test

## Interpreting the XGB Model

### First run the previous experiment to get the test and train sets

In [None]:
import eli5
from eli5.sklearn import PermutationImportance

# Reuse the split produced by the most recently run experiment:
# data_lists = [train features, train labels, test features, test labels].
X_trn = data_lists[0]
y_trn = data_lists[1]
X_tst = data_lists[2]
y_tst = data_lists[3]

# Hyper-parameter grid for XGBoost. The tree-count key must be "n_estimators";
# the former "number_of_estimators" is not an XGBoost parameter, so that grid
# dimension had no effect on the fitted models.
# NOTE(review): with the key corrected, the search really does use only 1-2
# trees at learning_rate 0.005 — widen these ranges if a stronger model is
# wanted.
param_grid = {
    "max_depth": [2, 3],
    "n_estimators": [1, 2],
    "learning_rate": [0.005],
}
xgb = XGBClassifier()
clf = GridSearchCV(estimator=xgb, param_grid=param_grid)
# fit() returns the search object itself, so model_xgb is the fitted
# GridSearchCV; the winning model is available as model_xgb.best_estimator_.
model_xgb = clf.fit(X_trn, y_trn)

In [None]:
# Fit a default-parameter XGBoost classifier and compute eli5 permutation
# importances for it. Both fits use the training split (X_trn, y_trn).
# Alternative estimators that can be swapped in the same way:
# DecisionTreeClassifier, RandomForestClassifier, KNeighborsClassifier, SVC.
xgb = XGBClassifier()
model = xgb.fit(X_trn, y_trn)
perm = PermutationImportance(xgb, random_state=1)
perm = perm.fit(X_trn, y_trn)

In [None]:
# Display the permutation importances computed in the previous cell, one value
# per feature column.
# imp_df(X_trn.columns, perm.feature_importances_)
# "Age","AST1", "ALT1", "Plat", "BMI"
perm.feature_importances_

### Try another means of feature attribution - SHAP

In [None]:
# conda install -c conda-forge shap

In [None]:
import shap