In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedStratifiedKFold, cross_validate
from sklearn.metrics import  recall_score,roc_auc_score, roc_curve
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from sklearn.tree import DecisionTreeClassifier
import warnings
import itertools
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# Load the dataset; the first CSV column is used as the row index.
df = pd.read_csv('data.csv',index_col=0)

# Cleaning
By inspecting the values in the different columns, we noticed some irregular values, especially in the gender and diff_sym_hos columns.

In [3]:
# Negative hour values are replaced by their magnitude.
# The column is still addressed by position (index 12), as before
# (presumably `diff_sym_hos` — TODO confirm). A single vectorised .abs()
# replaces the former per-row Python loop of scalar iloc writes.
hours_col = df.columns[12]
df[hours_col] = df[hours_col].abs()

# Gender value 2 is overwritten with 1 (described as the most common gender;
# the value 1 is hard-coded rather than computed from the data).
df['gender'] = df['gender'].mask(df['gender'] == 2, 1)

# Helper Methods
These methods perform common operations such as encoding the data and splitting the dataframe into training, validation, and test sets.

In [4]:
def split(df):
    """Split `df` into stratified 70% train / 15% validation / 15% test sets.

    The 'result' column is the target; every other column is a feature.
    Both cuts are shuffled, stratified on the target and use random_state=42.

    Returns:
        X_train, X_validate, X_test, y_train, y_validate, y_test
    """
    features = df.drop('result', axis=1)
    target = df['result']

    # First cut: 70% for training, 30% held out.
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        features, target,
        test_size=0.3, shuffle=True, random_state=42, stratify=target,
    )

    # Second cut: halve the hold-out into validation and test (15% each).
    X_validate, X_test, y_validate, y_test = train_test_split(
        X_holdout, y_holdout,
        test_size=0.5, shuffle=True, random_state=42, stratify=y_holdout,
    )
    return X_train, X_validate, X_test, y_train, y_validate, y_test

In [5]:
def encode(df, columns):
    """Return a copy of `df` with each column in `columns` one-hot encoded.

    For every listed column the indicator columns (named `<column>_<value>`)
    are appended at the end of the frame and the source column is removed.
    The input frame is left unmodified.
    """
    result = df.copy()
    for name in columns:
        indicators = pd.get_dummies(df[name], prefix=name)
        result = result.join(indicators).drop(name, axis=1)
    return result


This method takes the training and validation data, performs hyperparameter tuning for the SVM, and returns the candidate model with the best validation recall.

In [6]:
def svm_validate(X_train, y_train, X_validate, y_validate):
    """Grid-search an RBF-kernel SVM and return the fitted pipeline with the best recall.

    Every candidate is a Pipeline of StandardScaler followed by SVC. It is
    fitted on (X_train, y_train) and scored with recall_score on
    (X_validate, y_validate).

    Returns:
        The fitted Pipeline with the highest validation recall. On ties the
        first candidate in grid order (C outer, gamma inner) is kept.
    """
    # Regularization parameter. The strength of the regularization
    # is inversely proportional to C
    c_values = [0.1, 1, 10, 100, 1000]

    # Kernel coefficient for 'rbf'
    gamma_values = [1, 0.1, 0.01, 0.001, 0.0001]

    # `degree` is deliberately not searched: SVC uses the 'rbf' kernel by
    # default and `degree` only affects the 'poly' kernel, so iterating over
    # ten degrees refitted the same model ten times per (C, gamma) pair.
    best_clf, best_recall = None, -1.0
    for c, gamma in itertools.product(c_values, gamma_values):

        # Perform scaling on the data then fit the model
        candidate = Pipeline([
            ("scaler", StandardScaler()),
            ("svm_clf", SVC(C=c, gamma=gamma, random_state=2))
        ])
        candidate.fit(X_train, y_train)
        recall = recall_score(y_validate, candidate.predict(X_validate))

        # Strict '>' keeps the first of several equally good candidates.
        if recall > best_recall:
            best_clf, best_recall = candidate, recall
    return best_clf

This method takes the training and validation data, performs hyperparameter tuning for the decision tree (DT), and returns the candidate model with the best validation recall. More hyperparameters (such as max_depth) were tested, but they resulted in worse metrics and were therefore removed.

In [7]:
def DT_validate(X_train, y_train, X_validate, y_validate):
    """Grid-search a decision tree and return the fitted tree with the best recall.

    Every combination of criterion, splitter, class_weight and max_features is
    fitted on (X_train, y_train) with random_state=2 and scored with
    recall_score on (X_validate, y_validate). The first tree reaching the
    highest recall is returned.
    """
    # NOTE(review): range(1, n_features) stops at n_features - 1, so
    # considering all features at a split is never tried — confirm intended.
    grid = itertools.product(
        ['entropy', 'gini'],            # function measuring the quality of a split
        ['best', 'random'],             # strategy used to choose the split at each node
        ['balanced', None],             # weights associated with classes
        range(1, X_train.shape[1]),     # number of features considered per split
    )

    fitted_trees, recalls = [], []
    for crit, split_strategy, weight, n_feats in grid:
        tree = DecisionTreeClassifier(
            criterion=crit,
            splitter=split_strategy,
            class_weight=weight,
            max_features=n_feats,
            random_state=2,
        )
        tree.fit(X_train, y_train)
        fitted_trees.append(tree)
        recalls.append(recall_score(y_validate, tree.predict(X_validate)))

    # idxmax yields the position of the first maximum recall.
    return fitted_trees[pd.Series(recalls).idxmax()]
    

# SVM

## Without Encoding
First we attempt to fit the model without any changes to act as a baseline.

In [8]:
# Baseline: split the cleaned frame as-is (no encoding, no feature removal).
X_train, X_validate, X_test, y_train, y_validate, y_test = split(df)
# Keep the SVM pipeline with the best recall on the validation set.
best_svm = svm_validate(X_train, y_train, X_validate, y_validate)
print(best_svm)
# Report the selected model's metrics on the held-out test set.
print(classification_report(y_test, best_svm.predict(X_test)))

Pipeline(steps=[('scaler', StandardScaler()),
                ('svm_clf', SVC(C=10, degree=1, gamma=0.1, random_state=2))])
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       114
           1       0.81      0.81      0.81        16

    accuracy                           0.95       130
   macro avg       0.89      0.89      0.89       130
weighted avg       0.95      0.95      0.95       130



## Encoding
We then encode some of the categorical data in the dataset to improve the metrics.

In [9]:
# One-hot encode the listed columns, then repeat the split / search / evaluate steps.
cols = ['location', 'country', 'symptom1', 'symptom2', 'symptom3', 'symptom4', 'symptom5', 'symptom6']
df_encoded = encode(df, cols)
X_train, X_validate, X_test, y_train, y_validate, y_test = split(df_encoded)
best_svm = svm_validate(X_train, y_train, X_validate, y_validate)
print(best_svm)
# Metrics of the selected model on the held-out test set.
print(classification_report(y_test, best_svm.predict(X_test)))

Pipeline(steps=[('scaler', StandardScaler()),
                ('svm_clf', SVC(C=100, degree=1, gamma=0.001, random_state=2))])
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       114
           1       1.00      0.75      0.86        16

    accuracy                           0.97       130
   macro avg       0.98      0.88      0.92       130
weighted avg       0.97      0.97      0.97       130



## Feature selection
Here we experimentally select features. After some experiments this is the best result. Some features were removed based on their low correlation with the data while others were removed after experimentally testing that their removal improved recall.

In [35]:
# Four symptom columns are removed outright; the listed remaining columns are one-hot encoded.
dropped_cols = ['symptom6', 'symptom5', 'symptom4','symptom2']
encoded_cols = ['country','location', 'symptom1', 'symptom3']
df_encoded = encode(df.drop(dropped_cols, axis=1), encoded_cols)
X_train, X_validate, X_test, y_train, y_validate, y_test = split(df_encoded)
best_svm = svm_validate(X_train, y_train, X_validate, y_validate)
print(best_svm)
# Metrics of the selected model on the held-out test set.
print(classification_report(y_test, best_svm.predict(X_test)))

Pipeline(steps=[('scaler', StandardScaler()),
                ('svm_clf', SVC(C=100, degree=1, gamma=0.001, random_state=2))])
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       114
           1       0.93      0.88      0.90        16

    accuracy                           0.98       130
   macro avg       0.96      0.93      0.95       130
weighted avg       0.98      0.98      0.98       130



## Bagging with Validation
In this section, we utilise a bagging classifier to attempt to improve the recall.

In [11]:
# Grid-search bagged RBF-kernel SVMs, scoring every ensemble by recall on the validation set.
# NOTE(review): X_train / y_train / X_validate / y_validate are whatever the previously
# executed cell left in the kernel — confirm which split this search is meant to use.
dic = {'clf':[], 'recall':[]}
c_values = [0.1, 1, 10, 100, 1000]
gamma_values = [1, 0.1, 0.01, 0.001, 0.0001]

# `degree` is not part of the grid: SVC defaults to the 'rbf' kernel, which ignores it,
# so looping over ten degrees only refitted identical ensembles.
parameters = list(itertools.product(c_values, gamma_values))
for c, gamma in parameters:
    rbf_kernel_svm_clf = Pipeline([
        ("scaler", StandardScaler()),
        ("svm_clf", SVC(C=c, gamma=gamma, random_state=2))
    ])
    # The base estimator is passed positionally: its keyword was renamed from
    # `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
    # removed in 1.4, whereas the first positional slot works on both.
    bagging_clf = BaggingClassifier(rbf_kernel_svm_clf, n_estimators=10, random_state=2)
    bagging_clf.fit(X_train, y_train)
    y_pred = bagging_clf.predict(X_validate)
    recall = recall_score(y_validate, y_pred)
    dic['clf'].append(bagging_clf)
    dic['recall'].append(recall)

# One row per candidate: column 0 holds the fitted ensemble, column 1 its validation recall.
df_scores = pd.DataFrame(dic)

In [12]:
# Take the ensemble (column 0) from the row with the highest validation recall.
bagging_clf = df_scores.iloc[df_scores['recall'].idxmax(), 0]
print(bagging_clf)
# Metrics of the selected ensemble on the held-out test set.
print(classification_report(y_test, bagging_clf.predict(X_test)))

BaggingClassifier(base_estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                                 ('svm_clf',
                                                  SVC(C=10, degree=1,
                                                      gamma=0.01,
                                                      random_state=2))]),
                  random_state=2)
              precision    recall  f1-score   support

           0       0.95      0.99      0.97       114
           1       0.91      0.62      0.74        16

    accuracy                           0.95       130
   macro avg       0.93      0.81      0.86       130
weighted avg       0.94      0.95      0.94       130



## Over sampling
Since the '1' class is under-represented, we use a random oversampler to balance the training data by duplicating existing samples from the '1' class.

In [32]:
# Five symptom columns are removed; no one-hot encoding is applied in this experiment.
dropped_cols = ['symptom6', 'symptom5', 'symptom4', 'symptom3', 'symptom2']
df_dropped = df.drop(dropped_cols, axis=1)
ros = RandomOverSampler(random_state=42)
X_train, X_validate, X_test, y_train, y_validate, y_test = split(df_dropped)
# Only the training split is resampled; the validation and test splits are left as split.
X_train, y_train= ros.fit_resample(X=X_train, y=y_train)
best_svm = svm_validate(X_train, y_train, X_validate, y_validate)
print(best_svm)
# Metrics of the selected model on the held-out test set.
print(classification_report(y_test, best_svm.predict(X_test)))

Pipeline(steps=[('scaler', StandardScaler()),
                ('svm_clf', SVC(C=0.1, degree=1, gamma=0.1, random_state=2))])
              precision    recall  f1-score   support

           0       0.98      0.86      0.92       114
           1       0.47      0.88      0.61        16

    accuracy                           0.86       130
   macro avg       0.72      0.87      0.76       130
weighted avg       0.92      0.86      0.88       130



## Using a Bagging Classifier on Oversampled Data

In [25]:
# Bagged RBF-kernel SVMs trained on a randomly oversampled training split.
# Five symptom columns are removed and nothing is one-hot encoded (the former
# `encode(..., [])` call encoded no columns, so a plain drop is equivalent).
dropped_cols = ['symptom6', 'symptom5', 'symptom4', 'symptom3', 'symptom2']
df_dropped = df.drop(dropped_cols, axis=1)

ros = RandomOverSampler(random_state=42)
X_train, X_validate, X_test, y_train, y_validate, y_test = split(df_dropped)
# Only the training split is resampled; the validation and test splits are left as split.
X_train, y_train = ros.fit_resample(X=X_train, y=y_train)

dic = {'clf':[], 'recall':[]}
c_values = [0.1, 1, 10, 100, 1000]
gamma_values = [1, 0.1, 0.01, 0.001, 0.0001]

# `degree` is not part of the grid: SVC defaults to the 'rbf' kernel, which ignores it,
# so looping over ten degrees only refitted identical ensembles.
parameters = list(itertools.product(c_values, gamma_values))
for c, gamma in parameters:
    rbf_kernel_svm_clf = Pipeline([
        ("scaler", StandardScaler()),
        ("svm_clf", SVC(C=c, gamma=gamma, random_state=2))
    ])
    # The base estimator is passed positionally: its keyword was renamed from
    # `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
    # removed in 1.4, whereas the first positional slot works on both.
    bagging_clf = BaggingClassifier(rbf_kernel_svm_clf, n_estimators=10, random_state=2)
    bagging_clf.fit(X_train, y_train)
    y_pred = bagging_clf.predict(X_validate)
    recall = recall_score(y_validate, y_pred)
    dic['clf'].append(bagging_clf)
    dic['recall'].append(recall)

df_scores = pd.DataFrame(dic)
# Take the ensemble (column 0) from the row with the highest validation recall.
bagging_clf = df_scores.iloc[df_scores['recall'].idxmax(), 0]
print(bagging_clf)
# Metrics of the selected ensemble on the held-out test set.
print(classification_report(y_test, bagging_clf.predict(X_test)))

BaggingClassifier(base_estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                                 ('svm_clf',
                                                  SVC(C=0.1, degree=1,
                                                      gamma=0.1,
                                                      random_state=2))]),
                  random_state=2)
              precision    recall  f1-score   support

           0       0.98      0.88      0.93       114
           1       0.50      0.88      0.64        16

    accuracy                           0.88       130
   macro avg       0.74      0.88      0.78       130
weighted avg       0.92      0.88      0.89       130



# Decision Tree

In [15]:
# Reload the dataset from disk for the decision-tree experiments.
# NOTE(review): this rebinds `df` to the raw CSV contents, so the fixes made in the
# Cleaning cell (negative hours, gender == 2) are not applied from here on — confirm
# that this is intended.
df = pd.read_csv('data.csv',index_col=0)

## Without Encoding
First we attempt to fit the model without any changes to act as a baseline.

In [16]:
# Baseline: split the reloaded frame as-is (no encoding, no feature removal).
X_train, X_validate, X_test, y_train, y_validate, y_test = split(df)
# Keep the decision tree with the best recall on the validation set.
best_DT = DT_validate(X_train, y_train, X_validate, y_validate)
print(best_DT)
# Report the selected model's metrics on the held-out test set.
print(classification_report(y_test, best_DT.predict(X_test)))

DecisionTreeClassifier(class_weight='balanced', criterion='entropy',
                       max_features=6, random_state=2)
              precision    recall  f1-score   support

           0       0.98      0.94      0.96       114
           1       0.67      0.88      0.76        16

    accuracy                           0.93       130
   macro avg       0.82      0.91      0.86       130
weighted avg       0.94      0.93      0.93       130



## Encoding
We then encode some of the categorical data in the dataset to attempt to improve the metrics.

In [17]:
# One-hot encode the listed columns, then repeat the split / search / evaluate steps.
cols = ['location', 'country', 'symptom1', 'symptom2', 'symptom3', 'symptom4', 'symptom5', 'symptom6']
df_encoded = encode(df, cols)
X_train, X_validate, X_test, y_train, y_validate, y_test = split(df_encoded)
best_DT = DT_validate(X_train, y_train, X_validate, y_validate)
print(best_DT)
# Metrics of the selected model on the held-out test set.
print(classification_report(y_test, best_DT.predict(X_test)))

DecisionTreeClassifier(class_weight='balanced', criterion='entropy',
                       max_features=37, random_state=2)
              precision    recall  f1-score   support

           0       0.96      0.95      0.95       114
           1       0.65      0.69      0.67        16

    accuracy                           0.92       130
   macro avg       0.80      0.82      0.81       130
weighted avg       0.92      0.92      0.92       130



Performing one-hot encoding on the data decreases the information gain of each column, thus worsening the performance metrics of the model. We therefore keep the encoding that was given in the dataset.

## Feature Selection
Here we experimentally select features. After some experiments this is the best result. Some features were removed based on their low correlation with the data while others were removed after experimentally testing that their removal improved recall.

In [18]:
# Three symptom columns are removed; the remaining columns keep the dataset's own encoding.
dropped_cols = ['symptom6', 'symptom5', 'symptom4']
df_dropped = df.drop(dropped_cols, axis=1)
X_train, X_validate, X_test, y_train, y_validate, y_test = split(df_dropped)
best_DT = DT_validate(X_train, y_train, X_validate, y_validate)
print(best_DT)
# Metrics of the selected model on the held-out test set.
print(classification_report(y_test, best_DT.predict(X_test)))

DecisionTreeClassifier(class_weight='balanced', criterion='entropy',
                       max_features=3, random_state=2)
              precision    recall  f1-score   support

           0       0.97      0.93      0.95       114
           1       0.62      0.81      0.70        16

    accuracy                           0.92       130
   macro avg       0.80      0.87      0.83       130
weighted avg       0.93      0.92      0.92       130



## Bagging with Validation
In this section, we utilise a bagging classifier to attempt to improve the recall.

In [19]:
# Grid-search bagged decision trees, scoring every ensemble by recall on the validation set.
# NOTE(review): X_train / y_train / X_validate / y_validate are whatever the previously
# executed cell left in the kernel — confirm which split this search is meant to use.
dic = {'clf':[], 'recall':[]}
criterion = ['entropy','gini']
splitter = ['best','random']
class_weight = ['balanced', None]
# NOTE(review): the range stops at n_features - 1, so using all features is never tried.
max_features = list(range(1, X_train.shape[1]))
parameters = list(itertools.product(criterion, splitter, class_weight, max_features))

# Loop variables get their own names so the `splitter` / `max_features` grids above
# are not rebound to a single value once the loop has run.
for crit, split_strategy, weight, n_feats in parameters:
    DT = DecisionTreeClassifier(criterion=crit, splitter=split_strategy, class_weight=weight,
                                random_state=2, max_features=n_feats)
    # The base estimator is passed positionally: its keyword was renamed from
    # `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
    # removed in 1.4, whereas the first positional slot works on both.
    bagging_clf = BaggingClassifier(DT, random_state=2)
    bagging_clf.fit(X_train, y_train)
    y_pred = bagging_clf.predict(X_validate)
    recall = recall_score(y_validate, y_pred)
    dic['clf'].append(bagging_clf)
    dic['recall'].append(recall)

# One row per candidate: column 0 holds the fitted ensemble, column 1 its validation recall.
df_scores = pd.DataFrame(dic)


In [20]:
# Take the ensemble (column 0) from the row with the highest validation recall.
bagging_clf = df_scores.iloc[df_scores['recall'].idxmax(), 0]
print(bagging_clf)
# Metrics of the selected ensemble on the held-out test set.
print(classification_report(y_test, bagging_clf.predict(X_test)))

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight='balanced',
                                                        criterion='entropy',
                                                        max_features=7,
                                                        random_state=2),
                  random_state=2)
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       114
           1       0.93      0.81      0.87        16

    accuracy                           0.97       130
   macro avg       0.95      0.90      0.92       130
weighted avg       0.97      0.97      0.97       130



## Oversampling
Since the '1' class is under-represented, we use a random oversampler to balance the training data by duplicating existing samples from the '1' class.

In [22]:
# Five symptom columns are removed; no one-hot encoding is applied in this experiment.
dropped_cols = ['symptom6', 'symptom5', 'symptom4', 'symptom3', 'symptom2']
df_dropped = df.drop(dropped_cols, axis=1)
ros = RandomOverSampler(random_state=42)
X_train, X_validate, X_test, y_train, y_validate, y_test = split(df_dropped)
# Only the training split is resampled; the validation and test splits are left as split.
X_train, y_train= ros.fit_resample(X=X_train, y=y_train)
best_DT = DT_validate(X_train, y_train, X_validate, y_validate)
print(best_DT)
# Metrics of the selected model on the held-out test set.
print(classification_report(y_test, best_DT.predict(X_test)))


DecisionTreeClassifier(class_weight='balanced', criterion='entropy',
                       max_features=2, random_state=2)
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       114
           1       0.81      0.81      0.81        16

    accuracy                           0.95       130
   macro avg       0.89      0.89      0.89       130
weighted avg       0.95      0.95      0.95       130



## Using a Bagging Classifier on Oversampled Data

In [24]:
# Bagged decision trees trained on a randomly oversampled training split.
# Five symptom columns are removed and nothing is one-hot encoded (the former
# `encode(..., [])` call encoded no columns, so a plain drop is equivalent).
dropped_cols = ['symptom6', 'symptom5', 'symptom4', 'symptom3', 'symptom2']
df_dropped = df.drop(dropped_cols, axis=1)

ros = RandomOverSampler(random_state=42)
X_train, X_validate, X_test, y_train, y_validate, y_test = split(df_dropped)
# Only the training split is resampled; the validation and test splits are left as split.
X_train, y_train = ros.fit_resample(X=X_train, y=y_train)

dic = {'clf':[], 'recall':[]}
criterion = ['entropy','gini']
splitter = ['best','random']
class_weight = ['balanced', None]
# NOTE(review): the range stops at n_features - 1, so using all features is never tried.
max_features = list(range(1, X_train.shape[1]))

parameters = list(itertools.product(criterion, splitter, class_weight, max_features))

# Loop variables get their own names so the `splitter` / `max_features` grids above
# are not rebound to a single value once the loop has run.
for crit, split_strategy, weight, n_feats in parameters:
    DT = DecisionTreeClassifier(criterion=crit, splitter=split_strategy, class_weight=weight,
                                random_state=2, max_features=n_feats)
    # The base estimator is passed positionally: its keyword was renamed from
    # `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
    # removed in 1.4, whereas the first positional slot works on both.
    bagging_clf = BaggingClassifier(DT, random_state=2)
    bagging_clf.fit(X_train, y_train)
    y_pred = bagging_clf.predict(X_validate)
    recall = recall_score(y_validate, y_pred)
    dic['clf'].append(bagging_clf)
    dic['recall'].append(recall)

df_scores = pd.DataFrame(dic)
# Take the ensemble (column 0) from the row with the highest validation recall.
bagging_clf = df_scores.iloc[df_scores['recall'].idxmax(), 0]
print(bagging_clf)
# Metrics of the selected ensemble on the held-out test set.
print(classification_report(y_test, bagging_clf.predict(X_test)))

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight='balanced',
                                                        criterion='entropy',
                                                        max_features=2,
                                                        random_state=2),
                  random_state=2)
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       114
           1       0.93      0.88      0.90        16

    accuracy                           0.98       130
   macro avg       0.96      0.93      0.95       130
weighted avg       0.98      0.98      0.98       130

