In [72]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedStratifiedKFold, cross_validate
from sklearn.metrics import  recall_score,roc_auc_score, roc_curve
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from imblearn.over_sampling import RandomOverSampler
import warnings
import itertools
warnings.filterwarnings('ignore')
%matplotlib inline

In [36]:
# Load the dataset from 'data.csv' (path is relative to the working directory).
# The 'result' column of this frame is used as the prediction target by split().
df = pd.read_csv('data.csv')

Index(['location', 'country', 'gender', 'age', 'vis_wuhan', 'from_wuhan',
       'symptom1', 'symptom2', 'symptom3', 'symptom4', 'symptom5', 'symptom6',
       'diff_sym_hos', 'result'],
      dtype='object')

# Cleaning

In [37]:
# Negative hour differences are changed to positive (only the magnitude is kept).
# Vectorised by column name instead of looping row by row over positional
# column 12, so it no longer depends on the column order of the CSV.
df['diff_sym_hos'] = df['diff_sym_hos'].abs()

# Gender value 2 is replaced by 1 (per the original note, 1 is the most common
# gender -- NOTE(review): hard-coded, confirm against df['gender'].value_counts()).
df['gender'] = df['gender'].mask(df['gender'] == 2, 1)

# Helper Methods

In [38]:
def split(df):
    """Partition ``df`` into stratified training, validation and testing sets.

    The ``result`` column is the target; every other column is a feature.
    30% of the rows are held out first and that hold-out is then halved, which
    yields a 70% / 15% / 15% split. Both steps shuffle with ``random_state=42``
    and stratify on the target.

    Returns:
        The tuple ``(X_train, X_validate, X_test, y_train, y_validate, y_test)``.
    """
    features = df.drop('result', axis=1)
    target = df['result']

    # Step 1: 70% for training, 30% held out.
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        features, target,
        test_size=0.3, shuffle=True, random_state=42, stratify=target,
    )
    # Step 2: cut the hold-out in half -> validation and testing sets.
    X_validate, X_test, y_validate, y_test = train_test_split(
        X_holdout, y_holdout,
        test_size=0.5, shuffle=True, random_state=42, stratify=y_holdout,
    )
    return X_train, X_validate, X_test, y_train, y_validate, y_test

In [39]:
def encode(df, columns):
    """Return a copy of ``df`` with each of ``columns`` one-hot encoded.

    Every listed column is replaced by its ``pd.get_dummies`` indicator columns
    (named ``<column>_<value>``), which are appended on the right in the order
    the columns are listed. The input frame itself is left untouched.
    """
    result = df.copy()
    for name in columns:
        indicators = pd.get_dummies(df[name], prefix=name)
        # Attach the indicators, then discard the column they were derived from.
        result = result.join(indicators).drop(name, axis=1)
    return result


In [None]:
def svm_validate(X_train, y_train, X_validate, y_validate):
    """Grid-search an RBF-kernel SVM and return the candidate with the best validation recall.

    Each candidate is a ``Pipeline`` of ``StandardScaler`` followed by ``SVC``,
    fitted on the training data and scored with ``recall_score`` on the
    validation data.

    Args:
        X_train, y_train: data every candidate is fitted on.
        X_validate, y_validate: data used to score the candidates.

    Returns:
        The fitted pipeline with the highest validation recall. On ties the
        first candidate in grid order (C outermost, gamma innermost) wins.
    """
    c_values = [0.1, 1, 10, 100, 1000]
    gamma_values = [1, 0.1, 0.01, 0.001, 0.0001]
    # No ``degree`` axis: SVC only uses ``degree`` with the 'poly' kernel, so
    # sweeping it for an RBF kernel refits identical models many times over.

    best_clf, best_recall = None, -1.0
    for c, gamma in itertools.product(c_values, gamma_values):
        candidate = Pipeline([
            ("scaler", StandardScaler()),
            ("svm_clf", SVC(kernel="rbf", C=c, gamma=gamma)),
        ])
        candidate.fit(X_train, y_train)
        recall = recall_score(y_validate, candidate.predict(X_validate))
        # Strict '>' keeps the first best candidate, and only that one is
        # retained instead of holding every fitted pipeline in memory.
        if recall > best_recall:
            best_clf, best_recall = candidate, recall
    return best_clf

# Without Encoding

In [79]:
# Baseline: split the cleaned frame as-is, without one-hot encoding any column.
(X_train, X_validate, X_test,
 y_train, y_validate, y_test) = split(df)

# Choose the SVM by validation recall, then report on the held-out test set.
best_svm = svm_validate(
    X_train=X_train, y_train=y_train,
    X_validate=X_validate, y_validate=y_validate,
)
print(best_svm)
print(classification_report(y_true=y_test, y_pred=best_svm.predict(X_test)))

Pipeline(steps=[('scaler', StandardScaler()),
                ('svm_clf', SVC(C=10, degree=1, gamma=0.1))])
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       114
           1       0.81      0.81      0.81        16

    accuracy                           0.95       130
   macro avg       0.89      0.89      0.89       130
weighted avg       0.95      0.95      0.95       130



# Encoding

In [80]:
# One-hot encode location, country and all six symptom columns before splitting.
cols = ['location', 'country'] + [f'symptom{n}' for n in range(1, 7)]
df_encoded = encode(df, cols)
(X_train, X_validate, X_test,
 y_train, y_validate, y_test) = split(df_encoded)

# Choose the SVM by validation recall, then report on the held-out test set.
best_svm = svm_validate(
    X_train=X_train, y_train=y_train,
    X_validate=X_validate, y_validate=y_validate,
)
print(best_svm)
print(classification_report(y_true=y_test, y_pred=best_svm.predict(X_test)))

Pipeline(steps=[('scaler', StandardScaler()),
                ('svm_clf', SVC(C=100, degree=1, gamma=0.001))])
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       114
           1       1.00      0.75      0.86        16

    accuracy                           0.97       130
   macro avg       0.98      0.88      0.92       130
weighted avg       0.97      0.97      0.97       130



# Feature selection

In [81]:
# Drop four of the symptom columns and one-hot encode the remaining
# country / location / symptom columns before splitting.
dropped_cols = ['symptom6', 'symptom5', 'symptom4', 'symptom2']
encoded_cols = ['country', 'location', 'symptom1', 'symptom3']
df_encoded = encode(df.drop(columns=dropped_cols), encoded_cols)
(X_train, X_validate, X_test,
 y_train, y_validate, y_test) = split(df_encoded)

# Choose the SVM by validation recall, then report on the held-out test set.
best_svm = svm_validate(
    X_train=X_train, y_train=y_train,
    X_validate=X_validate, y_validate=y_validate,
)
print(best_svm)
print(classification_report(y_true=y_test, y_pred=best_svm.predict(X_test)))

Pipeline(steps=[('scaler', StandardScaler()),
                ('svm_clf', SVC(C=100, degree=1, gamma=0.001))])
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       114
           1       1.00      0.88      0.93        16

    accuracy                           0.98       130
   macro avg       0.99      0.94      0.96       130
weighted avg       0.98      0.98      0.98       130



# Bagging with validation

In [82]:
# Grid-search a bagged RBF-kernel SVM, scoring every candidate by recall on the
# validation set. X_train / y_train / X_validate / y_validate are not defined in
# this cell: it uses whatever the most recently executed split cell left behind.
dic = {'clf': [], 'recall': []}
c_values = [0.1, 1, 10, 100, 1000]
gamma_values = [1, 0.1, 0.01, 0.001, 0.0001]
# No ``degree`` axis: SVC only uses ``degree`` with the 'poly' kernel, so
# sweeping it for an RBF kernel refits identical ensembles many times over.

parameters = list(itertools.product(c_values, gamma_values))
for c, gamma in parameters:
    rbf_kernel_svm_clf = Pipeline([
        ("scaler", StandardScaler()),
        ("svm_clf", SVC(kernel="rbf", C=c, gamma=gamma))
    ])
    # The wrapped estimator is passed positionally: its keyword is
    # ``base_estimator`` in older scikit-learn and ``estimator`` in newer
    # releases, so the positional form runs on both.
    bagging_clf = BaggingClassifier(rbf_kernel_svm_clf, n_estimators=10, random_state=42)
    bagging_clf.fit(X_train, y_train)
    y_pred = bagging_clf.predict(X_validate)
    recall = recall_score(y_validate, y_pred)
    dic['clf'].append(bagging_clf)
    dic['recall'].append(recall)

# One row per (C, gamma) candidate: the fitted ensemble and its validation recall.
df_scores = pd.DataFrame(dic)

In [83]:
# Take the bagged model with the highest validation recall (the first one on
# ties) and report its performance on the held-out test set.
bagging_clf = df_scores['clf'].iloc[df_scores['recall'].idxmax()]
print(bagging_clf)
print(classification_report(y_true=y_test, y_pred=bagging_clf.predict(X_test)))

BaggingClassifier(base_estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                                 ('svm_clf',
                                                  SVC(C=10, degree=1,
                                                      gamma=0.01))]),
                  random_state=42)
              precision    recall  f1-score   support

           0       0.93      1.00      0.97       114
           1       1.00      0.50      0.67        16

    accuracy                           0.94       130
   macro avg       0.97      0.75      0.82       130
weighted avg       0.94      0.94      0.93       130



# Oversampling

In [84]:
# Drop symptom2..symptom6; nothing is one-hot encoded in this experiment, so
# encode() only returns a copy of the reduced frame.
dropped_cols = ['symptom6', 'symptom5', 'symptom4', 'symptom3', 'symptom2']
encoded_cols = []
df_encoded = encode(df.drop(columns=dropped_cols), encoded_cols)
(X_train, X_validate, X_test,
 y_train, y_validate, y_test) = split(df_encoded)

# Resample the training split only; validation and test sets stay untouched.
ros = RandomOverSampler(random_state=42)
X_train, y_train = ros.fit_resample(X=X_train, y=y_train)

# Choose the SVM by validation recall, then report on the held-out test set.
best_svm = svm_validate(
    X_train=X_train, y_train=y_train,
    X_validate=X_validate, y_validate=y_validate,
)
print(best_svm)
print(classification_report(y_true=y_test, y_pred=best_svm.predict(X_test)))


Pipeline(steps=[('scaler', StandardScaler()),
                ('svm_clf', SVC(C=0.1, degree=1, gamma=0.1))])
              precision    recall  f1-score   support

           0       0.99      0.86      0.92       114
           1       0.48      0.94      0.64        16

    accuracy                           0.87       130
   macro avg       0.74      0.90      0.78       130
weighted avg       0.93      0.87      0.89       130

