In [206]:
import pandas as pd
import numpy as np
import scipy 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, roc_auc_score, f1_score
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from random import randint

%matplotlib inline

# Loading Data

In [220]:
df = pd.read_csv('data.csv',index_col=0)
df

Unnamed: 0,location,country,gender,age,vis_wuhan,from_wuhan,symptom1,symptom2,symptom3,symptom4,symptom5,symptom6,diff_sym_hos,result
0,104,8,1,66.0,1,0,14,31,19,12,3,1,8,1
1,101,8,0,56.0,0,1,14,31,19,12,3,1,0,0
2,137,8,1,46.0,0,1,14,31,19,12,3,1,13,0
3,116,8,0,60.0,1,0,14,31,19,12,3,1,0,0
4,116,8,1,58.0,0,0,14,31,19,12,3,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
858,48,3,2,24.0,0,0,14,31,19,12,3,1,0,1
859,0,0,2,35.0,0,0,14,31,19,12,3,1,0,0
860,3,1,1,49.4,0,0,14,31,19,12,3,1,0,0
861,24,9,1,49.4,0,0,14,31,19,12,3,1,0,0


In [221]:
test_size = 0.1
validaition_size = 0.15

# unknown gender and negative hours

In [223]:
df['gender'].value_counts()

1    561
0    302
Name: gender, dtype: int64

In [222]:
df['gender'] = df['gender'].mask(df['gender'] == 2, 1)
for i in range(len(df)):
    df.iloc[i, 12] = df.iloc[i,12]*-1 if df.iloc[i, 12] < 0 else df.iloc[i, 12]

In [224]:
x = df[['location', 'country', 'gender', 'age', 'vis_wuhan', 'from_wuhan',
       'symptom1', 'symptom2', 'symptom3', 'symptom4', 'symptom5', 'symptom6',
       'diff_sym_hos']]
y = df['result']
X_train1, X_test, y_train1, y_test = train_test_split(x, y, test_size=test_size,shuffle=True, random_state=42)
print(np.shape(X_train1))
print(np.shape(X_test))

(776, 13)
(87, 13)


In [225]:
X_train2, X_validate, y_train2, y_validate = train_test_split(X_train1, y_train1, test_size=validaition_size,shuffle=True, random_state=42)
print(np.shape(X_train2))
print(np.shape(X_validate))

(659, 13)
(117, 13)


# train wihout validation

dic = {'k':[], 'weights':[],'precision':[], 'recall':[], 'F1':[], 'roc_auc':[]}
weights = ['distance', 'uniform']
for k in range(1, int(len(X_train1)**0.5), 2):
        for j in range(2):
            knn = KNeighborsClassifier(n_neighbors=k, weights=weights[j])
            knn.fit(X_train1, y_train1)
            y_predict = knn.predict(X_test)
            precision, recall = precision_score(y_predict, y_test), recall_score(y_predict, y_test)
            roc_auc = roc_auc_score(y_test, y_predict)
            F1 = 2 * (precision * recall) / (precision + recall)
            dic['weights'].append(weights[j])
            dic['k'].append(k)
            dic['precision'].append(precision)
            dic['recall'].append(recall)
            dic['F1'].append(F1)
            dic['roc_auc'].append(roc_auc)
#print(y_predict)
#print(np.asarray(y_test))
f = pd.DataFrame(dic)
f[f['roc_auc'] == f['roc_auc'].max()]

scaler = MinMaxScaler()
names = df.columns
result = scaler.fit_transform(df)
print(type(result))
df2 = {}
for i in range(14):
    df2[names[i]] = result[:,i]
print(type(df2))
df2 = pd.DataFrame(df2)
print(type(df2))

df2.head()

# with validation

In [227]:
dic = {'k':[], 'weights':[],'precision':[], 'recall':[], 'F1':[], 'roc_auc':[]}
weights = ['distance', 'uniform']
for k in range(1, int(len(X_train2)**0.5), 2):
        for j in range(2):
            knn = KNeighborsClassifier(n_neighbors=k, weights=weights[j])
            knn.fit(X_train2, y_train2)
            y_predict = knn.predict(X_validate)
            precision= precision_score(y_validate, y_predict)
            recall = recall_score(y_validate,y_predict)
            roc_auc = roc_auc_score(y_validate, y_predict)
            F1 = f1_score(y_validate, y_predict)
            dic['weights'].append(weights[j])
            dic['k'].append(k)
            dic['precision'].append(precision)
            dic['recall'].append(recall)
            dic['F1'].append(F1)
            dic['roc_auc'].append(roc_auc)
f = pd.DataFrame(dic)
f[f['roc_auc'] == f['roc_auc'].max()]

Unnamed: 0,k,weights,precision,recall,F1,roc_auc
9,9,uniform,0.777778,0.7,0.736842,0.840654


In [228]:
# dic = {'k':[], 'weights':[],'precision':[], 'recall':[], 'F1':[], 'roc_auc':[]}
# weights = ['distance', 'uniform']
knn = KNeighborsClassifier(n_neighbors=9, weights='distance')
knn.fit(X_train1, y_train1)
y_predict = knn.predict(X_test)
precision= precision_score(y_test, y_predict)
recall = recall_score(y_test,y_predict)
roc_auc = roc_auc_score(y_test, y_predict)
F1 = f1_score(y_test, y_predict)
precision, recall, F1, roc_auc

(0.8888888888888888, 0.8, 0.8421052631578948, 0.8935064935064935)

# Encoding and normalization

In [229]:
df_new = df.copy()
for i in range(1, 7):
    encoding = pd.get_dummies(df[f'symptom{i}'], prefix=f'symptom{i}')
    df_new = df_new.join(encoding)
    df_new.drop(f'symptom{i}', axis=1, inplace=True)
scaler = MinMaxScaler()
df_new['age'] = scaler.fit_transform(df_new['age'].to_numpy()[:,np.newaxis])[:,0]
# df_new['diff_sym_hos'] = scaler.fit_transform(df_new['diff_sym_hos'].to_numpy()[:,np.newaxis])[:,0]
df_new['location'] = scaler.fit_transform(df_new['location'].to_numpy()[:,np.newaxis])[:,0]
df_new

Unnamed: 0,location,country,gender,age,vis_wuhan,from_wuhan,diff_sym_hos,result,symptom1_0,symptom1_1,...,symptom4_9,symptom4_10,symptom4_11,symptom4_12,symptom5_0,symptom5_1,symptom5_2,symptom5_3,symptom6_0,symptom6_1
0,0.753623,8,1,0.680851,1,0,8,1,0,0,...,0,0,0,1,0,0,0,1,0,1
1,0.731884,8,0,0.574468,0,1,0,0,0,0,...,0,0,0,1,0,0,0,1,0,1
2,0.992754,8,1,0.468085,0,1,13,0,0,0,...,0,0,0,1,0,0,0,1,0,1
3,0.840580,8,0,0.617021,1,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,1
4,0.840580,8,1,0.595745,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
858,0.347826,3,1,0.234043,0,0,0,1,0,0,...,0,0,0,1,0,0,0,1,0,1
859,0.000000,0,1,0.351064,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,1
860,0.021739,1,1,0.504255,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,1
861,0.173913,9,1,0.504255,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,1


In [230]:
x = df_new.drop('result', axis=1)
y = df_new['result']
X_train1, X_test, y_train1, y_test = train_test_split(x, y, test_size=test_size,shuffle=True, random_state=42)
print(np.shape(X_train1))
print(np.shape(X_test))

(776, 103)
(87, 103)


In [231]:
X_train2, X_validate, y_train2, y_validate = train_test_split(X_train1, y_train1, test_size=validaition_size,shuffle=True, random_state=42)
print(np.shape(X_train2))
print(np.shape(X_validate))

(659, 103)
(117, 103)


# no validation

dic = {'k':[], 'weights':[],'precision':[], 'recall':[], 'F1':[], 'roc_auc':[]}
weights = ['distance', 'uniform']
for k in range(1, int(len(X_train1)**0.5), 2):
        for j in range(2):
            knn = KNeighborsClassifier(n_neighbors=k, weights=weights[j])
            knn.fit(X_train1, y_train1)
            y_predict = knn.predict(X_test)
            precision, recall = precision_score(y_predict, y_test), recall_score(y_predict, y_test)
            roc_auc = roc_auc_score(y_test, y_predict)
            F1 = 2 * (precision * recall) / (precision + recall)
            dic['weights'].append(weights[j])
            dic['k'].append(k)
            dic['precision'].append(precision)
            dic['recall'].append(recall)
            dic['F1'].append(F1)
            dic['roc_auc'].append(roc_auc)
#print(y_predict)
#print(np.asarray(y_test))
f = pd.DataFrame(dic)
#f
f[f['roc_auc'] == f['roc_auc'].max()]

# validation

In [234]:
dic = {'k':[], 'weights':[],'precision':[], 'recall':[], 'F1':[], 'roc_auc':[]}
weights = ['distance', 'uniform']
for k in range(1, int(len(X_train2)**0.5), 2):
        for j in range(2):
            knn = KNeighborsClassifier(n_neighbors=k, weights=weights[j])
            knn.fit(X_train2, y_train2)
            y_predict = knn.predict(X_validate)
            precision= precision_score(y_validate, y_predict)
            recall = recall_score(y_validate,y_predict)
            roc_auc = roc_auc_score(y_validate, y_predict)
            F1 = f1_score(y_validate, y_predict)
            dic['weights'].append(weights[j])
            dic['k'].append(k)
            dic['precision'].append(precision)
            dic['recall'].append(recall)
            dic['F1'].append(F1)
            dic['roc_auc'].append(roc_auc)
f = pd.DataFrame(dic)
f[f['roc_auc'] == f['roc_auc'].max()]

Unnamed: 0,k,weights,precision,recall,F1,roc_auc
2,3,distance,0.470588,0.8,0.592593,0.857944
3,3,uniform,0.470588,0.8,0.592593,0.857944


In [235]:
knn = KNeighborsClassifier(n_neighbors=3, weights='distance')
knn.fit(X_train1, y_train1)
y_predict = knn.predict(X_test)
precision= precision_score(y_test, y_predict)
recall = recall_score(y_test,y_predict)
roc_auc = roc_auc_score(y_test, y_predict)
F1 = f1_score(y_test, y_predict)
precision, recall, F1, roc_auc

(0.625, 0.5, 0.5555555555555556, 0.7305194805194805)