In [368]:
import pandas as pd
import numpy as np
import scipy 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, roc_auc_score, f1_score
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from random import randint

%matplotlib inline

# Loading Data

In [369]:
# Read the dataset; the first CSV column is used as the row index.
df = pd.read_csv(filepath_or_buffer='data.csv', index_col=0)
df

Unnamed: 0,location,country,gender,age,vis_wuhan,from_wuhan,symptom1,symptom2,symptom3,symptom4,symptom5,symptom6,diff_sym_hos,result
0,104,8,1,66.0,1,0,14,31,19,12,3,1,8,1
1,101,8,0,56.0,0,1,14,31,19,12,3,1,0,0
2,137,8,1,46.0,0,1,14,31,19,12,3,1,13,0
3,116,8,0,60.0,1,0,14,31,19,12,3,1,0,0
4,116,8,1,58.0,0,0,14,31,19,12,3,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
858,48,3,2,24.0,0,0,14,31,19,12,3,1,0,1
859,0,0,2,35.0,0,0,14,31,19,12,3,1,0,0
860,3,1,1,49.4,0,0,14,31,19,12,3,1,0,0
861,24,9,1,49.4,0,0,14,31,19,12,3,1,0,0


In [370]:
# Hold-out fractions handed to train_test_split as its test_size argument:
# test_size for the train/test split, validaition_size for the
# train/validation split.
test_size, validaition_size = 0.1, 0.15

# unknown gender and negative hours

In [371]:
# Gender code 2 (the "unknown gender" case this section handles) is folded into code 1.
df['gender'] = df['gender'].replace(2, 1)
# Negative diff_sym_hos values are flipped to their magnitude. Addressing the
# column by name and using a vectorized abs() avoids the positional index and
# a Python-level loop over every row.
df['diff_sym_hos'] = df['diff_sym_hos'].abs()

# Training, Validation, Testing Functions

In [372]:
def train_validate(X_train, X_validate, y_train, y_validate):
    """Score KNN classifiers over a grid of k and weighting schemes.

    For every odd k in ``range(1, int(len(X_train) ** 0.5), 2)`` and for both
    the 'distance' and 'uniform' weightings, a KNeighborsClassifier is fitted
    on the training data and evaluated on the validation data.

    Returns a DataFrame with one row per (k, weights) combination and the
    columns 'k', 'weights', 'precision', 'recall', 'F1' and 'roc_auc'.
    Note that 'roc_auc' is computed from the predicted class labels
    (``predict``), not from probability scores.
    """
    results = {'k': [], 'weights': [], 'precision': [], 'recall': [], 'F1': [], 'roc_auc': []}
    k_limit = int(len(X_train) ** 0.5)
    for n_neighbors in range(1, k_limit, 2):
        for weighting in ('distance', 'uniform'):
            model = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weighting)
            predicted = model.fit(X_train, y_train).predict(X_validate)
            row = {
                'k': n_neighbors,
                'weights': weighting,
                'precision': precision_score(y_validate, predicted),
                'recall': recall_score(y_validate, predicted),
                'roc_auc': roc_auc_score(y_validate, predicted),
                'F1': f1_score(y_validate, predicted),
            }
            # Append each value to the matching result column.
            for column, value in row.items():
                results[column].append(value)
    return pd.DataFrame(results)

In [373]:
def train_test(X_train, X_test, y_train, y_test, optimal_k, weights):
    """Fit a KNN classifier on the given training data and score it on ``X_test``.

    Parameters
    ----------
    X_train, y_train
        Features and labels the classifier is fitted on.
    X_test, y_test
        Features to predict and the true labels to score against.
    optimal_k
        Value passed as ``n_neighbors`` to KNeighborsClassifier.
    weights
        Value passed as ``weights`` to KNeighborsClassifier.

    Returns
    -------
    tuple
        ``(precision, recall, F1, roc_auc)``. The ROC AUC is computed from the
        predicted class labels (``predict``), not from probability scores.
    """
    knn = KNeighborsClassifier(n_neighbors=optimal_k, weights=weights)
    # Fit on the data passed in as arguments rather than on notebook-level
    # globals, so the result depends only on the caller's inputs.
    knn.fit(X_train, y_train)
    y_predict = knn.predict(X_test)
    precision = precision_score(y_test, y_predict)
    recall = recall_score(y_test, y_predict)
    roc_auc = roc_auc_score(y_test, y_predict)
    F1 = f1_score(y_test, y_predict)
    return precision, recall, F1, roc_auc

# Training without one-hot encoding

In [374]:
# Feature columns are listed explicitly; 'result' is the label.
x = df.loc[:, ['location', 'country', 'gender', 'age', 'vis_wuhan', 'from_wuhan',
               'symptom1', 'symptom2', 'symptom3', 'symptom4', 'symptom5', 'symptom6',
               'diff_sym_hos']]
y = df.loc[:, 'result']

# First hold out the test set, then split what remains into train/validation.
X_train1, X_test, y_train1, y_test = train_test_split(
    x, y, test_size=test_size, shuffle=True, random_state=42,
)
X_train2, X_validate, y_train2, y_validate = train_test_split(
    X_train1, y_train1, test_size=validaition_size, shuffle=True, random_state=42,
)

In [375]:
# Evaluate the k / weighting grid on the validation split, then display the
# row(s) that reach the highest validation roc_auc.
f = train_validate(X_train2, X_validate, y_train2, y_validate)
f.loc[f['roc_auc'].eq(f['roc_auc'].max())]

Unnamed: 0,k,weights,precision,recall,F1,roc_auc
11,11,uniform,0.875,0.7,0.777778,0.845327


In [376]:
# Score k=11 with both weighting schemes: first on the data the model was
# fitted on, then on the held-out test set.
print("predicting on training")
for weighting in ('distance', 'uniform'):
    print(train_test(X_train1, X_train1, y_train1, y_train1, 11, weighting))
print("predicting on testing")
for weighting in ('distance', 'uniform'):
    print(train_test(X_train1, X_test, y_train1, y_test, 11, weighting))

predicting on training
(1.0, 1.0, 1.0, 1.0)
(0.8732394366197183, 0.6326530612244898, 0.7337278106508875, 0.809689362470652)
predicting on testing
(0.9, 0.9, 0.9, 0.9435064935064935)
(0.875, 0.7, 0.7777777777777777, 0.8435064935064934)


# encoding
One-hot encoding for categorical data

In [377]:
# Categorical columns to expand into one indicator column per distinct value.
columns = ['symptom1', 'symptom2', 'symptom3', 'symptom4', 'symptom5', 'symptom6', 'location', 'country']
# One get_dummies call encodes every listed column: the originals are dropped
# and the indicator columns (named "<column>_<value>") are appended after the
# remaining columns, in the order given above. df itself is left unmodified.
df_new = pd.get_dummies(df, columns=columns)
df_new

Unnamed: 0,gender,age,vis_wuhan,from_wuhan,diff_sym_hos,result,symptom1_0,symptom1_1,symptom1_2,symptom1_3,...,country_24,country_25,country_26,country_27,country_28,country_29,country_30,country_31,country_32,country_33
0,1,66.0,1,0,8,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,56.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,46.0,0,1,13,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,60.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,58.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
858,1,24.0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
859,1,35.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
860,1,49.4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
861,1,49.4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [378]:
# Everything except the 'result' label is a feature.
x = df_new.drop(columns='result')
y = df_new.loc[:, 'result']

# First hold out the test set, then split what remains into train/validation.
X_train1, X_test, y_train1, y_test = train_test_split(
    x, y, test_size=test_size, shuffle=True, random_state=42,
)
X_train2, X_validate, y_train2, y_validate = train_test_split(
    X_train1, y_train1, test_size=validaition_size, shuffle=True, random_state=42,
)

In [379]:
# Evaluate the k / weighting grid on the validation split, then display the
# row(s) that reach the highest validation roc_auc.
f = train_validate(X_train2, X_validate, y_train2, y_validate)
f.loc[f['roc_auc'].eq(f['roc_auc'].max())]

Unnamed: 0,k,weights,precision,recall,F1,roc_auc
0,1,distance,0.615385,0.8,0.695652,0.876636
1,1,uniform,0.615385,0.8,0.695652,0.876636


In [380]:
# Score k=1 with both weighting schemes: first on the data the model was
# fitted on, then on the held-out test set.
print("predicting on training")
for weighting in ('distance', 'uniform'):
    print(train_test(X_train1, X_train1, y_train1, y_train1, 1, weighting))
print("predicting on testing")
for weighting in ('distance', 'uniform'):
    print(train_test(X_train1, X_test, y_train1, y_test, 1, weighting))

predicting on training
(1.0, 1.0, 1.0, 1.0)
(1.0, 1.0, 1.0, 1.0)
predicting on testing
(1.0, 0.9, 0.9473684210526316, 0.95)
(1.0, 0.9, 0.9473684210526316, 0.95)


# Normalization
Normalization for age.
Every time I try normalizing diff_sym_hos, it gives me a warning during training.

In [381]:
# Min-max scale the age column. The scaler works on 2-D input, so the column is
# reshaped to (n_rows, 1) and flattened back to 1-D before being stored.
# NOTE(review): the scaler is fitted on every row of df_new, i.e. before the
# train/test split performed in the following cell.
scaler = MinMaxScaler()
df_new['age'] = scaler.fit_transform(df_new['age'].to_numpy().reshape(-1, 1)).ravel()
# diff_sym_hos is not scaled here; the same transform for it was tried and is disabled.
df_new

Unnamed: 0,gender,age,vis_wuhan,from_wuhan,diff_sym_hos,result,symptom1_0,symptom1_1,symptom1_2,symptom1_3,...,country_24,country_25,country_26,country_27,country_28,country_29,country_30,country_31,country_32,country_33
0,1,0.680851,1,0,8,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0.574468,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0.468085,0,1,13,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0.617021,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0.595745,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
858,1,0.234043,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
859,1,0.351064,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
860,1,0.504255,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
861,1,0.504255,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [382]:
# Everything except the 'result' label is a feature.
x = df_new.drop(columns='result')
y = df_new.loc[:, 'result']

# First hold out the test set, then split what remains into train/validation.
X_train1, X_test, y_train1, y_test = train_test_split(
    x, y, test_size=test_size, shuffle=True, random_state=42,
)
X_train2, X_validate, y_train2, y_validate = train_test_split(
    X_train1, y_train1, test_size=validaition_size, shuffle=True, random_state=42,
)

In [383]:
# Evaluate the k / weighting grid on the validation split, then display the
# row(s) that reach the highest validation roc_auc.
f = train_validate(X_train2, X_validate, y_train2, y_validate)
f.loc[f['roc_auc'].eq(f['roc_auc'].max())]

Unnamed: 0,k,weights,precision,recall,F1,roc_auc
2,3,distance,0.4375,0.7,0.538462,0.807944
3,3,uniform,0.4375,0.7,0.538462,0.807944


In [384]:
# Score k=3 with both weighting schemes: first on the data the model was
# fitted on, then on the held-out test set.
print("predicting on training")
for weighting in ('distance', 'uniform'):
    print(train_test(X_train1, X_train1, y_train1, y_train1, 3, weighting))
print("predicting on testing")
for weighting in ('distance', 'uniform'):
    print(train_test(X_train1, X_test, y_train1, y_test, 3, weighting))

predicting on training
(1.0, 1.0, 1.0, 1.0)
(0.8809523809523809, 0.7551020408163265, 0.8131868131868131, 0.8701763891397266)
predicting on testing
(0.6363636363636364, 0.7, 0.6666666666666666, 0.824025974025974)
(0.6666666666666666, 0.6, 0.631578947368421, 0.7805194805194805)
