In [29]:
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import numpy as np
import os

In [30]:
df = pd.read_csv(os.path.join('..', "arun/Resources/", 'diagnosis-of-covid-19-and-its-clinical-spectrum.csv'))

In [31]:
data = df[['sars_cov_2_exam_result','patient_age_quantile', 'leukocytes', 'platelets', 'monocytes', 'hematocrit', 'red_blood_cells', 'lymphocytes', 'eosinophils', 'hemoglobin']]

In [32]:
target = data['sars_cov_2_exam_result']
target_names = ['negative', 'positive']

In [33]:
data = data.drop('sars_cov_2_exam_result', axis=1)
feature_names=data.columns

In [50]:
X=data.values

In [54]:
y=target.values

In [34]:
data.head()

Unnamed: 0,patient_age_quantile,leukocytes,platelets,monocytes,hematocrit,red_blood_cells,lymphocytes,eosinophils,hemoglobin
0,13,,,,,,,,
1,17,-0.09461,-0.517413,0.357547,0.236515,0.102004,0.318366,1.482158,-0.02234
2,8,,,,,,,,
3,5,,,,,,,,
4,15,,,,,,,,


In [55]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [56]:
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[:, 1:len(feature_names)])
X[:, 1:len(feature_names)] = imputer.transform(X[:, 1:len(feature_names)])

In [57]:
from sklearn.svm import SVC 
model = SVC(kernel='linear')
model.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [58]:
print('Test Acc: %.3f' % model.score(X_test, y_test)) 

Test Acc: 0.907


In [59]:
from sklearn.metrics import classification_report
predictions = model.predict(X_test)
print(classification_report(y_test, predictions,
                            target_names=target_names))

             precision    recall  f1-score   support

   negative       0.91      1.00      0.95      1280
   positive       0.00      0.00      0.00       131

avg / total       0.82      0.91      0.86      1411



  'precision', 'predicted', average, warn_for)


In [60]:
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10],
              'gamma': [0.0001, 0.001, 0.01]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [61]:
grid.fit(X_train, y_train)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] C=1, gamma=0.0001 ...............................................
[CV] ...... C=1, gamma=0.0001, score=0.8987252124645893, total=   0.1s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.3s remaining:    0.0s


[CV] ...... C=1, gamma=0.0001, score=0.8993621545003544, total=   0.1s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ...... C=1, gamma=0.0001, score=0.8992907801418439, total=   0.1s
[CV] C=1, gamma=0.001 ................................................
[CV] ....... C=1, gamma=0.001, score=0.8987252124645893, total=   0.1s
[CV] C=1, gamma=0.001 ................................................
[CV] ....... C=1, gamma=0.001, score=0.8993621545003544, total=   0.1s
[CV] C=1, gamma=0.001 ................................................
[CV] ....... C=1, gamma=0.001, score=0.8992907801418439, total=   0.1s
[CV] C=1, gamma=0.01 .................................................
[CV] ........ C=1, gamma=0.01, score=0.8987252124645893, total=   0.1s
[CV] C=1, gamma=0.01 .................................................
[CV] ........ C=1, gamma=0.01, score=0.8993621545003544, total=   0.3s
[CV] C=1, gamma=0.01 .................................................
[CV] .

[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:  8.4min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [62]:
print(grid.best_params_)

{'C': 1, 'gamma': 0.0001}


In [63]:
print(grid.best_score_)

0.899125915426


In [64]:
predictions = grid.predict(X_test)

In [65]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions,
                            target_names))

             precision    recall  f1-score   support

   negative       0.91      1.00      0.95      1280
   positive       0.00      0.00      0.00       131

avg / total       0.82      0.91      0.86      1411



  'precision', 'predicted', average, warn_for)


In [71]:
from sklearn.externals import joblib

# Save the trained model to a file 
joblib.dump(model, 'svc_model_covid_blood_test.h5')


['svc_model_covid_blood_test.h5']

In [72]:
model_SCV = joblib.load('svc_model_covid_blood_test.h5')

In [74]:
print(model_SCV.predict([[45,-0.094610348,-0.51741302,0.357546657,-1.571682215,-0.850035012,0.318365753,1.482158184,-0.774212003]]))

['negative']
