In [None]:
# UCSD Data Science Bootcamp, Final Project
# Alexis Perumal, 5/11/20
# Implement Regression Models for Einstein blood marker data.
#
!pip install sklearn --upgrade

In [None]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib

In [2]:
import pandas as pd
import numpy as np

# Read the CSV and Perform Basic Data Cleaning

In [3]:
# Path of the source CSV, relative to the notebook.
dataset_filename = 'Resources/diagnosis-of-covid-19-and-its-clinical-spectrum.csv'

# Load the CSV and discard every column that holds no values at all.
# Rows containing nulls are kept at this stage.
df = pd.read_csv(dataset_filename).dropna(axis='columns', how='all')

print(df.shape)
df.head()

(5644, 106)


Unnamed: 0,patient_id,patient_age_quantile,sars_cov_2_exam_result,patient_addmited_to_regular_ward_1_yes_0_no,patient_addmited_to_semi_intensive_unit_1_yes_0_no,patient_addmited_to_intensive_care_unit_1_yes_0_no,hematocrit,hemoglobin,platelets,mean_platelet_volume,...,hb_saturation_arterial_blood_gases,pco2_arterial_blood_gas_analysis,base_excess_arterial_blood_gas_analysis,ph_arterial_blood_gas_analysis,total_co2_arterial_blood_gas_analysis,hco3_arterial_blood_gas_analysis,po2_arterial_blood_gas_analysis,arteiral_fio2,phosphor,cto2_arterial_blood_gas_analysis
0,44477f75e8169d2,13,negative,f,f,f,,,,,...,,,,,,,,,,
1,126e9dd13932f68,17,negative,f,f,f,0.236515,-0.02234,-0.517413,0.010677,...,,,,,,,,,,
2,a46b4402a0e5696,8,negative,f,f,f,,,,,...,,,,,,,,,,
3,f7d619a94f97c45,5,negative,f,f,f,,,,,...,,,,,,,,,,
4,d9e41465789c2b5,15,negative,f,f,f,,,,,...,,,,,,,,,,


In [4]:
# Display the column labels of the loaded frame.
df.columns

Index(['patient_id', 'patient_age_quantile', 'sars_cov_2_exam_result',
       'patient_addmited_to_regular_ward_1_yes_0_no',
       'patient_addmited_to_semi_intensive_unit_1_yes_0_no',
       'patient_addmited_to_intensive_care_unit_1_yes_0_no', 'hematocrit',
       'hemoglobin', 'platelets', 'mean_platelet_volume',
       ...
       'hb_saturation_arterial_blood_gases',
       'pco2_arterial_blood_gas_analysis',
       'base_excess_arterial_blood_gas_analysis',
       'ph_arterial_blood_gas_analysis',
       'total_co2_arterial_blood_gas_analysis',
       'hco3_arterial_blood_gas_analysis', 'po2_arterial_blood_gas_analysis',
       'arteiral_fio2', 'phosphor', 'cto2_arterial_blood_gas_analysis'],
      dtype='object', length=106)

# Select your features (columns)

In [5]:
# Columns carried forward for modelling: the exam result followed by the
# age quantile and the blood-marker measurements.
feature_columns = [
    'sars_cov_2_exam_result',
    'patient_age_quantile',
    'leukocytes',
    'platelets',
    'monocytes',
    'hematocrit',
    'eosinophils',
    'red_blood_cells',
    'lymphocytes',
    'hemoglobin',
    'mean_platelet_volume',
]
selected_features = df[feature_columns]
selected_features.head()

Unnamed: 0,sars_cov_2_exam_result,patient_age_quantile,leukocytes,platelets,monocytes,hematocrit,eosinophils,red_blood_cells,lymphocytes,hemoglobin,mean_platelet_volume
0,negative,13,,,,,,,,,
1,negative,17,-0.09461,-0.517413,0.357547,0.236515,1.482158,0.102004,0.318366,-0.02234,0.010677
2,negative,8,,,,,,,,,
3,negative,5,,,,,,,,,
4,negative,15,,,,,,,,,


In [6]:
# Keep only the rows that have a value in every selected column,
# printing the frame's shape before and after the drop.
shape_before = selected_features.shape
selected_features = selected_features.dropna()
for shape in (shape_before, selected_features.shape):
    print(shape)

(5644, 11)
(598, 11)


# Create a Train Test Split

Use `sars_cov_2_exam_result` for the y values

In [7]:
from sklearn.model_selection import train_test_split

# Binary target: 1.0 for a positive exam result, 0.0 for a negative one.
result_codes = {'positive': 1.0, 'negative': 0.0}
y = selected_features['sars_cov_2_exam_result'].map(result_codes)

# Predictors: every selected column except the exam result.
predictor_columns = [
    'patient_age_quantile',
    'leukocytes',
    'platelets',
    'monocytes',
    'hematocrit',
    'eosinophils',
    'red_blood_cells',
    'lymphocytes',
    'hemoglobin',
    'mean_platelet_volume',
]
X = selected_features[predictor_columns]

# Fixed random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

y

1       0.0
8       0.0
15      0.0
18      0.0
22      0.0
       ... 
5602    0.0
5614    0.0
5615    0.0
5618    0.0
5643    1.0
Name: sars_cov_2_exam_result, Length: 598, dtype: float64

In [8]:
# Show the Python type of the target object `y`.
type(y)

pandas.core.series.Series

In [10]:
# Report how many samples fall in each outcome class, as a count and as a
# percentage of all samples.
total = len(y)
y_pos = int((y == 1.0).sum())
y_neg = int((y == 0.0).sum())
for label, count in (('Positive outcomes: ', y_pos), ('Negative outcomes: ', y_neg)):
    print(label, count, ', ', round(count / total * 100.0, 1), '%')

Positive outcomes:  81 ,  13.5 %
Negative outcomes:  517 ,  86.5 %


In [11]:
# Print the number of samples in each piece of the split, then preview
# the training predictors.
for label, part in (
    ("X_train length: ", X_train),
    ("X_test length: ", X_test),
    ("y_train length: ", y_train),
    ("y_test length: ", y_test),
):
    print(label, len(part))
print(X_train.shape)
X_train.head()

X_train length:  448
X_test length:  150
y_train length:  448
y_test length:  150
(448, 10)


Unnamed: 0,patient_age_quantile,leukocytes,platelets,monocytes,hematocrit,eosinophils,red_blood_cells,lymphocytes,hemoglobin,mean_platelet_volume
5581,19,0.609436,0.826701,-0.167718,-2.212562,0.344395,-2.031269,-0.312784,-2.027332,-0.325903
303,15,-0.871009,-1.321369,2.957604,-0.564585,0.386534,-0.215342,-0.295726,-0.460932,0.908221
5172,4,1.257827,-0.165682,-0.482876,-0.862136,-0.835508,-1.202642,0.437772,-0.33562,-0.999063
22,9,-1.132592,-0.668155,2.012129,0.190738,-0.70909,-0.127191,0.002791,-0.147652,1.020415
1317,10,0.492559,0.374475,-1.612194,-0.495919,-0.624811,-0.814774,-0.43219,-0.523588,0.683835


# Pre-processing

Scale the selected features using the `StandardScaler`, fitted on the training split only and then applied to both the training and test splits.

In [12]:
# Preview the first rows of the training predictors.
X_train.head()

Unnamed: 0,patient_age_quantile,leukocytes,platelets,monocytes,hematocrit,eosinophils,red_blood_cells,lymphocytes,hemoglobin,mean_platelet_volume
5581,19,0.609436,0.826701,-0.167718,-2.212562,0.344395,-2.031269,-0.312784,-2.027332,-0.325903
303,15,-0.871009,-1.321369,2.957604,-0.564585,0.386534,-0.215342,-0.295726,-0.460932,0.908221
5172,4,1.257827,-0.165682,-0.482876,-0.862136,-0.835508,-1.202642,0.437772,-0.33562,-0.999063
22,9,-1.132592,-0.668155,2.012129,0.190738,-0.70909,-0.127191,0.002791,-0.147652,1.020415
1317,10,0.492559,0.374475,-1.612194,-0.495919,-0.624811,-0.814774,-0.43219,-0.523588,0.683835


In [13]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training predictors only.
# The target y is left unscaled.
X_scaler = StandardScaler()
X_scaler.fit(X_train)

In [14]:
# Apply the scaler fitted on the training predictors to both splits.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

# Setup Evaluation of Multiple Models

In [19]:
# # Classifier Models
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.svm import SVC
# from sklearn.linear_model import SGDClassifier

# Regression Models
from sklearn import linear_model
from sklearn import svm

In [24]:
# # Classifier Models
# lr = LogisticRegression(random_state=99, max_iter=500) # Max_iter=100 (default) generated a warning
# rf = RandomForestClassifier(random_state=99)
# knn = KNeighborsClassifier(n_neighbors=10)
# svm_model = SVC()
# sgd = SGDClassifier(shuffle=True)

# models = [lr, rf, knn, svm_model, sgd]
# names = ['Logistic Regression', 'Random Forest', 'K Nearest Neighbbors', 'Support Vector Classification',
#         'Stochastic Gradient Descent']

# Regression models compared in this notebook.
linear_regression = linear_model.LinearRegression()
ridge_regression = linear_model.Ridge(alpha=.5)
svregression = svm.SVR()

# Display label -> estimator, kept in evaluation order. `names` and
# `models` are parallel lists derived from this mapping.
labelled_models = {
    'OLS linear Regression': linear_regression,
    'Ridge Regression': ridge_regression,
    'Support Vector Regression': svregression,
}
names = list(labelled_models)
models = list(labelled_models.values())



In [25]:
# Fit each model on the scaled training data and print its score on the
# scaled test data (for scikit-learn regressors, .score() is R^2).
for label, estimator in zip(names, models):
    test_score = estimator.fit(X_train_scaled, y_train).score(X_test_scaled, y_test)
    print(label, test_score)

OLS linear Regression 0.12590304986325895
Ridge Regression 0.12618241164046196
Support Vector Regression 0.2066331783768467


# Save to PKL file

In [35]:
# See: https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/
import os
import pickle

from sklearn.linear_model import LogisticRegression

# `lr` is not created by any active cell above, so on a fresh kernel this
# cell raised NameError. Build and fit the classifier here so the saved
# artifact is reproducible with Restart & Run All.
lr = LogisticRegression(random_state=99, max_iter=500)  # max_iter=100 (default) generated a warning
lr.fit(X_train_scaled, y_train)

filename = 'Models/logistic_regression_classifier-alexis.pkl'
# Make sure the output directory exists before writing into it.
os.makedirs(os.path.dirname(filename), exist_ok=True)
# The context manager guarantees the file is flushed and closed after the dump.
with open(filename, 'wb') as model_file:
    pickle.dump(lr, model_file)


# Test the PKL file

In [36]:
# Reload the pickled model and score it on the scaled test split.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
# The context manager closes the file handle once the model is loaded.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
result = loaded_model.score(X_test_scaled, y_test)
print(result)

0.88
