In [14]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [15]:
# Read the raw clinical-spectrum CSV into a DataFrame and preview its last rows.
dataset = pd.read_csv(
    'Resources/diagnosis-of-covid-19-and-its-clinical-spectrum.csv'
)
dataset.tail()

Unnamed: 0,patient_id,patient_age_quantile,sars_cov_2_exam_result,patient_addmited_to_regular_ward_1_yes_0_no,patient_addmited_to_semi_intensive_unit_1_yes_0_no,patient_addmited_to_intensive_care_unit_1_yes_0_no,hematocrit,hemoglobin,platelets,mean_platelet_volume,...,hb_saturation_arterial_blood_gases,pco2_arterial_blood_gas_analysis,base_excess_arterial_blood_gas_analysis,ph_arterial_blood_gas_analysis,total_co2_arterial_blood_gas_analysis,hco3_arterial_blood_gas_analysis,po2_arterial_blood_gas_analysis,arteiral_fio2,phosphor,cto2_arterial_blood_gas_analysis
5639,ae66feb9e4dc3a0,3,positive,f,f,f,,,,,...,,,,,,,,,,
5640,517c2834024f3ea,17,negative,f,f,f,,,,,...,,,,,,,,,,
5641,5c57d6037fe266d,4,negative,f,f,f,,,,,...,,,,,,,,,,
5642,c20c44766f28291,10,negative,f,f,f,,,,,...,,,,,,,,,,
5643,2697fdccbfeb7f7,19,positive,f,f,f,0.694287,0.541564,-0.906829,-0.325903,...,,,,,,,,,,


In [16]:
# Drop the sparsely populated columns found towards the end of the frame.
# Each label appears exactly once here (the previous list repeated "d_dimer").
# "arteiral_fio2" is spelled as it appears in the dataset header.
dataset_new = dataset.drop(
    columns=[
        "cto2_arterial_blood_gas_analysis",
        "phosphor",
        "arteiral_fio2",
        "po2_arterial_blood_gas_analysis",
        "hco3_arterial_blood_gas_analysis",
        "total_co2_arterial_blood_gas_analysis",
        "ph_arterial_blood_gas_analysis",
        "base_excess_arterial_blood_gas_analysis",
        "pco2_arterial_blood_gas_analysis",
        "hb_saturation_arterial_blood_gases",
        "albumin",
        "d_dimer",
        "lipase_dosage",
        "arterial_lactic_acid",
        "ferritin",
        "creatine_phosphokinase_cpk",
        "vitamin_b12",
        "prothrombin_time_pt_activity",
        "lactic_dehydrogenase",
        "international_normalized_ratio_inr",
        "relationship_patient_normal",
        "partial_thromboplastin_time_ptt",
    ]
)

dataset_new.head()

Unnamed: 0,patient_id,patient_age_quantile,sars_cov_2_exam_result,patient_addmited_to_regular_ward_1_yes_0_no,patient_addmited_to_semi_intensive_unit_1_yes_0_no,patient_addmited_to_intensive_care_unit_1_yes_0_no,hematocrit,hemoglobin,platelets,mean_platelet_volume,...,urine_urobilinogen,urine_protein,urine_sugar,urine_leukocytes,urine_crystals,urine_red_blood_cells,urine_hyaline_cylinders,urine_granular_cylinders,urine_yeasts,urine_color
0,44477f75e8169d2,13,negative,f,f,f,,,,,...,,,,,,,,,,
1,126e9dd13932f68,17,negative,f,f,f,0.236515,-0.02234,-0.517413,0.010677,...,,,,,,,,,,
2,a46b4402a0e5696,8,negative,f,f,f,,,,,...,,,,,,,,,,
3,f7d619a94f97c45,5,negative,f,f,f,,,,,...,,,,,,,,,,
4,d9e41465789c2b5,15,negative,f,f,f,,,,,...,,,,,,,,,,


In [17]:
# Drop the three hospital-admission indicator columns (regular ward,
# semi-intensive unit, intensive care unit), which the notebook treats as
# offering no useful information for this model.
dataset_new_2 = dataset_new.drop(
    columns=[
        "patient_addmited_to_regular_ward_1_yes_0_no",
        "patient_addmited_to_semi_intensive_unit_1_yes_0_no",
        "patient_addmited_to_intensive_care_unit_1_yes_0_no",
    ]
)
dataset_new_2.head()

Unnamed: 0,patient_id,patient_age_quantile,sars_cov_2_exam_result,hematocrit,hemoglobin,platelets,mean_platelet_volume,red_blood_cells,lymphocytes,mean_corpuscular_hemoglobin_concentration_mchc,...,urine_urobilinogen,urine_protein,urine_sugar,urine_leukocytes,urine_crystals,urine_red_blood_cells,urine_hyaline_cylinders,urine_granular_cylinders,urine_yeasts,urine_color
0,44477f75e8169d2,13,negative,,,,,,,,...,,,,,,,,,,
1,126e9dd13932f68,17,negative,0.236515,-0.02234,-0.517413,0.010677,0.102004,0.318366,-0.95079,...,,,,,,,,,,
2,a46b4402a0e5696,8,negative,,,,,,,,...,,,,,,,,,,
3,f7d619a94f97c45,5,negative,,,,,,,,...,,,,,,,,,,
4,d9e41465789c2b5,15,negative,,,,,,,,...,,,,,,,,,,


In [18]:
# Features for the first model: the age quantile followed by 15 blood-test
# measurements. Urine-test columns are reserved for a second model.
X = dataset_new_2[
    [
        "patient_age_quantile",
        "hematocrit",
        "hemoglobin",
        "platelets",
        "mean_platelet_volume",
        "red_blood_cells",
        "lymphocytes",
        "mean_corpuscular_hemoglobin_concentration_mchc",
        "leukocytes",
        "basophils",
        "mean_corpuscular_hemoglobin_mch",
        "eosinophils",
        "mean_corpuscular_volume_mcv",
        "monocytes",
        "red_blood_cell_distribution_width_rdw",
        "serum_glucose",
    ]
].values
# Target: the SARS-CoV-2 exam result, still as raw string labels at this point.
y = dataset_new_2["sars_cov_2_exam_result"].values
print(X)

[[13.                 nan         nan ...         nan         nan
          nan]
 [17.          0.23651545 -0.02234027 ...  0.35754666 -0.62507266
  -0.14064808]
 [ 8.                 nan         nan ...         nan         nan
          nan]
 ...
 [ 4.                 nan         nan ...         nan         nan
          nan]
 [10.                 nan         nan ...         nan         nan
          nan]
 [19.          0.69428688  0.54156393 ...  0.5676524  -0.18279028
          nan]]


In [19]:
# Show the raw target labels (string exam results) before they are encoded.
print(y)

['negative' 'negative' 'negative' ... 'negative' 'negative' 'positive']


In [34]:
# Replace missing values with the per-column mean.
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
# Columns 1-15 hold the blood-test features; column 0 (age quantile) is not imputed.
# NOTE(review): the imputer is fitted on the full matrix before the train/test
# split, so test rows contribute to the imputed means. Consider fitting on the
# training split only.
X[:, 1:16] = imputer.fit_transform(X[:, 1:16])

In [35]:
# Inspect the feature matrix after imputation to confirm the NaNs were filled.
print(X)

[[13.          0.05340703  0.04031596 ... -0.11519115 -0.18279028
  -0.29207045]
 [17.          0.23651545 -0.02234027 ...  0.35754666 -0.62507266
  -0.14064808]
 [ 8.          0.05340703  0.04031596 ... -0.11519115 -0.18279028
  -0.29207045]
 ...
 [ 4.          0.05340703  0.04031596 ... -0.11519115 -0.18279028
  -0.29207045]
 [10.          0.05340703  0.04031596 ... -0.11519115 -0.18279028
  -0.29207045]
 [19.          0.69428688  0.54156393 ...  0.5676524  -0.18279028
  -0.29207045]]


In [36]:
# Encode the string exam results as integers ('negative' -> 0, 'positive' -> 1).
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
# Encode from the source column rather than from `y` itself so this cell is
# idempotent: re-running it always fits on the original string labels and
# `le.classes_` keeps them. The previous `y = le.fit_transform(y)` re-fitted on
# the already-encoded integers when the cell was executed a second time.
y = le.fit_transform(dataset_new_2["sars_cov_2_exam_result"].values)
print(y)

[0 0 0 ... 0 0 1]


In [37]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible across runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [38]:
# Fit a StandardScaler on the training features only; the fitted scaler is
# then reused to transform both the training and the testing split.
from sklearn.preprocessing import StandardScaler
X_scaler = StandardScaler().fit(X_train)

In [39]:
# Apply the already-fitted scaler to both splits (train first, then test).
X_train_scaled, X_test_scaled = map(X_scaler.transform, (X_train, X_test))

In [40]:
# Train a support vector classifier with an RBF kernel on the scaled training data.
from sklearn.svm import SVC

model = SVC(kernel='rbf').fit(X_train_scaled, y_train)
# Bare expression so the fitted estimator's repr is shown as the cell output.
model

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [41]:
# Report the classifier's score (mean accuracy) on each split.
# NOTE(review): accuracy alone can be misleading if the classes are imbalanced —
# confirm the positive/negative balance and consider per-class metrics.
train_accuracy = model.score(X_train_scaled, y_train)
test_accuracy = model.score(X_test_scaled, y_test)
print(f"Training Data Score: {train_accuracy}")
print(f"Testing Data Score: {test_accuracy}")

Training Data Score: 0.9140642303433001
Testing Data Score: 0.9052258635961028
