In [13]:
import numpy as np
import pandas as pd
import glob as gl
from pprint import pprint

# Locate the dataset CSV by file name. Without recursive=True, each `**` in the
# pattern matches a single directory level, so the file is searched for exactly
# two directories below the parent directory.
fname = 'diagnosis-of-covid-19-and-its-clinical-spectrum.csv'
path = gl.glob(f'../**/**/{fname}')
if not path:
    # Fail with an actionable message instead of an opaque IndexError on path[0].
    raise FileNotFoundError(f'No file named {fname!r} matched the pattern ../**/**/')
df = pd.read_csv(path[0])

# Per-column count of non-missing values, limited to the first 100 columns.
see = df.count().to_frame('Non-NaN').head(100)

#for index, row in see.iterrows():
#    print(row)

Non-NaN    5644
Name: patient_id, dtype: int64
Non-NaN    5644
Name: patient_age_quantile, dtype: int64
Non-NaN    5644
Name: sars_cov_2_exam_result, dtype: int64
Non-NaN    5644
Name: patient_addmited_to_regular_ward_1_yes_0_no, dtype: int64
Non-NaN    5644
Name: patient_addmited_to_semi_intensive_unit_1_yes_0_no, dtype: int64
Non-NaN    5644
Name: patient_addmited_to_intensive_care_unit_1_yes_0_no, dtype: int64
Non-NaN    603
Name: hematocrit, dtype: int64
Non-NaN    603
Name: hemoglobin, dtype: int64
Non-NaN    602
Name: platelets, dtype: int64
Non-NaN    599
Name: mean_platelet_volume, dtype: int64
Non-NaN    602
Name: red_blood_cells, dtype: int64
Non-NaN    602
Name: lymphocytes, dtype: int64
Non-NaN    602
Name: mean_corpuscular_hemoglobin_concentration_mchc, dtype: int64
Non-NaN    602
Name: leukocytes, dtype: int64
Non-NaN    602
Name: basophils, dtype: int64
Non-NaN    602
Name: mean_corpuscular_hemoglobin_mch, dtype: int64
Non-NaN    602
Name: eosinophils, dtype: int64
Non-N

# Columns selected for the analysis

Research Article: https://www.techscience.com/cmc/v63n1/38464

In [2]:
# Columns retained from the raw data: six candidate predictors followed by the
# exam-result column.
predictor_cols = [
    'patient_age_quantile',
    'hematocrit',
    'hemoglobin',
    'platelets',
    'mean_platelet_volume',
    'red_blood_cells',
]
cols = predictor_cols + ['sars_cov_2_exam_result']

# Row labels reused by the shape tables in this notebook.
rowcol = ['Nrows', 'Ncols']

# One-column table showing the shape of the data frame as loaded.
pd.Series(df.shape, index=rowcol, name='Original Data').to_frame()

Unnamed: 0,Original Data
Nrows,5644
Ncols,111


# Separate the features (X) from the target (y).

In [3]:
# Keep only the selected columns and drop every row that has a missing value
# in any of them.
target = 'sars_cov_2_exam_result'
df = df[cols].dropna()

# Encode the exam result as 0 for 'negative' and 1 for anything else.
# The guard skips the encoding once the column is numeric: because this cell
# overwrites `df`, re-running it would otherwise compare the already-encoded
# 0/1 values against 'negative' and turn every label into 1.
if not pd.api.types.is_numeric_dtype(df[target]):
    category = [0 if result == 'negative' else 1 for result in df[target]]
    df[target] = category

# X: predictor columns; y: target as a column vector of shape (n_rows, 1).
X = df.drop(columns=target)
y = df[target].values.reshape(-1, 1)

# Summary table of both shapes.
pd.DataFrame((X.shape, y.shape), index=['X', 'y'], columns=rowcol)

Unnamed: 0,Nrows,Ncols
X,599,6
y,599,1


# Separate training and test data.

In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Hold out a test set using train_test_split's default split size; the fixed
# seed makes the split reproducible.
# NOTE(review): the classes are imbalanced; consider stratify=y so both splits
# keep the same class ratio (this would change the numbers reported below).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Class counts per split. Counts are looked up by label value (0/1) rather than
# assigned by position: value_counts() orders by frequency, so positional names
# would be swapped if the positive class were the majority, and a class missing
# from a split is reported as 0 instead of breaking the index assignment.
class_names = {0: 'Negative', 1: 'Positive'}
data = pd.DataFrame({
    split: pd.Series(labels.flatten()).value_counts().reindex(list(class_names), fill_value=0)
    for split, labels in (('Training', y_train), ('Testing', y_test))
}).rename(index=class_names)
data

Unnamed: 0,Training,Testing
Negative,388,130
Positive,61,20


# Classification

In [5]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with default settings on the training split
# (fit() returns the estimator itself), then predict a class label for every
# row of the test split.
classifier = LogisticRegression().fit(X_train, y_train.ravel())
predictions = classifier.predict(X_test)

In [6]:
# Translate the numeric class labels (0/1) into 'N'/'P' for both the true and
# the predicted values of the test split.
label_names = {0: 'N', 1: 'P'}
results = pd.DataFrame({
    "Actual": pd.Series(y_test.flatten()).map(label_names),
    "Prediction": pd.Series(predictions).map(label_names),
})

# Confusion-matrix cell for every (actual, predicted) pair.
outcome_of = {
    ('N', 'N'): 'TN',
    ('P', 'P'): 'TP',
    ('N', 'P'): 'FP',
    ('P', 'N'): 'FN',
}

# Build the whole column at once instead of writing row by row through
# results['Category'][index] = ...: that chained assignment raises
# SettingWithCopyWarning and does not update `results` under pandas
# Copy-on-Write. An unexpected label now raises KeyError instead of silently
# reusing the previous row's category.
results['Category'] = [
    outcome_of[pair] for pair in zip(results['Actual'], results['Prediction'])
]

# Confusion Matrix

In [7]:
# The four confusion-matrix cells that must be present in the tally.
cf = ['TP', 'FP', 'TN', 'FN']

# Occurrences of each category among the test predictions.
dd = results['Category'].value_counts().to_dict()

# Categories that never occurred are absent from value_counts(); add them
# with a count of zero so later lookups cannot fail.
for c in cf:
    dd.setdefault(c, 0)
dd

{'TN': 130, 'FN': 19, 'TP': 1, 'FP': 0}

In [9]:
def _ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


# Accuracy: share of test rows classified correctly.
accuracy = _ratio(dd['TP'] + dd['TN'], data.sum()['Testing'])
# Precision: share of predicted positives that are truly positive. It is
# undefined (NaN) when the model predicts no positives at all; a plain
# division would raise ZeroDivisionError in that case.
precision = _ratio(dd['TP'], dd['TP'] + dd['FP'])
# Recall: share of actual positives that the model found. It is undefined
# (NaN) when the test split contains no positives.
recall = _ratio(dd['TP'], dd['TP'] + dd['FN'])

# The values are fractions in [0, 1], despite the 'Percent' column label.
pd.DataFrame([accuracy, precision, recall],
             index=['Accuracy', 'Precision', 'Recall'],
             columns=['Percent']
            )

Unnamed: 0,Percent
Accuracy,0.873333
Precision,1.0
Recall,0.05
