# Select the columns of interest

Research Article: https://www.techscience.com/cmc/v63n1/38464

In [1]:
import numpy as np
import pandas as pd

# Load the full raw dataset; the column subset below is applied in a later cell.
df = pd.read_csv('../arun/Resources/diagnosis-of-covid-19-and-its-clinical-spectrum.csv')

# Columns kept for modelling. The last entry is the exam result, which later
# cells split off as the prediction target.
cols = [
    'patient_age_quantile',
    'hematocrit', 'hemoglobin', 'red_blood_cells',
    'platelets', 'mean_platelet_volume',
    'sars_cov_2_exam_result',
]
# Re-order the list so it matches the original column order exactly.
cols = [cols[0], cols[1], cols[2], cols[4], cols[5], cols[3], cols[6]]

# Row labels reused by every "shape" summary table in this notebook.
rowcol = ['Nrows', 'Ncols']

# One-column summary of the raw frame's dimensions.
pd.Series(df.shape, index=rowcol, name='Original Data').to_frame()

Unnamed: 0,Original Data
Nrows,5644
Ncols,111


# Separate the features (X) and the target (y).

In [2]:
# Keep only the selected columns and drop every row that has a missing value
# in any of them.
target_col = 'sars_cov_2_exam_result'
df = df[cols].dropna()

# Encode the exam result as 0 for 'negative' and 1 for anything else.
# The dtype guard makes this cell safe to re-run: `df` is overwritten in place,
# so on a second execution the column already holds 0/1 and comparing those
# integers against the string 'negative' would silently relabel every row as 1.
if not pd.api.types.is_numeric_dtype(df[target_col]):
    df[target_col] = (df[target_col] != 'negative').astype('int64')

# Feature matrix: every selected column except the label.
X = df.drop(columns=target_col)
# Target as an (n_samples, 1) column vector.
y = df[target_col].values.reshape(-1, 1)

# Summary table of the resulting shapes.
pd.DataFrame((X.shape, y.shape), index=['X', 'y'], columns=rowcol)

Unnamed: 0,Nrows,Ncols
X,599,6
y,599,1


# Separate training and test data.

In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Hold out a test set (scikit-learn's default split size); the fixed seed keeps
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Per-class sample counts in each split.
# Rows are labelled by class *value* (0 -> Negative, 1 -> Positive) rather than
# by position: `value_counts()` orders by frequency, so positional labelling
# would be wrong whenever the positive class is the larger one, and would raise
# if a class were absent from a split. `reindex(..., fill_value=0)` guarantees
# both classes always appear.
class_names = {0: 'Negative', 1: 'Positive'}
data = pd.DataFrame({
    split_name: (
        pd.Series(labels.ravel())
        .value_counts()
        .reindex(list(class_names), fill_value=0)
    )
    for split_name, labels in (('Training', y_train), ('Testing', y_test))
}).rename(index=class_names)
data

Unnamed: 0,Training,Testing
Negative,388,130
Positive,61,20


# Classification

In [44]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression model with default hyper-parameters.
# `fit` expects a 1-D label array, so the (n, 1) column vector is flattened;
# `fit` returns the estimator itself, which allows the chained form.
classifier = LogisticRegression().fit(X_train, y_train.ravel())

# Predicted class labels for the held-out samples.
predictions = classifier.predict(X_test)

In [10]:
# Side-by-side table of true and predicted labels, encoded as 'N' (0) / 'P' (1).
# Each column is mapped explicitly instead of running a frame-wide
# `replace(0, 'N')`, which would also rewrite any other column containing 0.
label_names = {0: 'N', 1: 'P'}
results = pd.DataFrame({
    "Actual": pd.Series(y_test.ravel()).map(label_names),
    "Prediction": pd.Series(predictions).map(label_names),
})

# Confusion-matrix category per sample, computed in one vectorized step:
#   first letter  - 'T' when the prediction matches the actual label, else 'F'
#   second letter - the predicted label ('P' or 'N')
# giving TP / TN / FP / FN. This replaces a row-by-row loop that wrote through
# chained indexing (`results['Category'][index] = ...`), which raises
# SettingWithCopyWarning and is a silent no-op under pandas Copy-on-Write, and
# which could carry a stale category over from the previous row.
is_correct = results['Actual'] == results['Prediction']
results['Category'] = is_correct.map({True: 'T', False: 'F'}) + results['Prediction']

# Confusion Matrix

In [31]:
# The four confusion-matrix cells that later cells look up by key.
cf = ['TP', 'FP', 'TN', 'FN']

# Count how often each category occurs in the results table.
dd = results['Category'].value_counts().to_dict()

# A category that never occurred is missing from the counts; add it with a
# count of zero so every key in `cf` is always present.
for c in cf:
    dd.setdefault(c, 0)
dd

{'TN': 130, 'FN': 19, 'TP': 1, 'FP': 0}

In [43]:
def _percent(numerator, denominator):
    """Return ``100 * numerator / denominator``, or NaN if the denominator is 0.

    The guard matters for precision in particular: when the model predicts no
    positives at all, TP + FP is 0 and the metric is undefined rather than an
    error.
    """
    if not denominator:
        return float('nan')
    return 100 * numerator / denominator


# Accuracy: share of all test samples that were classified correctly.
accuracy = _percent(dd['TP'] + dd['TN'], data['Testing'].sum())
# Precision: share of predicted positives that are truly positive.
precision = _percent(dd['TP'], dd['TP'] + dd['FP'])
# Recall: share of actual positives that the model found.
recall = _percent(dd['TP'], dd['TP'] + dd['FN'])

pd.DataFrame([accuracy, precision, recall],
             index=['Accuracy', 'Precision', 'Recall'],
             columns=['Percent']
            )

Unnamed: 0,Percent
Accuracy,87.333333
Precision,100.0
Recall,5.0
