# Interesting columns to choose.

Research Article: https://www.techscience.com/cmc/v63n1/38464

In [1]:
import pandas as pd

df = pd.read_csv('../arun/Resources/diagnosis-of-covid-19-and-its-clinical-spectrum.csv')
cols = [
    'patient_age_quantile',
    'hematocrit',
    'hemoglobin',
    'platelets',
    'mean_platelet_volume',
    'red_blood_cells',
    'sars_cov_2_exam_result'
]
rowcol = ['Nrows', 'Ncols']
pd.DataFrame(df.shape, index=rowcol, columns=['Count'])

Unnamed: 0,Count
Nrows,5644
Ncols,111


# Separate x and y parameters.

In [2]:
df = df[cols].dropna()
category = [0 if result == 'negative' else 1 for result in df['sars_cov_2_exam_result']]
df['sars_cov_2_exam_result'] = category
X = df.drop('sars_cov_2_exam_result', axis=1)
y = df['sars_cov_2_exam_result'].values.reshape(-1, 1)
pd.DataFrame((X.shape, y.shape), index=['X', 'y'], columns=rowcol)

Unnamed: 0,Nrows,Ncols
X,599,6
y,599,1


# Rescale data.

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Compare with other models.

In [4]:
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
import numpy as np

names = ['Linear Regression', 'Lasso', 'Ridge', 'Elastic Net']
com_df = pd.DataFrame(index=names, columns=['Root Mean Square', 'R-squared'])

models = [LinearRegression(), Lasso(alpha=0.01), Ridge(alpha=.01), ElasticNet(alpha=.01)]
for mod, name in zip(models, names):
    mod.fit(X_train, y_train)
    predictions = mod.predict(X_test)
    MSE = mean_squared_error(y_test, predictions)
    r2 = mod.score(X_test, y_test)
    com_df.loc[name] = [MSE, r2]
com_df

Unnamed: 0,Root Mean Square,R-squared
Linear Regression,0.105715,0.0851619
Lasso,0.105409,0.0878036
Ridge,0.105714,0.0851713
Elastic Net,0.105162,0.0899473


# Classification

In [11]:
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()
classifier.fit(X_train, y_train.flatten())
predictions = classifier.predict(X_test)

print(f'Size of training data: {y_train.size}')
print(f'Size of testing data: {y_test.size}')
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Size of training data: 449
Size of testing data: 150
Testing Data Score: 0.8733333333333333


In [7]:
results = pd.DataFrame({
    "Actual": y_test.flatten(),
    "Prediction": predictions,
    'Category': np.zeros_like(predictions)
})

results = results.replace(0, 'N')
results = results.replace(1, 'P')

for index, row in results.iterrows():
    if (row['Actual'] == 'N') and (row['Prediction'] == 'N'):
        cat = 'TN'
    if (row['Actual'] == 'P') and (row['Prediction'] == 'P'):
        cat = 'TP'
    if (row['Actual'] == 'N') and (row['Prediction'] == 'P'):
        cat = 'FP'
    if (row['Actual'] == 'P') and (row['Prediction'] == 'N'):
        cat = 'FN'
    results['Category'][index] = cat
results['Category'].value_counts()

TN    130
FN     19
TP      1
Name: Category, dtype: int64