We will examine the Covid-19 binary prediction example in more detail and suggest the glioma grading dataset as a good route for further analysis in future assessments.

First we import the libraries and load the data.

In [2]:
# Importing required libraries

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including pandas deprecation / chained-assignment warnings that could point
# at real problems in later cells; consider filtering specific categories only.
warnings.filterwarnings('ignore')

In [3]:
# Load the raw records from 'corona.csv' (path is relative to the working directory).
df = pd.read_csv('corona.csv')

In [4]:
# Preview the first rows to check the column layout and value formats.
df.head()

Unnamed: 0,Ind_ID,Test_date,Cough_symptoms,Fever,Sore_throat,Shortness_of_breath,Headache,Corona,Age_60_above,Sex,Known_contact
0,1,11-03-2020,True,False,True,False,False,negative,,,Abroad
1,2,11-03-2020,False,True,False,False,False,positive,,,Abroad
2,3,11-03-2020,False,True,False,False,False,positive,,,Abroad
3,4,11-03-2020,True,False,False,False,False,negative,,,Abroad
4,5,11-03-2020,True,False,False,False,False,negative,,,Contact with confirmed


Then they do some data preprocessing.

In [5]:
# (rows, columns) of the freshly loaded frame.
df.shape

(278848, 11)

In [6]:
# Per-column non-null counts and dtypes of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278848 entries, 0 to 278847
Data columns (total 11 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   Ind_ID               278848 non-null  int64 
 1   Test_date            278848 non-null  object
 2   Cough_symptoms       278596 non-null  object
 3   Fever                278596 non-null  object
 4   Sore_throat          278847 non-null  object
 5   Shortness_of_breath  278847 non-null  object
 6   Headache             278847 non-null  object
 7   Corona               278848 non-null  object
 8   Age_60_above         151528 non-null  object
 9   Sex                  259285 non-null  object
 10  Known_contact        278848 non-null  object
dtypes: int64(1), object(10)
memory usage: 23.4+ MB


In [7]:
# Turn literal 'None' strings into real missing values (NaN)

df = df.replace(to_replace={'None': np.nan})

In [8]:
# Count the missing values in each column
df.isna().sum()

Ind_ID                      0
Test_date                   0
Cough_symptoms            252
Fever                     252
Sore_throat                 1
Shortness_of_breath         1
Headache                    1
Corona                      0
Age_60_above           127320
Sex                     19563
Known_contact               0
dtype: int64

The following code block deletes the rows that have NaNs in the specified columns.

In [9]:
# Discard every row that is missing at least one of the symptom flags

symptom_columns = ['Cough_symptoms', 'Fever', 'Sore_throat', 'Shortness_of_breath', 'Headache']
df = df.dropna(axis=0, subset=symptom_columns)

Then we remove the Age_60_above column, as nearly half of its values are missing (127,320 of 278,848 rows).

In [None]:
# Remove the Age_60_above column from the frame

df = df.drop(columns=['Age_60_above'])

In [None]:
# Turn the literal 'other' value into a missing value (NaN) wherever it occurs
# in the frame (note: this targets 'other', not the 'None' strings).

df = df.replace(to_replace={'other': np.nan})

In [None]:
# Discard rows whose Corona label is missing

df = df.dropna(axis=0, subset=['Corona'])

In [None]:
# Cast every column except the identifier and the test date to categorical dtype

non_categorical = ('Ind_ID', 'Test_date')
df = df.astype({name: 'category' for name in df.columns if name not in non_categorical})

In [None]:
# Re-check dtypes and non-null counts after the cleaning steps.
df.info()

Then some exploratory data analysis (EDA).

In [None]:
# Select seaborn's 'dark' style for the figures.
sns.set_style('dark')

In [None]:
cols = ['Cough_symptoms', 'Fever', 'Sore_throat']

# One row of panels, one panel per column
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 6))

# Draw the value counts of each column on its own panel
for ax, col in zip(axes, cols):
    sns.countplot(x=col, data=df, ax=ax)
    ax.set_ylabel('Count')
    ax.set_title(f'Distribution of {col}', fontsize=18)
    ax.tick_params(axis='x')
    ax.grid(True)

fig.tight_layout()
plt.show()

In [None]:
cols = ['Shortness_of_breath', 'Headache', 'Corona']

# One row of panels, one panel per column
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 6))

# Draw the value counts of each column on its own panel
for ax, col in zip(axes, cols):
    sns.countplot(x=col, data=df, ax=ax)
    ax.set_ylabel('Count')
    ax.set_title(f'Distribution of {col}', fontsize=18)
    ax.tick_params(axis='x')
    ax.grid(True)

fig.tight_layout()
plt.show()

In [None]:
cols = ['Sex', 'Known_contact']

# One row of panels, one panel per column
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(18, 6))

# Draw the value counts of each column on its own panel
for ax, col in zip(axes, cols):
    sns.countplot(x=col, data=df, ax=ax)
    ax.set_ylabel('Count')
    ax.set_title(f'Distribution of {col}', fontsize=18)
    ax.tick_params(axis='x')
    ax.grid(True)

fig.tight_layout()
plt.show()

Now we plot for each label (positive or negative).

In [None]:
cols = ['Cough_symptoms', 'Fever', 'Sore_throat']

# One row of panels, one panel per column
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 6))

# Draw the value counts of each column, split by the Corona label
for ax, col in zip(axes, cols):
    sns.countplot(x=col, hue='Corona', data=df, palette="Set2", ax=ax)
    ax.set_ylabel('Count')
    ax.set_title(f'Distribution of {col}', fontsize=18)
    ax.tick_params(axis='x')
    ax.grid(True)

fig.tight_layout()
plt.show()

In [None]:
cols = ['Shortness_of_breath', 'Headache']

# One row of panels, one panel per column
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

# Draw the value counts of each column, split by the Corona label
for ax, col in zip(axes, cols):
    sns.countplot(x=col, hue='Corona', data=df, palette="Set2", ax=ax)
    ax.set_ylabel('Count')
    ax.set_title(f'Distribution of {col}', fontsize=18)
    ax.tick_params(axis='x')
    ax.grid(True)

fig.tight_layout()
plt.show()

In [None]:
cols = ['Sex', 'Known_contact']

# One row of panels, one panel per column
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

# Draw the value counts of each column, split by the Corona label
for ax, col in zip(axes, cols):
    sns.countplot(x=col, hue='Corona', data=df, palette="Set2", ax=ax)
    ax.set_ylabel('Count')
    ax.set_title(f'Distribution of {col}', fontsize=12)
    ax.tick_params(axis='x')
    ax.grid(True)

fig.tight_layout()
plt.show()

These were the author's comments:

Cough_symptoms: A significant number of individuals who tested negative also reported no cough symptoms. However, among those who reported having a cough, the number of positive cases is relatively higher.

Fever: Most individuals did not report having a fever. Among those who did, the number of positive cases is higher than negative cases.

Sore_throat: The majority did not experience this symptom.

Shortness_of_breath: This symptom was also not common among the individuals.

Headache: A notable number of individuals reported not having headaches.

Age_60_above: A considerable number of individuals were below 60 years of age. (Note: this column was dropped during preprocessing above, so it does not appear in the plots.)

Sex: The number of females and males in the dataset is almost evenly distributed.

Known_contact: Most individuals did not have a known contact with a confirmed case.

Next they do what they call 'feature transformation' which really is just converting labels to numbers.

In [None]:
# Work on an independent (deep) copy so the cleaned frame stays untouched

covid_df = df.copy()

In [None]:
# Replace each listed column by its single remaining dummy indicator.
# drop_first=True drops the first category's dummy to avoid multicollinearity.
# NOTE(review): assigning the get_dummies result to one column assumes every
# listed column has exactly two categories -- TODO confirm.

binary_columns = ['Cough_symptoms', 'Fever', 'Sore_throat', 'Shortness_of_breath', 'Headache', 'Corona']
for name in binary_columns:
    covid_df[name] = pd.get_dummies(covid_df[name], drop_first=True)

In [None]:
# Encode Sex and Known_contact as integer codes.
#
# The result of replace() is assigned back to the column instead of calling
# covid_df[col].replace(..., inplace=True): an in-place call on a column
# selection may act on a temporary object and leave covid_df unchanged
# (pandas Copy-on-Write), so the encoding could silently be skipped.
# Values not listed in a mapping (including NaN) are left as they are.

covid_df['Sex'] = covid_df['Sex'].replace({'male': 0, 'female': 1})
covid_df['Known_contact'] = covid_df['Known_contact'].replace(
    {'Other': 2, 'Contact with confirmed': 1, 'Abroad': 0}
)

In [None]:
# Preview the encoded frame.
covid_df.head()

In [None]:
# Count the missing values that remain in each column after encoding
covid_df.isna().sum()

Now they do 'missing value imputation'.

In [None]:
from sklearn.impute import KNNImputer

In [None]:
# Take an independent (deep) copy of the encoded frame

imputed_data = covid_df.copy()

In [None]:
# Fit a KNN imputer on the feature columns and fill in their missing values

columns = [
    'Cough_symptoms', 'Fever', 'Sore_throat',
    'Shortness_of_breath', 'Headache', 'Sex', 'Known_contact',
]
knn = KNNImputer(weights='uniform', n_neighbors=5)
imputed = knn.fit_transform(covid_df[columns])

In [None]:
# Wrap the imputed array in a DataFrame that carries the feature column names.
# Fix: pandas is imported only under the alias `pd`, so `pandas.DataFrame`
# raised NameError.
# NOTE(review): df1 gets a fresh 0..n-1 index, while covid_df / imputed_data
# keep the labels left over after the earlier row drops -- combine them
# positionally, or pass index=covid_df.index here if label alignment is wanted.
df1 = pd.DataFrame(imputed, columns=columns)

In [None]:
# Inspect dtypes and non-null counts of the working copy.
imputed_data.info()

In [None]:
# Write the KNN-imputed Sex values into imputed_data, then cast to uint8.
# Fix: imputed_data was copied from covid_df before imputation and never
# received the imputed values, so its Sex column still held NaN and a direct
# astype('uint8') failed on those missing entries.
# `imputed` has the same row order as covid_df / imputed_data, so the values
# are taken from the array by position rather than aligned on index labels.
# Imputed entries are neighbour averages, so they are rounded back to 0/1.
imputed_data['Sex'] = np.rint(imputed[:, columns.index('Sex')]).astype('uint8')