In [2]:
!pip install scikit-learn
!pip install numpy
!pip install matplotlib

Collecting scikit-learn
  Downloading scikit_learn-1.1.2-cp310-cp310-win_amd64.whl (7.4 MB)
     ---------------------------------------- 7.4/7.4 MB 7.8 MB/s eta 0:00:00
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Collecting joblib>=1.0.0
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
     -------------------------------------- 298.0/298.0 kB 9.3 MB/s eta 0:00:00
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.2.0 scikit-learn-1.1.2 threadpoolctl-3.1.0


In [3]:
#Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [4]:
#Read the dataset
data=pd.read_csv('covid_database.csv')

#Display the first five rows
print(data.head())

   ID  age_group  Sex  Fever  Cough  Resp_dist  Sore_throat  Chest_pain  \
0   1          1    1      1      0          0            1           1   
1   2          1    1      1      0          0            0           0   
2   3          1    1      1      0          0            0           0   
3   4          1    0      1      1          0            1           1   
4   5          1    0      1      1          1            1           0   

   Headache  Diarrhea  ...  Cancer  Kidney_problems  Fear_of_reinfection  \
0         1         0  ...       0                0                    0   
1         0         0  ...       0                0                    0   
2         0         0  ...       0                0                    0   
3         1         0  ...       0                0                    0   
4         1         0  ...       0                0                    0   

   Hozpitalization  Self_medication  Persistent_symptoms  Depression  \
0                0  

In [5]:
data.groupby('Persistent_symptoms').count()

Unnamed: 0_level_0,ID,age_group,Sex,Fever,Cough,Resp_dist,Sore_throat,Chest_pain,Headache,Diarrhea,...,Heart_disease,Cancer,Kidney_problems,Fear_of_reinfection,Hozpitalization,Self_medication,Depression,Sputum_production,Lack_of_appetite,Fatigue
Persistent_symptoms,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,802,802,802,802,802,802,802,802,802,802,...,802,802,802,802,802,802,802,802,802,802
1,200,200,200,200,200,200,200,200,200,200,...,200,200,200,200,200,200,200,200,200,200


In [6]:
#Check for null values
print(data.isnull().sum())

ID                     0
age_group              0
Sex                    0
Fever                  0
Cough                  0
Resp_dist              0
Sore_throat            0
Chest_pain             0
Headache               0
Diarrhea               0
Dyspnea                0
Myalgia                0
Sleep_disturbance      0
Smoking_habits         0
Physical_exercise      0
Diabetes               0
Hypertension           0
Heart_disease          0
Cancer                 0
Kidney_problems        0
Fear_of_reinfection    0
Hozpitalization        0
Self_medication        0
Persistent_symptoms    0
Depression             0
Sputum_production      0
Lack_of_appetite       0
Fatigue                0
dtype: int64


In [7]:
#Check data types
data.dtypes

ID                     int64
age_group              int64
Sex                    int64
Fever                  int64
Cough                  int64
Resp_dist              int64
Sore_throat            int64
Chest_pain             int64
Headache               int64
Diarrhea               int64
Dyspnea                int64
Myalgia                int64
Sleep_disturbance      int64
Smoking_habits         int64
Physical_exercise      int64
Diabetes               int64
Hypertension           int64
Heart_disease          int64
Cancer                 int64
Kidney_problems        int64
Fear_of_reinfection    int64
Hozpitalization        int64
Self_medication        int64
Persistent_symptoms    int64
Depression             int64
Sputum_production      int64
Lack_of_appetite       int64
Fatigue                int64
dtype: object

In [8]:
#Divide the dataset into independent and dependent variables
X = data.drop(["Persistent_symptoms"], axis=1)
y = data['Persistent_symptoms']

#Split the data into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True) 

#Data was splitted as 80% train data and 20% test data.
y_train = y_train.values.reshape(-1,1)
y_test = y_test.values.reshape(-1,1)

print("X_train shape:",X_train.shape)
print("X_test shape:",X_test.shape)
print("y_train shape:",y_train.shape)
print("y_test shape:",y_test.shape)

X_train shape: (801, 27)
X_test shape: (201, 27)
y_train shape: (801, 1)
y_test shape: (201, 1)


In [9]:
#Feature Scaling
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [10]:
#To store results of models, we create two dictionaries
result_dict_train = {}
result_dict_test = {}

In [11]:
reg = LogisticRegression(random_state = 42)
accuracies = cross_val_score(reg, X_train, y_train, cv=5)
reg.fit(X_train,y_train)
y_pred = reg.predict(X_test)

#Obtain accuracy
print("Train Score:",np.mean(accuracies))
print("Test Score:",reg.score(X_test,y_test))

Train Score: 0.8439673913043478
Test Score: 0.835820895522388


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [12]:
#Store results in the dictionaries
result_dict_train["Logistic Train Score"] = np.mean(accuracies)
result_dict_test["Logistic Test Score"] = reg.score(X_test,y_test)

In [13]:
knn = KNeighborsClassifier()
accuracies = cross_val_score(knn, X_train, y_train, cv=5)
knn.fit(X_train,y_train)
y_pred = knn.predict(X_test)

#Obtain accuracy
print("Train Score:",np.mean(accuracies))
print("Test Score:",knn.score(X_test,y_test))

Train Score: 0.8514596273291926
Test Score: 0.845771144278607


  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


In [14]:
#Store results in the dictionaries
result_dict_train["KNN Train Score"] = np.mean(accuracies)
result_dict_test["KNN Test Score"] = knn.score(X_test,y_test)

In [15]:
svc = SVC(random_state = 42)
accuracies = cross_val_score(svc, X_train, y_train, cv=5)
svc.fit(X_train,y_train)
y_pred = svc.predict(X_test)

#Obtain accuracy
print("Train Score:",np.mean(accuracies))
print("Test Score:",svc.score(X_test,y_test))

Train Score: 0.862694099378882
Test Score: 0.8606965174129353


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [16]:
#Store results in the dictionaries
result_dict_train["SVM Train Score"] = np.mean(accuracies)
result_dict_test["SVM Test Score"] = svc.score(X_test,y_test)

In [17]:
dtc = DecisionTreeClassifier(random_state = 42)
accuracies = cross_val_score(dtc, X_train, y_train, cv=5)
dtc.fit(X_train,y_train)
y_pred = dtc.predict(X_test)

#Obtain accuracy
print("Train Score:",np.mean(accuracies))
print("Test Score:",dtc.score(X_test,y_test))

Train Score: 0.7928027950310559
Test Score: 0.8258706467661692


In [18]:
#Store results in the dictionaries
result_dict_train["Decision Tree Train Score"] = np.mean(accuracies)
result_dict_test["Decision Tree Test Score"] = dtc.score(X_test,y_test)

In [19]:
rfc = RandomForestClassifier(random_state = 42)
accuracies = cross_val_score(rfc, X_train, y_train, cv=5)
rfc.fit(X_train,y_train)
y_pred = rfc.predict(X_test)

#Obtain accuracy
print("Train Score:",np.mean(accuracies))
print("Test Score:",rfc.score(X_test,y_test))

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  rfc.fit(X_train,y_train)


Train Score: 0.8739285714285714
Test Score: 0.8656716417910447


In [20]:
#Store results in the dictionaries
result_dict_train["Random Forest Train Score"] = np.mean(accuracies)
result_dict_test["Random Forest Test Score"] = rfc.score(X_test,y_test)

In [21]:
gnb = GaussianNB()
accuracies = cross_val_score(gnb, X_train, y_train, cv=5)
gnb.fit(X_train,y_train)
y_pred = gnb.predict(X_test)

#Obtain accuracy
print("Train Score:",np.mean(accuracies))
print("Test Score:",gnb.score(X_test,y_test))

Train Score: 0.79152950310559
Test Score: 0.7761194029850746


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [22]:
#Store results in the dictionaries
result_dict_train["Gaussian NB Train Score"] = np.mean(accuracies)
result_dict_test["Gaussian NB Test Score"] = gnb.score(X_test,y_test)

In [23]:
df_result_train = pd.DataFrame.from_dict(result_dict_train,orient = "index", columns=["Score"])
df_result_train

Unnamed: 0,Score
Logistic Train Score,0.843967
KNN Train Score,0.85146
SVM Train Score,0.862694
Decision Tree Train Score,0.792803
Random Forest Train Score,0.873929
Gaussian NB Train Score,0.79153


In [24]:
df_result_test = pd.DataFrame.from_dict(result_dict_test,orient = "index",columns=["Score"])
df_result_test

Unnamed: 0,Score
Logistic Test Score,0.835821
KNN Test Score,0.845771
SVM Test Score,0.860697
Decision Tree Test Score,0.825871
Random Forest Test Score,0.865672
Gaussian NB Test Score,0.776119


In [None]:
import seaborn as sns


fig,ax = plt.subplots(1,2,figsize=(20,5))
sns.barplot(x = df_result_train.index,y = df_result_train.Score,ax = ax[0])
sns.barplot(x = df_result_test.index,y = df_result_test.Score,ax = ax[1])
ax[0].set_xticklabels(df_result_train.index,rotation = 75)
ax[1].set_xticklabels(df_result_test.index,rotation = 75)
plt.show()