In [8]:
# Import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [9]:
data=pd.read_excel(r"C:\Users\RBI\Downloads\Covid_Dataset.xlsx")
data

Unnamed: 0,cough,fever,sore_throat,shortness_of_breath,head_ache,age_60_and_above,gender,test_indication,corona_result
0,0,0,0,0,0,Yes,female,Contact with confirmed,positive
1,1,0,0,0,0,Yes,male,Contact with confirmed,positive
2,0,0,0,0,0,Yes,male,Abroad,positive
3,1,1,0,0,0,Yes,male,Other,positive
4,1,1,0,0,0,Yes,female,Contact with confirmed,positive
...,...,...,...,...,...,...,...,...,...
2132,1,0,0,1,1,No,female,Contact with confirmed,positive
2133,1,0,0,1,1,No,male,Contact with confirmed,positive
2134,0,0,0,0,0,No,male,Other,positive
2135,0,0,0,0,0,No,male,Other,positive


## Data Preprocessing

In [10]:
data.shape

(2137, 9)

In [65]:
data.info()

TypeError: Cannot interpret '<attribute 'dtype' of 'numpy.generic' objects>' as a data type

In [12]:
data.describe(include='all')

Unnamed: 0,cough,fever,sore_throat,shortness_of_breath,head_ache,age_60_and_above,gender,test_indication,corona_result
count,2137.0,2137.0,2137.0,2137.0,2137.0,2137,2137,2137,2137
unique,,,,,,2,3,3,2
top,,,,,,No,female,Other,positive
freq,,,,,,1159,1064,1290,1105
mean,0.419747,0.27328,0.038372,0.037436,0.058493,,,,
std,0.493633,0.445748,0.192137,0.189871,0.234729,,,,
min,0.0,0.0,0.0,0.0,0.0,,,,
25%,0.0,0.0,0.0,0.0,0.0,,,,
50%,0.0,0.0,0.0,0.0,0.0,,,,
75%,1.0,1.0,0.0,0.0,0.0,,,,


In [13]:
# check for duplicated values
data.duplicated().sum()

1925

In [14]:
# check for null values
data.isnull().sum()

cough                  0
fever                  0
sore_throat            0
shortness_of_breath    0
head_ache              0
age_60_and_above       0
gender                 0
test_indication        0
corona_result          0
dtype: int64

Here we can see that there are no null values in the data.

In [15]:
# check unique values in all variables
for i in data.columns:
    print({i:data[i].unique()})

{'cough': array([0, 1], dtype=int64)}
{'fever': array([0, 1], dtype=int64)}
{'sore_throat': array([0, 1], dtype=int64)}
{'shortness_of_breath': array([0, 1], dtype=int64)}
{'head_ache': array([0, 1], dtype=int64)}
{'age_60_and_above': array(['Yes', 'No'], dtype=object)}
{'gender': array(['female', 'male', 'None'], dtype=object)}
{'test_indication': array(['Contact with confirmed', 'Abroad', 'Other'], dtype=object)}
{'corona_result': array(['positive', 'negative'], dtype=object)}


By finding out the unique values it can be seen that the gender variable has 'None' value present in the data. So we try replacing it.

In [16]:
data.replace("None", np.nan,inplace=True)

Here we have replaced the 'None' value by nan.

In [17]:
data.isnull().sum()

cough                   0
fever                   0
sore_throat             0
shortness_of_breath     0
head_ache               0
age_60_and_above        0
gender                 16
test_indication         0
corona_result           0
dtype: int64

There are 16 none values present in the gender.

In [18]:
data['gender'].fillna(data.gender.mode()[0],inplace=True)

Now we have replacing the the null values by the mode value of the gender variable.

In [19]:
data.isnull().sum()

cough                  0
fever                  0
sore_throat            0
shortness_of_breath    0
head_ache              0
age_60_and_above       0
gender                 0
test_indication        0
corona_result          0
dtype: int64

All the none values are removed.

In [20]:
data.dtypes

cough                   int64
fever                   int64
sore_throat             int64
shortness_of_breath     int64
head_ache               int64
age_60_and_above       object
gender                 object
test_indication        object
corona_result          object
dtype: object

In [21]:
# Converting Categorical data to numerical data

data.age_60_and_above.replace({'Yes':1,"No":0}, inplace=True)
data.gender.replace({'female':0, 'male':1},inplace=True)
data.test_indication.replace({'Contact with confirmed':0, 'Abroad':1, 'Other':2},inplace=True)
data.corona_result.replace({'positive':1, 'negative':0},inplace=True)

In [22]:
data

Unnamed: 0,cough,fever,sore_throat,shortness_of_breath,head_ache,age_60_and_above,gender,test_indication,corona_result
0,0,0,0,0,0,1,0,0,1
1,1,0,0,0,0,1,1,0,1
2,0,0,0,0,0,1,1,1,1
3,1,1,0,0,0,1,1,2,1
4,1,1,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...
2132,1,0,0,1,1,0,0,0,1
2133,1,0,0,1,1,0,1,0,1
2134,0,0,0,0,0,0,1,2,1
2135,0,0,0,0,0,0,1,2,1


In [23]:
# Creating X & Y
X = data.values[:,0:-1]     
Y = data.values[:,-1] 

In [24]:
print(X.shape)
print(Y.shape)

(2137, 8)
(2137,)


In [25]:
# Scaling the data performing standardization

from sklearn.preprocessing import StandardScaler
 
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

In [26]:
print(X)

[[-0.85052166 -0.61322619 -0.19975654 ...  1.08860993 -0.98929455
  -1.59765106]
 [ 1.17574901 -0.61322619 -0.19975654 ...  1.08860993  1.0108213
  -1.59765106]
 [-0.85052166 -0.61322619 -0.19975654 ...  1.08860993  1.0108213
  -0.42075333]
 ...
 [-0.85052166 -0.61322619 -0.19975654 ... -0.91860268  1.0108213
   0.7561444 ]
 [-0.85052166 -0.61322619 -0.19975654 ... -0.91860268  1.0108213
   0.7561444 ]
 [ 1.17574901 -0.61322619 -0.19975654 ... -0.91860268 -0.98929455
   0.7561444 ]]


In [27]:
# Split the data into test and train

from sklearn.model_selection import train_test_split 
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3,
                                                    random_state=10)   

The data has been splitted into train and test with a test size of 0.3.

In [28]:
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)


(1495, 8)
(1495,)
(642, 8)
(642,)


## Model Building

### 1. Logistic regression

In [29]:
from sklearn.linear_model import LogisticRegression

# create a model object
classifier = LogisticRegression()

# fitting training data to the model
classifier.fit(X_train, Y_train)

Y_pred=classifier.predict(X_test)
#print(Y_pred)

In [30]:
# generating probaility matrix

Y_pred_prob=classifier.predict_proba(X_test)
Y_pred_prob

array([[7.23946640e-01, 2.76053360e-01],
       [2.89774148e-01, 7.10225852e-01],
       [8.83699070e-02, 9.11630093e-01],
       ...,
       [5.71558986e-01, 4.28441014e-01],
       [2.81200550e-01, 7.18799450e-01],
       [1.28400127e-05, 9.99987160e-01]])

In [31]:
print(list(zip(Y_test,Y_pred)))

[(1, 0), (0, 1), (1, 1), (1, 0), (1, 1), (1, 0), (1, 0), (1, 1), (1, 1), (0, 0), (0, 0), (0, 0), (1, 0), (1, 1), (1, 1), (0, 0), (1, 1), (0, 0), (1, 1), (1, 0), (1, 1), (1, 0), (1, 1), (1, 1), (0, 0), (0, 0), (0, 0), (0, 0), (1, 0), (1, 1), (0, 1), (1, 1), (1, 1), (1, 0), (1, 0), (0, 0), (1, 0), (1, 1), (1, 1), (1, 1), (0, 0), (1, 0), (0, 0), (0, 0), (0, 0), (0, 0), (1, 0), (0, 0), (1, 0), (1, 1), (1, 1), (0, 0), (1, 1), (0, 0), (1, 1), (0, 1), (0, 0), (1, 0), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (0, 0), (0, 0), (1, 1), (0, 0), (1, 0), (1, 1), (0, 1), (0, 0), (1, 0), (1, 1), (0, 0), (1, 1), (1, 1), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (1, 1), (0, 0), (1, 1), (0, 0), (1, 0), (1, 0), (1, 0), (0, 0), (0, 0), (0, 0), (1, 1), (1, 0), (1, 1), (0, 0), (0, 0), (0, 0), (0, 1), (0, 0), (0, 0), (1, 1), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (1, 1), (1, 1), (1, 0), (0, 1), (0, 0), (1, 0), (1, 1), (0, 0), (0, 0), (1, 0), (1, 1), (0, 1), (1, 0), (0, 0), (1, 1), (1, 1),

In [32]:
# Generating confusion matrix, accuracy score, classification report

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
 
cfm=confusion_matrix(Y_test,Y_pred)
print(cfm)
print("Classification report: ")
print(classification_report(Y_test,Y_pred))
acc=accuracy_score(Y_test, Y_pred)
print("Accuracy of the model: ",acc)

[[260  58]
 [123 201]]
Classification report: 
              precision    recall  f1-score   support

           0       0.68      0.82      0.74       318
           1       0.78      0.62      0.69       324

    accuracy                           0.72       642
   macro avg       0.73      0.72      0.72       642
weighted avg       0.73      0.72      0.72       642

Accuracy of the model:  0.7180685358255452


For base model:
* The accuracy is coming as  72%
* recall value for class 0 is 0.82
* recall value for class 1 is 0.62 which is low.

### Tuning the model
  Changing the threashold to 0.4

In [33]:
# Changing the threashold to 0.4

y_pred_prob = classifier.predict_proba(X_test)
print(y_pred_prob)
 
y_pred_class_1=[]
for value in y_pred_prob[:,1]: 
    if value > 0.4:                
        y_pred_class_1.append(1)
    else:
        y_pred_class_1.append(0)
#print(y_pred_class)

[[7.23946640e-01 2.76053360e-01]
 [2.89774148e-01 7.10225852e-01]
 [8.83699070e-02 9.11630093e-01]
 ...
 [5.71558986e-01 4.28441014e-01]
 [2.81200550e-01 7.18799450e-01]
 [1.28400127e-05 9.99987160e-01]]


In [34]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
cfm=confusion_matrix(Y_test,y_pred_class_1)
print(cfm)
acc=accuracy_score(Y_test, y_pred_class_1)
print("Accuracy of the model: ",acc)
print(classification_report(Y_test, y_pred_class_1))

[[230  88]
 [ 93 231]]
Accuracy of the model:  0.7180685358255452
              precision    recall  f1-score   support

           0       0.71      0.72      0.72       318
           1       0.72      0.71      0.72       324

    accuracy                           0.72       642
   macro avg       0.72      0.72      0.72       642
weighted avg       0.72      0.72      0.72       642



For threashold 0.4:
* The accuracy is same 72%. 
* recall value for class 0 has decreased to 0.72 
* recall value for class 1 it has increaed to 0.71

Changing the threashold to 0.44

In [57]:
# Accuracy for 0.48
y_pred_prob = classifier.predict_proba(X_test)
print(y_pred_prob)

y_pred_class_2=[]
for value in y_pred_prob[:,1]:
    if value > 0.48:                
        y_pred_class_2.append(1)
    else:
        y_pred_class_2.append(0)
#print(y_pred_class)

[[7.23946640e-01 2.76053360e-01]
 [2.89774148e-01 7.10225852e-01]
 [8.83699070e-02 9.11630093e-01]
 ...
 [5.71558986e-01 4.28441014e-01]
 [2.81200550e-01 7.18799450e-01]
 [1.28400127e-05 9.99987160e-01]]


In [58]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
cfm=confusion_matrix(Y_test,y_pred_class_2)
print(cfm)
acc=accuracy_score(Y_test, y_pred_class_2)
print("Accuracy of the model: ",acc)
print(classification_report(Y_test, y_pred_class_2))


[[250  68]
 [108 216]]
Accuracy of the model:  0.7258566978193146
              precision    recall  f1-score   support

           0       0.70      0.79      0.74       318
           1       0.76      0.67      0.71       324

    accuracy                           0.73       642
   macro avg       0.73      0.73      0.73       642
weighted avg       0.73      0.73      0.72       642



For threashold 0.48:
* The accuracy has increased to 72%. 
* recall value for class 0 has increased to 0.73
* recall value for class 1 it has decreased to 0.67

### 2. Descision Tree

In [39]:
# BASE MODEL
from sklearn.tree import DecisionTreeClassifier

# create a model object
model_DT=DecisionTreeClassifier(random_state=10,criterion='gini')

# fitting training data to the model
model_DT.fit(X_train, Y_train)

Y_pred_1=model_DT.predict(X_test)
#print(Y_pred)

In [40]:
from sklearn.metrics import confusion_matrix, accuracy_score,classification_report

# confusion matric
 
cfm=confusion_matrix(Y_test,Y_pred_1)
print(cfm)
 
print("Classification report: ")
 
print(classification_report(Y_test,Y_pred_1))
 
acc=accuracy_score(Y_test, Y_pred_1)
print("Accuracy of the model: ",acc)

[[270  48]
 [ 98 226]]
Classification report: 
              precision    recall  f1-score   support

           0       0.73      0.85      0.79       318
           1       0.82      0.70      0.76       324

    accuracy                           0.77       642
   macro avg       0.78      0.77      0.77       642
weighted avg       0.78      0.77      0.77       642

Accuracy of the model:  0.7725856697819314


For descision tree base model:
* The accuracy has improved a lot to 77%. 
* The recall value for class 0 is 0.85 
* class 1 is 0.70, which is better than logistic regression.

In [41]:
model_DT.score(X_train,Y_train)

0.7993311036789298

### 3. Random Forest

In [42]:
from sklearn.ensemble import RandomForestClassifier
 
model_RandomForest=RandomForestClassifier(n_estimators=100,                 
                                          random_state=10, bootstrap=True,   
                                         n_jobs=-1)                         
 
#fit the model on the data and predict the values
model_RandomForest.fit(X_train,Y_train) 
Y_pred=model_RandomForest.predict(X_test)

In [43]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
 
cfm=confusion_matrix(Y_test,Y_pred)
print(cfm)
 
print("Classification report: ")
 
print(classification_report(Y_test,Y_pred))
 
acc=accuracy_score(Y_test, Y_pred)
print("Accuracy of the model: ",acc)

[[269  49]
 [ 92 232]]
Classification report: 
              precision    recall  f1-score   support

           0       0.75      0.85      0.79       318
           1       0.83      0.72      0.77       324

    accuracy                           0.78       642
   macro avg       0.79      0.78      0.78       642
weighted avg       0.79      0.78      0.78       642

Accuracy of the model:  0.780373831775701


In [44]:
model_RandomForest.score(X_train,Y_train)

0.7993311036789298

For Random Forest model:
* The accuracy has improved a lot to 78%. 
* The recall value for class 0 is 0.85
* class 1 is 0.72, which is better than descision tree.

# Conclusion

The random forest clasifier gives the best prediction.

Recall value for class 0 is 0.85 and for class 1 is 0.72. These are the best values obtained from any model.

The accuracy 78%.