In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
# Read the dataset from a CSV file located relative to the working directory.
DATA_PATH = 'Madedata1.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first rows to check that the file parsed as expected.
df.head()

Unnamed: 0,Country,Age,Gender,fever,Bodypain,Runny_nose,Difficulty_in_breathing,Nasal_congestion,Sore_throat,Severity,Contact_with_covid_patient,Infected
0,China,10,Male,102,1,0,0,0,1,Mild,No,0
1,Italy,20,Male,103,1,1,0,0,0,Moderate,Not known,1
2,Iran,55,Transgender,99,0,0,0,1,1,Severe,No,0
3,Republic of Korean,37,Female,100,0,1,1,0,0,Mild,Yes,1
4,France,45,Male,101,1,1,1,1,0,Moderate,Yes,1


In [4]:
# Features: every column except the first (Country) and the last (Infected).
# .copy() gives X its own data, so the column assignments made in later cells
# do not trigger SettingWithCopyWarning and cannot write through to df.
X = df.iloc[:, 1:-1].copy()
# Target: the last column (Infected).
y = df.iloc[:, -1]

In [5]:
# Preview the feature frame after dropping the first and last columns.
X.head()

Unnamed: 0,Age,Gender,fever,Bodypain,Runny_nose,Difficulty_in_breathing,Nasal_congestion,Sore_throat,Severity,Contact_with_covid_patient
0,10,Male,102,1,0,0,0,1,Mild,No
1,20,Male,103,1,1,0,0,0,Moderate,Not known
2,55,Transgender,99,0,0,0,1,1,Severe,No
3,37,Female,100,0,1,1,0,0,Mild,Yes
4,45,Male,101,1,1,1,1,0,Moderate,Yes


In [6]:
# Category frequencies for Gender. Uses the Series method because the
# top-level pandas.value_counts function is deprecated in recent pandas.
X['Gender'].value_counts()

Male           1257
Female         1208
Transgender      34
Name: Gender, dtype: int64

In [7]:
# Category frequencies for Severity. Uses the Series method because the
# top-level pandas.value_counts function is deprecated in recent pandas.
X['Severity'].value_counts()

Mild        1591
Moderate     525
Severe       383
Name: Severity, dtype: int64

In [8]:
# Category frequencies for the contact column; this is where inconsistent
# casing of the same answer (e.g. 'Yes' vs 'yes') becomes visible.
# Uses the Series method because pandas.value_counts is deprecated.
X['Contact_with_covid_patient'].value_counts()

No           1203
Yes           638
Not known     633
yes            25
Name: Contact_with_covid_patient, dtype: int64

In [9]:
# Lower-case every answer so that differently-cased spellings of the same
# value collapse into a single category.
contact_col = 'Contact_with_covid_patient'
X[contact_col] = X[contact_col].str.lower()

In [10]:
# Re-check the frequencies after lower-casing to confirm the categories merged.
# Uses the Series method because pandas.value_counts is deprecated.
X['Contact_with_covid_patient'].value_counts()

no           1203
yes           663
not known     633
Name: Contact_with_covid_patient, dtype: int64

In [11]:
# Map each Gender string to an integer code and store the codes back in X.
genderLabel = LabelEncoder()
gender_codes = genderLabel.fit_transform(X['Gender'])
X['Gender'] = gender_codes

In [12]:
# Show the Gender column after label encoding (integer codes instead of strings).
X['Gender']

0       1
1       1
2       2
3       0
4       1
       ..
2494    1
2495    0
2496    0
2497    0
2498    0
Name: Gender, Length: 2499, dtype: int32

In [14]:
# Pull the integer gender codes out as a single-column 2-D array,
# the layout passed to the one-hot encoder in the following cells.
gender = X['Gender'].values.reshape(-1, 1)

In [15]:
# Create the one-hot encoder for gender and display its raw result,
# which is returned as a sparse matrix.
gender_onehot = OneHotEncoder()
gender_onehot.fit_transform(gender)

<2499x3 sparse matrix of type '<class 'numpy.float64'>'
	with 2499 stored elements in Compressed Sparse Row format>

In [16]:
# Same encoding converted to a dense array so the individual 0/1 rows are visible.
gender_onehot.fit_transform(gender).toarray()

array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       ...,
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.]])

In [17]:
# Build the dense one-hot gender matrix from the label-encoded column in X
# rather than from `gender` itself. Reassigning `gender` from its own previous
# value made this cell non-idempotent: running it a second time would one-hot
# encode the already-encoded matrix and silently change its width.
gender = gender_onehot.fit_transform(X['Gender'].values.reshape(-1, 1)).toarray()

In [19]:
def _label_then_onehot(frame, column):
    """Label-encode ``frame[column]`` in place, then one-hot encode the codes.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding the categorical column; the column is overwritten
        with its integer codes.
    column : str
        Name of the column to encode.

    Returns
    -------
    tuple
        ``(label_encoder, onehot_encoder, dense_onehot)`` where the first two
        are the fitted encoders and the last is the dense one-hot array.
    """
    label_encoder = LabelEncoder()
    frame[column] = label_encoder.fit_transform(frame[column])

    # The one-hot encoder is given the codes as a single-column 2-D array.
    codes = frame[column].values.reshape(-1, 1)

    onehot_encoder = OneHotEncoder()
    dense_onehot = onehot_encoder.fit_transform(codes).toarray()
    return label_encoder, onehot_encoder, dense_onehot


# Apply the same two-step encoding to both remaining categorical columns.
severityLabel, severity_onehot, severity = _label_then_onehot(X, 'Severity')
contactLabel, contact_onehot, contact = _label_then_onehot(X, 'Contact_with_covid_patient')

In [20]:
# Preview X now that Gender, Severity and the contact column hold integer codes.
X.head()

Unnamed: 0,Age,Gender,fever,Bodypain,Runny_nose,Difficulty_in_breathing,Nasal_congestion,Sore_throat,Severity,Contact_with_covid_patient
0,10,1,102,1,0,0,0,1,0,0
1,20,1,103,1,1,0,0,0,1,1
2,55,2,99,0,0,0,1,1,2,0
3,37,0,100,0,1,1,0,0,0,2
4,45,1,101,1,1,1,1,0,1,2


In [21]:
# The categorical columns now live in the separate one-hot arrays
# (gender, severity, contact), so remove their integer-coded versions from X.
encoded_columns = ['Gender', 'Severity', 'Contact_with_covid_patient']
X = X.drop(columns=encoded_columns)

In [22]:
# Preview the remaining columns after removing the three encoded ones.
X.head()

Unnamed: 0,Age,fever,Bodypain,Runny_nose,Difficulty_in_breathing,Nasal_congestion,Sore_throat
0,10,102,1,0,0,0,1
1,20,103,1,1,0,0,0
2,55,99,0,0,0,1,1
3,37,100,0,1,1,0,0
4,45,101,1,1,1,1,0


In [23]:
# Final feature matrix: the remaining columns of X followed, column-wise,
# by the one-hot blocks for gender, severity and contact.
X = np.concatenate((X.values, gender, severity, contact), axis=1)

In [24]:
# Inspect the first row of the assembled feature matrix.
X[0]

array([ 10., 102.,   1.,   0.,   0.,   0.,   1.,   0.,   1.,   0.,   1.,
         0.,   0.,   1.,   0.,   0.])

In [26]:
# Rescale every feature column with min-max scaling.
# NOTE(review): the scaler is fit on the full matrix before the train/test
# split performed further down, so the test rows contribute to the min/max
# used for scaling. Consider fitting on the training split only.
minmax = MinMaxScaler()
minmax.fit(X)
X = minmax.transform(X)

In [27]:
# Inspect the first row again, now after scaling.
X[0]

array([0.        , 0.66666667, 1.        , 0.        , 0.        ,
       0.        , 1.        , 0.        , 1.        , 0.        ,
       1.        , 0.        , 0.        , 1.        , 0.        ,
       0.        ])

In [28]:
# Hold out 25% of the rows for evaluation. A fixed random_state makes the
# split, and therefore the reported accuracy and confusion matrix,
# reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [29]:
# Sanity-check the sizes of the four pieces produced by the split.
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((1874, 16), (625, 16), (1874,), (625,))

In [30]:
# Fit a logistic-regression classifier with default settings on the training split.
logistic = LogisticRegression()
logistic.fit(x_train, y_train)

LogisticRegression()

In [31]:
# Predict labels for the held-out test rows.
y_pred = logistic.predict(x_test)

In [32]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.936

In [33]:
# Counts of test rows by (true label, predicted label):
# rows index the true class, columns the predicted class.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[280,  32],
       [  8, 305]], dtype=int64)