In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [2]:
# Load the toy COVID dataset; the relative path means covid_toy.csv must be in the working directory.
df = pd.read_csv("covid_toy.csv")

In [3]:
# Preview up to the first 20 rows to see the feature columns and the has_covid label.
df.head(20)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No
5,84,Female,,Mild,Bangalore,Yes
6,14,Male,101.0,Strong,Bangalore,No
7,20,Female,,Strong,Mumbai,Yes
8,19,Female,100.0,Strong,Bangalore,No
9,64,Female,101.0,Mild,Delhi,No


In [4]:
# Count missing values per column to find which features need imputation.
df.isna().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [8]:
# Frequency of each gender category, to see how many distinct levels need encoding.
df["gender"].value_counts()

gender
Female    59
Male      41
Name: count, dtype: int64

In [9]:
# Frequency of each cough category, to see how many distinct levels need encoding.
df["cough"].value_counts()

cough
Mild      62
Strong    38
Name: count, dtype: int64

In [10]:
# Frequency of each city category, to see how many distinct levels need encoding.
df["city"].value_counts()

city
Kolkata      32
Bangalore    30
Delhi        22
Mumbai       16
Name: count, dtype: int64

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. random_state pins the shuffle so that
# Restart & Run All reproduces the same split and the same downstream outputs.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=["has_covid"]),  # features: every column except the label
    df["has_covid"],                 # target label
    test_size=0.2,
    random_state=42,
)

In [12]:
# Inspect the training features returned by train_test_split.
X_train

Unnamed: 0,age,gender,fever,cough,city
59,6,Female,104.0,Mild,Kolkata
37,55,Male,100.0,Mild,Kolkata
88,5,Female,100.0,Mild,Kolkata
99,10,Female,98.0,Strong,Kolkata
28,16,Male,104.0,Mild,Kolkata
...,...,...,...,...,...
1,27,Male,100.0,Mild,Delhi
36,38,Female,101.0,Mild,Bangalore
49,44,Male,104.0,Mild,Mumbai
40,49,Female,102.0,Mild,Delhi


In [13]:
# Inspect the held-out test features returned by train_test_split.
X_test

Unnamed: 0,age,gender,fever,cough,city
32,34,Female,101.0,Strong,Delhi
8,19,Female,100.0,Strong,Bangalore
80,14,Female,99.0,Mild,Mumbai
38,49,Female,101.0,Mild,Delhi
65,69,Female,102.0,Mild,Bangalore
58,23,Male,98.0,Strong,Mumbai
89,46,Male,103.0,Strong,Bangalore
76,80,Male,100.0,Mild,Bangalore
72,83,Female,101.0,Mild,Kolkata
23,80,Female,98.0,Mild,Delhi


In [14]:
# Inspect the training labels (has_covid) returned by train_test_split.
y_train

59    Yes
37     No
88     No
99    Yes
28     No
     ... 
1     Yes
36     No
49     No
40     No
71     No
Name: has_covid, Length: 80, dtype: object

In [15]:
# Inspect the held-out test labels (has_covid) returned by train_test_split.
y_test

32    Yes
8      No
80    Yes
38    Yes
65     No
58    Yes
89     No
76    Yes
72     No
23    Yes
16    Yes
15    Yes
95     No
79    Yes
12     No
7     Yes
85    Yes
6      No
81     No
17     No
Name: has_covid, dtype: object

## Applying one-hot encoding (OHE), ordinal encoding (OE), and simple imputation with scikit-learn's ColumnTransformer

In [16]:
from sklearn.compose import ColumnTransformer

In [19]:
# Column-wise preprocessing; each entry is (name, transformer, columns to apply it to).
transformer = ColumnTransformer(
    transformers=[
        # Fill the missing fever readings using SimpleImputer's default settings.
        ("tnf1", SimpleImputer(), ["fever"]),
        # Map cough to integers in the listed order (Mild -> 0, Strong -> 1).
        ("tnf2", OrdinalEncoder(categories=[["Mild", "Strong"]]), ["cough"]),
        # One-hot encode gender and city as a dense array, dropping the first level of each.
        ("tnf3", OneHotEncoder(drop="first", sparse_output=False), ["gender", "city"]),
    ],
    # Columns not listed above are passed through unchanged.
    remainder="passthrough",
)

In [22]:
# Fit the transformer on the training features and transform them; .shape shows (rows, output columns).
transformer.fit_transform(X_train).shape

(80, 7)

In [23]:
# Transform the test features without refitting; this relies on the fit_transform cell having run first.
transformer.transform(X_test).shape

(20, 7)