In [44]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

import warnings
# Suppress all warnings for the remainder of the session to keep cell output clean.
# NOTE(review): this is a blanket filter - it also hides library deprecation
# notices (e.g. renamed keyword arguments), so breakage after a package upgrade
# will arrive without any advance warning. Consider narrowing it to a specific
# category/module once the notebook is stable.
warnings.filterwarnings('ignore')

In [2]:
# Load the toy COVID dataset, report its dimensions and preview two random rows.
df = pd.read_csv("covid_toy.csv")
print(df.shape)
# Seed the preview so the same rows are shown on every Restart & Run All
# (same seed value as the train/test split below uses).
df.sample(2, random_state=24)

(100, 6)


Unnamed: 0,age,gender,fever,cough,city,has_covid
82,24,Male,98.0,Mild,Kolkata,Yes
0,60,Male,103.0,Mild,Kolkata,No


In [3]:
# Count missing values column by column to see which features need imputation.
df.isna().sum(axis=0)

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [4]:
# Distinct values of the 'cough' column (needed to define the ordinal encoding order).
cough_values = df['cough'].unique()
cough_values

array(['Mild', 'Strong'], dtype=object)

In [52]:
# Distinct values of the 'city' column (these become the one-hot categories).
city_values = df['city'].unique()
city_values

array(['Kolkata', 'Delhi', 'Mumbai', 'Bangalore'], dtype=object)

In [5]:
# Simple Imputer -> fever
# Ordinal Encoder -> cough
# OneHot Encoder -> gender, city

In [40]:
# Split into features (everything except 'has_covid') and target ('has_covid'),
# holding out 20% of the rows for testing. The fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df.drop(columns=['has_covid']),
    df['has_covid'],
    test_size=0.2,
    random_state=24,
)
print(x_train.shape, x_test.shape)

(80, 5) (20, 5)


In [41]:
# Column-wise preprocessing pipeline:
#   tnf1 - fill the missing 'fever' readings (SimpleImputer default strategy: mean)
#   tnf2 - encode 'cough' as an ordered category: Mild -> 0, Strong -> 1
#   tnf3 - one-hot encode 'gender' and 'city', dropping the first category of each
#          to avoid a redundant (perfectly collinear) column
# Columns not listed above (here: 'age') are passed through unchanged and are
# appended after the transformed columns.
transformer = ColumnTransformer(
    transformers=[
        ('tnf1', SimpleImputer(), ['fever']),
        ('tnf2', OrdinalEncoder(categories=[['Mild', 'Strong']]), ['cough']),
        # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and the old
        # keyword was removed in 1.4; the new name keeps the dense-array output.
        ('tnf3', OneHotEncoder(sparse_output=False, drop='first'), ['gender', 'city']),
    ],
    remainder='passthrough',
)


In [42]:
# Sanity check of the raw training features before they are transformed:
# dimensions first, then the first row.
print(x_train.shape)
x_train.head(n=1)

(80, 5)


Unnamed: 0,age,gender,fever,cough,city
90,59,Female,99.0,Strong,Delhi


In [45]:
# Learn the imputation/encoding parameters from the training split only, then
# apply the already-fitted transformer to the test split.
# NOTE(review): x_train / x_test are rebound to the transformed output here, so
# this cell is not idempotent - re-run the train/test split cell before
# re-running this one.
x_train = transformer.fit_transform(x_train)
x_test = transformer.transform(x_test)

print(f"{x_train.shape} {x_test.shape}")

(80, 7) (20, 7)


In [49]:
# Recover readable column labels ("<transformer name>__<column>") for the
# transformed output.
feature_names = transformer.get_feature_names_out()
print(feature_names)

# Wrap both transformed splits back into DataFrames with those labels.
# Note: the new frames get a fresh 0..n-1 index, so their row labels presumably
# no longer match those of y_train / y_test - verify before any index-aligned join.
x_train, x_test = (
    pd.DataFrame(split, columns=feature_names) for split in (x_train, x_test)
)

['tnf1__fever' 'tnf2__cough' 'tnf3__gender_Male' 'tnf3__city_Delhi'
 'tnf3__city_Kolkata' 'tnf3__city_Mumbai' 'remainder__age']


In [51]:
# First row of the transformed training features, now with named columns.
x_train.head(n=1)

Unnamed: 0,tnf1__fever,tnf2__cough,tnf3__gender_Male,tnf3__city_Delhi,tnf3__city_Kolkata,tnf3__city_Mumbai,remainder__age
0,99.0,1.0,0.0,1.0,0.0,0.0,59.0
