In [76]:
import numpy as np
import pandas as pd

In [77]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [78]:
# Read the toy COVID dataset from disk and preview its first five rows
df = pd.read_csv('covid_toy.csv')
df.head(n=5)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [79]:
# Count how many rows fall into each 'cough' category
df['cough'].value_counts()

cough
Mild      62
Strong    38
Name: count, dtype: int64

In [80]:
# Count how many rows come from each city
df['city'].value_counts()

city
Kolkata      32
Bangalore    30
Delhi        22
Mumbai       16
Name: count, dtype: int64

In [81]:
# Number of missing values per column
df.isnull().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [82]:
from sklearn.model_selection import train_test_split

In [83]:
# Split features (everything except 'has_covid') from the target and hold out
# 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['has_covid']),
    df['has_covid'],
    test_size=0.2,
    random_state=0,
)

In [84]:
# Preview the first rows of the training features
X_train.head()

Unnamed: 0,age,gender,fever,cough,city
43,22,Female,99.0,Mild,Bangalore
62,56,Female,104.0,Strong,Bangalore
3,31,Female,98.0,Mild,Kolkata
71,75,Female,104.0,Strong,Delhi
45,72,Male,99.0,Mild,Bangalore


In [85]:
# (rows, columns) of the training features
X_train.shape

(80, 5)

## Normal Method :

In [None]:
# Fill the missing 'fever' values with a SimpleImputer (default settings)
si = SimpleImputer()

# Learn the fill value from the training split only ...
X_train_fever = si.fit_transform(X_train[['fever']])
# ... and reuse that fitted imputer on the test split. Calling fit_transform
# here would re-fit on test data (leakage) and fill train/test differently.
X_test_fever = si.transform(X_test[['fever']])

X_train_fever.shape

(80, 1)

In [None]:
# Ordinal-encode 'cough' with an explicit order: Mild -> 0, Strong -> 1
oe = OrdinalEncoder(categories=[['Mild','Strong']])

# Fit on the training split only ...
X_train_cough = oe.fit_transform(X_train[['cough']])
# ... then only transform the test split, so the same fitted encoder is
# applied to both splits and the test data never influences fitting.
X_test_cough = oe.transform(X_test[['cough']])

X_train_cough.shape

(80, 1)

In [None]:
# One-hot encode 'gender' and 'city'; drop='first' removes one dummy column
# per feature, sparse_output=False returns a dense NumPy array.
ohe = OneHotEncoder(drop='first',sparse_output=False)

# Learn the category set from the training split only ...
X_train_gender_city = ohe.fit_transform(X_train[['gender','city']])
# ... and reuse it on the test split. Re-fitting on the test rows could yield
# a different number/order of columns (e.g. if a city is absent from the test
# split), which would misalign the test matrix with the training matrix.
X_test_gender_city = ohe.transform(X_test[['gender','city']])

X_train_gender_city.shape

(80, 4)

In [None]:
# Keep 'age' untouched: discard every column that was transformed separately
# and convert what is left to a NumPy array.
X_train_age = X_train.drop(['gender', 'fever', 'cough', 'city'], axis=1).to_numpy()

# Same extraction for the test split
X_test_age = X_test.drop(['gender', 'fever', 'cough', 'city'], axis=1).to_numpy()

X_train_age.shape

(80, 1)

In [None]:
# Stitch the separately transformed pieces back together column-wise,
# in the order: age, fever, gender/city one-hot columns, cough.
X_train_transformed = np.hstack(
    [X_train_age, X_train_fever, X_train_gender_city, X_train_cough]
)
X_test_transformed = np.hstack(
    [X_test_age, X_test_fever, X_test_gender_city, X_test_cough]
)

X_train_transformed.shape

(80, 7)

## Using Column Transformer :

In [91]:
from sklearn.compose import ColumnTransformer

In [None]:
# Bundle the three per-column preprocessing steps into a single transformer;
# columns not listed (here: 'age') are passed through unchanged.
tr = ColumnTransformer(
    transformers=[
        ('tnf1', SimpleImputer(), ['fever']),
        ('tnf2', OrdinalEncoder(categories=[['Mild', 'Strong']]), ['cough']),
        ('tnf3', OneHotEncoder(sparse_output=False, drop='first'), ['gender', 'city']),
    ],
    remainder='passthrough',
)

- ##### `transformers` => A list of (name, transformer, columns) tuples specifying which transformer object is applied to which subset of the columns.

- ##### `remainder='passthrough'` => All remaining columns that were not specified in `transformers`, but are present in the data passed to `fit`, will be automatically passed through. This subset of columns is concatenated with the output of the transformers.

In [93]:
# Fit the column transformer on the training features, transform them, and report the resulting shape
tr.fit_transform(X_train).shape

(80, 7)

In [94]:
# Transform the test features with transform (not fit_transform) and report the resulting shape
tr.transform(X_test).shape

(20, 7)