<a href="https://colab.research.google.com/github/astrovishalthakur/MachineLearning/blob/main/FeatureEngineering/Handling_Categorical_Data/ColumnTransformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd


In [2]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [3]:
df = pd.read_csv("covid_toy.csv")

In [5]:
df

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No
...,...,...,...,...,...,...
95,12,Female,104.0,Mild,Bangalore,No
96,51,Female,101.0,Strong,Kolkata,Yes
97,20,Female,101.0,Mild,Bangalore,No
98,5,Female,98.0,Strong,Mumbai,No


In [6]:
df.city.value_counts()

Kolkata      32
Bangalore    30
Delhi        22
Mumbai       16
Name: city, dtype: int64

In [7]:
df.cough.value_counts()

Mild      62
Strong    38
Name: cough, dtype: int64

### Cough is ordinal : ordinal encoding
### Gender and city is nominal: one hot encoder.

### Fever is numeric, but some of its values are missing. so we will apply simple imputer to fill it.

### No preprocessing on age for now.

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,:5], df.iloc[:, -1], test_size=0.2)

In [11]:
X_train

Unnamed: 0,age,gender,fever,cough,city
21,73,Male,98.0,Mild,Bangalore
79,48,Female,103.0,Mild,Kolkata
37,55,Male,100.0,Mild,Kolkata
14,51,Male,104.0,Mild,Bangalore
7,20,Female,,Strong,Mumbai
...,...,...,...,...,...
38,49,Female,101.0,Mild,Delhi
4,65,Female,101.0,Mild,Mumbai
88,5,Female,100.0,Mild,Kolkata
96,51,Female,101.0,Strong,Kolkata


# 1. Aam Zindagi

In [None]:
# adding simple imputer to fever col
si = SimpleImputer() # you can pass strategy parameter with things like mean(Default), median, most_freauent, etc.
X_train_fever = si.fit_transform(X_train[["fever"]])

# also the test data
X_test_fever = si.fit_transform(X_test[["fever"]])

X_train_fever

# simple imputer by default replaces missing values with mean of data

In [None]:
# Ordinalencoding --> caugh

oe = OrdinalEncoder(categories=[["Mild", "Strong"]])
X_train_cough = oe.fit_transform(X_train[["cough"]])

# also the test data
X_test_cough = oe.fit_transform(X_test[["cough"]])

X_train_cough

In [14]:
# OneHotEncoding --> gender, city

ohe = OneHotEncoder(drop="first", sparse=False)

X_train_gender_city = ohe.fit_transform(X_train[["gender", "city"]])

# same for test data

X_test_gender_city = ohe.fit_transform(X_test[["gender", "city"]])

X_train_gender_city[:5]

array([[1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.]])

In [15]:
# Extracting Age

X_train_age = X_train[["age"]].values

# also the test data

X_test_age = X_test[["age"]]

X_train_age.shape

(80, 1)

In [16]:
# now joining all columns.

X_train_transformed = np.concatenate((X_train_age, X_train_fever, X_train_gender_city, X_train_cough), axis=1)

# for test data
X_test_transformed = np.concatenate((X_test_age, X_test_fever, X_test_gender_city, X_test_cough), axis=1)

X_train_transformed.shape

(80, 7)

In [23]:
X_train_transformed[:5]

array([[ 73.        ,  98.        ,   1.        ,   0.        ,
          0.        ,   0.        ,   0.        ],
       [ 48.        , 103.        ,   0.        ,   0.        ,
          1.        ,   0.        ,   0.        ],
       [ 55.        , 100.        ,   1.        ,   0.        ,
          1.        ,   0.        ,   0.        ],
       [ 51.        , 104.        ,   1.        ,   0.        ,
          0.        ,   0.        ,   0.        ],
       [ 20.        , 100.86111111,   0.        ,   0.        ,
          0.        ,   1.        ,   1.        ]])

In [31]:
X_test_transformed[:5]

array([[ 65.,  99.,   1.,   0.,   0.,   0.,   0.],
       [ 69., 102.,   0.,   0.,   0.,   0.,   0.],
       [ 49.,  99.,   0.,   0.,   0.,   0.,   1.],
       [ 11., 100.,   0.,   0.,   1.,   0.,   1.],
       [ 74., 102.,   1.,   0.,   0.,   1.,   0.]])

# 2. Mentos zindagi

In [19]:
from sklearn.compose import ColumnTransformer

transformers_needed = [
                       ("tnf1", SimpleImputer(),['fever']),
                       ("tnf2", OrdinalEncoder(categories=[["Mild", "Strong"]]), ["cough"]),
                       ("tnf3", OneHotEncoder(sparse=False, drop="first"), ["gender", "city"])
                       ]

                      # tnf1,2,3 are names decided by us, can be anything.
                      # second parameter is encoder object with parameters if necessary
                      # third is list of columns which we want to apply transformation on.

ct = ColumnTransformer(transformers=transformers_needed, remainder="passthrough") 
# transformer specifies list of transformers to be applied on data
# remainder tells us what do to with columns which were not used,
# we can drop them by setting remainder as "drop", or or as  "passthrough" meaning don't touch that column.



In [21]:
X_train_transformed_using_colTf = ct.fit_transform(X_train)

In [33]:
X_train_transformed_using_colTf[:5]

array([[ 98.        ,   0.        ,   1.        ,   0.        ,
          0.        ,   0.        ,  73.        ],
       [103.        ,   0.        ,   0.        ,   0.        ,
          1.        ,   0.        ,  48.        ],
       [100.        ,   0.        ,   1.        ,   0.        ,
          1.        ,   0.        ,  55.        ],
       [104.        ,   0.        ,   1.        ,   0.        ,
          0.        ,   0.        ,  51.        ],
       [100.86111111,   1.        ,   0.        ,   0.        ,
          0.        ,   1.        ,  20.        ]])

In [24]:
X_train_transformed_using_colTf.shape

(80, 7)

In [34]:
X_test_transformed_using_colTf = ct.fit_transform(X_test)

In [37]:
X_test_transformed_using_colTf[:5]

array([[ 99.,   0.,   1.,   0.,   0.,   0.,  65.],
       [102.,   0.,   0.,   0.,   0.,   0.,  69.],
       [ 99.,   1.,   0.,   0.,   0.,   0.,  49.],
       [100.,   1.,   0.,   0.,   1.,   0.,  11.],
       [102.,   0.,   1.,   0.,   0.,   1.,  74.]])