# ***Column Transformer***
Applies transformers to columns of an array or pandas DataFrame. This estimator allows different columns or column subsets of the input to be transformed separately and the features generated by each transformer will be concatenated to form a single feature space.

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.impute import SimpleImputer # imputation of missing values
from sklearn.preprocessing import OneHotEncoder # Encoding of Nominal Categorical variables
from sklearn.preprocessing import OrdinalEncoder # Encoding of Ordinal Categorical variables

In [3]:
# Load the toy COVID dataset (path is relative to the notebook's working directory)
df = pd.read_csv('images/covid_toy.csv')
df.head()  # preview the first five rows

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [4]:
# Column dtypes and non-null counts; a non-null count below the row total means missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        100 non-null    int64  
 1   gender     100 non-null    object 
 2   fever      90 non-null     float64
 3   cough      100 non-null    object 
 4   city       100 non-null    object 
 5   has_covid  100 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 4.8+ KB


In [5]:
df['cough'].value_counts()  # cough is an ordinal categorical column (encoded later as Mild < Strong)

Mild      62
Strong    38
Name: cough, dtype: int64

In [6]:
df['fever'].value_counts() # fever is a numerical feature; the age column is numerical as well

101.0    17
98.0     17
104.0    14
100.0    13
99.0     10
102.0    10
103.0     9
Name: fever, dtype: int64

In [7]:
df['city'].value_counts() # city is a nominal (unordered) categorical feature

Kolkata      32
Bangalore    30
Delhi        22
Mumbai       16
Name: city, dtype: int64

In [8]:
df['has_covid'].value_counts() # has_covid is the target label to predict

No     55
Yes    45
Name: has_covid, dtype: int64

In [9]:
# Number of missing values in each column
df.isnull().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Hold out 20% of the rows for testing. has_covid is the target; every other
# column is a feature.
# random_state takes an integer seed, so pass 1 explicitly instead of True
# (bool is an int subclass and True == 1, which hides the intent).
X_train,X_test,y_train,y_test=train_test_split(df.drop(columns=['has_covid']), df['has_covid'],
                                              test_size=0.2, random_state=1)

In [12]:
# Inspect the training features (the has_covid column was dropped when splitting)
X_train

Unnamed: 0,age,gender,fever,cough,city
2,42,Male,101.0,Mild,Delhi
73,34,Male,98.0,Strong,Kolkata
97,20,Female,101.0,Mild,Bangalore
62,56,Female,104.0,Strong,Bangalore
19,42,Female,,Strong,Bangalore
...,...,...,...,...,...
75,5,Male,102.0,Mild,Kolkata
9,64,Female,101.0,Mild,Delhi
72,83,Female,101.0,Mild,Kolkata
12,25,Female,99.0,Strong,Kolkata


# Aam Zindagi 😂😂 — the ordinary (manual) way: transform each column separately, then concatenate

In [13]:
# Impute missing fever readings with the column mean (SimpleImputer's default strategy).
si = SimpleImputer(strategy='mean')

# Learn the mean from the training split only, then fill the training column with it.
si.fit(X_train[['fever']])
X_train_fever = si.transform(X_train[['fever']])

# The test split is filled with the mean learned from the training data.
X_test_fever = si.transform(X_test[['fever']])

X_train_fever.shape

(80, 1)

In [14]:
# Ordinal encoding for cough: the explicit category list fixes the order,
# so 'Mild' maps to 0 and 'Strong' maps to 1.
oe = OrdinalEncoder(categories=[['Mild','Strong']])

# Fit on the training column, then encode it.
oe.fit(X_train[['cough']])
X_train_cough = oe.transform(X_train[['cough']])

# Encode the test column with the same fitted encoder.
X_test_cough = oe.transform(X_test[['cough']])

X_test_cough.shape

(20, 1)

In [15]:
# One-hot encoding -> gender, city (nominal features).
# drop='first' removes the first category of each feature so the dummy
# columns are not redundant with one another.
# The encoder is left on its default sparse output and converted with
# .toarray() instead of passing `sparse=False`: that keyword was renamed to
# `sparse_output` in scikit-learn 1.2 and removed in 1.4, whereas .toarray()
# works on every version.
ohe = OneHotEncoder(drop='first')
X_train_gender_city = ohe.fit_transform(X_train[['gender','city']]).toarray()

# Test data: transform only, reusing the categories learned from the training split
X_test_gender_city = ohe.transform(X_test[['gender','city']]).toarray()

X_train_gender_city.shape

(80, 4)

In [16]:
# age needs no preprocessing: drop every other feature column and keep what
# is left as a 2-D NumPy array so it can be concatenated with the other pieces.
X_train_age = X_train.drop(['gender','fever','cough','city'], axis=1).to_numpy()

# Same extraction for the test split
X_test_age = X_test.drop(['gender','fever','cough','city'], axis=1).to_numpy()

X_train_age.shape

(80, 1)

In [17]:
# Stitch the separately transformed pieces back together column-wise.
# Column order: age, fever, gender/city dummies, cough.
X_train_transformed = np.concatenate(
    [X_train_age, X_train_fever, X_train_gender_city, X_train_cough],
    axis=1,
)

# Same pieces in the same order for the test split
X_test_transformed = np.concatenate(
    [X_test_age, X_test_fever, X_test_gender_city, X_test_cough],
    axis=1,
)

X_train_transformed.shape

(80, 7)

# Mentos Zindagi 😂😎😘🤣 — the smart way: one ColumnTransformer handles every column

In [18]:
from sklearn.compose import ColumnTransformer

In [19]:
# Declare the whole preprocessing in one object. Each entry is
# (name, transformer, columns to apply it to).
# remainder='passthrough' keeps the columns not listed here (age) unchanged
# instead of dropping them.
# The OneHotEncoder keeps its default sparse output, because the old
# `sparse=False` keyword was renamed to `sparse_output` in scikit-learn 1.2
# and removed in 1.4. sparse_threshold=0 makes the ColumnTransformer always
# return a dense array, so the result is the same on every version.
transformer = ColumnTransformer(transformers=[
    ('tnf1',SimpleImputer(),['fever']),
    ('tnf2',OrdinalEncoder(categories=[['Mild','Strong']]),['cough']),
    ('tnf3',OneHotEncoder(drop='first'),['gender','city'])
],remainder='passthrough',sparse_threshold=0)


In [20]:
# Fit every transformer on the training split and transform it in one call
transformer.fit_transform(X_train).shape

(80, 7)

In [21]:
# Apply the already-fitted transformer to the test split (transform only, no refitting)
transformer.transform(X_test).shape

(20, 7)