# ***Column Transformer***
Applies transformers to columns of an array or pandas DataFrame. This estimator allows different columns or column subsets of the input to be transformed separately, and the features generated by each transformer are concatenated to form a single feature space.

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.impute import SimpleImputer # imputation of missing values
from sklearn.preprocessing import OneHotEncoder # Encoding of Nominal Categorical variables
from sklearn.preprocessing import OrdinalEncoder # Encoding of Ordinal Categorical variables

In [3]:
# Read the toy COVID dataset and preview its first five rows.
df = pd.read_csv('images/covid_toy.csv')
df.head(n=5)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [4]:
# Print the column dtypes and non-null counts; a non-null count below the row total marks a column with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        100 non-null    int64  
 1   gender     100 non-null    object 
 2   fever      90 non-null     float64
 3   cough      100 non-null    object 
 4   city       100 non-null    object 
 5   has_covid  100 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 4.8+ KB


In [5]:
df['cough'].value_counts()  # 'cough' is an ordinal categorical column (Mild < Strong)

Mild      62
Strong    38
Name: cough, dtype: int64

In [6]:
df['fever'].value_counts() # 'fever' is a numerical feature; the 'age' column is numerical as well

101.0    17
98.0     17
104.0    14
100.0    13
99.0     10
102.0    10
103.0     9
Name: fever, dtype: int64

In [7]:
df['city'].value_counts() # 'city' is a nominal (unordered) categorical feature

Kolkata      32
Bangalore    30
Delhi        22
Mumbai       16
Name: city, dtype: int64

In [8]:
df['has_covid'].value_counts() # 'has_covid' is the target label, not an input feature

No     55
Yes    45
Name: has_covid, dtype: int64

In [9]:
# Count the missing entries in every column.
df.isna().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Hold out 20% of the rows for testing; 'has_covid' is the target, every other column is a feature.
# The seed is written as the integer 1: the previous `random_state=True` only worked because
# a bool is interpreted as the integer 1, so the resulting split is unchanged.
X_train,X_test,y_train,y_test=train_test_split(df.drop(columns=['has_covid']), df['has_covid'],
                                              test_size=0.2, random_state=1)

In [12]:
# Display the training features (pandas elides the middle rows of long frames).
X_train

Unnamed: 0,age,gender,fever,cough,city
2,42,Male,101.0,Mild,Delhi
73,34,Male,98.0,Strong,Kolkata
97,20,Female,101.0,Mild,Bangalore
62,56,Female,104.0,Strong,Bangalore
19,42,Female,,Strong,Bangalore
...,...,...,...,...,...
75,5,Male,102.0,Mild,Kolkata
9,64,Female,101.0,Mild,Delhi
72,83,Female,101.0,Mild,Kolkata
12,25,Female,99.0,Strong,Kolkata


# Aam Zindagi 😂😂 (the "ordinary life" way: transforming each column by hand)

In [13]:
# Fill the missing 'fever' readings with SimpleImputer (default settings).
# The imputer learns its statistics from the training split only ...
si = SimpleImputer().fit(X_train[['fever']])
X_train_fever = si.transform(X_train[['fever']])

# ... and the already-fitted imputer is then reused on the test split.
X_test_fever = si.transform(X_test[['fever']])

X_train_fever.shape

(80, 1)

In [14]:
# Ordinal-encode 'cough' using the explicit category order Mild < Strong.
# Fit on the training split first ...
oe = OrdinalEncoder(categories=[['Mild','Strong']]).fit(X_train[['cough']])
X_train_cough = oe.transform(X_train[['cough']])

# ... then apply the fitted encoder to the test split.
X_test_cough = oe.transform(X_test[['cough']])

X_test_cough.shape

(20, 1)

In [15]:
# One-hot encoding -> gender, city (nominal columns).
# drop='first' removes the first category of each feature, leaving 1 + 3 = 4 indicator columns.
# The `sparse=False` keyword was deprecated in scikit-learn 1.2 and removed in 1.4, so the
# encoder is left at its default (sparse) output and the result is densified with .toarray(),
# which works on both old and new scikit-learn releases.
ohe = OneHotEncoder(drop='first')
X_train_gender_city = ohe.fit_transform(X_train[['gender','city']]).toarray()

# Reuse the encoder fitted on the training split for the test split.
X_test_gender_city = ohe.transform(X_test[['gender','city']]).toarray()

X_train_gender_city.shape

(80, 4)

In [16]:
# 'age' needs no preprocessing: pull it out as a 2-D (n_rows, 1) array so it can be
# stacked next to the transformed columns.
X_train_age = X_train[['age']].values

# Same extraction for the test split.
X_test_age = X_test[['age']].values

X_train_age.shape

(80, 1)

In [17]:
# Stack the separately transformed pieces side by side into one feature matrix.
# Column order: age, fever, gender/city indicators, cough.
X_train_transformed = np.hstack([X_train_age, X_train_fever, X_train_gender_city, X_train_cough])
# The test split is assembled in exactly the same column order.
X_test_transformed = np.hstack([X_test_age, X_test_fever, X_test_gender_city, X_test_cough])

X_train_transformed.shape

(80, 7)

# Mentos Zindagi 😂😎😘🤣 (the smart way: one ColumnTransformer does all of the above)

In [18]:
from sklearn.compose import ColumnTransformer

In [19]:
# Declare every per-column preprocessing step in a single ColumnTransformer:
#   tnf1 - fill the missing 'fever' values with SimpleImputer (default settings)
#   tnf2 - ordinal-encode 'cough' with the explicit order Mild < Strong
#   tnf3 - one-hot encode 'gender' and 'city', dropping the first category of each
# remainder='passthrough' keeps the columns not listed above (here 'age') instead of dropping them.
# The encoder's `sparse=False` keyword was deprecated in scikit-learn 1.2 and removed in 1.4,
# so a dense result is requested with sparse_threshold=0 instead; this is accepted by both
# old and new scikit-learn releases.
transformer = ColumnTransformer(transformers=[
    ('tnf1',SimpleImputer(),['fever']),
    ('tnf2',OrdinalEncoder(categories=[['Mild','Strong']]),['cough']),
    ('tnf3',OneHotEncoder(drop='first'),['gender','city'])
],remainder='passthrough',sparse_threshold=0)


In [20]:
# Fit all transformers on the training split and transform it in one call; only the shape is displayed.
transformer.fit_transform(X_train).shape

(80, 7)

In [21]:
# Apply the already-fitted transformer to the test split (transform only, no refit).
transformer.transform(X_test).shape

(20, 7)

Reference: [scikit-learn `ColumnTransformer` documentation](https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html)

# Practice on another Dataset

In [22]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder


In [23]:
# Show every column when a DataFrame is rendered (None removes the column limit).
pd.options.display.max_columns = None
# NOTE(review): absolute, machine-specific path - this cell only runs where this exact
# file exists; consider moving the CSV next to the notebook and using a relative path.
df = pd.read_csv('/home/saad/Downloads/tips.csv')
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [24]:
# Print the column dtypes and non-null counts of the tips dataset.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 13.5+ KB


In [25]:
from sklearn.model_selection import train_test_split

In [26]:
# Hold out 20% of the rows for testing; 'total_bill' is the target, every other column is a feature.
# The seed is written as the integer 1: the previous `random_state=True` only worked because
# a bool is interpreted as the integer 1, so the resulting split is unchanged.
X_train,X_test,y_train,y_test=train_test_split(df.drop(columns=['total_bill']), df['total_bill'],
                                              test_size=0.2, random_state=1)

In [27]:
# Display the training features (pandas elides the middle rows of long frames).
X_train

Unnamed: 0,tip,sex,smoker,day,time,size
0,1.01,Female,No,Sun,Dinner,2
154,2.00,Male,No,Sun,Dinner,4
167,4.50,Male,No,Sun,Dinner,4
110,3.00,Male,No,Sat,Dinner,2
225,2.50,Female,Yes,Fri,Lunch,2
...,...,...,...,...,...,...
137,2.00,Female,No,Thur,Lunch,2
72,3.14,Female,Yes,Sat,Dinner,2
140,3.50,Female,No,Thur,Lunch,2
235,1.25,Male,No,Sat,Dinner,2


In [28]:
from sklearn.preprocessing import StandardScaler

In [29]:
from sklearn.compose import ColumnTransformer

In [33]:
# Preprocessing for the tips data in a single ColumnTransformer:
#   tnf1 - one-hot encode the four categorical columns as int64 indicators,
#          dropping the first category of each
#   tnf2 - standardise 'tip' with StandardScaler
# remainder='passthrough' keeps the columns not listed above (here 'size') instead of dropping them.
# The encoder's `sparse=False` keyword was deprecated in scikit-learn 1.2 and removed in 1.4,
# so a dense result is requested with sparse_threshold=0 instead; this is accepted by both
# old and new scikit-learn releases.
transformer = ColumnTransformer(transformers=[
    ('tnf1', OneHotEncoder(drop='first',dtype=np.int64),
     ['sex','smoker', 'day','time']),
    ('tnf2',StandardScaler(),['tip'])], remainder='passthrough', sparse_threshold=0)

In [34]:
# Fit the transformer on the training split and show the result as a DataFrame.
# NOTE(review): .astype('int64') truncates the standardised 'tip' column (column 6) to whole
# numbers (-1, 0, 1 in the output), so this view is lossy - confirm the cast is display-only.
pd.DataFrame(transformer.fit_transform(X_train)).astype('int64')

Unnamed: 0,0,1,2,3,4,5,6,7
0,0,0,0,1,0,0,-1,2
1,1,0,0,1,0,0,0,4
2,1,0,0,1,0,0,1,4
3,1,0,1,0,0,0,0,2
4,0,1,0,0,0,1,0,2
...,...,...,...,...,...,...,...,...
190,0,0,0,0,1,1,0,2
191,0,1,1,0,0,0,0,2
192,0,0,0,0,1,1,0,2
193,1,0,1,0,0,0,-1,2


In [35]:
# Apply the already-fitted transformer to the test split (transform only, no refit).
transformer.transform(X_test).shape

(49, 8)