## How to use ColumnTransformer in scikit-learn

In [34]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

In [35]:
# Small toy dataset used to learn OneHotEncoder, OrdinalEncoder, StandardScaler
# and, finally, ColumnTransformer.
data = dict(
    city=['Mumbai', 'Delhi', 'Bangalore', 'Mumbai', 'Bangalore', 'Delhi'],
    education=['UG', 'PG', 'PhD', 'PG', 'UG', 'PhD'],
    gpa=[3.2, 3.8, 3.9, 3.5, 2.9, 4.0],
    experience_months=[12, 24, 60, 36, 6, 72],
    hired=['No', 'Yes', 'Yes', 'Yes', 'No', 'Yes'],
)

df = pd.DataFrame(data)

# Split into the target column (y) and the remaining feature columns (X).
y = df['hired']
X = df.drop(columns=['hired'])

print("--- Original Data ---")

df.head()

--- Original Data ---


Unnamed: 0,city,education,gpa,experience_months,hired
0,Mumbai,UG,3.2,12,No
1,Delhi,PG,3.8,24,Yes
2,Bangalore,PhD,3.9,60,Yes
3,Mumbai,PG,3.5,36,Yes
4,Bangalore,UG,2.9,6,No


#### Next, we preprocess the entire dataset by applying the appropriate scaling and encoding technique to each column, first manually and then with a ColumnTransformer

In [36]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [37]:
# Show the (rows, columns) shape of the train and test feature frames.
(X_train.shape, X_test.shape)

((4, 4), (2, 4))

### 1. Before Column Transformer (Aam Zindagi)

In [38]:
# Manual preprocessing, one transformer at a time:
#   1. One-hot encode `city`
#   2. Ordinal-encode `education`
#   3. Standard-scale `gpa` and `experience_months` (their value ranges differ a lot)
#   4. Label-encode the target `hired`, since it is categorical as well
# Each transformer is fitted on the training split and then reused on the test split.
col_to_ohe = ['city']
col_to_oe = ['education']
col_to_scale = ['gpa', 'experience_months']

# 1. One-hot encoding of the city column.
ohe = OneHotEncoder(drop='first', sparse_output=False)
train_city = X_train[col_to_ohe]
print(train_city.shape)
print(f"Unique categories in {col_to_ohe} is {np.unique(train_city)}")
X_train_ohe = ohe.fit_transform(train_city)
X_test_ohe = ohe.transform(X_test[col_to_ohe])
print(X_train_ohe.shape)

print('--------------------')

# 2. Ordinal encoding of the education column.
# Listing the categories explicitly enforces the order UG < PG < PhD.
# A plain OrdinalEncoder() would sort them alphabetically instead, giving
# [array(['PG', 'PhD', 'UG'], dtype=object)], which is NOT the right ordinal order.
education_order = ['UG', 'PG', 'PhD']
oe = OrdinalEncoder(categories=[education_order])
X_train_oe = oe.fit_transform(X_train[col_to_oe])
X_test_oe = oe.transform(X_test[col_to_oe])
print(X_train_oe.shape)

print('--------------------')

# 3. Standard scaling of the numeric columns.
scalar = StandardScaler()
X_train_scaled = scalar.fit_transform(X_train[col_to_scale])
X_test_scaled = scalar.transform(X_test[col_to_scale])
print(X_train_scaled.shape)

print('--------------------')

# 4. Label encoding of the target.
label = LabelEncoder()
y_train_labled = label.fit_transform(y_train)
y_test_labled = label.transform(y_test)
print(y_train_labled.shape)

print('--------------------')

# Finally, stack the pieces column-wise to build the final train and test arrays.
# np.concatenate(..., axis=1) and np.hstack are interchangeable for these 2-D arrays.
X_train_final = np.concatenate((X_train_ohe, X_train_oe, X_train_scaled), axis=1)
X_test_final = np.hstack((X_test_ohe, X_test_oe, X_test_scaled))

print(X_train_final.shape)

(4, 1)
Unique categories in ['city'] is ['Bangalore' 'Delhi' 'Mumbai']
(4, 2)
--------------------
(4, 1)
--------------------
(4, 2)
--------------------
(4,)
--------------------
(4, 5)


### 2. Column Transformer (Mentos zindagi)

In [39]:
from sklearn.compose import ColumnTransformer

In [44]:
# Each entry is (name, transformer, columns it is applied to).
# remainder='passthrough' keeps the columns that are not listed here unchanged in the
# output; otherwise they would be dropped.
transformer = ColumnTransformer(
    transformers=[
        ('ohe', OneHotEncoder(drop='first', sparse_output=False), col_to_ohe),
        ('oe', OrdinalEncoder(categories=[['UG', 'PG', 'PhD']]), col_to_oe),
        ('scalar', StandardScaler(), col_to_scale),
    ],
    remainder='passthrough',
)

In [None]:
# 1. Transform the features (X) with the ColumnTransformer.
# Fit on the training split only, then reuse the fitted transformer on the test split.
# The results are stored so they can be used later instead of being discarded.
X_train_ct = transformer.fit_transform(X_train)
X_test_ct = transformer.transform(X_test)

# The output columns follow the order of the `transformers` list (ohe, oe, scalar),
# which is the same order used when stacking the manual results, so both approaches
# must produce identical arrays.
np.testing.assert_allclose(X_train_ct, X_train_final)
np.testing.assert_allclose(X_test_ct, X_test_final)

# 2. Handle the Target (y) separately: ColumnTransformer only transforms features.
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

# Same shapes as we got by doing each transformation manually.
# Kept as the last expression so the notebook actually displays it.
(X_train_ct.shape, X_test_ct.shape)

(2, 5)