In [3]:
import numpy as np
import pandas as pd

In [5]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [4]:
df = pd.read_csv('customer.csv')
df.head()

Unnamed: 0,age,gender,review,education,purchased
0,30,Female,Average,School,No
1,68,Female,Poor,UG,No
2,70,Female,Good,PG,No
3,72,Female,Good,PG,No
4,16,Female,Average,UG,No


In [32]:
df.nunique()

age          41
gender        2
review        3
education     3
purchased     2
dtype: int64

In [33]:
# Gender, Purchased -> Nominal Data
# Review, Education -> Ordinal Data 
# Age -> Numerical Data

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['purchased']), df['purchased'], test_size=0.2)
X_train.head()

Unnamed: 0,age,gender,review,education
7,60,Female,Poor,School
48,39,Female,Good,UG
3,72,Female,Good,PG
14,15,Male,Poor,PG
17,22,Female,Poor,UG


## Without Column Transformer

In [35]:

oe = OrdinalEncoder(categories=[['Poor', 'Average', 'Good'], ['School', 'UG', 'PG']])

# Fit on train, transform both train & test
X_train_trans = oe.fit_transform(X_train[['review', 'education']])
X_test_trans = oe.transform(X_test[['review', 'education']])

X_train_trans.shape

(40, 2)

In [36]:
ohe = OneHotEncoder(drop='first', sparse_output=False)

X_train_gen = ohe.fit_transform(X_train[['gender']])
X_test_gen = ohe.transform(X_test[['gender']])

In [37]:
# Extracting age

X_train_age = X_train[['age']].values
X_test_age = X_test[['age']].values
X_train_age.shape

(40, 1)

In [38]:
# Concatenating

X_train_transformed = np.concatenate((X_train_age, X_train_gen, X_train_trans), axis=1)
X_test_transformed = np.concatenate((X_test_age, X_test_gen, X_test_trans), axis=1)

X_train_transformed

array([[23.,  1.,  2.,  2.],
       [94.,  1.,  1.,  2.],
       [39.,  1.,  2.,  0.],
       [19.,  1.,  2.,  0.],
       [68.,  0.,  0.,  1.],
       [45.,  0.,  2.,  0.],
       [25.,  0.,  2.,  1.],
       [15.,  1.,  0.,  2.],
       [74.,  1.,  2.,  1.],
       [83.,  0.,  1.,  1.],
       [16.,  0.,  1.,  2.],
       [77.,  0.,  1.,  1.],
       [57.,  0.,  1.,  0.],
       [18.,  1.,  2.,  0.],
       [69.,  0.,  0.,  2.],
       [61.,  1.,  0.,  2.],
       [74.,  1.,  0.,  0.],
       [76.,  1.,  0.,  2.],
       [57.,  0.,  1.,  0.],
       [59.,  1.,  0.,  1.],
       [73.,  1.,  1.,  1.],
       [16.,  0.,  1.,  1.],
       [39.,  0.,  2.,  1.],
       [57.,  0.,  2.,  0.],
       [98.,  0.,  2.,  1.],
       [72.,  0.,  2.,  2.],
       [75.,  1.,  0.,  1.],
       [97.,  1.,  0.,  2.],
       [64.,  0.,  0.,  2.],
       [38.,  0.,  2.,  2.],
       [27.,  1.,  0.,  2.],
       [53.,  0.,  0.,  2.],
       [96.,  0.,  2.,  0.],
       [30.,  0.,  2.,  2.],
       [22.,  

## Using Column Transformer

In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

In [14]:
transformer = ColumnTransformer(transformers=[
    ('tnf0', StandardScaler(), ['age']),
    ('tnf1', OrdinalEncoder(), ['review', 'education']),
    ('tnf2', OneHotEncoder(sparse_output=False, drop='first'), ['gender'])
], remainder='passthrough')

In [15]:
transformer.fit(X_train)
transformer.transform(X_train)
transformer.transform(X_test)

array([[-1.50496496,  1.        ,  1.        ,  1.        ],
       [ 0.07608818,  2.        ,  2.        ,  1.        ],
       [-0.99112269,  0.        ,  0.        ,  1.        ],
       [-1.62354394,  0.        ,  0.        ,  0.        ],
       [ 1.45950968,  0.        ,  0.        ,  1.        ],
       [-0.71443839,  1.        ,  1.        ,  1.        ],
       [-0.00296447,  0.        ,  1.        ,  0.        ],
       [-1.03064902,  0.        ,  1.        ,  0.        ],
       [-1.18875433,  2.        ,  0.        ,  1.        ],
       [-0.24012245,  2.        ,  1.        ,  1.        ]])