One-Hot Encoding (OHE)  
│── Definition: Converts categorical values into multiple binary (0/1) columns.  
│  
├── Dummy Variable Trap  
│     │── Problem: If all K OHE columns are kept, any one of them can be predicted exactly  
│     │            from the others (each row's columns sum to 1), which causes multicollinearity.  
│     │── Solution: Drop one column from each OHE group.  
│  
├── OHE using Most Frequent Variables  
│     │── Definition: Keep only the most frequent categories as separate columns  
│     │               and group less frequent ones into "Other".  
│  
└── Example  
      │── Variable: Color = {Red, Blue, Green}  
      │── OHE Result:  
      │       Red   Blue   Green  
      │        1     0       0  
      │        0     1       0  
      │        0     0       1  


In [1]:
import numpy as np
import pandas as pd

# Load the car dataset; the path is resolved relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='cars.csv')

# Preview the first five rows to see which columns hold categorical values.
df.head(n=5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [2]:
# Cardinality check: number of distinct brands, i.e. how many columns a full OHE would create.
df.loc[:, 'brand'].nunique()

32

In [3]:
# Frequency of each fuel type, most common first.
df.loc[:, 'fuel'].value_counts()

Unnamed: 0_level_0,count
fuel,Unnamed: 1_level_1
Diesel,4402
Petrol,3631
CNG,57
LPG,38


In [4]:
# Frequency of each ownership category, most common first.
df.loc[:, 'owner'].value_counts()

Unnamed: 0_level_0,count
owner,Unnamed: 1_level_1
First Owner,5289
Second Owner,2105
Third Owner,555
Fourth & Above Owner,174
Test Drive Car,5


1. OneHotEncoding using pandas

In [5]:
# Expand 'fuel' and 'owner' into one indicator column per category (all K levels kept);
# the remaining columns pass through unchanged.
pd.get_dummies(data=df, columns=['fuel', 'owner'])

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,False,True,False,False,True,False,False,False,False
1,Skoda,120000,370000,False,True,False,False,False,False,True,False,False
2,Honda,140000,158000,False,False,False,True,False,False,False,False,True
3,Hyundai,127000,225000,False,True,False,False,True,False,False,False,False
4,Maruti,120000,130000,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,False,True,True,False,False,False,False
8124,Hyundai,119000,135000,False,True,False,False,False,True,False,False,False
8125,Maruti,120000,382000,False,True,False,False,True,False,False,False,False
8126,Tata,25000,290000,False,True,False,False,True,False,False,False,False


2. K-1 OneHotEncoding

In [6]:
# Same encoding, but drop the first level of each encoded column (K-1 indicators per variable)
# to avoid the dummy variable trap.
pd.get_dummies(
    data=df,
    columns=['fuel', 'owner'],
    drop_first=True,
)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,120000,130000,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,True,False,False,False,False
8124,Hyundai,119000,135000,True,False,False,True,False,False,False
8125,Maruti,120000,382000,True,False,False,False,False,False,False
8126,Tata,25000,290000,True,False,False,False,False,False,False


In [7]:
from sklearn.model_selection import train_test_split

# Features are the first four columns and the target is the last column;
# 20% of the rows are held out for testing.
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, :4],
    df.iloc[:, -1],
    test_size=0.2,
    random_state=49,  # fixed seed so the split is reproducible
)

In [8]:
from sklearn.preprocessing import OneHotEncoder

In [9]:
# handle_unknown='ignore' encodes a category that was not seen during fit as an all-zero row
# instead of raising in transform(). This matters for rare levels such as 'Test Drive Car'
# (5 rows), which a different split could place entirely in the test set.
ohe = OneHotEncoder(handle_unknown='ignore')

In [17]:
# Learn the category vocabulary from the training split only, then reuse the fitted encoder
# on the test split so both matrices share the same column layout.
# The encoder output is converted to a dense array with .toarray().
X_train_new = ohe.fit_transform(X_train.loc[:, ['fuel', 'owner']]).toarray()
X_test_new = ohe.transform(X_test.loc[:, ['fuel', 'owner']]).toarray()

# Print both shapes, one per line.
print(X_train_new.shape, X_test_new.shape, sep='\n')

(6502, 9)
(1626, 9)


4. OneHotEncoding with Top Categories

In [11]:
# Show, as a tuple, the frequency of each (brand, km_driven) combination in the training
# split and the number of distinct values in each of the two columns.
X_train[['brand', 'km_driven']].value_counts(), X_train[['brand', 'km_driven']].nunique()

(brand       km_driven
 Maruti      70000        141
             60000        111
             50000        108
             120000       105
             90000         98
                         ... 
 Ashok       200000         1
 Ambassador  100000         1
             90000          1
             80000          1
             60000          1
 Name: count, Length: 1682, dtype: int64,
 brand         32
 km_driven    799
 dtype: int64)

In [12]:
# Number of rows per brand.
counts = df['brand'].value_counts()

# Brands with at most `threshold` rows are treated as rare.
threshold = 100

# Labels of the rare brands that will be collapsed into a single bucket.
repl = counts[counts <= threshold].index

# Replace every rare brand with the label 'uncommon', then one-hot encode the result:
# one indicator column per frequent brand plus one 'uncommon' column.
# NOTE(review): counts are computed on the full `df`, not on the training split — confirm
# this is intended if the encoding feeds a model evaluated on held-out data.
pd.get_dummies(df['brand'].replace(repl, 'uncommon'))

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
0,False,False,False,False,False,False,True,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,True,False,False,False,False
2,False,False,False,True,False,False,False,False,False,False,False,False,False
3,False,False,False,False,True,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,False,False,False,False,True,False,False,False,False,False,False,False,False
8124,False,False,False,False,True,False,False,False,False,False,False,False,False
8125,False,False,False,False,False,False,True,False,False,False,False,False,False
8126,False,False,False,False,False,False,False,False,False,True,False,False,False
