# ONE HOT ENCODING

In [6]:
import numpy as np
import pandas as pd

In [7]:
# Load the car listings from 'cars.csv' (a relative path, resolved against the current working directory)
df = pd.read_csv('cars.csv')

In [8]:
# Preview the first five rows of the raw frame
df.iloc[:5]

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [9]:
# Count how many rows fall into each ownership category
df.loc[:, 'owner'].value_counts()

owner
First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: count, dtype: int64

## **1. One-Hot Encoding using Pandas**

In [12]:
# Expand 'fuel' and 'owner' into one indicator column per category;
# the remaining columns are passed through unchanged
df_one_hot = pd.get_dummies(data=df, columns=['fuel', 'owner'])

# Preview the first five encoded rows
df_one_hot.iloc[:5]

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,False,True,False,False,True,False,False,False,False
1,Skoda,120000,370000,False,True,False,False,False,False,True,False,False
2,Honda,140000,158000,False,False,False,True,False,False,False,False,True
3,Hyundai,127000,225000,False,True,False,False,True,False,False,False,False
4,Maruti,120000,130000,False,False,False,True,True,False,False,False,False


## **2. K-1 One-Hot Encoding using Pandas**

In [13]:
# K-1 encoding: same as above, but drop_first=True removes the first
# category of each encoded column so K categories yield K-1 indicators
df_k1_one_hot = pd.get_dummies(data=df, drop_first=True, columns=['fuel', 'owner'])

# Preview the first five encoded rows
df_k1_one_hot.iloc[:5]

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,120000,130000,False,False,True,False,False,False,False


## **3. One-Hot Encoding using Sklearn**

In [14]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [15]:
# Split the data into training and test sets.
# Features and target are selected by column name rather than by position,
# so the split keeps picking the intended columns even if the CSV's column
# order or column count changes.
FEATURE_COLUMNS = ['brand', 'km_driven', 'fuel', 'owner']
TARGET_COLUMN = 'selling_price'

X_train, X_test, y_train, y_test = train_test_split(
    df[FEATURE_COLUMNS],
    df[TARGET_COLUMN],
    test_size=0.2,   # hold out 20% of the rows for testing
    random_state=2,  # fixed seed so the split is reproducible
)


In [16]:
# Preview the first five training rows (the index shows the shuffled original row labels)
X_train.iloc[:5]

Unnamed: 0,brand,km_driven,fuel,owner
5571,Hyundai,35000,Diesel,First Owner
2038,Jeep,60000,Diesel,First Owner
2957,Hyundai,25000,Petrol,First Owner
7618,Mahindra,130000,Diesel,Second Owner
6684,Hyundai,155000,Diesel,First Owner


In [17]:
# Configure the encoder: drop the first category of every feature (K-1 encoding),
# return a dense array instead of a sparse matrix, and emit 32-bit integer indicators
ohe = OneHotEncoder(
    dtype=np.int32,
    sparse_output=False,
    drop='first',
)


In [18]:
# Learn the categories from the training rows only, and encode those rows in the same step
X_train_new = ohe.fit_transform(X_train.loc[:, ['fuel', 'owner']])

In [19]:
# Encode the test rows with the categories already learned from the training set (no refit)
X_test_new = ohe.transform(X_test.loc[:, ['fuel', 'owner']])

In [20]:
# (rows, encoded columns) of the transformed training array
np.shape(X_train_new)

(6502, 7)

In [23]:
# Place the un-encoded brand / km_driven columns side by side with the encoded columns.
# Because 'brand' holds strings, the combined array has dtype=object.
X_train_combined = np.concatenate(
    [X_train[['brand', 'km_driven']].to_numpy(), X_train_new],
    axis=1,
)

# Show the first five combined rows
X_train_combined[:5]

array([['Hyundai', 35000, 1, 0, 0, 0, 0, 0, 0],
       ['Jeep', 60000, 1, 0, 0, 0, 0, 0, 0],
       ['Hyundai', 25000, 0, 0, 1, 0, 0, 0, 0],
       ['Mahindra', 130000, 1, 0, 0, 0, 1, 0, 0],
       ['Hyundai', 155000, 1, 0, 0, 0, 0, 0, 0]], dtype=object)

## **4. One-Hot Encoding with Top Categories**

In [24]:
# Collapse every brand that appears at most `threshold` times into a single
# "uncommon" label. Note: this overwrites df['brand'] in place.
threshold = 100
counts = df['brand'].value_counts()
repl = counts.loc[counts.le(threshold)].index
df['brand'] = df['brand'].replace(to_replace=repl, value='uncommon')


In [26]:
# One indicator column per remaining brand label (frequent brands plus 'uncommon')
df_top_categories = pd.get_dummies(data=df.loc[:, 'brand'])

# Inspect five randomly chosen rows (unseeded, so the rows differ between runs)
df_top_categories.sample(n=5)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
4819,False,False,False,False,False,False,False,False,False,False,False,False,True
2979,False,False,False,False,True,False,False,False,False,False,False,False,False
7699,False,False,False,False,False,False,False,False,False,False,False,False,True
6359,False,False,False,False,True,False,False,False,False,False,False,False,False
5086,False,False,False,False,False,False,False,False,False,True,False,False,False


## **5. One-Hot Encoding using `category_encoders`**

In [27]:
import category_encoders as ce

In [28]:
# Configure the category_encoders one-hot encoder for 'fuel' and 'owner';
# use_cat_names=True names the new columns after the category values
encoder = ce.OneHotEncoder(
    use_cat_names=True,
    cols=['fuel', 'owner'],
)

In [29]:
# Fit the encoder on the full frame and encode it in one step
df_encoded = encoder.fit_transform(df)

# Preview the first five encoded rows
df_encoded.iloc[:5]

Unnamed: 0,brand,km_driven,fuel_Diesel,fuel_Petrol,fuel_LPG,fuel_CNG,owner_First Owner,owner_Second Owner,owner_Third Owner,owner_Fourth & Above Owner,owner_Test Drive Car,selling_price
0,Maruti,145500,1,0,0,0,1,0,0,0,0,450000
1,Skoda,120000,1,0,0,0,0,1,0,0,0,370000
2,Honda,140000,0,1,0,0,0,0,1,0,0,158000
3,Hyundai,127000,1,0,0,0,1,0,0,0,0,225000
4,Maruti,120000,0,1,0,0,1,0,0,0,0,130000
