In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the cars dataset from a CSV file located in the current working directory.
csv_path = 'cars.csv'
df = pd.read_csv(csv_path)

In [3]:
# Preview five randomly chosen rows; no seed is passed, so the rows differ between runs.
df.sample(n=5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
4603,Hyundai,90000,Diesel,Second Owner,380000
5726,Maruti,94000,Petrol,Fourth & Above Owner,125000
2401,Ford,80000,Diesel,First Owner,530000
6387,Maruti,100000,Petrol,Third Owner,45000
5213,Chevrolet,60000,Petrol,First Owner,140000


In [13]:
# Count the distinct brand labels (missing values are not counted by default).
brand_labels = df['brand']
brand_labels.nunique()

32

In [10]:
# Size of the loaded frame, shown as (rows, columns).
n_rows, n_cols = df.shape
n_rows, n_cols

(8128, 5)

# **One-Hot Encoding Using Pandas**

In [27]:
# One-hot encode the two categorical columns: every category of 'fuel' and 'owner'
# becomes its own 0/1 indicator column, while the remaining columns pass through.
# Requesting an integer dtype from get_dummies directly avoids a second,
# element-by-element Python pass over the whole frame just to turn the boolean
# indicators into ints.
one_hot_encoded_df = pd.get_dummies(df, columns=['fuel', 'owner'], dtype=np.int64)

In [28]:
# Show the encoded frame; the notebook display abbreviates the middle rows.
one_hot_encoded_df

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,0,1,0,0,1,0,0,0,0
1,Skoda,120000,370000,0,1,0,0,0,0,1,0,0
2,Honda,140000,158000,0,0,0,1,0,0,0,0,1
3,Hyundai,127000,225000,0,1,0,0,1,0,0,0,0
4,Maruti,120000,130000,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,0,1,1,0,0,0,0
8124,Hyundai,119000,135000,0,1,0,0,0,1,0,0,0
8125,Maruti,120000,382000,0,1,0,0,1,0,0,0,0
8126,Tata,25000,290000,0,1,0,0,1,0,0,0,0


# **K-1 (Dummy) Encoding: Dropping the First Category**

In [29]:
# K-1 ("dummy") encoding: drop_first=True leaves out the first category of each
# encoded column, so a feature with K categories yields K-1 indicator columns.
# The integer dtype is requested from get_dummies itself, which replaces the
# slower element-by-element bool -> int conversion over the whole frame.
one_hot_encoded_df = pd.get_dummies(
    df,
    columns=['fuel', 'owner'],
    drop_first=True,
    dtype=np.int64,
)

one_hot_encoded_df

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,1,0,0,0,0,0,0
1,Skoda,120000,370000,1,0,0,0,1,0,0
2,Honda,140000,158000,0,0,1,0,0,0,1
3,Hyundai,127000,225000,1,0,0,0,0,0,0
4,Maruti,120000,130000,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,1,0,0,0,0
8124,Hyundai,119000,135000,1,0,0,1,0,0,0
8125,Maruti,120000,382000,1,0,0,0,0,0,0
8126,Tata,25000,290000,1,0,0,0,0,0,0


# **Using Scikit-Learn**

In [30]:
from sklearn.model_selection import train_test_split

# Pick the features and the target by column name instead of by position, so the
# split stays correct (and readable) even if the column order of the CSV changes.
feature_columns = ['brand', 'km_driven', 'fuel', 'owner']
target_column = 'selling_price'

X_train, X_test, y_train, y_test = train_test_split(
    df[feature_columns],
    df[target_column],
    test_size=0.2,     # hold out 20% of the rows for testing
    random_state=2,    # fixed seed so the split is repeatable
)

In [31]:
from sklearn.preprocessing import OneHotEncoder

In [53]:
# Encoder configuration:
#   drop='first'        -> leave out the first category of every feature (K-1 columns)
#   sparse_output=False -> return a dense NumPy array instead of a sparse matrix
#   dtype=np.int32      -> emit the indicators as 32-bit integers
ohe = OneHotEncoder(
    dtype=np.int32,
    sparse_output=False,
    drop='first',
)

In [54]:
# Learn the categories from the training rows only, then encode those same rows.
train_categoricals = X_train[['fuel', 'owner']]
X_train_new = ohe.fit_transform(train_categoricals)

In [55]:
# Encode the test rows with the fitted encoder; transform() does not refit it.
test_categoricals = X_test[['fuel', 'owner']]
X_test_new = ohe.transform(test_categoricals)

In [56]:
# Inspect the encoded training array.
X_train_new

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [57]:
# Put the un-encoded 'brand' and 'km_driven' columns side by side with the encoded
# indicator columns. The combined array is only displayed here, not stored.
passthrough_values = X_train[['brand', 'km_driven']].to_numpy()
np.concatenate((passthrough_values, X_train_new), axis=1)

array([['Hyundai', 35000, 1, ..., 0, 0, 0],
       ['Jeep', 60000, 1, ..., 0, 0, 0],
       ['Hyundai', 25000, 0, ..., 0, 0, 0],
       ...,
       ['Tata', 15000, 0, ..., 0, 0, 0],
       ['Maruti', 32500, 1, ..., 1, 0, 0],
       ['Isuzu', 121000, 1, ..., 0, 0, 0]], dtype=object)

# **One-Hot Encoding with Top Categories**

In [58]:
# Tally how many rows each brand has (most frequent brand first).
brand_column = df['brand']
counts = brand_column.value_counts()

In [62]:
# Row-count cutoff: brands with at most this many rows are selected as infrequent
# by the `counts <= threshold` filter that follows.
threshold = 100

In [63]:
# Labels of the brands whose row count does not exceed the threshold.
is_infrequent = counts <= threshold
rep1 = counts.loc[is_infrequent].index