### Importing Packages and Loading Data

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [3]:
# Read the cars dataset from a CSV file in the current working directory into a DataFrame.
data = pd.read_csv("./cars.csv")

In [4]:
# Preview five randomly drawn rows (unseeded, so the rows differ on every run).
data.sample(5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
2829,BMW,17100,Diesel,First Owner,3900000
4001,Fiat,120000,Diesel,First Owner,450000
2860,Mahindra,120000,Diesel,Third Owner,125000
164,Tata,8000,Diesel,First Owner,1025000
1316,Maruti,150000,Petrol,Third Owner,150000


In [5]:
# Cardinality of `brand`: number of distinct non-null values in the column.
data['brand'].nunique()

32

In [6]:
# Cardinality of `fuel`: number of distinct non-null values in the column.
data['fuel'].nunique()

4

In [7]:
# Cardinality of `owner`: number of distinct non-null values in the column.
data['owner'].nunique()

5

### One Hot Encoding using Pandas

In [8]:
# One-hot encode `fuel` and `owner`: each category becomes its own boolean indicator
# column (K columns per feature); the remaining columns pass through unchanged.
pd.get_dummies(data,columns=['fuel','owner'])

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,False,True,False,False,True,False,False,False,False
1,Skoda,120000,370000,False,True,False,False,False,False,True,False,False
2,Honda,140000,158000,False,False,False,True,False,False,False,False,True
3,Hyundai,127000,225000,False,True,False,False,True,False,False,False,False
4,Maruti,120000,130000,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,False,True,True,False,False,False,False
8124,Hyundai,119000,135000,False,True,False,False,False,True,False,False,False
8125,Maruti,120000,382000,False,True,False,False,True,False,False,False,False
8126,Tata,25000,290000,False,True,False,False,True,False,False,False,False


### K-1 One Hot Encoding in Pandas

In [9]:
# K-1 encoding: drop_first=True omits the first indicator column of each encoded
# feature (in the saved output, fuel_CNG and owner_First Owner are absent). The dropped
# level is implied when all remaining indicators are False, which avoids the perfect
# multicollinearity of a full set of dummy columns.
pd.get_dummies(data,columns=['fuel','owner'],drop_first=True)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,120000,130000,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,True,False,False,False,False
8124,Hyundai,119000,135000,True,False,False,True,False,False,False
8125,Maruti,120000,382000,True,False,False,False,False,False,False
8126,Tata,25000,290000,True,False,False,False,False,False,False


### Train Test Split

In [None]:
# Hold out 20% of the rows for testing. An integer test_size is interpreted as an
# absolute number of test samples, so test_size=2 would leave only two rows in the
# test set; the fraction 0.2 matches the 6502-row training set used further down.
# random_state pins the shuffle so Restart & Run All reproduces the same split.
X_train, X_test, y_train, y_test = train_test_split(
    data.iloc[:, 0:4],   # features: first four columns (brand, km_driven, fuel, owner in the preview)
    data.iloc[:, -1],    # target: last column (selling_price in the preview)
    test_size=0.2,
    random_state=42,
)

### One Hot Encoding using Scikit-Learn

In [11]:
# Encoder that returns a dense int32 array and omits the first category of each feature.
ohe = OneHotEncoder(
    drop='first',
    sparse_output=False,
    dtype=np.int32,
)

# Learn the categories from the training rows only, then reuse the fitted encoder
# on the test rows so both arrays share the same column layout.
X_train_new = ohe.fit_transform(X_train[['fuel', 'owner']])
X_test_new = ohe.transform(X_test[['fuel', 'owner']])

In [12]:
# Place the untouched columns and the encoded indicator columns side by side
# (column-wise join of two 2-D arrays with the same number of rows).
np.concatenate(
    [X_train[['brand', 'km_driven']].to_numpy(), X_train_new],
    axis=1,
)

array([['Maruti', 80000, 0, ..., 0, 0, 0],
       ['Maruti', 89100, 1, ..., 1, 0, 0],
       ['Hyundai', 56000, 1, ..., 0, 0, 0],
       ...,
       ['Renault', 90000, 1, ..., 1, 0, 0],
       ['Toyota', 149032, 1, ..., 0, 0, 0],
       ['Honda', 56494, 0, ..., 0, 0, 0]], shape=(6502, 9), dtype=object)

### One Hot Encoding with Top Categories

In [13]:
# Number of rows per brand; used to decide which brands are too rare to keep
# as their own category.
counts = data['brand'].value_counts()
# Brands with at most this many rows are treated as rare.
threshold = 100

In [14]:
# Labels of the brands whose row count is at or below `threshold`; these are the
# values that get collapsed into a single 'Other' category before encoding.
repl = counts[counts <= threshold].index

In [34]:
# Collapse the rare brands into 'Other', one-hot encode the result, and preview
# five random rows of the indicator frame.
(
    data['brand']
    .replace(repl, 'Other')
    .pipe(pd.get_dummies)
    .sample(5)
)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Other,Renault,Skoda,Tata,Toyota,Volkswagen
5515,False,False,False,True,False,False,False,False,False,False,False,False,False
4645,False,False,False,False,False,False,True,False,False,False,False,False,False
1114,False,False,False,False,False,False,False,True,False,False,False,False,False
3651,False,False,False,False,False,False,True,False,False,False,False,False,False
991,False,False,False,False,False,True,False,False,False,False,False,False,False
