In [1]:
import numpy as np
import pandas as pd

In [9]:
# Load the used-car listings, then preview ten randomly drawn rows.
# The sample is unseeded, so a re-run shows a different set of rows.
df = pd.read_csv("cars.csv")
df.sample(n=10)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
2494,Maruti,96000,Diesel,Second Owner,370000
6812,Hyundai,130000,Diesel,First Owner,900000
2295,Maruti,3000,Petrol,First Owner,577000
2946,Tata,35000,Petrol,Second Owner,409999
8085,Chevrolet,40000,Diesel,First Owner,300000
287,Mahindra,207890,Diesel,Second Owner,210000
207,Maruti,35000,Petrol,First Owner,420000
7642,Chevrolet,70000,LPG,First Owner,300000
5819,Hyundai,20000,Petrol,First Owner,350000
8047,Honda,39000,Petrol,First Owner,649000


In [11]:
# Listings per brand, most frequent first.
df["brand"].value_counts()

brand
Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Force               6
Land                6
Isuzu               5
Kia                 4
Ambassador          4
Daewoo              3
MG                  3
Ashok               1
Opel                1
Peugeot             1
Name: count, dtype: int64

In [13]:
# Number of distinct values in each column.
df.nunique()

brand             32
km_driven        921
fuel               4
owner              5
selling_price    677
dtype: int64

## One Hot Encoding using Pandas

In [15]:
# Expand `fuel` and `owner` into one indicator column per category;
# the remaining columns are passed through unchanged.
pd.get_dummies(data=df, columns=["fuel", "owner"])

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,False,True,False,False,True,False,False,False,False
1,Skoda,120000,370000,False,True,False,False,False,False,True,False,False
2,Honda,140000,158000,False,False,False,True,False,False,False,False,True
3,Hyundai,127000,225000,False,True,False,False,True,False,False,False,False
4,Maruti,120000,130000,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,False,True,True,False,False,False,False
8124,Hyundai,119000,135000,False,True,False,False,False,True,False,False,False
8125,Maruti,120000,382000,False,True,False,False,True,False,False,False,False
8126,Tata,25000,290000,False,True,False,False,True,False,False,False,False


In [None]:
# k-1 one-hot encoding: drop_first=True removes the first level of each encoded
# column, so the remaining dummies are not perfectly collinear (this avoids the
# dummy-variable trap).

pd.get_dummies(data=df, columns=["fuel", "owner"], drop_first=True)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,120000,130000,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,True,False,False,False,False
8124,Hyundai,119000,135000,True,False,False,True,False,False,False
8125,Maruti,120000,382000,True,False,False,False,False,False,False
8126,Tata,25000,290000,True,False,False,False,False,False,False


## One Hot Encoding using Scikit Learn

In [18]:
from sklearn.model_selection import train_test_split

# Features are the first four columns, the target is the last column.
# 20% of the rows are held out; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, :4],
    df.iloc[:, -1],
    test_size=0.2,
    random_state=42,
)

In [19]:
X_test.head()

Unnamed: 0,brand,km_driven,fuel,owner
1971,Honda,110000,Petrol,Third Owner
4664,Tata,291977,Diesel,First Owner
5448,Maruti,70000,Diesel,First Owner
3333,Honda,120000,Petrol,Second Owner
2316,Maruti,69000,Diesel,Second Owner


In [41]:
from sklearn.preprocessing import OneHotEncoder

# Dense int32 output with the first category of every feature dropped (k-1 encoding).
ohe = OneHotEncoder(
    sparse_output=False,
    drop='first',
    dtype=np.int32,
)

In [43]:
# Learn the category -> column mapping from the training split only.
X_train_new = ohe.fit_transform(X_train[['fuel', 'owner']])

# Reuse the already-fitted encoder on the test split. Calling fit_transform here
# would re-learn the categories from the test rows, so the encoded columns (and
# which category gets dropped) could stop lining up with the training matrix,
# e.g. when a rare category is absent from the test split.
X_test_new = ohe.transform(X_test[['fuel', 'owner']])

In [44]:
# Place the untouched brand / km_driven columns side by side with the encoded
# fuel / owner columns and check the shape of the combined matrix.
np.hstack([X_train[['brand', 'km_driven']].values, X_train_new]).shape

(6502, 9)

In [46]:
# Keep the per-brand frequencies; they are used to pick out the rare brands.
counts = df["brand"].value_counts()
counts

brand
Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Force               6
Land                6
Isuzu               5
Kia                 4
Ambassador          4
Daewoo              3
MG                  3
Ashok               1
Opel                1
Peugeot             1
Name: count, dtype: int64

In [48]:
# Brands with at most this many listings are treated as rare.
# (The bare `df['brand'].nunique()` that used to precede this line was a no-op:
# its value was neither assigned nor displayed, since it was not the cell's
# last expression.)
threshold = 100

In [49]:
# Labels of the brands whose listing count does not exceed the threshold.
replace = counts.loc[counts <= threshold].index

In [50]:
# Collapse every rare brand into a single 'uncommon' label, then one-hot encode,
# so rare brands share one indicator column instead of getting one each.
pd.get_dummies(
    df['brand'].replace(to_replace=replace, value='uncommon')
)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
0,False,False,False,False,False,False,True,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,True,False,False,False,False
2,False,False,False,True,False,False,False,False,False,False,False,False,False
3,False,False,False,False,True,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,False,False,False,False,True,False,False,False,False,False,False,False,False
8124,False,False,False,False,True,False,False,False,False,False,False,False,False
8125,False,False,False,False,False,False,True,False,False,False,False,False,False
8126,False,False,False,False,False,False,False,False,False,True,False,False,False
