# **Importing Libraries**

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

# **Data Loading**

In [14]:
# Load the car listings from the CSV file located alongside this notebook.
csv_path = 'cars.csv'
df = pd.read_csv(csv_path)

In [15]:
# Preview the first five rows to check the columns that were read.
df.head(n=5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [16]:
# Number of distinct brands present in the data.
df['brand'].nunique()

32

In [17]:
# How many rows fall under each fuel type.
df['fuel'].value_counts()

fuel
Diesel    4402
Petrol    3631
CNG         57
LPG         38
Name: count, dtype: int64

In [18]:
# How many rows fall under each ownership category.
df['owner'].value_counts()

owner
First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: count, dtype: int64

In [19]:
from sklearn.model_selection import train_test_split

# Pick predictors and target by column label rather than by position, so the
# split keeps working (or fails loudly with a KeyError) if the CSV column order
# ever changes, instead of silently selecting the wrong columns.
feature_cols = ['brand', 'km_driven', 'fuel', 'owner']
target_col = 'selling_price'

# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    df[feature_cols],
    df[target_col],
    test_size=0.2,
    random_state=2,
)

In [20]:
# Display the training feature frame returned by the split above.
x_train

Unnamed: 0,brand,km_driven,fuel,owner
5571,Hyundai,35000,Diesel,First Owner
2038,Jeep,60000,Diesel,First Owner
2957,Hyundai,25000,Petrol,First Owner
7618,Mahindra,130000,Diesel,Second Owner
6684,Hyundai,155000,Diesel,First Owner
...,...,...,...,...
3606,Ford,35000,Diesel,First Owner
5704,Maruti,120000,Petrol,First Owner
6637,Tata,15000,Petrol,First Owner
2575,Maruti,32500,Diesel,Second Owner


# **One Hot Encoding**

In [21]:
from sklearn.preprocessing import OneHotEncoder

In [25]:
# drop='first' removes the indicator column of the first category of every
# encoded feature (fuel and owner here). Keeping all categories would make the
# dummy columns of a feature sum to 1, i.e. perfectly collinear
# (multicollinearity among the independent features).
# sparse_output=False makes the encoder return a dense NumPy array.
ohe = OneHotEncoder(
    drop='first',
    sparse_output=False,
)

In [26]:
# Learn the category levels from the training rows only and encode them.
train_categoricals = x_train[['fuel', 'owner']]
x_train_new = ohe.fit_transform(train_categoricals)

In [27]:
# Encode the held-out rows with the already-fitted encoder (transform only, no refit).
test_categoricals = x_test[['fuel', 'owner']]
x_test_new = ohe.transform(test_categoricals)

In [28]:
# Show the encoded training matrix produced by the encoder's fit_transform call.
x_train_new

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

# **One Hot Encoding on Top Categories**

In [29]:
# Number of rows per brand; used below to identify the rarely occurring brands.
brand_column = df['brand']
counts = brand_column.value_counts()

In [30]:
# Brands that occur at most this many times are treated as rare and are later
# replaced by the single label 'uncommon' before one-hot encoding.
threshold = 100

In [31]:
# Labels of the brands whose row count does not exceed the threshold.
rare_mask = counts.le(threshold)
repl = counts.loc[rare_mask].index

In [33]:
# Collapse every rare brand into the single label 'uncommon', then one-hot
# encode the resulting brand column as integer 0/1 indicators.
pd.get_dummies(
    df["brand"].replace(to_replace=repl, value='uncommon'),
    dtype='int',
)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,0,0,0,0,1,0,0,0,0,0,0,0,0
8124,0,0,0,0,1,0,0,0,0,0,0,0,0
8125,0,0,0,0,0,0,1,0,0,0,0,0,0
8126,0,0,0,0,0,0,0,0,0,1,0,0,0
