In [1]:
import pandas as pd
import numpy as np

In [19]:
df = pd.read_csv("cars.csv")

In [9]:
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [23]:
# df['brand'].value_counts()
# df['fuel'].value_counts()
df['owner'].value_counts()

owner
First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: count, dtype: int64

**1. OneHotEncoding using Pandas**

In [29]:
pd.get_dummies(df,columns=['fuel','owner'])
# In this case, it will create dummy variables for the 'fuel' and 'owner' columns.

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,False,True,False,False,True,False,False,False,False
1,Skoda,120000,370000,False,True,False,False,False,False,True,False,False
2,Honda,140000,158000,False,False,False,True,False,False,False,False,True
3,Hyundai,127000,225000,False,True,False,False,True,False,False,False,False
4,Maruti,120000,130000,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,False,True,True,False,False,False,False
8124,Hyundai,119000,135000,False,True,False,False,False,True,False,False,False
8125,Maruti,120000,382000,False,True,False,False,True,False,False,False,False
8126,Tata,25000,290000,False,True,False,False,True,False,False,False,False


**2.K-1 OneHot Encoding**

In [44]:
pd.get_dummies(df,columns=['fuel','owner'],drop_first=True)
# It takes N-1 variable in each(fuel,owner) column.

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,120000,130000,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,True,False,False,False,False
8124,Hyundai,119000,135000,True,False,False,True,False,False,False
8125,Maruti,120000,382000,True,False,False,False,False,False,False
8126,Tata,25000,290000,True,False,False,False,False,False,False


**3.OneHotEncoding Using Sklearn**

In [47]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(df.iloc[:,0:4],
                                df.iloc[:,-1],test_size=0.2,random_state=2)
# here i give 0.8% data to training set and 0.2% of data test set

In [74]:
X_train

Unnamed: 0,brand,km_driven,fuel,owner
5571,Hyundai,35000,Diesel,First Owner
2038,Jeep,60000,Diesel,First Owner
2957,Hyundai,25000,Petrol,First Owner
7618,Mahindra,130000,Diesel,Second Owner
6684,Hyundai,155000,Diesel,First Owner
...,...,...,...,...
3606,Ford,35000,Diesel,First Owner
5704,Maruti,120000,Petrol,First Owner
6637,Tata,15000,Petrol,First Owner
2575,Maruti,32500,Diesel,Second Owner


In [72]:
Y_train

5571    1150000
2038    1689999
2957     580000
7618     150000
6684     320000
         ...   
3606     620000
5704     335000
6637     450000
2575     651000
7336    1160000
Name: selling_price, Length: 6502, dtype: int64

In [63]:
from sklearn.preprocessing import OneHotEncoder

In [122]:
ohe =  OneHotEncoder(drop='first',sparse=False,dtype=np.int32)
# the sparse parameter determines the format of the output.

**sparse=False**: This means that the output will be a dense array. A dense array is a regular NumPy array where all elements are stored, including zeros. This is useful when you want to work with the encoded data directly in a format that is easy to manipulate and understand.

**sparse=True**: If this were set to True, the output would be a sparse matrix. A sparse matrix is a more memory-efficient way to store data when most of the elements are zeros. This can be beneficial when dealing with large datasets with many categorical variables, as it saves memory and computational resources.

In summary, setting sparse=False ensures that the one-hot encoded data is returned as a dense array, which is easier to work with for many machine learning tasks.

In [124]:
X_train_new = ohe.fit_transform(X_train[['fuel','owner']])

In [126]:
X_train_new.shape

(6502, 7)

In [128]:
X_test_new = ohe.transform(X_test[['fuel','owner']])

In [130]:
np.hstack((X_train[['brand','km_driven']].values,X_train_new))      #????

ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 2 dimension(s) and the array at index 1 has 1 dimension(s)

**4.ONeHotEncoding with Top Categories**

In [None]:
counts = df['brand'].value_counts()

In [137]:
df['brand'].nunique()
threshold = 100

In [141]:
repl = counts[counts<=threshold].index

In [147]:
pd.get_dummies(df['brand'].replace(repl,'uncommon')).sample(5)
# Here False=0 , True=1

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
3127,False,False,False,False,True,False,False,False,False,False,False,False,False
4939,False,False,False,False,False,False,True,False,False,False,False,False,False
2117,False,False,False,False,False,False,False,False,False,False,True,False,False
3761,False,False,False,False,False,False,True,False,False,False,False,False,False
214,False,False,False,False,False,False,True,False,False,False,False,False,False
