In [141]:
import pandas as pd 

df = pd.read_csv("cars.csv")
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [142]:
df["owner"].value_counts()

owner
First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: count, dtype: int64

In [143]:
df["brand"].value_counts()

brand
Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Land                6
Force               6
Isuzu               5
Ambassador          4
Kia                 4
MG                  3
Daewoo              3
Ashok               1
Opel                1
Peugeot             1
Name: count, dtype: int64

In [144]:
df["fuel"].value_counts()

fuel
Diesel    4402
Petrol    3631
CNG         57
LPG         38
Name: count, dtype: int64

# OHE USING PANDAS


<h3>ISSUE : Pandas do not maintain same OHE whenever we run the code that lead to great problem in ML</h3>

In [145]:
# one hot encoding using pandas
pd.get_dummies(df, columns=["fuel","owner"], drop_first=True)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,120000,130000,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,True,False,False,False,False
8124,Hyundai,119000,135000,True,False,False,True,False,False,False
8125,Maruti,120000,382000,True,False,False,False,False,False,False
8126,Tata,25000,290000,True,False,False,False,False,False,False


# OHE USING SKLEARN

In [146]:
df.shape

(8128, 5)

In [164]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,0:4], df["selling_price"], test_size=0.2, random_state=42)

In [165]:
X_train.head()

Unnamed: 0,brand,km_driven,fuel,owner
6518,Tata,2560,Petrol,First Owner
6144,Honda,80000,Petrol,Second Owner
6381,Hyundai,150000,Diesel,Fourth & Above Owner
438,Maruti,120000,Diesel,Second Owner
5939,Maruti,25000,Petrol,First Owner


In [189]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(drop='first', sparse_output=True, dtype='int64')
ohe.fit(X_train[["fuel","owner"]])

X_train_trans= ohe.transform(X_train[["fuel","owner"]]).toarray()   
X_test_trans= ohe.transform(X_test[["fuel","owner"]]).toarray()

X_train_trans = pd.DataFrame(X_train_trans, columns=ohe.get_feature_names_out(["fuel","owner"]))
X_test_trans = pd.DataFrame(X_test_trans, columns=ohe.get_feature_names_out(["fuel","owner"]))


In [190]:
X_train_trans.shape


(6502, 7)

In [191]:
X_test_trans.shape

(1626, 7)

In [192]:
X_train_trans.head()


Unnamed: 0,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,0,0,1,0,0,0,0
1,0,0,1,0,1,0,0
2,1,0,0,1,0,0,0
3,1,0,0,0,1,0,0
4,0,0,1,0,0,0,0


In [193]:
X_train[["brand", "km_driven"]]

Unnamed: 0,brand,km_driven
6518,Tata,2560
6144,Honda,80000
6381,Hyundai,150000
438,Maruti,120000
5939,Maruti,25000
...,...,...
5226,Mahindra,120000
5390,Maruti,80000
860,Hyundai,35000
7603,Maruti,27000


In [194]:
import numpy as np

In [195]:
# For train set: combine 'brand', 'km_driven' with OHE features
X_train_new=np.hstack((X_train[["brand", "km_driven"]].values, X_train_trans))
X_test_new=np.hstack((X_test[["brand", "km_driven"]].values, X_test_trans))

In [196]:
X_train_new

array([['Tata', 2560, 0, ..., 0, 0, 0],
       ['Honda', 80000, 0, ..., 1, 0, 0],
       ['Hyundai', 150000, 1, ..., 0, 0, 0],
       ...,
       ['Hyundai', 35000, 0, ..., 0, 0, 0],
       ['Maruti', 27000, 1, ..., 0, 0, 0],
       ['Maruti', 70000, 0, ..., 1, 0, 0]], dtype=object)

In [197]:
X_train_new = pd.DataFrame(X_train_new, columns=np.concatenate((["brand", "km_driven"], ohe.get_feature_names_out(["fuel","owner"]))))
X_test_new = pd.DataFrame(X_test_new, columns=np.concatenate((["brand", "km_driven"], ohe.get_feature_names_out(["fuel","owner"]))))

In [198]:
X_train_new.head()

Unnamed: 0,brand,km_driven,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Tata,2560,0,0,1,0,0,0,0
1,Honda,80000,0,0,1,0,1,0,0
2,Hyundai,150000,1,0,0,1,0,0,0
3,Maruti,120000,1,0,0,0,1,0,0
4,Maruti,25000,0,0,1,0,0,0,0


In [199]:
X_test_new.head()

Unnamed: 0,brand,km_driven,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Honda,110000,0,0,1,0,0,0,1
1,Tata,291977,1,0,0,0,0,0,0
2,Maruti,70000,1,0,0,0,0,0,0
3,Honda,120000,0,0,1,0,1,0,0
4,Maruti,69000,1,0,0,0,1,0,0


In [200]:
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


# HANDLE "LARGE VARIABLES COLOUM e.g. Brands"

In [None]:
X_train_new.head() 

Unnamed: 0,brand,km_driven,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Tata,2560,0,0,1,0,0,0,0
1,Honda,80000,0,0,1,0,1,0,0
2,Hyundai,150000,1,0,0,1,0,0,0
3,Maruti,120000,1,0,0,0,1,0,0
4,Maruti,25000,0,0,1,0,0,0,0


In [205]:
ohe = OneHotEncoder(drop='first', sparse_output=True, dtype='int64')
counts = df['brand'].value_counts()
df['brand'].nunique()
threshold = 100
repl = counts[counts <= threshold].index 
ohe.fit(X_train_new[["brand"]])
X_train_new['brand'] = X_train_new['brand'].replace(repl, 'Other')
X_test_new['brand'] = X_test_new['brand'].replace(repl, 'Other')
X_test_brand = ohe.transform(X_test_new[["brand"]]).toarray()
X_train_brand = ohe.transform(X_train_new[["brand"]]).toarray()

In [207]:
X_test_brand

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [208]:
# For train set: combine 'brand', 'km_driven' with OHE features
X_train_brandnew=np.hstack((X_train_new[["km_driven","fuel_Diesel"	,"fuel_LPG",	"fuel_Petrol",	"owner_Fourth & Above Owner",	"owner_Second Owner",	"owner_Test Drive Car",	"owner_Third Owner"]].values, X_train_brand))
X_test_brandnew=np.hstack((X_test_new[["km_driven","fuel_Diesel"	,"fuel_LPG",	"fuel_Petrol",	"owner_Fourth & Above Owner",	"owner_Second Owner",	"owner_Test Drive Car",	"owner_Third Owner"]].values, X_test_brand))

In [209]:
X_train_brandnew = pd.DataFrame(X_train_brandnew, columns=np.concatenate((["km_driven","fuel_Diesel"	,"fuel_LPG",	"fuel_Petrol",	"owner_Fourth & Above Owner",	"owner_Second Owner",	"owner_Test Drive Car",	"owner_Third Owner"], ohe.get_feature_names_out(["brand"]))))
X_test_brandnew = pd.DataFrame(X_test_brandnew, columns=np.concatenate((["km_driven","fuel_Diesel"	,"fuel_LPG",	"fuel_Petrol",	"owner_Fourth & Above Owner",	"owner_Second Owner",	"owner_Test Drive Car",	"owner_Third Owner"], ohe.get_feature_names_out(["brand"]))))


In [210]:
X_train_brandnew.head()

Unnamed: 0,km_driven,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner,brand_Chevrolet,brand_Ford,brand_Honda,brand_Hyundai,brand_Mahindra,brand_Maruti,brand_Other,brand_Renault,brand_Skoda,brand_Tata,brand_Toyota,brand_Volkswagen
0,2560,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,80000,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,150000,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,120000,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,25000,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [212]:
X_train_brandnew.shape

(6502, 20)

In [219]:
df.value_counts()

brand   km_driven  fuel    owner         selling_price
Lexus   20000      Petrol  First Owner   5150000          34
Jaguar  45000      Diesel  First Owner   3200000          34
Toyota  68089      Petrol  First Owner   2000000          32
Honda   56494      Petrol  First Owner   550000           32
        7032       Petrol  First Owner   779000           31
                                                          ..
Volvo   30000      Diesel  First Owner   2500000           1
                   Petrol  First Owner   10000000          1
        48000      Diesel  First Owner   1950000           1
        50000      Diesel  First Owner   5500000           1
        72500      Diesel  Second Owner  1200000           1
Name: count, Length: 6450, dtype: int64

In [220]:
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [231]:
X_train["brand"].value_counts()

brand
Maruti           1953
Hyundai          1127
Mahindra          635
Tata              586
Toyota            391
Honda             369
Ford              320
Chevrolet         185
Renault           183
Volkswagen        154
BMW                96
Skoda              82
Nissan             62
Jaguar             59
Volvo              54
Datsun             48
Mercedes-Benz      43
Fiat               35
Audi               30
Jeep               26
Lexus              22
Mitsubishi         13
Force               6
Land                5
Kia                 4
Daewoo              3
MG                  3
Ambassador          3
Isuzu               2
Ashok               1
Peugeot             1
Opel                1
Name: count, dtype: int64

In [232]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Identify rare brands
counts = df['brand'].value_counts()
threshold = 100
repl = counts[counts <= threshold].index

# Replace rare brands with 'Other' in both train and test sets
X_train['brand'] = X_train['brand'].replace(repl, 'Other')
X_test['brand'] = X_test['brand'].replace(repl, 'Other')

# Now apply ColumnTransformer
ct = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(drop='first', sparse_output=True, dtype='int64'), ['brand', 'fuel', 'owner'])
    ],
    remainder='passthrough'
)

ct.fit(X_train[["brand", "fuel", "owner"]])
X_Train = ct.transform(X_train[["brand", "fuel", "owner"]])
X_Test = ct.transform(X_test[["brand", "fuel", "owner"]])

In [233]:
X_Train

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 15124 stored elements and shape (6502, 19)>