In [200]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found in it.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/cars-dataset/cars.csv


# One Hot Encoding
![](https://www.researchgate.net/publication/344409939/figure/fig1/AS:940907041918978@1601341128930/An-example-of-one-hot-encoding.png)

<p>These resulting columns from OneHotEncoding are referred to as <em>dummy variables</em>. The creation of these dummy variables introduces multicollinearity, which is known as the <em>"dummy variable trap."</em> To avoid falling into this trap, we remove one column.</p>
<p>In Machine Learning, it is important to ensure that your columns do not depend on, or have mathematical relationships with, each other. Such dependence between columns is known as <em>multicollinearity</em>, and your columns should not exhibit it.</p>

To address multicollinearity, OneHotEncoding drops one of the dummy columns. If a column has 'n' categories, you will have 'n-1' columns left after OneHotEncoding.

#   Let's start coding

In [201]:
# Load the cars dataset from the Kaggle input directory and preview its first five rows.
CARS_CSV_PATH = "/kaggle/input/cars-dataset/cars.csv"
df = pd.read_csv(CARS_CSV_PATH)
df.head(n=5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [202]:
from sklearn.model_selection import train_test_split

# Features are every column except the target; the target is the selling price.
y = df['selling_price']
X = df.drop(columns=['selling_price'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

<h4>Handling high-cardinality categorical columns</h4>

In [203]:
# Categorical (object-dtype) feature columns of the training split.
cat_cols = [col for col in X_train.columns if X_train[col].dtype=="object"]

# High-cardinality categorical columns: more than 9 distinct values in the training split.
high_cat_cols = [col for col in cat_cols if X_train[col].nunique() > 9]

# Work on explicit copies: the frames returned by train_test_split are slices of `df`,
# and assigning into them directly can trigger pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_valid = X_valid.copy()

# In every high-cardinality column (e.g. brand), keep only the categories that occur more
# than `threshold` times in the training split and group all the others under 'uncommon'.
threshold = 100
for col in high_cat_cols:
    # Count each column on its own. DataFrame.value_counts() over several columns would
    # count *combinations* of values across the columns, not the values of each column.
    counts = X_train[col].value_counts()
    frequent = counts[counts > threshold].index

    # The set of frequent categories is learned from the training split only and then
    # applied to both splits, so a category that shows up only in X_valid is also mapped
    # to 'uncommon'. Missing values are left untouched.
    X_train[col] = X_train[col].where(X_train[col].isin(frequent) | X_train[col].isna(), 'uncommon')
    X_valid[col] = X_valid[col].where(X_valid[col].isin(frequent) | X_valid[col].isna(), 'uncommon')

<h4>One Hot Encoding using Pandas</h4>

In [204]:
ohe_cols = ['brand','fuel','owner']

# Pin each encoded column to the (sorted) categories observed in the training split.
# Calling get_dummies independently on each split is fragile: if the validation split is
# missing a category, or contains one the training split never saw, the two frames end up
# with different columns, and drop_first may even drop a different baseline category.
# With a shared categorical dtype both splits always produce the same dummy columns in
# the same order, and a category unseen in training is encoded as all zeros.
ohe_dtypes = {
    col: pd.CategoricalDtype(X_train[col].astype('category').cat.categories)
    for col in ohe_cols
}

# astype() returns new frames, so X_train / X_valid themselves are left unchanged.
X_train_encoded = pd.get_dummies(X_train.astype(ohe_dtypes), columns=ohe_cols, drop_first=True)
X_valid_encoded = pd.get_dummies(X_valid.astype(ohe_dtypes), columns=ohe_cols, drop_first=True)

<h4>One Hot Encoding using sklearn</h4>

In [205]:
from sklearn.preprocessing import OneHotEncoder

# handle_unknown='ignore' keeps transform() from failing when X_valid contains a category
# that was not present in X_train; drop='first' removes one dummy column per feature.
encoder_options = dict(handle_unknown='ignore', drop='first', dtype=np.int32)
try:
    # scikit-learn >= 1.2 spells the dense-output switch `sparse_output`.
    ohe = OneHotEncoder(sparse_output=False, **encoder_options)
except TypeError:
    # Older scikit-learn releases only accept the original `sparse` keyword.
    ohe = OneHotEncoder(sparse=False, **encoder_options)

# Fit on the training split only, then apply the same fitted encoder to the validation
# split. The encoder returns plain arrays, so the original row index and the generated
# feature names are attached while building the frames.
encoded_train = pd.DataFrame(
    ohe.fit_transform(X_train[ohe_cols]),
    index=X_train.index,
    columns=ohe.get_feature_names_out(),
)
encoded_valid = pd.DataFrame(
    ohe.transform(X_valid[ohe_cols]),
    index=X_valid.index,
    columns=ohe.get_feature_names_out(),
)

# Remaining (non-encoded) columns, joined back with the dummy columns row by row.
num_X_train = X_train.drop(columns=ohe_cols)
num_X_valid = X_valid.drop(columns=ohe_cols)
X_train_encoded2 = pd.concat([num_X_train, encoded_train], axis=1)
X_valid_encoded2 = pd.concat([num_X_valid, encoded_valid], axis=1)

# Ensure all columns have string type
X_train_encoded2.columns = X_train_encoded2.columns.astype(str)
X_valid_encoded2.columns = X_valid_encoded2.columns.astype(str)



<h4>Result</h4>

In [206]:
# Shapes of the raw splits, then of the pandas-encoded and sklearn-encoded splits.
for frame in (X_train, X_valid, X_train_encoded, X_valid_encoded, X_train_encoded2, X_valid_encoded2):
    print(frame.shape)

# For each encoding method, show the first rows of every split before and after encoding.
previews = [
    ("\n\nUSING PANDAS", [
        ("\nX_train \n", X_train),
        ("\nX_train_encoded \n", X_train_encoded),
        ("\nX_valid \n", X_valid),
        ("\nX_valid_encoded \n", X_valid_encoded),
    ]),
    ("\n\nUSING SKLEARN", [
        ("\nX_train \n", X_train),
        ("\nX_train_encoded2 \n", X_train_encoded2),
        ("\nX_valid \n", X_valid),
        ("\nX_valid_encoded2 \n", X_valid_encoded2),
    ]),
]
for banner, labelled_frames in previews:
    print(banner)
    for label, frame in labelled_frames:
        print(label, frame.head(5))

(6502, 4)
(1626, 4)
(6502, 18)
(1626, 18)
(6502, 18)
(1626, 18)


USING PANDAS

X_train 
         brand  km_driven    fuel                 owner
6518     Tata       2560  Petrol           First Owner
6144    Honda      80000  Petrol          Second Owner
6381  Hyundai     150000  Diesel  Fourth & Above Owner
438    Maruti     120000  Diesel          Second Owner
5939   Maruti      25000  Petrol           First Owner

X_train_encoded 
       km_driven  brand_Ford  brand_Honda  brand_Hyundai  brand_Mahindra  \
6518       2560           0            0              0               0   
6144      80000           0            1              0               0   
6381     150000           0            0              1               0   
438      120000           0            0              0               0   
5939      25000           0            0              0               0   

      brand_Maruti  brand_Renault  brand_Tata  brand_Toyota  brand_Volkswagen  \
6518             0           

<h5>
👇👇 How the large category columns were handled 👇👇 <br>
the less frequent ones were grouped into 'uncommon'
</h5>

In [207]:
# Number of rows per brand in the full dataset.
df['brand'].value_counts()

Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Force               6
Land                6
Isuzu               5
Kia                 4
Ambassador          4
Daewoo              3
MG                  3
Ashok               1
Opel                1
Peugeot             1
Name: brand, dtype: int64

In [208]:
# Number of rows per brand in the training split.
X_train['brand'].value_counts()

Maruti        1953
Hyundai       1127
Mahindra       635
uncommon       599
Tata           586
Toyota         391
Honda          369
Ford           320
Chevrolet      185
Renault        183
Volkswagen     154
Name: brand, dtype: int64