<a href="https://colab.research.google.com/github/ayushsyntax/ML_Journey/blob/main/F_E(One_Hot_Encoding).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

In [3]:
# Load the cars dataset from a CSV file in the working directory.
df  = pd.read_csv('cars.csv')


In [4]:
# Preview the first rows to see which columns are categorical (brand, fuel, owner).
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [5]:
# Frequency of each 'owner' category; the distribution is heavily skewed
# ('Test Drive Car' appears only a handful of times).
df['owner'].value_counts()

Unnamed: 0_level_0,count
owner,Unnamed: 1_level_1
First Owner,5289
Second Owner,2105
Third Owner,555
Fourth & Above Owner,174
Test Drive Car,5


### Why Use One-Hot Encoding?
**Avoids ordinal assumptions:**
Unlike label encoding, one-hot encoding does not assign integer codes (e.g., 0, 1, 2) that imply an ordinal relationship between categories.
Example: if gender is encoded as 0 for "Female" and 1 for "Male", a machine learning model might incorrectly assume that "Male" is greater than "Female".

**Suitable for algorithms:**
Many machine learning algorithms (e.g., linear regression, logistic regression, neural networks) require numerical input. One-hot encoding converts categorical data into a numerical format without imposing an artificial order on the categories.

**Improves model performance:**
Because each category is represented by its own binary feature, the model can learn a separate effect for every category instead of one weight shared across arbitrary integer codes, which usually leads to more accurate predictions.

## 1. ONE-HOT ENCODING USING PANDAS

In [8]:
# Expand 'fuel' and 'owner' into one indicator column per category (K columns
# each); every other column is passed through unchanged.
pd.get_dummies(df, columns=['fuel', 'owner'])

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,False,True,False,False,True,False,False,False,False
1,Skoda,120000,370000,False,True,False,False,False,False,True,False,False
2,Honda,140000,158000,False,False,False,True,False,False,False,False,True
3,Hyundai,127000,225000,False,True,False,False,True,False,False,False,False
4,Maruti,120000,130000,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,False,True,True,False,False,False,False
8124,Hyundai,119000,135000,False,True,False,False,False,True,False,False,False
8125,Maruti,120000,382000,False,True,False,False,True,False,False,False,False
8126,Tata,25000,290000,False,True,False,False,True,False,False,False,False


## 2. K-1 ONE-HOT ENCODING

In [9]:
# K-1 encoding: drop_first=True removes the first category of each feature
# (here fuel_CNG and owner_First Owner), which then acts as the all-False baseline.
pd.get_dummies(df, columns=['fuel', 'owner'], drop_first=True)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,120000,130000,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,True,False,False,False,False
8124,Hyundai,119000,135000,True,False,False,True,False,False,False
8125,Maruti,120000,382000,True,False,False,False,False,False,False
8126,Tata,25000,290000,True,False,False,False,False,False,False


## 3. ONE-HOT ENCODING USING SKLEARN

In [21]:
from sklearn.model_selection import train_test_split

# Features are the first four columns (brand, km_driven, fuel, owner); the
# target is the last column. 20% of the rows are held out, with a fixed seed
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, 0:4],
    df.iloc[:, -1],
    test_size=0.2,
    random_state=2,
)


In [22]:
# Preview the training features; the target column is held separately in y_train.
X_train.head()


Unnamed: 0,brand,km_driven,fuel,owner
5571,Hyundai,35000,Diesel,First Owner
2038,Jeep,60000,Diesel,First Owner
2957,Hyundai,25000,Petrol,First Owner
7618,Mahindra,130000,Diesel,Second Owner
6684,Hyundai,155000,Diesel,First Owner


In [23]:
from sklearn.preprocessing import OneHotEncoder


In [12]:
# NOTE(review): this cell repeats the X_train preview shown earlier and carries
# an out-of-order execution count (In [12]) — candidate for removal.
X_train.head()

Unnamed: 0,brand,km_driven,fuel,owner
5571,Hyundai,35000,Diesel,First Owner
2038,Jeep,60000,Diesel,First Owner
2957,Hyundai,25000,Petrol,First Owner
7618,Mahindra,130000,Diesel,Second Owner
6684,Hyundai,155000,Diesel,First Owner


In [13]:
from sklearn.preprocessing import OneHotEncoder

In [23]:
# drop='first' yields K-1 columns per feature; the result is a dense int32
# array rather than a sparse matrix.
# `sparse_output` is the current name of the old `sparse` keyword (renamed in
# scikit-learn 1.2, removed in 1.4) and matches the encoder built later in
# this notebook.
ohe = OneHotEncoder(drop='first', sparse_output=False, dtype=np.int32)


In [24]:
# Learn the category set from the training split only, then encode it.
X_train_new = ohe.fit_transform(X_train[['fuel','owner']])


In [25]:
# Encode the test split with the categories learned from the training split;
# the encoder is not refit here.
# NOTE(review): a category that never appeared in the training split would
# presumably raise under the encoder's default settings — confirm, given how
# rare 'Test Drive Car' is.
X_test_new = ohe.transform(X_test[['fuel','owner']])


In [26]:
# Expect 7 columns: (4 fuel categories - 1) + (5 owner categories - 1),
# because drop='first' removes one category per feature.
X_train_new.shape


(6502, 7)

## 4. ONE-HOT ENCODING WITH TOP CATEGORIES

In [17]:
# Number of rows per brand; used below to decide which brands are too rare
# to deserve their own dummy column.
counts = df['brand'].value_counts()


In [18]:
# A brand keeps its own dummy column only if it has more than this many rows.
threshold = 100

# Number of distinct brands. Kept as the last expression so the notebook
# actually displays it (previously the value was computed and discarded).
df['brand'].nunique()


In [19]:
# Labels of the brands that occur at most `threshold` times.
repl = counts[counts <= threshold].index


In [20]:
# Merge every rare brand into a single 'uncommon' bucket before encoding, so
# only frequent brands get their own column. A fixed random_state keeps the
# displayed sample identical across Restart & Run All.
pd.get_dummies(df['brand'].replace(repl, 'uncommon')).sample(5, random_state=2)


Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
3321,False,False,False,False,False,False,False,False,False,False,False,False,True
1956,False,False,False,False,False,False,False,False,False,False,True,False,False
6753,False,False,False,True,False,False,False,False,False,False,False,False,False
7014,False,False,False,False,True,False,False,False,False,False,False,False,False
298,False,False,False,False,True,False,False,False,False,False,False,False,False


## IN STEP FORM

In [31]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Step 1: Load the dataset
data = pd.read_csv('cars.csv')

# Step 2: Separate categorical and numerical columns
categorical_columns = ['brand', 'fuel', 'owner']
numerical_columns = ['km_driven', 'selling_price']

# Step 3: Initialize OneHotEncoder
# drop=None keeps all K columns per feature; set drop='first' for K-1 encoding.
# sparse_output=False returns a dense array that can be wrapped in a DataFrame.
encoder = OneHotEncoder(drop=None, sparse_output=False)

# Step 4: Fit and transform the categorical columns
encoded_categorical = encoder.fit_transform(data[categorical_columns])

# Step 5: Create a DataFrame for the encoded data
# Reuse data's index so the concat in Step 6 aligns row-for-row even if `data`
# is ever filtered or re-indexed before encoding.
encoded_df = pd.DataFrame(
    encoded_categorical,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=data.index,
)

# Step 6: Combine the encoded data with numerical columns
final_data = pd.concat([data[numerical_columns], encoded_df], axis=1)

# Step 7: Display the final dataset
print(final_data.head())

   km_driven  selling_price  brand_Ambassador  brand_Ashok  brand_Audi  \
0     145500         450000               0.0          0.0         0.0   
1     120000         370000               0.0          0.0         0.0   
2     140000         158000               0.0          0.0         0.0   
3     127000         225000               0.0          0.0         0.0   
4     120000         130000               0.0          0.0         0.0   

   brand_BMW  brand_Chevrolet  brand_Daewoo  brand_Datsun  brand_Fiat  ...  \
0        0.0              0.0           0.0           0.0         0.0  ...   
1        0.0              0.0           0.0           0.0         0.0  ...   
2        0.0              0.0           0.0           0.0         0.0  ...   
3        0.0              0.0           0.0           0.0         0.0  ...   
4        0.0              0.0           0.0           0.0         0.0  ...   

   brand_Volvo  fuel_CNG  fuel_Diesel  fuel_LPG  fuel_Petrol  \
0          0.0       0

In [32]:
import pandas as pd

# 1) Read the raw data.
data = pd.read_csv('cars.csv')

# 2) Expand each categorical column into one indicator column per category;
#    drop_first=False keeps all K columns (no reference category is removed).
encoded_data = pd.get_dummies(
    data,
    columns=['brand', 'fuel', 'owner'],
    drop_first=False,
)

# 3) Show the first rows of the encoded frame.
print(encoded_data.head())

   km_driven  selling_price  brand_Ambassador  brand_Ashok  brand_Audi  \
0     145500         450000             False        False       False   
1     120000         370000             False        False       False   
2     140000         158000             False        False       False   
3     127000         225000             False        False       False   
4     120000         130000             False        False       False   

   brand_BMW  brand_Chevrolet  brand_Daewoo  brand_Datsun  brand_Fiat  ...  \
0      False            False         False         False       False  ...   
1      False            False         False         False       False  ...   
2      False            False         False         False       False  ...   
3      False            False         False         False       False  ...   
4      False            False         False         False       False  ...   

   brand_Volvo  fuel_CNG  fuel_Diesel  fuel_LPG  fuel_Petrol  \
0        False     Fal

## One-Hot Encoding

One-hot encoding converts each categorical variable into a set of binary (0/1) columns, one per category, so that the data can be used as input to machine learning models.


## Key Points

- **Purpose:** Transforms categorical data into a numerical format.
- **Output:** One binary (0 or 1) column for each category.
- **Avoids order bias:** Ensures no implicit ranking is imposed on the categories.
- **Common methods:**
  - `pd.get_dummies()` (pandas)
  - `OneHotEncoder` (scikit-learn)
- **High dimensionality:** Can increase the feature count significantly, especially for high-cardinality columns such as `brand`.