### Why One Hot Encoding

*Machine Learning* model dosent **support** characters to train the model, so we need to preprocess the character values

In [31]:
# importing the necessary modules
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

import warnings
warnings.filterwarnings('ignore')

In [20]:
# load the dataset
dataset = pd.read_csv('carprices.csv')
df = dataset
dataset

Unnamed: 0,car models,mileage,price,age
0,BMW X5,69000,18000,6
1,BMW X5,35000,34000,3
2,BMW X5,57000,26100,5
3,BMW X5,22500,40000,2
4,BMW X5,46000,31500,4
5,Audi A5,59000,29400,5
6,Audi A5,52000,32000,5
7,Audi A5,72000,19300,6
8,Audi A5,91000,12000,8
9,Mercedez Benz C class,67000,22000,6


## 3 methods the remove the characters

   1. Dummy variables
   2. Label encoder
   3. One Hot Encoding

# 1. Pandas Dummy Variables Creating the Model

In [7]:
car_model = pd.get_dummies(dataset['car models'])
car_model.drop(['Mercedez Benz C class'], axis=1, inplace=True)
car_model

Unnamed: 0,Audi A5,BMW X5
0,0,1
1,0,1
2,0,1
3,0,1
4,0,1
5,1,0
6,1,0
7,1,0
8,1,0
9,0,0


*concatenate these dummy variables in the original dataset and remove the alphabetical column* and one of the **dummies** column because it will tend to produce ***dummy variable trap***

In [8]:
dummy_dataset = dataset
dummy_dataset.drop('car models', inplace=True, axis=1)
dummy_dataset

Unnamed: 0,mileage,price,age
0,69000,18000,6
1,35000,34000,3
2,57000,26100,5
3,22500,40000,2
4,46000,31500,4
5,59000,29400,5
6,52000,32000,5
7,72000,19300,6
8,91000,12000,8
9,67000,22000,6


concatenating the dummy variables to the above dataset

In [9]:
dummy_dataset = pd.concat([dummy_dataset, car_model], axis=1)
dummy_dataset

Unnamed: 0,mileage,price,age,Audi A5,BMW X5
0,69000,18000,6,0,1
1,35000,34000,3,0,1
2,57000,26100,5,0,1
3,22500,40000,2,0,1
4,46000,31500,4,0,1
5,59000,29400,5,1,0
6,52000,32000,5,1,0
7,72000,19300,6,1,0
8,91000,12000,8,1,0
9,67000,22000,6,0,0


In [10]:
# splitting x and y
x = dummy_dataset.drop('price', axis=1)
y = dummy_dataset['price']

In [11]:
x

Unnamed: 0,mileage,age,Audi A5,BMW X5
0,69000,6,0,1
1,35000,3,0,1
2,57000,5,0,1
3,22500,2,0,1
4,46000,4,0,1
5,59000,5,1,0
6,52000,5,1,0
7,72000,6,1,0
8,91000,8,1,0
9,67000,6,0,0


In [12]:
# creating the model
model = LinearRegression()
model.fit(x, y)

### Question:-

   1. predict the benz model price with 4yrs and 45000 mileage
   2. predict the price for BMW when 7 years old and 86000 mileage

In [13]:
dummy_mileage = model.predict([[45000, 4, 0, 0]])
dummy_bmw = model.predict([[86000, 7, 1, 0]])

print(f"price for benz = {dummy_mileage}")
print(f"price for bmw = {dummy_bmw}")

price for benz = [36991.31721061]
price for bmw = [15365.40972059]


## 2. Label Encoder

In [15]:
le = LabelEncoder()

In [21]:
labelEncoder_dataset = df

In [22]:
labelEncoder_dataset['car models'] = le.fit_transform(labelEncoder_dataset['car models'])
labelEncoder_dataset

Unnamed: 0,car models,mileage,price,age
0,1,69000,18000,6
1,1,35000,34000,3
2,1,57000,26100,5
3,1,22500,40000,2
4,1,46000,31500,4
5,0,59000,29400,5
6,0,52000,32000,5
7,0,72000,19300,6
8,0,91000,12000,8
9,2,67000,22000,6


In [23]:
x = labelEncoder_dataset.drop(['price'], axis=1)
y = labelEncoder_dataset['price']

x

Unnamed: 0,car models,mileage,age
0,1,69000,6
1,1,35000,3
2,1,57000,5
3,1,22500,2
4,1,46000,4
5,0,59000,5
6,0,52000,5
7,0,72000,6
8,0,91000,8
9,2,67000,6


In [24]:
model = LinearRegression()
model.fit(x, y)

### Question:-

   1. predict the benz model price with 4yrs and 45000 mileage
   2. predict the price for BMW when 7 years old and 86000 mileage

In [25]:
LinearEncoder_mileage = model.predict([[2, 45000, 4]])
LinearEncoder_bmw = model.predict([[0, 86000, 7]])

print(f"price for benz = {LinearEncoder_mileage}")
print(f"price for bmw = {LinearEncoder_bmw}")

price for benz = [33336.7721827]
price for bmw = [15005.0687045]


## 3. One Hot Encoding

In [26]:
x

Unnamed: 0,car models,mileage,age
0,1,69000,6
1,1,35000,3
2,1,57000,5
3,1,22500,2
4,1,46000,4
5,0,59000,5
6,0,52000,5
7,0,72000,6
8,0,91000,8
9,2,67000,6


In [28]:
x = x[['car models', 'mileage', 'age']].values
x

array([[    1, 69000,     6],
       [    1, 35000,     3],
       [    1, 57000,     5],
       [    1, 22500,     2],
       [    1, 46000,     4],
       [    0, 59000,     5],
       [    0, 52000,     5],
       [    0, 72000,     6],
       [    0, 91000,     8],
       [    2, 67000,     6],
       [    2, 83000,     7],
       [    2, 79000,     7],
       [    2, 59000,     5]], dtype=int64)

In [30]:
y = y.values
y

array([18000, 34000, 26100, 40000, 31500, 29400, 32000, 19300, 12000,
       22000, 20000, 21000, 33000], dtype=int64)

### since x and y are generated

In [32]:
column_transfer = ColumnTransformer([('car models', OneHotEncoder(), [0])], remainder='passthrough')

In [34]:
column_transfer

In [35]:
x = column_transfer.fit_transform(x)
x

array([[0.00e+00, 1.00e+00, 0.00e+00, 6.90e+04, 6.00e+00],
       [0.00e+00, 1.00e+00, 0.00e+00, 3.50e+04, 3.00e+00],
       [0.00e+00, 1.00e+00, 0.00e+00, 5.70e+04, 5.00e+00],
       [0.00e+00, 1.00e+00, 0.00e+00, 2.25e+04, 2.00e+00],
       [0.00e+00, 1.00e+00, 0.00e+00, 4.60e+04, 4.00e+00],
       [1.00e+00, 0.00e+00, 0.00e+00, 5.90e+04, 5.00e+00],
       [1.00e+00, 0.00e+00, 0.00e+00, 5.20e+04, 5.00e+00],
       [1.00e+00, 0.00e+00, 0.00e+00, 7.20e+04, 6.00e+00],
       [1.00e+00, 0.00e+00, 0.00e+00, 9.10e+04, 8.00e+00],
       [0.00e+00, 0.00e+00, 1.00e+00, 6.70e+04, 6.00e+00],
       [0.00e+00, 0.00e+00, 1.00e+00, 8.30e+04, 7.00e+00],
       [0.00e+00, 0.00e+00, 1.00e+00, 7.90e+04, 7.00e+00],
       [0.00e+00, 0.00e+00, 1.00e+00, 5.90e+04, 5.00e+00]])

x is similar to dummy variables in pandas

*0th* : BMW <br>
*1st* : Audi  <br>
*3rd* : Benz <br>

In [36]:
y

array([18000, 34000, 26100, 40000, 31500, 29400, 32000, 19300, 12000,
       22000, 20000, 21000, 33000], dtype=int64)

In [37]:
model = LinearRegression()
model.fit(x, y)

### Question:-

   1. predict the benz model price with 4yrs and 45000 mileage
   2. predict the price for BMW when 7 years old and 86000 mileage

In [39]:
OneHotEncoder_mileage = model.predict([[0, 0, 1, 45000, 4]])
OneHotEncoder_bmw = model.predict([[1, 0, 0, 86000, 7]])

print(f"price for benz = {OneHotEncoder_mileage}")
print(f"price for bmw = {OneHotEncoder_bmw}")

price for benz = [36991.31721061]
price for bmw = [15365.4097206]
