In [74]:
url = 'https://raw.githubusercontent.com/codebasics/py/master/ML/5_one_hot_encoding/Exercise/carprices.csv'
import pandas as pd
import numpy as np

In [75]:
# df.to_csv('carprices.csv', index=False)
df = pd.read_csv('carprices.csv')
df

Unnamed: 0,Car Model,Mileage,Sell Price($),Age(yrs)
0,BMW X5,69000,18000,6
1,BMW X5,35000,34000,3
2,BMW X5,57000,26100,5
3,BMW X5,22500,40000,2
4,BMW X5,46000,31500,4
5,Audi A5,59000,29400,5
6,Audi A5,52000,32000,5
7,Audi A5,72000,19300,6
8,Audi A5,91000,12000,8
9,Mercedez Benz C class,67000,22000,6


In [76]:
df['Car Model'].unique()

array(['BMW X5', 'Audi A5', 'Mercedez Benz C class'], dtype=object)

In [77]:
# use of dummy variables
dummies = pd.get_dummies(df['Car Model'])
merged = pd.concat([df, dummies], axis=1)
# drop original col `Car Model` and dummy-col `Mercedez Benz C class`
final = merged.drop(['Car Model', 'Mercedez Benz C class'], axis='columns')
final

Unnamed: 0,Mileage,Sell Price($),Age(yrs),Audi A5,BMW X5
0,69000,18000,6,0,1
1,35000,34000,3,0,1
2,57000,26100,5,0,1
3,22500,40000,2,0,1
4,46000,31500,4,0,1
5,59000,29400,5,1,0
6,52000,32000,5,1,0
7,72000,19300,6,1,0
8,91000,12000,8,1,0
9,67000,22000,6,0,0


In [78]:
# Now, we know `final` dataframe covers all models.
# `BMW X5`, `Audi A5` and `Mercedez Benz C class`
# when all dummy-cols have value '0', row represents `Mercedez Benz C class`
# all are independent variables except `Sell Price($)` -> X

X = final.drop(['Sell Price($)'], axis='columns')
# dependent variable-> y
y = final['Sell Price($)']

In [79]:
# use sklearn model for Linear Regression
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model

LinearRegression()

In [80]:
#independent-> X; dependent-> y
model.fit(X, y) #independent, dependent

LinearRegression()

In [81]:
model.predict(X)

array([18705.2723644 , 35286.78445645, 24479.19112468, 41245.76426391,
       29882.98779056, 28023.6135243 , 30614.46818502, 21879.57266964,
       12182.34562104, 26183.72387884, 18929.31674102, 20409.80511857,
       30477.15426156])

In [82]:
# accuracy
model.score(X, y)

0.9417050937281083

In [83]:
# mileaga 45000, 4 yr old, Mercedez 
model.predict([[45000,4,0,0]]) #->array([34537.77647335])

array([36991.31721061])

In [84]:
#Price of BMW X5 that is 7 yr old with mileage 86000
model.predict([[86000, 7, 0,0]]) #->array([17818.95045785])


array([17818.95045785])

In [85]:
# now, we save the model into pickle file
import pickle
with open('model_pickle', 'wb') as f:
    pickle.dump(model, f)

In [86]:
# now use the model from pickle
with open('model_pickle', 'rb') as f:
    mp = pickle.load(f)

In [87]:
# prediction using pickle-dumped model
mp.predict([[45000,4,0,0]])

array([36991.31721061])

In [88]:
# second way of saving ->`joblib`
import joblib

In [89]:
joblib.dump(model, 'model_joblib')

['model_joblib']

In [90]:
mj = joblib.load('model_joblib')

In [91]:
#Price of BMW X5 that is 7 yr old with mileage 86000
mj.predict([[86000, 7, 0,0]])

array([17818.95045785])

In [92]:
# checking different values in different methods
model.coef_

array([-3.70122094e-01, -1.33245363e+03, -2.45354074e+03, -6.73820733e+03])

In [93]:
mp.coef_

array([-3.70122094e-01, -1.33245363e+03, -2.45354074e+03, -6.73820733e+03])

In [94]:
mj.coef_

array([-3.70122094e-01, -1.33245363e+03, -2.45354074e+03, -6.73820733e+03])

In [95]:
model.intercept_

58976.625968537235

In [96]:
mp.intercept_

58976.625968537235

In [97]:
mj.intercept_

58976.625968537235

In [98]:
# use of one_hot_encoding
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [99]:
#independent-> X; dependent-> y
model.fit(X, y) #independent, dependent

LinearRegression()

In [100]:
# `car model` column gets transformed into numeric
dfle = df
dfle['Car Model'] = le.fit_transform(dfle['Car Model'])
dfle
# `BMW X5`-> 1, `Audi A5`-> 0, `Mercedez Benz C class`-> 2

Unnamed: 0,Car Model,Mileage,Sell Price($),Age(yrs)
0,1,69000,18000,6
1,1,35000,34000,3
2,1,57000,26100,5
3,1,22500,40000,2
4,1,46000,31500,4
5,0,59000,29400,5
6,0,52000,32000,5
7,0,72000,19300,6
8,0,91000,12000,8
9,2,67000,22000,6


In [101]:
# training
X = dfle[['Car Model', 'Mileage', 'Age(yrs)']].values
X

array([[    1, 69000,     6],
       [    1, 35000,     3],
       [    1, 57000,     5],
       [    1, 22500,     2],
       [    1, 46000,     4],
       [    0, 59000,     5],
       [    0, 52000,     5],
       [    0, 72000,     6],
       [    0, 91000,     8],
       [    2, 67000,     6],
       [    2, 83000,     7],
       [    2, 79000,     7],
       [    2, 59000,     5]])

In [102]:
y = dfle['Sell Price($)'].values
y

array([18000, 34000, 26100, 40000, 31500, 29400, 32000, 19300, 12000,
       22000, 20000, 21000, 33000])

In [103]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer([('Car Model', OneHotEncoder(), [0])], remainder = 'passthrough')

In [104]:
X = ct.fit_transform(X)  #col0->Audi, col1->BMW, col2->Mercedez 
#represented as binary
X

array([[0.00e+00, 1.00e+00, 0.00e+00, 6.90e+04, 6.00e+00],
       [0.00e+00, 1.00e+00, 0.00e+00, 3.50e+04, 3.00e+00],
       [0.00e+00, 1.00e+00, 0.00e+00, 5.70e+04, 5.00e+00],
       [0.00e+00, 1.00e+00, 0.00e+00, 2.25e+04, 2.00e+00],
       [0.00e+00, 1.00e+00, 0.00e+00, 4.60e+04, 4.00e+00],
       [1.00e+00, 0.00e+00, 0.00e+00, 5.90e+04, 5.00e+00],
       [1.00e+00, 0.00e+00, 0.00e+00, 5.20e+04, 5.00e+00],
       [1.00e+00, 0.00e+00, 0.00e+00, 7.20e+04, 6.00e+00],
       [1.00e+00, 0.00e+00, 0.00e+00, 9.10e+04, 8.00e+00],
       [0.00e+00, 0.00e+00, 1.00e+00, 6.70e+04, 6.00e+00],
       [0.00e+00, 0.00e+00, 1.00e+00, 8.30e+04, 7.00e+00],
       [0.00e+00, 0.00e+00, 1.00e+00, 7.90e+04, 7.00e+00],
       [0.00e+00, 0.00e+00, 1.00e+00, 5.90e+04, 5.00e+00]])

In [105]:
X = X[:, 1:]
print(X.shape)
X

(13, 4)


array([[1.00e+00, 0.00e+00, 6.90e+04, 6.00e+00],
       [1.00e+00, 0.00e+00, 3.50e+04, 3.00e+00],
       [1.00e+00, 0.00e+00, 5.70e+04, 5.00e+00],
       [1.00e+00, 0.00e+00, 2.25e+04, 2.00e+00],
       [1.00e+00, 0.00e+00, 4.60e+04, 4.00e+00],
       [0.00e+00, 0.00e+00, 5.90e+04, 5.00e+00],
       [0.00e+00, 0.00e+00, 5.20e+04, 5.00e+00],
       [0.00e+00, 0.00e+00, 7.20e+04, 6.00e+00],
       [0.00e+00, 0.00e+00, 9.10e+04, 8.00e+00],
       [0.00e+00, 1.00e+00, 6.70e+04, 6.00e+00],
       [0.00e+00, 1.00e+00, 8.30e+04, 7.00e+00],
       [0.00e+00, 1.00e+00, 7.90e+04, 7.00e+00],
       [0.00e+00, 1.00e+00, 5.90e+04, 5.00e+00]])

In [106]:
model.fit(X, y)

LinearRegression()

In [107]:
#col0->Audi, col1->BMW, col2->Mercedez for X
#represented as binary
#mileaga 45000, 4 yr old, Mercedez 
#X = dfle[['Car Model', 'Mileage', 'Age(yrs)']].values

model.predict([[0,0,45000,4]]) #->array([36991.31721061])

array([34537.77647335])

In [108]:
#Price of BMW X5 that is 7 yr old with mileage 86000
# model.predict([[86000, 7, 0,0]]) #->array([17818.95045785])
model.predict([[0,0,86000,7]])

array([15365.40972059])