In [39]:
import pandas as pd
from sklearn import linear_model
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [18]:
# Load the home-price sheet (columns: town, area, price) and display it.
HOME_PRICES_XLSX = 'dataset/homeprices3.xlsx'
df = pd.read_excel(HOME_PRICES_XLSX)
df

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


In [19]:
# Build one indicator (dummy) column per distinct town and append those
# columns to the right of the original frame.
dummies = pd.get_dummies(df['town'])
merged = pd.concat([df, dummies], axis=1)
merged

Unnamed: 0,town,area,price,monroe township,robinsville,west windsor
0,monroe township,2600,550000,1,0,0
1,monroe township,3000,565000,1,0,0
2,monroe township,3200,610000,1,0,0
3,monroe township,3600,680000,1,0,0
4,monroe township,4000,725000,1,0,0
5,west windsor,2600,585000,0,0,1
6,west windsor,2800,615000,0,0,1
7,west windsor,3300,650000,0,0,1
8,west windsor,3600,710000,0,0,1
9,robinsville,2600,575000,0,1,0


In [20]:
# Drop the raw text column plus one dummy ('west windsor'); west windsor rows
# are then represented by zeros in both remaining dummy columns.
final = merged.drop(columns=['town', 'west windsor'])
final

Unnamed: 0,area,price,monroe township,robinsville
0,2600,550000,1,0
1,3000,565000,1,0
2,3200,610000,1,0
3,3600,680000,1,0
4,4000,725000,1,0
5,2600,585000,0,0
6,2800,615000,0,0
7,3300,650000,0,0
8,3600,710000,0,0
9,2600,575000,0,1


In [21]:
# Target is the 'price' column; the features are every other column of `final`.
model = linear_model.LinearRegression()
y = final['price']
X = final.drop(columns='price')
model.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [22]:
## Multivariate form of y = m*x + b (one coefficient per feature column):
## price = coef[0]*area + coef[1]*monroe township + coef[2]*robinsville + intercept

print(f" Coefficient : {model.coef_}")
print(f" Intercept : {model.intercept_}")

# Feature order must match X: [area, monroe township, robinsville].
# Input1 has the robinsville dummy set; Input2 has both dummies at 0,
# which is how west windsor rows are encoded.
new_area, inp1_1, inp1_2 = 2800, 0, 1
pred_input1 = model.predict([[new_area, inp1_1, inp1_2]])
print(f" Prediction Input1 : {pred_input1}")

new_area, inp2_1, inp2_2 = 3400, 0, 0
pred_input2 = model.predict([[new_area, inp2_1, inp2_2]])
print(f" Prediction Input2 : {pred_input2}")

 Coefficient : [   126.89744141 -40013.97548914 -14327.56396474]
 Intercept : 249790.36766292533
 Prediction Input1 : [590775.63964739]
 Prediction Input2 : [681241.66845839]


In [27]:
# Score the fitted model on the same X, y it was trained on.
# NOTE: scikit-learn documents LinearRegression.score() as the R^2 coefficient
# of determination, so the percentage printed below is R^2 * 100 measured on
# the training data, not a held-out accuracy.
accuracy = model.score(X, y)
accuracy_pct = round(accuracy * 100, 2)
print(f"Accuracy of this Trained Model is {accuracy_pct} %")

Accuracy of this Trained Model is 95.74 %


In [56]:
## One Hot Encoding -- step 1: label-encode the town names as integer codes
lab_enc = LabelEncoder()

# Work on a copy. A plain `df_lab_enc = df` only creates a second name for the
# same frame, so the assignment below would overwrite the town names inside
# `df` itself and the earlier get_dummies / drop cells would no longer work
# when re-run.
df_lab_enc = df.copy()
df_lab_enc['town'] = lab_enc.fit_transform(df_lab_enc['town'])
df_lab_enc

Unnamed: 0,town,area,price
0,0,2600,550000
1,0,3000,565000
2,0,3200,610000
3,0,3600,680000
4,0,4000,725000
5,2,2600,585000
6,2,2800,615000
7,2,3300,650000
8,2,3600,710000
9,1,2600,575000


In [57]:
# Feature matrix as a plain array: column 0 = town code, column 1 = area.
y = df_lab_enc['price']
X = df_lab_enc[['town', 'area']].values

In [58]:
# One-hot encode column 0 of X (the integer town code) and pass the remaining
# column (area) through unchanged.
#
# OneHotEncoder's `categorical_features` argument is deprecated/removed in
# scikit-learn; selecting columns is done with ColumnTransformer instead.
#   - categories='auto'       : derive the categories from the data
#   - remainder='passthrough' : keep non-encoded columns, appended after the
#                               encoded ones (same column order as before)
#   - sparse_threshold=0      : always return a dense array, so no .toarray()
onehot_enc = ColumnTransformer(
    [('town', OneHotEncoder(categories='auto'), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = onehot_enc.fit_transform(X)
X

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[1.0e+00, 0.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.0e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.2e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 4.0e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.8e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.3e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.9e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.1e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.6e+03]])

In [59]:
# Remove the first one-hot column (town code 0). The remaining columns are
# [town code 1, town code 2, area]; town code 0 becomes the all-zeros case.
X = X[:, 1:]
X

array([[0.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 3.0e+03],
       [0.0e+00, 0.0e+00, 3.2e+03],
       [0.0e+00, 0.0e+00, 3.6e+03],
       [0.0e+00, 0.0e+00, 4.0e+03],
       [0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 2.8e+03],
       [0.0e+00, 1.0e+00, 3.3e+03],
       [0.0e+00, 1.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 2.9e+03],
       [1.0e+00, 0.0e+00, 3.1e+03],
       [1.0e+00, 0.0e+00, 3.6e+03]])

In [60]:
# Re-fit `model` on the one-hot encoded array X (two dummy columns + area).
model.fit(X=X, y=y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [64]:
# Feature order is [robinsville, west windsor, area]; a 1 selects that town,
# and both flags at 0 would mean the dropped first category (town code 0).
inp1_1, inp1_2, area = 1, 0, 2800   # robinsville
pred_input1 = model.predict([[inp1_1, inp1_2, area]])
print(f" Prediction Input1 : {pred_input1}")

inp2_1, inp2_2, area = 0, 1, 3400   # west windsor
pred_input2 = model.predict([[inp2_1, inp2_2, area]])
print(f" Prediction Input2 : {pred_input2}")

 Prediction Input1 : [590775.63964739]
 Prediction Input2 : [681241.6684584]


In [65]:
## Exercise - Car Price
# Load the car-price sheet (columns: car_model, mileage, sell_price_usd, age).
CAR_PRICES_XLSX = 'dataset/carprices.xlsx'
df2 = pd.read_excel(CAR_PRICES_XLSX)
df2

Unnamed: 0,car_model,mileage,sell_price_usd,age
0,BMW X5,69000,18000,6
1,BMW X5,35000,34000,3
2,BMW X5,57000,26100,5
3,BMW X5,22500,40000,2
4,BMW X5,46000,31500,4
5,Audi A5,59000,29400,5
6,Audi A5,52000,32000,5
7,Audi A5,72000,19300,6
8,Audi A5,91000,12000,8
9,Mercedez Benz C class,67000,22000,6


In [75]:
# One indicator (dummy) column per distinct car model, appended to the right
# of the car frame. Note: this rebinds `dummies` and `merged` from the
# home-price section.
dummies = pd.get_dummies(df2['car_model'])
merged = pd.concat([df2, dummies], axis=1)
merged

Unnamed: 0,car_model,mileage,sell_price_usd,age,Audi A5,BMW X5,Mercedez Benz C class
0,BMW X5,69000,18000,6,0,1,0
1,BMW X5,35000,34000,3,0,1,0
2,BMW X5,57000,26100,5,0,1,0
3,BMW X5,22500,40000,2,0,1,0
4,BMW X5,46000,31500,4,0,1,0
5,Audi A5,59000,29400,5,1,0,0
6,Audi A5,52000,32000,5,1,0,0
7,Audi A5,72000,19300,6,1,0,0
8,Audi A5,91000,12000,8,1,0,0
9,Mercedez Benz C class,67000,22000,6,0,0,1


In [76]:
# Drop the raw text column plus one dummy ('Audi A5'); Audi A5 rows are then
# represented by zeros in both remaining dummy columns.
final = merged.drop(columns=['car_model', 'Audi A5'])
final

Unnamed: 0,mileage,sell_price_usd,age,BMW X5,Mercedez Benz C class
0,69000,18000,6,1,0
1,35000,34000,3,1,0
2,57000,26100,5,1,0
3,22500,40000,2,1,0
4,46000,31500,4,1,0
5,59000,29400,5,0,0
6,52000,32000,5,0,0
7,72000,19300,6,0,0
8,91000,12000,8,0,0
9,67000,22000,6,0,1


In [77]:
# Fresh regressor for the car data. Target is 'sell_price_usd'; the features
# are every other column of `final`.
model = linear_model.LinearRegression()
y = final['sell_price_usd']
X = final.drop(columns='sell_price_usd')
X

Unnamed: 0,mileage,age,BMW X5,Mercedez Benz C class
0,69000,6,1,0
1,35000,3,1,0
2,57000,5,1,0
3,22500,2,1,0
4,46000,4,1,0
5,59000,5,0,0
6,52000,5,0,0
7,72000,6,0,0
8,91000,8,0,0
9,67000,6,0,1


In [78]:
# Fit the car-price regressor on the feature frame X and target y built above.
model.fit(X=X, y=y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [80]:
## Multivariate form of y = m*x + b (one coefficient per feature column):
## sell_price_usd = coef[0]*mileage + coef[1]*age + coef[2]*BMW X5
##                  + coef[3]*Mercedez Benz C class + intercept

print(f" Coefficient : {model.coef_}")
print(f" Intercept : {model.intercept_}")

# Feature order must match X: [mileage, age, BMW X5, Mercedez Benz C class].
inp1_1, inp1_2, inp1_3, inp1_4 = 45000, 4, 0, 1   # Mercedez Benz C class
print(f" Prediction Input1 : {model.predict([[inp1_1, inp1_2, inp1_3, inp1_4]])}")

inp2_1, inp2_2, inp2_3, inp2_4 = 86000, 7, 1, 0   # BMW X5
# Label corrected: this is the second input (it was printed as "Input1").
print(f" Prediction Input2 : {model.predict([[inp2_1, inp2_2, inp2_3, inp2_4]])}")

 Coefficient : [-3.70122094e-01 -1.33245363e+03 -4.28466659e+03  2.45354074e+03]
 Intercept : 56523.08523127495
 Prediction Input1 : [36991.31721061]
 Prediction Input1 : [11080.74313219]


In [74]:
# Score the fitted model on the same X, y it was trained on.
# NOTE: scikit-learn documents LinearRegression.score() as the R^2 coefficient
# of determination, so the percentage printed below is R^2 * 100 measured on
# the training data, not a held-out accuracy.
accuracy = model.score(X, y)
accuracy_pct = round(accuracy * 100, 2)
print(f"Accuracy of this Trained Model is {accuracy_pct} %")

Accuracy of this Trained Model is 94.17 %
