## **One Hot Encoding**

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

In [None]:
# Load the car price data; the relative path is resolved against the working directory.
csv_path = 'carprices.csv'
df = pd.read_csv(csv_path)

# Show the first five rows as a quick sanity check of the parsed columns.
df.head(5)

In [None]:
# Pairwise correlation of the numeric columns only. 'Car Model' holds strings, and
# pandas >= 2.0 raises on DataFrame.corr() when a non-numeric column is present
# (older versions dropped such columns silently), so select numeric columns explicitly.
df.select_dtypes(include='number').corr()

In [None]:
# Scatter the sell price against the mileage to eyeball their relationship before modelling.
# NOTE(review): the author reads a negative, roughly linear trend from this plot, which is
# the stated motivation for fitting a linear regression.

x = df[['Mileage']]
y = df['Sell Price($)']  # also reused later as the regression target

# Explicit figure/axes so the plot can be labelled and stands on its own when skimmed.
fig, ax = plt.subplots()
ax.scatter(x, y)
ax.set_xlabel('Mileage')
ax.set_ylabel('Sell Price($)')
ax.set_title('Sell price vs. mileage')
plt.show()

In [None]:
# One-hot encode the categorical 'Car Model' column with pd.get_dummies.
# dtype=int yields 0/1 indicator columns on every pandas version (pandas >= 2.0 would
# otherwise produce booleans), matching the 0/1 flags passed to predict() in the exercises.
dummies = pd.get_dummies(df['Car Model'], dtype=int)

# Drop one category ('Audi A5') so the remaining indicators are not perfectly collinear
# with the intercept (the dummy variable trap). An Audi A5 is then encoded as all flags 0.
final_dummies = dummies.drop(columns='Audi A5')

# Attach the indicators to the original columns and drop the now-redundant text column.
final_df = pd.concat([df, final_dummies], axis=1).drop(columns='Car Model')
final_df.head()

In [None]:
# Build the feature matrix (every column of final_df except the target) and fit a
# linear regression on it. `y` is the 'Sell Price($)' series taken from df earlier.

X = final_df.drop(columns='Sell Price($)')

lre = LinearRegression()
lre.fit(X, y)

# score() returns the coefficient of determination (R^2); here it is evaluated on the
# same rows the model was fitted on.
r2_train = lre.score(X, y)
print('The r2 score is: ', r2_train)

In [None]:
# In-sample predictions: run the fitted model over the same rows it was trained on.
yhat = lre.predict(X)

# Echo the predicted values as the cell output.
yhat

In [None]:
# Re-display the first five rows of the encoded frame as a reminder of its column layout.
final_df.head(5)

In [None]:
# Exercise:
# 1. Predict price of a Mercedes Benz that is 4 yrs old and a mileage of 45000

# Values follow the column order of X; judging by the exercise inputs that is
# mileage, age, then the two car-model indicator flags -- TODO confirm against X.columns.
# Wrapping the row in a DataFrame that carries the training column names (instead of a
# bare nested list) avoids scikit-learn's "X does not have valid feature names" warning.
mercedes = pd.DataFrame([[45000, 4, 0, 1]], columns=X.columns)
print('The price of the Mercedes Benz is: ', lre.predict(mercedes))

In [None]:
# 2. Predict price of a BMW that is 7 yrs old and a mileage of 86000

# Values follow the column order of X; judging by the exercise inputs that is
# mileage, age, then the two car-model indicator flags -- TODO confirm against X.columns.
# Wrapping the row in a DataFrame that carries the training column names (instead of a
# bare nested list) avoids scikit-learn's "X does not have valid feature names" warning.
bmw = pd.DataFrame([[86000, 7, 1, 0]], columns=X.columns)
print('The price of the BMW is: ', lre.predict(bmw))

In [None]:
# 3. Tell the accuracy(score) of the model.

# For a regressor, score() is the coefficient of determination (R^2); it is computed
# here on the same X and y the model was fitted on.
model_score = lre.score(X, y)
print('The accuracy of the model is: ', model_score)

In [None]:
# Predict the price of an Audi that is 8 yrs old and a mileage of 49500

# 'Audi A5' is the category dropped from the dummies, so both indicator flags are 0.
# Values follow the column order of X -- TODO confirm against X.columns.
# Wrapping the row in a DataFrame that carries the training column names (instead of a
# bare nested list) avoids scikit-learn's "X does not have valid feature names" warning.
audi = pd.DataFrame([[49500, 8, 0, 0]], columns=X.columns)
print('The price of the Audi is: ', lre.predict(audi))

In [None]:
# Work on a copy: `df2 = df` would only bind a second name to the same object, so adding
# the column below would also modify `df`. Re-running the encoding cell afterwards would
# then carry the predictions into final_df and hence into the feature matrix.
df2 = df.copy()

# Place the in-sample predictions next to the actual sell prices for comparison.
df2['Predicted Price ($)'] = yhat
df2

In [None]:
# Mean squared error between the actual sell prices and the in-sample predictions.
from sklearn.metrics import mean_squared_error

# Keyword arguments make explicit which series is the ground truth and which the estimate.
mse = mean_squared_error(y_true=y, y_pred=yhat)
print('The mean squared error is: ', mse)

In [None]:
import seaborn as sns

# Overlay the estimated densities of the actual and the predicted sell prices.
# sns.distplot is deprecated and scheduled for removal; kdeplot draws the KDE curve that
# distplot(hist=False) used to produce.
fig, ax1 = plt.subplots()
sns.kdeplot(y, color='r', label='Actual Value', ax=ax1)
sns.kdeplot(yhat, color='g', label='Predicted Value', ax=ax1)

# Without an explicit legend the two labels above are never shown.
ax1.set_xlabel('Sell Price($)')
ax1.set_title('Actual vs. predicted sell price')
ax1.legend()

plt.show()
