# Predict automobile price using Machine Learning

## Import Libraries 

In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from matplotlib.pyplot import xticks
import warnings
warnings.filterwarnings("ignore")

## Importing Data

In [None]:
# Load the automobile dataset into a DataFrame and preview the first rows.
# NOTE(review): the path is an absolute location on one Windows machine —
# adjust it before running the notebook anywhere else.
df=pd.read_csv("C:\\Users\\Darshu\\Documents\\Automobile.csv")
df.head()

In [None]:
df.tail()

In [None]:
df.shape

In [None]:
# Call the method: a bare `df.info` only displays the bound-method repr
# instead of the column / dtype / non-null summary.
df.info()

In [None]:
df.describe()

In [None]:
df.columns


In [None]:
# Print the frequency table of each categorical column in turn.
for column in ['fueltype', 'aspiration', 'doornumber',
               'carbody', 'drivewheel', 'enginelocation']:
    print(df[column].value_counts())

## DATA CLEANING 

In [None]:
df.loc[df.duplicated()] # Shows rows that duplicate an earlier row (author's note: there are no duplicate rows)

In [None]:
# Checking Null values
df.isnull().sum() 
# No null values

# Exploratory Data Analysis ( EDA )

## Univariate Analysis

### Target variable: Price

In [None]:
df.price.describe()

In [None]:
# Plot the distribution of the target variable.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases
# (histplot/displot are the replacements) — confirm against the installed version.
sns.distplot(df['price'])

### Symboling

In [None]:
#symboling column- Its assigned insurance risk rating, 
#A value of +3 indicates that the auto is risky, -3 that it is probably pretty safe
df['symboling'].value_counts()

In [None]:
# Pass the column by keyword: newer seaborn releases interpret the first
# positional argument as `data`, not as the x variable.
plt1 = sns.countplot(x=df['symboling'])
plt1.set(xlabel = 'Symbol', ylabel= 'Count of Cars')
# tight_layout() has to run before show(); once the figure has been
# rendered, adjusting the layout no longer affects what was displayed.
plt.tight_layout()
plt.show()

In [None]:
df['car_company'] = df.make.str.split(' ').str.get(0).str.upper() # keep the first space-separated word of 'make', upper-cased, as the company name

In [None]:
df = df.drop(['make'], axis =1) # dropping the 'make' column now that 'car_company' has been derived from it

In [None]:
# Report the number of distinct companies. The previous expression,
# len('car_company'), measured the 11-character string literal rather
# than anything in the data.
print(df['car_company'].nunique())
df.head()


In [None]:
# Normalise misspelled or abbreviated brand names to one canonical spelling:
#   VW, VOKSWAGEN -> VOLKSWAGEN
#   MAXDA         -> MAZDA
#   PORCSHCE      -> PORSCHE
#   TOYOUTA       -> TOYOTA
brand_fixes = {
    'VW': 'VOLKSWAGEN',
    'VOKSWAGEN': 'VOLKSWAGEN',
    'MAXDA': 'MAZDA',
    'PORCSHCE': 'PORSCHE',
    'TOYOUTA': 'TOYOTA',
}
df['car_company'] = df['car_company'].replace(brand_fixes)

In [None]:
fig, ax = plt.subplots(figsize = (15,5))
# Bars are ordered from the most to the least frequent company.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
company_order = df['car_company'].value_counts().index
# `x=` is passed by keyword (newer seaborn treats the first positional
# argument as `data`) and the plot is drawn on the axes created above.
plt1 = sns.countplot(x=df['car_company'], order=company_order, ax=ax)
plt1.set(xlabel = 'car_company', ylabel= 'Count of Cars')
xticks(rotation=90)
plt.show()


In [None]:
df.car_company.describe()

In [None]:
# Toyota has the highest number of models.

###  car_company

In [None]:
# Average price per company, one bar per company.
df_comp_avg_price = df[['car_company','price']].groupby("car_company", as_index = False).mean().rename(columns={'price':'car_company_avg_price'})
# `sort_columns` is no longer accepted by DataFrame.plot in pandas 2.x
# (it is forwarded to matplotlib and raises); with a single y column it
# had no visible effect, so it is simply omitted.
plt1 = df_comp_avg_price.plot(x = 'car_company', kind='bar', legend = False, figsize = (15,3))
plt1.set_xlabel("car_company")
plt1.set_ylabel("Avg Price (Dollars)")
xticks(rotation = 90)
plt.show()

In [None]:
# Attach each company's average price (column 'car_company_avg_price')
# to every row by joining on 'car_company'.
df = df.merge(df_comp_avg_price, on = 'car_company')


In [None]:
def _price_band(avg_price):
    """Label a company's average price as Budget, Mid_Range or Luxury."""
    if avg_price < 10000:
        return "Budget"
    if 10000 <= avg_price < 20000:
        return "Mid_Range"
    return "Luxury"


# Bucket every row by the average price of its company.
df['car_category'] = df['car_company_avg_price'].apply(_price_band)


###  Fueltype

In [None]:
# Average price per fuel type.
fuel_avg_price = df[['fueltype','price']].groupby("fueltype", as_index = False).mean().rename(columns={'price':'fuel_avg_price'})
# `sort_columns` was dropped: pandas 2.x no longer accepts it in
# DataFrame.plot, and it had no effect with a single y column.
plt1 = fuel_avg_price.plot(x = 'fueltype', kind='bar', legend = False)
plt1.set_xlabel("Fuel Type")
plt1.set_ylabel("Avg Price (Dollars)")
plt.show()

In [None]:
# Diesel cars are priced more than gas cars.

### Doornumber

In [None]:
# Average price per door count.
door_avg_price = df[['doornumber','price']].groupby("doornumber", as_index = False).mean().rename(columns={'price':'door_avg_price'})
# `sort_columns` was dropped: pandas 2.x no longer accepts it in
# DataFrame.plot, and it had no effect with a single y column.
plt1 = door_avg_price.plot(x = 'doornumber', kind='bar', legend = False)
plt1.set_xlabel("No of Doors")
plt1.set_ylabel("Avg Price (Dollars)")
plt.show()

###  Aspiration

In [None]:
# Average price per aspiration type.
aspir_avg_price = df[['aspiration','price']].groupby("aspiration", as_index = False).mean().rename(columns={'price':'aspir_avg_price'})
# `sort_columns` was dropped: pandas 2.x no longer accepts it in
# DataFrame.plot, and it had no effect with a single y column.
plt1 = aspir_avg_price.plot(x = 'aspiration', kind='bar', legend = False)
plt1.set_xlabel("Aspiration")
plt1.set_ylabel("Avg Price (Dollars)")

plt.show()

### Carbody 

In [None]:
# Average price per body style.
df_body_avg_price = df[['carbody','price']].groupby("carbody", as_index = False).mean().rename(columns={'price':'carbody_avg_price'})
# `sort_columns` was dropped: pandas 2.x no longer accepts it in
# DataFrame.plot, and it had no effect with a single y column.
plt1 = df_body_avg_price.plot(x = 'carbody', kind='bar', legend = False)
plt1.set_xlabel("Car Body")
plt1.set_ylabel("Avg Price (Dollars)")
xticks(rotation = 0)
plt.show()

### Engine Type, Cylinder, Fuel System 

In [None]:
# Three side-by-side bar charts: average price by engine type, cylinder
# count and fuel system.
fig, axs = plt.subplots(1,3,figsize=(20,5))

# Two fixes apply to every panel below:
#  * `sort_columns` is omitted — pandas 2.x no longer accepts it in
#    DataFrame.plot, and it had no effect with a single y column.
#  * the tick rotation is set with `rot=0` on the plot call itself;
#    plt.xticks() only acts on the *current* axes, so calling it after
#    each panel does not reliably rotate the panel that was just drawn.
df_engine_avg_price = df[['enginetype','price']].groupby("enginetype", as_index = False).mean().rename(columns={'price':'engine_avg_price'})
plt1 = df_engine_avg_price.plot(x = 'enginetype', kind='bar', legend = False, rot = 0, ax = axs[0])
plt1.set_xlabel("Engine Type")
plt1.set_ylabel("Avg Price (Dollars)")

df_cylindernumber_avg_price = df[['cylindernumber','price']].groupby("cylindernumber", as_index = False).mean().rename(columns={'price':'cylindernumber_avg_price'})
plt1 = df_cylindernumber_avg_price.plot(x = 'cylindernumber', kind='bar', legend = False, rot = 0, ax = axs[1])
plt1.set_xlabel("Cylinder Number")
plt1.set_ylabel("Avg Price (Dollars)")

df_fuelsystem_avg_price = df[['fuelsystem','price']].groupby("fuelsystem", as_index = False).mean().rename(columns={'price':'fuelsystem_avg_price'})
plt1 = df_fuelsystem_avg_price.plot(x = 'fuelsystem', kind='bar', legend = False, rot = 0, ax = axs[2])
plt1.set_xlabel("Fuel System")
plt1.set_ylabel("Avg Price (Dollars)")
plt.show()

In [None]:
#A single variable mileage can be calculated taking the weighted average of 55% city and 45% highways.

In [None]:
# Combined mileage: 55% city mpg plus 45% highway mpg.
df['mileage'] = 0.55 * df['citympg'] + 0.45 * df['highwaympg']

### Mileage-price

In [None]:
# Price against combined mileage.
plt1 = sns.scatterplot(data=df, x='mileage', y='price')
plt1.set(xlabel='Mileage', ylabel='Price of Car (Dollars)')
plt.show()

###  Wheelbase-Price

In [None]:
# Price against wheelbase.
plt1 = sns.scatterplot(data=df, x='wheelbase', y='price')
plt1.set(xlabel='Wheelbase (Inches)', ylabel='Price of Car (Dollars)')
plt.show()


### Car Dimensions 

In [None]:
# 2x2 grid of price against each car dimension / weight.
fig, axs = plt.subplots(2, 2, figsize=(10, 10))
dimension_panels = [
    ('carlength', 'Length of Car (Inches)'),
    ('carwidth', 'Width of Car (Inches)'),
    ('carheight', 'Height of Car (Inches)'),
    ('curbweight', 'Weight of Car (Pounds)'),
]
# axs.flat walks the grid row by row, matching the order of the list above.
for panel, (column, label) in zip(axs.flat, dimension_panels):
    sns.scatterplot(x=column, y='price', data=df, ax=panel)
    panel.set_xlabel(label)
    panel.set_ylabel('Price of Car (Dollars)')
# Keep the plot handles bound to their axes (plt3 ends on the curb-weight panel).
plt1, plt2, plt3 = axs[0, 0], axs[0, 1], axs[1, 1]
plt.tight_layout()

### Engine Size, Bore Ratio, Stroke, Horsepower & Compression Ratio


In [None]:
# 3x2 grid of price against each engine-related feature.
fig, axs = plt.subplots(3, 2, figsize=(20, 20))
engine_panels = [
    ('enginesize', 'Size of Engine (Cubic Inches)'),
    ('boreratio', 'Bore Ratio'),
    ('stroke', 'Stroke'),
    ('compressionratio', 'Compression Ratio'),
    ('horsepower', 'Horsepower'),
    ('peakrpm', 'Peak RPM'),
]
# axs.flat walks the grid row by row, matching the order of the list above.
for panel, (column, label) in zip(axs.flat, engine_panels):
    sns.scatterplot(x=column, y='price', data=df, ax=panel)
    panel.set_xlabel(label)
    panel.set_ylabel('Price of Car (Dollars)')
# Keep the plot handles bound to their axes (plt5 ends on the peak-RPM panel).
plt1, plt2, plt3, plt4, plt5 = axs[0, 0], axs[0, 1], axs[1, 0], axs[1, 1], axs[2, 1]
plt.tight_layout()
plt.show()


### Enginesize-price 

In [None]:
# Plain matplotlib scatter of price against engine size.
plt.scatter(df['enginesize'], df['price'])
plt.show()

## Bivariate Analysis

### Brand Category - Mileage

In [None]:
# Price against mileage, coloured by brand category.
plt1 = sns.scatterplot(data=df, x='mileage', y='price', hue='car_category')
plt1.set(xlabel='Mileage', ylabel='Price of Car (Dollars)')
plt.show()

### Brand Category - Horsepower 

In [None]:
# Price against horsepower, coloured by brand category.
plt1 = sns.scatterplot(data=df, x='horsepower', y='price', hue='car_category')
plt1.set(xlabel='Horsepower', ylabel='Price of Car (Dollars)')
plt.show()

### Mileage - Fuel Type 

In [None]:
# Price against mileage, coloured by fuel type.
plt1 = sns.scatterplot(data=df, x='mileage', y='price', hue='fueltype')
plt1.set(xlabel='Mileage', ylabel='Price of Car (Dollars)')
plt.show()

### Horsepower - Fuel Type 

In [None]:
# Price against horsepower, coloured by fuel type.
plt1 = sns.scatterplot(data=df, x='horsepower', y='price', hue='fueltype')
plt1.set(xlabel='Horsepower', ylabel='Price of Car (Dollars)')
plt.show()

In [None]:
# Working frame for modelling, restricted to the selected columns.
# .copy() makes `auto` independent of `df`: it is modified in place later
# (columns added and dropped), which on a plain selection triggers
# SettingWithCopy behaviour.
auto = df[['fueltype', 'aspiration', 'carbody', 'drivewheel', 'wheelbase', 'carlength', 'carwidth', 'curbweight', 'enginetype',
       'cylindernumber', 'enginesize',  'boreratio', 'horsepower', 'price', 'car_category', 'mileage']].copy()

In [None]:
auto.head()

In [None]:
# Pairwise scatter matrix of the columns of df.
# sns.pairplot builds and sizes its own figure, so a preceding
# plt.figure(...) call only leaves an empty extra figure behind.
sns.pairplot(df)
plt.show()

### Visualising Categorical Variables 

In [None]:
# One price boxplot per categorical column, laid out on a 4x2 grid
# (the eighth slot stays empty).
plt.figure(figsize=(10, 20))
categorical_columns = ['fueltype', 'aspiration', 'carbody', 'drivewheel',
                       'enginetype', 'car_category', 'cylindernumber']
for position, column in enumerate(categorical_columns, start=1):
    plt.subplot(4, 2, position)
    sns.boxplot(x=column, y='price', data=auto)
plt.tight_layout()
plt.show()


### Dummy Variables

In [None]:
# Categorical variables are converted into numerical variables with the help of dummy variables.


In [None]:
# One-hot encode each categorical column, dropping the first level of each
# to avoid perfect collinearity with the intercept.
# dtype=int is explicit because recent pandas versions return boolean
# dummies by default; mixed bool/float frames become object arrays when
# handed to statsmodels, which then refuses to fit.
cyl_no = pd.get_dummies(auto['cylindernumber'], drop_first = True, dtype = int)
brand_cat = pd.get_dummies(auto['car_category'], drop_first = True, dtype = int)
eng_typ = pd.get_dummies(auto['enginetype'], drop_first = True, dtype = int)
drwh = pd.get_dummies(auto['drivewheel'], drop_first = True, dtype = int)
carb = pd.get_dummies(auto['carbody'], drop_first = True, dtype = int)
asp = pd.get_dummies(auto['aspiration'], drop_first = True, dtype = int)
fuelt = pd.get_dummies(auto['fueltype'], drop_first = True, dtype = int)

# Append all dummy columns in one concat, in the order they were created.
auto = pd.concat([auto, cyl_no, brand_cat, eng_typ, drwh, carb, asp, fuelt], axis = 1)


In [None]:
# The original categorical columns are no longer needed once encoded.
encoded_source_columns = ['fueltype', 'aspiration', 'carbody', 'drivewheel',
                          'enginetype', 'cylindernumber', 'car_category']
auto = auto.drop(columns=encoded_source_columns)

In [None]:
auto.head()

# LINEAR REGRESSION

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_squared_error

In [None]:
df.columns

In [None]:
# Simple regression set-up: engine size is the single predictor of price.
x, y = df['enginesize'], df['price']
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=31)

print(len(x.index))

In [None]:
x.head()

In [None]:
y.head()

In [None]:
# Turn each split into a single-column 2-D array.
x_train, x_test, y_test, y_train = (
    part.values.reshape(-1, 1) for part in (x_train, x_test, y_test, y_train)
)

x_train

In [None]:
reg=LinearRegression()

reg.fit(x_train,y_train)

In [None]:
print(reg.intercept_)
print(reg.coef_)

In [None]:
# Predictions on the training data.
y_train_pred = reg.predict(x_train)

y_train_pred

# Observed points first, then the fitted values over the same x.
plt.scatter(x_train, y_train)
plt.scatter(x_train, y_train_pred)
plt.show()

### R2 score 

In [None]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

r2_score(y_train, y_train_pred)

In [None]:
# Predictions on the held-out data.
y_test_pred = reg.predict(x_test)

# Observed points first, then the predicted values over the same x.
plt.scatter(x_test, y_test)
plt.scatter(x_test, y_test_pred)

plt.show()

In [None]:
r2_score(y_test,y_test_pred)

## MODEL BUILDING

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 split of the encoded frame; random_state pins the shuffle.
np.random.seed(0)
df_train, df_test = train_test_split(
    auto,
    train_size=0.7,
    test_size=0.3,
    random_state=100,
)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Continuous columns to rescale (the dummy columns are left untouched).
num_vars = ['wheelbase', 'carlength', 'carwidth', 'curbweight', 'enginesize','boreratio', 'horsepower', 'price','mileage']

# Fit the min-max scaler on the training rows and rescale them in place.
scaler = MinMaxScaler()
df_train[num_vars] = scaler.fit_transform(df_train[num_vars])

print(df_train[num_vars])

In [None]:
df_train.head()

In [None]:
# Annotated heatmap of pairwise correlations in the training frame, used
# to spot highly correlated variables.
plt.figure(figsize=(16, 10))
corr_matrix = df_train.corr()
sns.heatmap(corr_matrix, cmap="YlGnBu", annot=True)
plt.show()

## Dividing into X and Y sets for the model building 

In [None]:
# pop() removes 'price' from df_train and returns it as the target, so the
# frame that remains holds only predictors. x_train is the same object as
# df_train, not a copy.
y_train = df_train.pop('price')
x_train = df_train

In [None]:
from sklearn.feature_selection import RFE
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import LinearRegression


In [None]:
# Fit a baseline linear model on all predictors, then use recursive
# feature elimination to keep the 10 strongest ones.
multi_model = LinearRegression()
multi_model.fit(x_train, y_train)
# n_features_to_select must be passed by keyword: current scikit-learn
# rejects it as a positional argument.
rfe = RFE(multi_model, n_features_to_select=10)
rfe = rfe.fit(x_train, y_train)

In [None]:
list(zip(x_train.columns,rfe.support_,rfe.ranking_))

In [None]:
x_train.columns[rfe.support_]

In [None]:
x_train_rfe=x_train[x_train.columns[rfe.support_]]
x_train_rfe.head()

In [None]:
col = x_train.columns[rfe.support_]
col

In [None]:
# Creating the training dataframe restricted to the RFE-selected variables
x_train_rfe = x_train[col]

In [None]:
def build_mlr_model(x,y):
    """Fit an OLS regression of ``y`` on ``x`` plus an intercept and print its summary.

    Parameters
    ----------
    x : predictors (without a constant column).
    y : response values aligned with ``x``.

    Returns
    -------
    ``x`` with the constant column added, i.e. the design matrix that was fitted.
    """
    x = sm.add_constant(x)
    # Fit on the arguments (including the constant just added). The model
    # used to be fitted on the notebook globals y_train / x_train_rfe,
    # which ignored both parameters and dropped the intercept.
    lm = sm.OLS(y, x).fit()
    print(lm.summary())
    return x

def checkVIF(x):
    """Return a Features/VIF table for the columns of ``x``, highest VIF first.

    Each VIF is computed with ``variance_inflation_factor`` on ``x.values``
    and rounded to two decimals.
    """
    scores = [variance_inflation_factor(x.values, idx) for idx in range(x.shape[1])]
    table = pd.DataFrame({'Features': x.columns, 'VIF': scores})
    table['VIF'] = table['VIF'].round(2)
    return table.sort_values(by="VIF", ascending=False)

In [None]:
x_train_new = build_mlr_model(x_train_rfe,y_train)

In [None]:
# Dropping 'twelve' — presumably because its p-value is high in the previous summary (TODO confirm).
x_train_new1 = x_train_rfe.drop(["twelve"], axis = 1)


# Adding a constant variable 
import statsmodels.api as sm  
x_train_lm = sm.add_constant(x_train_new1)

lm = sm.OLS(y_train,x_train_lm).fit()   # Running the linear model

#Let's see the summary of our linear model
print(lm.summary())

In [None]:
# Dropping 'mileage' — presumably because its p-value is high in the previous summary (TODO confirm).
x_train_new2 = x_train_new1.drop(["mileage"], axis = 1)


# Adding a constant variable 
import statsmodels.api as sm  
x_train_lm = sm.add_constant(x_train_new2)

lm = sm.OLS(y_train,x_train_lm).fit()   # Running the linear model

#Let's see the summary of our linear model
print(lm.summary())

In [None]:
# Calculate the VIFs for the new model
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Use the checkVIF helper defined earlier instead of repeating its body;
# it returns the same Features/VIF table sorted by descending VIF.
x = x_train_new2
vif = checkVIF(x)
vif

In [None]:
# Dropping 'curbweight' — presumably because of its high VIF / p-value in the tables above (TODO confirm).
x_train_new3 = x_train_new2.drop(["curbweight"], axis = 1)


# Adding a constant variable 
import statsmodels.api as sm  
x_train_lm = sm.add_constant(x_train_new3)

lm = sm.OLS(y_train,x_train_lm).fit()   # Running the linear model

#Let's see the summary of our linear model
print(lm.summary())

In [None]:
# Calculate the VIFs for the new model
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Use the checkVIF helper defined earlier instead of repeating its body;
# it returns the same Features/VIF table sorted by descending VIF.
x = x_train_new3
vif = checkVIF(x)
vif

In [None]:
# Remove 'sedan' (its VIF value is high) and refit.
x_train_new4 = x_train_new3.drop(columns=["sedan"])

import statsmodels.api as sm

# Add the intercept column, run the linear model and show its summary.
x_train_lm = sm.add_constant(x_train_new4)
lm = sm.OLS(y_train, x_train_lm).fit()
print(lm.summary())

In [None]:
# Remove 'wagon' (its p-value is high) and refit.
x_train_new5 = x_train_new4.drop(columns=["wagon"])

import statsmodels.api as sm

# Add the intercept column, run the linear model and show its summary.
x_train_lm = sm.add_constant(x_train_new5)
lm = sm.OLS(y_train, x_train_lm).fit()
print(lm.summary())

In [None]:
# Calculate the VIFs for the new model
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Use the checkVIF helper defined earlier instead of repeating its body;
# it returns the same Features/VIF table sorted by descending VIF.
x = x_train_new5
vif = checkVIF(x)
vif


In [None]:
# Remove 'dohcv' to see whether the model changes, then refit.
x_train_new6 = x_train_new5.drop(columns=["dohcv"])

import statsmodels.api as sm

# Add the intercept column, run the linear model and show its summary.
x_train_lm = sm.add_constant(x_train_new6)
lm = sm.OLS(y_train, x_train_lm).fit()
print(lm.summary())

In [None]:
y_train_price = lm.predict(x_train_lm)

In [None]:
# Rescale the continuous test columns with the already-fitted scaler
# (transform only, no refit).
num_vars = [
    'wheelbase', 'carlength', 'carwidth', 'curbweight', 'enginesize',
    'boreratio', 'horsepower', 'price', 'mileage',
]
df_test[num_vars] = scaler.transform(df_test[num_vars])

In [None]:
# pop() removes 'price' from df_test and returns it as the target, so the
# frame that remains holds only predictors. x_test is the same object as
# df_test, not a copy.
y_test = df_test.pop('price')
x_test = df_test

In [None]:
# Now let's use our model to make predictions.

# Select exactly the predictors kept in the final training model
# (x_train_new6), in the same order, so the test design matrix cannot
# drift from the fitted one if the elimination steps change.
x_test_new = x_test[x_train_new6.columns]

# Adding a constant variable. has_constant='add' forces the intercept
# column even if a test column happens to be constant; the default
# would silently skip it and misalign the coefficients.
x_test_new = sm.add_constant(x_test_new, has_constant='add')

In [None]:
# Making predictions
y_pred = lm.predict(x_test_new)

### R2-SCORE 

In [None]:
from sklearn.metrics import r2_score 
r2_score(y_test, y_pred)

In [None]:
# Scatter of actual vs predicted prices on the test set to see the spread.
fig = plt.figure()
fig.suptitle('y_test vs y_pred', fontsize=20)   # plot heading
plt.scatter(y_test, y_pred)
plt.xlabel('y_test', fontsize=18)               # x-axis label
plt.ylabel('y_pred', fontsize=16)               # y-axis label

'carwidth', 'curbweight', 'horsepower', 'mileage', 'twelve', 'Luxury', 'dohcv', 'hatchback', 'sedan', 'wagon' — these are the important factors for calculating the y variable, i.e. "price".