In [None]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from sklearn import linear_model
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error
from sklearn.feature_selection import RFE

In [None]:
data = pd.read_csv (r'../input/car-price-prediction/CarPrice_Assignment.csv') 

In [None]:
plt.rcParams['axes.facecolor'] = 'Beige'

# Exploratory Data Analysis (EDA)

## Take a quick look at the data:

In [None]:
data.tail()

In [None]:
data.info()

### Data types: float64 (8 columns), int64 (8 columns), object (10 columns)


In [None]:
data.columns

In [None]:
data.describe()

### Car names combine the brand with the model name:
#### We split each name and keep only the first word (the brand).

In [None]:
# Keep only the first whitespace-separated token of every CarName entry.
data['CarName'] = [full_name.split()[0] for full_name in data['CarName']]

In [None]:
data['CarName'].unique()

### Some brand names are misspelled or written inconsistently, so we correct them.

In [None]:
# Map misspelled / inconsistently-cased brand tokens onto one spelling each.
# Every other target here is lowercase, so the capitalised 'Nissan' variant is
# folded into 'nissan' (the original mapped it the other way round, leaving a
# single capitalised brand label).
# NOTE(review): assumes both 'Nissan' and 'nissan' occur in the raw data — confirm.
brand_fixes = {
    'maxda': 'mazda',
    'Nissan': 'nissan',
    'porcshce': 'porsche',
    'toyouta': 'toyota',
    'vokswagen': 'volkswagen',
    'vw': 'volkswagen',
}
data['CarName'] = data['CarName'].replace(brand_fixes)

## Visualizing Data


In [None]:
data.hist(bins=25,figsize=(20,10));

### Visualising distribution of car brands:

In [None]:
# Count cars per brand as a two-column frame: car_name, count.
# The index and the counts are named explicitly instead of renaming the labels
# that value_counts()/reset_index() generate, because those labels differ
# between pandas 1.x ('index', 'CarName') and pandas 2.x ('CarName', 'count');
# the old rename produced two 'count' columns and no 'car_name' on pandas 2.x.
n = data['CarName'].value_counts().rename_axis('car_name').reset_index(name='count')

In [None]:
# Horizontal bar chart of the per-brand counts held in `n`.
fig, ax = plt.subplots(figsize=(11, 6))
sns.barplot(data=n, x='count', y='car_name', ax=ax)
plot = plt.setp(ax.get_xticklabels(), rotation=90)

## Distribution of prices: 

In [None]:
# Price distribution: density-normalised histogram with a KDE overlay.
# sns.distplot is deprecated in favour of histplot/displot; histplot with
# stat='density' draws the equivalent histogram + KDE figure.
sns.histplot(data['price'], kde=True, stat='density');

In [None]:
# Pie chart of how many cars fall into each body style.
# The counts column is named explicitly: value_counts() labels it 'carbody' on
# pandas 1.x but 'count' on pandas 2.x, so `y='carbody'` raised KeyError there.
d = data['carbody'].value_counts().rename('carbody').to_frame()
plot = d.plot.pie(y='carbody', figsize=(8, 8))

In [None]:
sns.pairplot(data[['horsepower','price','carbody']], hue="carbody");


In [None]:
data.info()

## Understanding our data:
### We use correlation matrix 

In [None]:
# Pairwise correlation of the numeric columns only. The frame still contains
# string columns at this point, and DataFrame.corr() raises on non-numeric
# data in pandas >= 2.0 (older versions silently skipped those columns).
corr_matrix = data.select_dtypes(include='number').corr()
corr_matrix

### A heatmap of the correlation matrix is easier to read:

In [None]:
# Annotated heatmap of the numeric-column correlations. String columns are
# excluded explicitly because DataFrame.corr() raises on them in pandas >= 2.0.
plt.figure(figsize=(14, 8))
sns.heatmap(data.select_dtypes(include='number').corr(), annot=True, cmap="YlGnBu");

### How to read the correlation plot: each cell shows the correlation coefficient between two variables. A value above about 0.5 indicates a strong positive relationship, a value below about -0.5 a strong negative (inverse) relationship, and a value close to zero little or no linear relationship.
#### Understanding the correlation between columns helps build a better model, because including unimportant or redundant columns can bias or destabilise the model.
### We found this:
- wheelbase has high positive correlation with carlength,carwidth and curbweight
- carlength has high positive correlation with curbweight
- carlength has negative correlation with highwaympg
- carwidth has high positive correlation with curbweight and engine size
- enginesize has high positive correlation with horsepower
- curbweight has high positive correlation with engine size and horse power, negative correlation with highwaympg
- horsepower has negative correlation with citympg and highwaympg
- citympg and highwaympg are highly correlated

## EDA findings:

- Cars with fueltype diesel are comparatively more expensive than cars with fueltype gas.
- Every other carbody type is cheaper than the convertible carbody.
- Cars with a rear enginelocation are far more expensive than cars with a front enginelocation.
- In most cases the price of a car increases with the number of cylinders.
- Enginetype ohcv appears in the higher price range.
- Doornumber does not affect the price much.
- Higher-end cars tend to have rwd drivewheel.

### We drop car_ID

In [None]:
data=data.drop(['car_ID'],axis=1)

## Is there missing data in our dataset?

In [None]:
print(data.isnull().values.any())

## Encoding categorical columns:

In [None]:
data.info()

In [None]:
# Integer-encode every string-valued column in place. One LabelEncoder is
# re-fitted per column, in the order listed, so after the loop `le` holds the
# fit for the last column ('fuelsystem').
# NOTE(review): integer codes impose an ordering on nominal categories —
# confirm this is acceptable for the linear models fitted later.
le = LabelEncoder()
categorical_columns = [
    'CarName',
    'fueltype',
    'aspiration',
    'doornumber',
    'drivewheel',
    'enginelocation',
    'enginetype',
    'cylindernumber',
    'carbody',
    'fuelsystem',
]
for column_name in categorical_columns:
    data[column_name] = le.fit_transform(data[column_name])

# Outliers:
## We are going to use two methods to deal with outliers:
## 1) Deletion by scatterplot identification.
## 2) Standardization.

### ______________________________________________

### To look for outliers we use a boxplot

In [None]:
plt.figure(figsize = (25,8))
sns.boxplot(palette = 'cool', data=data)

### A scatter plot gives a more accurate picture of the outliers
#### We make a scatter plot matrix:

In [None]:
#plt.figure(figsize = (20,20)
#sns.set_theme(style = "ticks")
#sns.pairplot(data)

### We make the scatterplot for the numerical columns only:

In [None]:
# Price against the size/weight features, one panel per feature.
# pairplot builds its own figure, so the preceding plt.figure(...) call only
# produced an extra, empty 20x20 figure; it has been removed.
sns.pairplot(
    data=data,
    x_vars=['carwidth', 'carheight', 'curbweight', 'enginesize'],
    y_vars=['price'],
);

In [None]:
sns.pairplot(data = data , x_vars = ['wheelbase','carlength','peakrpm', 'citympg', 'highwaympg'] , y_vars = ['price'])


In [None]:
sns.pairplot(data = data , x_vars = ['boreratio', 'stroke', 'compressionratio', 'horsepower'] , y_vars = ['price'])

## 1) Deletion of outliers:

In [None]:
# Drop rows whose value exceeds a hand-picked upper limit. The limits are
# applied one column at a time, in this order, each on the frame left by the
# previous step.
# Limits for carwidth, carheight, curbweight, enginesize, wheelbase, carlength,
# citympg, boreratio, stroke and horsepower were drafted but are not applied.
upper_limits = {
    'price': 35000,
    'peakrpm': 6000,
    'highwaympg': 40,
    'compressionratio': 20,
}
for column_name, limit in upper_limits.items():
    rows_above_limit = data[data[column_name] > limit].index
    data = data.drop(rows_above_limit)

#### We deleted only the outliers that could actually distort the data.

## VIF:

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# VIF diagnoses collinearity among the *predictors*, so the target column is
# left out. drop() also yields a new frame, whereas `vif_data = data` only
# created a second name for the same object.
# NOTE(review): no intercept column is added before computing the VIFs —
# confirm whether a constant term should be included.
vif_data = data.drop(columns=['price'])
VIF = pd.Series(
    [variance_inflation_factor(vif_data.values, i) for i in range(vif_data.shape[1])],
    index=vif_data.columns,
)
VIF

## Treatment of multicollinearity:

In [None]:
def MC_remover(data, threshold=5):
    """Drop the single column with the largest VIF if it exceeds ``threshold``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are all passed to ``variance_inflation_factor``.
    threshold : float, default 5
        A column is removed only when the largest VIF is strictly greater
        than this value. The default reproduces the previous fixed cut-off.

    Returns
    -------
    pandas.DataFrame
        A new frame without the highest-VIF column, or ``data`` itself when
        no VIF exceeds the threshold. A message is printed in both cases.
    """
    vif = pd.Series(
        [variance_inflation_factor(data.values, i) for i in range(data.shape[1])],
        index=data.columns,
    )
    if vif.max() > threshold:
        # idxmax returns the first label holding the maximum, which is the
        # same column the previous boolean-mask lookup selected.
        worst_column = vif.idxmax()
        print(worst_column, 'has been removed')
        return data.drop(columns=[worst_column])
    print("No multicollinearity present anymore")
    return data

In [None]:
# Remove one column per pass until MC_remover leaves the frame unchanged.
# The previous fixed ten passes could stop while high-VIF columns remained,
# or keep recomputing VIFs after nothing was left to remove.
while vif_data.shape[1] > 1:
    reduced = MC_remover(vif_data)
    if reduced.shape[1] == vif_data.shape[1]:
        break
    vif_data = reduced
vif_data.head()

### calculating VIF for remaining columns

In [None]:
VIF=pd.Series([variance_inflation_factor(vif_data.values,i) for i in range(vif_data.shape[1])],index=vif_data.columns)
VIF,len(vif_data.columns)

# Splitting data:

In [None]:
reg = linear_model.LinearRegression()

### Rearranging columns:
- No need because price is the last column.

### To set up features and target, we drop the price column from x and use it as y


In [None]:
x = data.drop(['price'] , axis = 1).values
y= data['price' ].values

In [None]:
x_train , x_test , y_train , y_test = train_test_split(x,y , test_size= 0.25 , random_state=42)


In [None]:
x_train.shape

In [None]:
y_train.shape

# Scaling
### Make sure no categorical (object) columns remain

In [None]:
data.info()

In [None]:
from sklearn.preprocessing import RobustScaler

#### Robust scaler reduces the effect of outliers


In [None]:
# Fit the scaler on the training split only and apply those same statistics to
# the test split. Calling fit_transform on x_test re-fitted the scaler on test
# data, so train and test features ended up on different scales.
ro_scaler = RobustScaler()
x_train = ro_scaler.fit_transform(x_train)
x_test = ro_scaler.transform(x_test)

In [None]:
x_train.shape

In [None]:
## copy data
datal = data.copy()

# Linear regression:

In [None]:
reg.fit(x_train , y_train)
reg.score(x_train , y_train)


In [None]:
reg.score(x_test , y_test)

In [None]:
reg.coef_

In [None]:
# Pair each fitted coefficient with its feature name. The labels are derived
# the same way `x` was built (every column of `data` except 'price'), so the
# table does not rely on 'price' being the last column, nor on `datal`, which
# later cells overwrite with a predictions table.
pd.DataFrame(reg.coef_, index=data.drop(columns=['price']).columns, columns=['Coeficient'])

In [None]:
y_pred =reg.predict(x_test)
datal = pd.DataFrame({"Y_test": y_test , "Y_pred" : y_pred})
datal.head(10)

In [None]:
plt.figure(figsize=(10,8))
plt.plot(datal[:50])
plt.legend(["Actual" , "Predicted"])

## Other regression algorithms:

## Decision Tree Regression:

In [None]:
dt_regressor = DecisionTreeRegressor(random_state=0)
dt_regressor.fit(x_train,y_train)
y_train_pred = dt_regressor.predict(x_train)
y_test_pred = dt_regressor.predict(x_test)
dt_regressor.score(x_test,y_test)

## Random forest Regression:

In [None]:
# Random forest with 15 trees; the test-set R^2 is displayed.
# The explicit criterion='mse' argument was dropped: that spelling was removed
# in scikit-learn 1.2, and squared error is the default split criterion, so
# the model is unchanged on versions that still accepted it.
Rf = RandomForestRegressor(n_estimators=15,
                           random_state=20,
                           n_jobs=-1)
Rf.fit(x_train, y_train)
Rf_train_pred = Rf.predict(x_train)
Rf_test_pred = Rf.predict(x_test)


r2_score(y_test, Rf_test_pred)

In [None]:
# Actual vs. predicted prices for the random forest fitted above. This cell
# follows the random-forest section but previously called reg.predict, which
# re-displayed the linear-regression predictions instead.
y_pred = Rf.predict(x_test)
datal = pd.DataFrame({"Y_test": y_test, "Y_pred": y_pred})
datal.head(10)

In [None]:
plt.figure(figsize=(10,8))
plt.plot(datal[:50])
plt.legend(["Actual" , "Predicted"])

### Ridge regression:

In [None]:
# Ridge regression with alpha=0.9; the training-set R^2 is displayed.
# The model is fitted once — the second, identical fit call was redundant.
reg = linear_model.Ridge(alpha=0.9)
reg.fit(x_train, y_train)
reg.score(x_train, y_train)

In [None]:
y_pred =reg.predict(x_test)
datal = pd.DataFrame({"Y_test": y_test , "Y_pred" : y_pred})
datal.head(10)

In [None]:
plt.figure(figsize=(10,8))
plt.plot(datal[:50])
plt.legend(["Actual" , "Predicted"])

### Lasso regression:

In [None]:
reg = linear_model.Lasso(alpha=0.9)
reg.fit(x_train,y_train)
reg.score(x_train , y_train)


In [None]:
reg.score(x_test , y_test)


In [None]:
y_pred =reg.predict(x_test)
datal = pd.DataFrame({"Y_test": y_test , "Y_pred" : y_pred})
datal.head(10)

In [None]:
plt.figure(figsize=(10,8))
plt.plot(datal[:50])
plt.legend(["Actual" , "Predicted"])

In [None]:
# Adjusted R^2 for the test-set score of the most recent predictions (`y_pred`).
# n must be the number of observations the R^2 was computed on — the test
# split — rather than the training-set size used previously.
reg_score = r2_score(y_test, y_pred)
p = x_train.shape[1]  # number of predictors
n = len(y_test)       # number of scored observations
adj_R1 = 1 - (1 - reg_score) * (n - 1) / (n - p - 1)
adj_R1

In [None]:
adj_R1< reg_score

# Evaluation:

In [None]:
def run_experiment(model):
    """Fit ``model`` on the training split and print test-set R^2, MAE and RMSE.

    Reads the notebook-level ``x_train``, ``y_train``, ``x_test`` and
    ``y_test``. The model is fitted in place; nothing is returned.
    """
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)
    # Each metric is computed right before it is printed, in this order.
    report = (
        (("R^2 : ",), lambda: r2_score(y_test, predictions)),
        (("MAE :",), lambda: mean_absolute_error(y_test, predictions)),
        (("RMSE:",), lambda: np.sqrt(mean_squared_error(y_test, predictions))),
    )
    for (label,), compute in report:
        print(label, compute())

## Linear regression

In [None]:
model = LinearRegression()
run_experiment(model)

## Lasso regression

In [None]:
model = linear_model.Lasso()
run_experiment(model)

## Ridge regression

In [None]:
model = linear_model.Ridge()
run_experiment(model)

## Random forest regression

In [None]:
model = RandomForestRegressor()
run_experiment(model)

### Decision Tree Regression

In [None]:
model = DecisionTreeRegressor()
run_experiment(model)

### We can also evaluate another regression model (SGD regressor)

In [None]:
from sklearn.linear_model import SGDRegressor
model = SGDRegressor()
run_experiment(model)

## Explanation:
- Lasso gives the highest R square.
- Random forest gives the least error.

## Accuracy, precision and recall:

In [None]:
def run_experiment(model):
    """Fit ``model`` and report a confusion matrix plus classification scores.

    Reads the notebook-level ``x_train``, ``y_train``, ``x_test`` and
    ``y_test``; shows the confusion-matrix plot and prints precision, recall,
    F1 and accuracy. Nothing is returned.

    NOTE(review): this definition reuses the name of the regression helper
    defined earlier in the notebook and replaces it once this cell runs.
    NOTE(review): these are classification metrics, while ``y`` in this
    notebook is the continuous ``price`` column — presumably this needs a
    classifier and discrete labels; confirm before calling.
    """
    # Imported here because the notebook's import cell only brings in the
    # regression metrics; without this every name below raised NameError.
    from sklearn.metrics import (
        ConfusionMatrixDisplay,
        accuracy_score,
        f1_score,
        precision_score,
        recall_score,
    )

    model.fit(x_train, y_train)

    y_pred = model.predict(x_test)

    # plot_confusion_matrix was removed in scikit-learn 1.2;
    # ConfusionMatrixDisplay.from_estimator is its replacement.
    ConfusionMatrixDisplay.from_estimator(model, x_test, y_test, cmap='GnBu')
    plt.show()
    print('Precision: %.3f' % precision_score(y_test, y_pred))
    print('Recall: %.3f' % recall_score(y_test, y_pred))
    print('F1: %.3f' % f1_score(y_test, y_pred))
    print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))