<a href="https://colab.research.google.com/github/bathicodes/EXPERIMENTS/blob/main/Experiment_Motorcycle_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import libraries

In [54]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Import dataset

In [55]:
# Columns retained for the analysis; every other CSV column is discarded.
columns_to_keep = ["name", "year", "km_driven", "selling_price"]

# Read the CSV and narrow it to the retained columns in a single expression.
dataset = pd.read_csv("BIKE DETAILS.csv").loc[:, columns_to_keep]
dataset.head()

Unnamed: 0,name,year,km_driven,selling_price
0,Royal Enfield Classic 350,2019,350,175000
1,Honda Dio,2017,5650,45000
2,Royal Enfield Classic Gunmetal Grey,2018,12000,150000
3,Yamaha Fazer FI V 2.0 [2016-2018],2015,23000,65000
4,Yamaha SZ [2013-2014],2011,21000,20000


# Data cleaning

## Checking dataset shape (rows and columns)

In [56]:
# (rows, columns) of the trimmed frame, before any encoding is applied.
dataset.shape

(1061, 4)

## Checking the number of unique values in each column

In [57]:
# Number of distinct values held by each column.
dataset.nunique()

name             279
year              28
km_driven        304
selling_price    130
dtype: int64

## Checking for missing values

In [58]:
# isna() flags missing cells; sum() totals the flags per column.
dataset.isna().sum()

name             0
year             0
km_driven        0
selling_price    0
dtype: int64

# Encode categorical data

In [59]:
# One-hot encode the non-numeric column(s) (here: "name"); drop_first=True
# removes the first level of each so the dummies are not redundant.
# dtype=int pins the dummies to 0/1 integers. pandas >= 2.0 emits booleans by
# default, which makes `.values` of the mixed int/bool frame an object array
# that downstream numeric code (e.g. the VIF computation) cannot process.
dataset = pd.get_dummies(dataset, drop_first=True, dtype=int)
dataset.head()

Unnamed: 0,year,km_driven,selling_price,name_Activa 4g,name_Aprilia SR 125,name_BMW G310GS,name_Bajaj ct 100,name_Bajaj Avenger 150,name_Bajaj Avenger 150 street,name_Bajaj Avenger 220,...,name_Yamaha SZ [2013-2014],name_Yamaha SZ-S,name_Yamaha Saluto,name_Yamaha Saluto RX,name_Yamaha YBR 125,name_Yamaha YZF R15 S,name_Yamaha YZF R15 V3,name_Yamaha YZF R15 [2011-2018],name_Yamaha YZF R3,name_Yo Style
0,2019,350,175000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2017,5650,45000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2018,12000,150000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2015,23000,65000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2011,21000,20000,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


## Checking dataset shape after encoding categorical data

In [60]:
# (rows, columns) once the categorical column has been expanded into dummy columns.
dataset.shape

(1061, 281)

# Creating X and y

In [61]:
# Target: the column the models are trained to predict.
y = dataset["selling_price"]
# Features: every remaining column of the encoded frame.
X = dataset.drop(columns="selling_price")

# Calculating VIF of each feature

In [62]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calculate(datasets):
  """Return the variance inflation factor (VIF) of every column.

  Parameters
  ----------
  datasets : pandas.DataFrame
      Feature matrix. Every column must be convertible to float.

  Returns
  -------
  pandas.DataFrame
      One row per input column, with two columns: "features" (the column
      name) and "VIF_Value" (the result of statsmodels'
      `variance_inflation_factor` for that column).

  Notes
  -----
  NOTE(review): no constant/intercept column is added before the call, so
  the values are computed on the columns exactly as passed in. Confirm
  whether an intercept should be included before interpreting them.
  """
  # Convert once, outside the loop, and force a float dtype: a frame that
  # mixes integer and boolean columns would otherwise yield an object array.
  matrix = datasets.to_numpy(dtype=float)
  vif_values = [variance_inflation_factor(matrix, i) for i in range(matrix.shape[1])]
  return pd.DataFrame({"features": datasets.columns, "VIF_Value": vif_values})


# VIF of every feature column in X (the target column is not part of X).
calculate(X)

Unnamed: 0,features,VIF_Value
0,year,564.703839
1,km_driven,2.220255
2,name_Activa 4g,1.534605
3,name_Aprilia SR 125,2.066728
4,name_BMW G310GS,1.533757
...,...,...
275,name_Yamaha YZF R15 S,4.162577
276,name_Yamaha YZF R15 V3,2.065971
277,name_Yamaha YZF R15 [2011-2018],4.160992
278,name_Yamaha YZF R3,1.534651


# Split dataset into training and test set

In [63]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Training multiple linear regression model

In [64]:
from sklearn.linear_model import LinearRegression

# Build and fit the linear model in one step; the bare name on the last
# line makes the notebook display the fitted estimator.
lin_regressor = LinearRegression().fit(X_train, y_train)
lin_regressor

LinearRegression()

## Evaluating linear model

In [65]:
# Report the model's score on the data it was fitted on and on the held-out data.
print(
    f"Training set score: {lin_regressor.score(X_train, y_train):.2f}",
    f"Test set score: {lin_regressor.score(X_test, y_test):.2f}",
    sep="\n",
)

Training set score: 0.96
Test set score: 0.68


# Applying Ridge regression

In [66]:
from sklearn.linear_model import Ridge

# Fit Ridge with its default settings; the bare name on the last line makes
# the notebook display the fitted estimator.
ridge = Ridge().fit(X_train, y_train)
ridge

Ridge()

## Evaluating Ridge regression model

In [67]:
# Report the Ridge model's score on the training data and on the held-out data.
print(
    f"Training set score: {ridge.score(X_train, y_train):.2f}",
    f"Test set score: {ridge.score(X_test, y_test):.2f} ",
    sep="\n",
)

Training set score: 0.82
Test set score: 0.64 


# Applying Lasso regression

In [68]:
from sklearn.linear_model import Lasso

# Fit Lasso with its default settings; the bare name on the last line makes
# the notebook display the fitted estimator.
lasso = Lasso().fit(X_train, y_train)
lasso

Lasso()

## Evaluating Lasso regression model

In [69]:
# Report the Lasso scores plus how many coefficients are non-zero,
# i.e. how many features the fitted model actually uses.
print(
    f"Training set score: {lasso.score(X_train, y_train):.2f}",
    f"Test set score: {lasso.score(X_test, y_test):.2f}",
    f"Number of features used: {np.sum(lasso.coef_ != 0)}",
    sep="\n",
)

Training set score: 0.96
Test set score: 0.70
Number of features used: 254


# Evaluating models using R²

In [70]:
from sklearn.metrics import r2_score

# Predict the held-out target values with each fitted model.
y_pred_lin = lin_regressor.predict(X_test)
y_pred_ridge = ridge.predict(X_test)
y_pred_lasso = lasso.predict(X_test)

# r2_score computes the coefficient of determination, so the values are
# labelled R² rather than MSE (mean squared error is a different metric).
print(f"R² for linear regression model: {r2_score(y_test,y_pred_lin):.2f}")
print(f"R² for ridge regression model: {r2_score(y_test,y_pred_ridge):.2f}")
print(f"R² for lasso regression model: {r2_score(y_test,y_pred_lasso):.2f}")

MSE for linear regression model: 0.68
MSE for ridge regression model: 0.64
MSE for lasso regression model: 0.70
