In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib


Using matplotlib backend: <object object at 0x7910aa117af0>


In [5]:
bike_df=pd.read_csv('/content/drive/MyDrive/ML/Linear_Reg/bike_details.csv')

In [6]:
bike_df.head()

Unnamed: 0,name,selling_price,year,seller_type,owner,km_driven,ex_showroom_price
0,Royal Enfield Classic 350,175000,2019,Individual,1st owner,350,
1,Honda Dio,45000,2017,Individual,1st owner,5650,
2,Royal Enfield Classic Gunmetal Grey,150000,2018,Individual,1st owner,12000,148114.0
3,Yamaha Fazer FI V 2.0 [2016-2018],65000,2015,Individual,1st owner,23000,89643.0
4,Yamaha SZ [2013-2014],20000,2011,Individual,2nd owner,21000,


In [8]:
print(bike_df.isnull().sum())


name                   0
selling_price          0
year                   0
seller_type            0
owner                  0
km_driven              0
ex_showroom_price    435
dtype: int64


#### Identify and decide how to handle missing values. Filling ex_showroom_price with the median or drop rows with missing values.

In [9]:
bike_df['ex_showroom_price'].fillna(bike_df['ex_showroom_price'].median(), inplace=True)
bike_df.dropna(inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  bike_df['ex_showroom_price'].fillna(bike_df['ex_showroom_price'].median(), inplace=True)


In [10]:
bike_df.head()

Unnamed: 0,name,selling_price,year,seller_type,owner,km_driven,ex_showroom_price
0,Royal Enfield Classic 350,175000,2019,Individual,1st owner,350,72752.5
1,Honda Dio,45000,2017,Individual,1st owner,5650,72752.5
2,Royal Enfield Classic Gunmetal Grey,150000,2018,Individual,1st owner,12000,148114.0
3,Yamaha Fazer FI V 2.0 [2016-2018],65000,2015,Individual,1st owner,23000,89643.0
4,Yamaha SZ [2013-2014],20000,2011,Individual,2nd owner,21000,72752.5


#### Since the model year affects depreciation, it’s useful to calculate the bike's age.

In [11]:
current_year = 2024
bike_df['age'] = current_year - bike_df['year']
bike_df.drop(['year'], axis=1, inplace=True)  # Drop the original year column


In [12]:
bike_df.head()

Unnamed: 0,name,selling_price,seller_type,owner,km_driven,ex_showroom_price,age
0,Royal Enfield Classic 350,175000,Individual,1st owner,350,72752.5,5
1,Honda Dio,45000,Individual,1st owner,5650,72752.5,7
2,Royal Enfield Classic Gunmetal Grey,150000,Individual,1st owner,12000,148114.0,6
3,Yamaha Fazer FI V 2.0 [2016-2018],65000,Individual,1st owner,23000,89643.0,9
4,Yamaha SZ [2013-2014],20000,Individual,2nd owner,21000,72752.5,13


#### Convert categorical variables (seller_type, owner) into numeric codes for machine learning using One-hot Encoding.

In [14]:
bike_df = pd.get_dummies(bike_df, columns=['seller_type', 'owner'], drop_first=True)


In [15]:
bike_df.head()

Unnamed: 0,name,selling_price,km_driven,ex_showroom_price,age,seller_type_Individual,owner_2nd owner,owner_3rd owner,owner_4th owner
0,Royal Enfield Classic 350,175000,350,72752.5,5,True,False,False,False
1,Honda Dio,45000,5650,72752.5,7,True,False,False,False
2,Royal Enfield Classic Gunmetal Grey,150000,12000,148114.0,6,True,False,False,False
3,Yamaha Fazer FI V 2.0 [2016-2018],65000,23000,89643.0,9,True,False,False,False
4,Yamaha SZ [2013-2014],20000,21000,72752.5,13,True,True,False,False


#### Select Features and Target Variable

In [16]:
X = bike_df.drop(['name', 'selling_price'], axis=1)  # Drop non-numeric and target columns
y = bike_df['selling_price']


In [17]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### Initialize and Train the Linear Regression Model

In [19]:
model = LinearRegression()
model.fit(X_train, y_train)


In [20]:
y_pred = model.predict(X_test)


In [21]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R2): {r2}")


Mean Absolute Error (MAE): 19939.103979873376
Mean Squared Error (MSE): 745590142.462802
Root Mean Squared Error (RMSE): 27305.496561366577
R-squared (R2): 0.7172356700224405


### Price Prediction on New Data
* To predict the selling price of a new motorcycle, we need to create a feature vector with the same columns as X.

In [24]:

new_motorcycle = {
    'km_driven': 15000,
    'ex_showroom_price': 85000,
    'age': 3,
    'seller_type_Individual': 1,
    'owner_2nd owner': 0,
    'owner_3rd owner': 0,
    "owner_4th owner" : 0
}

# Convert to DataFrame
new_motorcycle_df = pd.DataFrame([new_motorcycle])

# Predict price
predicted_price = model.predict(new_motorcycle_df)
print(f"Predicted Selling Price: {predicted_price[0]}")


Predicted Selling Price: 94022.45509907839


In [26]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_absolute_error')
print("Cross-Validation MAE: ", -scores.mean())


Cross-Validation MAE:  20887.88979363962


### Try Polynomial Features

In [25]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)


In [27]:
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.2, random_state=42)

In [28]:
poly_model = LinearRegression()
poly_model.fit(X_train, y_train)

In [29]:
y_polypred = poly_model.predict(X_test)


In [30]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

mae = mean_absolute_error(y_test, y_polypred)
mse = mean_squared_error(y_test, y_polypred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_polypred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R2): {r2}")


Mean Absolute Error (MAE): 19212.252492170082
Mean Squared Error (MSE): 666052705.877277
Root Mean Squared Error (RMSE): 25807.996936555868
R-squared (R2): 0.7474001648076711


In [32]:
y_train_pred = poly_model.predict(X_train)

# Training set evaluation
mae_train = mean_absolute_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)

print(f"Training MAE: {mae_train}")
print(f"Training R2 Score: {r2_train}")


Training MAE: 18062.530978166415
Training R2 Score: 0.7406694995104157
