In [116]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from xgboost import XGBRegressor

In [117]:
car_data = pd.read_csv("Car_Data.csv")

In [118]:
car_data.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [119]:
car_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4340 entries, 0 to 4339
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           4340 non-null   object
 1   year           4340 non-null   int64 
 2   selling_price  4340 non-null   int64 
 3   km_driven      4340 non-null   int64 
 4   fuel           4340 non-null   object
 5   seller_type    4340 non-null   object
 6   transmission   4340 non-null   object
 7   owner          4340 non-null   object
dtypes: int64(3), object(5)
memory usage: 271.4+ KB


In [120]:
car_data.shape

(4340, 8)

In [121]:
car_data.isna().sum()

name             0
year             0
selling_price    0
km_driven        0
fuel             0
seller_type      0
transmission     0
owner            0
dtype: int64

In [122]:
car_data.value_counts()

name                              year  selling_price  km_driven  fuel    seller_type       transmission  owner       
Renault Duster 85PS Diesel RxL    2013  450000         1000       Diesel  Dealer            Manual        Second Owner    12
Hyundai Verna 1.6 VTVT SX         2015  760000         55340      Petrol  Trustmark Dealer  Manual        First Owner     12
Hyundai Santro GS                 2005  80000          56580      Petrol  Dealer            Manual        First Owner     12
Maruti S-Cross Zeta DDiS 200 SH   2015  750000         45974      Diesel  Trustmark Dealer  Manual        First Owner     12
Maruti SX4 Vxi BSIV               2012  225000         110000     Petrol  Individual        Manual        Second Owner    12
                                                                                                                          ..
Hyundai i20 Asta Option 1.4 CRDi  2016  550000         60000      Diesel  Individual        Manual        First Owner      1
      

In [123]:
car_data.owner.value_counts()

owner
First Owner             2832
Second Owner            1106
Third Owner              304
Fourth & Above Owner      81
Test Drive Car            17
Name: count, dtype: int64

In [124]:
car_data.replace({"owner" : {"First Owner" : 0, "Second Owner" : 1, "Third Owner": 2, "Fourth & Above Owner" : 3, "Test Drive Car": 4}}, inplace=True)

  car_data.replace({"owner" : {"First Owner" : 0, "Second Owner" : 1, "Third Owner": 2, "Fourth & Above Owner" : 3, "Test Drive Car": 4}}, inplace=True)


In [125]:
car_data.transmission.value_counts()

transmission
Manual       3892
Automatic     448
Name: count, dtype: int64

In [126]:
car_data.replace({"transmission" : {"Manual" : 0, "Automatic" : 1}}, inplace=True)

  car_data.replace({"transmission" : {"Manual" : 0, "Automatic" : 1}}, inplace=True)


In [127]:
car_data.seller_type.value_counts()

seller_type
Individual          3244
Dealer               994
Trustmark Dealer     102
Name: count, dtype: int64

In [128]:
car_data.replace({"seller_type" : {"Individual" : 0, "Dealer" : 1, "Trustmark Dealer": 2}}, inplace=True)

  car_data.replace({"seller_type" : {"Individual" : 0, "Dealer" : 1, "Trustmark Dealer": 2}}, inplace=True)


In [129]:
car_data.fuel.value_counts()

fuel
Diesel      2153
Petrol      2123
CNG           40
LPG           23
Electric       1
Name: count, dtype: int64

In [130]:
car_data.replace({"fuel" : {"Diesel" : 0, "Petrol" : 1, "CNG": 2, "LPG" : 3, "Electric": 4}}, inplace=True)

  car_data.replace({"fuel" : {"Diesel" : 0, "Petrol" : 1, "CNG": 2, "LPG" : 3, "Electric": 4}}, inplace=True)


In [131]:
car_data.tail()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
4335,Hyundai i20 Magna 1.4 CRDi (Diesel),2014,409999,80000,0,0,0,1
4336,Hyundai i20 Magna 1.4 CRDi,2014,409999,80000,0,0,0,1
4337,Maruti 800 AC BSIII,2009,110000,83000,1,0,0,1
4338,Hyundai Creta 1.6 CRDi SX Option,2016,865000,90000,0,0,0,0
4339,Renault KWID RXT,2016,225000,40000,1,0,0,0


In [132]:
car_data.drop(columns="name", axis=1, inplace=True)

In [133]:
car_data.head()

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,2007,60000,70000,1,0,0,0
1,2007,135000,50000,1,0,0,0
2,2012,600000,100000,0,0,0,0
3,2017,250000,46000,1,0,0,0
4,2014,450000,141000,0,0,0,1


In [134]:
Y = car_data.selling_price
X = car_data.drop(columns="selling_price", axis=1)

In [135]:
print(X)
print(Y)

      year  km_driven  fuel  seller_type  transmission  owner
0     2007      70000     1            0             0      0
1     2007      50000     1            0             0      0
2     2012     100000     0            0             0      0
3     2017      46000     1            0             0      0
4     2014     141000     0            0             0      1
...    ...        ...   ...          ...           ...    ...
4335  2014      80000     0            0             0      1
4336  2014      80000     0            0             0      1
4337  2009      83000     1            0             0      1
4338  2016      90000     0            0             0      0
4339  2016      40000     1            0             0      0

[4340 rows x 6 columns]
0        60000
1       135000
2       600000
3       250000
4       450000
         ...  
4335    409999
4336    409999
4337    110000
4338    865000
4339    225000
Name: selling_price, Length: 4340, dtype: int64


In [136]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2)

In [137]:
linear_regress = LinearRegression()

In [138]:
linear_regress.fit(X_train, Y_train)

In [139]:
# Predict on test data
y_test_predict = linear_regress.predict(X_test)

# Calculate metrics with correct parameter order
rsquare_linear = r2_score(Y_test, y_test_predict)
mae_linear = mean_absolute_error(Y_test, y_test_predict)

print("R-squared: " + str(rsquare_linear))
print("MAE: " + str(mae_linear))

R-squared: 0.49988298730549363
MAE: 221279.58700085527


In [140]:
X_train_predict = linear_regress.predict(X_train)
rsquare_linear = r2_score(Y_train, X_train_predict)
print("R-squared: " + str(rsquare_linear))
mae_linear = mean_absolute_error(X_train_predict, Y_train)
print("MAE: " + str(mae_linear))

R-squared: 0.43922410637048903
MAE: 233466.33996739757


## Lasso Regression:

In [141]:
lasso_reg = Lasso()

In [142]:
lasso_reg.fit(X_train, Y_train)

In [143]:
Y_train_prediction = lasso_reg.predict(X_train)
rsquare_lasso = r2_score(Y_train, Y_train_prediction)
print(rsquare_lasso)

0.4392241063188459


In [144]:
Y_test_prediction = lasso_reg.predict(X_test)
rsquare_lasso_test = r2_score(Y_test, Y_test_prediction)
print(rsquare_lasso_test)

0.49988271066297674


In [145]:
booster = XGBRegressor()

In [146]:
booster.fit(X_train, Y_train)

In [147]:
xgb_train = booster.predict(X_train)
rsquarexgb = r2_score(Y_train, xgb_train)
print(rsquarexgb)

0.931390643119812


In [148]:
xgb_test = booster.predict(X_test)
rsquarexgb_test = r2_score(Y_test, xgb_test)
print(rsquarexgb_test)

0.7165423035621643
