# Import all necessary libraries

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import regex as re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, SGDRegressor, ElasticNet, BayesianRidge
from sklearn.svm import SVR
from catboost import CatBoostRegressor
from xgboost.sklearn import XGBRegressor
from sklearn.ensemble import AdaBoostRegressor, ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# Read data from s3 bucket of AWS and load to this notebook

In [2]:
df = pd.read_csv("s3://abdullah-shafiq/cars_raw.csv")

In [3]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Year,9379.0,2018.721719,2.221708,2001.0,2018.0,2019.0,2020.0,2022.0
ConsumerRating,9379.0,4.702825,0.240795,2.5,4.7,4.8,4.8,5.0
ConsumerReviews,9379.0,133.187014,154.98564,1.0,30.0,75.0,182.0,817.0
SellerRating,9379.0,4.412571,0.626258,1.0,4.3,4.6,4.8,5.0
SellerReviews,9379.0,984.089988,1609.039864,1.0,112.0,542.0,1272.0,27824.0
ComfortRating,9379.0,4.771895,0.217822,3.0,4.7,4.8,4.9,5.0
InteriorDesignRating,9379.0,4.727391,0.194391,2.8,4.7,4.8,4.8,5.0
PerformanceRating,9379.0,4.69629,0.253664,1.0,4.6,4.7,4.8,5.0
ValueForMoneyRating,9379.0,4.537083,0.338098,1.0,4.5,4.6,4.7,5.0
ExteriorStylingRating,9379.0,4.782194,0.171537,3.0,4.7,4.8,4.9,5.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9379 entries, 0 to 9378
Data columns (total 32 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Year                   9379 non-null   int64  
 1   Make                   9379 non-null   object 
 2   Model                  9379 non-null   object 
 3   Used/New               9379 non-null   object 
 4   Price                  9379 non-null   object 
 5   ConsumerRating         9379 non-null   float64
 6   ConsumerReviews        9379 non-null   int64  
 7   SellerType             9379 non-null   object 
 8   SellerName             9379 non-null   object 
 9   SellerRating           9379 non-null   float64
 10  SellerReviews          9379 non-null   int64  
 11  StreetName             9379 non-null   object 
 12  State                  9379 non-null   object 
 13  Zipcode                9379 non-null   object 
 14  DealType               9157 non-null   object 
 15  Comf

In [5]:
df.isna().sum()

Year                       0
Make                       0
Model                      0
Used/New                   0
Price                      0
ConsumerRating             0
ConsumerReviews            0
SellerType                 0
SellerName                 0
SellerRating               0
SellerReviews              0
StreetName                 0
State                      0
Zipcode                    0
DealType                 222
ComfortRating              0
InteriorDesignRating       0
PerformanceRating          0
ValueForMoneyRating        0
ExteriorStylingRating      0
ReliabilityRating          0
ExteriorColor              0
InteriorColor              0
Drivetrain                 0
MinMPG                     0
MaxMPG                     0
FuelType                   0
Transmission               0
Engine                     0
VIN                        0
Stock#                     0
Mileage                    0
dtype: int64

# EXPLINATION
### The data has no empty values accept "DealType" which is by the way not important.
### if our data had any null values we would type the following code
### for numeric: df.fillna(df[column].mean())
### for alphebatic values : df.fillna(df[column].mode()[0])

In [6]:
df['DealType'].fillna(df['DealType'].mode()[0], inplace = True)

In [7]:
df.isna().sum()

Year                     0
Make                     0
Model                    0
Used/New                 0
Price                    0
ConsumerRating           0
ConsumerReviews          0
SellerType               0
SellerName               0
SellerRating             0
SellerReviews            0
StreetName               0
State                    0
Zipcode                  0
DealType                 0
ComfortRating            0
InteriorDesignRating     0
PerformanceRating        0
ValueForMoneyRating      0
ExteriorStylingRating    0
ReliabilityRating        0
ExteriorColor            0
InteriorColor            0
Drivetrain               0
MinMPG                   0
MaxMPG                   0
FuelType                 0
Transmission             0
Engine                   0
VIN                      0
Stock#                   0
Mileage                  0
dtype: int64

# Dropping all the useless columns

In [8]:
newdf = df.drop(['Model', 'SellerType', 'SellerName','SellerRating','SellerReviews','StreetName','State','Zipcode','DealType','Transmission','Engine',"VIN","Stock#",'ExteriorColor','InteriorColor'],axis=1)

In [9]:
newdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9379 entries, 0 to 9378
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Year                   9379 non-null   int64  
 1   Make                   9379 non-null   object 
 2   Used/New               9379 non-null   object 
 3   Price                  9379 non-null   object 
 4   ConsumerRating         9379 non-null   float64
 5   ConsumerReviews        9379 non-null   int64  
 6   ComfortRating          9379 non-null   float64
 7   InteriorDesignRating   9379 non-null   float64
 8   PerformanceRating      9379 non-null   float64
 9   ValueForMoneyRating    9379 non-null   float64
 10  ExteriorStylingRating  9379 non-null   float64
 11  ReliabilityRating      9379 non-null   float64
 12  Drivetrain             9379 non-null   object 
 13  MinMPG                 9379 non-null   int64  
 14  MaxMPG                 9379 non-null   int64  
 15  Fuel

# More preprocessing

In [10]:
newdf.drop(['Price'],axis =1, inplace = True)

In [11]:
newdf['PRICE'] = df['Price']

In [12]:
newdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9379 entries, 0 to 9378
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Year                   9379 non-null   int64  
 1   Make                   9379 non-null   object 
 2   Used/New               9379 non-null   object 
 3   ConsumerRating         9379 non-null   float64
 4   ConsumerReviews        9379 non-null   int64  
 5   ComfortRating          9379 non-null   float64
 6   InteriorDesignRating   9379 non-null   float64
 7   PerformanceRating      9379 non-null   float64
 8   ValueForMoneyRating    9379 non-null   float64
 9   ExteriorStylingRating  9379 non-null   float64
 10  ReliabilityRating      9379 non-null   float64
 11  Drivetrain             9379 non-null   object 
 12  MinMPG                 9379 non-null   int64  
 13  MaxMPG                 9379 non-null   int64  
 14  FuelType               9379 non-null   object 
 15  Mile

In [13]:
val=newdf['PRICE'].mode()[0]
rep = "Not Priced"

In [14]:
newdf['PRICE'].replace(rep,val, inplace = True)

In [15]:
newdf['PRICE'] = newdf['PRICE'].apply(lambda x : int(re.sub("[^0-9]+","",x)))

In [16]:
newdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9379 entries, 0 to 9378
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Year                   9379 non-null   int64  
 1   Make                   9379 non-null   object 
 2   Used/New               9379 non-null   object 
 3   ConsumerRating         9379 non-null   float64
 4   ConsumerReviews        9379 non-null   int64  
 5   ComfortRating          9379 non-null   float64
 6   InteriorDesignRating   9379 non-null   float64
 7   PerformanceRating      9379 non-null   float64
 8   ValueForMoneyRating    9379 non-null   float64
 9   ExteriorStylingRating  9379 non-null   float64
 10  ReliabilityRating      9379 non-null   float64
 11  Drivetrain             9379 non-null   object 
 12  MinMPG                 9379 non-null   int64  
 13  MaxMPG                 9379 non-null   int64  
 14  FuelType               9379 non-null   object 
 15  Mile

# LabelEncoding so that the computer can understand the labels

In [17]:
final = pd.DataFrame()
for i in newdf.columns:
    if newdf[i].dtype == 'object':
        le = LabelEncoder()
        v = list(newdf[i].unique())
        le.fit(v)
        final[i]=le.transform(newdf[i])
    else:
        final[i]=newdf[i]

In [18]:
final.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Year,9379.0,2018.721719,2.221708,2001.0,2018.0,2019.0,2020.0,2022.0
Make,9379.0,18.968334,12.251982,0.0,7.0,19.0,28.0,40.0
Used/New,9379.0,21.247468,4.992323,0.0,23.0,23.0,23.0,25.0
ConsumerRating,9379.0,4.702825,0.240795,2.5,4.7,4.8,4.8,5.0
ConsumerReviews,9379.0,133.187014,154.98564,1.0,30.0,75.0,182.0,817.0
ComfortRating,9379.0,4.771895,0.217822,3.0,4.7,4.8,4.9,5.0
InteriorDesignRating,9379.0,4.727391,0.194391,2.8,4.7,4.8,4.8,5.0
PerformanceRating,9379.0,4.69629,0.253664,1.0,4.6,4.7,4.8,5.0
ValueForMoneyRating,9379.0,4.537083,0.338098,1.0,4.5,4.6,4.7,5.0
ExteriorStylingRating,9379.0,4.782194,0.171537,3.0,4.7,4.8,4.9,5.0


# Spliting the data for futher computation

In [19]:
x = final.iloc[:,:-1]

In [20]:
y = final.iloc[:,-1]

In [21]:
x_train,xtest,y_train,ytest = train_test_split(x,y,test_size = 0.3, random_state=101)

## These are all the techniques that our group could apply

In [22]:
LR = LinearRegression()
sgd = SGDRegressor()
en = ElasticNet()
BR = BayesianRidge()
svr = SVR()
cb = CatBoostRegressor()
xgb = XGBRegressor()
ada = AdaBoostRegressor()
etr = ExtraTreesRegressor()
rfr = RandomForestRegressor()
gbr = GradientBoostingRegressor()

In [23]:
LR.fit(x_train,y_train)

In [24]:
sgd.fit(x_train,y_train)

In [25]:
en.fit(x_train,y_train)

In [26]:
BR.fit(x_train,y_train)

In [27]:
svr.fit(x_train,y_train)

In [28]:
cb.fit(x_train,y_train)

Learning rate set to 0.055119
0:	learn: 20630.9345789	total: 49.2ms	remaining: 49.1s
1:	learn: 20070.3811592	total: 51.4ms	remaining: 25.7s
2:	learn: 19555.8170283	total: 53.9ms	remaining: 17.9s
3:	learn: 19083.7929611	total: 56.5ms	remaining: 14.1s
4:	learn: 18602.6482477	total: 59ms	remaining: 11.8s
5:	learn: 18199.8845055	total: 61.4ms	remaining: 10.2s
6:	learn: 17801.4100113	total: 64ms	remaining: 9.07s
7:	learn: 17401.1826808	total: 66.3ms	remaining: 8.23s
8:	learn: 17047.5910640	total: 68.6ms	remaining: 7.56s
9:	learn: 16697.5940298	total: 71.9ms	remaining: 7.12s
10:	learn: 16350.7485209	total: 74ms	remaining: 6.65s
11:	learn: 16041.3813796	total: 76.7ms	remaining: 6.32s
12:	learn: 15733.5394419	total: 79ms	remaining: 6s
13:	learn: 15450.8791278	total: 81.4ms	remaining: 5.73s
14:	learn: 15175.9392143	total: 83.8ms	remaining: 5.5s
15:	learn: 14892.8305959	total: 86.3ms	remaining: 5.31s
16:	learn: 14645.7977016	total: 88.9ms	remaining: 5.14s
17:	learn: 14391.9708622	total: 91.2ms	r

<catboost.core.CatBoostRegressor at 0x7f68262b5750>

In [29]:
xgb.fit(x_train,y_train)

In [30]:
ada.fit(x_train,y_train)

In [31]:
etr.fit(x_train,y_train)

In [32]:
rfr.fit(x_train,y_train)

In [33]:
gbr.fit(x_train,y_train)

In [34]:
yprdct = LR.predict(xtest)
print("Training Score:\n",LR.score(x_train,y_train)*100)
print("Mean Squared Error:\n",mean_squared_error(ytest,yprdct))
print("R2 score is:\n",r2_score(ytest,yprdct))

Training Score:
 39.52280313163074
Mean Squared Error:
 224359878.8387111
R2 score is:
 0.427976385863992


In [35]:
yprdct = sgd.predict(xtest)
print("Training Score:\n",sgd.score(x_train,y_train)*100)
print("Mean Squared Error:\n",mean_squared_error(ytest,yprdct))
print("R2 score is:\n",r2_score(ytest,yprdct))

Training Score:
 -2.934089213022643e+28
Mean Squared Error:
 1.3213906626372669e+35
R2 score is:
 -3.3689921140968565e+26


In [36]:
yprdct = en.predict(xtest)
print("Training Score:\n",en.score(x_train,y_train)*100)
print("Mean Squared Error:\n",mean_squared_error(ytest,yprdct))
print("R2 score is:\n",r2_score(ytest,yprdct))

Training Score:
 29.127880369150926
Mean Squared Error:
 265891071.4366703
R2 score is:
 0.32208925928757914


In [37]:
yprdct = BR.predict(xtest)
print("Training Score:\n",BR.score(x_train,y_train)*100)
print("Mean Squared Error:\n",mean_squared_error(ytest,yprdct))
print("R2 score is:\n",r2_score(ytest,yprdct))

Training Score:
 39.51536355419514
Mean Squared Error:
 224422332.8781661
R2 score is:
 0.4278171542511414


In [38]:
yprdct = svr.predict(xtest)
print("Training Score:\n",svr.score(x_train,y_train)*100)
print("Mean Squared Error:\n",mean_squared_error(ytest,yprdct))
print("R2 score is:\n",r2_score(ytest,yprdct))

Training Score:
 -2.2025938710827475
Mean Squared Error:
 401762227.77527446
R2 score is:
 -0.02432521690100775


In [39]:
yprdct = cb.predict(xtest)
print("Training Score:\n",cb.score(x_train,y_train)*100)
print("Mean Squared Error:\n",mean_squared_error(ytest,yprdct))
print("R2 score is:\n",r2_score(ytest,yprdct))

Training Score:
 97.20281665981831
Mean Squared Error:
 62815043.2599369
R2 score is:
 0.8398479788202705


In [40]:
yprdct = xgb.predict(xtest)
print("Training Score:\n",xgb.score(x_train,y_train)*100)
print("Mean Squared Error:\n",mean_squared_error(ytest,yprdct))
print("R2 score is:\n",r2_score(ytest,yprdct))

Training Score:
 98.60074590930616
Mean Squared Error:
 70679750.15049307
R2 score is:
 0.8197962740192958


In [41]:
yprdct = ada.predict(xtest)
print("Training Score:\n",ada.score(x_train,y_train)*100)
print("Mean Squared Error:\n",mean_squared_error(ytest,yprdct))
print("R2 score is:\n",r2_score(ytest,yprdct))

Training Score:
 5.019824254081273
Mean Squared Error:
 473460645.1449262
R2 score is:
 -0.20712611715066354


In [42]:
yprdct = etr.predict(xtest)
print("Training Score:\n",etr.score(x_train,y_train)*100)
print("Mean Squared Error:\n",mean_squared_error(ytest,yprdct))
print("R2 score is:\n",r2_score(ytest,yprdct))

Training Score:
 99.99993071791108
Mean Squared Error:
 62948563.49219472
R2 score is:
 0.8395075582147161


In [43]:
yprdct = rfr.predict(xtest)
print("Training Score:\n",rfr.score(x_train,y_train)*100)
print("Mean Squared Error:\n",mean_squared_error(ytest,yprdct))
print("R2 score is:\n",r2_score(ytest,yprdct))

Training Score:
 97.34004988055887
Mean Squared Error:
 64847117.26110235
R2 score is:
 0.8346670421913374


In [44]:
yprdct = gbr.predict(xtest)
print("Training Score:\n",gbr.score(x_train,y_train)*100)
print("Mean Squared Error:\n",mean_squared_error(ytest,yprdct))
print("R2 score is:\n",r2_score(ytest,yprdct))

Training Score:
 86.66684363728282
Mean Squared Error:
 88999655.96435982
R2 score is:
 0.7730881959595246


# Hence the most accurate predictor is "CatBoostRegressor" which is 83.984% accurate