In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import r2_score,mean_squared_error
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the vehicle listings from the CSV file (path is relative to the working directory).
df = pd.read_csv('vehicles_data.csv')
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0.1,Unnamed: 0,id,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,county,state,lat,long
0,55483,7315914053,0,2018.0,ram,promaster 2500,excellent,,gas,44244.0,clean,automatic,,,van,,,ca,32.7928,-116.9665
1,162368,7310885048,13995,2017.0,mazda,cx-3,,4 cylinders,gas,7037.0,rebuilt,automatic,,,SUV,white,,ia,41.207382,-96.023096
2,234393,7308243856,19990,2019.0,mitsubishi,eclipse cross sp,good,,gas,35313.0,clean,other,4wd,,hatchback,white,,nc,35.19,-80.83
3,276110,7315817729,0,2019.0,honda,cr-v,,,gas,25626.0,clean,automatic,,,SUV,orange,,ny,40.854573,-74.120219
4,349033,7301620999,42900,2015.0,chevrolet,corvette,excellent,8 cylinders,gas,29000.0,clean,automatic,,,convertible,black,,sc,34.755562,-82.906419


In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(64032, 20)

In [4]:
# List the column labels to decide which ones to keep.
df.columns

Index(['Unnamed: 0', 'id', 'price', 'year', 'manufacturer', 'model',
       'condition', 'cylinders', 'fuel', 'odometer', 'title_status',
       'transmission', 'drive', 'size', 'type', 'paint_color', 'county',
       'state', 'lat', 'long'],
      dtype='object')

In [5]:
# Columns removed from the frame before any further processing.
drop_columns = [
    'Unnamed: 0',
    'id',
    'title_status',
    'size',
    'lat',
    'long',
    'county',
]
# `columns=` already targets the column axis, so no separate `axis` argument is needed.
df = df.drop(columns=drop_columns)

In [6]:
# Dimensions after removing the unused columns.
df.shape

(64032, 13)

In [7]:
# Number of missing values in each column.
df.isna().sum()

price               0
year              158
manufacturer     2569
model             802
condition       26097
cylinders       26511
fuel              424
odometer          669
transmission      353
drive           19471
type            13785
paint_color     19505
state               0
dtype: int64

In [8]:
# Keep only rows that have no missing value in any column
# (axis=0 / how='any' are the dropna defaults, spelled out for clarity).
df = df.dropna(axis=0, how='any')
# Preview the first five remaining rows.
df.head()

Unnamed: 0,price,year,manufacturer,model,condition,cylinders,fuel,odometer,transmission,drive,type,paint_color,state
5,0,2006.0,chrysler,300,like new,8 cylinders,gas,149000.0,automatic,rwd,sedan,white,fl
9,20995,2011.0,chevrolet,silverado 1500,excellent,8 cylinders,gas,92001.0,automatic,4wd,truck,blue,wi
15,50995,2017.0,gmc,yukon denali,like new,8 cylinders,gas,70227.0,automatic,4wd,SUV,grey,ak
22,13500,2014.0,chevrolet,tahoe,good,8 cylinders,gas,96007.0,automatic,rwd,SUV,white,fl
29,34990,2016.0,gmc,canyon crew cab sle pickup,good,6 cylinders,gas,34425.0,other,4wd,pickup,red,ma


In [9]:
# Dimensions after discarding rows with missing values.
df.shape

(17491, 13)

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,price,year,odometer
count,17491.0,17491.0,17491.0
mean,16325.45,2009.535247,112261.6
std,129442.9,9.596493,215514.9
min,0.0,1918.0,0.0
25%,5600.0,2006.0,55787.0
50%,10950.0,2012.0,102567.0
75%,22500.0,2015.0,148000.0
max,17000000.0,2022.0,10000000.0


## Check for duplicate rows and remove them

In [11]:
# Remove exact duplicate rows, keeping the first occurrence of each.
df = df.drop_duplicates()

In [12]:
# Dimensions after removing duplicate rows.
df.shape

(16399, 13)

# Filtering categorical features

In [17]:
# Every column currently in the frame, in column order.
features = df.columns.values.tolist()

# Treat every column whose dtype is not numeric as categorical.
# select_dtypes replaces a hand-written whitelist of dtype names, which omitted
# the unsigned integer dtypes and would therefore classify e.g. uint8 columns
# as categorical. Column order is preserved.
categorical_columns = df.select_dtypes(exclude='number').columns.tolist()

In [18]:
# Display the columns that were identified as categorical.
categorical_columns

['manufacturer',
 'model',
 'condition',
 'cylinders',
 'fuel',
 'transmission',
 'drive',
 'type',
 'paint_color',
 'state']

# Encoding categorical columns using get_dummies.

In [19]:
# One-hot encode the categorical columns. drop_first=True omits the first level
# of each column, so k levels produce k-1 indicator columns.
categorical_frame = df[categorical_columns]
df_dummies = pd.get_dummies(categorical_frame, drop_first=True)

In [20]:
# Preview the first five rows of the encoded indicator columns.
df_dummies.head()

Unnamed: 0,manufacturer_alfa-romeo,manufacturer_audi,manufacturer_bmw,manufacturer_buick,manufacturer_cadillac,manufacturer_chevrolet,manufacturer_chrysler,manufacturer_datsun,manufacturer_dodge,manufacturer_ferrari,...,state_sd,state_tn,state_tx,state_ut,state_va,state_vt,state_wa,state_wi,state_wv,state_wy
5,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
15,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Dimensions of the indicator frame as (rows, indicator columns).
df_dummies.shape

(16399, 4317)

In [22]:
# Attach the indicator columns to the rows they were derived from.
# The join aligns on the index (how='left' is the default, made explicit here).
df = df.join(df_dummies, how='left')

In [23]:
# Dimensions after appending the indicator columns.
df.shape

(16399, 4330)

In [24]:
# Preview the combined frame (original columns plus indicator columns).
df.head()

Unnamed: 0,price,year,manufacturer,model,condition,cylinders,fuel,odometer,transmission,drive,...,state_sd,state_tn,state_tx,state_ut,state_va,state_vt,state_wa,state_wi,state_wv,state_wy
5,0,2006.0,chrysler,300,like new,8 cylinders,gas,149000.0,automatic,rwd,...,0,0,0,0,0,0,0,0,0,0
9,20995,2011.0,chevrolet,silverado 1500,excellent,8 cylinders,gas,92001.0,automatic,4wd,...,0,0,0,0,0,0,0,1,0,0
15,50995,2017.0,gmc,yukon denali,like new,8 cylinders,gas,70227.0,automatic,4wd,...,0,0,0,0,0,0,0,0,0,0
22,13500,2014.0,chevrolet,tahoe,good,8 cylinders,gas,96007.0,automatic,rwd,...,0,0,0,0,0,0,0,0,0,0
29,34990,2016.0,gmc,canyon crew cab sle pickup,good,6 cylinders,gas,34425.0,other,4wd,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# The raw categorical columns are now represented by their indicator columns,
# so remove them from the frame.
df = df.drop(columns=categorical_columns)

In [27]:
# Preview the first two rows of the encoded frame.
df.head(2)

Unnamed: 0,price,year,odometer,manufacturer_alfa-romeo,manufacturer_audi,manufacturer_bmw,manufacturer_buick,manufacturer_cadillac,manufacturer_chevrolet,manufacturer_chrysler,...,state_sd,state_tn,state_tx,state_ut,state_va,state_vt,state_wa,state_wi,state_wv,state_wy
5,0,2006.0,149000.0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
9,20995,2011.0,92001.0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


# Select realistic data. Domain knowledge helps a lot here in deciding what the upper and lower price limits should be.

In [28]:
# Exclusive price bounds for listings kept for modelling: a row is retained
# only when MIN_PRICE < price < MAX_PRICE. Adjust these two values to change
# the accepted price window.
MIN_PRICE = 1000
MAX_PRICE = 40000

# Combine both bounds into one boolean mask and filter the frame once.
price_in_range = (df['price'] > MIN_PRICE) & (df['price'] < MAX_PRICE)
df = df[price_in_range]

In [29]:
# Dimensions after restricting rows to the accepted price range.
df.shape

(14742, 4320)

#### Divide the dataset into features and label

In [30]:
# Label: the listing price. Features: every other column of the frame.
y = df['price']
x = df.drop(columns=['price'])

In [31]:
# Hold out 25% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=10,
)

In [32]:
# NOTE(review): `xgb` is rebound here from the xgboost module alias to the
# estimator instance, and the following cells call `xgb.fit` / `xgb.predict`
# on it. Importing the class directly keeps this cell independent of whatever
# `xgb` currently refers to; consider renaming the estimator in a follow-up.
from xgboost import XGBRegressor
xgb = XGBRegressor()

In [34]:
# Fit the regressor on the training split; the cell displays the fitted estimator.
xgb.fit(x_train,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [35]:
# Predict prices for the held-out test features.
y_pred = xgb.predict(x_test)

In [36]:
# Coefficient of determination (R^2) of the predictions on the held-out test set.
r2_score(y_true=y_test, y_pred=y_pred)

0.8494808147571992