# Car Prices Predictor

In this notebook, I will be predicting car prices using a Kaggle Data Set and regression models. 

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
from IPython.display import display
import seaborn as sns
from sklearn_pandas import DataFrameMapper
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import LabelBinarizer, StandardScaler, LabelEncoder,PolynomialFeatures, MultiLabelBinarizer
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.preprocessing import FunctionTransformer
from sklearn_pandas import CategoricalImputer
from sklearn.dummy import DummyRegressor
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn.linear_model import Lasso, Ridge, LassoCV, RidgeCV
import copy
from sklearn.svm import SVC
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
import pickle
import warnings
warnings.filterwarnings('ignore')

In [2]:
cars = pd.read_csv("data/train-data.csv")

In [3]:
cars = cars.dropna(subset = ["Power", "Mileage", "Engine", "Seats"])

In [4]:
cars.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5975 entries, 0 to 6018
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         5975 non-null   int64  
 1   Name               5975 non-null   object 
 2   Location           5975 non-null   object 
 3   Year               5975 non-null   int64  
 4   Kilometers_Driven  5975 non-null   int64  
 5   Fuel_Type          5975 non-null   object 
 6   Transmission       5975 non-null   object 
 7   Owner_Type         5975 non-null   object 
 8   Mileage            5975 non-null   object 
 9   Engine             5975 non-null   object 
 10  Power              5975 non-null   object 
 11  Seats              5975 non-null   float64
 12  New_Price          823 non-null    object 
 13  Price              5975 non-null   float64
dtypes: float64(2), int64(3), object(9)
memory usage: 700.2+ KB


In [5]:
del cars['New_Price']

In [6]:
cars.head()

Unnamed: 0.1,Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,1.75
1,1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,12.5
2,2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,4.5
3,3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,6.0
4,4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,17.74


In [7]:
np.sort(cars["Seats"].unique())

array([ 0.,  2.,  4.,  5.,  6.,  7.,  8.,  9., 10.])

In [8]:
cars = cars[cars.Power != 'null bhp']

In [9]:
cars["Price"] = cars["Price"]*100000/54

In [10]:
cars["Brand"] = cars["Name"].str.split().str[0]

In [11]:
cars["Brand"] = cars["Brand"].str.lower()

In [12]:
cars["Age"] = 2020 - cars["Year"]

In [13]:
brandlist = cars["Brand"].unique()

In [14]:
def clean(column):
    cars[column] = cars[column].str.split(" ").str[0]

In [15]:
cars["Seats"] = cars["Seats"].astype(int)

In [16]:
clean("Mileage")
clean("Engine")
clean("Power")

In [17]:
cars["Mileage"] = cars["Mileage"].str.split(".").str[0].astype(int)

In [18]:
cars["Power"] = cars["Power"].str.split(".").str[0].astype(int)

In [19]:
cars["Engine"] = cars["Engine"].astype(int)

In [20]:
cars.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5872 entries, 0 to 6018
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         5872 non-null   int64  
 1   Name               5872 non-null   object 
 2   Location           5872 non-null   object 
 3   Year               5872 non-null   int64  
 4   Kilometers_Driven  5872 non-null   int64  
 5   Fuel_Type          5872 non-null   object 
 6   Transmission       5872 non-null   object 
 7   Owner_Type         5872 non-null   object 
 8   Mileage            5872 non-null   int64  
 9   Engine             5872 non-null   int64  
 10  Power              5872 non-null   int64  
 11  Seats              5872 non-null   int64  
 12  Price              5872 non-null   float64
 13  Brand              5872 non-null   object 
 14  Age                5872 non-null   int64  
dtypes: float64(1), int64(8), object(6)
memory usage: 734.0+ KB


In [21]:
y = cars['Price']

X = cars.drop(['Name', "Price", 'Unnamed: 0', 'Year'], axis = 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 42)

In [22]:
mapper = DataFrameMapper([
    ('Location', LabelBinarizer()),
    (['Kilometers_Driven'], StandardScaler()),
    ('Fuel_Type', LabelBinarizer()),
    ('Transmission', LabelEncoder()),
    ('Owner_Type', LabelBinarizer()),
    (['Mileage'], StandardScaler()),
    (['Engine'], StandardScaler()),
    (['Power'], StandardScaler()),
    ('Seats', None),
    ('Brand', LabelBinarizer()),
    ('Age', None),
], df_out=True)

In [23]:
cars = mapper.fit_transform(cars)

In [24]:
pd.set_option('display.max_columns', 100)

In [25]:
Z_train = mapper.fit_transform(X_train)
Z_test = mapper.transform(X_test)

In [26]:
import xgboost as xgb

In [27]:
data_dmatrix = xgb.DMatrix(data=Z_train,label=y_train)

In [28]:
xg_reg = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.5,
                max_depth = 25, alpha = 25, n_estimators = 50)

In [29]:
xg_reg.fit(Z_train,y_train)



XGBRegressor(alpha=25, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.3, gamma=0,
       importance_type='gain', learning_rate=0.5, max_delta_step=0,
       max_depth=25, min_child_weight=1, missing=None, n_estimators=50,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=1, verbosity=1)

In [30]:
xg_reg.score(Z_train,y_train)

0.9999728286195684

In [31]:
xg_reg.score(Z_test,y_test)

0.8639555024678095

In [32]:
y_testpred = xg_reg.predict(Z_test)

In [33]:
xb_params = {
    'max_depth = 25':[25,50,75],
    'n_estimators':[10,20,30,40,50]}

In [34]:
xb_gridsearch = GridSearchCV(xgb.XGBRegressor(), xb_params, cv=5, verbose=1, n_jobs=-1)

xb_gridsearch.fit(Z_train, y_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:    9.0s finished




GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=1, verbosity=1),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth = 25': [25, 50, 75], 'n_estimators': [10, 20, 30, 40, 50]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [35]:
xb_gridsearch.score(Z_train, y_train)

0.9196854037201292

In [36]:
xb_gridsearch.score(Z_test, y_test)

0.8463643034542491

In [37]:
pipe = make_pipeline(mapper, xg_reg)
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)
pickle.dump(pipe, open("pipe.pkl", "wb"))



In [38]:
del pipe

In [39]:
pipe = pickle.load(open("pipe.pkl", "rb"))



In [40]:
new_data = pd.DataFrame({
    'Location': ['Mumbai'],
    'Kilometers_Driven': [72000],
    'Fuel_Type': ['CNG'],
    'Transmission': ['Manual'],
    'Owner_Type': ['First'],
    "Mileage": [26.6],
    'Engine': [998],
    'Power': [58.16],
    'Seats': [5.0],
    'Brand': ['maruti'],
    'Age': [10]
    
})

In [41]:
prediction = pipe.predict(new_data)

In [42]:
np.round(prediction, 2)[0]

3561.96