<a href="https://colab.research.google.com/github/arnavk09/jupyterNotebooks_py/blob/master/car_price_predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing Dependencies:

In [139]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from os import pipe
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
# Turn off pandas' chained-assignment warning (pandas' default setting is 'warn').
pd.set_option('mode.chained_assignment', None)

In [140]:
## Load the raw listings and preview the first rows
raw_csv_path = '/content/quikr_cars.csv'
car = pd.read_csv(raw_csv_path)
car.head()

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,"45,000 kms",Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40 kms,Diesel
2,Maruti Suzuki Alto 800 Vxi,Maruti,2018,Ask For Price,"22,000 kms",Petrol
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,"28,000 kms",Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,"36,000 kms",Diesel


## Cleaning the dataset (Data Preprocessing and Outlier removal)

In [141]:
## year contains many non-year (non-numeric) values!
## year is stored as object instead of integer!
## Price contains undesirable values (e.g. "Ask For Price")
## kms_driven has undesirable/non-int/NaN values
## name is too detailed: keep only its first 3 words
## There is an outlier in car['Price'] (a very expensive listing) that may cause undesirable results, and must be removed

In [142]:
## Keep only the rows whose year is a purely numeric string
year_is_numeric = car['year'].str.isnumeric()
car = car[year_is_numeric]

In [143]:
## Sanity check: the distinct year values left after filtering (no more undesirable values)
car.loc[:, 'year'].unique()

array(['2007', '2006', '2018', '2014', '2015', '2012', '2013', '2016',
       '2010', '2017', '2008', '2011', '2019', '2009', '2005', '2000',
       '2003', '2004', '1995', '2002', '2001'], dtype=object)

In [144]:
## year: object -> int
car['year'] = car['year'].astype(int)

## Price: drop the "Ask For Price" listings, then strip the commas and cast to int
has_price = car['Price'] != 'Ask For Price'
car = car[has_price]
price_digits = car['Price'].str.replace(',','')
car['Price'] = price_digits.astype(int)

In [145]:
## kms_driven: keep the first whitespace-separated token and remove its commas
## (e.g. "45,000 kms" -> "45000")
first_token = car['kms_driven'].str.split().str.get(0)
car['kms_driven'] = first_token.str.replace(',','')

In [146]:
## Keep only rows whose kms_driven token is numeric, then cast the column to int
kms_is_numeric = car['kms_driven'].str.isnumeric()
car = car[kms_is_numeric]
car['kms_driven'] = car['kms_driven'].astype(int)

In [147]:
## Keep only the rows where fuel_type is not null, then report the remaining shape
fuel_type_missing = car['fuel_type'].isna()
car = car[~fuel_type_missing]
car.shape

(816, 6)

In [148]:
## Reduce name to its first 3 space-separated words
name_words = car['name'].str.split(' ')
car['name'] = name_words.str.slice(start=0,stop=3).str.join(' ')

In [149]:
## The filters above removed rows, leaving gaps in the index.
## Renumber the rows; drop=True discards the old index instead of keeping it as a column.
car=car.reset_index(drop=True)

In [150]:
## Display the cleaned data (before outlier removal)
car

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,80000,45000,Petrol
1,Mahindra Jeep CL550,Mahindra,2006,425000,40,Diesel
2,Hyundai Grand i10,Hyundai,2014,325000,28000,Petrol
3,Ford EcoSport Titanium,Ford,2014,575000,36000,Diesel
4,Ford Figo,Ford,2012,175000,41000,Diesel
...,...,...,...,...,...,...
811,Maruti Suzuki Ritz,Maruti,2011,270000,50000,Petrol
812,Tata Indica V2,Tata,2009,110000,30000,Diesel
813,Toyota Corolla Altis,Toyota,2009,300000,132000,Petrol
814,Tata Zest XM,Tata,2018,260000,27000,Diesel


In [151]:
## Outliers: listings priced above 6e6
is_outlier = car['Price'] > 6e6
car[is_outlier]

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
534,Mahindra XUV500 W6,Mahindra,2014,8500003,45000,Diesel


In [152]:
## Removing the outlier: keep listings priced strictly below 6e6 and renumber the rows.
## Note the strict "<": a price of exactly 6e6 would be dropped here as well.
below_cutoff = car['Price'] < 6e6
car = car[below_cutoff].reset_index(drop=True)

In [153]:
## Display the data after outlier removal
car

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,80000,45000,Petrol
1,Mahindra Jeep CL550,Mahindra,2006,425000,40,Diesel
2,Hyundai Grand i10,Hyundai,2014,325000,28000,Petrol
3,Ford EcoSport Titanium,Ford,2014,575000,36000,Diesel
4,Ford Figo,Ford,2012,175000,41000,Diesel
...,...,...,...,...,...,...
810,Maruti Suzuki Ritz,Maruti,2011,270000,50000,Petrol
811,Tata Indica V2,Tata,2009,110000,30000,Diesel
812,Toyota Corolla Altis,Toyota,2009,300000,132000,Petrol
813,Tata Zest XM,Tata,2018,260000,27000,Diesel


In [154]:
## Save the cleaned data for reuse outside this notebook.
## index=False: the index is only the 0..n-1 row numbering produced by reset_index(drop=True);
## writing it would add an extra unnamed column that comes back as "Unnamed: 0" on read_csv.
car.to_csv('CleanedCar.csv', index=False)

## Defining Model

In [155]:
## Features: every column except Price. Target: Price.
feature_columns = ['name','company','year','kms_driven','fuel_type']
X = car[feature_columns]
y = car['Price']
X

Unnamed: 0,name,company,year,kms_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,45000,Petrol
1,Mahindra Jeep CL550,Mahindra,2006,40,Diesel
2,Hyundai Grand i10,Hyundai,2014,28000,Petrol
3,Ford EcoSport Titanium,Ford,2014,36000,Diesel
4,Ford Figo,Ford,2012,41000,Diesel
...,...,...,...,...,...
810,Maruti Suzuki Ritz,Maruti,2011,50000,Petrol
811,Tata Indica V2,Tata,2009,30000,Diesel
812,Toyota Corolla Altis,Toyota,2009,132000,Petrol
813,Tata Zest XM,Tata,2018,27000,Diesel


In [169]:
## Initial 80/20 train/test split (no random_state given, so it differs between runs)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## Fit an encoder on the categorical columns of the full X so that its
## categories_ lists every value present in the data
categorical_X = X[['name','company','fuel_type']]
ohe = OneHotEncoder(handle_unknown='ignore')
ohe.fit(categorical_X)

OneHotEncoder(handle_unknown='ignore')

In [170]:
## One-hot encode the categorical columns using the categories learned from the full
## data set; the remaining columns are passed through unchanged.
## NOTE(review): unlike `ohe`, this encoder is not given handle_unknown='ignore' --
## confirm whether unseen categories at predict time are meant to raise.
pipeline_encoder = OneHotEncoder(categories=ohe.categories_)
column_trans = make_column_transformer(
    (pipeline_encoder, ['name','company','fuel_type']),
    remainder='passthrough',
)

In [171]:
## Linear regression estimator with default settings; used as the final pipeline step
lr=LinearRegression()

In [172]:
## Chain the column transformer and the regressor so that one-hot encoding happens
## inside the pipeline itself: it takes the raw feature columns as input.
## Note: this rebinds the name `pipe`, which the import cell also bound via `from os import pipe`.
pipe=make_pipeline(column_trans,lr)

In [173]:
## Fit the encoder + regression pipeline on the training split
pipe.fit(X_train,y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(categories=[array(['Audi A3 Cabriolet', 'Audi A4 1.8', 'Audi A4 2.0', 'Audi A6 2.0',
       'Audi A8', 'Audi Q3 2.0', 'Audi Q5 2.0', 'Audi Q7', 'BMW 3 Series',
       'BMW 5 Series', 'BMW 7 Series', 'BMW X1', 'BMW X1 sDrive20d',
       'BMW X1 xDrive20d', 'Chevrolet Beat', 'Chevrolet Beat...
                                                                            array(['Audi', 'BMW', 'Chevrolet', 'Datsun', 'Fiat', 'Force', 'Ford',
       'Hindustan', 'Honda', 'Hyundai', 'Jaguar', 'Jeep', 'Land',
       'Mahindra', 'Maruti', 'Mercedes', 'Mini', 'Mitsubishi', 'Nissan',
       'Renault', 'Skoda', 'Tata', 'Toyota', 'Volkswagen', 'Volvo'],
      dtype=object),
                                                                            array(['Diesel', 'LPG', 'Pe

In [174]:
## Predict prices for the held-out test split
y_pred=pipe.predict(X_test)

In [162]:
## Refit a fresh pipeline on 1000 different 80/20 splits (random_state = 0..999)
## and record the test-split R2 of each; scores[i] belongs to random_state=i.
## NOTE(review): choosing the seed by its test-split score uses the test data for
## selection, so the best score is likely optimistic -- consider cross-validation.
scores = []
for i in range(1000):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=i
    )
    lr = LinearRegression()
    pipe = make_pipeline(column_trans, lr)
    y_pred = pipe.fit(X_train, y_train).predict(X_test)
    scores.append(r2_score(y_test, y_pred))

In [175]:
## Position of the highest R2 score; since scores[i] was produced with random_state=i,
## this is also the seed of the best-scoring split
np.argmax(scores)

661

In [176]:
## The highest R2 score found across the splits
best_index = np.argmax(scores)
scores[best_index]

0.889770931767991

In [177]:
## Recreate the best-scoring split and refit, so that `pipe` holds the model
## trained on that split; the last line shows its test-split R2.
best_seed = np.argmax(scores)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=best_seed
)
lr = LinearRegression()
pipe = make_pipeline(column_trans, lr)
y_pred = pipe.fit(X_train, y_train).predict(X_test)
r2_score(y_test, y_pred)

0.889770931767991

In [178]:
import pickle

In [179]:
## Serialize the fitted pipeline to disk.
## The context manager closes (and therefore flushes) the file as soon as the dump
## finishes; a bare open(...) passed inline would leave the handle open.
with open('LinearRegressionModel.pkl','wb') as model_file:
    pickle.dump(pipe, model_file)

In [182]:
## Predict the price of a single hand-written listing; the columns must match
## the feature columns the pipeline was trained on.
sample_listing = pd.DataFrame(
    [['Maruti Suzuki Swift','Maruti',2022,100,'Petrol']],
    columns=['name','company','year','kms_driven','fuel_type'],
)
pipe.predict(sample_listing)

array([456940.21313918])