### Importing libraries

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw car listings; every later cleaning cell rebinds `df`.
df = pd.read_csv(filepath_or_buffer="./Car_Price_Prediction.csv")

### Data Cleaning

Some problems with the data

- Year has many irrelevant values
- Convert year to int
- Remove irrelevant values from Price
- Convert Price to int
- kms_driven has words 'kms'
- Convert kms_driven to int
- fuel_type has NaN values
- keep first 3 words of name

In [3]:
# Snapshot of the frame as loaded, taken before any cleaning step rebinds `df`.
df1 = df.copy(deep=True)

In [4]:
# Keep only rows whose 'year' is a purely numeric string (the column has many
# irrelevant values). The explicit .copy() makes the filtered frame
# independent, so the column assignment in the next cell does not raise
# SettingWithCopyWarning on pandas versions without copy-on-write.
df = df[df['year'].str.isnumeric()].copy()

In [5]:
# 'year' now holds only digit strings, so the integer cast cannot fail.
year_as_int = df['year'].astype(int)
df['year'] = year_as_int

In [6]:
# Drop listings whose price is the "Ask For Price" placeholder instead of a
# number. The explicit .copy() makes the result an independent frame, so the
# Price / kms_driven assignments in the following cells do not raise
# SettingWithCopyWarning on pandas versions without copy-on-write.
df = df[df['Price'] != "Ask For Price"].copy()

In [7]:
# Strip the thousands separators so the price strings become plain digits.
price_digits = df['Price'].str.replace(',', '', regex=False)
df['Price'] = price_digits
# Display the cleaned column (still dtype object at this point).
df['Price']

0       80000
1      425000
3      325000
4      575000
6      175000
        ...  
886    300000
888    260000
889    390000
890    180000
891    160000
Name: Price, Length: 819, dtype: object

In [8]:
# Prices are comma-free digit strings after the previous cell; cast to int.
price_as_int = df['Price'].astype(int)
df['Price'] = price_as_int

In [9]:
# Keep the text before the first space (dropping the trailing 'kms' word),
# then remove the thousands separators from it.
kms_first_token = df['kms_driven'].str.split(' ').str.get(0)
df['kms_driven'] = kms_first_token.str.replace(',', '', regex=False)

In [10]:
# Keep only rows where the parsed kms_driven token is purely numeric. The
# explicit .copy() makes the filtered frame independent, so the integer cast
# assigned back in the next cell does not raise SettingWithCopyWarning on
# pandas versions without copy-on-write.
df = df[df['kms_driven'].str.isnumeric()].copy()

In [11]:
# Only digit strings remain in kms_driven, so the integer cast is safe.
kms_as_int = df['kms_driven'].astype(int)
df['kms_driven'] = kms_as_int
# Summarise dtypes and non-null counts to spot remaining missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 817 entries, 0 to 889
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        817 non-null    object
 1   company     817 non-null    object
 2   year        817 non-null    int32 
 3   Price       817 non-null    int32 
 4   kms_driven  817 non-null    int32 
 5   fuel_type   816 non-null    object
dtypes: int32(3), object(3)
memory usage: 35.1+ KB


In [12]:
# Drop rows with a missing fuel_type. The explicit .copy() makes the result
# an independent frame, so the 'name' assignment in the next cell does not
# raise SettingWithCopyWarning on pandas versions without copy-on-write.
df = df[df['fuel_type'].notna()].copy()

In [13]:
# Shorten each car name to its first three space-separated words.
name_tokens = df['name'].str.split(' ')
df['name'] = name_tokens.str[:3].str.join(' ')

In [14]:
# reset_index returns a new frame rather than modifying `df`, so the result
# must be assigned back; otherwise `df` keeps the gapped index left behind by
# the row filters while the displayed frame misleadingly shows a clean one.
df = df.reset_index(drop=True)
# Display the cleaned frame.
df

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,80000,45000,Petrol
1,Mahindra Jeep CL550,Mahindra,2006,425000,40,Diesel
2,Hyundai Grand i10,Hyundai,2014,325000,28000,Petrol
3,Ford EcoSport Titanium,Ford,2014,575000,36000,Diesel
4,Ford Figo,Ford,2012,175000,41000,Diesel
...,...,...,...,...,...,...
811,Maruti Suzuki Ritz,Maruti,2011,270000,50000,Petrol
812,Tata Indica V2,Tata,2009,110000,30000,Diesel
813,Toyota Corolla Altis,Toyota,2009,300000,132000,Petrol
814,Tata Zest XM,Tata,2018,260000,27000,Diesel


### Outliers Removal 

In [15]:
# Treat listings priced at 6,000,000 or more as outliers and drop them, then
# renumber the rows from zero.
below_price_cap = df['Price'] < 6_000_000
df = df[below_price_cap].reset_index(drop=True)

In [16]:
# Persist the cleaned data next to the notebook.
# NOTE(review): with the default index=True the row index is also written as
# an unnamed first column -- confirm whether downstream readers rely on it
# before switching to index=False.
df.to_csv(path_or_buf='Cleaned_Car_Data.csv')

### Building Model

In [17]:
# Target is the listing price; every remaining column is a feature.
Y = df['Price']
X = df.drop(columns=['Price'])

In [18]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation. The seed is fixed so that this
# baseline split -- and the R^2 reported for it below -- is reproducible on
# Restart & Run All; without it every run draws a different split.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [19]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

In [20]:
# Learn the complete category vocabulary from all rows of X; the resulting
# `ohe.categories_` is handed to the encoder inside the column transformer.
categorical_cols = ['name', 'company', 'fuel_type']
ohe = OneHotEncoder()
ohe.fit(X[categorical_cols])

In [21]:
# One-hot encode the categorical columns using the category lists already
# learned by `ohe`; the remaining (numeric) columns pass through unchanged.
categorical_cols = ['name', 'company', 'fuel_type']
fixed_vocab_encoder = OneHotEncoder(categories=ohe.categories_)
column_trans = make_column_transformer(
    (fixed_vocab_encoder, categorical_cols),
    remainder='passthrough',
)

In [22]:
# Linear regression estimator with default settings; it becomes the final
# step of the pipeline assembled in the next cell.
lr = LinearRegression()

In [23]:
# Chain the column transformer and the regressor so raw feature frames can be
# passed directly to fit/predict.
pipe = make_pipeline(column_trans, lr)

In [24]:
# Fit the encoding step and the regressor on the training split only.
pipe.fit(X_train,  Y_train)

In [25]:
# Predict prices for the held-out rows.
Y_pred = pipe.predict(X_test)

In [26]:
# R^2 of the predictions on the held-out split; shown as the cell output.
r2_score(Y_test, Y_pred)

0.5725226569685651

In [27]:
# Refit the pipeline on 1000 different train/test splits (seeds 0..999) and
# record the held-out R^2 of each, so scores[k] belongs to random_state=k.
# NOTE(review): a later cell picks the seed with the highest test-split R^2,
# which makes that score an optimistic estimate -- consider cross-validation
# for an unbiased figure.
scores = []
for seed in range(1000):
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.2, random_state=seed
    )
    lr = LinearRegression()
    pipe = make_pipeline(column_trans, lr).fit(X_train, Y_train)
    Y_pred = pipe.predict(X_test)
    scores.append(r2_score(Y_test, Y_pred))

In [28]:
# Highest held-out R^2 seen across the seed sweep.
best_seed = np.argmax(scores)
scores[best_seed]

0.8456891236558007

In [29]:
# Rebuild the split that scored best in the seed sweep and refit the final
# model on it; the index into `scores` is the random_state used for that run.
best_seed = np.argmax(scores)
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=best_seed
)
lr = LinearRegression()
pipe = make_pipeline(column_trans, lr)
pipe.fit(X_train, Y_train)
Y_pred = pipe.predict(X_test)
# Held-out R^2 of the refitted model (cell output).
r2_score(Y_test, Y_pred)

0.8456891236558007

In [30]:
import pickle

In [31]:
# Serialise the fitted pipeline to disk. The context manager guarantees the
# file handle is flushed and closed even if pickling fails; the bare open()
# call previously left the handle open for the lifetime of the kernel.
with open('LinearRegressionModel.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)

In [33]:
# Smoke-test the fitted pipeline on a single hand-written listing; the column
# order matches the feature frame the model was trained on.
sample_car = pd.DataFrame({
    'name': ['Maruti Suzuki Swift'],
    'company': ['Maruti'],
    'year': [2019],
    'kms_driven': [1000],
    'fuel_type': ['Petrol'],
})
pipe.predict(sample_car)

array([458566.05861598])