# Importing Libraries:

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [2]:
cars_data = pd.read_csv(r"D:\Data Is Good\Projects\car price prediction\Car details v3.csv")
cars_data.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0


In [3]:
# We have to decide which columns to keep. We can see torque can be removed. So we will drop that column.

cars_data.drop(columns = ['torque'], inplace = True)

In [4]:
cars_data.shape

(8128, 12)

# Preprocessing:

In [5]:
# Checking null values:

pd.DataFrame({
    'Column': cars_data.columns,
    'Null count': cars_data.isnull().sum().values,
    'Null %tage': 100*cars_data.isnull().sum().values/cars_data.shape[0]
})

Unnamed: 0,Column,Null count,Null %tage
0,name,0,0.0
1,year,0,0.0
2,selling_price,0,0.0
3,km_driven,0,0.0
4,fuel,0,0.0
5,seller_type,0,0.0
6,transmission,0,0.0
7,owner,0,0.0
8,mileage,221,2.718996
9,engine,221,2.718996


In [6]:
# since we have 8k records and only 200 odd records are missing, we will drop those records
cars_data.dropna(inplace = True)

In [7]:
cars_data.shape
# We can see we have dropped about 800 records

(7907, 12)

In [8]:
# Checking duplicate values:

cars_data.duplicated().sum()

1189

In [9]:
cars_data.drop_duplicates(inplace=True)

cars_data.shape

(6718, 12)

## Data Analysis:

In [10]:
cars_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6718 entries, 0 to 8125
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           6718 non-null   object 
 1   year           6718 non-null   int64  
 2   selling_price  6718 non-null   int64  
 3   km_driven      6718 non-null   int64  
 4   fuel           6718 non-null   object 
 5   seller_type    6718 non-null   object 
 6   transmission   6718 non-null   object 
 7   owner          6718 non-null   object 
 8   mileage        6718 non-null   object 
 9   engine         6718 non-null   object 
 10  max_power      6718 non-null   object 
 11  seats          6718 non-null   float64
dtypes: float64(1), int64(3), object(8)
memory usage: 682.3+ KB


In [11]:
a = 1_000_000_000
b = 2_000

c = a*b

print(c)

2000000000000


In [12]:
print(f'{c:,}') #adding separator

2,000,000,000,000


In [13]:
# Checking unique value counts and values in each column:

for i in cars_data.columns:
    if cars_data[i].dtype == 'O':
        print("The unique values in {} column are:".format(i))
        print(cars_data[i].nunique())
        print()
        print(cars_data[i].unique())
        print("++++++======================================++++++++")

The unique values in name column are:
1983

['Maruti Swift Dzire VDI' 'Skoda Rapid 1.5 TDI Ambition'
 'Honda City 2017-2020 EXi' ... 'Tata Nexon 1.5 Revotorq XT'
 'Ford Freestyle Titanium Plus Diesel BSIV'
 'Toyota Innova 2.5 GX (Diesel) 8 Seater BS IV']
The unique values in fuel column are:
4

['Diesel' 'Petrol' 'LPG' 'CNG']
The unique values in seller_type column are:
3

['Individual' 'Dealer' 'Trustmark Dealer']
The unique values in transmission column are:
2

['Manual' 'Automatic']
The unique values in owner column are:
5

['First Owner' 'Second Owner' 'Third Owner' 'Fourth & Above Owner'
 'Test Drive Car']
The unique values in mileage column are:
393

['23.4 kmpl' '21.14 kmpl' '17.7 kmpl' '23.0 kmpl' '16.1 kmpl' '20.14 kmpl'
 '17.3 km/kg' '23.59 kmpl' '20.0 kmpl' '19.01 kmpl' '17.3 kmpl'
 '19.3 kmpl' '18.9 kmpl' '18.15 kmpl' '24.52 kmpl' '19.7 kmpl'
 '22.54 kmpl' '21.0 kmpl' '25.5 kmpl' '26.59 kmpl' '21.5 kmpl' '20.3 kmpl'
 '21.4 kmpl' '24.7 kmpl' '18.2 kmpl' '16.8 kmpl' '24.3 kmp

# Data Preprocessing:

## Name Columns:

In [14]:
# Since the name column has large number of unique values, we need to work on the column.
# We can see the first word in the name is brand
# We will extract that name and then we will process it to feed into ML Model

cars_data['name'] = cars_data['name'].apply(lambda x : x.split(' ')[0].strip())

In [15]:
cars_data['name'].value_counts()

name
Maruti           2090
Hyundai          1214
Mahindra          709
Tata              633
Honda             361
Ford              353
Toyota            324
Chevrolet         216
Renault           206
Volkswagen        173
Nissan             73
Skoda              69
Datsun             57
Mercedes-Benz      46
BMW                45
Fiat               39
Audi               33
Jeep               22
Mitsubishi         11
Volvo               9
Jaguar              8
Isuzu               4
Ambassador          4
Force               4
Land                3
Kia                 3
Daewoo              3
MG                  3
Ashok               1
Lexus               1
Opel                1
Name: count, dtype: int64

## Mileage Column:
* The column contains values like 24 kmpl or 17 km/kg.
* We need to remove the suffixes.

In [16]:
cars_data['mileage'] = cars_data['mileage'].apply(lambda x: x.split(' ')[0].strip())
cars_data['mileage'] = cars_data['mileage'].astype(float)

## Engine Column:
* Removing CC.

In [17]:
cars_data['engine'] = cars_data['engine'].apply(lambda x: x.split()[0].strip())
cars_data['engine'] = cars_data['engine'].astype(float)

## Max Power Column:

* Removing BHP.

In [18]:
# cars_data['max_power1'] = cars_data['max_power'].apply(lambda x : x.split(' ')[0].strip())
# # cars_data['max_power'] = cars_data['max_power1'].astype(float)
# cars_data['max_power'] = cars_data['max_power1']
# cars_data.drop(columns = ['max_power1'], inplace = True)

In [19]:
# cars_data[cars_data['max_power'] == '']['max_power'] = 0

In [20]:
def clean_d(x):
    val = x.split(' ')[0]
    if val == '':
        return 0
    else:
        return float(val)

cars_data['max_power'] = cars_data['max_power'].apply(clean_d)

## Processing categorical columns:
* Since model can't handle categorical columns directly. We need to work on them.
* Here, we will simply assign numbers to the values. Numbers will start at 1,2,3,...

In [21]:
cars_data.columns

Index(['name', 'year', 'selling_price', 'km_driven', 'fuel', 'seller_type',
       'transmission', 'owner', 'mileage', 'engine', 'max_power', 'seats'],
      dtype='object')

In [22]:
cols1 = ['name','fuel','seller_type','transmission','owner',]

for i in cols1:
    cars_data[i].replace(cars_data[i].unique(), list(range(1,cars_data[i].nunique()+1)), inplace = True)


In [42]:
for i in cols1:
    print(i)
    print(dict(zip(cars_data[i].unique(), list(range(1,cars_data[i].nunique()+1)))))
    print("++++++++++=================++++++++++")

name
{1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31}
fuel
{1: 1, 2: 2, 3: 3, 4: 4}
seller_type
{1: 1, 2: 2, 3: 3}
transmission
{1: 1, 2: 2}
owner
{1: 1, 2: 2, 3: 3, 4: 4, 5: 5}


In [43]:
dict(zip(cars_data['name'].unique(), range(1,cars_data['name'].nunique()+1)))

{1: 1,
 2: 2,
 3: 3,
 4: 4,
 5: 5,
 6: 6,
 7: 7,
 8: 8,
 9: 9,
 10: 10,
 11: 11,
 12: 12,
 13: 13,
 14: 14,
 15: 15,
 16: 16,
 17: 17,
 18: 18,
 19: 19,
 20: 20,
 21: 21,
 22: 22,
 23: 23,
 24: 24,
 25: 25,
 26: 26,
 27: 27,
 28: 28,
 29: 29,
 30: 30,
 31: 31}

In [23]:
# Reseting index:
cars_data.reset_index()

Unnamed: 0,index,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,0,1,2014,450000,145500,1,1,1,1,23.40,1248.0,74.00,5.0
1,1,2,2014,370000,120000,1,1,1,2,21.14,1498.0,103.52,5.0
2,2,3,2006,158000,140000,2,1,1,3,17.70,1497.0,78.00,5.0
3,3,4,2010,225000,127000,1,1,1,1,23.00,1396.0,90.00,5.0
4,4,1,2007,130000,120000,2,1,1,1,16.10,1298.0,88.20,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6713,8121,1,2013,260000,50000,2,1,1,2,18.90,998.0,67.10,5.0
6714,8122,4,2014,475000,80000,1,1,1,2,22.54,1396.0,88.73,5.0
6715,8123,4,2013,320000,110000,2,1,1,1,18.50,1197.0,82.85,5.0
6716,8124,4,2007,135000,119000,1,1,1,4,16.80,1493.0,110.00,5.0


In [24]:
# Splitting data between input and output.

input_data = cars_data.drop(columns = ['selling_price'])
output_data = cars_data['selling_price']

In [25]:
# Splitting in training and testing data:

xtrain, xtest, ytrain, ytest = train_test_split(input_data, output_data, test_size = 0.2)

# Creating Model:

In [26]:
# Linear Regression object
model = LinearRegression()


In [27]:
# training model
model.fit(xtrain, ytrain)

In [28]:
# Predicting the values:
predict = model.predict(xtest)

In [29]:
# Checking the model with some sample values:

# Creating input dataframe:

In [30]:
xtrain.head(1)

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
1058,1,2018,41779,2,2,1,1,21.21,1197.0,81.8,5.0


In [31]:
xtrain.iloc[0]

name                1.00
year             2018.00
km_driven       41779.00
fuel                2.00
seller_type         2.00
transmission        1.00
owner               1.00
mileage            21.21
engine           1197.00
max_power          81.80
seats               5.00
Name: 1058, dtype: float64

In [32]:
xtrain.iloc[0].values

array([1.0000e+00, 2.0180e+03, 4.1779e+04, 2.0000e+00, 2.0000e+00,
       1.0000e+00, 1.0000e+00, 2.1210e+01, 1.1970e+03, 8.1800e+01,
       5.0000e+00])

In [33]:
sample_values = [1,2020,125500,1,1,1,1,23.4,1248.0,74.0,5.0]

input_df = pd.DataFrame([sample_values], columns = xtrain.columns)
input_df

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,1,2020,125500,1,1,1,1,23.4,1248.0,74.0,5.0


In [34]:
model.predict(input_df) # playing around the values in 'sample_values', like changing the year, or mileage

array([594043.3129383])

In [35]:
# In order to deply as web application we need to save this model. 
# We will create a pickle dump for the same.

import pickle as pk

In [46]:
xtrain.columns

Index(['name', 'year', 'km_driven', 'fuel', 'seller_type', 'transmission',
       'owner', 'mileage', 'engine', 'max_power', 'seats'],
      dtype='object')

In [36]:
pk.dump(model, open('model.pkl', 'wb'))

In [37]:
# The web application is created in VS Code.