Importing the Libraries

In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")


In [14]:
# loading the data from csv file to pandas dataframe
car_dataset = pd.read_csv('car data.csv')

In [16]:
df = car_dataset.copy()

In [17]:
# inspecting the first 5 rows of the dataframe
df.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [18]:
# checking the number of rows and columns
df.shape

(301, 9)

In [19]:
# getting some information about the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Kms_Driven     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Seller_Type    301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB


In [20]:
# checking the number of missing values
df.isnull().sum()

Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Kms_Driven       0
Fuel_Type        0
Seller_Type      0
Transmission     0
Owner            0
dtype: int64

# checking the distribution of categorical data


In [21]:
df['Fuel_Type'].value_counts()

Fuel_Type
Petrol    239
Diesel     60
CNG         2
Name: count, dtype: int64

In [22]:
df['Seller_Type'].value_counts()


Seller_Type
Dealer        195
Individual    106
Name: count, dtype: int64

In [23]:
df['Transmission'].value_counts()


Transmission
Manual       261
Automatic     40
Name: count, dtype: int64

Encoding the Categorical Data

In [24]:
# encoding "Fuel_Type" Column
df.replace({'Fuel_Type':{'Petrol':0,'Diesel':1,'CNG':2}},inplace=True)

# encoding "Seller_Type" Column
df.replace({'Seller_Type':{'Dealer':0,'Individual':1}},inplace=True)

# encoding "Transmission" Column
df.replace({'Transmission':{'Manual':0,'Automatic':1}},inplace=True)

In [25]:
df.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,0,0,0,0
1,sx4,2013,4.75,9.54,43000,1,0,0,0
2,ciaz,2017,7.25,9.85,6900,0,0,0,0
3,wagon r,2011,2.85,4.15,5200,0,0,0,0
4,swift,2014,4.6,6.87,42450,1,0,0,0


Splitting the data and Target column & droping the car name column

In [30]:
X = df.drop(['Car_Name','Selling_Price'],axis=1)
y = df['Selling_Price']

In [28]:
X

Unnamed: 0,Year,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,2014,5.59,27000,0,0,0,0
1,2013,9.54,43000,1,0,0,0
2,2017,9.85,6900,0,0,0,0
3,2011,4.15,5200,0,0,0,0
4,2014,6.87,42450,1,0,0,0
...,...,...,...,...,...,...,...
296,2016,11.60,33988,1,0,0,0
297,2015,5.90,60000,0,0,0,0
298,2009,11.00,87934,0,0,0,0
299,2017,12.50,9000,1,0,0,0


In [31]:
y

0       3.35
1       4.75
2       7.25
3       2.85
4       4.60
       ...  
296     9.50
297     4.00
298     3.35
299    11.50
300     5.30
Name: Selling_Price, Length: 301, dtype: float64

Splitting Training and Test data

In [None]:
# importing train_test_split

from sklearn.model_selection import train_test_split


In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=2)

# importing the algorithms for model 

In [33]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error


In [34]:
lr = LinearRegression()
lr2 = LinearRegression()
dtr = DecisionTreeRegressor()
rfr = RandomForestRegressor(n_estimators=200)
abr = AdaBoostRegressor(estimator=lr2, n_estimators=200)

In [35]:
d = []
for i in [lr, dtr, rfr, abr]:
    i.fit(X_train, y_train)
    y_pr = i.predict(X_test)
    r2 = r2_score(y_test, y_pr)
    mse = mean_squared_error(y_test, y_pr)
    d.append({"Model":i, "Score":r2, "Mean_squared Error":mse})

In [36]:
d

[{'Model': LinearRegression(),
  'Score': 0.8401532365378493,
  'Mean_squared Error': 2.935823428931178},
 {'Model': DecisionTreeRegressor(),
  'Score': 0.9354976268923872,
  'Mean_squared Error': 1.1846819672131144},
 {'Model': RandomForestRegressor(n_estimators=200),
  'Score': 0.961440513741074,
  'Mean_squared Error': 0.7082022852049158},
 {'Model': AdaBoostRegressor(estimator=LinearRegression(), n_estimators=200),
  'Score': 0.8271877170384022,
  'Mean_squared Error': 3.1739544682484313}]

In [37]:
model_details = pd.DataFrame(d)

In [38]:
model_details

Unnamed: 0,Model,Score,Mean_squared Error
0,LinearRegression(),0.840153,2.935823
1,DecisionTreeRegressor(),0.935498,1.184682
2,"(DecisionTreeRegressor(max_features=1.0, rando...",0.961441,0.708202
3,"(LinearRegression(), LinearRegression(), Linea...",0.827188,3.173954


# we get Decision tree regressor more model score

# Model Save

In [39]:
import joblib

In [40]:
joblib.dump(dtr, "Car_Price_prediction_project_joblib") 

['Car_Price_prediction_project_joblib']

In [41]:
dtr_jl = joblib.load("Car_Price_prediction_project_joblib")

In [43]:
dtr_jl.predict(X_test)

array([ 9.25,  0.65,  4.  ,  3.1 ,  8.65,  5.3 ,  2.65,  7.45,  0.25,
        5.8 ,  7.5 ,  4.6 ,  0.25,  8.65,  2.  ,  0.75,  0.65,  0.48,
       11.75,  4.15,  1.35,  5.9 ,  0.48,  9.15,  0.5 ,  7.25,  0.6 ,
        0.2 ,  3.1 ,  2.  ,  0.48,  3.35,  6.  ,  8.65,  0.55,  4.75,
        8.75,  6.6 ,  7.5 ,  4.95, 14.9 ,  0.78,  0.4 ,  0.6 ,  8.25,
        9.1 ,  0.3 ,  4.75, 18.  ,  2.5 ,  7.25,  0.6 ,  7.45,  0.6 ,
        0.75,  0.55,  0.42,  3.8 ,  0.6 ,  0.2 ,  5.9 ])