In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression

from warnings import filterwarnings
# Silences every warning category for the rest of the session, including
# deprecation and future-behaviour warnings raised by pandas and scikit-learn.
# NOTE(review): consider narrowing this to specific categories/modules so that
# warnings about upcoming behaviour changes remain visible.
filterwarnings('ignore')

In [2]:
# Load the raw car listings into the working DataFrame used by every cell below.
# The path is relative, so the CSV must sit in the notebook's working directory.
csv_path = 'car data car_dekho.csv'
df = pd.read_csv(csv_path)

In [3]:
# Preview the first five rows to sanity-check column names and value formats.
df.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [4]:
# Print column dtypes and non-null counts to check for missing values and to
# see which columns are still strings (object dtype) and need encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Kms_Driven     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Seller_Type    301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB


In [5]:
# Summary statistics for every column: include='all' adds count/unique/top/freq
# for the string columns alongside the usual numeric statistics.
df.describe(include='all')

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
count,301,301.0,301.0,301.0,301.0,301,301,301,301.0
unique,98,,,,,3,2,2,
top,city,,,,,Petrol,Dealer,Manual,
freq,26,,,,,239,195,261,
mean,,2013.627907,4.661296,7.628472,36947.20598,,,,0.043189
std,,2.891554,5.082812,8.644115,38886.883882,,,,0.247915
min,,2003.0,0.1,0.32,500.0,,,,0.0
25%,,2012.0,0.9,1.2,15000.0,,,,0.0
50%,,2014.0,3.6,6.4,32000.0,,,,0.0
75%,,2016.0,6.0,9.9,48767.0,,,,0.0


In [6]:
# Frequency of each Fuel_Type label — lists the categories that the integer
# encoding of this column has to cover.
df['Fuel_Type'].value_counts()

Fuel_Type
Petrol    239
Diesel     60
CNG         2
Name: count, dtype: int64

In [7]:
# Encode Fuel_Type as integer codes: Petrol=0, Diesel=1, CNG=2.
#
# The encoded Series is assigned back to the column instead of calling
# replace(..., inplace=True) on df['Fuel_Type']. That chained in-place call
# operates on a temporary Series, and under pandas Copy-on-Write (the default
# from pandas 3.0) it leaves df unchanged without raising.
#
# astype('int64') pins a numeric dtype independent of the pandas version's
# downcasting rules, and raises immediately if a label outside the mapping is
# still present. Re-running the cell is a no-op because the integer codes are
# not keys of the mapping.
#
# NOTE(review): 0/1/2 imposes an ordering on a nominal feature; one-hot
# encoding may suit a linear model better — confirm intent.
fuel_codes = {'Petrol': 0, 'Diesel': 1, 'CNG': 2}
df['Fuel_Type'] = df['Fuel_Type'].replace(fuel_codes).astype('int64')

In [8]:
# Frequency of each Seller_Type label — lists the categories that the integer
# encoding of this column has to cover.
df['Seller_Type'].value_counts()

Seller_Type
Dealer        195
Individual    106
Name: count, dtype: int64

In [9]:
# Encode Seller_Type as a binary flag: Dealer=1, Individual=0.
#
# The encoded Series is assigned back to the column instead of calling
# replace(..., inplace=True) on df['Seller_Type']. That chained in-place call
# operates on a temporary Series, and under pandas Copy-on-Write (the default
# from pandas 3.0) it leaves df unchanged without raising.
#
# astype('int64') pins a numeric dtype independent of the pandas version's
# downcasting rules, and raises immediately if an unmapped label remains.
# Re-running the cell is a no-op because 0/1 are not keys of the mapping.
seller_codes = {'Dealer': 1, 'Individual': 0}
df['Seller_Type'] = df['Seller_Type'].replace(seller_codes).astype('int64')

In [10]:
# Frequency of each Transmission label — lists the categories that the integer
# encoding of this column has to cover.
df['Transmission'].value_counts()

Transmission
Manual       261
Automatic     40
Name: count, dtype: int64

In [11]:
# Encode Transmission as a binary flag: Manual=1, Automatic=0.
#
# The encoded Series is assigned back to the column instead of calling
# replace(..., inplace=True) on df['Transmission']. That chained in-place call
# operates on a temporary Series, and under pandas Copy-on-Write (the default
# from pandas 3.0) it leaves df unchanged without raising.
#
# astype('int64') pins a numeric dtype independent of the pandas version's
# downcasting rules, and raises immediately if an unmapped label remains.
# Re-running the cell is a no-op because 0/1 are not keys of the mapping.
transmission_codes = {'Manual': 1, 'Automatic': 0}
df['Transmission'] = df['Transmission'].replace(transmission_codes).astype('int64')

In [12]:
# Pairwise correlation matrix; numeric_only=True restricts it to numeric
# columns, so any column that is still a string (e.g. Car_Name) is left out.
df.corr(numeric_only=True)

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
Year,1.0,0.236141,-0.047584,-0.524342,0.053643,0.039896,0.000394,-0.182104
Selling_Price,0.236141,1.0,0.878983,0.029187,0.509467,0.550724,-0.367128,-0.088344
Present_Price,-0.047584,0.878983,1.0,0.203647,0.440415,0.51203,-0.348715,0.008057
Kms_Driven,-0.524342,0.029187,0.203647,1.0,0.166801,0.101419,-0.16251,0.089216
Fuel_Type,0.053643,0.509467,0.440415,0.166801,1.0,0.352415,-0.080466,-0.055705
Seller_Type,0.039896,0.550724,0.51203,0.101419,0.352415,1.0,-0.06324,-0.124269
Transmission,0.000394,-0.367128,-0.348715,-0.16251,-0.080466,-0.06324,1.0,-0.050316
Owner,-0.182104,-0.088344,0.008057,0.089216,-0.055705,-0.124269,-0.050316,1.0


In [13]:
# Remove the columns that are not used as model inputs: Car_Name (a string
# label) and Owner.
# NOTE(review): Owner is presumably dropped for its weak correlation with the
# price columns — confirm that is the intended rationale.
#
# The result is reassigned rather than dropped with inplace=True, and
# errors='ignore' tolerates already-missing columns, so re-running this cell
# does not raise KeyError. `columns=` already selects the axis, so the
# redundant axis=1 argument is omitted.
df = df.drop(columns=['Car_Name', 'Owner'], errors='ignore')

In [14]:
# Separate the regression target from the feature matrix.
# NOTE(review): the target here is Present_Price and Selling_Price stays in X
# as a predictor. If the goal is to estimate a resale price, the roles are
# likely reversed — confirm which column the model is meant to predict.
target_col = 'Present_Price'
y = df[target_col]
X = df.drop(columns=[target_col])

In [15]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=20, test_size=0.2)

In [16]:
# Report (train, test) shapes — features on the first line, target on the
# second — to confirm the split sizes line up.
for train_part, test_part in ((X_train, X_test), (y_train, y_test)):
    print(train_part.shape, test_part.shape)

(240, 6) (61, 6)
(240,) (61,)


In [17]:
# Fit an ordinary least-squares regression on the training split.
# fit() returns the estimator itself, so construction and fitting can be
# chained; the trailing expression displays the fitted estimator.
model = LinearRegression().fit(X_train, y_train)
model

In [18]:
# Predict the target for the held-out rows; these predictions feed the
# evaluation metrics below.
y_pred = model.predict(X=X_test)

In [19]:
# Coefficient of determination (R²) on the held-out split.
r_sqr = r2_score(y_true=y_test, y_pred=y_pred)
r_sqr

0.8971184291530079

In [20]:
# R² on the training split, for comparison with the held-out R² in r_sqr.
# NOTE(review): a test score above the training score can occur by chance with
# a small test set — consider cross-validation before trusting either number.
model.score(X_train,y_train)

0.8319735552817265

In [21]:
# Adjusted R² on the test split: 1 - (1 - R²) * (n - 1) / (n - p - 1),
# where n is the number of test rows and p the number of feature columns.
# It penalises R² for the number of predictors used.
n_obs, n_feats = X_test.shape
shrink = (1 - r_sqr) * (n_obs - 1) / (n_obs - n_feats - 1)
adj_r2 = 1 - shrink
adj_r2

0.8856871435033421

In [22]:
# Persist the fitted estimator to disk so it can be reloaded without retraining.
# Only unpickle this file from a trusted location: loading a pickle can execute
# arbitrary code.
with open("lr_model.pkl", mode="wb") as model_file:
    pickle.dump(model, model_file)
