In [1]:
from matplotlib import pyplot as plt
from pandas import read_csv
from pandas.plotting import scatter_matrix
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

In [3]:
# Read the CarDekho used-car listings CSV into a DataFrame.
csv_path = "../car_price_prediction_model/CAR DETAILS FROM CAR DEKHO.csv"
dataset = read_csv(csv_path)

In [14]:
# Preview the first ten rows to sanity-check the load.
dataset.head(10)

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner
5,Maruti Alto LX BSIII,2007,140000,125000,Petrol,Individual,Manual,First Owner
6,Hyundai Xcent 1.2 Kappa S,2016,550000,25000,Petrol,Individual,Manual,First Owner
7,Tata Indigo Grand Petrol,2014,240000,60000,Petrol,Individual,Manual,Second Owner
8,Hyundai Creta 1.6 VTVT S,2015,850000,25000,Petrol,Individual,Manual,First Owner
9,Maruti Celerio Green VXI,2017,365000,78000,CNG,Individual,Manual,First Owner


In [15]:
# Dimensions of the frame as a (rows, columns) tuple.
dataset.shape

(4340, 8)

In [16]:
# Summarise column names, non-null counts and dtypes.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4340 entries, 0 to 4339
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           4340 non-null   object
 1   year           4340 non-null   int64 
 2   selling_price  4340 non-null   int64 
 3   km_driven      4340 non-null   int64 
 4   fuel           4340 non-null   object
 5   seller_type    4340 non-null   object
 6   transmission   4340 non-null   object
 7   owner          4340 non-null   object
dtypes: int64(3), object(5)
memory usage: 271.4+ KB


In [44]:
# Show how often each label occurs in every categorical column.
for column in ("fuel", "seller_type", "transmission", "owner"):
    print(dataset[column].value_counts())

fuel
1           2153
0           2123
2             40
3             23
Electric       1
Name: count, dtype: int64
seller_type
1    3244
0     994
2     102
Name: count, dtype: int64
transmission
0    3892
1     448
Name: count, dtype: int64
owner
0                 2832
1                 1106
2                  304
3                   81
Test Drive Car      17
Name: count, dtype: int64


In [45]:
#encode the categorical data
# Integer code assigned to each label of every categorical column.
category_codes = {
    "seller_type": {"Dealer": 0, "Individual": 1, "Trustmark Dealer": 2},
    "transmission": {"Manual": 0, "Automatic": 1},
    "fuel": {"Petrol": 0, "Diesel": 1, "CNG": 2, "LPG": 3, "Electric": 4},
    "owner": {
        "First Owner": 0,
        "Second Owner": 1,
        "Third Owner": 2,
        "Fourth & Above Owner": 3,
        "Test Drive Car": 4,
    },
}

for column, codes in category_codes.items():
    # Series.map with a lookup function avoids the deprecated silent
    # downcasting of DataFrame.replace. Values that are not labels (i.e. codes
    # written by an earlier run of this cell) pass through unchanged, so the
    # cell is safe to re-run.
    dataset[column] = dataset[column].map(lambda label: codes.get(label, label))

    # Fail loudly on labels the table does not cover instead of leaving
    # strings in a column the models expect to be numeric.
    unmapped = set(dataset[column].unique()) - set(codes.values())
    if unmapped:
        raise ValueError(
            "Unmapped values in column %r: %r" % (column, sorted(map(str, unmapped)))
        )

dataset.head()

  dataset.replace({"fuel":{"Petrol":0, "Diesel":1, "CNG":2, "LPG":3, "Electric":4}}, inplace = True)
  dataset.replace({"owner":{"First Owner":0, "Second Owner":1, "Third Owner":2, "Fourth & Above Owner":3, "Test Drive Car":4}}, inplace = True)


Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,0,1,0,0
1,Maruti Wagon R LXI Minor,2007,135000,50000,0,1,0,0
2,Hyundai Verna 1.6 SX,2012,600000,100000,1,1,0,0
3,Datsun RediGO T Option,2017,250000,46000,0,1,0,0
4,Honda Amaze VX i-DTEC,2014,450000,141000,1,1,0,1


In [46]:
# Separate the target from the predictors, then hold out 20% of the rows
# for validation (fixed seed so the split is reproducible).
y = dataset["selling_price"]
x = dataset.drop(columns=["name", "selling_price"])
X_train, X_validation, Y_train, Y_validation = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=1,
    shuffle=True,
)

In [47]:
#Build the models
models = []
models.append(("LR", LogisticRegression(solver= "liblinear", multi_class="ovr")))
models.append(("LDA", LinearDiscriminantAnalysis()))
models.append(("KNN", KNeighborsClassifier()))
models.append(("CART", DecisionTreeClassifier()))
models.append(("NB", GaussianNB()))
models.append(("SVM", SVC(gamma="auto")))

In [48]:
# evaluate each model
results = []
names = []
for name, model in models:
	kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
	cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
	results.append(cv_results)
	names.append(name)
	print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))



LR: 0.039458 (0.001306)
LDA: 0.070559 (0.010285)




KNN: 0.113194 (0.014067)




CART: 0.213998 (0.015302)




NB: 0.047234 (0.008364)




SVM: 0.167341 (0.013318)
