# Car classification problem

In [204]:
import pandas as pd
import numpy as np

In [205]:
# Read the car evaluation data from disk into a DataFrame.
cars = pd.read_csv(
    filepath_or_buffer='data/Car-Data-Set/Car Data Set/car.data',
)
# Leave the frame as the cell's last expression so it is rendered.
cars

Unnamed: 0,buying,maint,door,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
...,...,...,...,...,...,...,...
1723,low,low,5more,more,med,med,good
1724,low,low,5more,more,med,high,vgood
1725,low,low,5more,more,big,low,unacc
1726,low,low,5more,more,big,med,good


In [206]:
# Summarise the frame: column names, non-null counts and dtypes.
cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1728 non-null   object
 1   maint     1728 non-null   object
 2   door      1728 non-null   object
 3   persons   1728 non-null   object
 4   lug_boot  1728 non-null   object
 5   safety    1728 non-null   object
 6   class     1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [207]:
# Count the missing values in each column (`isnull` is an alias of `isna`).
cars.isnull().sum()

buying      0
maint       0
door        0
persons     0
lug_boot    0
safety      0
class       0
dtype: int64

# Convert to numerical data

In [208]:
# Iterating over a DataFrame yields its column labels, so print the
# frequency of every distinct value column by column.
for column_name in cars.columns:
    column_counts = cars[column_name].value_counts()
    print(column_counts)

vhigh    432
high     432
low      432
med      432
Name: buying, dtype: int64
vhigh    432
high     432
low      432
med      432
Name: maint, dtype: int64
2        432
3        432
4        432
5more    405
more      27
Name: door, dtype: int64
2       576
more    576
4       576
Name: persons, dtype: int64
big      576
med      576
small    576
Name: lug_boot, dtype: int64
high    576
low     576
med     576
Name: safety, dtype: int64
unacc    1210
acc       384
good       69
vgood      65
Name: class, dtype: int64


This shows that all of our columns hold categorical data.

We have no missing data, but if we look at the values, some are text rather than numbers: 'more' in the persons column and '5more' in the door column.
Let's assume that a car can take a maximum of 5 people, so we replace 'more' with 5, and that a car can have a maximum of 5 doors, so we replace '5more' with 5.

In [209]:
# Map the open-ended categories to the assumed maximum of 5. The values stay
# strings so the columns remain categorical.
# The result is assigned back rather than using `replace(..., inplace=True)`
# on a column selection: that is chained assignment, which warns in pandas 2.x
# and leaves `cars` unchanged once copy-on-write is enabled.
cars['persons'] = cars['persons'].replace('more', '5')
cars['door'] = cars['door'].replace('5more', '5')
# NOTE(review): the value counts printed earlier also show 27 'more' entries in
# the `door` column that this mapping does not touch -- confirm whether they
# should be mapped to '5' as well.

In [210]:
# Display the frame again to inspect the `door` and `persons` values.
cars

Unnamed: 0,buying,maint,door,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
...,...,...,...,...,...,...,...
1723,low,low,5,5,med,med,good
1724,low,low,5,5,med,high,vgood
1725,low,low,5,5,big,low,unacc
1726,low,low,5,5,big,med,good


In [211]:
# Re-check the column summary (non-null counts and dtypes).
cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1728 non-null   object
 1   maint     1728 non-null   object
 2   door      1728 non-null   object
 3   persons   1728 non-null   object
 4   lug_boot  1728 non-null   object
 5   safety    1728 non-null   object
 6   class     1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [212]:
# split into X(Feature) and y(Label)
X = cars.drop('class', axis=1)
y = np.array(cars['class'])

Now we convert the features to numerical data using `pd.get_dummies`.

In [213]:
# One-hot encode the six categorical feature columns into dummy columns.
X = pd.get_dummies(
    cars.loc[:, ['buying', 'maint', 'door', 'persons', 'lug_boot', 'safety']]
)
# Show the last five encoded rows, transposed so each dummy column is a row.
X.tail().T

Unnamed: 0,1723,1724,1725,1726,1727
buying_high,0,0,0,0,0
buying_low,1,1,1,1,1
buying_med,0,0,0,0,0
buying_vhigh,0,0,0,0,0
maint_high,0,0,0,0,0
maint_low,1,1,1,1,1
maint_med,0,0,0,0,0
maint_vhigh,0,0,0,0,0
door_2,0,0,0,0,0
door_3,0,0,0,0,0


# Fit and Train the model

Following the sklearn algorithm map https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html, let's try a LinearSVC model.

In [214]:
# split into train and test set
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

model = LinearSVC()

model.fit(X_train, y_train)
model.score(X_test, y_test)

0.8872832369942196

We get about 88.7% accuracy. Let's try another algorithm to see if we can improve our score.

In [215]:
# Predict labels for the test split with the fitted LinearSVC model.
y_preds = model.predict(X_test)
# Preview the first 20 predictions.
y_preds[:20]

array(['acc', 'acc', 'unacc', 'acc', 'acc', 'unacc', 'unacc', 'unacc',
       'unacc', 'unacc', 'unacc', 'acc', 'acc', 'unacc', 'acc', 'unacc',
       'unacc', 'acc', 'acc', 'unacc'], dtype=object)

In [216]:
# First 20 true labels of the test split, for comparison with the predictions.
y_test[:20]

array(['acc', 'acc', 'unacc', 'acc', 'vgood', 'unacc', 'unacc', 'unacc',
       'unacc', 'unacc', 'unacc', 'vgood', 'acc', 'unacc', 'acc', 'unacc',
       'unacc', 'acc', 'acc', 'unacc'], dtype=object)

The next algorithm to try, according to the sklearn algorithm map https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html, is the KNeighborsClassifier model.

In [217]:
from sklearn.neighbors import KNeighborsClassifier

# Build a k-nearest-neighbours classifier with default settings and fit it on
# the training split (`fit` returns the estimator itself).
clf = KNeighborsClassifier().fit(X_train, y_train)
# Score the fitted classifier on the held-out test split.
clf.score(X_test, y_test)

0.8670520231213873

We get a lower score of about 86.7%.

For the sake of trying, let's try another classification algorithm (RandomForestClassifier) according to sklearn algorithm map https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

In [218]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so the fitted model, its score and the pickle saved at the
# end of the notebook are reproducible across runs.
rclf = RandomForestClassifier(random_state=42)

rclf.fit(X_train, y_train)

# Mean accuracy on the held-out test split.
rclf.score(X_test, y_test)

0.9450867052023122

We get a better accuracy of about 94.5% using the RandomForestClassifier algorithm.

In [221]:
# Predict labels for the test split with the fitted random forest.
rclf_preds = rclf.predict(X_test)
# Preview the first 20 predictions.
rclf_preds[:20]

array(['acc', 'acc', 'unacc', 'unacc', 'vgood', 'unacc', 'unacc', 'unacc',
       'unacc', 'unacc', 'unacc', 'vgood', 'unacc', 'unacc', 'acc',
       'unacc', 'unacc', 'acc', 'acc', 'unacc'], dtype=object)

In [222]:
# Walk the random-forest predictions and the true test labels in lockstep and
# print each pair side by side.
for predicted_label, actual_label in zip(rclf_preds, y_test):
    print(f'Predictions: {predicted_label}, Actual: {actual_label}')

Predictions: acc, Actual: acc
Predictions: acc, Actual: acc
Predictions: unacc, Actual: unacc
Predictions: unacc, Actual: acc
Predictions: vgood, Actual: vgood
Predictions: unacc, Actual: unacc
Predictions: unacc, Actual: unacc
Predictions: unacc, Actual: unacc
Predictions: unacc, Actual: unacc
Predictions: unacc, Actual: unacc
Predictions: unacc, Actual: unacc
Predictions: vgood, Actual: vgood
Predictions: unacc, Actual: acc
Predictions: unacc, Actual: unacc
Predictions: acc, Actual: acc
Predictions: unacc, Actual: unacc
Predictions: unacc, Actual: unacc
Predictions: acc, Actual: acc
Predictions: acc, Actual: acc
Predictions: unacc, Actual: unacc
Predictions: unacc, Actual: unacc
Predictions: acc, Actual: acc
Predictions: acc, Actual: acc
Predictions: unacc, Actual: unacc
Predictions: unacc, Actual: unacc
Predictions: unacc, Actual: unacc
Predictions: acc, Actual: acc
Predictions: unacc, Actual: unacc
Predictions: acc, Actual: acc
Predictions: unacc, Actual: acc
Predictions: unacc, Ac

In [223]:
# Per-class probability estimates for every test row. Per the scikit-learn
# docs, the columns follow the order of `rclf.classes_`.
rclf.predict_proba(X_test)

array([[0.85, 0.01, 0.14, 0.  ],
       [0.66, 0.2 , 0.14, 0.  ],
       [0.02, 0.  , 0.98, 0.  ],
       ...,
       [0.02, 0.  , 0.98, 0.  ],
       [0.  , 0.  , 1.  , 0.  ],
       [0.01, 0.  , 0.98, 0.01]])

# Save model

In [227]:
import pickle

# Persist the fitted random forest. The context manager makes sure the file
# handle is flushed and closed right after the dump instead of being left open.
with open('car_classification_model.pkl', 'wb') as model_file:
    pickle.dump(rclf, model_file)