# Car Evaluation Analysis Using Decision Tree Classifier

In [67]:
import numpy as np 
import pandas as pd 


## **Import The dataset**

In [19]:
df = pd.read_csv('car_evaluation.csv')
df.head()

Unnamed: 0,vhigh,vhigh.1,2,2.1,small,low,unacc
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc


In [58]:
col_names = ['buying_price', 'maint_cost', 'doors_no', 'persons', 'lug_size', 'safety', 'class']


df.columns = col_names

col_names

['buying_price',
 'maint_cost',
 'doors_no',
 'persons',
 'lug_size',
 'safety',
 'class']

In [59]:
df.head()

Unnamed: 0,buying_price,maint_cost,doors_no,persons,lug_size,safety,class
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc


**Data set info:**

1.  Buying Price:  v-high, high, med, low
2.   Maintenance: v-high, high, med, low
3. Doors : 2, 3, 4, 5-more
4. Person : 2, 4, more
5. Size of luggage capacity: small, med, high
6. safety: low, med, high






## **Check null vales**

In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1727 entries, 0 to 1726
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   buying_price  1727 non-null   object
 1   maint_cost    1727 non-null   object
 2   doors_no      1727 non-null   object
 3   persons       1727 non-null   object
 4   lug_size      1727 non-null   object
 5   safety        1727 non-null   object
 6   class         1727 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [61]:
df.isnull().sum()

buying_price    0
maint_cost      0
doors_no        0
persons         0
lug_size        0
safety          0
class           0
dtype: int64

## **Explore class variable¶**

In [62]:
df['class'].value_counts()

unacc    1209
acc       384
good       69
vgood      65
Name: class, dtype: int64

## **Declare feature vector and target variable**


In [63]:
y = df.loc[: , 'class'] 

In [64]:
x = df.drop(['class'], axis = 1 )

In [65]:
x

Unnamed: 0,buying_price,maint_cost,doors_no,persons,lug_size,safety
0,vhigh,vhigh,2,2,small,med
1,vhigh,vhigh,2,2,small,high
2,vhigh,vhigh,2,2,med,low
3,vhigh,vhigh,2,2,med,med
4,vhigh,vhigh,2,2,med,high
...,...,...,...,...,...,...
1722,low,low,5more,more,med,med
1723,low,low,5more,more,med,high
1724,low,low,5more,more,big,low
1725,low,low,5more,more,big,med


## **Dummy Encoding**

In [66]:
features = pd.get_dummies(x, columns= ['buying_price', 'maint_cost', 'doors_no', 'persons', 'lug_size', 'safety'])
features

Unnamed: 0,buying_price_high,buying_price_low,buying_price_med,buying_price_vhigh,maint_cost_high,maint_cost_low,maint_cost_med,maint_cost_vhigh,doors_no_2,doors_no_3,...,doors_no_5more,persons_2,persons_4,persons_more,lug_size_big,lug_size_med,lug_size_small,safety_high,safety_low,safety_med
0,0,0,0,1,0,0,0,1,1,0,...,0,1,0,0,0,0,1,0,0,1
1,0,0,0,1,0,0,0,1,1,0,...,0,1,0,0,0,0,1,1,0,0
2,0,0,0,1,0,0,0,1,1,0,...,0,1,0,0,0,1,0,0,1,0
3,0,0,0,1,0,0,0,1,1,0,...,0,1,0,0,0,1,0,0,0,1
4,0,0,0,1,0,0,0,1,1,0,...,0,1,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1722,0,1,0,0,0,1,0,0,0,0,...,1,0,0,1,0,1,0,0,0,1
1723,0,1,0,0,0,1,0,0,0,0,...,1,0,0,1,0,1,0,1,0,0
1724,0,1,0,0,0,1,0,0,0,0,...,1,0,0,1,1,0,0,0,1,0
1725,0,1,0,0,0,1,0,0,0,0,...,1,0,0,1,1,0,0,0,0,1


# Split data into separate training and test sets

In [52]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, y, test_size=0.2)

In [53]:
from sklearn import tree 
clf = tree.DecisionTreeClassifier(max_depth = 6, min_samples_split=5 , min_samples_leaf=2)
clf.fit(X_train , y_train)

DecisionTreeClassifier(max_depth=6, min_samples_leaf=2, min_samples_split=5)

In [54]:
y_train_predict = clf.predict(X_train)
y_test_predict = clf.predict(X_test)
from sklearn.metrics import accuracy_score
train_acc= accuracy_score(y_train, y_train_predict)
test_acc= accuracy_score(y_test ,y_test_predict )

In [55]:
print('train accuracy = ' , train_acc)
print('test accuracy = ' , test_acc)

train accuracy =  0.8819695872556119
test accuracy =  0.8815028901734104
