In [1]:
import numpy as np 
import pandas as pd 

from sklearn import preprocessing
import matplotlib.pyplot as plt 
plt.rc("font", size=14)
import seaborn as sns
sns.set(style="white") #white background style for seaborn plots
sns.set(style="whitegrid", color_codes=True)

import warnings
warnings.simplefilter(action='ignore')

In [2]:
data = pd.read_csv("https://raw.githubusercontent.com/abdullabasim/dataset/main/car_evaluation.csv", header=None)


In [3]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       1728 non-null   object
 1   1       1728 non-null   object
 2   2       1728 non-null   object
 3   3       1728 non-null   object
 4   4       1728 non-null   object
 5   5       1728 non-null   object
 6   6       1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [5]:
col_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']


data.columns = col_names

In [6]:
data.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [7]:
print("buying : ",data['buying'].unique())
print("maint : ",data['maint'].unique())
print("doors : ",data['doors'].unique())
print("persons : ",data['persons'].unique())
print("lug_boot : ",data['lug_boot'].unique())
print("safety : ",data['safety'].unique())
print("class : ",data['class'].unique())

buying :  ['vhigh' 'high' 'med' 'low']
maint :  ['vhigh' 'high' 'med' 'low']
doors :  ['2' '3' '4' '5more']
persons :  ['2' '4' 'more']
lug_boot :  ['small' 'med' 'big']
safety :  ['low' 'med' 'high']
class :  ['unacc' 'acc' 'vgood' 'good']


In [8]:
from sklearn.preprocessing import OrdinalEncoder


encoder = OrdinalEncoder()


# X = encoder.fit_transform(X)

data = encoder.fit_transform(data)

#X_test = encoder.transform(X_test)

data = pd.DataFrame(data, columns = col_names)


In [9]:
X = data.drop(['class'], axis=1)

y = data['class']

In [None]:
#split X and y into training and testing sets

# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
# Applying Grid Search to find the best model and the best parameters
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

classifier = DecisionTreeClassifier( random_state = 14)

parameters = {'splitter' : ['best', 'random'],
'criterion' : ['gini', 'entropy'],
'max_features': ['log2', 'sqrt','auto'],
'max_depth': [2, 3, 5, 10, 17],
'min_samples_split': [2, 3, 5, 7, 9],
'min_samples_leaf': [1,5,8,11],
'random_state' : [0,1,2,3,4,5]
}
grid_search = GridSearchCV(estimator = classifier,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)
grid_search.fit(X,y)
# grid_search.fit(X_train,y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

Best Accuracy: 83.68 %
Best Parameters: {'criterion': 'gini', 'max_depth': 17, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'random_state': 4, 'splitter': 'best'}


In [None]:
# import DecisionTreeClassifier

from sklearn.tree import DecisionTreeClassifier

# instantiate the DecisionTreeClassifier model with criterion gini index

classifier = DecisionTreeClassifier(criterion='gini', max_depth=17,max_features ='log2',min_samples_leaf =1 ,min_samples_split=4,splitter='best', random_state= 4)


# fit the model
classifier.fit(X,y)

In [None]:
# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X = X, y = y, cv = 20)
print(accuracies)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

[0.77011494 0.85057471 0.79310345 0.93103448 0.88505747 0.91954023
 0.7816092  0.94252874 0.94186047 0.6627907  0.77906977 0.75581395
 0.79069767 0.8255814  0.95348837 0.86046512 0.31395349 0.26744186
 0.79069767 0.95348837]
Accuracy: 78.84 %
Standard Deviation: 18.34 %
