In [1]:
import pandas as pd
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt

In [2]:
df=pd.read_csv("car_evaluation.csv")

In [3]:
df.head()

Unnamed: 0,vhigh,vhigh.1,2,2.1,small,low,unacc
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc


In [4]:
df.rename({"vhigh":"buying_price","vhigh.1":"maintenance_cost","2":"number_of_doors","2.1":"number_of_persons","small":"lug_boot","low":"safety","unacc":"decision"},axis=1,inplace=True)

In [5]:
df.head()

Unnamed: 0,buying_price,maintenance_cost,number_of_doors,number_of_persons,lug_boot,safety,decision
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1727 entries, 0 to 1726
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   buying_price       1727 non-null   object
 1   maintenance_cost   1727 non-null   object
 2   number_of_doors    1727 non-null   object
 3   number_of_persons  1727 non-null   object
 4   lug_boot           1727 non-null   object
 5   safety             1727 non-null   object
 6   decision           1727 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [7]:
df.isnull().sum()

buying_price         0
maintenance_cost     0
number_of_doors      0
number_of_persons    0
lug_boot             0
safety               0
decision             0
dtype: int64

In [8]:
df.describe()

Unnamed: 0,buying_price,maintenance_cost,number_of_doors,number_of_persons,lug_boot,safety,decision
count,1727,1727,1727,1727,1727,1727,1727
unique,4,4,4,3,3,3,4
top,high,high,3,4,med,med,unacc
freq,432,432,432,576,576,576,1209


In [9]:
df["number_of_doors"].value_counts()

number_of_doors
3        432
4        432
5more    432
2        431
Name: count, dtype: int64

In [10]:
df["number_of_persons"].value_counts()

number_of_persons
4       576
more    576
2       575
Name: count, dtype: int64

In [11]:
df["number_of_doors"]=df["number_of_doors"].replace("5more","5")

In [12]:
df["number_of_persons"]=df["number_of_persons"].replace("more","5")

In [13]:
df["number_of_doors"]=df["number_of_doors"].astype(int)

In [14]:
df["number_of_persons"]=df["number_of_persons"].astype(int)

In [15]:
df.describe()

Unnamed: 0,number_of_doors,number_of_persons
count,1727.0,1727.0
mean,3.500869,3.667632
std,1.118098,1.247296
min,2.0,2.0
25%,3.0,2.0
50%,4.0,4.0
75%,4.5,5.0
max,5.0,5.0


In [16]:
from sklearn.model_selection import train_test_split

In [17]:
X=df.drop(columns="decision")
y=df["decision"]

In [18]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=14)

In [19]:


from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline



In [38]:


categorical_cols = ["buying_price", "maintenance_cost", "lug_boot", "safety"]
numerical_cols = ["number_of_doors", "number_of_persons"]

ordinal_encoder = OrdinalEncoder(categories = [
    ["low", "med", "high", "vhigh"], #buying
    ["low", "med", "high", "vhigh"], #maint
    ["small", "med", "big"], #lug_boot
    ["low", "med", "high"] #safety
])

preprocessor = ColumnTransformer( transformers= [
    ('transformation_name_doesnt_matter', ordinal_encoder, categorical_cols),
], remainder="passthrough")

X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)



In [39]:
pd.DataFrame(X_train_transformed)[5].unique()

array([4., 2., 5.])

In [40]:
from sklearn.tree import DecisionTreeClassifier

In [41]:
tree=DecisionTreeClassifier(max_depth=3,random_state=0,criterion="gini")

In [42]:
tree.fit(X_train_transformed,y_train)

In [43]:
y_pred=tree.predict(X_test_transformed)

In [44]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [45]:
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))


0.7708333333333334
              precision    recall  f1-score   support

         acc       0.52      0.59      0.55       103
        good       0.00      0.00      0.00        19
       unacc       0.87      0.94      0.90       289
       vgood       0.00      0.00      0.00        21

    accuracy                           0.77       432
   macro avg       0.35      0.38      0.36       432
weighted avg       0.70      0.77      0.74       432

[[ 61   0  42   0]
 [ 19   0   0   0]
 [ 17   0 272   0]
 [ 21   0   0   0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [46]:
from sklearn.model_selection import GridSearchCV

In [47]:
params={
    "criterion" : ["gini", "entropy", "log_loss"],
    "splitter":["best", "random"],
    "max_depth":[3,5,7,9,11,13]
}

In [48]:
grid=GridSearchCV(estimator=DecisionTreeClassifier(),cv=5,param_grid=params,scoring="accuracy")

In [49]:
grid.fit(X_train_transformed,y_train)


In [50]:
grid.best_params_

{'criterion': 'gini', 'max_depth': 11, 'splitter': 'best'}

In [52]:
y_preds=grid.predict(X_test_transformed)

In [53]:
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))


0.7708333333333334
              precision    recall  f1-score   support

         acc       0.52      0.59      0.55       103
        good       0.00      0.00      0.00        19
       unacc       0.87      0.94      0.90       289
       vgood       0.00      0.00      0.00        21

    accuracy                           0.77       432
   macro avg       0.35      0.38      0.36       432
weighted avg       0.70      0.77      0.74       432

[[ 61   0  42   0]
 [ 19   0   0   0]
 [ 17   0 272   0]
 [ 21   0   0   0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
