In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the car-evaluation dataset. header=None tells pandas the CSV has no
# header row, so the columns get integer labels until they are renamed.
df_car=pd.read_csv('car_evaluation.csv', header=None)
df_car.head()

Unnamed: 0,0,1,2,3,4,5,6
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


## Exploratory Analysis

In [4]:
# Preview the first 10 rows to get a feel for the raw values
df_car.head(10)

Unnamed: 0,0,1,2,3,4,5,6
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
5,vhigh,vhigh,2,2,med,high,unacc
6,vhigh,vhigh,2,2,big,low,unacc
7,vhigh,vhigh,2,2,big,med,unacc
8,vhigh,vhigh,2,2,big,high,unacc
9,vhigh,vhigh,2,4,small,low,unacc


In [5]:
# Check the dataset dimensions as (rows, columns)
print(df_car.shape)

(1728, 7)


In [6]:
# Replace the integer column labels with descriptive names;
# the last column ('class') is the target variable.
col_names = [
    'buying',
    'maint',
    'doors',
    'persons',
    'lug_boot',
    'safety',
    'class',
]
df_car.columns = col_names
df_car.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [7]:
# Inspect the dtype pandas inferred for each column
df_car.dtypes


buying      object
maint       object
doors       object
persons     object
lug_boot    object
safety      object
class       object
dtype: object

In [8]:
# Count the rows per target class to see how balanced the labels are
df_car['class'].value_counts()

class
unacc    1210
acc       384
good       69
vgood      65
Name: count, dtype: int64

In [9]:
# Count the missing entries in each column
df_car.isna().sum()

buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
class       0
dtype: int64

## Data Processing


In [10]:
# Split the frame into the label vector y (the 'class' column) and the
# feature matrix X (every remaining column)
y = df_car['class']
X = df_car.drop(columns=['class'])

In [11]:
# Hold-out split utility from scikit-learn
from sklearn.model_selection import train_test_split

# Reserve 30% of the rows for testing and train on the remaining 70%;
# the fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [12]:
# Check the sizes of the train and test feature sets
X_train.shape, X_test.shape

((1209, 6), (519, 6))

In [13]:
# Check the sizes of the train and test label vectors
y_train.shape, y_test.shape

((1209,), (519,))

In [14]:
# Peek at the first few training rows produced by the split
X_train.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
1178,med,med,5more,4,big,high
585,high,high,3,more,small,low
1552,low,med,3,4,med,med
1169,med,med,5more,2,big,high
1033,med,high,4,2,big,med


In [15]:
# Check the feature dtypes before encoding
X_train.dtypes

buying      object
maint       object
doors       object
persons     object
lug_boot    object
safety      object
dtype: object

## Train model with decision tree

In [17]:
# Every feature is a string (object) column, so map each category to an
# integer code before handing the data to the tree.

import category_encoders as ce #pip install category_encoders

encoder=ce.OrdinalEncoder(cols=['buying','maint','doors','persons','lug_boot','safety'])

# Learn the category -> integer mapping from the training data only...
X_train=encoder.fit_transform(X_train)
# ...and reuse that same mapping for the test data. Calling fit_transform on
# X_test would re-fit the encoder and build a second, independent mapping, so
# the same category could receive different codes in train and test and the
# model's test predictions would not be trustworthy.
X_test=encoder.transform(X_test)


In [18]:
# Verify the feature dtypes after encoding
X_train.dtypes

buying      int32
maint       int32
doors       int32
persons     int32
lug_boot    int32
safety      int32
dtype: object

In [20]:
# Look at the first few encoded training rows
X_train.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
1178,1,1,1,1,1,1
585,2,2,2,2,2,2
1552,3,1,2,1,3,3
1169,1,1,1,3,1,1
1033,1,2,3,3,1,3


In [21]:
# Decision-tree classifier from scikit-learn
from sklearn.tree import DecisionTreeClassifier

# Build a shallow tree (depth capped at 2); random_state is fixed so the
# fitted tree is reproducible.
tree = DecisionTreeClassifier(
    random_state=0,
    max_depth=2,
)

In [22]:
# Fit the decision tree on the training features and their labels
tree.fit(X_train,y_train)

In [24]:
# Predict class labels for the training and the test features
y_train_pred_tree=tree.predict(X_train)
y_test_pred_tree=tree.predict(X_test)

# Backward-compatible alias: the test predictions were originally stored under
# this misspelled name, so keep it for any cell that still refers to it.
y_test_test_tree=y_test_pred_tree

In [25]:
# Inspect the predicted labels for the training set
y_train_pred_tree

array(['acc', 'unacc', 'unacc', ..., 'acc', 'unacc', 'unacc'],
      dtype=object)