In [1]:
import numpy as np
import pandas as pd
# import category encoders
import category_encoders as ce
# Libraries for ML
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

In [2]:
# Column names for car.data; they are supplied here rather than read from the file.
df_header = [
    'buying', 'maint', 'doors', 'persons',
    'lug_boot', 'safety', 'class_values',
]
df = pd.read_csv('car.data', header=None, names=df_header)
df.head(10)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class_values
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
5,vhigh,vhigh,2,2,med,high,unacc
6,vhigh,vhigh,2,2,big,low,unacc
7,vhigh,vhigh,2,2,big,med,unacc
8,vhigh,vhigh,2,2,big,high,unacc
9,vhigh,vhigh,2,4,small,low,unacc


In [3]:
# Count missing values per column (a result of all zeros means nothing is missing).
df.isna().sum()

buying          0
maint           0
doors           0
persons         0
lug_boot        0
safety          0
class_values    0
dtype: int64

In [4]:
# Print how often each category occurs, one column at a time.
for column_name in df_header:
    category_counts = df[column_name].value_counts()
    print(category_counts)

vhigh    432
high     432
med      432
low      432
Name: buying, dtype: int64
vhigh    432
high     432
med      432
low      432
Name: maint, dtype: int64
2        432
3        432
4        432
5more    432
Name: doors, dtype: int64
2       576
4       576
more    576
Name: persons, dtype: int64
small    576
med      576
big      576
Name: lug_boot, dtype: int64
low     576
med     576
high    576
Name: safety, dtype: int64
unacc    1210
acc       384
good       69
vgood      65
Name: class_values, dtype: int64


In [5]:
# Target is the buying-price category; features are every remaining column
# except 'persons', which is dropped together with the target.
y = df['buying']
X = df.drop(columns=['persons', 'buying'])

# Hold out 40% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=0,
)

X_train.shape, X_test.shape

((1036, 5), (692, 5))

In [6]:
# Inspect the training features before encoding (values are still category labels).
X_train

Unnamed: 0,maint,doors,lug_boot,safety,class_values
505,vhigh,4,small,med,unacc
987,high,2,big,low,unacc
1532,med,2,small,high,unacc
1521,med,2,small,low,unacc
64,vhigh,4,small,med,unacc
...,...,...,...,...,...
835,low,4,big,med,acc
1216,low,3,small,med,unacc
1653,low,3,big,low,unacc
559,high,2,small,med,unacc


In [7]:
# Feature engineering: replace each category label with an integer code.
categorical_columns = ['maint', 'doors', 'lug_boot', 'safety', 'class_values']
encoder = ce.OrdinalEncoder(cols=categorical_columns)

# Learn the label -> code mapping from the training split only, then apply the
# same mapping to the test split.
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

In [8]:
# Inspect the training features after encoding (every column is now an integer code).
X_train

Unnamed: 0,maint,doors,lug_boot,safety,class_values
505,1,1,1,1,1
987,2,2,2,2,1
1532,3,2,1,3,1
1521,3,2,1,2,1
64,1,1,1,1,1
...,...,...,...,...,...
835,4,1,2,1,3
1216,4,4,1,1,1
1653,4,4,2,2,1
559,2,2,1,1,1


In [9]:
# Train a shallow decision tree that splits on the Gini impurity criterion.
tree_params = {
    "criterion": "gini",
    "max_depth": 3,
    "random_state": 42,
}
model = DecisionTreeClassifier(**tree_params)
model.fit(X_train, y_train)

In [10]:
# Predict the buying-price class for every row of the held-out test set.
y_pred = model.predict(X_test)

In [11]:
# Evaluate: fraction of test rows whose predicted class equals the true class.
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.3078034682080925


In [12]:
# Car whose buying price we want to predict, described with the raw category
# labels used in car.data:
#   Maintenance: high | Doors: 4 | Lug boot: big | Safety: high | Class: good
#
# The sample is deliberately NOT written as integer codes. The fitted encoder
# decides which integer each label gets, and hand-written codes silently
# describe a different car when they disagree with it (in the encoded X_train
# shown above, for example, safety 'low' is 2 and doors '4' is 1).
# Column order matches the columns the model was trained on.
new_data = pd.DataFrame(
    [['high', '4', 'big', 'high', 'good']],
    columns=['maint', 'doors', 'lug_boot', 'safety', 'class_values'],
)

# Apply the same label -> code mapping that was learned from the training data.
new_data = encoder.transform(new_data)

# Predict the buying-price class for the new data point.
new_data_pred = model.predict(new_data)
print(new_data_pred)


['low']


