# Decision Tree

In [1]:
import pandas as pd 
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

## Loading Dataset

In [3]:
dataset = pd.read_csv("../../../Datasets/car_evaluation.txt", sep=",")
dataset.columns = ['buying', 'maintenance', 'doors', 'persons', 'lug_boot', 'safety', 'target']
dataset.head()

Unnamed: 0,buying,maintenance,doors,persons,lug_boot,safety,target
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc


In [4]:
samples_count, features_count = dataset.shape
samples_count, features_count

(1727, 7)

## Separating target column from dataset

In [5]:
target = dataset['target']
dataset = dataset.drop('target',axis=1)

## One-Hot Encoding of Dataset & Target column

### Dataset

In [8]:
ohe_dataset = preprocessing.OneHotEncoder()
ohe_dataset.fit(dataset.values) 
dataset_encoded = ohe_dataset.transform(dataset.values).toarray()

### Target Column

In [9]:
ohe_target = preprocessing.OneHotEncoder()
target_2D = target.values.reshape(-1,1)
ohe_target.fit(target_2D)
target_encoded = ohe_target.transform(target_2D).toarray()

## Evaluate Decision Tree using Cross Validation
* __Cross validation is an alternative approach of train/test split. It just used for model performance evaluation
  and not for model building__
* __For classification problems, stratified k-fold cross validation is recommended__
* __Stratified cross validation makes sure that all blocks of data should consist of all representations of data__
* __cv=10(no of folds/blocks) is recommended__
* __if cv=integer and estimator is classifier algorithm then StratifiedKFold is used by cross_val_score as default__

In [10]:
x = dataset_encoded
y = target_encoded
tree = DecisionTreeClassifier()
scores = cross_val_score(tree, x, y, cv=5)
print("%.2f" % scores.mean())

0.76


## Train Decision Tree

In [11]:
tree.fit(x,y)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

## Prediction by using input data from user

In [12]:
buying = input("Enter buying category(vhigh,high,med,low)::")
maintenance = input("Enter buying maintenance(vhigh,high,med,low)::")
doors = input("Enter no of doors(2,3,4,5more)::")
persons = input("Enter no of persons(2,4,more)::")
lug_boot = input("Enter lug_boot category(small,med,big)::")
safety = input("Enter safety category(low,med,high)::")

Enter buying category(vhigh,high,med,low)::vhigh
Enter buying maintenance(vhigh,high,med,low)::low
Enter no of doors(2,3,4,5more)::3
Enter no of persons(2,4,more)::4
Enter lug_boot category(small,med,big)::small
Enter safety category(low,med,high)::low


In [13]:
user_sample = [[buying, maintenance, doors, persons, lug_boot, safety]]
user_sample_encoded = ohe_dataset.transform(user_sample).toarray()
result_encoded = tree.predict(user_sample_encoded)
result_decoded = ohe_target.inverse_transform(result_encoded)
result = result_decoded[0]

In [15]:
if result == 'unacc':
    print("Your car is unaccurate")
elif result == 'acc':
    print("Your car is accurate")
elif result == 'good':
    print("Your car is good")
else:
    print("Your car is very good")

Your car is unaccurate
