In [91]:
import pandas as pd
import numpy as np

data = pd.read_csv('car_evaluation.csv')
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
data.rename(columns={'vhigh': 'buyprice', 'vhigh.1': 'maintainprice', '2': 'doors', '2.1': 'persons', 'small': 'luggagesize', 'low': 'safety', 'unacc':'rating'}, inplace=True)
data

Unnamed: 0,buyprice,maintainprice,doors,persons,luggagesize,safety,rating
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc
5,vhigh,vhigh,2,2,big,low,unacc
6,vhigh,vhigh,2,2,big,med,unacc
7,vhigh,vhigh,2,2,big,high,unacc
8,vhigh,vhigh,2,4,small,low,unacc
9,vhigh,vhigh,2,4,small,med,unacc


In [92]:
for column in data.columns:
    if data[column].dtype =='object':
        unique_values = data[column].unique()
        print(f"Column '{column}' has {len(unique_values)} unique values: " )
        print(unique_values)
        print("\n")
    else:
        min_value = data[column].min()
        max_value = data[column].max()
        print(f"Column '{column}' has values in range from {min_value} to {max_value}")
        print("\n")

Column 'buyprice' has 4 unique values: 
['vhigh' 'high' 'med' 'low']


Column 'maintainprice' has 4 unique values: 
['vhigh' 'high' 'med' 'low']


Column 'doors' has 4 unique values: 
['2' '3' '4' '5more']


Column 'persons' has 3 unique values: 
['2' '4' 'more']


Column 'luggagesize' has 3 unique values: 
['small' 'med' 'big']


Column 'safety' has 3 unique values: 
['med' 'high' 'low']


Column 'rating' has 4 unique values: 
['unacc' 'acc' 'vgood' 'good']




In [93]:
for column in data.columns:
    print(data[column].value_counts())
    print("\n")

high     432
med      432
low      432
vhigh    431
Name: buyprice, dtype: int64


high     432
med      432
low      432
vhigh    431
Name: maintainprice, dtype: int64


3        432
4        432
5more    432
2        431
Name: doors, dtype: int64


4       576
more    576
2       575
Name: persons, dtype: int64


med      576
big      576
small    575
Name: luggagesize, dtype: int64


med     576
high    576
low     575
Name: safety, dtype: int64


unacc    1209
acc       384
good       69
vgood      65
Name: rating, dtype: int64




In [94]:
X = data.iloc[:, :-1].values
Y = data.iloc[:, -1].values.reshape(-1, 1)

def custom_train_test_split(X, Y, test_size=0.2, random_state=None):
    if random_state is not None:
        np.random.seed(random_state)
    
    num_samples = X.shape[0]
    num_test = int(test_size * num_samples)
    
    indices = np.arange(num_samples)
    np.random.shuffle(indices)
    X = X[indices]
    Y = Y[indices]
    
    X_test = X[:num_test]
    Y_test = Y[:num_test]
    X_train = X[num_test:]
    Y_train = Y[num_test:]
    
    return X_train, X_test, Y_train, Y_test

X_train, X_test, Y_train, Y_test = custom_train_test_split(X, Y, test_size=0.2, random_state=0)

In [95]:
class DecisionTree():
    class Node():
        def __init__(self, feature_index=None, threshold=None, left=None, right=None, info_gain=None, value=None):        
            self.feature_index = feature_index
            self.threshold = threshold
            self.left = left
            self.right = right
            self.info_gain = info_gain

            self.value = value        


    def build(self, dataset, curr_depth=0):        
        X, Y = dataset[:,:-1], dataset[:,-1]
        num_samples, num_features = np.shape(X)
    
        
        best_split = {}
        max_info_gain = -float("inf")
        
        for feature_index in range(num_features):
            feature_values = dataset[:, feature_index]
            possible_thresholds = np.unique(feature_values)
            
            for threshold in possible_thresholds:
                
                dataset_left, dataset_right = self.split(dataset, feature_index, threshold)
                
                if len(dataset_left)>0 and len(dataset_right)>0:
                    y, left_y, right_y = dataset[:, -1], dataset_left[:, -1], dataset_right[:, -1]

                    weight_l = len(left_y) / len(y)
                    weight_r = len(right_y) / len(y)
                    gain = self.entropy(y) - (weight_l*self.entropy(left_y) + weight_r*self.entropy(right_y))
                    
                    if gain > max_info_gain:
                        best_split["feature"] = feature_index
                        best_split["threshold"] = threshold
                        best_split["left"] = dataset_left
                        best_split["right"] = dataset_right
                        best_split["info_gain"] = gain
                        max_info_gain = gain


        if best_split.get("info_gain", 0)>0:
            
            left_subtree = self.build(best_split["left"], curr_depth+1)
            
            right_subtree = self.build(best_split["right"], curr_depth+1)
            
            return DecisionTree.Node(best_split["feature"], best_split["threshold"], 
                        left_subtree, right_subtree, best_split["info_gain"])
        
        else:
            leaf_value = self.calculate_leaf_value(Y)
        
            return DecisionTree.Node(value=leaf_value)
    
    
    def split(self, dataset, feature_index, threshold):
        dataset_left = np.array([row for row in dataset if row[feature_index]<=threshold])
        dataset_right = np.array([row for row in dataset if row[feature_index]>threshold])
        return dataset_left, dataset_right
    
    
    def entropy(self, y):        
        class_labels = np.unique(y)
        entropy = 0
        for cls in class_labels:
            p_cls = len(y[y == cls]) / len(y)
            if p_cls > 0:
                entropy += -p_cls * np.log2(p_cls)
        return entropy

        

    def calculate_leaf_value(self, Y):
        Y = list(Y)
        return max(Y, key=Y.count)
    
    def fit(self, X, Y):        
        dataset = np.concatenate((X, Y), axis=1)
        self.root = self.build(dataset)
    
    def predict(self, X):
        
        predictions = []
        
        tree = self.root

        for x in X:
            if tree.value!=None: return tree.value
            feature_val = x[tree.feature_index]
            if feature_val<=tree.threshold:
                predictions.append(self.make_prediction(x, tree.left))
            else:
                predictions.append(self.make_prediction(x, tree.right))
        return predictions
    
    def make_prediction(self, x, tree):        
        if tree.value!=None: return tree.value
        feature_val = x[tree.feature_index]
        if feature_val<=tree.threshold:
            return self.make_prediction(x, tree.left)
        else:
            return self.make_prediction(x, tree.right)

In [96]:
def calculate_accuracy(Y_test, Y_predict):
    correct_predictions = 0
    total_samples = len(Y_test)

    for i in range(total_samples):
        if Y_test[i] == Y_predict[i]:
            correct_predictions += 1

    accuracy = correct_predictions / total_samples
    return accuracy

In [97]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix

k = 5
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

best_accuracy = -1
accuracies = []
best_fold = None

for fold_idx, (train_index, val_index) in enumerate(skf.split(X_train, Y_train)):
    X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
    Y_train_fold, Y_val_fold = Y_train[train_index], Y_train[val_index]

    d_tree = DecisionTree()
    d_tree.fit(X_train_fold, Y_train_fold)

    Y_predict = d_tree.predict(X_val_fold)

    accuracy = calculate_accuracy(Y_val_fold, Y_predict)
    accuracies.append(accuracy)

    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_fold = fold_idx

print("Best Fold Accuracy: {:.2f}%".format(best_accuracy * 100))

X_train_best_fold, Y_train_best_fold = X_train[train_index], Y_train[train_index]

d_tree_best_fold = DecisionTree()
d_tree_best_fold.fit(X_train_best_fold, Y_train_best_fold)

Y_predict_best_fold = d_tree_best_fold.predict(X_test)

cm = confusion_matrix(Y_test, Y_predict_best_fold)

print("\n\nConfusion Matrix for Best Fold:")
print(cm)


Best Fold Accuracy: 97.83%


Confusion Matrix for Best Fold:
[[ 70   3   3   0]
 [  0  12   0   2]
 [  0   0 239   0]
 [  0   1   0  15]]
