# USING DECISION TREES 
~ Rohan Singhal 240880

In [60]:
import numpy as np

In [61]:
class Node():
    def __init__(self,threshold= None, feature_index= None, value= None,left= None,right= None,info_gain=None):
        self.value= value

        self.threshold = threshold
        self.feature_index= feature_index
        self.left= left
        self.right= right
        self.info_gain = info_gain
    
    

In [None]:
class DecisionTree():
    def __init__(self, min_samples_split=2, max_depth=2):
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.root = None
    
    def build_tree(self, X, y, depth =0):

        num_samples, num_features = X.shape
        unique_classes = set(y)


        if num_samples>=self.min_samples_split and depth<=self.max_depth:
            best_split, best_info_gain = self.get_best_split(X, y, num_samples, num_features)
            if best_info_gain > 0:
                left_indices = best_split['left_indices']
                right_indices = best_split['right_indices']
                left_subtree = self.build_tree(X[left_indices], y[left_indices], depth + 1)
                right_subtree = self.build_tree(X[right_indices], y[right_indices], depth + 1)
                return Node(
                    threshold=best_split['threshold'],
                    feature_index=best_split['feature_index'],
                    left=left_subtree,
                    right=right_subtree,
                    info_gain=best_info_gain
                )
        
        leaf_value = self.get_leaf_value(y)
        return Node(value=leaf_value)
    def fit(self, X, y):
        self.root = self.build_tree(X, y)
    def predict(self, X):
        predictions = []
        for sample in X:
            predictions.append(self.traverse_tree(sample, self.root))
        return np.array(predictions)
    def traverse_tree(self, sample, node):
        if node.value is not None:
            return node.value
        feature_value = sample[node.feature_index]
        if feature_value <= node.threshold:
            return self.traverse_tree(sample, node.left)
        else:
            return self.traverse_tree(sample, node.right)
    


    def get_best_split(self,X,y, num_samples, num_features):
        best_split = {}
        best_info_gain = -1
        for feature_index in range(num_features):
            feature_values = set(X[:, feature_index])
            for value in feature_values:
                left_indices = [i for i in range(num_samples) if X[i, feature_index] <= value]
                right_indices = [i for i in range(num_samples) if X[i, feature_index] > value]

                if len(left_indices) < self.min_samples_split or len(right_indices) < self.min_samples_split:
                    continue
                left_y = y[left_indices]
                right_y = y[right_indices]
                info_gain = self.calculate_info_gain(y, left_y, right_y)
                if info_gain <= 0:
                    continue
                if info_gain > best_info_gain:
                    best_info_gain = info_gain
                    best_split = {
                        'feature_index': feature_index,
                        'threshold': value,
                        'left_indices': left_indices,
                        'right_indices': right_indices
                    }
        return best_split, best_info_gain
    def calculate_info_gain(self, parent_y, left_y, right_y):
        parent_entropy = self.calculate_entropy(parent_y)
        left_entropy = self.calculate_entropy(left_y)
        right_entropy = self.calculate_entropy(right_y)

        left_weight = len(left_y) / len(parent_y)
        right_weight = len(right_y) / len(parent_y)

        info_gain = parent_entropy - (left_weight * left_entropy + right_weight * right_entropy)
        return info_gain
    def calculate_entropy(self, y):
        classes, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        entropy = -np.sum(probabilities * np.log2(probabilities + 1e-9))
        return entropy
    def get_leaf_value(self, y):
        classes, counts = np.unique(y, return_counts=True)
        return classes[np.argmax(counts)]


        
           

        

In [63]:
import pandas as pd


In [64]:
df = pd.read_csv('dataset/AER_credit_card_data.csv')

df.head()

Unnamed: 0,card,reports,age,income,share,expenditure,owner,selfemp,dependents,months,majorcards,active
0,yes,0,37.66667,4.52,0.03327,124.9833,yes,no,3,54,1,12
1,yes,0,33.25,2.42,0.005217,9.854167,no,no,3,34,1,13
2,yes,0,33.66667,4.5,0.004156,15.0,yes,no,4,58,1,5
3,yes,0,30.5,2.54,0.065214,137.8692,no,no,0,25,1,7
4,yes,0,32.16667,9.7867,0.067051,546.5033,yes,no,2,64,1,5


In [65]:

df['owner'] = df['owner'].map({'yes': 1, 'no': 0})
df['card'] = df['card'].map({'yes': 1, 'no': 0})
df['selfemp'] = df['selfemp'].map({'yes': 1, 'no': 0})

# X = df.iloc[:, 1:].to_numpy()
# y = df['card'].to_numpy()
# X[0]
df.head()

Unnamed: 0,card,reports,age,income,share,expenditure,owner,selfemp,dependents,months,majorcards,active
0,1,0,37.66667,4.52,0.03327,124.9833,1,0,3,54,1,12
1,1,0,33.25,2.42,0.005217,9.854167,0,0,3,34,1,13
2,1,0,33.66667,4.5,0.004156,15.0,1,0,4,58,1,5
3,1,0,30.5,2.54,0.065214,137.8692,0,0,0,25,1,7
4,1,0,32.16667,9.7867,0.067051,546.5033,1,0,2,64,1,5


In [66]:
x = df.iloc[:, 1:].to_numpy()
y = df['card'].to_numpy()


In [67]:
tree = DecisionTree(min_samples_split=2, max_depth=5)
tree.fit(x, y)
predictions = tree.predict(x)
print("Predictions:", predictions)
accuracy = np.mean(predictions == y)
print("Accuracy:", accuracy)

Predictions: [1 1 1 ... 1 1 1]
Accuracy: 0.9863532979529946


# USING RANDOM FOREST

In [68]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score

In [None]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
rf_predictions = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)
print("Random Forest Predictions:", rf_predictions)
print("Random Forest Accuracy:", rf_accuracy)
   
print("Decision Tree Predictions:", predictions)
print("Decision Tree Accuracy:", accuracy)


Random Forest Predictions: [1 1 1 1 1 1 1 1 1 0 1 0 0 1 1 1 0 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 0
 1 0 1 1 1 0 0 1 0 1 1 1 0 1 0 1 1 0 1 1 0 1 1 0 0 1 1 0 1 0 0 1 0 1 1 1 1
 1 0 0 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 0 0 1 0 1 1 1
 0 1 1 0 1 0 0 0 1 1 1 1 1 0 0 0 1 1 0 1 0 1 1 1 0 1 1 0 0 1 1 1 1 1 1 1 1
 1 0 1 1 0 0 1 1 1 1 1 1 0 1 1 0 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 0 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 0 0 1 1 1 0
 1 0 1 0 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1
 1 1 1 0 0]
Random Forest Accuracy: 0.9772727272727273
Decision Tree Predictions: [1 1 1 ... 1 1 1]
Decision Tree Accuracy: 0.9863532979529946


Decision tree gave more accuracy in coparision to random forest