<a href="https://colab.research.google.com/github/VoKisnaHai1102/Frames-to-Fables/blob/main/240563_KrishnaAg_assgn2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


#DECISION TREES


In [2]:
import numpy as np
import pandas as pd

In [10]:
class Node():
   def __init__(self, threshold_val=None, feat_index=None, value=None, left=None, right=None, info_gain=None):
       self.value = value
       self.threshold_val = threshold_val
       self.feat_index = feat_index
       self.left = left
       self.right = right
       self.info_gain = info_gain

In [11]:
class DecisionTree:
   def __init__(self, min_samples_split=2, max_depth=2):
       self.min_samples_split = min_samples_split
       self.max_depth = max_depth
       self.root = None

   def calculate_entropy(self, labels):
       unique_vals, val_counts = np.unique(labels, return_counts=True)
       probabilities = val_counts / len(labels)
       entropy = -np.sum(probabilities * np.log2(probabilities + 1e-9))
       return entropy

   def calculate_info_gain(self, parent_labels, left_labels, right_labels):
       parent_entropy = self.calculate_entropy(parent_labels)
       left_entropy = self.calculate_entropy(left_labels)
       right_entropy = self.calculate_entropy(right_labels)

       left_ratio = len(left_labels) / len(parent_labels)
       right_ratio = len(right_labels) / len(parent_labels)

       info_gain = parent_entropy - (left_ratio * left_entropy + right_ratio * right_entropy)
       return info_gain

   def get_leaf_value(self, labels):
       unique_vals, val_counts = np.unique(labels, return_counts=True)
       return unique_vals[np.argmax(val_counts)]

   def get_best_split(self, X, labels, total_samples, total_features):
       optimal_split = {}
       max_info_gain = -1

       for feat_idx in range(total_features):
           feature_vals = set(X[:, feat_idx])
           for threshold in feature_vals:
               left_idx = [i for i in range(total_samples) if X[i, feat_idx] <= threshold]
               right_idx = [i for i in range(total_samples) if X[i, feat_idx] > threshold]

               if len(left_idx) < self.min_samples_split or len(right_idx) < self.min_samples_split:
                   continue

               left_labels = labels[left_idx]
               right_labels = labels[right_idx]
               current_info_gain = self.calculate_info_gain(labels, left_labels, right_labels)

               if current_info_gain <= 0:
                   continue

               if current_info_gain > max_info_gain:
                   max_info_gain = current_info_gain
                   optimal_split = {
                       'feat_index': feat_idx,
                       'threshold_val': threshold,
                       'left_idx': left_idx,
                       'right_idx': right_idx
                   }
       return optimal_split, max_info_gain

   def traverse_tree(self, sample, node):
       if node.value is not None:
           return node.value
       feature_val = sample[node.feat_index]
       if feature_val <= node.threshold_val:
           return self.traverse_tree(sample, node.left)
       else:
           return self.traverse_tree(sample, node.right)

   def build_tree(self, X, labels, current_depth=0):
       total_samples, total_features = X.shape
       unique_vals = set(labels)

       if total_samples >= self.min_samples_split and current_depth <= self.max_depth:
           optimal_split, max_info_gain = self.get_best_split(X, labels, total_samples, total_features)
           if max_info_gain > 0:
               left_idx = optimal_split['left_idx']
               right_idx = optimal_split['right_idx']
               left_child = self.build_tree(X[left_idx], labels[left_idx], current_depth + 1)
               right_child = self.build_tree(X[right_idx], labels[right_idx], current_depth + 1)
               return Node(
                   threshold_val=optimal_split['threshold_val'],
                   feat_index=optimal_split['feat_index'],
                   left=left_child,
                   right=right_child,
                   info_gain=max_info_gain
               )

       leaf_val = self.get_leaf_value(labels)
       return Node(value=leaf_val)

   def fit(self, X, labels):
       self.root = self.build_tree(X, labels)

   def predict(self, X):
       predictions = []
       for sample in X:
           predictions.append(self.traverse_tree(sample, self.root))
       return np.array(predictions)

In [23]:
df = pd.read_csv('AER_credit_card_data.csv')

df.head()

df['owner'] = df['owner'].map({'yes': 1, 'no': 0})
df['card'] = df['card'].map({'yes': 1, 'no': 0})
df['selfemp'] = df['selfemp'].map({'yes': 1, 'no': 0})


df.head()

Unnamed: 0,card,reports,age,income,share,expenditure,owner,selfemp,dependents,months,majorcards,active
0,1,0,37.66667,4.52,0.03327,124.9833,1,0,3,54,1,12
1,1,0,33.25,2.42,0.005217,9.854167,0,0,3,34,1,13
2,1,0,33.66667,4.5,0.004156,15.0,1,0,4,58,1,5
3,1,0,30.5,2.54,0.065214,137.8692,0,0,0,25,1,7
4,1,0,32.16667,9.7867,0.067051,546.5033,1,0,2,64,1,5


In [33]:
x = df.iloc[:, 1:].to_numpy()
y = df['card'].to_numpy()
tree = DecisionTree(min_samples_split=2, max_depth=7)
tree.fit(x, y)
predicted = tree.predict(x)
print("Predicted", predicted)
accuracy = np.mean(predicted == y)
print("Accuracy:", accuracy)

Predicted [1 1 1 ... 1 1 1]
Accuracy: 0.9916603487490523


# RANDOM TREES


In [34]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Split the data
features_train, features_test, target_train, target_test = train_test_split(x, y, test_size=0.25, random_state=123)

# Create and train the model
forest_model = RandomForestClassifier(n_estimators=150, random_state=123)
forest_model.fit(features_train, target_train)

# Make predictions and calculate accuracy
forest_pred = forest_model.predict(features_test)
forest_acc = accuracy_score(target_test, forest_pred)

print("Decision Tree Predictions:", predicted)
print("Decision Tree Accuracy:", accuracy)

print("Random Forest Predictions:", forest_pred)
print("Random Forest Accuracy:", forest_acc)

Decision Tree Predictions: [1 1 1 ... 1 1 1]
Decision Tree Accuracy: 0.9916603487490523
Random Forest Predictions: [1 0 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 0 1 0 1 1 1 1 1 0 1 1 1 0 0 1 1 1
 1 0 1 1 1 1 1 1 1 0 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 0 1 1
 1 0 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 0 1 1 0 1 1 0 0 1 1 1 1 1 1 1 1 1 1 0
 1 1 1 1 1 1 1 0 0 0 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 0 0 1 1 1 0 0 0 1 1 0 1
 1 1 0 1 1 1 1 0 1 1 1 1 0 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 0 1 1
 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 1 1 0 1 1 1 1 1 0 1 1 1 1 1 0 1 0 1
 1 1 1 1 0 1 0 1 0 1 1 1 1 1 1 1 1 0 0 1 1 0 0 1 0 0 1 0 1 1 1 0 1 0 0 0 1
 1 1 1 0 0 1 0 1 1 1 1 0 1 0 0 0 1 0 1 1 1 0 0 1 0 1 0 1 1 1 1 0 1 1 0 1 1
 1 1 0 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1]
Random Forest Accuracy: 0.9757575757575757
