# Decision Tree


In [18]:
import numpy as np
import pandas as pd

In [19]:
# Read the credit-card CSV from the working directory into a DataFrame,
# then show the first ten rows as the cell output.
data = pd.read_csv("AER_credit_card_data.csv")
data.head(n=10)

Unnamed: 0,card,reports,age,income,share,expenditure,owner,selfemp,dependents,months,majorcards,active
0,yes,0,37.66667,4.52,0.03327,124.9833,yes,no,3,54,1,12
1,yes,0,33.25,2.42,0.005217,9.854167,no,no,3,34,1,13
2,yes,0,33.66667,4.5,0.004156,15.0,yes,no,4,58,1,5
3,yes,0,30.5,2.54,0.065214,137.8692,no,no,0,25,1,7
4,yes,0,32.16667,9.7867,0.067051,546.5033,yes,no,2,64,1,5
5,yes,0,23.25,2.5,0.044438,91.99667,no,no,0,54,1,1
6,yes,0,27.91667,3.96,0.012576,40.83333,no,no,2,7,1,5
7,yes,0,29.16667,2.37,0.076434,150.79,yes,no,0,77,1,3
8,yes,0,37.0,3.8,0.245628,777.8217,yes,no,0,97,1,6
9,yes,0,28.41667,3.2,0.01978,52.58,no,no,0,65,1,18


In [20]:
class Node():
  """A single node of the decision tree.

  A decision node carries ``feature_index``, ``threshold``, the two children
  (``left``/``right``) and the ``info_gain`` of its split. A leaf node carries
  only ``value``. Every field defaults to ``None``.
  """

  def __init__(self, feature_index=None, threshold=None, left=None, right=None, info_gain=None, value=None):
    # Fields describing the split performed at a decision node.
    self.feature_index, self.threshold = feature_index, threshold
    # Child nodes for the two sides of the split.
    self.left, self.right = left, right
    self.info_gain = info_gain
    # Predicted label; only set on leaf nodes.
    self.value = value

In [21]:
class DecisionTreeClassifier():
  """Binary decision tree classifier built by greedy threshold splits.

  Each decision node tests ``x[feature_index] <= threshold``; rows that pass
  go to the left child, the rest to the right child. Splits are chosen by
  maximising the Gini-based information gain.
  """

  def __init__(self, min_samples_split=2, max_depth=6):
    """Store the stopping criteria; the tree itself is built by ``fit``.

    Args:
      min_samples_split: a node with fewer samples than this becomes a leaf.
      max_depth: a split is attempted while ``curr_depth <= max_depth``
        (the root is depth 0), so decision nodes can exist at depths
        ``0..max_depth`` inclusive.
    """
    self.root = None
    self.min_samples_split = min_samples_split
    self.max_depth = max_depth

  def build_tree(self, dataset, curr_depth=0):
    """Recursively build the (sub)tree for ``dataset`` and return its root.

    Args:
      dataset: 2-D array whose last column holds the labels.
      curr_depth: depth of the node being built (root is 0).

    Returns:
      A decision ``Node`` when a split with positive gain exists and the
      stopping criteria allow it, otherwise a leaf ``Node``.
    """
    X, Y = dataset[:, :-1], dataset[:, -1]
    num_samples, num_features = np.shape(X)

    if num_samples >= self.min_samples_split and curr_depth <= self.max_depth:
      best_split = self.get_best_split(dataset, num_samples, num_features)
      # get_best_split returns {} when no threshold separates the rows
      # (e.g. all feature rows identical); treat that as "no gain" -> leaf.
      if best_split.get("info_gain", 0) > 0:
        left_subtree = self.build_tree(best_split["dataset_left"], curr_depth+1)
        right_subtree = self.build_tree(best_split["dataset_right"], curr_depth+1)
        return Node(best_split["feature_index"], best_split["threshold"], left_subtree, right_subtree, best_split["info_gain"])

    leaf_value = self.calculate_leaf_value(Y)
    return Node(value=leaf_value)

  def get_best_split(self, dataset, num_samples, num_features):
    """Search every feature/threshold pair for the highest Gini gain.

    Args:
      dataset: 2-D array whose last column holds the labels.
      num_samples: number of rows (unused by the search itself).
      num_features: number of feature columns to scan.

    Returns:
      A dict with keys ``feature_index``, ``threshold``, ``dataset_left``,
      ``dataset_right`` and ``info_gain`` for the best split, or an empty
      dict when no threshold yields two non-empty sides.
    """
    best_split = {}
    max_info_gain = -float("inf")
    y = dataset[:, -1]  # parent labels are the same for every candidate

    for feature_index in range(num_features):
      feature_values = dataset[:, feature_index]
      possible_thresholds = np.unique(feature_values)
      for threshold in possible_thresholds:
        dataset_left, dataset_right = self.split(dataset, feature_index, threshold)
        if len(dataset_left) > 0 and len(dataset_right) > 0:
          left_y, right_y = dataset_left[:, -1], dataset_right[:, -1]
          curr_info_gain = self.information_gain(y, left_y, right_y, "gini")
          # Strict '>' keeps the first candidate on ties.
          if curr_info_gain > max_info_gain:
            best_split["feature_index"] = feature_index
            best_split["threshold"] = threshold
            best_split["dataset_left"] = dataset_left
            best_split["dataset_right"] = dataset_right
            best_split["info_gain"] = curr_info_gain
            max_info_gain = curr_info_gain

    return best_split

  def split(self, dataset, feature_index, threshold):
    """Partition rows by ``row[feature_index] <= threshold``.

    Returns:
      ``(dataset_left, dataset_right)``: rows with value ``<= threshold`` and
      rows with value ``> threshold``. Both keep the column count of
      ``dataset`` even when empty.
    """
    column = dataset[:, feature_index]
    # Two explicit masks (not a negation) so values that compare False both
    # ways, such as NaN, are left out of both sides as before.
    dataset_left = dataset[column <= threshold]
    dataset_right = dataset[column > threshold]
    return dataset_left, dataset_right

  def information_gain(self, parent, l_child, r_child, mode="entropy"):
    """Impurity of ``parent`` minus the size-weighted impurity of its children.

    Args:
      parent, l_child, r_child: label arrays.
      mode: ``"gini"`` uses the Gini index; any other value uses entropy.
    """
    weight_l = len(l_child) / len(parent)
    weight_r = len(r_child) / len(parent)
    if mode == "gini":
      gain = self.gini_index(parent) - (weight_l*self.gini_index(l_child) + weight_r*self.gini_index(r_child))
    else:
      gain = self.entropy(parent) - (weight_l*self.entropy(l_child) + weight_r*self.entropy(r_child))
    return gain

  def entropy(self, y):
    """Shannon entropy (base 2) of the label array ``y``."""
    _, counts = np.unique(y, return_counts=True)
    probs = counts / len(y)
    return -np.sum(probs * np.log2(probs))

  def gini_index(self, y):
    """Gini impurity of the label array ``y``: ``1 - sum(p_cls ** 2)``."""
    _, counts = np.unique(y, return_counts=True)
    probs = counts / len(y)
    return 1 - np.sum(probs ** 2)

  def calculate_leaf_value(self, Y):
    """Return the most frequent label in ``Y``.

    Ties go to the label that appears first in ``Y``.

    Raises:
      ValueError: if ``Y`` is empty.
    """
    from collections import Counter

    labels = list(Y)
    if not labels:
      raise ValueError("cannot compute a leaf value from an empty label set")
    counts = Counter(labels)
    # Counter keeps first-seen order, and max() returns the first maximum,
    # so ties resolve to the earliest label in Y.
    return max(counts, key=counts.get)

  def print_tree(self, tree=None, indent=" "):
    """Print the tree rooted at ``tree`` (defaults to ``self.root``).

    Decision nodes print as ``X_<feature> <= <threshold> ? <info_gain>``
    followed by ``left:`` and ``right:`` subtrees; leaves print their value.
    The indent string is doubled at every level.
    """
    if tree is None:
      tree = self.root

    if tree.value is not None:
      print(tree.value)

    else:
      print("X_"+str(tree.feature_index), "<=", tree.threshold, "?", tree.info_gain)
      print("%sleft:" % (indent), end="")
      self.print_tree(tree.left, indent + indent)
      # Same layout as the left branch: the subtree continues on this line.
      print("%sright:" % (indent), end="")
      self.print_tree(tree.right, indent + indent)

  def fit(self, X, Y):
    """Build the tree from features ``X`` and labels ``Y``.

    Args:
      X: 2-D array-like of shape (n_samples, n_features).
      Y: labels, either 2-D of shape (n_samples, 1) or 1-D of length
        n_samples.
    """
    X = np.asarray(X)
    Y = np.asarray(Y)
    if Y.ndim == 1:
      # Make the labels a column so they can be appended to X.
      Y = Y.reshape(-1, 1)
    dataset = np.concatenate((X, Y), axis=1)
    self.root = self.build_tree(dataset)

  def predict(self, X):
    """Return a list with one predicted label per row of ``X``."""
    predictions = [self.make_prediction(x, self.root) for x in X]
    return predictions

  def make_prediction(self, x, tree):
    """Walk ``tree`` for the single sample ``x`` and return the leaf value."""
    if tree.value is not None:
      return tree.value
    feature_value = x[tree.feature_index]
    if feature_value <= tree.threshold:
      return self.make_prediction(x, tree.left)
    else:
      return self.make_prediction(x, tree.right)


In [22]:
from sklearn.model_selection import train_test_split

# Features: every column except the last. Labels: the last column, reshaped
# to a single column so it can be concatenated with X along axis=1 in fit().
# NOTE(review): in the preview above the last column is `active`, and `card`
# remains among the features; the Random Forest section below predicts
# `card` instead. Confirm which target is intended here.
X = data.iloc[:, :-1].to_numpy()
Y = data.iloc[:, -1].to_numpy().reshape(-1, 1)

# Hold out 20% of the rows for testing; fixed seed for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=41
)

In [23]:
# Train the hand-written tree on the training split and print its structure.
classifier = DecisionTreeClassifier(max_depth=3, min_samples_split=3)
classifier.fit(X_train, Y_train)
classifier.print_tree()

X_9 <= 20 ? 0.013375227083887609
 left:X_1 <= 0 ? 0.018059303120154246
  left:X_4 <= 0.001 ? 0.026165762022288264
    left:X_2 <= 22.33333 ? 0.04039373325087614
        left:0
        right:
0
    right:
X_3 <= 2.25 ? 0.00795491752828803
        left:0
        right:
0
  right:
X_1 <= 1 ? 0.03657188849505466
    left:X_3 <= 3.305 ? 0.05545454545454542
        left:2
        right:
4
    right:
X_2 <= 26.41667 ? 0.0614051669817266
        left:7
        right:
9
 right:
X_0 <= no ? 0.006419247057657396
  left:X_1 <= 0 ? 0.06061456510224117
    left:X_9 <= 30 ? 0.02865962618596618
        left:0
        right:
0
    right:
X_9 <= 29 ? 0.022257836054958458
        left:5
        right:
7
  right:
X_3 <= 3.3280000000000003 ? 0.005215286902997773
    left:X_4 <= 0.3213480999999999 ? 0.007686983944647285
        left:3
        right:
2
    right:
X_3 <= 3.4 ? 0.009711756491079582
        left:4
        right:
7


# Random Forest


In [24]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# 1. Load the data and drop rows with missing values
df = pd.read_csv("AER_credit_card_data.csv").dropna()

# Encode the yes/no columns as 1/0
df = df.assign(**{
    column: df[column].map({'yes': 1, 'no': 0})
    for column in ('card', 'owner', 'selfemp')
})

# 2. Feature matrix (everything but `card`) and target (`card`)
X = df.drop(columns='card')
y = df['card']

# 3. Train-test split: 80% train / 20% test, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 4. Train the Random Forest model
rf = RandomForestClassifier(n_estimators=100, max_depth=6, random_state=42)
rf.fit(X_train, y_train)

# 5. Predict on the held-out rows and report the metrics
y_pred = rf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9772727272727273

Classification Report:
               precision    recall  f1-score   support

           0       0.91      1.00      0.95        62
           1       1.00      0.97      0.98       202

    accuracy                           0.98       264
   macro avg       0.96      0.99      0.97       264
weighted avg       0.98      0.98      0.98       264

