# Pre-Processing

In [260]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [261]:
# Column names assigned to the CSV; the file's own first row is skipped below
# and replaced by these names.
col_names = [
    'card',
    'reports',
    'age',
    'income',
    'share',
    'expenditure',
    'owner',
    'selfempl',
    'dependents',
    'months',
    'majorcards',
    'active',
]

# skiprows=1 drops the first line of the file; header=None + names=col_names
# then labels the remaining rows with the list above.
data = pd.read_csv(
    "AER_credit_card_data.csv",
    names=col_names,
    header=None,
    skiprows=1,
)


In [262]:
# Encode the three yes/no columns as 1/0 in a single pass.
# Series.map turns any value that is not a key of the dict into NaN, so this
# cell must run exactly once on freshly loaded data: re-running it on the
# already-encoded columns would replace every value with NaN.
data[['card', 'owner', 'selfempl']] = data[['card', 'owner', 'selfempl']].apply(
    lambda column: column.map({'yes': 1, 'no': 0})
)

In [263]:
# Standardize the numeric feature columns with StandardScaler; the binary
# columns 'owner' and 'selfempl' and the target 'card' are left untouched.
# The scaled values overwrite the original columns of `data` in place, so the
# split thresholds printed by the tree later are in standardized units.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split performed further down, so the held-out rows contribute to
# the fitted statistics. Threshold-based trees are insensitive to this
# monotonic rescaling, but fit the scaler on the training rows only if a
# scale-sensitive model is ever added.
cols_to_scale = ['reports', 'age', 'income', 'share', 'expenditure','dependents','months', 'majorcards', 'active']
scaler = StandardScaler()
data[cols_to_scale] = scaler.fit_transform(data[cols_to_scale])

# Decision Tree

In [264]:
class Node:
    """A single node of the decision tree.

    A decision node stores the split (``feature_index``, ``threshold``), its
    two children (``left``, ``right``) and the gain achieved by the split
    (``info_gain``). A leaf node stores only ``value``, the predicted label;
    all fields that are not supplied default to ``None``.
    """

    def __init__(self, feature_index=None, threshold=None, left=None, right=None, info_gain=None, value=None):
        # Fields describing a decision (internal) node.
        self.feature_index, self.threshold = feature_index, threshold
        self.left, self.right = left, right
        self.info_gain = info_gain

        # Field describing a leaf node.
        self.value = value

In [265]:
class DecisionTreeClassifier():
    """Decision tree classifier grown by greedy, exhaustive threshold search.

    Every candidate split has the form ``feature <= threshold``. At each node
    the split with the largest impurity reduction is kept; ``get_best_split``
    measures that reduction with the Gini index.

    Parameters
    ----------
    min_samples_split : int
        A node holding fewer rows than this is turned into a leaf.
    max_depth : int
        Splitting is attempted while ``curr_depth <= max_depth`` with the root
        at depth 0, so a root-to-leaf path can contain up to
        ``max_depth + 1`` decision nodes.

    Attributes
    ----------
    root : Node or None
        Root of the fitted tree; ``None`` until ``fit`` has been called.
    """

    def __init__(self, min_samples_split=2, max_depth=2):
        self.root = None
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth

    def build_tree(self, dataset, curr_depth=0):
        """Recursively grow the subtree for ``dataset`` and return its root.

        ``dataset`` is a 2-D array whose last column holds the labels and
        whose other columns hold the features. A leaf carrying the majority
        label is returned when the node is too small, too deep, cannot be
        split at all, or when the best split does not reduce impurity.
        """
        X = dataset[:, :-1]
        Y = dataset[:, -1]
        num_samples, num_features = X.shape

        if num_samples >= self.min_samples_split and curr_depth <= self.max_depth:
            best_split = self.get_best_split(dataset, num_samples, num_features)
            # get_best_split returns an empty dict when no threshold separates
            # the rows (e.g. all feature values identical); treat that as
            # "no gain" instead of failing on the missing key.
            if best_split.get("info_gain", 0) > 0:
                left_subtree = self.build_tree(best_split["dataset_left"], curr_depth + 1)
                right_subtree = self.build_tree(best_split["dataset_right"], curr_depth + 1)
                return Node(best_split["feature_index"], best_split["threshold"],
                            left_subtree, right_subtree, best_split["info_gain"])

        leaf_value = self.calculate_leaf_value(Y)
        return Node(value=leaf_value)

    def get_best_split(self, dataset, num_samples, num_features):
        """Search every feature and every distinct value for the best split.

        Returns a dict with the keys ``feature_index``, ``threshold``,
        ``dataset_left``, ``dataset_right`` and ``info_gain`` describing the
        split with the highest Gini gain, or an empty dict when no candidate
        leaves rows on both sides. Ties keep the first candidate found.
        """
        best_split = {}
        max_info_gain = -float("inf")
        # The parent labels do not depend on the candidate split.
        y = dataset[:, -1]

        for feature_index in range(num_features):
            feature_values = dataset[:, feature_index]
            possible_thresholds = np.unique(feature_values)

            for threshold in possible_thresholds:
                dataset_left, dataset_right = self.split(dataset, feature_index, threshold)
                # Only splits that put at least one row on each side count.
                if len(dataset_left) > 0 and len(dataset_right) > 0:
                    left_y, right_y = dataset_left[:, -1], dataset_right[:, -1]
                    curr_info_gain = self.information_gain(y, left_y, right_y, "gini")
                    if curr_info_gain > max_info_gain:
                        best_split = {
                            "feature_index": feature_index,
                            "threshold": threshold,
                            "dataset_left": dataset_left,
                            "dataset_right": dataset_right,
                            "info_gain": curr_info_gain
                        }
                        max_info_gain = curr_info_gain

        return best_split

    def split(self, dataset, feature_index, threshold):
        """Partition the rows of ``dataset`` on ``feature <= threshold``.

        Returns ``(left, right)``: rows whose feature value is ``<=`` the
        threshold and rows whose value is ``>`` it, in their original order.
        """
        feature_values = dataset[:, feature_index]
        # Two explicit comparisons (rather than negating the first mask) so a
        # row whose feature value is NaN lands in neither half, exactly as a
        # row-by-row filter with <= and > would behave.
        left = dataset[feature_values <= threshold]
        right = dataset[feature_values > threshold]
        return left, right

    def information_gain(self, parent, l_child, r_child, mode="entropy"):
        """Impurity of ``parent`` minus the size-weighted impurity of the children.

        ``mode="gini"`` uses the Gini index; any other value uses entropy.
        """
        weight_l = len(l_child) / len(parent)
        weight_r = len(r_child) / len(parent)
        if mode == "gini":
            gain = self.gini_index(parent) - (weight_l * self.gini_index(l_child) + weight_r * self.gini_index(r_child))
        else:
            gain = self.entropy(parent) - (weight_l * self.entropy(l_child) + weight_r * self.entropy(r_child))
        return gain

    def entropy(self, y):
        """Shannon entropy (base 2) of the label distribution in ``y``."""
        class_labels, counts = np.unique(y, return_counts=True)
        # Only labels that actually occur are counted, so no probability is 0
        # and log2 is always defined.
        probabilities = counts / counts.sum()
        return -np.sum(probabilities * np.log2(probabilities))

    def gini_index(self, y):
        """Gini impurity of the label distribution in ``y``."""
        class_labels, counts = np.unique(y, return_counts=True)
        probabilities = counts / counts.sum()
        return 1 - np.sum(probabilities**2)

    def calculate_leaf_value(self, Y):
        """Return the most frequent label in ``Y``.

        ``np.unique`` returns the labels sorted and ``argmax`` picks the first
        maximum, so ties resolve to the smallest label.
        """
        values, counts = np.unique(Y, return_counts=True)
        return values[np.argmax(counts)]

    def fit(self, X, Y):
        """Grow the tree from features ``X`` and labels ``Y``.

        ``X`` is a 2-D array of shape (n_samples, n_features). ``Y`` may be a
        column of shape (n_samples, 1) or a flat vector of shape
        (n_samples,); a flat vector is reshaped into a column before being
        appended to ``X``.
        """
        Y = np.array(Y)
        if Y.ndim == 1:
            # Accept flat label vectors as well as (n, 1) columns.
            Y = Y.reshape(-1, 1)
        dataset = np.concatenate((X, Y), axis=1)
        self.root = self.build_tree(dataset)

    def predict(self, X):
        """Return a list with one predicted label per row of ``X``."""
        return [self.make_prediction(x, self.root) for x in X]

    def make_prediction(self, x, tree):
        """Walk ``tree`` for the single sample ``x`` and return the leaf label."""
        # A node with a value is a leaf.
        if tree.value is not None:
            return tree.value
        feature_val = x[tree.feature_index]
        if feature_val <= tree.threshold:
            return self.make_prediction(x, tree.left)
        else:
            return self.make_prediction(x, tree.right)

    def print_tree(self, tree=None, indent=" "):
        """Print the tree rooted at ``tree`` (default: the fitted root).

        Decision nodes are printed as ``X_<feature> <= <threshold> ? <gain>``
        and leaves as their label; children are indented two more spaces per
        level.
        """
        if tree is None:
            tree = self.root
        if tree.value is not None:
            print(tree.value)
        else:
            print("X_" + str(tree.feature_index), "<=", tree.threshold, "?", tree.info_gain)
            print(indent + "Left:", end=" ")
            self.print_tree(tree.left, indent + "  ")
            print(indent + "Right:", end=" ")
            self.print_tree(tree.right, indent + "  ")

In [266]:
# Feature matrix: every column except the target.
X = data.drop(columns='card').to_numpy()
# Target kept as an (n, 1) column, the shape the custom tree's fit() concatenates onto X.
Y = data['card'].to_numpy().reshape(-1, 1)

# Hold out 20% of the rows for evaluation; the seed fixes the partition.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=41,
    test_size=0.2,
)


In [267]:
# Train the hand-written tree on the training split, then print its structure.
clf = DecisionTreeClassifier(
    max_depth=3,
    min_samples_split=3,
)
clf.fit(X=X_train, Y=Y_train)
clf.print_tree()

X_4 <= -0.680067582792967 ? 0.33149008940208585
 Left: X_0 <= -0.3393968014785687 ? 0.0034772152880162416
   Left: X_10 <= 0.3177691062974968 ? 0.012424375031437257
     Left: X_1 <= -0.8100556113436747 ? 0.0059838366420871675
       Left: 0.0
       Right: 0.0
     Right: X_5 <= 0.0 ? 0.1519097222222222
       Left: 1.0
       Right: 0.0
   Right: X_6 <= 0.0 ? 0.0019750071818443616
     Left: 0.0
     Right: X_1 <= 1.4830840679064266 ? 0.21875
       Left: 0.0
       Right: 1.0
 Right: 1.0


In [268]:
# predict() returns a plain list; convert it to an (n, 1) column like Y_test.
Y_pred = np.array(clf.predict(X_test))
Y_pred = Y_pred.reshape(-1, 1)

print("Accuracy:", accuracy_score(y_true=Y_test, y_pred=Y_pred))

Accuracy: 0.9659090909090909


In [269]:
from sklearn.metrics import confusion_matrix, classification_report

# The tree predicts float labels (0.0 / 1.0); cast them to integers so they
# line up with the integer class labels in Y_test.
Y_pred = np.array(Y_pred, dtype=int)
Y_true = Y_test.flatten()

# Rows of the confusion matrix are true classes, columns are predicted classes.
cm = confusion_matrix(Y_true, Y_pred)
print("Confusion Matrix:", cm, sep="\n")

cr = classification_report(
    Y_true,
    Y_pred,
    target_names=["Rejected (0)", "Accepted (1)"],
)
print("\nClassification Report:", cr, sep="\n")

Confusion Matrix:
[[ 49   2]
 [  7 206]]

Classification Report:
              precision    recall  f1-score   support

Rejected (0)       0.88      0.96      0.92        51
Accepted (1)       0.99      0.97      0.98       213

    accuracy                           0.97       264
   macro avg       0.93      0.96      0.95       264
weighted avg       0.97      0.97      0.97       264



# Random Forest

In [270]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Rebuild the feature matrix and target for scikit-learn; here the target is
# a flat vector rather than the (n, 1) column used by the custom tree above.
X = data.drop(columns='card').to_numpy()
Y = data['card'].to_numpy()

from sklearn.model_selection import train_test_split
# NOTE(review): this split uses random_state=42 while the decision tree
# section above uses random_state=41, so the two models are scored on
# different held-out rows and their metrics are not directly comparable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=42,
    test_size=0.2,
)

# fit() returns the estimator itself, so construction and training chain.
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=42,
).fit(X_train, Y_train)

Y_pred = rf_model.predict(X_test)

# Accuracy, confusion matrix and per-class report, one after the other.
print(
    accuracy_score(Y_test, Y_pred),
    confusion_matrix(Y_test, Y_pred),
    classification_report(Y_test, Y_pred),
    sep="\n",
)

0.9772727272727273
[[ 62   0]
 [  6 196]]
              precision    recall  f1-score   support

           0       0.91      1.00      0.95        62
           1       1.00      0.97      0.98       202

    accuracy                           0.98       264
   macro avg       0.96      0.99      0.97       264
weighted avg       0.98      0.98      0.98       264

