In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the CSV into a DataFrame and preview its first ten rows.
df = pd.read_csv(filepath_or_buffer='AER_credit_card_data.csv')
df.head(n=10)

Unnamed: 0,card,reports,age,income,share,expenditure,owner,selfemp,dependents,months,majorcards,active
0,yes,0,37.66667,4.52,0.03327,124.9833,yes,no,3,54,1,12
1,yes,0,33.25,2.42,0.005217,9.854167,no,no,3,34,1,13
2,yes,0,33.66667,4.5,0.004156,15.0,yes,no,4,58,1,5
3,yes,0,30.5,2.54,0.065214,137.8692,no,no,0,25,1,7
4,yes,0,32.16667,9.7867,0.067051,546.5033,yes,no,2,64,1,5
5,yes,0,23.25,2.5,0.044438,91.99667,no,no,0,54,1,1
6,yes,0,27.91667,3.96,0.012576,40.83333,no,no,2,7,1,5
7,yes,0,29.16667,2.37,0.076434,150.79,yes,no,0,77,1,3
8,yes,0,37.0,3.8,0.245628,777.8217,yes,no,0,97,1,6
9,yes,0,28.41667,3.2,0.01978,52.58,no,no,0,65,1,18


In [3]:
# Print the column names, non-null counts, dtypes and memory usage of df.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1319 entries, 0 to 1318
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   card         1319 non-null   object 
 1   reports      1319 non-null   int64  
 2   age          1319 non-null   float64
 3   income       1319 non-null   float64
 4   share        1319 non-null   float64
 5   expenditure  1319 non-null   float64
 6   owner        1319 non-null   object 
 7   selfemp      1319 non-null   object 
 8   dependents   1319 non-null   int64  
 9   months       1319 non-null   int64  
 10  majorcards   1319 non-null   int64  
 11  active       1319 non-null   int64  
dtypes: float64(4), int64(5), object(3)
memory usage: 123.8+ KB


In [4]:
# Count the missing values in each column.
df.isna().sum()

card           0
reports        0
age            0
income         0
share          0
expenditure    0
owner          0
selfemp        0
dependents     0
months         0
majorcards     0
active         0
dtype: int64

In [5]:
# Encode the yes/no flag columns as 1/0 so they can be used as numeric features.
yes_no_codes = {'yes': 1, 'no': 0}
for flag_column in ('owner', 'selfemp'):
    # Series.map turns every value that is not a key of the dict into NaN, so
    # re-running this cell on already-encoded 1/0 data would wipe the column.
    # Skipping columns that are already numeric makes the cell safe to re-run.
    if not pd.api.types.is_numeric_dtype(df[flag_column]):
        df[flag_column] = df[flag_column].map(yes_no_codes)
    # Fail loudly if a value other than 'yes'/'no' was silently mapped to NaN.
    assert df[flag_column].notna().all(), f"unexpected value in '{flag_column}' (expected 'yes' or 'no')"
df.head()

Unnamed: 0,card,reports,age,income,share,expenditure,owner,selfemp,dependents,months,majorcards,active
0,yes,0,37.66667,4.52,0.03327,124.9833,1,0,3,54,1,12
1,yes,0,33.25,2.42,0.005217,9.854167,0,0,3,34,1,13
2,yes,0,33.66667,4.5,0.004156,15.0,1,0,4,58,1,5
3,yes,0,30.5,2.54,0.065214,137.8692,0,0,0,25,1,7
4,yes,0,32.16667,9.7867,0.067051,546.5033,1,0,2,64,1,5


In [6]:
class Node:
    """A single node of a binary decision tree.

    A node built with ``feature_index``/``threshold``/``left``/``right``/
    ``info_gain`` describes a split; a node built with only ``value`` is a
    leaf carrying a prediction. Every field defaults to ``None``.
    """

    def __init__(self, feature_index=None, threshold=None, left=None, right=None, info_gain=None, value=None):
        # Split description: which feature column is tested and against what.
        self.feature_index, self.threshold = feature_index, threshold
        # Child nodes reached on either side of the threshold test.
        self.left, self.right = left, right
        # Gain that was recorded for this split when it was chosen.
        self.info_gain = info_gain

        # Prediction stored on a leaf; stays None on split nodes.
        self.value = value

In [83]:
class DecisionTree():
    """Binary decision-tree classifier that chooses splits by Gini impurity reduction.

    The tree is grown greedily: at every node each unique value of each feature
    is tried as a ``<=`` threshold and the split with the largest impurity
    reduction is kept. Leaves predict the most frequent label of the rows that
    reached them.

    Relies on the module-level names ``np`` (numpy), ``pd`` (pandas) and ``Node``.

    Parameters
    ----------
    min_samples_split : int
        A node is only considered for splitting when it holds at least this
        many rows.
    max_depth : int
        A node is only considered for splitting while its depth (root = 0) is
        ``<= max_depth``; nodes at depth ``max_depth`` may therefore still
        split, giving leaves as deep as ``max_depth + 1``.
    """

    def __init__(self, min_samples_split = 2, max_depth = 2):
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth

        # Root Node of the fitted tree; None until fit() has been called.
        self.root = None

    def build_tree(self, dataset, curr_depth = 0):
        """Recursively grow a (sub)tree and return its root ``Node``.

        ``dataset`` is a 2-D array whose last column holds the labels and whose
        remaining columns hold the features. ``curr_depth`` is the depth of the
        node being built (0 for the root).
        """
        X, y = dataset[:, :-1], dataset[:, -1]
        samples, features = np.shape(X)

        if samples >= self.min_samples_split and curr_depth <= self.max_depth:
            best_split = self.get_best_split(dataset, samples, features)

            # get_best_split returns an empty dict when no threshold separates
            # the rows into two non-empty groups (e.g. every feature is
            # constant); treat that as "no gain" instead of raising KeyError.
            if best_split.get('info_gain', 0) > 0:
                left_subtree = self.build_tree(best_split['left_dataset'], curr_depth + 1)
                right_subtree = self.build_tree(best_split['right_dataset'], curr_depth + 1)
                return Node(
                    best_split['feature_index'],
                    best_split['threshold'],
                    left_subtree,
                    right_subtree,
                    best_split['info_gain'],
                )

        # Leaf: predict the most frequent label (mode() sorts ties, so the
        # first entry is taken).
        leaf_value = pd.Series(y.reshape(-1)).mode()[0]
        return Node(value = leaf_value)

    def get_best_split(self, dataset, samples, features):
        """Return the split of ``dataset`` with the highest information gain.

        Every unique value of each of the first ``features`` columns is tried
        as a threshold. The result is a dict with the keys ``left_dataset``,
        ``right_dataset``, ``info_gain``, ``feature_index`` and ``threshold``,
        or an empty dict when no threshold yields two non-empty sides.
        ``samples`` is accepted for interface compatibility and is not used.
        """
        best_split = {}
        max_ig = -float('inf')
        labels = dataset[:, -1]  # parent labels are the same for every candidate

        for feature_index in range(features):
            possible_thresholds = np.unique(dataset[:, feature_index])

            for threshold in possible_thresholds:
                left_dataset, right_dataset = self.split(dataset, feature_index, threshold)

                # A split that leaves one side empty separates nothing.
                if len(left_dataset) == 0 or len(right_dataset) == 0:
                    continue

                curr_ig = self.compute_info_gain(labels, left_dataset[:, -1], right_dataset[:, -1])

                # Strict '>' keeps the first candidate found among equal gains.
                if curr_ig > max_ig:
                    best_split = {
                        'left_dataset': left_dataset,
                        'right_dataset': right_dataset,
                        'info_gain': curr_ig,
                        'feature_index': feature_index,
                        'threshold': threshold,
                    }
                    max_ig = curr_ig
        return best_split

    def split(self, dataset, feature_index, threshold):
        """Partition the rows of ``dataset`` on one feature.

        Returns ``(left_dataset, right_dataset)`` where the left part holds the
        rows with ``row[feature_index] <= threshold`` and the right part the
        rows with ``row[feature_index] > threshold``.
        """
        column = dataset[:, feature_index]
        # Boolean masks select the rows in one vectorized step instead of a
        # Python-level loop over every row for every candidate threshold.
        left_dataset = dataset[column <= threshold]
        right_dataset = dataset[column > threshold]
        return left_dataset, right_dataset

    def compute_info_gain(self, parent, lchild, rchild):
        """Return the Gini impurity of ``parent`` minus the size-weighted impurity of its children."""
        l_weight = len(lchild)/len(parent)
        r_weight = len(rchild)/len(parent)
        info_gain = self.gini(parent) - (l_weight * self.gini(lchild) + r_weight * self.gini(rchild))
        return info_gain

    def gini(self, y):
        """Return the Gini impurity ``1 - sum(p_i ** 2)`` of the labels in ``y``."""
        # One pass yields the per-class counts (classes come back sorted).
        _, counts = np.unique(y, return_counts=True)
        total = len(y)
        squared_sum = 0
        for count in counts.tolist():
            p_i = count/total
            squared_sum = squared_sum + p_i**2
        return 1 - squared_sum

    def fit(self, X, y):
        """Grow the tree from feature matrix ``X`` and labels ``y``.

        ``X`` is a 2-D array of shape (n_samples, n_features). ``y`` may be a
        column of shape (n_samples, 1) or a flat array of shape (n_samples,);
        a flat array is reshaped into a column before being appended to ``X``.
        """
        y = np.asarray(y)
        if y.ndim == 1:
            y = y.reshape(-1, 1)
        dataset = np.concatenate((X, y), axis = 1)
        self.root = self.build_tree(dataset)

    def traverse(self, i, node):
        """Return the leaf value reached by the single sample ``i`` starting at ``node``."""
        # Split nodes carry value None; compare by identity rather than '=='.
        while node.value is None:
            if i[node.feature_index] <= node.threshold:
                node = node.left
            else:
                node = node.right
        return node.value

    def predict(self, X):
        """Return a list with one predicted label per row of ``X``; fit() must have been called first."""
        return [self.traverse(i, self.root) for i in X]

In [84]:
# Features are every column except the target; the target is kept as an
# (n, 1) column because DecisionTree.fit concatenates it to X along axis 1.
target_column = 'card'
X = df.drop(columns=target_column).to_numpy()
y = df[[target_column]].to_numpy()

In [85]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [86]:
# Hyper-parameters of the hand-written tree, gathered in one place for easy tuning.
tree_params = {'min_samples_split': 4, 'max_depth': 5}
model = DecisionTree(**tree_params)
model.fit(X_train, y_train)

In [87]:
# Predict a label for every held-out row (returned as a plain Python list).
y_pred = model.predict(X=X_test)

In [88]:
from sklearn.metrics import accuracy_score, classification_report 
# Share of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'The accuracy score for the model is {accuracy*100}%.')

The accuracy score for the model is 96.21212121212122%.


In [89]:
# Per-class precision, recall and F1 on the held-out rows.
report_text = classification_report(y_test, y_pred)
print(report_text)

              precision    recall  f1-score   support

          no       0.92      0.92      0.92        62
         yes       0.98      0.98      0.98       202

    accuracy                           0.96       264
   macro avg       0.95      0.95      0.95       264
weighted avg       0.96      0.96      0.96       264

