In [4]:
import pandas as pd

data=pd.read_csv('data.csv')

In [5]:
data.describe()

Unnamed: 0,Id,age,balance,day,duration,campaign,pdays,previous
count,45211.0,45202.0,45208.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,23606.0,40.954714,1362.34662,15.806419,258.16308,2.763841,40.197828,0.580323
std,13051.435847,11.539144,3044.852387,8.322476,257.527812,3.098021,100.128746,2.303441
min,1001.0,-1.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,12303.5,33.0,72.0,8.0,103.0,1.0,-1.0,0.0
50%,23606.0,39.0,448.0,16.0,180.0,2.0,-1.0,0.0
75%,34908.5,48.0,1428.0,21.0,319.0,3.0,-1.0,0.0
max,46211.0,999.0,102127.0,31.0,4918.0,63.0,871.0,275.0


In [6]:
data.isnull().sum()

Id           0
age          9
job          0
marital      0
education    0
default      0
balance      3
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [7]:
data=data.dropna(axis=0)

In [8]:
data.head(10)

Unnamed: 0,Id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,1001,999.0,management,married,tertiary,no,2143.0,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,1002,44.0,technician,single,secondary,no,29.0,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,1003,33.0,entrepreneur,married,secondary,no,2.0,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,1004,47.0,blue-collar,married,unknown,no,1506.0,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,1005,33.0,unknown,single,unknown,no,1.0,no,no,unknown,5,may,198,1,-1,0,unknown,no
5,1006,35.0,management,married,tertiary,no,231.0,yes,no,unknown,5,may,139,1,-1,0,unknown,no
6,1007,28.0,management,single,tertiary,no,447.0,yes,yes,unknown,5,may,217,1,-1,0,unknown,no
10,1011,41.0,admin.,divorced,secondary,no,270.0,yes,no,unknown,5,may,222,1,-1,0,unknown,no
11,1012,29.0,admin.,single,secondary,no,390.0,yes,no,unknown,5,may,137,1,-1,0,unknown,no
12,1013,53.0,technician,married,secondary,no,6.0,yes,no,unknown,5,may,517,1,-1,0,unknown,no


In [9]:
#Remove all 'unknown' values
data=data[(data !='unknown').all(axis=1)]

In [10]:
#Remove ages below 0
data= data[(data.age > 0)]

In [11]:
#apply a simple label encoding
def label_encode(df, categorical_columns):

    for column in categorical_columns:
        category_to_int = {category: idx for idx, category in enumerate(set(df[column]))}
        df[column] = df[column].map(category_to_int)
    return df

categorical_columns = ['job', 'marital', 'education','contact','poutcome']
encoded_df = label_encode(data, categorical_columns)


In [12]:
#mapping yes and no to 1 and 0
yn_map={'yes':1,'no':0}
yn_columns=['default','housing','loan','y']
for column in yn_columns:
    data[column]= data[column].map(yn_map)

In [13]:
#mapping months to a relative number
month_map={'jan':1,'feb':2,'mar':3,'apr':4,'may':5,'jun':6,'jul':7,'aug':8,'sep':9,'oct':10,'nov':11,'dec':12}
data.month= data.month.map(month_map)

In [14]:
data

Unnamed: 0,Id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
24060,25061,33.0,5,0,1,0,882.0,0,0,1,21,10,39,1,151,3,0,0
24062,25063,42.0,5,1,0,0,-247.0,1,1,1,21,10,519,1,166,1,2,1
24064,25065,33.0,3,0,0,0,3444.0,1,0,1,21,10,144,1,91,4,0,1
24072,25073,36.0,6,0,1,0,2415.0,1,0,1,22,10,73,1,86,4,2,0
24077,25078,36.0,6,0,1,0,0.0,1,0,1,23,10,140,1,143,3,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45195,46196,68.0,0,0,0,0,1146.0,0,0,0,16,11,212,1,187,6,1,1
45199,46200,34.0,1,1,0,0,1475.0,1,0,0,16,11,1166,3,530,12,2,0
45201,46202,53.0,6,0,1,0,583.0,0,0,0,17,11,226,1,184,4,1,1
45208,46209,72.0,0,0,0,0,5715.0,0,0,0,17,11,1127,5,184,3,1,1


In [15]:
import graphviz
import numpy as np
from collections import Counter

#a class to save nodes
class DecisionTreeNode:
    def __init__(self, feature_idx=None, threshold=None, value=None, left=None, right=None):
        self.feature_idx = feature_idx
        self.threshold = threshold
        self.value = value
        self.left = left
        self.right = right

#the main decision tree class
class DecisionTree:
    def __init__(self, max_depth=None, criterion='gini'):
        self.max_depth = max_depth
        self.criterion = criterion
        self.root = None

    def gini_impurity(self, y):
        m = y.size
        counts = np.bincount(y)
        gini = 1 - np.sum((counts / m) ** 2)
        return gini

    def entropy(self, y):
        m = y.size
        counts = np.bincount(y)
        probabilities = counts / m
        entropy = -np.sum(probabilities * np.log2(probabilities + 1e-8))
        return entropy

    def split_data(self, X, y, feature_idx, threshold):
        left_idx = np.argwhere(X[:, feature_idx] <= threshold).flatten()
        right_idx = np.argwhere(X[:, feature_idx] > threshold).flatten()
        X_left, X_right = X[left_idx], X[right_idx]
        y_left, y_right = y[left_idx], y[right_idx]
        return X_left, X_right, y_left, y_right

    def best_split(self, X, y, criterion):
        m, n = X.shape
        best_gain = -np.inf
        best_idx, best_thresh = None, None

        for idx in range(n):
            thresholds = np.unique(X[:, idx])

            for thresh in thresholds:
                X_left, X_right, y_left, y_right = self.split_data(X, y, idx, thresh)

                if len(y_left) == 0 or len(y_right) == 0:
                    continue

                if criterion == 'gini':
                    gain = self.gini_impurity(y) - (len(y_left) / m) * self.gini_impurity(y_left) - (len(y_right) / m) * self.gini_impurity(y_right)
                else:
                    gain = self.entropy(y) - (len(y_left) / m) * self.entropy(y_left) - (len(y_right) / m) * self.entropy(y_right)

                if gain > best_gain:
                    best_gain = gain
                    best_idx = idx
                    best_thresh = thresh

        return best_idx, best_thresh

    def build_tree(self, X, y, depth=0):
        m, n = X.shape

        if depth == self.max_depth or len(np.unique(y)) == 1:
            leaf_value = self.get_leaf_value(y)
            return DecisionTreeNode(value=leaf_value)

        best_idx, best_thresh = self.best_split(X, y, self.criterion)

        if best_idx is None:
            leaf_value = self.get_leaf_value(y)
            return DecisionTreeNode(value=leaf_value)

        left_idx = np.argwhere(X[:, best_idx] <= best_thresh).flatten()
        right_idx = np.argwhere(X[:, best_idx] > best_thresh).flatten()

        root = DecisionTreeNode(feature_idx=best_idx, threshold=best_thresh)
        root.left = self.build_tree(X[left_idx], y[left_idx], depth + 1)
        root.right = self.build_tree(X[right_idx], y[right_idx], depth + 1)

        return root

    def get_leaf_value(self, y):
        counts = np.bincount(y)
        if len(counts) == 0:
            return None
        return np.argmax(counts)

    def fit(self, X, y):
        self.root = self.build_tree(X, y)

    def predict(self, X):
        predictions = np.zeros(X.shape[0])

        for i, x in enumerate(X):
            node = self.root
            while node is not None:
                if node.value is not None:
                    predictions[i] = node.value
                    break
                if x[node.feature_idx] <= node.threshold:
                    node = node.left
                else:
                    node = node.right

        return predictions

#Calculate the cross-validation score for k=5 in default
    def cross_val_score(self, X, y, k=5):
        m = len(X)
        fold_size = m // k
        scores = []

        for i in range(k):
            start = i * fold_size
            end = start + fold_size
            X_test = X[start:end]
            y_test = y[start:end]
            X_train = np.concatenate((X[:start], X[end:]), axis=0)
            y_train = np.concatenate((y[:start], y[end:]), axis=0)

            self.fit(X_train, y_train)
            y_pred = self.predict(X_test)
            accuracy = np.mean(y_pred == y_test)
            scores.append(accuracy)

        return np.array(scores)

    def precision_score(self, y_true, y_pred):
        true_positives = np.sum((y_true == 1) & (y_pred == 1))
        predicted_positives = np.sum(y_pred == 1)
        if predicted_positives == 0:
            return 0
        return true_positives / predicted_positives

#making the tree graph using the graphviz module
    def visualize(self, feature_names=None):

        dot = graphviz.Digraph()
        self._add_node(dot, self.root, feature_names)
        return dot

    def _add_node(self, dot, node, feature_names):

        if node is None:
            return

        if node.value is not None:
            label = f"Value: {node.value}"
        else:
            feature_idx = node.feature_idx
            if feature_names is not None:
                feature_name = feature_names[feature_idx]
            else:
                feature_name = f"Feature {feature_idx}"
            label = f"{feature_name} <= {node.threshold}"

        dot.node(str(id(node)), label=label)

        if node.left:
            dot.edge(str(id(node)), str(id(node.left)))
            self._add_node(dot, node.left, feature_names)
        if node.right:
            dot.edge(str(id(node)), str(id(node.right)))
            self._add_node(dot, node.right, feature_names)

In [16]:
data=data.reset_index(drop=True)
data= data.drop(columns='Id')

In [17]:
#split X and y
X = data.drop('y', axis=1)
y= data.y

In [18]:
#Splitting the data into train and test manually with 80% ratio
split_index = int(0.8 * len(data))
X_train = X.loc[:split_index]
X_test = X.loc[split_index:]
y_train = y.loc[:split_index]
y_test = y.loc[split_index:]

In [19]:
#building the tree 
dt = DecisionTree(max_depth=5, criterion='gini')

In [20]:
dt.fit(X_train.to_numpy(), y_train.to_numpy())

In [25]:
#Splitting the data using a library for comparing

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2,random_state = 42)

In [34]:
#Applying decision tree using library for comparing results

from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(max_depth=5, random_state = 42)

clf.fit(X_train, y_train)



In [35]:
#predicting using the sklearn medel
test_pred_decision_tree = clf.predict(X_test)

In [21]:
#predicting using our own model
test_pred_decision_tree = dt.predict(X_test.values)

In [36]:
#some evaluation
from sklearn.metrics import accuracy_score
accuracy_score(y_test,test_pred_decision_tree)

0.8406628425748884

In [20]:
#calculating our models cross-validation score
cval = dt.cross_val_score(X.values ,y.values, k=5)

In [21]:
cval

array([0.88392857, 0.88201531, 0.91709184, 0.73086735, 0.60778061])

In [22]:
#calculate precision of our model
dt.precision_score(y_test.values,test_pred_decision_tree)

0.8177458033573142

In [23]:
#making the plot
plot = dt.visualize(X.columns)

In [24]:
#getting the final render
plot.render('tree', view=True, format='png')

'tree.png'