In [2]:
import pandas as pd
import numpy as np

# Loading the dataset
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
columns = ['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI',
           'DiabetesPedigreeFunction','Age','Outcome']

data = pd.read_csv(url, names=columns)

X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values


In [3]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [2]:
def train_test_split(X, y, test_size=0.2, seed=42):
    np.random.seed(seed)
    indices = np.arange(len(X))
    np.random.shuffle(indices)
    split = int(len(X) * (1 - test_size))
    train_idx, test_idx = indices[:split], indices[split:]
    return X[train_idx], X[test_idx], y[train_idx], y[test_idx]

X_train, X_test, y_train, y_test = train_test_split(X, y)


In [3]:
def accuracy(y_true, y_pred):
    return np.sum(y_true == y_pred) / len(y_true)


In [4]:
class DecisionTree:
    def __init__(self, max_depth=5):
        self.max_depth = max_depth
        self.tree = None

    # Calculating Gini Impurity
    def gini(self, y):
        classes = np.unique(y)
        impurity = 1
        for c in classes:
            p = np.sum(y == c) / len(y)
            impurity -= p**2
        return impurity

    # Find the best split (feature + threshold)
    def best_split(self, X, y):
        best_idx, best_thresh, best_score = None, None, 1e9
        for feature in range(X.shape[1]):
            thresholds = np.unique(X[:, feature])
            for t in thresholds:
                left = y[X[:, feature] <= t]
                right = y[X[:, feature] > t]
                if len(left) == 0 or len(right) == 0:
                    continue
                score = (len(left)/len(y))*self.gini(left) + (len(right)/len(y))*self.gini(right)
                if score < best_score:
                    best_idx, best_thresh, best_score = feature, t, score
        return best_idx, best_thresh

    # Building tree recursively
    def build(self, X, y, depth=0):
        if depth >= self.max_depth or len(np.unique(y)) == 1:
            return np.bincount(y).argmax()

        feat, thresh = self.best_split(X, y)
        if feat is None:
            return np.bincount(y).argmax()

        left_mask = X[:, feat] <= thresh
        right_mask = X[:, feat] > thresh

        return {
            'feature': feat,
            'threshold': thresh,
            'left': self.build(X[left_mask], y[left_mask], depth+1),
            'right': self.build(X[right_mask], y[right_mask], depth+1)
        }

    def fit(self, X, y):
        self.tree = self.build(X, y)

    def predict_one(self, x, node):
        if not isinstance(node, dict):
            return node
        if x[node['feature']] <= node['threshold']:
            return self.predict_one(x, node['left'])
        else:
            return self.predict_one(x, node['right'])

    def predict(self, X):
        return np.array([self.predict_one(x, self.tree) for x in X])


In [5]:
dt = DecisionTree(max_depth=5)
dt.fit(X_train, y_train)

y_pred = dt.predict(X_test)
print("Decision Tree Accuracy:", accuracy(y_test, y_pred))


Decision Tree Accuracy: 0.7532467532467533


In [6]:
def cross_val_score_manual(model_class, X, y, k=5, **kwargs):
    fold_size = len(X) // k
    scores = []
    for i in range(k):
        start, end = i*fold_size, (i+1)*fold_size
        X_val, y_val = X[start:end], y[start:end]
        X_train = np.concatenate((X[:start], X[end:]), axis=0)
        y_train = np.concatenate((y[:start], y[end:]), axis=0)

        model = model_class(**kwargs)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        scores.append(accuracy(y_val, y_pred))
    return scores


In [7]:
cv_scores = cross_val_score_manual(DecisionTree, X, y, k=5, max_depth=5)
print("Cross-validation scores:", cv_scores)
print("Average score:", np.mean(cv_scores))


Cross-validation scores: [np.float64(0.7320261437908496), np.float64(0.6405228758169934), np.float64(0.7581699346405228), np.float64(0.8169934640522876), np.float64(0.7647058823529411)]
Average score: 0.742483660130719


In [8]:
def confusion_matrix(y_true, y_pred):
    classes = np.unique(y_true)
    matrix = np.zeros((len(classes), len(classes)), dtype=int)
    for i in range(len(y_true)):
        matrix[y_true[i], y_pred[i]] += 1
    return matrix


In [9]:
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)


Confusion Matrix:
 [[83 13]
 [25 33]]
