In [1]:
import numpy as np
import pandas as pd

emails = pd.read_csv('mynewnew.csv')

X= emails['email_text'].values
y= emails['spam'].values


In [2]:
from collections import Counter

def bag_of_words(text):
    words = []
    for email in text:
        words.extend(email.split())
    vocab = sorted(set(words))
    
    bow = np.zeros((len(text), len(vocab)), dtype=np.int64)
    for i, email in enumerate(text):
        freq = Counter(email.split())
        for j, word in enumerate(vocab):
            bow[i, j] = freq[word]
    return bow, vocab

X, vocab = bag_of_words(X)
from numpy.random import shuffle

idx = np.arange(X.shape[0])
shuffle(idx)
X = X[idx]
y = y[idx]

split = int(0.8 * len(y))
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]


In [3]:
class DecisionTreeClassifier:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None
    
    def entropy(self, y):
        n_samples = len(y)
        if n_samples == 0:
            return 0
        class_counts = np.bincount(y)
        probs = class_counts / n_samples
        probs = probs[probs > 0]
        return -np.sum(probs * np.log2(probs))
    
    def information_gain(self, X, y, feature_idx):
        n_samples = len(y)
        y0 = y[X[:, feature_idx] == 0]
        y1 = y[X[:, feature_idx] == 1]
        p0 = len(y0) / n_samples
        p1 = len(y1) / n_samples
        entropy_y = self.entropy(y)
        entropy_y0 = self.entropy(y0)
        entropy_y1 = self.entropy(y1)
        return entropy_y - p0 * entropy_y0 - p1 * entropy_y1
    
    def build_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        n_classes = len(np.unique(y))
        
        if n_classes == 1 or n_samples < 2 or (self.max_depth is not None and depth >= self.max_depth):
            return np.bincount(y).argmax()
        
        best_feature = None
        best_gain = -np.inf
        for feature in range(n_features):
            gain = self.information_gain(X, y, feature)
            if gain > best_gain:
                best_gain = gain
                best_feature = feature
        
        tree = {'feature': best_feature}
        X_left = X[X[:, best_feature] == 0]
        y_left = y[X[:, best_feature] == 0]
        X_right = X[X[:, best_feature] == 1]
        y_right = y[X[:, best_feature] == 1]
        
        tree['left'] = self.build_tree(X_left, y_left, depth + 1)
        tree['right'] = self.build_tree(X_right, y_right, depth + 1)
        return tree
    
    def fit(self, X, y):
        self.tree = self.build_tree(X, y)
    
    def predict(self, X):
        predictions = np.zeros(len(X))
        for i, sample in enumerate(X):
            node = self.tree
            while isinstance(node, dict):
                feature = node['feature']
                if sample[feature] == 0:
                    node = node['left']
                else:
                    node = node['right']
            predictions[i] = node
        return predictions


In [4]:
clf = DecisionTreeClassifier(max_depth=5)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

accuracy = np.mean(y_pred == y_test)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.91
