In [53]:
import csv
import random

def load_dataset(file_path):
    dataset = []
    with open(file_path, 'r') as file:
        csv_reader = csv.reader(file)
        next(csv_reader)  
        for row in csv_reader:
            if not row:
                continue
            processed_row = [float(x) for x in row[:-1]] + [int(row[-1])]
            dataset.append(processed_row)
    return dataset

def handle_missing_values(dataset, features):
    for row in dataset:
        for i in features:
            if row[i] == 0:
                feature_values = [r[i] for r in dataset if r[i] != 0]
                mean_value = sum(feature_values) / len(feature_values)
                row[i] = mean_value
    return dataset

def train_test_split(dataset, test_size=0.2):
    random.shuffle(dataset)
    split_index = int(len(dataset) * (1 - test_size))
    return dataset[:split_index], dataset[split_index:]

features_with_missing = [2, 3, 4, 5, 6]  

file_path = r'C:\Users\adith\Downloads\pima-indians-diabetes.csv'
dataset = load_dataset(file_path)
dataset = handle_missing_values(dataset, features_with_missing)
train_set, test_set = train_test_split(dataset, test_size=0.2)

print(f"Training samples: {len(train_set)}, Testing samples: {len(test_set)}")


Training samples: 614, Testing samples: 154


In [54]:
import math
from collections import Counter

In [55]:
def entropy(dataset):
    labels = [row[-1] for row in dataset]
    total = len(labels)
    counts = Counter(labels)
    ent = 0.0
    for count in counts.values():
        probability = count / total
        ent -= probability * math.log2(probability)
    return ent

def information_gain(parent, left, right):
    total = len(parent)
    ent_parent = entropy(parent)
    ent_left = entropy(left)
    ent_right = entropy(right)
    weighted_ent = (len(left) / total) * ent_left + (len(right) / total) * ent_right
    return ent_parent - weighted_ent

In [56]:
def best_split(dataset, features):
    best_feature = None
    best_value = None
    best_gain = -1
    for feature in features:
        values = set([row[feature] for row in dataset])
        for value in values:
            left = [row for row in dataset if row[feature] <= value]
            right = [row for row in dataset if row[feature] > value]
            if not left or not right:
                continue
            gain = information_gain(dataset, left, right)
            if gain > best_gain:
                best_gain = gain
                best_feature = feature
                best_value = value
    return best_feature, best_value


In [57]:
def build_tree(dataset, features, depth=0, max_depth=10):
    labels = [row[-1] for row in dataset]
    if len(set(labels)) == 1:
        return labels[0]
    if not features or depth == max_depth:
        return Counter(labels).most_common(1)[0][0]
    best_feature, best_value = best_split(dataset, features)
    if best_feature is None:
        return Counter(labels).most_common(1)[0][0]
    left = [row for row in dataset if row[best_feature] <= best_value]
    right = [row for row in dataset if row[best_feature] > best_value]
    if not left or not right:
        return Counter(labels).most_common(1)[0][0]
    tree = {
        'feature': best_feature,
        'value': best_value,
        'left': build_tree(left, features, depth + 1, max_depth),
        'right': build_tree(right, features, depth + 1, max_depth)
    }
    return tree

In [58]:
def predict(tree, instance):
    if isinstance(tree, int):
        return tree
    feature = tree['feature']
    value = tree['value']
    if instance[feature] <= value:
        return predict(tree['left'], instance)
    else:
        return predict(tree['right'], instance)

In [59]:
def bootstrap_sample(dataset):
    n = len(dataset)
    sample = []
    for _ in range(n):
        index = random.randint(0, n - 1)
        sample.append(dataset[index])
    return sample

class BaggingClassifier:
    def __init__(self, n_estimators=10, max_depth=10):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.trees = []
    
    def fit(self, dataset, features):
        for _ in range(self.n_estimators):
            sample = bootstrap_sample(dataset)
            tree = build_tree(sample, features, max_depth=self.max_depth)
            self.trees.append(tree)
    
    def predict(self, instance):
        predictions = [predict(tree, instance) for tree in self.trees]
        return Counter(predictions).most_common(1)[0][0]
    
    def predict_batch(self, dataset):
        return [self.predict(instance) for instance in dataset]


In [60]:
class AdaBoostClassifier:
    def __init__(self, n_estimators=10, max_depth=1):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.trees = []
        self.alpha = []
    
    def fit(self, dataset, features):
        n = len(dataset)
        weights = [1 / n] * n
        for _ in range(self.n_estimators):
            weighted_dataset = []
            for i in range(n):
                weighted_dataset.extend([dataset[i]] * int(weights[i] * 1000))  # Scale weights
            tree = build_tree(weighted_dataset, features, max_depth=self.max_depth)
            predictions = [predict(tree, row) for row in dataset]
            error = 0
            for i in range(n):
                if predictions[i] != dataset[i][-1]:
                    error += weights[i]
            if error > 0.5:
                break
            alpha = 0.5 * math.log((1 - error) / (error + 1e-10))  # Add epsilon to avoid division by zero
            self.alpha.append(alpha)
            self.trees.append(tree)
            # Update weights
            for i in range(n):
                if predictions[i] == dataset[i][-1]:
                    weights[i] *= math.exp(-alpha)
                else:
                    weights[i] *= math.exp(alpha)
            total_weight = sum(weights)
            weights = [w / total_weight for w in weights]
    
    def predict(self, instance):
        total = 0
        for tree, alpha in zip(self.trees, self.alpha):
            prediction = predict(tree, instance)
            total += alpha if prediction == 1 else -alpha
        return 1 if total >= 0 else 0
    
    def predict_batch(self, dataset):
        return [self.predict(instance) for instance in dataset]


In [61]:
def majority_vote(predictions_list):
    final_predictions = []
    for i in range(len(predictions_list[0])):
        votes = [predictions[i] for predictions in predictions_list]
        final = Counter(votes).most_common(1)[0][0]
        final_predictions.append(final)
    return final_predictions


In [62]:
def confusion_matrix(true_labels, predicted_labels):
    TP = FP = TN = FN = 0
    for true, pred in zip(true_labels, predicted_labels):
        if true == 1 and pred == 1:
            TP += 1
        elif true == 0 and pred == 1:
            FP += 1
        elif true == 0 and pred == 0:
            TN += 1
        elif true == 1 and pred == 0:
            FN += 1
    return TP, FP, TN, FN

def accuracy(true_labels, predicted_labels):
    correct = sum(1 for true, pred in zip(true_labels, predicted_labels) if true == pred)
    return correct / len(true_labels)

def precision(true_labels, predicted_labels):
    TP, FP, _, _ = confusion_matrix(true_labels, predicted_labels)
    return TP / (TP + FP) if (TP + FP) != 0 else 0

def recall(true_labels, predicted_labels):
    TP, _, _, FN = confusion_matrix(true_labels, predicted_labels)
    return TP / (TP + FN) if (TP + FN) != 0 else 0

def f1_score(true_labels, predicted_labels):
    prec = precision(true_labels, predicted_labels)
    rec = recall(true_labels, predicted_labels)
    return 2 * (prec * rec) / (prec + rec) if (prec + rec) != 0 else 0


In [63]:
def print_performance(true_labels, predicted_labels, model_name):
    TP, FP, TN, FN = confusion_matrix(true_labels, predicted_labels)
    acc = accuracy(true_labels, predicted_labels)
    prec = precision(true_labels, predicted_labels)
    rec = recall(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels)
    print(f"Performance of {model_name}:")
    print(f"Confusion Matrix: TP={TP}, FP={FP}, TN={TN}, FN={FN}")
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1 Score: {f1:.4f}\n")

In [64]:
features = list(range(len(dataset[0]) - 1))

conventional_tree = build_tree(train_set, features, max_depth=10)
conventional_predictions = [predict(conventional_tree, instance) for instance in test_set]
true_labels = [row[-1] for row in test_set]
print_performance(true_labels, conventional_predictions, "Conventional Decision Tree")

Performance of Conventional Decision Tree:
Confusion Matrix: TP=28, FP=18, TN=80, FN=28
Accuracy: 0.7013
Precision: 0.6087
Recall: 0.5000
F1 Score: 0.5490



In [65]:
bagging = BaggingClassifier(n_estimators=10, max_depth=10)
bagging.fit(train_set, features)
bagging_predictions = bagging.predict_batch(test_set)
print_performance(true_labels, bagging_predictions, "Bagging Ensemble")

Performance of Bagging Ensemble:
Confusion Matrix: TP=35, FP=15, TN=83, FN=21
Accuracy: 0.7662
Precision: 0.7000
Recall: 0.6250
F1 Score: 0.6604



In [66]:
boosting = AdaBoostClassifier(n_estimators=10, max_depth=1)
boosting.fit(train_set, features)
boosting_predictions = boosting.predict_batch(test_set)
print_performance(true_labels, boosting_predictions, "Boosting Ensemble")

Performance of Boosting Ensemble:
Confusion Matrix: TP=29, FP=20, TN=78, FN=27
Accuracy: 0.6948
Precision: 0.5918
Recall: 0.5179
F1 Score: 0.5524



In [67]:
votes = [bagging_predictions, boosting_predictions]
voted_predictions = majority_vote(votes)
print_performance(true_labels, voted_predictions, "Majority Voting Ensemble")

Performance of Majority Voting Ensemble:
Confusion Matrix: TP=35, FP=15, TN=83, FN=21
Accuracy: 0.7662
Precision: 0.7000
Recall: 0.6250
F1 Score: 0.6604

