In [1]:
from sklearn.datasets import fetch_openml
import numpy as np
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

mnist = fetch_openml('mnist_784', version=1)

X = mnist.data  
y = mnist.target 

X = X.to_numpy()
y = y.to_numpy()

X = X / 255.0 

In [2]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=10000, train_size=60000, random_state=42)

X_train = X_train.astype(np.float64)
X_test = X_test.astype(np.float64)

y_train = y_train.astype(np.int64)
y_test = y_test.astype(np.int64)

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

clf = DecisionTreeClassifier(random_state=42)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of Decision Tree Classifier: {accuracy * 100:.2f}%")

Accuracy of Decision Tree Classifier: 87.42%


In [4]:
from sklearn.utils import resample

def train_decision_trees(X_train, y_train, n_trees=20, feature_fraction=0.7, data_fraction=0.5):
    n_samples = X_train.shape[0]
    n_features = X_train.shape[1]
    feature_subset_size = int(n_features * feature_fraction)
    data_subset_size = int(n_samples * data_fraction)
    
    trees = []
    
    for i in range(n_trees):
        X_bootstrap, y_bootstrap = resample(X_train, y_train, replace=True, n_samples=data_subset_size)
        
        feature_indices = np.random.choice(range(n_features), size=feature_subset_size, replace=True)
        
        X_bootstrap = X_bootstrap[:, feature_indices]
        
        clf = DecisionTreeClassifier(random_state=i)
        clf.fit(X_bootstrap, y_bootstrap)
        
        trees.append((clf, feature_indices))
    
    return trees


trained_trees = train_decision_trees(X_train, y_train)


In [5]:
def test_majority_voting(trees, X_test, y_test):
    predictions = []
    for clf, feature_indices in trees:
        y_pred = clf.predict(X_test[:, feature_indices])
        predictions.append(y_pred)
    
    predictions = np.array(predictions)  # Shape: (n_trees, n_samples)
    y_final_pred = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=0, arr=predictions)
    
    accuracy = accuracy_score(y_test, y_final_pred)
    return accuracy

def test_weighted_voting(trees, X_train, y_train, X_test, y_test):
    predictions = []
    weights = []

    for clf, feature_indices in trees:
        y_pred = clf.predict(X_test[:, feature_indices])
        predictions.append(y_pred)
        
        X_train_subset = X_train[:, feature_indices]
        y_train_pred = clf.predict(X_train_subset)
        tree_accuracy = accuracy_score(y_train, y_train_pred)
        weights.append(tree_accuracy)
    
    weights = np.array(weights)
    weights /= np.sum(weights)
    
    predictions = np.array(predictions)
    weighted_votes = np.zeros((predictions.shape[1], len(np.unique(y_test))))
    for i, weight in enumerate(weights):
        for j, pred in enumerate(predictions[i]):
            weighted_votes[j, int(pred)] += weight
    
    y_final_pred = np.argmax(weighted_votes, axis=1)
    
    accuracy = accuracy_score(y_test, y_final_pred)
    return accuracy


In [6]:

accuracy_majority = test_majority_voting(trained_trees, X_test, y_test)
print(f"Accuracy with Majority Voting: {accuracy_majority * 100:.2f}%")

accuracy_weighted = test_weighted_voting(trained_trees, X_train, y_train, X_test, y_test)
print(f"Accuracy with Weighted Voting: {accuracy_weighted * 100:.2f}%")

Accuracy with Majority Voting: 95.47%
Accuracy with Weighted Voting: 95.47%


In [7]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
        n_estimators=20,        
        max_features=0.7,
        max_samples=0.5,  
        bootstrap=True,             
        random_state=42             
    )

rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy with RF: {accuracy * 100:.2f}%")


Accuracy with RF: 95.28%


In [19]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

def fit_adaboost(X, y, n_estimators=20):
    n_samples, n_features = X.shape
    weights = np.ones(n_samples) / n_samples  
    models = []
    alphas = []

    # Loop to train weak classifiers
    for estimator in range(n_estimators):
        model = DecisionTreeClassifier(max_depth=5) 
        model.fit(X, y, sample_weight=weights) 
        
        y_pred = model.predict(X)
        
        incorrect = (y_pred != y)
        error = np.dot(weights, incorrect) / np.sum(weights)
        
        alpha = 0.5 * np.log((1 - error) / error) if error < 1 else 0
        
        models.append(model)
        alphas.append(alpha)
        
        weights = weights * np.exp(-alpha * (y_pred == y)) 
        weights = weights / np.sum(weights) 

    return models, alphas

def predict_and_accuracy(X, y, models, alphas):
    clf_preds = np.array([model.predict(X) for model in models])
    n_samples = X.shape[0]
    n_classes = len(np.unique(clf_preds))

    weighted_preds = np.zeros((n_classes, n_samples))

    for i in range(len(models)):
        for c in range(n_classes):
            weighted_preds[c] += (clf_preds[i] == c) * alphas[i]

    final_pred = np.argmax(weighted_preds, axis=0)

    accuracy = np.mean(final_pred == y)
    
    return  accuracy

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

models, alphas = fit_adaboost(X_train, y_train, n_estimators=20)

acc = predict_and_accuracy(X_test, y_test, models, alphas)
print(f"Accuracy: {acc * 100:.4f}%")

Accuracy: 82.6800%


In [20]:
from sklearn.ensemble import AdaBoostClassifier

def adaboost_sklearn(X_train, y_train, X_test, y_test, n_trees=20):    
    clf = AdaBoostClassifier(
        estimator=DecisionTreeClassifier(max_depth=5),  # Weak learner (stump)
        n_estimators=n_trees,
        random_state=42,
        algorithm="SAMME.R"
    )
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

accuracy_sklearn = adaboost_sklearn(X_train, y_train, X_test, y_test)
print(f"Accuracy with Sklearn AdaBoost: {accuracy_sklearn * 100:.2f}%")



Accuracy with Sklearn AdaBoost: 82.93%
