In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import math

In [16]:
#Reading Data
df = pd.read_csv("dataset.csv")
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
test_df, val_df = train_test_split(test_df, test_size=0.5, random_state=42)
X_train = train_df.drop(columns=['Unnamed: 0', 'smoking']).to_numpy()
y_train = train_df[['smoking']].to_numpy().reshape(-1) #Reshape from (n,1) to (n)

X_val = val_df.drop(columns=['Unnamed: 0', 'smoking']).to_numpy()
y_val = val_df[['smoking']].to_numpy().reshape(-1)

num_samples, num_features = X_train.shape
print("Number of samples: ", num_samples)
print("Number of features: ", num_features)

Number of samples:  127404
Number of features:  10


### **Boosting**

In [52]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier

class MyAdaBoostTree(AdaBoostClassifier):
    def __init__(self, num_samples, num_features, num_iterations, max_tree_height = 1):
        self.num_samples = num_samples
        self.num_features = num_features
        self.num_iterations = num_iterations
        self.max_tree_height = max_tree_height
        self.alphas = []
        self.models = []
        self.sample_weights = np.ones((num_samples))/num_samples

    def train(self, X_train, y_train):
        for iteration in range(self.num_iterations):
            weak_learner = DecisionTreeClassifier(criterion='gini', max_depth=self.max_tree_height)
            weak_learner.fit(X_train, y_train, sample_weight=self.sample_weights)

            sample_predictions = weak_learner.predict(X_train)
            incorrect = (sample_predictions != y_train)*1 #Multiply by 1 to convert True/False to 1/0
            weighted_error = np.multiply(incorrect, self.sample_weights).sum()  / self.sample_weights.sum()
            alpha = (0.5) * math.log((1-weighted_error) / weighted_error)
            
            #Add Model and Alpha to Ensemble
            self.alphas.append(alpha)
            self.models.append(weak_learner)
            
            #Update Weights
            self.sample_weights = np.multiply(self.sample_weights, np.exp(2*alpha*incorrect))
            self.sample_weights = self.sample_weights / self.sample_weights.sum()
    
    def predict(self, X):
        sum_predictions = np.zeros(X.shape[0])
        for idx, model in enumerate(self.models):
            prediction = model.predict(X)
            sum_predictions += self.alphas[idx] * np.where(prediction == 0, -1, prediction)  #np.where used to replace 0s with -1s      
        return np.where(sum_predictions >= 0, 1, 0)
    

    def score(self, X, y):
        prediction = self.predict(X)
        return (prediction == y).sum()/  X.shape[0]   


    
    def get_params(self, deep=True):
        print(f"num_features: {self.num_features}, num_samples: {self.num_samples}, num_iterations: {self.num_iterations}, max_tree_height: {self.max_tree_height}, sample_weights: {self.sample_weights}")
        return {"num_iterations": self.num_iterations, "max_tree_height": self.max_tree_height, "num_samples": self.num_samples, "num_features": self.num_features}
    
    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            print(f"Setting {parameter} to {value}")
            setattr(self, parameter, value)
        return self



class MyAdaBoostLogistic:
    def __init__(self, num_samples, num_features, num_iterations):
        self.num_samples = num_samples
        self.num_features = num_features
        self.num_iterations = num_iterations
        
        self.alphas = []
        self.models = []
        self.sample_weights = np.ones((num_samples))/num_samples

    def train(self, X_train, y_train):
        for iteration in range(self.num_iterations):
            weak_learner = LogisticRegression()
            weak_learner.fit(X_train, y_train, sample_weight=self.sample_weights)

            sample_predictions = weak_learner.predict(X_train)
            incorrect = (sample_predictions != y_train)*1 #Multiply by 1 to convert True/False to 1/0
            weighted_error = np.multiply(incorrect, self.sample_weights).sum()  / self.sample_weights.sum()
            alpha = (0.5) * math.log((1-weighted_error) / weighted_error)
            
            #Add Model and Alpha to Ensemble
            self.alphas.append(alpha)
            self.models.append(weak_learner)
            
            #Update Weights
            self.sample_weights = np.multiply(self.sample_weights, np.exp(2*alpha*incorrect))
            self.sample_weights = self.sample_weights / self.sample_weights.sum()
    
    def predict(self, X):
        sum_predictions = np.zeros(X.shape[0])
        for idx, model in enumerate(self.models):
            prediction = model.predict(X)
            sum_predictions += self.alphas[idx] * np.where(prediction == 0, -1, prediction)        
        return np.where(sum_predictions >= 0, 1, 0)
    

    def score(self, X, y):
        prediction = self.predict(X)
        return (prediction == y).sum()/ X.shape[0]  

In [None]:
adaboost = MyAdaBoostTree(num_samples, num_features, 100)
adaboost.train(X_train, y_train)
print(adaboost.score(X_val, y_val))

adaboost = MyAdaBoostLogistic(num_samples, num_features, 100)
adaboost.train(X_train, y_train)
print(adaboost.score(X_val, y_val))

### **Random Forest**

In [50]:
from sklearn.tree import DecisionTreeClassifier

class MyRandomForest():
    def __init__(self, num_trees, max_height, max_features, num_samples):
        self.num_trees = num_trees
        self.max_height = max_height
        self.max_features = max_features
        self.trees = []
        self.num_samples = num_samples

    def train(self, X_train, y_train):        
        for i in range(self.num_trees):
            samples = np.random.choice(self.num_samples, size=self.num_samples, replace=True)
            sampled_X = X_train[samples]
            sampled_Y = y_train[samples]
            tree = DecisionTreeClassifier(max_depth=self.max_height, max_features=self.max_features)
            tree.fit(sampled_X, sampled_Y)
            self.trees.append(tree)


    def predict(self, X):
        sum_predictions = np.zeros(X.shape[0])
        for tree in self.trees:
            prediction = tree.predict(X)
            sum_predictions +=  np.where(prediction == 0, -1, prediction)  #np.where used to replace 0s with -1s         
        return np.where(sum_predictions > 0, 1, 0)
    

    def score(self, X, y):
        prediction = self.predict(X)
        return (prediction == y).sum()/  X.shape[0]   
    
    def fit(self, X, y):
        self.train(X, y)
        return self
    
    def get_params(self, deep=True):
        return {"num_trees": self.num_trees, "max_height": self.max_height, "max_features": self.max_features, "num_samples": self.num_samples}
    
    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self
    








In [26]:
forest = MyRandomForest(num_trees=50, max_height=12, max_features=3)

forest.train(X_train, y_train)

print(forest.score(X_train, y_train))
print(forest.score(X_val, y_val))

0.7778562682490345
0.7522918498053497


### **Bagging**

In [5]:
## Bagging using sklearn
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bagging = BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=12, max_features=3), n_estimators=50)
bagging.fit(X_train, y_train)
print(bagging.score(X_train, y_train))
print(bagging.score(X_val, y_val))




0.7913566293052023
0.7524174306166018


In [29]:
## Implementing Bagging from scratch
from collections import Counter

class MyBagging():
    def __init__(self, num_trees, max_height, max_features):
        self.num_trees = num_trees
        self.max_height = max_height
        self.max_features = max_features
        self.trees = []

    def train(self, X_train, y_train):
        num_samples  = X_train.shape[0]        
        for i in range(self.num_trees):
            samples = np.random.choice(num_samples, size=num_samples, replace=True)
            sampled_X = X_train[samples]
            sampled_Y = y_train[samples]
            tree = DecisionTreeClassifier(max_depth=self.max_height, max_features=self.max_features)
            tree.fit(sampled_X, sampled_Y)
            self.trees.append(tree)

    # calculate the prediction of each tree and return the maximum voted prediction
    def predict(self, X):
        predictions = np.array([tree.predict(X) for tree in self.trees])
        mode_predictions =np.apply_along_axis(lambda x: Counter(x).most_common(1)[0][0], axis=0, arr=predictions)
        return mode_predictions
    

    def score(self, X, y):
        prediction = self.predict(X)
        return (prediction == y).sum()/  X.shape[0]
    
    def fit(self, X, y):
        self.train(X, y)
        return self
    
    def get_params(self, deep=True):
        return {"num_trees": self.num_trees, "max_height": self.max_height, "max_features": self.max_features, "num_features": self.max_features, "num_samples": self.num_samples}
    
    def set_params(self, **parameters): 
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

In [30]:
## Testing Bagging

bagging = MyBagging(num_trees=10, max_height=12, max_features=3)
bagging.train(X_train, y_train)

print(bagging.score(X_train, y_train))
print(bagging.score(X_val, y_val))


0.7744419327493642
0.7505965088534472


In [29]:
## Bagging using KNN from scratch
from sklearn.neighbors import KNeighborsClassifier

class MyBaggingKNN():
    def __init__(self, num_models, k):
        self.num_models = num_models
        self.k = k
        self.models = []

    def train(self, X_train, y_train):
        num_samples  = X_train.shape[0]        
        for i in range(self.num_models):
            samples = np.random.choice(num_samples, size=num_samples, replace=True)
            sampled_X = X_train[samples]
            sampled_Y = y_train[samples]
            model = KNeighborsClassifier(n_neighbors=self.k)
            model.fit(sampled_X, sampled_Y)
            self.models.append(model)


    def predict(self, X):
        predictions = np.array([model.predict(X) for model in self.models])
        mode_predictions =np.apply_along_axis(lambda x: Counter(x).most_common(1)[0][0], axis=0, arr=predictions)
        return mode_predictions
    

    def score(self, X, y):
        prediction = self.predict(X)
        return (prediction == y).sum()/  X.shape[0]


In [31]:
## Testing Bagging KNN

bagging = MyBaggingKNN(num_models=5, k=3)
bagging.train(X_train, y_train)

print(bagging.score(X_train, y_train))
print(bagging.score(X_val, y_val))

0.834000502339016
0.6904433002637197


## **Hyperparameter tuning**

### **Grid Search**

In [53]:
# Hyperparameter Tuning using Grid Search
from sklearn.model_selection import GridSearchCV

# Grid Search for Boosting
adaboost = MyAdaBoostTree(num_samples, num_features, 100)
parameters = {'num_iterations': [10, 20, 50, 100, 200, 300, 400, 500]}
clf = GridSearchCV(adaboost, parameters, cv=5)
clf.fit(X_train, y_train)
print(clf.best_params_)
print(clf.best_score_)
print(clf.score(X_val, y_val))

# Grid Search for Random Forest
#forest = MyRandomForest(num_trees=50, max_height=12, max_features=3, num_samples=num_samples)
#parameters = {'num_trees': [10, 20, 50, 100, 200, 300, 400, 500]}
#clf = GridSearchCV(forest, parameters, cv=5)
#clf.fit(X_train, y_train)
#print(clf.best_params_)
#print(clf.best_score_)
#print(clf.score(X_val, y_val))

# Grid Search for Bagging
#bagging = MyBagging(num_trees=10, max_height=12, max_features=3)
#parameters = {'num_trees': [10, 20, 50, 100, 200, 300, 400, 500]}
#clf = GridSearchCV(bagging, parameters, cv=5)
#clf.fit(X_train, y_train)
#print(clf.best_params_)
#print(clf.best_score_)

num_features: 10, num_samples: 127404, num_iterations: 100, max_tree_height: 1, sample_weights: [7.84904713e-06 7.84904713e-06 7.84904713e-06 ... 7.84904713e-06
 7.84904713e-06 7.84904713e-06]
num_features: 10, num_samples: 127404, num_iterations: 100, max_tree_height: 1, sample_weights: [7.84904713e-06 7.84904713e-06 7.84904713e-06 ... 7.84904713e-06
 7.84904713e-06 7.84904713e-06]
num_features: 10, num_samples: 127404, num_iterations: 100, max_tree_height: 1, sample_weights: [7.84904713e-06 7.84904713e-06 7.84904713e-06 ... 7.84904713e-06
 7.84904713e-06 7.84904713e-06]
num_features: 10, num_samples: 127404, num_iterations: 100, max_tree_height: 1, sample_weights: [7.84904713e-06 7.84904713e-06 7.84904713e-06 ... 7.84904713e-06
 7.84904713e-06 7.84904713e-06]
Setting num_iterations to 10
num_features: 10, num_samples: 127404, num_iterations: 100, max_tree_height: 1, sample_weights: [7.84904713e-06 7.84904713e-06 7.84904713e-06 ... 7.84904713e-06
 7.84904713e-06 7.84904713e-06]
num_fe

ValueError: 
All the 40 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "/home/maram/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/tmp/ipykernel_14048/969004525.py", line 45, in fit
    self.train(X, y, sample_weight=self.sample_weights)
TypeError: MyAdaBoostTree.train() got an unexpected keyword argument 'sample_weight'
