In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import math

In [5]:
# Load the raw table, then split it 80/10/10 into train / test / validation.
df = pd.read_csv("dataset.csv")
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
# The 20% hold-out is halved: one half stays as test, the other becomes validation.
test_df, val_df = train_test_split(test_df, test_size=0.5, random_state=42)

# Features are every column except the saved index column and the target;
# the target is flattened from shape (n, 1) to (n,).
(X_train, y_train), (X_val, y_val) = (
    (
        split.drop(columns=['Unnamed: 0', 'smoking']).to_numpy(),
        split[['smoking']].to_numpy().reshape(-1),
    )
    for split in (train_df, val_df)
)

num_samples, num_features = X_train.shape
print("Number of samples: ", num_samples)
print("Number of features: ", num_features)

Number of samples:  127404
Number of features:  10


### **Boosting**

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator
from sklearn.tree import DecisionTreeClassifier

class MyAdaBoostTree(BaseEstimator):
    """AdaBoost ensemble whose weak learners are depth-limited decision trees.

    `predict` maps a weak learner's label 0 to -1 before voting and returns
    0/1, so the targets are expected to be encoded as 0 and 1.

    Parameters
    ----------
    num_iterations : int
        Number of boosting rounds (one tree is fitted per round).
    max_tree_height : int
        `max_depth` passed to every DecisionTreeClassifier.
    """

    def __init__(self, num_iterations=10, max_tree_height = 1):
        self.num_iterations = num_iterations
        self.max_tree_height = max_tree_height

    def train(self, X, y):
        """Alias of `fit`; returns the fitted estimator."""
        return self.fit(X,y)

    def fit(self, X, y):
        """Fit the boosted ensemble on features `X` and labels `y`.

        Populates `self.models__` (fitted trees) and `self.alphas__` (their
        vote weights) and returns `self`.
        """
        num_samples = X.shape[0]
        self.alphas__ = []
        self.models__ = []
        sample_weights = np.ones((num_samples))/num_samples
        # The weighted error is kept strictly inside (0, 1): a perfect learner
        # (error 0) would otherwise divide by zero and a fully wrong one
        # (error 1) would take log(0).
        eps = np.finfo(float).eps
        for _ in range(self.num_iterations):
            weak_learner = DecisionTreeClassifier(criterion='gini', max_depth=self.max_tree_height)
            weak_learner.fit(X, y, sample_weight=sample_weights)

            sample_predictions = weak_learner.predict(X)
            incorrect = (sample_predictions != y)*1 #Multiply by 1 to convert True/False to 1/0
            weighted_error = np.multiply(incorrect, sample_weights).sum()  / sample_weights.sum()
            weighted_error = min(max(float(weighted_error), eps), 1 - eps)
            alpha = (0.5) * math.log((1-weighted_error) / weighted_error)

            #Add Model and Alpha to Ensemble
            self.alphas__.append(alpha)
            self.models__.append(weak_learner)

            #Update Weights: only misclassified samples are scaled (by exp(2*alpha)),
            #then the weights are renormalised to sum to 1.
            sample_weights = np.multiply(sample_weights, np.exp(2*alpha*incorrect))
            sample_weights = sample_weights / sample_weights.sum()
        return self

    def predict(self, X):
        """Return 0/1 predictions from the alpha-weighted vote of all weak learners."""
        sum_predictions = np.zeros(X.shape[0])
        for idx, model in enumerate(self.models__):
            prediction = model.predict(X)
            sum_predictions += self.alphas__[idx] * np.where(prediction == 0, -1, prediction)  #np.where used to replace 0s with -1s
        # A vote total of exactly zero is resolved in favour of class 1.
        return np.where(sum_predictions >= 0, 1, 0)


    def score(self, X, y):
        """Return the accuracy (fraction of correct predictions) on `X`, `y`."""
        prediction = self.predict(X)
        return (prediction == y).sum()/  X.shape[0]



class MyAdaBoostLogistic:
    """AdaBoost ensemble whose weak learners are LogisticRegression models.

    `predict` maps a weak learner's label 0 to -1 before voting and returns
    0/1, so the targets are expected to be encoded as 0 and 1.

    Parameters
    ----------
    num_iterations : int
        Number of boosting rounds (one model is fitted per round).
    """

    def __init__(self, num_iterations=10):
        self.num_iterations = num_iterations

    def train(self, X, y):
        """Fit the boosted ensemble on features `X` and labels `y`.

        Populates `self.models_` (fitted learners) and `self.alphas_` (their
        vote weights) and returns `self`.
        """
        num_samples = X.shape[0]
        self.alphas_ = []
        self.models_ = []
        sample_weights = np.ones((num_samples))/num_samples
        # The weighted error is kept strictly inside (0, 1): a perfect learner
        # (error 0) would otherwise divide by zero and a fully wrong one
        # (error 1) would take log(0).
        eps = np.finfo(float).eps
        for _ in range(self.num_iterations):
            weak_learner = LogisticRegression()
            # Fit and evaluate on the data passed in, not on notebook globals.
            weak_learner.fit(X, y, sample_weight=sample_weights)

            sample_predictions = weak_learner.predict(X)
            incorrect = (sample_predictions != y)*1 #Multiply by 1 to convert True/False to 1/0
            weighted_error = np.multiply(incorrect, sample_weights).sum()  / sample_weights.sum()
            weighted_error = min(max(float(weighted_error), eps), 1 - eps)
            alpha = (0.5) * math.log((1-weighted_error) / weighted_error)

            #Add Model and Alpha to Ensemble
            self.alphas_.append(alpha)
            self.models_.append(weak_learner)

            #Update Weights: only misclassified samples are scaled (by exp(2*alpha)),
            #then the weights are renormalised to sum to 1.
            sample_weights = np.multiply(sample_weights, np.exp(2*alpha*incorrect))
            sample_weights = sample_weights / sample_weights.sum()
        return self

    def fit(self, X, y):
        """Alias of `train`, matching the `fit` name used by the other ensembles."""
        return self.train(X, y)

    def predict(self, X):
        """Return 0/1 predictions from the alpha-weighted vote of all weak learners."""
        sum_predictions = np.zeros(X.shape[0])
        for idx, model in enumerate(self.models_):
            prediction = model.predict(X)
            sum_predictions += self.alphas_[idx] * np.where(prediction == 0, -1, prediction)  #np.where used to replace 0s with -1s
        # A vote total of exactly zero is resolved in favour of class 1.
        return np.where(sum_predictions >= 0, 1, 0)


    def score(self, X, y):
        """Return the accuracy (fraction of correct predictions) on `X`, `y`."""
        prediction = self.predict(X)
        return (prediction == y).sum()/ X.shape[0]

### **Random Forest**

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import BaseEstimator

class MyRandomForest(BaseEstimator):
    """Random forest built from bootstrap-sampled decision trees.

    `predict` maps a tree's label 0 to -1 before voting and returns 0/1, so
    the targets are expected to be encoded as 0 and 1.

    Parameters
    ----------
    num_trees : int
        Number of trees in the forest.
    max_height : int
        `max_depth` passed to every DecisionTreeClassifier.
    max_features : int
        `max_features` passed to every DecisionTreeClassifier.
    random_state : int or None
        Seed for the bootstrap draws and the trees. With the default `None`
        the bootstrap indices come from NumPy's global generator and the
        trees are left unseeded.
    """

    def __init__(self, num_trees=10, max_height=5, max_features=5, random_state=None):
        self.num_trees = num_trees
        self.max_height = max_height
        self.max_features = max_features
        self.random_state = random_state

    def fit(self, X, y):
        """Fit `num_trees` trees, each on its own bootstrap sample; returns `self`."""
        # Unseeded: keep using the module-level generator. Seeded: use a
        # private generator so repeated fits give the same forest.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        self.trees_ = []
        num_samples = X.shape[0]
        for _ in range(self.num_trees):
            # Bootstrap: draw num_samples row indices with replacement.
            samples = rng.choice(num_samples, size=num_samples, replace=True)
            sampled_X = X[samples]
            sampled_Y = y[samples]
            tree_seed = None if self.random_state is None else int(rng.randint(np.iinfo(np.int32).max))
            tree = DecisionTreeClassifier(
                max_depth=self.max_height,
                max_features=self.max_features,
                random_state=tree_seed,
            )
            tree.fit(sampled_X, sampled_Y)
            self.trees_.append(tree)
        return self


    def predict(self, X):
        """Return 0/1 predictions by unweighted majority vote over the trees."""
        sum_predictions = np.zeros(X.shape[0])
        for tree in self.trees_:
            prediction = tree.predict(X)
            sum_predictions +=  np.where(prediction == 0, -1, prediction)  #np.where used to replace 0s with -1s
        # A tied vote (total of exactly zero) is resolved in favour of class 0.
        return np.where(sum_predictions > 0, 1, 0)


    def score(self, X, y):
        """Return the accuracy (fraction of correct predictions) on `X`, `y`."""
        prediction = self.predict(X)
        return (prediction == y).sum()/  X.shape[0]

    def train(self, X, y):
        """Alias of `fit`; returns the fitted estimator."""
        return self.fit(X, y)

In [12]:
# Fit a 50-tree forest, then report accuracy on the training split followed by
# the validation split (one value per line).
forest = MyRandomForest(max_features=3, max_height=12, num_trees=50)
forest.train(X_train, y_train)

print(forest.score(X_train, y_train), forest.score(X_val, y_val), sep="\n")

0.7775737025525101
0.7529197538616099


### **Bagging**

In [5]:
## Bagging using sklearn
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# The base tree is passed positionally on purpose: newer scikit-learn releases
# renamed the `base_estimator` keyword to `estimator` (and later removed the
# old name), while the first positional argument is the base estimator in both
# old and new versions.
bagging = BaggingClassifier(DecisionTreeClassifier(max_depth=12, max_features=3), n_estimators=50)
bagging.fit(X_train, y_train)
# Accuracy on the training split, then on the validation split.
print(bagging.score(X_train, y_train))
print(bagging.score(X_val, y_val))




0.7913566293052023
0.7524174306166018


In [29]:
## Implementing Bagging from scratch
from collections import Counter

class MyBagging():
    """Bagging ensemble of decision trees combined by majority vote.

    Parameters
    ----------
    num_trees : int
        Number of trees to fit.
    max_height : int
        `max_depth` passed to every DecisionTreeClassifier.
    max_features : int
        `max_features` passed to every DecisionTreeClassifier.
    """

    def __init__(self, num_trees, max_height, max_features):
        self.num_trees = num_trees
        self.max_height = max_height
        self.max_features = max_features
        self.trees = []

    def train(self, X_train, y_train):
        """Fit `num_trees` trees, each on its own bootstrap sample; returns `self`."""
        # Start from an empty ensemble so that re-training replaces the
        # previous trees instead of appending to them.
        self.trees = []
        num_samples  = X_train.shape[0]
        for _ in range(self.num_trees):
            # Bootstrap: draw num_samples row indices with replacement.
            samples = np.random.choice(num_samples, size=num_samples, replace=True)
            sampled_X = X_train[samples]
            sampled_Y = y_train[samples]
            tree = DecisionTreeClassifier(max_depth=self.max_height, max_features=self.max_features)
            tree.fit(sampled_X, sampled_Y)
            self.trees.append(tree)
        return self

    # calculate the prediction of each tree and return the maximum voted prediction
    def predict(self, X):
        """Return, for every row of `X`, the label predicted by the most trees."""
        # Shape (num_trees, num_rows); the vote is taken down each column.
        predictions = np.array([tree.predict(X) for tree in self.trees])
        mode_predictions =np.apply_along_axis(lambda x: Counter(x).most_common(1)[0][0], axis=0, arr=predictions)
        return mode_predictions


    def score(self, X, y):
        """Return the accuracy (fraction of correct predictions) on `X`, `y`."""
        prediction = self.predict(X)
        return (prediction == y).sum()/  X.shape[0]

    def fit(self, X, y):
        """Alias of `train`; returns the fitted estimator."""
        self.train(X, y)
        return self

    def get_params(self, deep=True):
        """Return the constructor arguments as a dict.

        Only the names accepted by `__init__` are reported, so the result can
        be passed straight back to the constructor.
        """
        return {"num_trees": self.num_trees, "max_height": self.max_height, "max_features": self.max_features}

    def set_params(self, **parameters):
        """Set each given parameter as an attribute and return `self`."""
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

In [30]:
## Testing Bagging

# Fit a 10-tree bagging ensemble, then report accuracy on the training split
# followed by the validation split (one value per line).
bagging = MyBagging(max_features=3, max_height=12, num_trees=10)
bagging.train(X_train, y_train)

print(bagging.score(X_train, y_train), bagging.score(X_val, y_val), sep="\n")


0.7744419327493642
0.7505965088534472


In [29]:
## Bagging using KNN from scratch
from sklearn.neighbors import KNeighborsClassifier

class MyBaggingKNN():
    """Bagging ensemble of k-nearest-neighbour classifiers combined by majority vote.

    Parameters
    ----------
    num_models : int
        Number of KNN models to fit.
    k : int
        `n_neighbors` passed to every KNeighborsClassifier.
    """

    def __init__(self, num_models, k):
        self.num_models = num_models
        self.k = k
        self.models = []

    def train(self, X_train, y_train):
        """Fit `num_models` KNN models, each on its own bootstrap sample; returns `self`."""
        # Start from an empty ensemble so that re-training replaces the
        # previous models instead of appending to them.
        self.models = []
        num_samples  = X_train.shape[0]
        for _ in range(self.num_models):
            # Bootstrap: draw num_samples row indices with replacement.
            samples = np.random.choice(num_samples, size=num_samples, replace=True)
            sampled_X = X_train[samples]
            sampled_Y = y_train[samples]
            model = KNeighborsClassifier(n_neighbors=self.k)
            model.fit(sampled_X, sampled_Y)
            self.models.append(model)
        return self

    def fit(self, X, y):
        """Alias of `train`, matching the `fit` name used by the other ensembles."""
        return self.train(X, y)


    def predict(self, X):
        """Return, for every row of `X`, the label predicted by the most models."""
        # Shape (num_models, num_rows); the vote is taken down each column.
        predictions = np.array([model.predict(X) for model in self.models])
        mode_predictions =np.apply_along_axis(lambda x: Counter(x).most_common(1)[0][0], axis=0, arr=predictions)
        return mode_predictions


    def score(self, X, y):
        """Return the accuracy (fraction of correct predictions) on `X`, `y`."""
        prediction = self.predict(X)
        return (prediction == y).sum()/  X.shape[0]


In [31]:
## Testing Bagging KNN

# Fit five bootstrap-sampled 3-NN models, then report accuracy on the training
# split followed by the validation split (one value per line).
bagging = MyBaggingKNN(k=3, num_models=5)
bagging.train(X_train, y_train)

print(bagging.score(X_train, y_train), bagging.score(X_val, y_val), sep="\n")

0.834000502339016
0.6904433002637197


## **Hyperparameter tuning**

### **Grid Search**

In [None]:
# Hyperparameter Tuning using Grid Search
from sklearn.model_selection import GridSearchCV

# Grid search over the number of boosting rounds (2-fold cross-validation).
adaboost = MyAdaBoostTree()
parameters = dict(num_iterations=[10, 20, 30])
clf = GridSearchCV(estimator=adaboost, param_grid=parameters, cv=2)
clf.fit(X_train, y_train)
# Best setting, its cross-validated score, then the validation-split score.
print(clf.best_params_, clf.best_score_, clf.score(X_val, y_val), sep="\n")

In [None]:
# Hyperparameter Tuning using Grid Search
from sklearn.model_selection import GridSearchCV

# Grid search over forest size, tree depth and features per split
# (2-fold cross-validation, verbose progress output).
random_forest = MyRandomForest()
parameters = dict(
    num_trees=[10, 20],
    max_height=[3, 5, 10],
    max_features=[2, 3, 5],
)
clf = GridSearchCV(estimator=random_forest, param_grid=parameters, cv=2, verbose=2)
clf.fit(X_train, y_train)
# Best setting, its cross-validated score, then the validation-split score.
print(clf.best_params_, clf.best_score_, clf.score(X_val, y_val), sep="\n")
