In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import math

In [8]:
#Reading Data
df = pd.read_csv("dataset.csv")
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
test_df, val_df = train_test_split(test_df, test_size=0.5, random_state=42)
X_train = train_df.drop(columns=['Unnamed: 0', 'smoking']).to_numpy()
y_train = train_df[['smoking']].to_numpy().reshape(-1) #Reshape from (n,1) to (n)

X_val = val_df.drop(columns=['Unnamed: 0', 'smoking']).to_numpy()
y_val = val_df[['smoking']].to_numpy().reshape(-1)

num_samples, num_features = X_train.shape
print("Number of samples: ", num_samples)
print("Number of features: ", num_features)

Number of samples:  127404
Number of features:  10


### **Boosting**

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator
from sklearn.tree import DecisionTreeClassifier

class MyAdaBoostTree(BaseEstimator):
    def __init__(self, num_iterations=10, max_tree_height = 1):
        self.num_iterations = num_iterations
        self.max_tree_height = max_tree_height

    def train(self, X, y):
        return self.fit(X,y)

    def fit(self, X, y):
        num_samples = X.shape[0]
        self.alphas__ = []
        self.models__ = []
        sample_weights = np.ones((num_samples))/num_samples
        for iteration in range(self.num_iterations):
            weak_learner = DecisionTreeClassifier(criterion='gini', max_depth=self.max_tree_height)
            weak_learner.fit(X, y, sample_weight=sample_weights)

            sample_predictions = weak_learner.predict(X)
            incorrect = (sample_predictions != y)*1 #Multiply by 1 to convert True/False to 1/0
            weighted_error = np.multiply(incorrect, sample_weights).sum()  / sample_weights.sum()
            alpha = (0.5) * math.log((1-weighted_error) / weighted_error)
            
            #Add Model and Alpha to Ensemble
            self.alphas__.append(alpha)
            self.models__.append(weak_learner)
            
            #Update Weights
            sample_weights = np.multiply(sample_weights, np.exp(2*alpha*incorrect))
            sample_weights = sample_weights / sample_weights.sum()
    
    def predict(self, X):
        sum_predictions = np.zeros(X.shape[0])
        for idx, model in enumerate(self.models__):
            prediction = model.predict(X)
            sum_predictions += self.alphas__[idx] * np.where(prediction == 0, -1, prediction)  #np.where used to replace 0s with -1s      
        return np.where(sum_predictions >= 0, 1, 0)
    

    def score(self, X, y):
        prediction = self.predict(X)
        return (prediction == y).sum()/  X.shape[0] 

    def best_score_(self):
        return self


class MyAdaBoostLogistic:
    def __init__(self, num_iterations=10):
        self.num_iterations = num_iterations

    def train(self, X, y):
        num_samples = X.shape[0]
        self.alphas_ = []
        self.models_ = []
        sample_weights = np.ones((num_samples))/num_samples
        for iteration in range(self.num_iterations):
            weak_learner = LogisticRegression()
            weak_learner.fit(X_train, y_train, sample_weight=sample_weights)

            sample_predictions = weak_learner.predict(X_train)
            incorrect = (sample_predictions != y_train)*1 #Multiply by 1 to convert True/False to 1/0
            weighted_error = np.multiply(incorrect, sample_weights).sum()  / sample_weights.sum()
            alpha = (0.5) * math.log((1-weighted_error) / weighted_error)
            
            #Add Model and Alpha to Ensemble
            self.alphas_.append(alpha)
            self.models_.append(weak_learner)
            
            #Update Weights
            sample_weights = np.multiply(sample_weights, np.exp(2*alpha*incorrect))
            sample_weights = sample_weights / sample_weights.sum()
    
    def predict(self, X):
        sum_predictions = np.zeros(X.shape[0])
        for idx, model in enumerate(self.models_):
            prediction = model.predict(X)
            sum_predictions += self.alphas_[idx] * np.where(prediction == 0, -1, prediction)        
        return np.where(sum_predictions >= 0, 1, 0)
    

    def score(self, X, y):
        prediction = self.predict(X)
        return (prediction == y).sum()/ X.shape[0]  

### **Random Forest**

In [91]:
from sklearn.tree import DecisionTreeClassifier

class MyRandomForest():
    def __init__(self, num_trees, max_height, max_features, num_samples):
        self.num_trees = num_trees
        self.max_height = max_height
        self.max_features = max_features
        self.trees = []
        self.num_samples = num_samples

    def train(self, X_train, y_train):        
        for i in range(self.num_trees):
            print(f"num_samples: {self.num_samples}")
            samples = np.random.choice(self.num_samples, size=self.num_samples, replace=True)
            sampled_X = X_train[samples]
            sampled_Y = y_train[samples]
            tree = DecisionTreeClassifier(max_depth=self.max_height, max_features=self.max_features)
            tree.fit(sampled_X, sampled_Y)
            self.trees.append(tree)


    def predict(self, X):
        sum_predictions = np.zeros(X.shape[0])
        for tree in self.trees:
            prediction = tree.predict(X)
            sum_predictions +=  np.where(prediction == 0, -1, prediction)  #np.where used to replace 0s with -1s         
        return np.where(sum_predictions > 0, 1, 0)
    

    def score(self, X, y):
        prediction = self.predict(X)
        return (prediction == y).sum()/  X.shape[0]   
    
    def fit(self, X, y):
        return self
    
    def get_params(self, deep=True):
        return {"num_trees": self.num_trees, "max_height": self.max_height, "max_features": self.max_features, "num_samples": self.num_samples}
    
    def set_params(self, **parameters):
        return self
    








In [26]:
forest = MyRandomForest(num_trees=50, max_height=12, max_features=3)

forest.train(X_train, y_train)

print(forest.score(X_train, y_train))
print(forest.score(X_val, y_val))

0.7778562682490345
0.7522918498053497


### **Bagging**

In [5]:
## Bagging using sklearn
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bagging = BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=12, max_features=3), n_estimators=50)
bagging.fit(X_train, y_train)
print(bagging.score(X_train, y_train))
print(bagging.score(X_val, y_val))




0.7913566293052023
0.7524174306166018


In [15]:
## Implementing Bagging from scratch
from collections import Counter

class MyBagging():
    def __init__(self, num_trees, max_height, max_features):
        self.num_trees = num_trees
        self.max_height = max_height
        self.max_features = max_features
        self.trees = []

    def train(self, X_train, y_train):
        num_samples  = X_train.shape[0]        
        for i in range(self.num_trees):
            samples = np.random.choice(num_samples, size=num_samples, replace=True)
            sampled_X = X_train[samples]
            sampled_Y = y_train[samples]
            tree = DecisionTreeClassifier(max_depth=self.max_height, max_features=self.max_features)
            tree.fit(sampled_X, sampled_Y)
            self.trees.append(tree)

    # calculate the prediction of each tree and return the maximum voted prediction
    def predict(self, X):
        predictions = np.array([tree.predict(X) for tree in self.trees])
        mode_predictions =np.apply_along_axis(lambda x: Counter(x).most_common(1)[0][0], axis=0, arr=predictions)
        return mode_predictions
    

    def score(self, X, y):
        prediction = self.predict(X)
        return (prediction == y).sum()/  X.shape[0]
    
    def fit(self, X, y,cv):
        return self
    
    def get_params(self, deep=True):
        return {"num_trees": self.num_trees, "max_height": self.max_height, "max_features": self.max_features, "num_features": self.max_features}
    
    def set_params(self, **parameters): 
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

In [30]:
## Testing Bagging

bagging = MyBagging(num_trees=10, max_height=12, max_features=3)
bagging.train(X_train, y_train)

print(bagging.score(X_train, y_train))
print(bagging.score(X_val, y_val))


0.7744419327493642
0.7505965088534472


In [29]:
## Bagging using KNN from scratch
from sklearn.neighbors import KNeighborsClassifier

class MyBaggingKNN():
    def __init__(self, num_models, k):
        self.num_models = num_models
        self.k = k
        self.models = []

    def train(self, X_train, y_train):
        num_samples  = X_train.shape[0]        
        for i in range(self.num_models):
            samples = np.random.choice(num_samples, size=num_samples, replace=True)
            sampled_X = X_train[samples]
            sampled_Y = y_train[samples]
            model = KNeighborsClassifier(n_neighbors=self.k)
            model.fit(sampled_X, sampled_Y)
            self.models.append(model)


    def predict(self, X):
        predictions = np.array([model.predict(X) for model in self.models])
        mode_predictions =np.apply_along_axis(lambda x: Counter(x).most_common(1)[0][0], axis=0, arr=predictions)
        return mode_predictions
    

    def score(self, X, y):
        prediction = self.predict(X)
        return (prediction == y).sum()/  X.shape[0]


In [31]:
## Testing Bagging KNN

bagging = MyBaggingKNN(num_models=5, k=3)
bagging.train(X_train, y_train)

print(bagging.score(X_train, y_train))
print(bagging.score(X_val, y_val))

0.834000502339016
0.6904433002637197


## **Hyperparameter tuning**

### **Grid Search**

In [None]:
# Implementing Grid Search from scratch using threads
from sklearn.model_selection import ParameterGrid
from threading import Thread


class MyGridSearch():
    def __init__(self, model, params, cv):
        self.model = model
        self.params = params
        self.cv = cv
        self.best_model = None
        self.best_score = 0
        self.best_params = None
        self.predictions = []

    def worker(self,model, X_train, y_train, X_val, y_val):
        print(f"size of X_train: {X_train.shape}")
        print(f"size of X_val: {X_val.shape}")
        model.train(X_train, y_train)
        self.predictions.append((model.score(X_val, y_val), model.get_params()))

    def grid_search(self):
        parameters_grid = ParameterGrid(self.params)
        number_of_models = len(parameters_grid)
        print(f"Number of models: {number_of_models}")
        for i in range(number_of_models):
            # set the parameters for the model
            self.model.set_params(**parameters_grid[i])
            # Create a thread for each model
            training_ratio = (self.cv - 1)  / self.cv
            X_train_new, X_val_new, y_train_new, y_val_new = train_test_split(X_train, y_train, train_size=training_ratio, random_state=42)
            t = Thread(target=self.worker, args=(self.model, X_train_new, y_train_new, X_val_new, y_val_new))
            t.start()
            t.join()
        return max(self.predictions)
            


In [17]:
adaboost = MyAdaBoostTree(num_iterations= 100)
clf = MyGridSearch(adaboost, {"num_iterations": [10, 20, 30, 40, 50]}, 5)
print(clf.grid_search())

#randomForest = MyRandomForest(num_trees=50, max_height=12, max_features=3, num_samples=num_samples)
#clf = MyGridSearch(randomForest, {"num_trees": [10, 20, 30, 40, 50]}, 5)
#print(clf.grid_search())

#bagging = MyBagging(num_trees=10, max_height=12, max_features=3)
#clf = MyGridSearch(bagging, {"num_trees": [10, 20, 30, 40, 50]}, 5)
#print(clf.grid_search())


Number of models: 5
size of X_train: (101923, 10)
size of X_val: (25481, 10)


size of X_train: (101923, 10)
size of X_val: (25481, 10)
size of X_train: (101923, 10)
size of X_val: (25481, 10)
size of X_train: (101923, 10)
size of X_val: (25481, 10)
size of X_train: (101923, 10)
size of X_val: (25481, 10)
(0.7399238648404693, {'max_tree_height': 1, 'num_iterations': 50})


In [137]:
# Hyperparameter Tuning using Grid Search
from sklearn.model_selection import GridSearchCV

# Grid Search for Boosting
adaboost = MyAdaBoostTree(num_samples, num_features)
parameters = {'num_iterations': [10, 20, 50, 100, 200, 300, 400, 500]}
cv=5
clf = MyGridSearch(adaboost, parameters, cv=cv)
print(clf.best_params_)
print(clf.best_score_)
print(clf.score(X_val, y_val))

# Grid Search for Random Forest
forest = MyRandomForest(num_trees=50, max_height=12, max_features=3, num_samples=num_samples)
parameters = {'num_trees': [10, 20, 50, 100, 200, 300, 400, 500]}
clf = GridSearchCV(forest, parameters, cv=5)
clf.fit(X_train, y_train)
print(clf.best_params_)
print(clf.best_score_)
print(clf.score(X_val, y_val))

# Grid Search for Bagging
#bagging = MyBagging(num_trees=10, max_height=12, max_features=3)
#parameters = {'num_trees': [10, 20, 50, 100, 200, 300, 400, 500]}
#clf = GridSearchCV(bagging, parameters, cv=5)
#clf.fit(X_train, y_train)
#print(clf.best_params_)
#print(clf.best_score_)
parameters = {'num_iterations': [10, 20, 30]}
clf = GridSearchCV(adaboost, parameters, cv=2)
clf.fit(X_train, y_train)
print(clf.best_params_)
print(clf.best_score_)
print(clf.score(X_val, y_val))



<bound method MyGridSearch.best_params_ of <__main__.MyGridSearch object at 0x7efe83114280>>
<bound method MyGridSearch.best_score_ of <__main__.MyGridSearch object at 0x7efe83114280>>


AttributeError: 'NoneType' object has no attribute 'score'