In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import math

In [31]:
#Reading Data
df = pd.read_csv("dataset.csv")
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
test_df, val_df = train_test_split(test_df, test_size=0.5, random_state=42)
X_train = train_df.drop(columns=['Unnamed: 0', 'smoking']).to_numpy()
y_train = train_df[['smoking']].to_numpy().reshape(-1) #Reshape from (n,1) to (n)

X_val = val_df.drop(columns=['Unnamed: 0', 'smoking']).to_numpy()
y_val = val_df[['smoking']].to_numpy().reshape(-1)

num_samples, num_features = X_train.shape
print("Number of samples: ", num_samples)
print("Number of features: ", num_features)

Number of samples:  127404
Number of features:  10


### **Boosting**

In [32]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator
from sklearn.tree import DecisionTreeClassifier

class MyAdaBoostTree(BaseEstimator):
    def __init__(self, num_iterations=10, max_tree_height = 1):
        self.num_iterations = num_iterations
        self.max_tree_height = max_tree_height

    def train(self, X, y):
        return self.fit(X,y)

    def fit(self, X, y):
        num_samples = X.shape[0]
        self.alphas__ = []
        self.models__ = []
        sample_weights = np.ones((num_samples))/num_samples
        for iteration in range(self.num_iterations):
            weak_learner = DecisionTreeClassifier(criterion='gini', max_depth=self.max_tree_height)
            weak_learner.fit(X, y, sample_weight=sample_weights)

            sample_predictions = weak_learner.predict(X)
            incorrect = (sample_predictions != y)*1 #Multiply by 1 to convert True/False to 1/0
            weighted_error = np.multiply(incorrect, sample_weights).sum()  / sample_weights.sum()
            alpha = (0.5) * math.log((1-weighted_error) / weighted_error)
            
            #Add Model and Alpha to Ensemble
            self.alphas__.append(alpha)
            self.models__.append(weak_learner)
            
            #Update Weights
            sample_weights = np.multiply(sample_weights, np.exp(2*alpha*incorrect))
            sample_weights = sample_weights / sample_weights.sum()
    
    def predict(self, X):
        sum_predictions = np.zeros(X.shape[0])
        for idx, model in enumerate(self.models__):
            prediction = model.predict(X)
            sum_predictions += self.alphas__[idx] * np.where(prediction == 0, -1, prediction)  #np.where used to replace 0s with -1s      
        return np.where(sum_predictions >= 0, 1, 0)
    

    def score(self, X, y):
        prediction = self.predict(X)
        return (prediction == y).sum()/  X.shape[0] 

    def best_score_(self):
        return self


class MyAdaBoostLogistic:
    def __init__(self, num_iterations=10):
        self.num_iterations = num_iterations

    def train(self, X, y):
        num_samples = X.shape[0]
        self.alphas_ = []
        self.models_ = []
        sample_weights = np.ones((num_samples))/num_samples
        for iteration in range(self.num_iterations):
            weak_learner = LogisticRegression()
            weak_learner.fit(X_train, y_train, sample_weight=sample_weights)

            sample_predictions = weak_learner.predict(X_train)
            incorrect = (sample_predictions != y_train)*1 #Multiply by 1 to convert True/False to 1/0
            weighted_error = np.multiply(incorrect, sample_weights).sum()  / sample_weights.sum()
            alpha = (0.5) * math.log((1-weighted_error) / weighted_error)
            
            #Add Model and Alpha to Ensemble
            self.alphas_.append(alpha)
            self.models_.append(weak_learner)
            
            #Update Weights
            sample_weights = np.multiply(sample_weights, np.exp(2*alpha*incorrect))
            sample_weights = sample_weights / sample_weights.sum()
    
    def predict(self, X):
        sum_predictions = np.zeros(X.shape[0])
        for idx, model in enumerate(self.models_):
            prediction = model.predict(X)
            sum_predictions += self.alphas_[idx] * np.where(prediction == 0, -1, prediction)        
        return np.where(sum_predictions >= 0, 1, 0)
    

    def score(self, X, y):
        prediction = self.predict(X)
        return (prediction == y).sum()/ X.shape[0]  

### **Random Forest**

In [33]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import BaseEstimator
from collections import Counter

class MyRandomForest(BaseEstimator):
    def __init__(self, num_trees=10, max_height=5, max_features=5):
        self.num_trees = num_trees
        self.max_height = max_height
        self.max_features = max_features
        
    def fit(self, X, y):
        self.trees_ = [] 
        num_samples = X.shape[0]       
        for i in range(self.num_trees):
            samples = np.random.choice(num_samples, size=num_samples, replace=True)
            sampled_X = X[samples]
            sampled_Y = y[samples]
            tree = DecisionTreeClassifier(max_depth=self.max_height, max_features=self.max_features)
            tree.fit(sampled_X, sampled_Y)
            self.trees_.append(tree)


    def predict(self, X):
        predictions = np.array([tree.predict(X) for tree in self.trees_])     
        mode_predictions =np.apply_along_axis(lambda x: Counter(x).most_common(1)[0][0], axis=0, arr=predictions)
        return mode_predictions
    

    def score(self, X, y):
        prediction = self.predict(X)
        return (prediction == y).sum()/  X.shape[0]   
    
    def train(self, X, y):
        return self.fit(X, y)

In [34]:
forest = MyRandomForest(num_trees=50, max_height=12, max_features=3)

forest.train(X_train, y_train)

print(forest.score(X_train, y_train))
print(forest.score(X_val, y_val))

0.7771890992433519
0.7522918498053497


### **Bagging**

In [22]:
## Bagging using sklearn
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bagging = BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=12, max_features=3), n_estimators=50)
bagging.fit(X_train, y_train)
print(bagging.score(X_train, y_train))
print(bagging.score(X_val, y_val))




0.791317384069574
0.7533592867009921


In [23]:
## Implementing Bagging from scratch
from collections import Counter
from sklearn.base import BaseEstimator

class MyBagging(BaseEstimator):
    def __init__(self, num_trees=10, max_height=5):
        self.num_trees = num_trees
        self.max_height = max_height    

    def fit(self, X, y):
        self.trees = []
        num_samples  = X.shape[0]        
        for i in range(self.num_trees):
            samples = np.random.choice(num_samples, size=num_samples, replace=True)
            sampled_X = X[samples]
            sampled_Y = y[samples]
            tree = DecisionTreeClassifier(max_depth=self.max_height)
            tree.fit(sampled_X, sampled_Y)
            self.trees.append(tree)

    # calculate the prediction of each tree and return the maximum voted prediction
    def predict(self, X):
        predictions = np.array([tree.predict(X) for tree in self.trees])
        mode_predictions =np.apply_along_axis(lambda x: Counter(x).most_common(1)[0][0], axis=0, arr=predictions)
        return mode_predictions
    
    def score(self, X, y):
        prediction = self.predict(X)
        return (prediction == y).sum()/  X.shape[0]
    
    def train(self, X, y):
        return self.fit(X, y)
    


In [24]:
## Testing Bagging

bagging = MyBagging(num_trees=10, max_height=12, max_features=3)
bagging.train(X_train, y_train)

print(bagging.score(X_train, y_train))
print(bagging.score(X_val, y_val))


TypeError: MyBagging.__init__() got an unexpected keyword argument 'max_features'

In [25]:
## Bagging using KNN from scratch
from sklearn.neighbors import KNeighborsClassifier

class MyBaggingKNN(BaseEstimator):
    def __init__(self, num_models=10, k=3):
        self.num_models = num_models
        self.k = k

    def fit(self, X_train, y_train):
        self.models = []
        num_samples  = X_train.shape[0]        
        for i in range(self.num_models):
            samples = np.random.choice(num_samples, size=num_samples, replace=True)
            sampled_X = X_train[samples]
            sampled_Y = y_train[samples]
            model = KNeighborsClassifier(n_neighbors=self.k)
            model.fit(sampled_X, sampled_Y)
            self.models.append(model)


    def predict(self, X):
        predictions = np.array([model.predict(X) for model in self.models])
        mode_predictions =np.apply_along_axis(lambda x: Counter(x).most_common(1)[0][0], axis=0, arr=predictions)
        return mode_predictions
    

    def score(self, X, y):
        prediction = self.predict(X)
        return (prediction == y).sum()/  X.shape[0]
    

    def train(self, X, y):
        return self.fit(X,y)


In [26]:
## Testing Bagging KNN

bagging = MyBaggingKNN(num_models=5, k=3)
bagging.train(X_train, y_train)

print(bagging.score(X_train, y_train))
print(bagging.score(X_val, y_val))

0.834369407553923
0.6906316714805978


## **Hyperparameter tuning**

### **Grid Search**

In [27]:
# Implementing Grid Search from scratch using threads
from sklearn.model_selection import ParameterGrid
from threading import Thread


class MyGridSearch():
    def __init__(self, model, params, cv):
        self.model = model
        self.params = params
        self.cv = cv
        self.best_model = None
        self.best_score = 0
        self.best_params = None
        self.predictions = []

    def worker(self,model, X_train, y_train, X_val, y_val):
        print(f"size of X_train: {X_train.shape}")
        print(f"size of X_val: {X_val.shape}")
        model.train(X_train, y_train)
        self.predictions.append((model.score(X_val, y_val), model.get_params()))

    def grid_search(self):
        parameters_grid = ParameterGrid(self.params)
        number_of_models = len(parameters_grid)
        print(f"Number of models: {number_of_models}")
        for i in range(number_of_models):
            # set the parameters for the model
            self.model.set_params(**parameters_grid[i])
            # Create a thread for each model
            training_ratio = (self.cv - 1)  / self.cv
            X_train_new, X_val_new, y_train_new, y_val_new = train_test_split(X_train, y_train, train_size=training_ratio, random_state=42)
            t = Thread(target=self.worker, args=(self.model, X_train_new, y_train_new, X_val_new, y_val_new))
            t.start()
            t.join()
        return max(self.predictions)
            


In [None]:
adaboost = MyAdaBoostTree(num_iterations= 100)
clf = MyGridSearch(adaboost, {"num_iterations": [10, 20, 30, 40, 50]}, 5)
print(clf.grid_search())

#randomForest = MyRandomForest(num_trees=50, max_height=12, max_features=3, num_samples=num_samples)
#clf = MyGridSearch(randomForest, {"num_trees": [10, 20, 30, 40, 50]}, 5)
#print(clf.grid_search())

#bagging = MyBagging(num_trees=10, max_height=12, max_features=3)
#clf = MyGridSearch(bagging, {"num_trees": [10, 20, 30, 40, 50]}, 5)
#print(clf.grid_search())


Number of models: 5
size of X_train: (101923, 10)
size of X_val: (25481, 10)


size of X_train: (101923, 10)
size of X_val: (25481, 10)
size of X_train: (101923, 10)
size of X_val: (25481, 10)
size of X_train: (101923, 10)
size of X_val: (25481, 10)
size of X_train: (101923, 10)
size of X_val: (25481, 10)
(0.7399238648404693, {'max_tree_height': 1, 'num_iterations': 50})


In [None]:
# Hyperparameter Tuning using Grid Search
from sklearn.model_selection import GridSearchCV

# Grid Search for Boosting
adaboost = MyAdaBoostTree(num_samples, num_features)
parameters = {'num_iterations': [10, 20, 50, 100, 200, 300, 400, 500]}
cv=5
clf = MyGridSearch(adaboost, parameters, cv=cv)
print(clf.best_params_)
print(clf.best_score_)
print(clf.score(X_val, y_val))

# Grid Search for Random Forest
forest = MyRandomForest(num_trees=50, max_height=12, max_features=3, num_samples=num_samples)
parameters = {'num_trees': [10, 20, 50, 100, 200, 300, 400, 500]}
clf = GridSearchCV(forest, parameters, cv=5)
clf.fit(X_train, y_train)
print(clf.best_params_)
print(clf.best_score_)
print(clf.score(X_val, y_val))

# Grid Search for Bagging
#bagging = MyBagging(num_trees=10, max_height=12, max_features=3)
#parameters = {'num_trees': [10, 20, 50, 100, 200, 300, 400, 500]}
#clf = GridSearchCV(bagging, parameters, cv=5)
#clf.fit(X_train, y_train)
#print(clf.best_params_)
#print(clf.best_score_)
parameters = {'num_iterations': [10, 20, 30]}
clf = GridSearchCV(adaboost, parameters, cv=2)
clf.fit(X_train, y_train)
print(clf.best_params_)
print(clf.best_score_)
print(clf.score(X_val, y_val))

<bound method MyGridSearch.best_params_ of <__main__.MyGridSearch object at 0x7efe83114280>>
<bound method MyGridSearch.best_score_ of <__main__.MyGridSearch object at 0x7efe83114280>>


AttributeError: 'NoneType' object has no attribute 'score'

In [None]:
# Hyperparameter Tuning using Grid Search
from sklearn.model_selection import GridSearchCV

# Grid Search for Random Forest
random_forest = MyRandomForest()
parameters = {'num_trees': [10, 20], 'max_height':[3, 5, 10], 'max_features':[2, 3, 5]}
clf = GridSearchCV(random_forest, parameters, cv=2, verbose=2)
clf.fit(X_train, y_train)
print(clf.best_params_)
print(clf.best_score_)
print(clf.score(X_val, y_val))


In [None]:
# Hyperparameter Tuning using Grid Search
from sklearn.model_selection import GridSearchCV

# Grid Search for Bagging
bagging = MyBagging()
parameters = {'num_trees': [10, 20], 'max_height':[3, 5, 10]}
clf = GridSearchCV(bagging, parameters, cv=2, verbose=2)
clf.fit(X_train, y_train)
print(clf.best_params_)
print(clf.best_score_)
print(clf.score(X_val, y_val))


In [None]:
# Hyperparameter Tuning using Grid Search
from sklearn.model_selection import GridSearchCV

# Grid Search for BaggingKNN
bagging = MyBaggingKNN()
parameters = {'num_models': [5], 'k':[3, 5, 7]}
clf = GridSearchCV(bagging, parameters, cv=2, verbose=2)
clf.fit(X_train, y_train)
print(clf.best_params_)
print(clf.best_score_)
print(clf.score(X_val, y_val))

Fitting 2 folds for each of 3 candidates, totalling 6 fits
[CV] END ..................................k=3, num_models=5; total time=  36.9s
[CV] END ..................................k=3, num_models=5; total time=  41.1s
[CV] END ..................................k=5, num_models=5; total time=  46.1s
[CV] END ..................................k=5, num_models=5; total time=  50.1s
[CV] END ..................................k=7, num_models=5; total time=  41.8s
[CV] END ..................................k=7, num_models=5; total time=  44.6s
{'k': 7, 'num_models': 5}
0.7075523531443283
0.7062664824814768


### **Random Search**

In [28]:
#Random Search from scratch
from sklearn.model_selection import ParameterSampler

In [45]:
#Hyperparameter tuning using Random Search for AdaboostTree
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

hyperparameters = {'num_iterations': [10, 20, 30, 40, 50],
                   'max_tree_height': randint(1, 3)}
adaboost = MyAdaBoostTree()
rs = RandomizedSearchCV(adaboost, hyperparameters, cv=5, verbose=2, random_state=42)
rs.fit(X_train, y_train)
print(f"Mean cross-validated training accuracy score: {rs.best_score_}\n")
print(f"Best Estimator: {rs.best_estimator_}\n")
print(f"Best Hyperparameter Combination: {rs.best_params_}\n")
print(f"Score: {rs.score(X_val, y_val)}\n")


Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END ...............max_tree_height=1, num_iterations=40; total time=   2.4s
[CV] END ...............max_tree_height=1, num_iterations=40; total time=   2.2s
[CV] END ...............max_tree_height=1, num_iterations=40; total time=   2.3s
[CV] END ...............max_tree_height=1, num_iterations=40; total time=   2.2s
[CV] END ...............max_tree_height=1, num_iterations=40; total time=   2.2s
[CV] END ...............max_tree_height=1, num_iterations=30; total time=   1.8s
[CV] END ...............max_tree_height=1, num_iterations=30; total time=   1.7s
[CV] END ...............max_tree_height=1, num_iterations=30; total time=   1.5s
[CV] END ...............max_tree_height=1, num_iterations=30; total time=   1.6s
[CV] END ...............max_tree_height=1, num_iterations=30; total time=   1.6s
[CV] END ...............max_tree_height=2, num_iterations=50; total time=   4.9s
[CV] END ...............max_tree_height=2, num_i

In [None]:
#Hyperparameter tuning using Random Search for AdaboostLogistic

####hyperparameters = {'num_iterations': [10, 20, 30, 40, 50]}
###########until normalization occurs##########


In [42]:
#Hyperparameter tuning using Random Search for Random Forest
hyperparameters = {'num_trees': [10, 20, 50, 100, 200, 500],
                    'max_height':[3, 5, 10], 
                    'max_features': randint(1, 5)}
forest = MyRandomForest()
rs = RandomizedSearchCV(forest, hyperparameters, cv=2, verbose=2, random_state=42)
rs.fit(X_train, y_train)
print(f"Mean cross-validated training accuracy score: {rs.best_score_}\n")
print(f"Best Estimator: {rs.best_estimator_}\n")
print(f"Best Hyperparameter Combination: {rs.best_params_}\n")
print(f"Score: {rs.score(X_val, y_val)}\n")

Fitting 2 folds for each of 10 candidates, totalling 20 fits
[CV] END .........max_features=3, max_height=3, num_trees=50; total time=   2.6s
[CV] END .........max_features=3, max_height=3, num_trees=50; total time=   2.5s
[CV] END ........max_features=4, max_height=3, num_trees=200; total time=  10.3s
[CV] END ........max_features=4, max_height=3, num_trees=200; total time=  10.1s
[CV] END .........max_features=3, max_height=5, num_trees=50; total time=   3.1s
[CV] END .........max_features=3, max_height=5, num_trees=50; total time=   3.1s
[CV] END ........max_features=3, max_height=10, num_trees=50; total time=   4.5s
[CV] END ........max_features=3, max_height=10, num_trees=50; total time=   4.6s
[CV] END ........max_features=4, max_height=3, num_trees=100; total time=   5.2s
[CV] END ........max_features=4, max_height=3, num_trees=100; total time=   5.4s
[CV] END .......max_features=4, max_height=10, num_trees=500; total time=  52.2s
[CV] END .......max_features=4, max_height=10, n

In [46]:
#Hyperparameter tuning using Random Search for bagging
hyperparameters = {'num_trees': [10, 20, 50, 100, 200],
                    'max_height':[3, 5, 10]}
bagging = MyBagging()
rs = RandomizedSearchCV(bagging, hyperparameters, cv=2, verbose=2, random_state=42)
rs.fit(X_train, y_train)
print(f"Mean cross-validated training accuracy score: {rs.best_score_}\n")
print(f"Best Estimator: {rs.best_estimator_}\n")
print(f"Best Hyperparameter Combination: {rs.best_params_}\n")
print(f"Score: {rs.score(X_val, y_val)}\n")

Fitting 2 folds for each of 10 candidates, totalling 20 fits
[CV] END ........................max_height=5, num_trees=200; total time=  26.0s
[CV] END ........................max_height=5, num_trees=200; total time=  24.7s
[CV] END ........................max_height=10, num_trees=20; total time=   4.8s
[CV] END ........................max_height=10, num_trees=20; total time=   4.7s
[CV] END .........................max_height=3, num_trees=10; total time=   1.2s
[CV] END .........................max_height=3, num_trees=10; total time=   1.2s
[CV] END .......................max_height=10, num_trees=100; total time=  22.1s
[CV] END .......................max_height=10, num_trees=100; total time=  21.5s
[CV] END .........................max_height=5, num_trees=10; total time=   1.5s
[CV] END .........................max_height=5, num_trees=10; total time=   1.6s
[CV] END ........................max_height=5, num_trees=100; total time=  12.2s
[CV] END ........................max_height=5, n

In [49]:
#Hyperparameter tuning using Random Search for baggingKNN
hyperparameters = {'num_models': [5],
                    'k': [3,5,7]}
baggingKNN = MyBaggingKNN()
rs = RandomizedSearchCV(baggingKNN, hyperparameters, cv=2, verbose=2, random_state=42)
rs.fit(X_train, y_train)
print(f"Mean cross-validated training accuracy score: {rs.best_score_}\n")
print(f"Best Estimator: {rs.best_estimator_}\n")
print(f"Best Hyperparameter Combination: {rs.best_params_}\n")
print(f"Score: {rs.score(X_val, y_val)}\n")



Fitting 2 folds for each of 3 candidates, totalling 6 fits
[CV] END ..................................k=3, num_models=5; total time=  29.6s
[CV] END ..................................k=3, num_models=5; total time=  28.3s
[CV] END ..................................k=5, num_models=5; total time=  29.7s
[CV] END ..................................k=5, num_models=5; total time=  28.4s
[CV] END ..................................k=7, num_models=5; total time=  29.9s
[CV] END ..................................k=7, num_models=5; total time=  30.5s
Mean cross-validated training accuracy score: 0.7071363536466673

Best Estimator: MyBaggingKNN(k=7, num_models=5)

Best Hyperparameter Combination: {'num_models': 5, 'k': 7}

Score: 0.7048223031520784

