In [22]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split

In [24]:
#Reading Data
df = pd.read_csv("dataset.csv")
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
test_df, val_df = train_test_split(test_df, test_size=0.5, random_state=42)
X_train = train_df.drop(columns=['Unnamed: 0', 'smoking']).to_numpy()
y_train = train_df[['smoking']].to_numpy().reshape(-1) #Reshape from (n,1) to (n)

X_val = val_df.drop(columns=['Unnamed: 0', 'smoking']).to_numpy()
y_val = val_df[['smoking']].to_numpy().reshape(-1)

num_samples, num_features = X_train.shape

In [10]:
normalized_df = train_df.copy()

#Preprocessing
normalized_df['hemoglobin'] = (normalized_df['hemoglobin'] - normalized_df['hemoglobin'].mean()) / normalized_df['hemoglobin'].std()
normalized_df['hearing(right)'] = normalized_df['hearing(right)'] - 1
normalized_df['fasting blood sugar'] = (normalized_df['fasting blood sugar'] - normalized_df['fasting blood sugar'].mean()) / normalized_df['fasting blood sugar'].std()
normalized_df['LDL'] = (normalized_df['LDL'] - normalized_df['LDL'].mean()) / normalized_df['LDL'].std()
normalized_df['height(cm)'] = (normalized_df['height(cm)'] - normalized_df['height(cm)'].mean()) / normalized_df['height(cm)'].std()
normalized_df['weight(kg)'] = (normalized_df['weight(kg)'] - normalized_df['weight(kg)'].mean()) / normalized_df['weight(kg)'].std()
normalized_df['Cholesterol'] = (normalized_df['Cholesterol'] - normalized_df['Cholesterol'].mean()) / normalized_df['Cholesterol'].std()
normalized_df['serum creatinine'] = (normalized_df['serum creatinine'] - normalized_df['serum creatinine'].mean()) / normalized_df['serum creatinine'].std()
normalized_df['Gtp'] = (normalized_df['Gtp'] - normalized_df['Gtp'].mean()) / normalized_df['Gtp'].std()

X_train = normalized_df.drop(columns=['Unnamed: 0', 'smoking']).to_numpy()
y_train = normalized_df[['smoking']].to_numpy().reshape(-1) #Reshape from (n,1) to (n)
num_samples, num_features = X_train.shape


# **Boosting**

<h3> ScikitLearn Adaboost </h3>

In [88]:
from sklearn.ensemble import AdaBoostClassifier

adaboost = AdaBoostClassifier(n_estimators=100)
adaboost.fit(X_train, y_train)
predictions = adaboost.predict(X_train)
score = adaboost.score(X_train, y_train)
print(score)

0.7514206775297478


<h3> Adaboost Implementation Using Decision Trees </h3>

In [43]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

class MyAdaBoostTree:
    def __init__(self, num_samples, num_features, num_iterations, max_tree_height = 1):
        self.num_samples = num_samples
        self.num_features = num_features
        self.num_iterations = num_iterations
        self.max_tree_height = max_tree_height
        self.alphas = []
        self.models = []
        self.sample_weights = np.ones((num_samples))/num_samples

    def train(self, X_train, y_train):
        for iteration in range(self.num_iterations):
            weak_learner = DecisionTreeClassifier(criterion='gini', max_depth=self.max_tree_height)
            weak_learner.fit(X_train, y_train, sample_weight=self.sample_weights)

            sample_predictions = weak_learner.predict(X_train)
            incorrect = (sample_predictions != y_train)*1 #Multiply by 1 to convert True/False to 1/0
            weighted_error = np.multiply(incorrect, self.sample_weights).sum()  / self.sample_weights.sum()
            alpha = (0.5) * math.log((1-weighted_error) / weighted_error)
            
            #Add Model and Alpha to Ensemble
            self.alphas.append(alpha)
            self.models.append(weak_learner)
            
            #Update Weights
            self.sample_weights = np.multiply(self.sample_weights, np.exp(2*alpha*incorrect))
            self.sample_weights = self.sample_weights / self.sample_weights.sum()
    
    def predict(self, X):
        sum_predictions = np.zeros(X.shape[0])
        for idx, model in enumerate(self.models):
            prediction = model.predict(X)
            sum_predictions += self.alphas[idx] * np.where(prediction == 0, -1, prediction)        
        return np.where(sum_predictions >= 0, 1, 0)
    

    def score(self, X, y):
        prediction = self.predict(X)
        return (prediction == y).sum()/  X.shape[0]   




class MyAdaBoostLogistic:
    def __init__(self, num_samples, num_features, num_iterations):
        self.num_samples = num_samples
        self.num_features = num_features
        self.num_iterations = num_iterations
        
        self.alphas = []
        self.models = []
        self.sample_weights = np.ones((num_samples))/num_samples

    def train(self, X_train, y_train):
        for iteration in range(self.num_iterations):
            weak_learner = LogisticRegression()
            weak_learner.fit(X_train, y_train, sample_weight=self.sample_weights)

            sample_predictions = weak_learner.predict(X_train)
            incorrect = (sample_predictions != y_train)*1 #Multiply by 1 to convert True/False to 1/0
            weighted_error = np.multiply(incorrect, self.sample_weights).sum()  / self.sample_weights.sum()
            alpha = (0.5) * math.log((1-weighted_error) / weighted_error)
            
            #Add Model and Alpha to Ensemble
            self.alphas.append(alpha)
            self.models.append(weak_learner)
            
            #Update Weights
            self.sample_weights = np.multiply(self.sample_weights, np.exp(2*alpha*incorrect))
            self.sample_weights = self.sample_weights / self.sample_weights.sum()
    
    def predict(self, X):
        sum_predictions = np.zeros(X.shape[0])
        for idx, model in enumerate(self.models):
            prediction = model.predict(X)
            sum_predictions += self.alphas[idx] * np.where(prediction == 0, -1, prediction)        
        return np.where(sum_predictions >= 0, 1, 0)
    

    def score(self, X, y):
        prediction = self.predict(X)
        return (prediction == y).sum()/ X.shape[0]  

In [47]:
adaboost = MyAdaBoostTree(num_samples, num_features, 100)
adaboost.train(X_train, y_train)
print(adaboost.score(X_train, y_train))


0.7484851339047439


In [48]:
print(adaboost.score(X_val, y_val))

0.7477081501946503


In [None]:
adaboost = MyAdaBoostLogistic(num_samples, num_features, 100)
adaboost.train(X_train, y_train)
print(adaboost.score(X_train, y_train))