# CIS 419/519 
# **Homework 4: AdaBoost and the Challenge**

In [None]:
import pandas as pd
import numpy as np


# Adaboost-SAMME

In [None]:
import numpy as np
import random
import math
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score    # for comparing the labels and y_predict

class BoostedDT:
    """Multi-class boosted decision trees (AdaBoost, SAMME-style weighting).

    Each boosting round fits a depth-limited ``DecisionTreeClassifier`` on the
    re-weighted training set and stores it together with its vote weight
    ``beta``.  Prediction is a beta-weighted vote over all stored trees.
    """

    # Weighted errors are clamped to [_ERROR_EPS, 1 - _ERROR_EPS] so that the
    # logarithm in the beta formula never sees 0 or infinity.
    _ERROR_EPS = 1e-10

    def __init__(self, numBoostingIters=100, maxTreeDepth=3):
        '''
        Constructor

        Arguments:
            numBoostingIters is the number of boosting rounds T
            maxTreeDepth is the max_depth given to every decision tree

        Class Fields
        clfs : List object containing individual DecisionTree classifiers, in order of creation during boosting
               (one per boosting round after fit)
        betas : List of beta values, in order of creation during boosting
        '''

        self.clfs = None  # populated by fit()
        self.betas = None  # populated by fit()
        self.numBoostingIters = numBoostingIters    # the number of iterations T
        self.maxTreeDepth = maxTreeDepth
        self.weights = None       # not written by fit()/predict(); kept for compatibility
        self.instanceNum = 0      # number of training instances seen by the last fit()
        self.classNum = 2         # number of distinct labels; overwritten by fit()
        self.label_class = None   # sorted unique labels; set by fit()
        self.alpha_t = None       # not used by fit()/predict(); kept for compatibility
        self.alpha_sum = 0        # not used by fit()/predict(); kept for compatibility

    def fit(self, X, y, random_state=None):
        '''
        Trains the model.
        Every individual decision tree is created with the provided random_state value.

        Calling fit() again discards the previously trained ensemble.

        Arguments:
            X is an n-by-d Pandas Data Frame (anything np.asarray accepts also works)
            y is an n-by-1 Pandas Data Frame or a length-n Series / array of labels
            random_state is an optional integer value
        '''
        X_arr = np.asarray(X)
        # Flatten so an n-by-1 frame compares element-wise with the 1-D
        # predictions instead of broadcasting to an n-by-n matrix.
        y_arr = np.asarray(y).ravel()

        # Start from an empty ensemble; otherwise a second fit() would stack
        # new trees on top of the ones from the previous call.
        self.clfs = []
        self.betas = []

        n = X_arr.shape[0]
        self.instanceNum = n
        self.label_class = np.unique(y_arr)    # sorted distinct labels
        self.classNum = len(self.label_class)

        # Multi-class correction term ln(K - 1); max() keeps it finite (0)
        # when the training labels contain a single class.
        class_term = np.log(max(self.classNum - 1, 1))

        weights = np.full(n, 1.0 / n)    # uniform initial instance weights
        eps = self._ERROR_EPS

        for _ in range(self.numBoostingIters):
            learner = tree.DecisionTreeClassifier(max_depth=self.maxTreeDepth, random_state=random_state)
            learner.fit(X_arr, y_arr, sample_weight=weights)

            # Boolean mask of the instances this round's tree gets wrong.
            missed = learner.predict(X_arr) != y_arr

            # Weighted training error, clamped away from 0 and 1 so beta
            # stays finite and the weights never become NaN.
            error = float(np.sum(weights * missed))
            error = min(max(error, eps), 1.0 - eps)

            # beta = 1/2 * (ln((1 - err) / err) + ln(K - 1)); for more than
            # two classes beta can be negative.
            beta = 0.5 * (np.log((1.0 - error) / error) + class_term)

            self.clfs.append(learner)
            self.betas.append(beta)

            # Scale misclassified instances by exp(+beta) and correct ones
            # by exp(-beta), then renormalise to sum to 1.
            weights = weights * np.exp(beta * np.where(missed, 1.0, -1.0))
            weights = weights / np.sum(weights)

    def predict(self, X):
        '''
        Uses the model to predict values for each instance in X
        Arguments:
            X is an n-by-d Pandas Data Frame (anything np.asarray accepts also works)
        Returns:
            an n-by-1 numpy array holding the predicted label of every instance
        Raises:
            ValueError if the model has not been fitted
        '''
        if not self.clfs:
            raise ValueError("BoostedDT.predict() called before fit()")

        X_arr = np.asarray(X)
        n = X_arr.shape[0]
        rows = np.arange(n)

        # votes[i, k] accumulates the betas of every tree that predicted
        # label_class[k] for instance i.
        votes = np.zeros((n, self.classNum))
        for learner, beta in zip(self.clfs, self.betas):
            # label_class is sorted (np.unique), so searchsorted turns each
            # predicted label into its column index; labels therefore do not
            # have to be the integers 0..K-1.
            cols = np.searchsorted(self.label_class, learner.predict(X_arr))
            votes[rows, cols] += beta

        winners = np.argmax(votes, axis=1)
        return np.asarray(self.label_class)[winners].reshape((n, 1))
        

# Test BoostedDT

In [None]:
import numpy as np
from sklearn import datasets
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from scipy import stats
from google.colab import drive 
# Mount Google Drive at /content/gdrive; the CSV paths read in
# test_boostedDT() live under this mount point.
drive.mount('/content/gdrive')

def test_boostedDT():
  """Compare BoostedDT with sklearn baselines on the ChocolatePipes data.

  Reads the training features and labels from Google Drive, preprocesses
  them (inner join on ``id``, drop unused columns, fill NaNs with the column
  mode, one-hot encode, replace zero pool sizes with the mean of the non-zero
  sizes), splits the frame 50/50, fits a plain decision tree, BoostedDT,
  sklearn's AdaBoostClassifier and an RBF SVC on the training half, and
  prints each model's accuracy on that same training half.
  """

  # load the data set     ORIGINAL
  # sklearn_dataset = datasets.load_breast_cancer()
  # convert to pandas df
  # df = pd.DataFrame(sklearn_dataset.data,columns=sklearn_dataset.feature_names)
  # df['CLASS'] = pd.Series(sklearn_dataset.target)
  # print(df)
  # df.head()

  # for 3-classes testing
  filepath = "/content/gdrive/My Drive/cis519/Codes/Homework/HW4/ChocolatePipes_trainData.csv"
  X_train_raw = pd.read_csv(filepath)      # first CSV row is used as the header

  filepath = "/content/gdrive/My Drive/cis519/Codes/Homework/HW4/ChocolatePipes_trainLabels.csv"
  y_label_raw = pd.read_csv(filepath)

  #======= preprocess the X_train_raw and y_label_raw ===============
  # Pair features with labels by id; rows whose id appears in only one file are dropped.
  df = X_train_raw.merge(y_label_raw, how = 'inner', on ='id')
  # The id is no longer needed after the join; the other four columns are discarded as features.
  df = df.drop(columns = ['id', 'Date of entry', 'Recorded by', 'Country of factory', 'Country funded by'])

  # Fill NaNs with each column's mode; mode() can return several rows, so take the first.
  df_mode = df.mode(axis = 0).iloc[0, :]
  df = df.fillna(df_mode, axis=0)

  # One-hot encode, then drop any column that is still of object (string) dtype.
  df_getdummies = pd.get_dummies(df)
  df = df_getdummies.drop(labels= df_getdummies.columns[df_getdummies.dtypes == object], axis=1)

  # Replace zero pool sizes with the mean of the non-zero sizes.
  pool_col = 'Size of chocolate pool'
  size_pool = df[pool_col].to_numpy()
  size_mean = np.nanmean(np.where(size_pool == 0, np.nan, size_pool))
  # Assign the result back to the column: an inplace replace() on the Series
  # returned by df.loc[:, col] is not guaranteed to modify df itself.
  df[pool_col] = df[pool_col].replace(0, size_mean)
  # print(df.loc[:,'Size of chocolate pool'].mean())

  #~~~~~~ df is the combination of preprocessed X and y

  #=============== Start the prediction (unchanged) =========================
  # split randomly into training/testing (fixed seed, half of the rows each)
  train, test = train_test_split(df, test_size=0.5, random_state=42)

  # Split into X,y matrices   FOR PART ONE TESTING AND COMPARING
  # X_train = train.drop(['CLASS'], axis=1)
  # y_train = train['CLASS']
  # X_test = test.drop(['CLASS'], axis=1)
  # y_test = test['CLASS']

  # FOR WONKA MODEL TRAINING
  X_train = train.drop(['label'], axis = 1)
  y_train = train['label']
  # The test half is only used by the commented-out test-set evaluation below.
  X_test = test.drop(['label'], axis = 1)
  y_test = test['label']

  #============== FOR SELF TEST ======================
  # boostingValues = [100, 125, 150, 175, 200, 225, 250]
  # treedepthValues = [1, 2, 3, 4, 5]
  # boostingValues = [100,  150,  200,  250, 300]
  # treedepthValues = [10, 12, 14]
  # score = np.zeros((len(boostingValues), len(treedepthValues)))
  # for b, boostingiter in enumerate(boostingValues):
  #   for t, treedepth in enumerate(treedepthValues):
  #     modelBoostedDT = BoostedDT(numBoostingIters=boostingiter, maxTreeDepth=treedepth)    # given different values for tuning
  #     modelBoostedDT.fit(X_train, y_train)
  #     ypred_BoostedDT = modelBoostedDT.predict(X_test)
  #     accuracy_BoostedDT = accuracy_score(y_test, ypred_BoostedDT)
  #     score[b, t] = accuracy_BoostedDT
  #     print('====================')
  #     print('boosting iter = '+str(boostingiter)+ '  tree depth = '+str(treedepth) )
  #     print("My Boosted Decision Tree Accuracy = "+str(accuracy_BoostedDT))
  # best_score = np.max(score)
  # best_index = np.argwhere(score == best_score)
  # print('Best score is:'+str(best_score)+'  index is: '+str(best_index))
  # ========================================================================================

  #======================= Comparing accuracy ============================
  # train the decision tree
  modelDT = DecisionTreeClassifier()
  modelDT.fit(X_train, y_train)

  # train the boosted DT
  modelBoostedDT = BoostedDT(numBoostingIters=300, maxTreeDepth=10)
  modelBoostedDT.fit(X_train, y_train)

  # train sklearn's implementation of Adaboost
  modelSKBoostedDT = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=100)    # 4 & 150
  modelSKBoostedDT.fit(X_train, y_train)

  # output predictions on the test data (disabled; training data is used instead)
  # ypred_DT = modelDT.predict(X_test)
  # ypred_BoostedDT = modelBoostedDT.predict(X_test)
  # ypred_SKBoostedDT = modelSKBoostedDT.predict(X_test)

  ypred_DT = modelDT.predict(X_train)
  ypred_BoostedDT = modelBoostedDT.predict(X_train)
  ypred_SKBoostedDT = modelSKBoostedDT.predict(X_train)

  # compute the test accuracy of the model (disabled)
  # accuracy_DT = accuracy_score(y_test, ypred_DT)
  # accuracy_BoostedDT = accuracy_score(y_test, ypred_BoostedDT)
  # accuracy_SKBoostedDT = accuracy_score(y_test, ypred_SKBoostedDT)

  # compute the training accuracy of the model
  accuracy_DT = accuracy_score(y_train, ypred_DT)
  accuracy_BoostedDT = accuracy_score(y_train, ypred_BoostedDT)
  accuracy_SKBoostedDT = accuracy_score(y_train, ypred_SKBoostedDT)

  print("Decision Tree Accuracy = "+str(accuracy_DT))
  print("My Boosted Decision Tree Accuracy = "+str(accuracy_BoostedDT))
  print("Sklearn's Boosted Decision Tree Accuracy = "+str(accuracy_SKBoostedDT))

  #============================ SVC model accuracy estimation =====================

  # Mean and standard deviation are learned on the training half only and
  # then re-used to transform the test half.
  standardizer = StandardScaler()
  X_train_standardized = pd.DataFrame(standardizer.fit_transform(X_train))
  modelSVC = SVC(gamma='scale', decision_function_shape='ovo', kernel='rbf')
  modelSVC.fit(X_train_standardized, y_train)
  X_test_standardized = pd.DataFrame(standardizer.transform(X_test))

  # ypred_SVC = modelSVC.predict(X_test_standardized)
  # accuracy_SVC = accuracy_score(y_test, ypred_SVC)

  ypred_SVC = modelSVC.predict(X_train_standardized)
  accuracy_SVC = accuracy_score(y_train, ypred_SVC)

  print("Sklearn's SVC Accuracy = "+str(accuracy_SVC))

  print()
  print("Note that due to randomization, your boostedDT might not always have the ")
  print("exact same accuracy as Sklearn's boostedDT.  But, on repeated runs, they ")
  print("should be roughly equivalent and should usually exceed the standard DT.")
  #================================================================================================

  #========================================================================================
  #======================== Predict the unlabeled data ====================================
  # X_modeltrain = df.drop(['label'], axis = 1)   # use the previous 'df': X and labels for training
  # y_modeltrain = df['label']

  # # =========================================================================
  # # train models adaboost
  # standardizer = StandardScaler()
  # X_modeltrain_standardized = pd.DataFrame(standardizer.fit_transform(X_modeltrain))  # compute mean and stdev on training set for standardization

  # modelBoostedDT = BoostedDT(numBoostingIters=300, maxTreeDepth=10)
  # modelBoostedDT.fit(X_modeltrain_standardized, y_modeltrain)
  # # train SVC model
  # # standardizer = StandardScaler()
  # # X_modeltrain_standardized = pd.DataFrame(standardizer.fit_transform(X_modeltrain))  # compute mean and stdev on training set for standardization
  # modelSVC = SVC(gamma='scale', decision_function_shape='ovo', kernel='rbf')    # 'standardizer' has been stored the std() and mean, use 'transform' next time
  # modelSVC.fit(X_modeltrain_standardized, y_modeltrain)

  # # train simple DT model
  # modelDT = DecisionTreeClassifier()
  # modelDT.fit(X_modeltrain_standardized, y_modeltrain)
  # #========================================================================

  # # read the unlabeled csv
  # filepath_grading = "/content/gdrive/My Drive/cis519/Codes/Homework/HW4/ChocolatePipes_gradingTestData.csv"
  # X_unlabeled_grading = pd.read_csv(filepath_grading)

  # filepath_leader = "/content/gdrive/My Drive/cis519/Codes/Homework/HW4/ChocolatePipes_leaderboardTestData.csv"
  # X_unlabeled_leader = pd.read_csv(filepath_leader)
  
  # # preprocess the unlabeled data
  # ####### 1. grading dataset ///// X_unlabeled_grading
  # id_grading = X_unlabeled_grading.loc[:, ['id']]   # extract the id 
  # X_unlabeled_grading = X_unlabeled_grading.drop(columns = ['id', 'Date of entry', 'Recorded by', 'Country of factory', 'Country funded by']) 

  # grading_mode = X_unlabeled_grading.mode(axis = 0).iloc[0, :]   # mode may have two rows 
  # X_unlabeled_grading = X_unlabeled_grading.fillna(grading_mode, axis=0)    # use the first mode content to replace

  # grading_getdummies = pd.get_dummies(X_unlabeled_grading)
  # X_unlabeled_grading = grading_getdummies.drop(labels= grading_getdummies.columns[grading_getdummies.dtypes == object], axis=1)  

  # # adaboost
  # X_unlabeled_grading_standardized = pd.DataFrame(standardizer.transform(X_unlabeled_grading))  # standardize the X 'standardizer.transform()'
  # ypred_grading_BoostedDT = modelBoostedDT.predict(X_unlabeled_grading_standardized)
  # ypred_grading = pd.DataFrame(ypred_grading_BoostedDT, columns=['label'])
  # grading_output = pd.concat([id_grading, ypred_grading], axis=1)    # concat the id and predicted label of grading data
  # # print(grading_output.describe())     #######
  # grading_output.to_csv('/content/gdrive/My Drive/cis519/Codes/Homework/HW4/predictions-grading-BoostedDT.csv', index = 0)
  
  # #SVC
  # X_unlabeled_grading_standardized = pd.DataFrame(standardizer.transform(X_unlabeled_grading))  # standardize the X 'standardizer.transform()'
  # ypred_grading_SVC = modelSVC.predict(X_unlabeled_grading_standardized)
  # ypred_grading_SVC = pd.DataFrame(ypred_grading_SVC, columns=['label'])
  # grading_output = pd.concat([id_grading, ypred_grading_SVC], axis=1)    # concat the id and predicted label of grading data from SVC
  # # print(grading_output.describe())     #######
  # grading_output.to_csv('/content/gdrive/My Drive/cis519/Codes/Homework/HW4/predictions-grading-SVC.csv', index = 0)
  


  # ####### 2. leaderboard dataset ////// X_unlabeled_leader
  # id_leader = X_unlabeled_leader.loc[:, ['id']]   # extract the id 
  # X_unlabeled_leader = X_unlabeled_leader.drop(columns = ['id', 'Date of entry', 'Recorded by', 'Country of factory', 'Country funded by']) 

  # leader_mode = X_unlabeled_leader.mode(axis = 0).iloc[0, :]   # mode may have two rows 
  # X_unlabeled_leader = X_unlabeled_leader.fillna(leader_mode, axis=0)    # use the first mode content to replace

  # leader_getdummies = pd.get_dummies(X_unlabeled_leader)
  # X_unlabeled_leader = leader_getdummies.drop(labels= leader_getdummies.columns[leader_getdummies.dtypes == object], axis=1)  

  # # adaboost 
  # X_unlabeled_leader_standardized = pd.DataFrame(standardizer.transform(X_unlabeled_leader))  # standardize the X
  # ypred_leader_BoostedDT = modelBoostedDT.predict(X_unlabeled_leader_standardized)
  # ypred_leader = pd.DataFrame(ypred_leader_BoostedDT, columns=['label'])
  # leader_output = pd.concat([id_leader, ypred_leader], axis=1)   
  # leader_output.to_csv('/content/gdrive/My Drive/cis519/Codes/Homework/HW4/predictions-leaderboard-BoostedDT.csv', index = 0)

  # # SVC
  # X_unlabeled_leader_standardized = pd.DataFrame(standardizer.transform(X_unlabeled_leader))  # standardize the X
  # ypred_leader_SVC = modelSVC.predict(X_unlabeled_leader_standardized)
  # ypred_leader_SVC = pd.DataFrame(ypred_leader_SVC, columns=['label'])
  # leader_output = pd.concat([id_leader, ypred_leader_SVC], axis=1)    
  # leader_output.to_csv('/content/gdrive/My Drive/cis519/Codes/Homework/HW4/predictions-leaderboard-SVC.csv', index = 0)

  # # simple DT
  # X_unlabeled_leader_standardized = pd.DataFrame(standardizer.transform(X_unlabeled_leader))  # standardize the X
  # ypred_leader_modelDT = modelDT.predict(X_unlabeled_leader_standardized)
  # ypred_leader_modelDT = pd.DataFrame(ypred_leader_modelDT, columns=['label'])
  # leader_output = pd.concat([id_leader, ypred_leader_modelDT], axis=1)    
  # leader_output.to_csv('/content/gdrive/My Drive/cis519/Codes/Homework/HW4/predictions-leaderboard-modelDT.csv', index = 0)

# Run the model comparison when this cell is executed.
test_boostedDT()

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
Decision Tree Accuracy = 0.9962920827539713
My Boosted Decision Tree Accuracy = 0.9893818733409178
Sklearn's Boosted Decision Tree Accuracy = 0.7491678253908061
Sklearn's SVC Accuracy = 0.7385918341549741

Note that due to randomization, your boostedDT might not always have the 
exact same accuracy as Sklearn's boostedDT.  But, on repeated runs, they 
should be roughly equivalent and should usually exceed the standard DT.


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
Decision Tree Accuracy = 0.7333361422431954
My Boosted Decision Tree Accuracy = 0.7801044914468694
Sklearn's Boosted Decision Tree Accuracy = 0.732914805763883
Sklearn's SVC Accuracy = 0.713238392179995

Note that due to randomization, your boostedDT might not always have the 
exact same accuracy as Sklearn's boostedDT.  But, on repeated runs, they 
should be roughly equivalent and should usually exceed the standard DT.