# Adaboost Lab

## 准备工作
### 环境准备
请确保完成以下依赖包的安装，并且通过下面代码来导入与验证。

In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


### 数据集准备
我们将使用以下数据集进行 Adaboost 的训练。

该数据集与决策树部分使用的数据集相同，包括 7 个特征以及一个标签“是否适合攻读博士”，涵盖了适合攻读博士的各种条件，例如 “love doing research”、“I absolutely want to be a college professor” 等。

请执行下面的代码来加载数据集。


In [12]:
# Load the train / test splits of the PhD-suitability dataset.
train_data = pd.read_csv('train_phd_data.csv')
test_data = pd.read_csv('test_phd_data.csv')

# Remap the label (last) column from {0, 1} to {-1, 1}: 0 -> -1, 1 -> 1.
label_map = {0: -1, 1: 1}
for frame in (train_data, test_data):
    frame.iloc[:, -1] = frame.iloc[:, -1].map(label_map)

## Adaboost (15 pts)

在上一个lab中，你已经成功完成了 Decision Tree 的构建。在本部分，你可以继续沿用上一部分的代码，学习并完成 Adaboost 模型的训练。

在这个 Adaboost 模型中，我们选择了一层决策树作为弱学习器，并使用基尼系数作为分类标准。

请完成以下类的构建以及相应函数的实现：

1. **weakClassifier()**: 我们采用一层决策树（决策树桩）作为弱学习器，包括 `best_split()`、`fit()` 和 `predict()`。你可以参考上一次实验中的代码。

2. **Adaboost()**：包括弱学习器的集合，拟合过程 `fit()` 和预测过程 `predict()`。


In [13]:
class weakClassifier:
    """Decision stump (one-level decision tree) used as a boosting weak learner.

    The stump selects the single (feature, threshold) pair with the lowest
    sample-weighted Gini impurity and labels each side of the split with the
    weighted-majority class, so that re-weighted samples actually influence
    the learner.
    """

    def __init__(self):
        # Fitted tree: either a bare label (a leaf) or a dict with the keys
        # 'feature', 'threshold', 'left' and 'right'. None until fit() runs
        # on non-empty data.
        self.tree = None
        # Vote weight of this learner; assigned by the caller after fitting.
        self.alpha = None

    def _weighted_majority(self, y, sample_weight):
        """Return the label whose samples carry the largest total weight.

        Ties are resolved in favour of the smallest label.
        """
        labels = np.asarray(y)
        weights = np.asarray(sample_weight, dtype=float)
        classes = np.unique(labels)
        class_weights = [weights[labels == label].sum() for label in classes]
        return classes[int(np.argmax(class_weights))]

    def best_split(self, X, y, sample_weight):
        """Find the split with the lowest sample-weighted Gini impurity.

        Every distinct value of every column is tried as a ``<=`` threshold.
        Splits that leave one side empty are skipped because they do not
        partition the data.

        Args:
            X: DataFrame of features.
            y: labels, one per row of ``X``.
            sample_weight: non-negative weight of each sample.

        Returns:
            (best_feature, best_splits). ``best_splits`` is a dict with the
            boolean masks 'left' / 'right' and the chosen 'threshold'.
            Both values are None when no valid split exists (for example
            all features are constant, or the total weight is not positive).
        """
        labels = np.asarray(y)
        weights = np.asarray(sample_weight, dtype=float)
        total_weight = weights.sum()

        best_feature = None
        best_splits = None
        best_gini = float('inf')

        # The weighted impurity is undefined without positive total weight
        # (this also catches NaN weights).
        if not total_weight > 0:
            return best_feature, best_splits

        for feature in X.columns:
            column = X[feature]
            for threshold in column.unique():
                left_mask = column <= threshold
                right_mask = ~left_mask
                left_idx = np.asarray(left_mask, dtype=bool)
                right_idx = ~left_idx
                if not left_idx.any() or not right_idx.any():
                    continue

                left_weights = weights[left_idx]
                right_weights = weights[right_idx]
                left_gini = self.gini_impurity(labels[left_idx], left_weights)
                right_gini = self.gini_impurity(labels[right_idx], right_weights)

                # Each child contributes in proportion to its weight mass,
                # not its sample count.
                current_gini = (
                    left_weights.sum() * left_gini
                    + right_weights.sum() * right_gini
                ) / total_weight

                if current_gini < best_gini:
                    best_gini = current_gini
                    best_feature = feature
                    best_splits = {
                        'left': left_mask,
                        'right': right_mask,
                        'threshold': threshold,
                    }
        return best_feature, best_splits

    def gini_impurity(self, labels, weights):
        """Return the weighted Gini impurity ``1 - sum_k (w_k / W) ** 2``.

        ``w_k`` is the total weight of class ``k`` and ``W`` the total weight
        of the node. A node without positive total weight (e.g. an empty
        node) is reported as pure (0.0).
        """
        labels = np.asarray(labels)
        weights = np.asarray(weights, dtype=float)
        total_weight = weights.sum()
        if not total_weight > 0:
            return 0.0
        class_weights = np.array(
            [weights[labels == label].sum() for label in np.unique(labels)]
        )
        return float(1.0 - np.sum((class_weights / total_weight) ** 2))

    def fit(self, X, y, sample_weight, max_depth=10):
        """Fit the stump to weighted data.

        Args:
            X: DataFrame of features.
            y: labels, one per row of ``X``.
            sample_weight: weight of each sample.
            max_depth: 0 forces a single leaf; any other value builds the
                usual one-level tree.

        Returns:
            None. ``self.tree`` is updated; it is left untouched when ``y``
            is empty.
        """
        if len(y) == 0:
            return

        best_feature, best_splits = None, None
        if max_depth != 0:
            best_feature, best_splits = self.best_split(X, y, sample_weight)

        if best_feature is None:
            # No usable split: fall back to a single weighted-majority leaf
            # so predict() still returns real labels.
            self.tree = self._weighted_majority(y, sample_weight)
            return
        self.tree = self.build_tree(X, y, sample_weight, best_feature, best_splits)

    def build_tree(self, X, y, sample_weight, best_feature, splits):
        """Build the one-level tree for an already chosen split.

        Args:
            X: DataFrame of features.
            y: labels, one per row of ``X``.
            sample_weight: weight of each sample.
            best_feature: column name to split on.
            splits: dict holding at least the key 'threshold'; anything else
                produces a leaf.

        Returns:
            A bare label when the node is pure or cannot be split, otherwise
            a dict with 'feature', 'threshold' and the leaf labels 'left' /
            'right' (weighted-majority class of each side).
        """
        labels = np.asarray(y)
        weights = np.asarray(sample_weight, dtype=float)

        if not isinstance(splits, dict) or 'threshold' not in splits:
            return self._weighted_majority(labels, weights)

        best_threshold = splits['threshold']
        left_idx = np.asarray(X[best_feature] <= best_threshold, dtype=bool)
        right_idx = ~left_idx

        # A pure node or a one-sided split needs no decision node.
        if len(np.unique(labels)) == 1 or not left_idx.any() or not right_idx.any():
            return self._weighted_majority(labels, weights)

        return {
            'feature': best_feature,
            'threshold': best_threshold,
            'left': self._weighted_majority(labels[left_idx], weights[left_idx]),
            'right': self._weighted_majority(labels[right_idx], weights[right_idx]),
        }

    def _predict_one(self, sample):
        """Walk the fitted tree for one row and return the leaf label."""
        node = self.tree
        while isinstance(node, dict):
            go_left = sample[node['feature']] <= node['threshold']
            node = node['left'] if go_left else node['right']
        return node

    def predict(self, x):
        """Predict the label of every row of ``x``.

        Args:
            x: DataFrame of features.

        Returns:
            List with one predicted label per row. Entries are None when the
            classifier has not been fitted.
        """
        return [self._predict_one(sample) for _, sample in x.iterrows()]



In [14]:
class Adaboost:
    """AdaBoost ensemble of ``weakClassifier`` stumps for labels in {-1, 1}."""

    # Bound that keeps the weighted error away from 0 and 1, so that alpha
    # stays finite and the sample weights never turn into NaN.
    _EPS = 1e-10

    def __init__(self, n_estimators=10):
        # the number of weak classifiers to train
        self.n_estimators = n_estimators
        # the fitted weak classifiers, each carrying its vote weight `alpha`
        self.clfs = []

    def fit(self, X, y):
        """Train ``n_estimators`` weak classifiers on re-weighted data.

        Args:
            X: DataFrame of features.
            y: labels in {-1, 1}, one per row of ``X``.

        Raises:
            ValueError: if ``X`` has no rows.
        """
        n_samples = X.shape[0]
        if n_samples == 0:
            raise ValueError("Cannot fit Adaboost on an empty training set")

        labels = np.asarray(y, dtype=float)

        # start from uniform sample weights
        w = np.full(n_samples, 1.0 / n_samples)

        # refitting replaces the previous ensemble instead of extending it
        self.clfs = []

        for _ in range(self.n_estimators):
            clf = weakClassifier()

            # 1. fit the weak classifier on the current weights
            clf.fit(X, y, w)

            # 2. predict the training data with the weak classifier
            predict_labels = np.asarray(clf.predict(X), dtype=float)

            # 3. weighted error rate, clipped so the log below stays finite
            misclassified = predict_labels != labels
            error_rate = np.sum(w[misclassified]) / np.sum(w)
            error_rate = np.clip(error_rate, self._EPS, 1.0 - self._EPS)

            # 4. vote weight of this classifier
            alpha = 0.5 * np.log((1.0 - error_rate) / error_rate)

            # 5. raise the weight of misclassified samples, lower the rest,
            #    then renormalize so the weights sum to one
            w = w * np.exp(-alpha * labels * predict_labels)
            w /= np.sum(w)

            # save classifier and its vote weight
            clf.alpha = alpha
            self.clfs.append(clf)

    def predict(self, X):
        """Predict labels by the alpha-weighted vote of all weak classifiers.

        Args:
            X: DataFrame of features.

        Returns:
            Float array with one entry per row: 1.0 where the weighted vote
            is strictly positive, otherwise -1.0 (ties and an unfitted model
            therefore yield -1.0).
        """
        weighted_sum = np.zeros(len(X), dtype=float)
        for clf in self.clfs:
            weighted_sum += clf.alpha * np.asarray(clf.predict(X), dtype=float)

        # np.sign would return 0 on ties; map every non-positive vote to -1
        return np.where(weighted_sum > 0, 1.0, -1.0)

In [15]:
# Separate the feature columns from the trailing label column.
X_train, y_train = train_data.iloc[:, :-1], train_data.iloc[:, -1]
X_test, y_test = test_data.iloc[:, :-1], test_data.iloc[:, -1]

# Train an ensemble of 10 weak classifiers.
adaboost_model = Adaboost(n_estimators=10)
adaboost_model.fit(X_train, y_train)

# Predict the test samples and show the raw predictions.
predictions = adaboost_model.predict(X_test)
print(predictions)

# Accuracy is the fraction of test samples whose prediction equals the label.
accuracy = np.mean(predictions == y_test)
print("The accuracy of Adaboost is: ", accuracy)


[-1. -1. -1. -1. -1. -1. -1.]
The accuracy of Adaboost is:  1.0


Finished at 12-21 @Boyuan 2200017816