Kaggle: https://www.kaggle.com/jiuzhang/ninechapter-rf

Tutorial: https://www.kaggle.com/jiuzhang/aicamp-ensemble-exercise-4-adaboost

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder

In [2]:
TRAIN_DIR = './input/mushrooms.csv'

# Explantory Data Analyisis - Take a Glance at the Data

In [3]:
train = pd.read_csv(TRAIN_DIR, sep=',', header=0)

In [4]:
train.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [5]:
train.shape

(8124, 23)

# Prepare Data

In [6]:
def label_encode_data(data):
    # Create Encoders
    encoders = {}
    for col in data.columns:
        data.loc[data[col].isnull(), col] = 'missing'
        
        encoder = LabelEncoder()
        values = data[col].tolist() + ['missing']
        encoder.fit(values)
        encoders[col] = encoder
        
    # Calculate Dimension
    dim = 0
    for col, encoder in encoders.items():
        dim += len(encoder.classes_)
    
    # Create X
    num_sample = data.shape[0]
    X = np.zeros((num_sample, dim))
    col_num = 0
    for col, encoder in encoders.items():
        num_elements = len(encoder.classes_)
        X[np.arange(num_sample), encoder.transform(data[col]) + col_num] = 1
        col_num += len(encoder.classes_)
        
    return pd.DataFrame(X, dtype=np.int64)

In [7]:
X = label_encode_data(train.iloc[:, 1:])
y = np.array(train.iloc[:, 0].apply(lambda x: -1 if x == 'e' else 1))
print(f'X shape: {X.shape}')
print(f'y shape: {y.shape}')

X shape: (8124, 139)
y shape: (8124,)


# Models

In [8]:
from sklearn.model_selection import cross_val_score

1.Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
cross_val_score(logreg, X, y, cv=8).mean()

0.9260029188161825

2.Decision Tree

In [10]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(max_depth=6)
cross_val_score(dt, X, y, cv=8).mean()

0.9655492416896163

3.AdaBoost

In [11]:
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=3), n_estimators=40)
cross_val_score(ada, X, y, cv=8).mean()

0.9330359567123074

4.AdaBoost (Manual Implementation)

In [12]:
from sklearn.base import BaseEstimator

In [13]:
class AdaBoost(BaseEstimator):    
    def __init__(self, n_estimators):
        self.n_estimators = n_estimators 
        
    def fit(self, X, y):
        self.models = []
        self.model_weights = []
        num_samples = X.shape[0]
        alpha = np.ones(num_samples) / num_samples
        
        for m in range(self.n_estimators):
            tree = DecisionTreeClassifier(max_depth=2)
            tree.fit(X, y, sample_weight=alpha)
            prediction = tree.predict(X)
            weighted_error = alpha.dot(prediction != y)
            
            model_weight = 0.5 * (np.log(1 - weighted_error) - np.log(weighted_error))
            
            alpha = alpha * np.exp(-model_weight * y * prediction)
            alpha = alpha / alpha.sum()
            
            self.models.append(tree)
            self.model_weights.append(model_weight)          
              
    def predict(self, X):
        num_samples = len(X)
        results = np.zeros(num_samples)
        for model, model_weight in zip(self.models, self.model_weights):
            results += model_weight * model.predict(X)
        return np.sign(results)
    
    def score(self, X, y):
        y_pred = self.predict(X)
        return np.mean(y_pred == y)

In [14]:
ada2 = AdaBoost(n_estimators=50)
cross_val_score(ada2, X, y, cv=8).mean()

1.0

5.AdaBoost (Full Manual Implementation)

In [15]:
class TreeNode:
    def __init__(self, is_leaf, prediction, split_feature):
        self.is_leaf = is_leaf
        self.prediction = prediction
        self.split_feature = split_feature
        self.left = None
        self.right = None

In [16]:
class WeightedDecisionTree(BaseEstimator):
    def __init__(self, max_depth, min_error, verbose=False):
        self.max_depth = max_depth
        self.min_error = min_error
        self.verbose = verbose
    
    def fit(self, X, y, data_weights=None):
        if data_weights is None:
            data_weights = np.ones(X.shape[0]) / X.shape[0]      
        features = X.columns
        self.label_col = 'label'
        data = pd.concat([pd.DataFrame(y, columns=['label']), X], axis=1)
        self.root_node = self.__create_tree(data, data_weights, features, 0)
        
    def predict(self, X):
        return X.apply(lambda row: self.__predict_single_data(self.root_node, row), axis=1)
    
    def score(self, X, y):
        y_pred = self.predict(X)
        return np.mean(y_pred == y)
    
    def __create_tree(self, data, data_weights, features, curr_depth):
        # Exit Rule 1: No remaining features
        if len(features) == 0:
            if self.verbose:
                print('No remaining features.')
            return self.__create_leaf(data, data_weights)
        # Exit Rule 2: Reached max depth.
        if curr_depth >= self.max_depth:
            if self.verbose:
                print('Reached max depth.')
            return self.__create_leaf(data, data_weights)
            
        split_feature = self.__find_best_feature(data, data_weights, features)
        features = features.drop(split_feature)
        if self.verbose:
            print(f'Split on feature: {split_feature}')
        
        left_split = data[data[split_feature] == 0]
        left_data_weights = data_weights[data[split_feature] == 0]
        right_split = data[data[split_feature] == 1]
        right_data_weights = data_weights[data[split_feature] == 1]   
        
        # Exit Rule 3: Perfect Split
        if len(left_split) == 0:
            if self.verbose:
                print('Perfect Split.')
            return self.__create_leaf(right_split, right_data_weights)
        if len(right_split) == 0:
            if self.verbose:
                print('Perfect Split.')
            return self.__create_leaf(left_split, left_data_weights)
            
        left_tree = self.__create_tree(left_split, left_data_weights, features, curr_depth+1)
        right_tree = self.__create_tree(right_split, right_data_weights, features, curr_depth+1)
        
        curr_node = TreeNode(is_leaf=False, prediction=None, split_feature=split_feature)
        curr_node.left = left_tree
        curr_node.right = right_tree
        
        return curr_node
    
    def __create_leaf(self, data, data_weights):
        leaf = TreeNode(True, None, None)
        _, leaf.prediction = self.__calculate_node_error(data, data_weights)
        return leaf
           
    def __find_best_feature(self, data, data_weights, features):
        num_samples = float(len(data))
        best_feature, best_error = None, float('inf')       
        for feature in features:
            left_split = data[data[feature] == 0]
            left_data_weights = data_weights[data[feature] == 0]
            left_error, left_predict = self.__calculate_node_error(left_split, left_data_weights)
            
            right_split = data[data[feature] == 1]
            right_data_weights = data_weights[data[feature] == 1]   
            right_error, right_predict = self.__calculate_node_error(right_split, right_data_weights)
            
            error = (left_error + right_error) / sum(data_weights)
            if error < best_error:
                best_feature, best_error = feature, error
        return best_feature          
    
    def __calculate_node_error(self, data, data_weights):
        weight_positive = sum(data_weights[data[self.label_col]==1])
        error_predict_negative = weight_positive
        
        weight_negative = sum(data_weights[data[self.label_col]==-1])
        error_predict_positive = weight_negative
        
        if error_predict_negative < error_predict_positive:
            return (error_predict_negative, -1)
        else:
            return (error_predict_positive, 1)
           
    def __predict_single_data(self, tree_node, x):
        # Exit Rule
        if tree_node.is_leaf:
            return tree_node.prediction
        
        if x[tree_node.split_feature] == 0:
            return self.__predict_single_data(tree_node.left, x)
        else:
            return self.__predict_single_data(tree_node.right, x)

In [17]:
wdt = WeightedDecisionTree(max_depth=6, min_error=1e-5)
cross_val_score(wdt, X, y, cv=8)

array([       nan,        nan,        nan,        nan,        nan,
              nan,        nan, 0.99211823])

In [18]:
class FullAdaBoost(BaseEstimator):    
    def __init__(self, n_estimators):
        self.n_estimators = n_estimators 
        
    def fit(self, X, y):
        self.models = []
        self.model_weights = []
        
        num_samples = X.shape[0]
        alpha = np.ones(num_samples) / num_samples
        
        for m in range(self.n_estimators):
            tree = WeightedDecisionTree(max_depth=2, min_error=1e-5)
            tree.fit(X, y, data_weights=alpha)
            prediction = tree.predict(X)
            weighted_error = alpha.dot(prediction != y)
            
            model_weight = 0.5 * (np.log(1 - weighted_error) - np.log(weighted_error))
            
            alpha = alpha * np.exp(-model_weight * y * prediction)
            alpha = alpha / alpha.sum()
            
            self.models.append(tree)
            self.model_weights.append(model_weight)          
              
    def predict(self, X):
        num_samples = len(X)
        results = np.zeros(num_samples)
        for model, model_weight in zip(self.models, self.model_weights):
            results += model_weight * model.predict(X)
        return np.sign(results)
    
    def score(self, X, y):
        y_pred = self.predict(X)
        return np.mean(y_pred == y)

In [19]:
ada3 = FullAdaBoost(n_estimators=50)
cross_val_score(ada3, X, y, cv=8)

array([nan, nan, nan, nan, nan, nan, nan,  1.])