# CIS 419/519 
# **Homework 4: AdaBoost and the Challenge**

In [0]:
import pandas as pd
import numpy as np


# Adaboost-SAMME

In [330]:
import numpy as np
import math
from sklearn import tree

class BoostedDT:
    '''
    Multi-class AdaBoost (SAMME-style) ensemble of depth-limited decision trees.

    Each boosting round fits one sklearn DecisionTreeClassifier on the
    training set using the current per-sample weights, gives it a vote
    weight ``beta`` derived from its weighted training error, and then
    re-weights the samples so that misclassified ones count more in the
    next round.  Prediction is a beta-weighted vote over all trees.
    '''

    def __init__(self, numBoostingIters=100, maxTreeDepth=3):
        '''
        Constructor

        Arguments:
            numBoostingIters is the maximum number of boosting rounds
            maxTreeDepth is the max_depth given to every decision tree

        Class Fields
        clfs : List object containing individual DecisionTree classifiers, in order of creation during boosting
        betas : List of beta values, in order of creation during boosting
        classes : 1-D array of the distinct training labels (set by fit)
        K : number of distinct training labels (set by fit)
        '''
        self.clfs = None  # populated by fit()
        self.betas = None  # populated by fit()

        self.numBoostingIters = numBoostingIters
        self.maxTreeDepth = maxTreeDepth
        self.K = None
        self.classes = None

    def fit(self, X, y, random_state=None):
        '''
        Trains the model.
        Every individual decision tree is created with the given random_state.

        Calling fit again discards the previous ensemble and trains from scratch.

        Arguments:
            X is an n-by-d Pandas Data Frame (anything np.asarray accepts works)
            y is an n-by-1 Pandas Data Frame / Series of labels
            random_state is an optional integer value

        Raises:
            ValueError if X is empty or X and y disagree on the number of rows
        '''
        X = np.asarray(X)
        y = np.asarray(y).ravel()
        n = X.shape[0]
        if n == 0:
            raise ValueError("cannot fit BoostedDT on an empty training set")
        if y.shape[0] != n:
            raise ValueError("X and y must contain the same number of rows")

        # Start from a clean slate so that refitting does not stack new trees
        # on top of an ensemble trained for different data.
        self.clfs = []
        self.betas = []
        self.classes = np.unique(y)
        self.K = len(self.classes)

        # Uniform initial sample weights.
        weights = np.full(n, 1.0 / n)

        # Multi-class correction term; max(..., 1) keeps it finite (zero)
        # when only a single class is present.
        log_k_term = np.log(max(self.K - 1, 1))
        tiny = np.finfo(float).eps

        for _ in range(self.numBoostingIters):
            clf = self.get_weightedDT(X, y, weights, self.maxTreeDepth, random_state)
            correct = np.asarray(clf.predict(X)).ravel() == y

            # Weighted training error of this round's tree.
            epsilon = weights[~correct].sum()

            # Clip away from 0 and 1 so the log-odds below stays finite.
            clipped = min(max(epsilon, tiny), 1.0 - tiny)
            beta = 0.5 * (np.log((1.0 - clipped) / clipped) + log_k_term)

            self.betas.append(beta)
            self.clfs.append(clf)

            if epsilon <= 0:
                # A perfect tree leaves nothing to re-weight; further rounds
                # would only reproduce it.
                break

            # incorrect prediction -> -1, correct prediction -> 1
            sign_array = np.where(correct, 1.0, -1.0)
            weights = weights * np.exp(-beta * sign_array)  # update the weights
            weights = weights / weights.sum()  # normalize the weights

    def get_weightedDT(self, X, y, weight, maxTreeDepth, random_seed):
        """
        Fit one decision tree on (X, y) using per-sample weights.

        Inputs:
            X is an n-by-d numpy array
            y is a numpy array of n labels (n-by-1 or flat)
            weight is a numpy array of n sample weights (n-by-1 or flat)
            maxTreeDepth is the max_depth of the tree
            random_seed is passed to the tree as its random_state
        Outputs:
            A fitted DecisionTreeClassifier trained with sample_weight = weight
        """
        # Local import keeps the class usable even if the notebook cell that
        # imports sklearn at module level has not been run.
        from sklearn import tree

        clf = tree.DecisionTreeClassifier(max_depth=maxTreeDepth, random_state=random_seed)
        return clf.fit(X, np.ravel(y), sample_weight=np.ravel(weight))

    def predict(self, X):
        '''
        Used the model to predict values for each instance in X
        Arguments:
            X is an n-by-d Pandas Data Frame
        Returns:
            an n-by-1 Pandas Data Frame of the predictions; every value is one
            of the labels seen during fit
        Raises:
            RuntimeError if the model has not been fitted
        '''
        if not self.clfs:
            raise RuntimeError("BoostedDT.predict called before fit")

        # The trees were fitted on plain arrays, so predict on plain arrays too.
        X = np.asarray(X)
        n = X.shape[0]

        # votes_array[i, k] accumulates the beta of every tree that voted
        # class self.classes[k] for sample i.
        votes_array = np.zeros((n, self.K))
        for clf, beta in zip(self.clfs, self.betas):
            cur_predict = np.asarray(clf.predict(X)).reshape((n, 1))
            # (n, 1) == (K,) broadcasts to an (n, K) one-hot mask.
            votes_array += (cur_predict == self.classes) * beta

        # Return the winning label itself; labels are never remapped.
        y_predict = self.classes[np.argmax(votes_array, axis=1)]
        return pd.DataFrame(y_predict)
            
            
            
        

In [332]:
# Scratch cell: small numpy experiments (boolean masks, broadcasting and
# np.where) used while writing BoostedDT.  Only the three print calls and the
# final expression produce visible output.
w = np.ones((3, 1))
y1 = np.array([1, 0, 1, 1])
y2 = np.array([0, 0, 1, 0])
y3 = np.array([2, 4, 1, 4])

# Selecting / summing the entries where two label vectors disagree.
y3[y1 != y2].sum()
c = (y1 == y2).reshape((4, 1)).astype('int32')
(y1 == y2)[:, np.newaxis]

# Normalising a vector by its sum, then int-vs-float equality.
e = np.asarray([1, 2, 3])
e = e / e.sum()
e = 1.0
a = 1
e == a

# Broadcasting a (4, 1) column against a (3,) row gives a (4, 3) mask.
a = np.zeros((10, 3))
b = np.array([1, 2, 3])
c = np.arange(1, 5).reshape(4, 1)
print('bol: ', 3 * (c == b).astype('int32'))

# Using that mask to overwrite matching cells in place.
e = np.array([
    [1, 2, 3],
    [13, 4, 1],
    [1, 3, 4],
    [1, 2, 1],
])
print('origin e: ', e)
e[np.equal(c, b)] = 2
print('now e: ', e)

# np.where as an element-wise "replace 0 with 1".
a = np.array([1, 0, 0, 1, 0])
np.where(a == 0, 1, a)

bol:  [[3 0 0]
 [0 3 0]
 [0 0 3]
 [0 0 0]]
origin e:  [[ 1  2  3]
 [13  4  1]
 [ 1  3  4]
 [ 1  2  1]]
now e:  [[ 2  2  3]
 [13  2  1]
 [ 1  3  2]
 [ 1  2  1]]


int

# Test BoostedDT

In [331]:
import numpy as np
from sklearn import datasets
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

def test_boostedDT():
    """Compare a plain tree, BoostedDT and sklearn's AdaBoost on breast-cancer data.

    The data set is split 50/50 (random_state=42); each model is trained on
    the first half and its accuracy on the held-out half is printed.
    """
    # Build a DataFrame with the features plus a 'CLASS' target column.
    raw = datasets.load_breast_cancer()
    frame = pd.DataFrame(raw.data, columns=raw.feature_names)
    frame['CLASS'] = pd.Series(raw.target)

    # Random train/test split, then separate features from labels.
    train, test = train_test_split(frame, test_size=0.5, random_state=42)
    X_train, y_train = train.drop(['CLASS'], axis=1), train['CLASS']
    X_test, y_test = test.drop(['CLASS'], axis=1), test['CLASS']

    # Fit the three models in a fixed order: plain tree, our booster, sklearn's.
    plain_tree = DecisionTreeClassifier()
    plain_tree.fit(X_train, y_train)

    our_booster = BoostedDT(numBoostingIters=100, maxTreeDepth=2)
    our_booster.fit(X_train, y_train)

    sk_booster = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=100)
    sk_booster.fit(X_train, y_train)

    # Score every model on the held-out half and report it.
    report = (
        ("Decision Tree Accuracy = ", plain_tree),
        ("My Boosted Decision Tree Accuracy = ", our_booster),
        ("Sklearn's Boosted Decision Tree Accuracy = ", sk_booster),
    )
    scores = [
        (caption, accuracy_score(y_test, model.predict(X_test)))
        for caption, model in report
    ]
    for caption, score in scores:
        print(caption + str(score))

    print()
    for note in (
        "Note that due to randomization, your boostedDT might not always have the ",
        "exact same accuracy as Sklearn's boostedDT.  But, on repeated runs, they ",
        "should be roughly equivalent and should usually exceed the standard DT.",
    ):
        print(note)

# Run the comparison; the three accuracies are printed to the cell output.
test_boostedDT()

Decision Tree Accuracy = 0.9263157894736842
My Boosted Decision Tree Accuracy = 0.9649122807017544
Sklearn's Boosted Decision Tree Accuracy = 0.9649122807017544

Note that due to randomization, your boostedDT might not always have the 
exact same accuracy as Sklearn's boostedDT.  But, on repeated runs, they 
should be roughly equivalent and should usually exceed the standard DT.


In [312]:
# Preprocessing Training Data
import pandas as pd
import numpy as np
# Directory prefix for the ChocolatePipes CSV files ("" = working directory).
baseDir = ""

# Read the two test sets, the training features and the training labels.
df_cp_gradingTest, df_cp_leaderTest, df_cp_trainData, df_cp_trainLabels = (
    pd.read_csv(baseDir + csv_name)
    for csv_name in (
        'ChocolatePipes_gradingTestData.csv',
        'ChocolatePipes_leaderboardTestData.csv',
        'ChocolatePipes_trainData.csv',
        'ChocolatePipes_trainLabels.csv',
    )
)

# Attach labels to features; the inner join keeps only ids present in both files.
df_cp_train = pd.merge(df_cp_trainData, df_cp_trainLabels, how='inner', on='id')
print(
    'The merged length is: ', len(df_cp_train),
    '. The raw data length is: ', len(df_cp_trainData),
    '. The raw label is: ', len(df_cp_trainLabels),
)

print(' The data statistics summary: ')
display(df_cp_train.describe())

# Fraction of missing values per column; keep only columns that have any.
df_missingRatio = df_cp_train.isna().mean()
print('The missing features summary: ')
df_missingFeatures = df_missingRatio[df_missingRatio != 0]
print(df_missingFeatures)

display(df_cp_train)

The merged length is:  47467 . The raw data length is:  47520 . The raw label is:  47472
 The data statistics summary: 


Unnamed: 0,id,Size of chocolate pool,Country funded by,Height of pipe,oompa loomper,longitude,Lattitude,Cocoa farm,Country of factory,Region code,District code,Location,Chocolate consumers in town,Oompa loompa management,Year constructed,Type of pump,management,Payment scheme,chocolate_quality,chocolate_quantity,chocolate_source,chocolate_source_class,pipe_type,label
count,47467.0,47467.0,44601.0,47467.0,44580.0,47467.0,47467.0,47467.0,47163.0,47467.0,47467.0,47467.0,47467.0,44340.0,47467.0,47467.0,47467.0,47467.0,47467.0,47467.0,47467.0,47467.0,47467.0,47467.0
mean,37191.715297,374.600625,141.115289,800.150273,151.899955,227.93598,319.292247,3.648872,5989.42175,340.275455,6.651547,50.317442,819.968905,1.855413,1305.837129,6.339562,1.178545,2.315482,0.29629,0.72404,3.235595,0.233215,1.077865,1.156909
std,21474.132526,2985.590707,253.131028,762.398823,299.948528,40.657547,2.943134,2.418489,5353.572098,17.585556,9.685951,33.008031,751.897946,2.847421,953.434661,4.209997,2.254,2.271601,0.905013,0.990391,2.423525,0.433898,1.161506,0.949571
min,0.0,0.0,0.0,0.0,0.0,0.0,313.35056,0.0,0.0,326.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,18626.5,0.0,20.0,0.0,5.0,233.093634,316.466155,1.0,1260.0,330.0,3.0,20.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,37194.0,0.0,46.0,569.0,32.0,234.921073,319.972462,4.0,4479.0,337.0,4.0,44.0,1025.0,0.0,1991.0,4.0,0.0,2.0,0.0,0.0,3.0,0.0,1.0,2.0
75%,55764.5,220.0,137.0,1520.0,150.0,237.18433,321.672365,6.0,9739.5,342.0,6.0,79.0,1215.0,4.0,2009.0,9.0,1.0,4.0,0.0,1.0,5.0,0.0,2.0,2.0
max,74247.0,350200.0,1897.0,2970.0,2145.0,240.345193,325.0,8.0,19287.0,424.0,81.0,124.0,16300.0,12.0,2018.0,17.0,11.0,6.0,7.0,4.0,9.0,2.0,6.0,2.0


The missing features summary: 
Country funded by              0.060379
oompa loomper                  0.060821
Country of factory             0.006404
Does factory offer tours       0.055681
Oompa loompa management        0.065877
Official or Unofficial pipe    0.051088
dtype: float64


Unnamed: 0,id,Size of chocolate pool,Date of entry,Country funded by,Height of pipe,oompa loomper,longitude,Lattitude,Cocoa farm,Country of factory,Region code,District code,Location,Chocolate consumers in town,Does factory offer tours,Recorded by,Oompa loompa management,Official or Unofficial pipe,Year constructed,Type of pump,management,management_group,Payment scheme,chocolate_quality,chocolate_quantity,chocolate_source,chocolate_source_class,pipe_type,label
0,71141,0.0,3/15/14,,1239,,235.841139,321.664266,4,9578.0,327,2,110,1001,False,Chocolate Lovers Consultants Ltd,6.0,,2005,3,5,user-group,0,0,0,0,0,0,2
1,33880,0.0,2/2/14,41.0,1277,5.0,230.024915,320.057996,5,4418.0,341,4,19,1150,False,Chocolate Lovers Consultants Ltd,0.0,Official pipe,1999,3,0,user-group,6,0,0,6,1,0,2
2,29168,700.0,3/16/14,18.0,856,14.0,237.506154,320.995123,2,14700.0,328,4,18,1750,True,Chocolate Lovers Consultants Ltd,7.0,Unofficial pipe,2017,4,0,user-group,3,1,0,3,0,2,2
3,15743,0.0,2/23/12,140.0,400,5.0,238.880254,317.263155,7,5171.0,385,54,108,1001,True,Chocolate Lovers Consultants Ltd,0.0,Official pipe,2010,7,0,user-group,0,0,0,3,0,0,0
4,28450,0.0,2/17/14,336.0,1780,5.0,235.649350,320.647179,4,3940.0,346,2,84,1050,,Chocolate Lovers Consultants Ltd,0.0,Official pipe,2014,8,0,user-group,0,0,0,5,0,2,2
5,48952,3800.0,1/29/14,20.0,1099,5.0,229.820544,320.117498,5,11486.0,341,4,19,1135,True,Chocolate Lovers Consultants Ltd,4.0,Official pipe,1984,3,0,user-group,2,0,0,6,1,1,2
6,37939,0.0,7/20/12,53.0,0,86.0,0.000000,325.000000,1,15957.0,344,7,36,0,True,Chocolate Lovers Consultants Ltd,0.0,Official pipe,0,0,0,user-group,3,0,1,3,0,2,2
7,70142,0.0,3/6/12,20.0,697,5.0,235.659584,315.984362,7,1385.0,330,4,43,1150,True,Chocolate Lovers Consultants Ltd,,Official pipe,2013,8,0,user-group,0,3,1,5,0,2,0
8,6733,250.0,1/25/14,,1901,,234.881585,320.116965,4,10804.0,338,3,32,1400,True,Chocolate Lovers Consultants Ltd,0.0,,2005,14,0,user-group,4,0,1,3,0,0,2
9,73441,0.0,7/29/12,11.0,0,5.0,230.583390,322.547317,1,2825.0,343,31,88,0,True,Chocolate Lovers Consultants Ltd,0.0,Unofficial pipe,0,3,0,user-group,0,0,2,0,0,0,0


In [313]:
# Number of distinct values in the 'management' column.
print('The length is: ', len(df_cp_train['management'].unique()))

The length is:  12


In [314]:
# Drop columns judged uninformative by inspection:
#   - 'Recorded by'                  : every row was recorded by the same entity
#   - 'Date of entry'                : little or no expected correlation with the label
#   - 'id'                           : row identifier only
#   - 'Region code', 'District code' : treated as redundant given longitude,
#                                      Lattitude and Location
df_cp_train.drop(
    labels=['Recorded by', 'Date of entry', 'id', 'Region code', 'District code'],
    axis=1,
    inplace=True,
)


In [315]:
# Impute every column that had missing values with that column's mode
# (the affected columns are all treated as categorical).
missingFeatures_col = df_missingFeatures.index
# .mode() returns one row per tied mode; row 0 gives a Series of column -> mode.
missingFeatures_mode = df_cp_train[missingFeatures_col].mode().iloc[0]
df_cp_train.loc[:, missingFeatures_col] = (
    df_cp_train[missingFeatures_col].fillna(value=missingFeatures_mode)
)

In [316]:
# One-hot encode the categorical columns, then move 'label' to the last column.
catagorical_features = [
    'chocolate_quality',
    'chocolate_quantity',
    'chocolate_source',
    'chocolate_source_class',
    'pipe_type',
    'Country funded by',
    'Location',
    'Does factory offer tours',
    'Oompa loompa management',
    'Cocoa farm',
    'Country of factory',
    'Official or Unofficial pipe',
    'Type of pump',
    'Payment scheme',
    'management',
    'management_group',
]
df_cp_train = pd.get_dummies(df_cp_train, columns=catagorical_features)

# Re-append the label so it ends up after all the generated dummy columns.
label_col = df_cp_train['label']
df_cp_train = pd.concat([df_cp_train.drop(columns='label'), label_col], axis=1)

In [317]:
# 'Year constructed' contains 0, which is not a plausible year; treat it as
# a placeholder and replace it with the most frequent non-zero year.
year_col = df_cp_train['Year constructed']
# NOTE: despite its name, year_mean holds the mode of the non-zero years.
year_mean = year_col[year_col != 0].mode()
df_cp_train.loc[:, 'Year constructed'] = year_col.replace(to_replace=0, value=year_mean.iloc[0])