# CIS 419/519 
#**Homework 4 : Adaboost and the Challenge**

In [229]:
import pandas as pd
import numpy as np


# Adaboost-SAMME

In [230]:
import numpy as np
import math
from sklearn import tree

class BoostedDT:

    def __init__(self, numBoostingIters=100, maxTreeDepth=3):
        '''
        Constructor

        Class Fields 
        clfs : List object containing individual DecisionTree classifiers, in order of creation during boosting
        betas : List of beta values, in order of creation during boosting
        '''

        self.clfs = None  # keep the class fields, and be sure to keep them updated during boosting
        self.betas = None
        
        #TODO
        self.numBoostingIters = numBoostingIters
        self.maxTreeDepth = maxTreeDepth
        self.K = None
        self.classes = None
        



    def fit(self, X, y, random_state=None):
        '''
        Trains the model. 
        Be sure to initialize all individual Decision trees with the provided random_state value if provided.
        
        Arguments:
            X is an n-by-d Pandas Data Frame
            y is an n-by-1 Pandas Data Frame
            random_seed is an optional integer value
        '''
        #TODO
        X = X.to_numpy()
        y = y.to_numpy()
        n, d = X.shape
        y = y.reshape((n, 1))
        weights = np.zeros((n, 1)) + 1 / n
        
        
        # Initialization
        if self.betas is None:
            self.betas = []
        if self.clfs is None:
            self.clfs = []
        if self.classes is None:
            self.classes = np.unique(y)
        if self.K is None:
            self.K = len(self.classes)
            self.classes.reshape((self.K, 1))
        
        
        for iter in range(self.numBoostingIters):
            clf = self.get_weightedDT(X, y, weights, self.maxTreeDepth, random_state)
            y_train = clf.predict(X).reshape((n, 1))
            epsilon = (weights[ (~(y_train == y)).reshape((n, 1)) ]).sum() # weighted training error
            beta = 0.5 * (np.log( (1 - epsilon) / epsilon ) + np.log(self.K - 1)) # beta, the importance for current model
            # print('beta: ', beta)
            self.betas.append(beta)
            self.clfs.append(clf)
            accuracy_array = (y_train == y).astype('int32').reshape((n, 1))
            # incorrect prediction -> -1, correct prediction -> 1
            sign_array = np.where( accuracy_array == 0, -1, accuracy_array)
            weights = weights * np.exp(-beta * sign_array) # update the weight
            weights = weights / sum(weights) # normalize the weights
    
    def get_weightedDT(self, X, y, weight, maxTreeDepth, random_seed):
        """
        Inputs:
            X is an n-by-d numpy array
            y is an n-by-1 numpy array
            weight is an n-by-1 numpy array
        Outputs:
            A Decision Tree Model with weighted bootstrap sampling
        """
        
        # Resampling n instanced based on the weight
        from sklearn import tree
        n, d = X.shape
        weight = weight.reshape((n,))
        clf = tree.DecisionTreeClassifier(max_depth = maxTreeDepth, random_state = random_seed)
        clf = clf.fit(X, y, sample_weight = weight)
        return clf
        

    def predict(self, X):
        '''
        Used the model to predict values for each instance in X
        Arguments:
            X is an n-by-d Pandas Data Frame
        Returns:
            an n-by-1 Pandas Data Frame of the predictions
        '''
        #TODO
        n, d = X.shape
        votes_array = np.zeros((n, self.K))
        for index, clf in enumerate(self.clfs):
            cur_predict = clf.predict(X).reshape((n, 1))
            # fill out the votes array
            votes_array = votes_array + (cur_predict == self.classes).astype('int32') * self.betas[index]
        y_predict = self.classes[np.argmax(votes_array, axis = 1)] # find the index associated with max votes
        y_predict = pd.DataFrame( np.where(y_predict == -1, 0, y_predict) )
        
        return y_predict
            
            
            
        

# Test BoostedDT

In [231]:
import numpy as np
from sklearn import datasets
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

def test_boostedDT():

  # load the data set
  sklearn_dataset = datasets.load_breast_cancer()
  # convert to pandas df
  df = pd.DataFrame(sklearn_dataset.data,columns=sklearn_dataset.feature_names)
  df['CLASS'] = pd.Series(sklearn_dataset.target)
  df.head()

  # split randomly into training/testing
  train, test = train_test_split(df, test_size=0.5, random_state=42)
  # Split into X,y matrices
  X_train = train.drop(['CLASS'], axis=1)
  y_train = train['CLASS']
  X_test = test.drop(['CLASS'], axis=1)
  y_test = test['CLASS']


  # train the decision tree
  modelDT = DecisionTreeClassifier()
  modelDT.fit(X_train, y_train)

  # train the boosted DT
  modelBoostedDT = BoostedDT(numBoostingIters=100, maxTreeDepth=2)
  modelBoostedDT.fit(X_train, y_train)

  # train sklearn's implementation of Adaboost
  modelSKBoostedDT = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=100)
  modelSKBoostedDT.fit(X_train, y_train)

  # output predictions on the test data
  ypred_DT = modelDT.predict(X_test)
  ypred_BoostedDT = modelBoostedDT.predict(X_test)
  ypred_SKBoostedDT = modelSKBoostedDT.predict(X_test)

  # compute the training accuracy of the model
  accuracy_DT = accuracy_score(y_test, ypred_DT)
  accuracy_BoostedDT = accuracy_score(y_test, ypred_BoostedDT)
  accuracy_SKBoostedDT = accuracy_score(y_test, ypred_SKBoostedDT)

  print("Decision Tree Accuracy = "+str(accuracy_DT))
  print("My Boosted Decision Tree Accuracy = "+str(accuracy_BoostedDT))
  print("Sklearn's Boosted Decision Tree Accuracy = "+str(accuracy_SKBoostedDT))
  print()
  print("Note that due to randomization, your boostedDT might not always have the ")
  print("exact same accuracy as Sklearn's boostedDT.  But, on repeated runs, they ")
  print("should be roughly equivalent and should usually exceed the standard DT.")

test_boostedDT()

Decision Tree Accuracy = 0.9228070175438596
My Boosted Decision Tree Accuracy = 0.9649122807017544
Sklearn's Boosted Decision Tree Accuracy = 0.9578947368421052

Note that due to randomization, your boostedDT might not always have the 
exact same accuracy as Sklearn's boostedDT.  But, on repeated runs, they 
should be roughly equivalent and should usually exceed the standard DT.


## Preprocessing Test Data 

In [232]:
# # # Preprocessing Training Data
import pandas as pd
import numpy as np
baseDir = ""
df_cp_trainData = pd.read_csv(baseDir + 'ChocolatePipes_trainData.csv')
df_cp_trainLabels = pd.read_csv(baseDir + 'ChocolatePipes_trainLabels.csv')
df_cp_train = df_cp_trainData.merge(df_cp_trainLabels, how = 'inner', on = 'id')
print('The merged length is: ', len(df_cp_train), '. The raw data length is: ', len(df_cp_trainData)
      , '. The raw label is: ', len(df_cp_trainLabels))
print(' The data statistics summary: ')
display(df_cp_train.describe())

The merged length is:  47467 . The raw data length is:  47520 . The raw label is:  47472
 The data statistics summary: 


Unnamed: 0,id,Size of chocolate pool,Country funded by,Height of pipe,oompa loomper,longitude,Lattitude,Cocoa farm,Country of factory,Region code,...,Year constructed,Type of pump,management,Payment scheme,chocolate_quality,chocolate_quantity,chocolate_source,chocolate_source_class,pipe_type,label
count,47467.0,47467.0,44601.0,47467.0,44580.0,47467.0,47467.0,47467.0,47163.0,47467.0,...,47467.0,47467.0,47467.0,47467.0,47467.0,47467.0,47467.0,47467.0,47467.0,47467.0
mean,37191.715297,374.600625,141.115289,800.150273,151.899955,227.93598,319.292247,3.648872,5989.42175,340.275455,...,1305.837129,6.339562,1.178545,2.315482,0.29629,0.72404,3.235595,0.233215,1.077865,1.156909
std,21474.132526,2985.590707,253.131028,762.398823,299.948528,40.657547,2.943134,2.418489,5353.572098,17.585556,...,953.434661,4.209997,2.254,2.271601,0.905013,0.990391,2.423525,0.433898,1.161506,0.949571
min,0.0,0.0,0.0,0.0,0.0,0.0,313.35056,0.0,0.0,326.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,18626.5,0.0,20.0,0.0,5.0,233.093634,316.466155,1.0,1260.0,330.0,...,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,37194.0,0.0,46.0,569.0,32.0,234.921073,319.972462,4.0,4479.0,337.0,...,1991.0,4.0,0.0,2.0,0.0,0.0,3.0,0.0,1.0,2.0
75%,55764.5,220.0,137.0,1520.0,150.0,237.18433,321.672365,6.0,9739.5,342.0,...,2009.0,9.0,1.0,4.0,0.0,1.0,5.0,0.0,2.0,2.0
max,74247.0,350200.0,1897.0,2970.0,2145.0,240.345193,325.0,8.0,19287.0,424.0,...,2018.0,17.0,11.0,6.0,7.0,4.0,9.0,2.0,6.0,2.0


#### 1. Remove some unnecessary information by reasoning

In [233]:
# # Remove some unnecessary information by reasoning
# 1. Recorded by: all the data recorded by the same person
# 2. Date of entry: less/no correlation
# 3. Id: no correlation
# 4. Region and District code: seems redundant as there is already longitude, lattitude and Location
df_cp_train.drop(columns = ['Recorded by', 'Date of entry', 'id', 'Region code', 'District code',
                           'Country of factory'], inplace = True)


#### 2. Missing Value Handling

In [234]:
df_missingRatio = df_cp_train.isna().mean()
print('The missing features summary: ')
df_missingFeatures = df_missingRatio[~(df_missingRatio == 0)]
print(df_missingFeatures)
display(df_cp_train)

The missing features summary: 
Country funded by              0.060379
oompa loomper                  0.060821
Does factory offer tours       0.055681
Oompa loompa management        0.065877
Official or Unofficial pipe    0.051088
dtype: float64


Unnamed: 0,Size of chocolate pool,Country funded by,Height of pipe,oompa loomper,longitude,Lattitude,Cocoa farm,Location,Chocolate consumers in town,Does factory offer tours,...,Type of pump,management,management_group,Payment scheme,chocolate_quality,chocolate_quantity,chocolate_source,chocolate_source_class,pipe_type,label
0,0.0,,1239,,235.841139,321.664266,4,110,1001,False,...,3,5,user-group,0,0,0,0,0,0,2
1,0.0,41.0,1277,5.0,230.024915,320.057996,5,19,1150,False,...,3,0,user-group,6,0,0,6,1,0,2
2,700.0,18.0,856,14.0,237.506154,320.995123,2,18,1750,True,...,4,0,user-group,3,1,0,3,0,2,2
3,0.0,140.0,400,5.0,238.880254,317.263155,7,108,1001,True,...,7,0,user-group,0,0,0,3,0,0,0
4,0.0,336.0,1780,5.0,235.649350,320.647179,4,84,1050,,...,8,0,user-group,0,0,0,5,0,2,2
5,3800.0,20.0,1099,5.0,229.820544,320.117498,5,19,1135,True,...,3,0,user-group,2,0,0,6,1,1,2
6,0.0,53.0,0,86.0,0.000000,325.000000,1,36,0,True,...,0,0,user-group,3,0,1,3,0,2,2
7,0.0,20.0,697,5.0,235.659584,315.984362,7,43,1150,True,...,8,0,user-group,0,3,1,5,0,2,0
8,250.0,,1901,,234.881585,320.116965,4,32,1400,True,...,14,0,user-group,4,0,1,3,0,0,2
9,0.0,11.0,0,5.0,230.583390,322.547317,1,88,0,True,...,3,0,user-group,0,0,2,0,0,0,0


In [235]:
print('The length is: ', len(pd.unique(df_cp_train.loc[:,'management'])))

The length is:  12


In [236]:
# # Fill the missing ratio with its mode. Please note all the missing features are categorical.
missingFeatures_col = df_missingFeatures.index
missingFeatures_mode = df_cp_train.loc[:, missingFeatures_col].mode().iloc[0] # Convert to Series
df_cp_train.loc[:, missingFeatures_col] = df_cp_train.loc[:, missingFeatures_col].fillna(missingFeatures_mode)

#### 3. One-Hot Encoding (OHE)

In [237]:
# # OHE
catagorical_features = ['chocolate_quality', 'chocolate_quantity', 'chocolate_source', 
                       'chocolate_source_class', 'pipe_type', 'Country funded by',
                       'Location', 'Does factory offer tours', 'Oompa loompa management',
                       'Cocoa farm', 'Official or Unofficial pipe',
                       'Type of pump', 'Payment scheme', 'management', 'management_group']
df_cp_train = pd.get_dummies(data = df_cp_train, columns = catagorical_features)
label_col = df_cp_train.loc[:, 'label']
df_cp_train.drop(columns = 'label', inplace = True)
df_cp_train = pd.concat([df_cp_train, label_col], axis = 1)

#### 4. Outlier Handling

In [238]:
# # Outlier in Numerical values
# by observations, there exists a lots of 0 in numerical features, which seems impossible
numerical_features = ['Size of chocolate pool', 'Height of pipe', 'oompa loomper', 'longitude',
                      'Lattitude', 'Chocolate consumers in town', 'Year constructed']
df_numerical = df_cp_train.loc[:, numerical_features]
df_numerical = df_numerical.replace(0, np.nan) # for computing average by skipping nan
df_numerical_mean = df_numerical.mean()
for col in range(len(df_numerical_mean)):
    df_numerical.iloc[:, col] = df_numerical.iloc[:, col].fillna(df_numerical_mean[col])
df_cp_train.loc[:, numerical_features] = df_numerical

## Preprocessing Test Data (Repeat same procedure)

In [239]:
# # # Preprocessing Test Data (Repeat same procedure)
df_cp_gradingTest = pd.read_csv(baseDir + 'ChocolatePipes_gradingTestData.csv')
df_cp_leaderTest = pd.read_csv(baseDir + 'ChocolatePipes_leaderboardTestData.csv')

#### 1. Remove some unnecessary information by reasoning

In [240]:
# # Remove some unnecessary information by reasoning
# store the ids
id_LT = pd.DataFrame(df_cp_leaderTest.loc[:, 'id'])
id_GT = pd.DataFrame(df_cp_gradingTest.loc[:, 'id'])
df_cp_leaderTest.drop(columns = ['Recorded by', 'Date of entry', 'id', 'Region code', 'District code',
                           'Country of factory'], inplace = True)
df_cp_gradingTest.drop(columns = ['Recorded by', 'Date of entry', 'id', 'Region code', 'District code',
                           'Country of factory'], inplace = True)

#### 2. Missing Value Handling

In [241]:
# # Missing Value Handling
df_LT_missingRatio = df_cp_leaderTest.isna().mean()
df_GT_missingRatio = df_cp_gradingTest.isna().mean()
df_LT_missingFeatures = df_LT_missingRatio[~(df_LT_missingRatio == 0)]
df_GT_missingFeatures = df_GT_missingRatio[~(df_GT_missingRatio == 0)]
print('The missing features summary: ')
print('Leader Board Test: ')
print(df_LT_missingFeatures)
print('Grading Board Test: ')
print(df_GT_missingFeatures)
# # Fill the missing ratio with its mode. Please note all the missing features are categorical.
missingFeatures_LT_col = df_LT_missingFeatures.index
missingFeatures_GT_col = df_GT_missingFeatures.index
missingFeatures_LT_mode = df_cp_leaderTest.loc[:, missingFeatures_LT_col].mode().iloc[0] # Convert to Series
missingFeatures_GT_mode = df_cp_gradingTest.loc[:, missingFeatures_GT_col].mode().iloc[0] # Convert to Series

df_cp_leaderTest.loc[:, missingFeatures_LT_col] = df_cp_leaderTest.loc[:, missingFeatures_LT_col].fillna(missingFeatures_LT_mode)
df_cp_gradingTest.loc[:, missingFeatures_GT_col] = df_cp_gradingTest.loc[:, missingFeatures_GT_col].fillna(missingFeatures_GT_mode)

print('Leader Board Test Data: ')
display(df_cp_leaderTest)
print('Grading Test Data: ')
display(df_cp_gradingTest)

The missing features summary: 
Leader Board Test: 
Country funded by              0.064310
oompa loomper                  0.064310
Does factory offer tours       0.057912
Oompa loompa management        0.061616
Official or Unofficial pipe    0.053704
dtype: float64
Grading Board Test: 
Country funded by              0.064478
oompa loomper                  0.064310
Does factory offer tours       0.057744
Oompa loompa management        0.064310
Official or Unofficial pipe    0.052020
dtype: float64
Leader Board Test Data: 


Unnamed: 0,Size of chocolate pool,Country funded by,Height of pipe,oompa loomper,longitude,Lattitude,Cocoa farm,Location,Chocolate consumers in town,Does factory offer tours,...,Year constructed,Type of pump,management,management_group,Payment scheme,chocolate_quality,chocolate_quantity,chocolate_source,chocolate_source_class,pipe_type
0,250.0,20.0,382.0,5.0,238.787973,319.805660,2,39,1700,True,...,1988,0,0,user-group,4,1,1,5,0,2
1,2200.0,84.0,2011.0,75.0,235.928711,316.736912,7,29,1025,True,...,2014,3,0,user-group,2,0,0,0,0,0
2,0.0,146.0,0.0,69.0,233.214210,321.331113,4,6,0,False,...,0,9,0,user-group,0,0,3,5,0,3
3,0.0,20.0,0.0,5.0,234.380247,316.232795,7,44,0,False,...,0,3,4,user-group,0,0,3,6,1,0
4,300.0,70.0,205.0,62.0,239.186970,318.084512,6,76,1120,True,...,2005,14,1,user-group,4,0,0,3,0,0
5,0.0,108.0,0.0,522.0,232.947042,321.883939,1,59,0,True,...,0,9,0,user-group,0,2,3,5,0,3
6,0.0,98.0,0.0,5.0,232.220180,321.348128,5,48,0,True,...,0,8,8,parastatal,6,0,2,5,0,3
7,0.0,102.0,1762.0,5.0,231.161804,317.445534,8,95,1001,True,...,1985,7,1,user-group,6,3,4,3,0,0
8,0.0,11.0,1072.0,9.0,236.047885,314.382303,3,10,1180,True,...,1999,15,1,user-group,5,0,1,5,0,2
9,0.0,39.0,0.0,31.0,232.860794,315.721684,0,74,0,False,...,0,8,0,user-group,5,0,1,5,0,2


Grading Test Data: 


Unnamed: 0,Size of chocolate pool,Country funded by,Height of pipe,oompa loomper,longitude,Lattitude,Cocoa farm,Location,Chocolate consumers in town,Does factory offer tours,...,Year constructed,Type of pump,management,management_group,Payment scheme,chocolate_quality,chocolate_quantity,chocolate_source,chocolate_source_class,pipe_type
0,900.0,41.0,1531.0,97.0,230.595985,321.331451,5,30,1430,True,...,1998,3,0,user-group,2,0,1,6,1,1
1,0.0,46.0,0.0,84.0,233.679086,315.998964,0,79,0,False,...,0,3,1,user-group,5,0,0,6,1,0
2,0.0,20.0,208.0,175.0,238.761893,317.003683,7,108,1001,True,...,1979,7,0,user-group,0,0,2,3,0,1
3,0.0,441.0,0.0,5.0,232.925101,322.361938,1,120,0,True,...,0,15,0,user-group,0,0,0,5,0,2
4,0.0,20.0,1485.0,5.0,237.372817,321.729784,2,20,1001,False,...,1983,3,0,user-group,0,0,1,9,2,3
5,210.0,23.0,1702.0,82.0,235.663016,321.629124,4,89,1250,True,...,2003,3,0,user-group,4,5,1,6,1,1
6,0.0,284.0,1034.0,278.0,229.643541,320.138473,5,119,7302,False,...,1990,9,2,other,0,0,2,8,1,1
7,0.0,20.0,2028.0,5.0,236.585967,321.763156,2,26,1150,True,...,1995,3,0,user-group,6,0,1,0,0,0
8,0.0,20.0,1160.0,5.0,235.873700,321.623891,4,110,1001,False,...,2005,3,5,user-group,0,0,0,0,0,0
9,0.0,59.0,0.0,5.0,233.639000,321.686884,4,11,0,True,...,0,8,1,user-group,1,0,0,5,0,2


#### 3. One-Hot Encoding

In [242]:
# # OHE
catagorical_features = ['chocolate_quality', 'chocolate_quantity', 'chocolate_source', 
                       'chocolate_source_class', 'pipe_type', 'Country funded by',
                       'Location', 'Does factory offer tours', 'Oompa loompa management',
                       'Cocoa farm', 'Official or Unofficial pipe',
                       'Type of pump', 'Payment scheme', 'management', 'management_group']
df_cp_leaderTest = pd.get_dummies(data = df_cp_leaderTest, columns = catagorical_features)
df_cp_gradingTest = pd.get_dummies(data = df_cp_gradingTest, columns = catagorical_features)

#### 4. Outlier Handling

In [243]:
# # Outlier in Numerical values
# by observations, there exists a lots of 0 in numerical features, which seems impossible
numerical_features = ['Size of chocolate pool', 'Height of pipe', 'oompa loomper', 'longitude',
                      'Lattitude', 'Chocolate consumers in town', 'Year constructed']
df_LT_numerical = df_cp_leaderTest.loc[:, numerical_features]
df_GT_numerical = df_cp_gradingTest.loc[:, numerical_features]

df_LT_numerical = df_LT_numerical.replace(0, np.nan) # for computing average by skipping nan
df_GT_numerical = df_GT_numerical.replace(0, np.nan) # for computing average by skipping nan

df_LT_numerical_mean = df_LT_numerical.mean()
df_GT_numerical_mean = df_GT_numerical.mean()

for col in range(len(df_LT_numerical_mean)):
    df_LT_numerical.iloc[:, col] = df_LT_numerical.iloc[:, col].fillna(df_LT_numerical_mean[col])
for col in range(len(df_GT_numerical_mean)):
    df_GT_numerical.iloc[:, col] = df_GT_numerical.iloc[:, col].fillna(df_GT_numerical_mean[col])
    
df_cp_leaderTest.loc[:, numerical_features] = df_LT_numerical
df_cp_gradingTest.loc[:, numerical_features] = df_GT_numerical

## Trim the feature for consistency

In [244]:
col_LT = df_cp_leaderTest.columns.tolist()
col_GT = df_cp_gradingTest.columns.tolist()
col_Train = df_cp_train.columns.tolist()
LT_Train_col = set(col_Train) & set(col_LT) # used for predicting Leader board data
GT_Train_col = set(col_Train) & set(col_GT) # used for predicting Grading data

# store the training y label
y_LT = pd.DataFrame(df_cp_train.loc[:, 'label'])

#### Leader Board Data Section with AdaBoost

In [205]:
# Trim the training data for leader board test
trainingData_LT = pd.DataFrame(df_cp_train.loc[:, LT_Train_col])
df_cp_leaderTest = pd.DataFrame(df_cp_leaderTest.loc[:, LT_Train_col])

In [206]:
# # Run the Model, predict the Leader board data
Ada_model_LT = BoostedDT(numBoostingIters=100, maxTreeDepth=13)
Ada_model_LT.fit(trainingData_LT, y_LT)
y_predict_LT = Ada_model_LT.predict(df_cp_leaderTest)

KeyboardInterrupt: 

In [93]:
# # Find the training error
Ada_model_LT = BoostedDT(numBoostingIters=100, maxTreeDepth=13)
Ada_model_LT.fit(trainingData_LT, y_LT)
y_trainError_pred = Ada_model_LT.predict(trainingData_LT)

In [104]:
# # Compute the accuracy
y_trainError_pred.columns = ['label']
trainingAcc = (y_trainError_pred == y_LT).mean()
print('The AdaBoost Training Accuracy: ', trainingAcc[0])

The AdaBoost Training Accuracy:  0.993300608844039


In [62]:
y_predict_LT.columns = ['label']
LT_final_pred = pd.concat([id_LT, y_predict_LT], axis = 1)
LT_final_pred.to_csv('predictions-leaderboard-BoostedDT.csv', index=False)

#### Grading Data Section with AdaBoost

In [245]:
# Trim the training data for grading test
trainingData_GT = pd.DataFrame(df_cp_train.loc[:, GT_Train_col])
df_cp_gradingTest = pd.DataFrame(df_cp_gradingTest.loc[:, GT_Train_col])

In [246]:
# # Run the Model, predict the Leader board data
Ada_model_GT = BoostedDT(numBoostingIters=100, maxTreeDepth=13)
Ada_model_GT.fit(trainingData_GT, y_LT)
y_predict_GT = Ada_model_GT.predict(df_cp_gradingTest)

In [247]:
y_predict_GT.columns = ['label']
GT_final_pred = pd.concat([id_GT, y_predict_GT], axis = 1)
GT_final_pred.to_csv('predictions-grading-BoostedDT.csv', index=False)

Unnamed: 0,id,label
0,47224,0
1,21217,2
2,40243,0
3,260,2
4,22148,0
5,11921,2
6,40024,0
7,36970,0
8,7167,0
9,3651,2


## SVM Predictions

In [188]:
# # scale the data
# concat X and y for split
trainingData_LT = pd.concat([trainingData_LT, y_LT], axis = 1)
trainingData_LT, test = train_test_split(trainingData_LT, test_size=0.5, random_state=42)
# separate again
y_LT = trainingData_LT.loc[:, 'label']
trainingData_LT = trainingData_LT.drop(['label'], axis = 1)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(-1,1)).fit(trainingData_LT)
X_train_SVM = scaler.transform(trainingData_LT)
X_test_LT_SVM = scaler.transform(df_cp_leaderTest)

ValueError: operands could not be broadcast together with shapes (5940,839) (761,) (5940,839) 

In [64]:
from sklearn import svm
model_SVM = svm.SVC(decision_function_shape = 'ovo')
model_SVM.fit(X_train_SVM, np.ravel(y_LT))
y_predict_LT_SVM = model_SVM.predict(X_test_LT_SVM)

In [65]:
y_predict_LT_SVM = pd.DataFrame(y_predict_LT_SVM)
y_predict_LT_SVM.columns = ['label']
LT_final_pred_SVM = pd.concat([id_LT, y_predict_LT_SVM], axis = 1)
LT_final_pred_SVM.to_csv('predictions-leaderboard-SVC.csv', index=False)

In [105]:
# # Find the training error
y_trainError_pred_SVM = model_SVM.predict(X_train_SVM)

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [125]:
y_trainError_pred_SVM = pd.DataFrame(y_trainError_pred_SVM)
y_trainError_pred_SVM.columns = ['label']
y_LT = pd.DataFrame(y_LT)
trainingAcc_SVM = (y_trainError_pred_SVM.to_numpy() == y_LT.to_numpy()).mean()
print('SVM Training Accuracy is: ', trainingAcc_SVM)

0.7926515821851431

#### Grading Data Section

In [224]:
# # scale the data
# concat X and y for split
trainingData_GT = pd.concat([trainingData_GT, y_LT], axis = 1)
trainingData_GT, test = train_test_split(trainingData_GT, test_size=0.5, random_state=42)
# separate again
y_LT = trainingData_GT.loc[:, 'label']
trainingData_GT = trainingData_GT.drop(['label'], axis = 1)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(-1,1)).fit(trainingData_GT)
X_train_SVM = scaler.transform(trainingData_GT)
X_test_GT_SVM = scaler.transform(df_cp_gradingTest)

In [225]:
model_SVM = svm.SVC(decision_function_shape = 'ovo')
model_SVM.fit(X_train_SVM, np.ravel(y_LT))
y_predict_GT_SVM = model_SVM.predict(X_test_GT_SVM)

In [226]:
y_predict_GT_SVM = pd.DataFrame(y_predict_GT_SVM)
y_predict_GT_SVM.columns = ['label']
GT_final_pred_SVM = pd.concat([id_GT, y_predict_GT_SVM], axis = 1)
GT_final_pred_SVM.to_csv('predictions-grading-SVC.csv', index=False)

## Decision Tree Prediction

In [85]:
import random 
def cross_validated_accuracy(DecisionTreeClassifier, X, y, num_trials, num_folds, random_seed):
    random.seed(random_seed)
    """
   Args:
        DecisionTreeClassifier: An Sklearn DecisionTreeClassifier (e.g., created by "tree.DecisionTreeClassifier(criterion='entropy')")
        X: Input features
        y: Labels
        num_trials: Number of trials to run of cross validation
        num_folds: Number of folds (the "k" in "k-folds")
        random_seed: Seed for uniform execution (Do not change this) 

    Returns:
        cvScore: The mean accuracy of the cross-validation experiment

    Notes:
        1. You may NOT use the cross-validation functions provided by Sklearn
    """
    ## TODO ##
    from sklearn.model_selection import RepeatedStratifiedKFold
    rskf = RepeatedStratifiedKFold(n_splits = num_folds, n_repeats = num_trials,
                                   random_state = random_seed)
    scores = np.zeros(num_trials * num_folds) # intialize a score array with 0 entries
    # loop through all the trials(repetitions) and all the folds. 
    # Two for loops nested together in fact
    # the dataset gets shuffled before each trial/repetition
    count = 0
    for train_index, test_index in rskf.split(X, y):
        # get the x_train and x_test
        X_train, X_test = X.loc[train_index, :], X.loc[test_index, :]
        # get the y_train and y_test
        y_train, y_test = y.loc[train_index], y.loc[test_index]
        # Model the tree
        clf = DecisionTreeClassifier.fit(X_train, y_train) 
        # prediction
        y_predict = clf.predict(X_test)
        # calculate the accuracy
        scores[count] = (np.ravel(y_test) == y_predict).mean()
        count += 1 # update the counter
    
    cvScore = scores.mean() # get the mean accuracy
    print('\nThe CV estimate of test error (Unpruned): %0.2f (+/- %0.2f)' % (1-cvScore, scores.std()*2))
    print('\nThe mean accuracy of the cross-validation is %0.2f: ' % cvScore)
    return cvScore

In [86]:
def automatic_dt_pruning(DecisionTreeClassifier, X, y, num_trials, num_folds, random_seed):
    random.seed(random_seed)
    """
    Returns the pruning parameter (i.e., ccp_alpha) with the highest cross-validated accuracy
      Args:
            DecisionTreeClassifier  : An Sklearn DecisionTreeClassifier (e.g., created by "tree.DecisionTreeClassifier(criterion='entropy')")      
            X (Pandas.DataFrame)    : Input Features
            y (Pandas.Series)       : Labels
            num_trials              : Number of trials to run of cross validation
            num_folds               : Number of folds for cross validation (The "k" in "k-folds") 
            random_seed             : Seed for uniform execution (Do not change this)

        Returns:
            ccp_alpha : Tuned pruning paramter with highest cross-validated accuracy

        Notes:
            1. Don't change any other Decision Tree Classifier parameters other than ccp_alpha
            2. Use the cross_validated_accuracy function you implemented to find the cross-validated accuracy
    """
  ## TODO ##
    # greater value the ccp_alpha is, it increases the nodes being pruned
    # so let's start the ccp_alpha at 0.
    step_size = 0.01
    ccp_value = 0
    accuracy_list = []
    ccp_list = []
    clf = DecisionTreeClassifier
    tracker = 0;
    stop_threshold = 200
    while True:
        clf.set_params(ccp_alpha = ccp_value)
        accuracy_list.append(cross_validated_accuracy(clf, X, y, num_trials, num_folds, random_seed))
        ccp_value += step_size
        ccp_list.append(ccp_value)
        if accuracy_list[tracker] < accuracy_list[tracker - 1]:
            break
        if tracker == stop_threshold: # if it takes too long 
            break
        tracker += 1
        print('==================', tracker, '=======================')
        
    print('The accuracy list is: ', accuracy_list)
    # get the last/largest ccp_value as the best ccp_alpha 
    # since we want to pruned the tree as much as we can
    ccp_alpha = ccp_list[-1]
    return ccp_alpha

from sklearn import tree
clf = tree.DecisionTreeClassifier(criterion = 'entropy')
trainingData_LT = pd.DataFrame(df_cp_train.loc[:, LT_Train_col])
y_LT = pd.DataFrame(df_cp_train.loc[:, 'label'])
automatic_dt_pruning(clf, trainingData_LT, y_LT, 3,3,10)


The CV estimate of test error (Unpruned): 0.25 (+/- 0.00)

The mean accuracy of the cross-validation is 0.75: 

The CV estimate of test error (Unpruned): 0.31 (+/- 0.00)

The mean accuracy of the cross-validation is 0.69: 
The accuracy list is:  [0.7467152599788591, 0.6919895101409935]


0.02

#### Test data prediction by normal tree

In [91]:
model_normalDT_LT = tree.DecisionTreeClassifier(criterion = 'entropy', ccp_alpha = 0.02)
model_normalDT_LT.fit(trainingData_LT, y_LT)
y_predict_LT_normalDT = model_normalDT_LT.predict(df_cp_leaderTest)
LT_final_pred_normalDT = pd.concat([id_LT, pd.DataFrame(y_predict_LT_normalDT)], axis = 1)
LT_final_pred_normalDT.to_csv('predictions-leaderboard-normalDT.csv', index=False)