# AdaBoost

### Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time as t
import random

! pip install graphviz

from IPython.display import SVG
from graphviz import Source

from sklearn import tree
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier



### Helper Functions

In [2]:
translations = {
    'Driving_to' : ['No Urgent Place', 'Home', 'Work'],
    'Passanger' : ['Alone', 'Partner', 'Kid(s)', 'Friend(s)'],
    'Weather' : ['Sunny', 'Rainy', 'Snowy'],
    'Time' : ['7AM', '10AM', '2PM', '6PM', '10PM'],
    'Coupon' : ['Restaurant(<20)', 'Restaurant(20-50)', 'Carry out & Take away', 'Bar', 'Coffee House'],
    'Coupon_validity' : ['2h', '1d'],
    'Gender' : ['Male', 'Female'],
    'Age' : ['below21', '21', '26', '31', '36', '41', '46', '50plus'],
    'Maritalstatus' : ['Single', 'Divorced', 'Widowed', 'Unmarried partner', 'Married partner'],
    'Education' : ['Some High School', 'High School Graduate', 'Some college - no degree', 'Associates degree', 'Bachelors degree', 'Graduate degree (Masters or Doctorate)'],
    'Occupation' : ['Unemployed', 'Construction & Extraction', 'Arts Design Entertainment Sports & Media', 'Food Preparation & Serving Related', 'Education&Training&Library', 'Sales & Related', 'Computer & Mathematical', 'Student', 'Architecture & Engineering', 'Business & Financial', 'Office & Administrative Support', 'Retired', 'Management', 'Life Physical Social Science', 'Healthcare Support', 'Building & Grounds Cleaning & Maintenance', 'Legal', 'Installation Maintenance & Repair', 'Protective Service', 'Healthcare Practitioners & Technical', 'Community & Social Services', 'Transportation & Material Moving', 'Personal Care & Service', 'Farming Fishing & Forestry', 'Production Occupations'],
    'Income' : ['Less than $12500', '$12500 - $24999', '$25000 - $37499', '$37500 - $49999', '$50000 - $62499', '$62500 - $74999', '$75000 - $87499', '$87500 - $99999', '$100000 or More'],
}

def accuracy(y_actual, y_predicted):
    if(not len(y_actual) == len(y_predicted)):
        print("Lengths don't match")
        return 0
    correct = 0
    for i in range(len(y_actual)):
        if y_actual[i] == y_predicted[i]:
            correct += 1
    return 1.0 * correct / len(y_actual)

def clean_df(df):
    for (field, vals) in df.iteritems():
        for v_ind in vals.index:
            if pd.isna(vals[v_ind]):
                # NaN condition
                df.at[v_ind, field] = 0
        if field in translations:
            strings = []
            for s_ind in vals.index:
                if vals[s_ind] not in translations[field]:
                    # String that wasn't in training data
                    df.at[s_ind, field] = 0
                    continue
                df.at[s_ind, field] = translations[field].index(vals[s_ind])
    return df

def make_abc(n, depth, leaf_nodes=None):
    abc = AdaBoostClassifier()
    dtc = tree.DecisionTreeClassifier()
    dtc.criterion = 'entropy'
    dtc.max_tree_depth = depth
    abc.n_estimators = n
    dtc.max_leaf_nodes = leaf_nodes
    abc.base_estimator = dtc
    return abc

### Data Importing and Processing

In [3]:
start_time = t.time()
df = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test.csv")

# print(df)

### Data Processing

In [4]:
# Remove the id column

df = df.loc[:, df.columns != 'id']
df_test = df_test.loc[:, df_test.columns != 'id']

# Make string values into numbers

clean_df(df)
clean_df(df_test)

# Not 10 like normal since we already have a test partition
num_folds = 9 

cv_length = 1.0 * df.shape[0] / num_folds

partitions = [df.iloc[int(i * cv_length):int((i+1) * cv_length),:] for i in range(num_folds)]
# print([len(partitions[i]) for i in range(len(partitions))])

### Model

In [9]:
# Rotate the folds to tune the parameters

results = []
max_tree_depths = range(5, 15)
num_trees = [1, 2, 3, 4, 6, 8, 12, 16, 24, 32, 48, 64, 96, 128]
# max_leaf_nodes = range(2, 1003, 10)
num_coord_descs = 4
opt_num_trees = random.choice(num_trees)
opt_tree_depth = random.choice(max_tree_depths)
# opt_num_leaves = random.choice(max_leaf_nodes)

# should return the optimal value
# r is the range to iterate over
# ind is zero indexed
# n_estimators, max_depth, max_leaf_nodes
def descend_on_index(r, ind):
    results = []
    # (n, depth, leaf_nodes=None)
    input_array = [opt_num_trees, opt_tree_depth]
    for i in range(num_folds):
        validation = partitions[i]
        training = pd.concat([j for j in partitions if not j.equals(validation)])

        y_train = training['Decision']
        X_train = training.loc[:, df.columns != 'Decision']

        y_valid = validation['Decision']
        X_valid = validation.loc[:, df.columns != 'Decision']

        results_for_fold = []

        for n in r:
            input_array[ind] = n
            abc = make_abc(input_array[0], input_array[1])
            abc.fit(X_train, y_train)
            a = accuracy(np.array(y_valid), abc.predict(X_valid))
            results_for_fold += [a]
            print("Accuracy for {:4d} estimators with depth {:2d} on fold {:2d} is {:.3f}".format(input_array[0], input_array[1], i, a))
            print("Training accuracy: {:.3f}".format(accuracy(np.array(y_train), abc.predict(X_train))))
        results += [results_for_fold]
        
    results_for_n = np.array(results).T
    avg_acc = [np.mean(results_for_n[j]) for j in range(len(results_for_n))]
    opt = r[avg_acc.index(np.max(avg_acc))]
    print("Optimal number on index {:}: {:}".format(ind, opt))
    return opt


for c in range(num_coord_descs):
    print("Number descent:", c)
    prev_num_trees = opt_num_trees
    opt_num_trees = descend_on_index(num_trees, 0)
    if prev_num_trees == opt_num_trees:
        break
    prev_tree_depth = opt_tree_depth
    opt_tree_depth = descend_on_index(max_tree_depths, 1)
    if prev_tree_depth == opt_tree_depth:
        break
    

Number descent: 0
Accuracy for    1 estimators with depth 11 on fold  0 is 0.690
Training accuracy: 0.999
Accuracy for    2 estimators with depth 11 on fold  0 is 0.658
Training accuracy: 0.999
Accuracy for    3 estimators with depth 11 on fold  0 is 0.677
Training accuracy: 0.999
Accuracy for    4 estimators with depth 11 on fold  0 is 0.710
Training accuracy: 0.999
Accuracy for    6 estimators with depth 11 on fold  0 is 0.712
Training accuracy: 0.999
Accuracy for    8 estimators with depth 11 on fold  0 is 0.717
Training accuracy: 0.999
Accuracy for   12 estimators with depth 11 on fold  0 is 0.723
Training accuracy: 0.999
Accuracy for   16 estimators with depth 11 on fold  0 is 0.721
Training accuracy: 0.999
Accuracy for   24 estimators with depth 11 on fold  0 is 0.718
Training accuracy: 0.999
Accuracy for   32 estimators with depth 11 on fold  0 is 0.737
Training accuracy: 0.999
Accuracy for   48 estimators with depth 11 on fold  0 is 0.730
Training accuracy: 0.999
Accuracy for  

Accuracy for   32 estimators with depth 11 on fold  6 is 0.700
Training accuracy: 0.999
Accuracy for   48 estimators with depth 11 on fold  6 is 0.706
Training accuracy: 0.999
Accuracy for   64 estimators with depth 11 on fold  6 is 0.684
Training accuracy: 0.999
Accuracy for   96 estimators with depth 11 on fold  6 is 0.704
Training accuracy: 0.999
Accuracy for  128 estimators with depth 11 on fold  6 is 0.708
Training accuracy: 0.999
Accuracy for    1 estimators with depth 11 on fold  7 is 0.685
Training accuracy: 0.999
Accuracy for    2 estimators with depth 11 on fold  7 is 0.647
Training accuracy: 0.999
Accuracy for    3 estimators with depth 11 on fold  7 is 0.695
Training accuracy: 0.999
Accuracy for    4 estimators with depth 11 on fold  7 is 0.679
Training accuracy: 0.999
Accuracy for    6 estimators with depth 11 on fold  7 is 0.701
Training accuracy: 0.999
Accuracy for    8 estimators with depth 11 on fold  7 is 0.705
Training accuracy: 0.999
Accuracy for   12 estimators wit

Accuracy for  128 estimators with depth  5 on fold  6 is 0.689
Training accuracy: 0.999
Accuracy for  128 estimators with depth  6 on fold  6 is 0.715
Training accuracy: 0.999
Accuracy for  128 estimators with depth  7 on fold  6 is 0.714
Training accuracy: 0.999
Accuracy for  128 estimators with depth  8 on fold  6 is 0.730
Training accuracy: 0.999
Accuracy for  128 estimators with depth  9 on fold  6 is 0.714
Training accuracy: 0.999
Accuracy for  128 estimators with depth 10 on fold  6 is 0.709
Training accuracy: 0.999
Accuracy for  128 estimators with depth 11 on fold  6 is 0.706
Training accuracy: 0.999
Accuracy for  128 estimators with depth 12 on fold  6 is 0.700
Training accuracy: 0.999
Accuracy for  128 estimators with depth 13 on fold  6 is 0.685
Training accuracy: 0.999
Accuracy for  128 estimators with depth 14 on fold  6 is 0.702
Training accuracy: 0.999
Accuracy for  128 estimators with depth  5 on fold  7 is 0.734
Training accuracy: 0.999
Accuracy for  128 estimators wit

Accuracy for   16 estimators with depth  8 on fold  4 is 0.719
Training accuracy: 0.999
Accuracy for   24 estimators with depth  8 on fold  4 is 0.712
Training accuracy: 0.999
Accuracy for   32 estimators with depth  8 on fold  4 is 0.704
Training accuracy: 0.999
Accuracy for   48 estimators with depth  8 on fold  4 is 0.714
Training accuracy: 0.999
Accuracy for   64 estimators with depth  8 on fold  4 is 0.718
Training accuracy: 0.999
Accuracy for   96 estimators with depth  8 on fold  4 is 0.723
Training accuracy: 0.999
Accuracy for  128 estimators with depth  8 on fold  4 is 0.716
Training accuracy: 0.999
Accuracy for    1 estimators with depth  8 on fold  5 is 0.672
Training accuracy: 0.999
Accuracy for    2 estimators with depth  8 on fold  5 is 0.648
Training accuracy: 0.999
Accuracy for    3 estimators with depth  8 on fold  5 is 0.692
Training accuracy: 0.999
Accuracy for    4 estimators with depth  8 on fold  5 is 0.644
Training accuracy: 0.999
Accuracy for    6 estimators wit

### Test

In [12]:
X_test = df_test.loc[:, df_test.columns != 'Decision']

y_train = df['Decision']
X_train = df.loc[:, df.columns != 'Decision']

abc = make_abc(opt_num_trees, opt_tree_depth)
abc.fit(X_train, y_train)

results = abc.predict(X_test)
print("Runtime is {:} seconds".format(t.time() - start_time))

Runtime is 2952.5844984054565 seconds


### Format for Output

In [13]:
df_test['Decision'] = results
df_test['id'] = range(1, df_test.shape[0] + 1)
df_test = df_test[['id', 'Decision']]
print(df_test)

df_test.to_csv(r"../data/adaboost_results.csv", index = False)

        id  Decision
0        1         0
1        2         0
2        3         1
3        4         0
4        5         1
...    ...       ...
2495  2496         1
2496  2497         0
2497  2498         1
2498  2499         1
2499  2500         1

[2500 rows x 2 columns]
