# Random Forest

### Imports

In [1]:
! pip install graphviz
! pip install dl8.5
! pip install chefboost



In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time as t
import random

! pip install graphviz

from IPython.display import SVG
from graphviz import Source

from sklearn import tree
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier



### Helper Functions

In [3]:
translations = {
    'Driving_to' : ['No Urgent Place', 'Home', 'Work'],
    'Passanger' : ['Alone', 'Partner', 'Kid(s)', 'Friend(s)'],
    'Weather' : ['Sunny', 'Rainy', 'Snowy'],
    'Time' : ['7AM', '10AM', '2PM', '6PM', '10PM'],
    'Coupon' : ['Restaurant(<20)', 'Restaurant(20-50)', 'Carry out & Take away', 'Bar', 'Coffee House'],
    'Coupon_validity' : ['2h', '1d'],
    'Gender' : ['Male', 'Female'],
    'Age' : ['below21', '21', '26', '31', '36', '41', '46', '50plus'],
    'Maritalstatus' : ['Single', 'Divorced', 'Widowed', 'Unmarried partner', 'Married partner'],
    'Education' : ['Some High School', 'High School Graduate', 'Some college - no degree', 'Associates degree', 'Bachelors degree', 'Graduate degree (Masters or Doctorate)'],
    'Occupation' : ['Unemployed', 'Construction & Extraction', 'Arts Design Entertainment Sports & Media', 'Food Preparation & Serving Related', 'Education&Training&Library', 'Sales & Related', 'Computer & Mathematical', 'Student', 'Architecture & Engineering', 'Business & Financial', 'Office & Administrative Support', 'Retired', 'Management', 'Life Physical Social Science', 'Healthcare Support', 'Building & Grounds Cleaning & Maintenance', 'Legal', 'Installation Maintenance & Repair', 'Protective Service', 'Healthcare Practitioners & Technical', 'Community & Social Services', 'Transportation & Material Moving', 'Personal Care & Service', 'Farming Fishing & Forestry', 'Production Occupations'],
    'Income' : ['Less than $12500', '$12500 - $24999', '$25000 - $37499', '$37500 - $49999', '$50000 - $62499', '$62500 - $74999', '$75000 - $87499', '$87500 - $99999', '$100000 or More'],
}

def accuracy(y_actual, y_predicted):
    if(not len(y_actual) == len(y_predicted)):
        print("Lengths don't match")
        return 0
    correct = 0
    for i in range(len(y_actual)):
        if y_actual[i] == y_predicted[i]:
            correct += 1
    return 1.0 * correct / len(y_actual)

def clean_df(df):
    for (field, vals) in df.iteritems():
        for v_ind in vals.index:
            if pd.isna(vals[v_ind]):
                # NaN condition
                df.at[v_ind, field] = 0
        if field in translations:
            strings = []
            for s_ind in vals.index:
                if vals[s_ind] not in translations[field]:
                    # String that wasn't in training data
                    df.at[s_ind, field] = 0
                    continue
                df.at[s_ind, field] = translations[field].index(vals[s_ind])
    return df

def make_rfc(num_trees, max_depth):#, max_leaves):
    rfc = RandomForestClassifier()
    rfc.criterion = 'entropy'
    rfc.max_tree_depth = max_depth
    rfc.n_estimators = num_trees
#     rfc.max_leaf_nodes = max_leaves
    return rfc

### Data Importing and Processing

In [4]:
start_time = t.time()
df = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test.csv")

# print(df)

### Data Processing

In [6]:
ranked_columns = ['Coupon', 'Occupation', 'Income', 'Coffeehouse', 'Age', 'Time', 'Bar',
 'Education', 'Carryaway', 'Restaurant20to50', 'Restaurantlessthan20',
 'Distance', 'Maritalstatus', 'Coupon_validity', 'Temperature', 'Passanger',
 'Driving_to', 'Gender', 'Weather', 'Children', 'Direction_same']

top_cutoff = 15

# Remove the id column

df = df.loc[:, df.columns != 'id']
df_test = df_test.loc[:, df_test.columns != 'id']

df.drop(labels=ranked_columns[top_cutoff:], axis=1, inplace=True)
df_test.drop(labels=ranked_columns[top_cutoff:], axis=1, inplace=True)

# Make string values into numbers

clean_df(df)
clean_df(df_test)

# Not 10 like normal since we already have a test partition
num_folds = 9 

cv_length = 1.0 * df.shape[0] / num_folds

partitions = [df.iloc[int(i * cv_length):int((i+1) * cv_length),:] for i in range(num_folds)]
# print([len(partitions[i]) for i in range(len(partitions))])


### Model

In [8]:
# Rotate the folds to tune the parameters

results = []
max_tree_depths = range(5, 15)
num_trees = [1, 2, 3, 4, 6, 8, 12, 16, 24, 32, 48, 64, 96, 128, 196, 256]
num_coord_descs = 4
opt_num_trees = random.choice(num_trees)
opt_tree_depth = random.choice(max_tree_depths)

# should return the optimal value
# r is the range to iterate over
# ind is zero indexed
# n_estimators, max_depth, max_leaf_nodes
def descend_on_index(r, ind):
    results = []
    # (n, depth, leaf_nodes=None)
    input_array = [opt_num_trees, opt_tree_depth]
    for i in range(num_folds):
        validation = partitions[i]
        training = pd.concat([j for j in partitions if not j.equals(validation)])

        y_train = training['Decision']
        X_train = training.loc[:, df.columns != 'Decision']

        y_valid = validation['Decision']
        X_valid = validation.loc[:, df.columns != 'Decision']

        results_for_fold = []

        for n in r:
            input_array[ind] = n
            abc = make_rfc(input_array[0], input_array[1])
            abc.fit(X_train, y_train)
            a = accuracy(np.array(y_valid), abc.predict(X_valid))
            results_for_fold += [a]
            print("Accuracy for {:4d} estimators with depth {:2d} on fold {:2d} is {:.3f}".format(input_array[0], input_array[1], i, a))
            print("Training accuracy: {:.3f}".format(accuracy(np.array(y_train), abc.predict(X_train))))
        results += [results_for_fold]
        
    results_for_n = np.array(results).T
    avg_acc = [np.mean(results_for_n[j]) for j in range(len(results_for_n))]
    opt = r[avg_acc.index(np.max(avg_acc))]
    print("Optimal number on index {:}: {:}".format(ind, opt))
    return opt


for c in range(num_coord_descs):
    print("Number descent:", c)
    prev_num_trees = opt_num_trees
    opt_num_trees = descend_on_index(num_trees, 0)
    if prev_num_trees == opt_num_trees:
        break
    prev_tree_depth = opt_tree_depth
    opt_tree_depth = descend_on_index(max_tree_depths, 1)
    if prev_tree_depth == opt_tree_depth:
        break

Number descent: 0
Accuracy for    1 estimators with depth  6 on fold  0 is 0.662
Training accuracy: 0.859
Accuracy for    2 estimators with depth  6 on fold  0 is 0.615
Training accuracy: 0.858
Accuracy for    3 estimators with depth  6 on fold  0 is 0.693
Training accuracy: 0.939
Accuracy for    4 estimators with depth  6 on fold  0 is 0.690
Training accuracy: 0.938
Accuracy for    6 estimators with depth  6 on fold  0 is 0.690
Training accuracy: 0.964
Accuracy for    8 estimators with depth  6 on fold  0 is 0.706
Training accuracy: 0.975
Accuracy for   12 estimators with depth  6 on fold  0 is 0.732
Training accuracy: 0.987
Accuracy for   16 estimators with depth  6 on fold  0 is 0.728
Training accuracy: 0.990
Accuracy for   24 estimators with depth  6 on fold  0 is 0.758
Training accuracy: 0.991
Accuracy for   32 estimators with depth  6 on fold  0 is 0.752
Training accuracy: 0.992
Accuracy for   48 estimators with depth  6 on fold  0 is 0.758
Training accuracy: 0.993
Accuracy for  

Accuracy for  128 estimators with depth  6 on fold  5 is 0.749
Training accuracy: 0.994
Accuracy for  196 estimators with depth  6 on fold  5 is 0.744
Training accuracy: 0.994
Accuracy for  256 estimators with depth  6 on fold  5 is 0.750
Training accuracy: 0.994
Accuracy for    1 estimators with depth  6 on fold  6 is 0.642
Training accuracy: 0.866
Accuracy for    2 estimators with depth  6 on fold  6 is 0.635
Training accuracy: 0.861
Accuracy for    3 estimators with depth  6 on fold  6 is 0.683
Training accuracy: 0.931
Accuracy for    4 estimators with depth  6 on fold  6 is 0.659
Training accuracy: 0.942
Accuracy for    6 estimators with depth  6 on fold  6 is 0.701
Training accuracy: 0.969
Accuracy for    8 estimators with depth  6 on fold  6 is 0.692
Training accuracy: 0.978
Accuracy for   12 estimators with depth  6 on fold  6 is 0.715
Training accuracy: 0.986
Accuracy for   16 estimators with depth  6 on fold  6 is 0.722
Training accuracy: 0.991
Accuracy for   24 estimators wit

Accuracy for  256 estimators with depth  7 on fold  4 is 0.736
Training accuracy: 0.993
Accuracy for  256 estimators with depth  8 on fold  4 is 0.737
Training accuracy: 0.993
Accuracy for  256 estimators with depth  9 on fold  4 is 0.741
Training accuracy: 0.993
Accuracy for  256 estimators with depth 10 on fold  4 is 0.731
Training accuracy: 0.993
Accuracy for  256 estimators with depth 11 on fold  4 is 0.736
Training accuracy: 0.993
Accuracy for  256 estimators with depth 12 on fold  4 is 0.740
Training accuracy: 0.993
Accuracy for  256 estimators with depth 13 on fold  4 is 0.731
Training accuracy: 0.993
Accuracy for  256 estimators with depth 14 on fold  4 is 0.739
Training accuracy: 0.993
Accuracy for  256 estimators with depth  5 on fold  5 is 0.749
Training accuracy: 0.994
Accuracy for  256 estimators with depth  6 on fold  5 is 0.748
Training accuracy: 0.994
Accuracy for  256 estimators with depth  7 on fold  5 is 0.752
Training accuracy: 0.994
Accuracy for  256 estimators wit

### Test

In [9]:
X_test = df_test.loc[:, df_test.columns != 'Decision']

y_train = df['Decision']
X_train = df.loc[:, df.columns != 'Decision']

rfc = make_rfc(opt_num_trees, opt_tree_depth)
rfc.fit(X_train, y_train)

results = rfc.predict(X_test)
print("Runtime is {:} seconds".format(t.time() - start_time))

Runtime is 656.9403331279755 seconds


### Format for Output

In [10]:
df_test['Decision'] = results
df_test['id'] = range(1, df_test.shape[0] + 1)
df_test = df_test[['id', 'Decision']]
print(df_test)

df_test.to_csv(r"../data/pruned_random_forest_results.csv", index = False)

        id  Decision
0        1         0
1        2         0
2        3         1
3        4         1
4        5         1
...    ...       ...
2495  2496         1
2496  2497         0
2497  2498         1
2498  2499         1
2499  2500         1

[2500 rows x 2 columns]
