# Random Forest

### Imports

In [1]:
! pip install graphviz
! pip install dl8.5
! pip install chefboost



In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time as t
import random

! pip install graphviz

from IPython.display import SVG
from graphviz import Source

from sklearn import tree
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier



### Helper Functions

In [3]:
translations = {
    'Driving_to' : ['No Urgent Place', 'Home', 'Work'],
    'Passanger' : ['Alone', 'Partner', 'Kid(s)', 'Friend(s)'],
    'Weather' : ['Sunny', 'Rainy', 'Snowy'],
    'Time' : ['7AM', '10AM', '2PM', '6PM', '10PM'],
    'Coupon' : ['Restaurant(<20)', 'Restaurant(20-50)', 'Carry out & Take away', 'Bar', 'Coffee House'],
    'Coupon_validity' : ['2h', '1d'],
    'Gender' : ['Male', 'Female'],
    'Age' : ['below21', '21', '26', '31', '36', '41', '46', '50plus'],
    'Maritalstatus' : ['Single', 'Divorced', 'Widowed', 'Unmarried partner', 'Married partner'],
    'Education' : ['Some High School', 'High School Graduate', 'Some college - no degree', 'Associates degree', 'Bachelors degree', 'Graduate degree (Masters or Doctorate)'],
    'Occupation' : ['Unemployed', 'Construction & Extraction', 'Arts Design Entertainment Sports & Media', 'Food Preparation & Serving Related', 'Education&Training&Library', 'Sales & Related', 'Computer & Mathematical', 'Student', 'Architecture & Engineering', 'Business & Financial', 'Office & Administrative Support', 'Retired', 'Management', 'Life Physical Social Science', 'Healthcare Support', 'Building & Grounds Cleaning & Maintenance', 'Legal', 'Installation Maintenance & Repair', 'Protective Service', 'Healthcare Practitioners & Technical', 'Community & Social Services', 'Transportation & Material Moving', 'Personal Care & Service', 'Farming Fishing & Forestry', 'Production Occupations'],
    'Income' : ['Less than $12500', '$12500 - $24999', '$25000 - $37499', '$37500 - $49999', '$50000 - $62499', '$62500 - $74999', '$75000 - $87499', '$87500 - $99999', '$100000 or More'],
}

def accuracy(y_actual, y_predicted):
    if(not len(y_actual) == len(y_predicted)):
        print("Lengths don't match")
        return 0
    correct = 0
    for i in range(len(y_actual)):
        if y_actual[i] == y_predicted[i]:
            correct += 1
    return 1.0 * correct / len(y_actual)

def clean_df(df):
    for (field, vals) in df.iteritems():
        for v_ind in vals.index:
            if pd.isna(vals[v_ind]):
                # NaN condition
                df.at[v_ind, field] = 0
        if field in translations:
            strings = []
            for s_ind in vals.index:
                if vals[s_ind] not in translations[field]:
                    # String that wasn't in training data
                    df.at[s_ind, field] = 0
                    continue
                df.at[s_ind, field] = translations[field].index(vals[s_ind])
    return df

def make_rfc(num_trees, max_depth):#, max_leaves):
    rfc = RandomForestClassifier()
    rfc.criterion = 'entropy'
    rfc.max_tree_depth = max_depth
    rfc.n_estimators = num_trees
#     rfc.max_leaf_nodes = max_leaves
    return rfc

### Data Importing and Processing

In [4]:
start_time = t.time()
df = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test.csv")

# print(df)

### Data Processing

In [5]:
# Remove the id column

df = df.loc[:, df.columns != 'id']
df_test = df_test.loc[:, df_test.columns != 'id']

# Make string values into numbers

clean_df(df)
clean_df(df_test)

# Not 10 like normal since we already have a test partition
num_folds = 9 

cv_length = 1.0 * df.shape[0] / num_folds

partitions = [df.iloc[int(i * cv_length):int((i+1) * cv_length),:] for i in range(num_folds)]
# print([len(partitions[i]) for i in range(len(partitions))])


### Model

In [7]:
# Rotate the folds to tune the parameters

results = []
max_tree_depths = range(5, 15)
num_trees = [1, 2, 3, 4, 6, 8, 12, 16, 24, 32, 48, 64, 96, 128]
# max_leaf_nodes = range(2, 1003, 10)
num_coord_descs = 6
opt_num_trees = random.choice(num_trees)
opt_tree_depth = random.choice(max_tree_depths)
# opt_num_leaves = random.choice(max_leaf_nodes)

for c in range(num_coord_descs):
    print("Number descent:", c)
    
#     results = []

#     for i in range(num_folds):
#         validation = partitions[i]
#         training = pd.concat([j for j in partitions if not j.equals(validation)])

#         y_train = training['Decision']
#         X_train = training.loc[:, df.columns != 'Decision']

#         y_valid = validation['Decision']
#         X_valid = validation.loc[:, df.columns != 'Decision']

#         results_for_fold = []

#         for l in max_leaf_nodes:
#             rfc = make_rfc(opt_num_trees, opt_tree_depth, l)
#             rfc.fit(X_train, y_train)
#             a = accuracy(np.array(y_valid), rfc.predict(X_valid))
#             results_for_fold += [a]
#             print("Accuracy for {:4d} trees with depth {:2d} and {:2d} leaves on fold {:2d} is {:.3f}".format(opt_num_trees, opt_tree_depth, l, i, a))
#             print("Training accuracy: {:.3f}".format(accuracy(np.array(y_train), rfc.predict(X_train))))
#         results += [results_for_fold]
        
#     results_for_d = np.array(results).T
#     avg_acc = [np.mean(results_for_d[j]) for j in range(len(results_for_d))]
    
#     opt_num_leaves = max_leaf_nodes[avg_acc.index(np.max(avg_acc))]
#     print("Optimal leaf nodes:", opt_num_leaves)
    
    results = []

    for i in range(num_folds):
        validation = partitions[i]
        training = pd.concat([j for j in partitions if not j.equals(validation)])

        y_train = training['Decision']
        X_train = training.loc[:, df.columns != 'Decision']

        y_valid = validation['Decision']
        X_valid = validation.loc[:, df.columns != 'Decision']

        results_for_fold = []

        for n in num_trees:
            rfc = make_rfc(n, opt_tree_depth)
            rfc.fit(X_train, y_train)
            a = accuracy(np.array(y_valid), rfc.predict(X_valid))
            results_for_fold += [a]
            print("Accuracy for {:4d} trees with depth {:2d} on fold {:2d} is {:.3f}".format(n, opt_tree_depth, i, a))
            print("Training accuracy: {:.3f}".format(accuracy(np.array(y_train), rfc.predict(X_train))))
        results += [results_for_fold]
        
    results_for_n = np.array(results).T
    avg_acc = [np.mean(results_for_n[j]) for j in range(len(results_for_n))]
    prev_opt_num_trees = opt_num_trees
    opt_num_trees = num_trees[avg_acc.index(np.max(avg_acc))]
    print("Optimal number of trees:", opt_num_trees)
    if prev_opt_num_trees == opt_num_trees:
        break
    
    results = []

    for i in range(num_folds):
        validation = partitions[i]
        training = pd.concat([j for j in partitions if not j.equals(validation)])

        y_train = training['Decision']
        X_train = training.loc[:, df.columns != 'Decision']

        y_valid = validation['Decision']
        X_valid = validation.loc[:, df.columns != 'Decision']

        results_for_fold = []

        for d in max_tree_depths:
            rfc = make_rfc(opt_num_trees, d)
            rfc.fit(X_train, y_train)
            a = accuracy(np.array(y_valid), rfc.predict(X_valid))
            results_for_fold += [a]
            print("Accuracy for {:4d} trees with depth {:2d} on fold {:2d} is {:.3f}".format(opt_num_trees, d, i, a))
            print("Training accuracy: {:.3f}".format(accuracy(np.array(y_train), rfc.predict(X_train))))
        results += [results_for_fold]
        
    results_for_d = np.array(results).T
    avg_acc = [np.mean(results_for_d[j]) for j in range(len(results_for_d))]
    prev_opt_tree_depth = opt_tree_depth
    opt_tree_depth = max_tree_depths[avg_acc.index(np.max(avg_acc))]
    print("Optimal tree depth:", opt_tree_depth)
    if prev_opt_tree_depth == opt_tree_depth:
        break
    

Number descent: 0
Accuracy for    1 trees with depth  7 on fold  0 is 0.634
Training accuracy: 0.862
Accuracy for    2 trees with depth  7 on fold  0 is 0.598
Training accuracy: 0.859
Accuracy for    3 trees with depth  7 on fold  0 is 0.667
Training accuracy: 0.938
Accuracy for    4 trees with depth  7 on fold  0 is 0.665
Training accuracy: 0.940
Accuracy for    6 trees with depth  7 on fold  0 is 0.672
Training accuracy: 0.969
Accuracy for    8 trees with depth  7 on fold  0 is 0.713
Training accuracy: 0.979
Accuracy for   12 trees with depth  7 on fold  0 is 0.724
Training accuracy: 0.992
Accuracy for   16 trees with depth  7 on fold  0 is 0.734
Training accuracy: 0.996
Accuracy for   24 trees with depth  7 on fold  0 is 0.743
Training accuracy: 0.998
Accuracy for   32 trees with depth  7 on fold  0 is 0.744
Training accuracy: 0.998
Accuracy for   48 trees with depth  7 on fold  0 is 0.769
Training accuracy: 0.999
Accuracy for   64 trees with depth  7 on fold  0 is 0.762
Training ac

Accuracy for    6 trees with depth  7 on fold  7 is 0.670
Training accuracy: 0.970
Accuracy for    8 trees with depth  7 on fold  7 is 0.710
Training accuracy: 0.980
Accuracy for   12 trees with depth  7 on fold  7 is 0.711
Training accuracy: 0.992
Accuracy for   16 trees with depth  7 on fold  7 is 0.716
Training accuracy: 0.995
Accuracy for   24 trees with depth  7 on fold  7 is 0.723
Training accuracy: 0.997
Accuracy for   32 trees with depth  7 on fold  7 is 0.723
Training accuracy: 0.999
Accuracy for   48 trees with depth  7 on fold  7 is 0.732
Training accuracy: 0.999
Accuracy for   64 trees with depth  7 on fold  7 is 0.731
Training accuracy: 0.999
Accuracy for   96 trees with depth  7 on fold  7 is 0.727
Training accuracy: 0.999
Accuracy for  128 trees with depth  7 on fold  7 is 0.736
Training accuracy: 0.999
Accuracy for    1 trees with depth  7 on fold  8 is 0.645
Training accuracy: 0.863
Accuracy for    2 trees with depth  7 on fold  8 is 0.610
Training accuracy: 0.861
Accu

Accuracy for  128 trees with depth 10 on fold  7 is 0.745
Training accuracy: 0.999
Accuracy for  128 trees with depth 11 on fold  7 is 0.731
Training accuracy: 0.999
Accuracy for  128 trees with depth 12 on fold  7 is 0.739
Training accuracy: 0.999
Accuracy for  128 trees with depth 13 on fold  7 is 0.737
Training accuracy: 0.999
Accuracy for  128 trees with depth 14 on fold  7 is 0.738
Training accuracy: 0.999
Accuracy for  128 trees with depth  5 on fold  8 is 0.768
Training accuracy: 0.999
Accuracy for  128 trees with depth  6 on fold  8 is 0.754
Training accuracy: 0.999
Accuracy for  128 trees with depth  7 on fold  8 is 0.760
Training accuracy: 0.999
Accuracy for  128 trees with depth  8 on fold  8 is 0.757
Training accuracy: 0.999
Accuracy for  128 trees with depth  9 on fold  8 is 0.752
Training accuracy: 0.999
Accuracy for  128 trees with depth 10 on fold  8 is 0.769
Training accuracy: 0.999
Accuracy for  128 trees with depth 11 on fold  8 is 0.757
Training accuracy: 0.999
Accu

Training accuracy: 0.863
Accuracy for    2 trees with depth 10 on fold  6 is 0.603
Training accuracy: 0.856
Accuracy for    3 trees with depth 10 on fold  6 is 0.674
Training accuracy: 0.938
Accuracy for    4 trees with depth 10 on fold  6 is 0.655
Training accuracy: 0.948
Accuracy for    6 trees with depth 10 on fold  6 is 0.691
Training accuracy: 0.972
Accuracy for    8 trees with depth 10 on fold  6 is 0.698
Training accuracy: 0.983
Accuracy for   12 trees with depth 10 on fold  6 is 0.701
Training accuracy: 0.993
Accuracy for   16 trees with depth 10 on fold  6 is 0.733
Training accuracy: 0.996
Accuracy for   24 trees with depth 10 on fold  6 is 0.735
Training accuracy: 0.998
Accuracy for   32 trees with depth 10 on fold  6 is 0.729
Training accuracy: 0.998
Accuracy for   48 trees with depth 10 on fold  6 is 0.743
Training accuracy: 0.999
Accuracy for   64 trees with depth 10 on fold  6 is 0.741
Training accuracy: 0.999
Accuracy for   96 trees with depth 10 on fold  6 is 0.739
Trai

Accuracy for   96 trees with depth 13 on fold  5 is 0.743
Training accuracy: 0.999
Accuracy for   96 trees with depth 14 on fold  5 is 0.748
Training accuracy: 0.999
Accuracy for   96 trees with depth  5 on fold  6 is 0.744
Training accuracy: 0.999
Accuracy for   96 trees with depth  6 on fold  6 is 0.752
Training accuracy: 0.999
Accuracy for   96 trees with depth  7 on fold  6 is 0.737
Training accuracy: 0.999
Accuracy for   96 trees with depth  8 on fold  6 is 0.756
Training accuracy: 0.999
Accuracy for   96 trees with depth  9 on fold  6 is 0.746
Training accuracy: 0.999
Accuracy for   96 trees with depth 10 on fold  6 is 0.748
Training accuracy: 0.999
Accuracy for   96 trees with depth 11 on fold  6 is 0.741
Training accuracy: 0.999
Accuracy for   96 trees with depth 12 on fold  6 is 0.746
Training accuracy: 0.999
Accuracy for   96 trees with depth 13 on fold  6 is 0.744
Training accuracy: 0.999
Accuracy for   96 trees with depth 14 on fold  6 is 0.744
Training accuracy: 0.999
Accu

Accuracy for   64 trees with depth  6 on fold  4 is 0.738
Training accuracy: 0.999
Accuracy for   96 trees with depth  6 on fold  4 is 0.741
Training accuracy: 0.999
Accuracy for  128 trees with depth  6 on fold  4 is 0.739
Training accuracy: 0.999
Accuracy for    1 trees with depth  6 on fold  5 is 0.631
Training accuracy: 0.859
Accuracy for    2 trees with depth  6 on fold  5 is 0.571
Training accuracy: 0.856
Accuracy for    3 trees with depth  6 on fold  5 is 0.672
Training accuracy: 0.941
Accuracy for    4 trees with depth  6 on fold  5 is 0.650
Training accuracy: 0.940
Accuracy for    6 trees with depth  6 on fold  5 is 0.698
Training accuracy: 0.969
Accuracy for    8 trees with depth  6 on fold  5 is 0.699
Training accuracy: 0.981
Accuracy for   12 trees with depth  6 on fold  5 is 0.701
Training accuracy: 0.990
Accuracy for   16 trees with depth  6 on fold  5 is 0.718
Training accuracy: 0.996
Accuracy for   24 trees with depth  6 on fold  5 is 0.718
Training accuracy: 0.998
Accu

Accuracy for  128 trees with depth  5 on fold  4 is 0.744
Training accuracy: 0.999
Accuracy for  128 trees with depth  6 on fold  4 is 0.740
Training accuracy: 0.999
Accuracy for  128 trees with depth  7 on fold  4 is 0.746
Training accuracy: 0.999
Accuracy for  128 trees with depth  8 on fold  4 is 0.729
Training accuracy: 0.999
Accuracy for  128 trees with depth  9 on fold  4 is 0.742
Training accuracy: 0.999
Accuracy for  128 trees with depth 10 on fold  4 is 0.733
Training accuracy: 0.999
Accuracy for  128 trees with depth 11 on fold  4 is 0.740
Training accuracy: 0.999
Accuracy for  128 trees with depth 12 on fold  4 is 0.741
Training accuracy: 0.999
Accuracy for  128 trees with depth 13 on fold  4 is 0.739
Training accuracy: 0.999
Accuracy for  128 trees with depth 14 on fold  4 is 0.736
Training accuracy: 0.999
Accuracy for  128 trees with depth  5 on fold  5 is 0.751
Training accuracy: 0.999
Accuracy for  128 trees with depth  6 on fold  5 is 0.746
Training accuracy: 0.999
Accu

Accuracy for   24 trees with depth 13 on fold  3 is 0.733
Training accuracy: 0.998
Accuracy for   32 trees with depth 13 on fold  3 is 0.734
Training accuracy: 0.999
Accuracy for   48 trees with depth 13 on fold  3 is 0.740
Training accuracy: 0.999
Accuracy for   64 trees with depth 13 on fold  3 is 0.752
Training accuracy: 0.999
Accuracy for   96 trees with depth 13 on fold  3 is 0.746
Training accuracy: 0.999
Accuracy for  128 trees with depth 13 on fold  3 is 0.748
Training accuracy: 0.999
Accuracy for    1 trees with depth 13 on fold  4 is 0.640
Training accuracy: 0.860
Accuracy for    2 trees with depth 13 on fold  4 is 0.608
Training accuracy: 0.864
Accuracy for    3 trees with depth 13 on fold  4 is 0.660
Training accuracy: 0.933
Accuracy for    4 trees with depth 13 on fold  4 is 0.640
Training accuracy: 0.942
Accuracy for    6 trees with depth 13 on fold  4 is 0.669
Training accuracy: 0.967
Accuracy for    8 trees with depth 13 on fold  4 is 0.696
Training accuracy: 0.983
Accu

Accuracy for   64 trees with depth  8 on fold  2 is 0.729
Training accuracy: 0.999
Accuracy for   64 trees with depth  9 on fold  2 is 0.728
Training accuracy: 0.999
Accuracy for   64 trees with depth 10 on fold  2 is 0.734
Training accuracy: 0.999
Accuracy for   64 trees with depth 11 on fold  2 is 0.730
Training accuracy: 0.999
Accuracy for   64 trees with depth 12 on fold  2 is 0.726
Training accuracy: 0.999
Accuracy for   64 trees with depth 13 on fold  2 is 0.737
Training accuracy: 0.999
Accuracy for   64 trees with depth 14 on fold  2 is 0.726
Training accuracy: 0.999
Accuracy for   64 trees with depth  5 on fold  3 is 0.747
Training accuracy: 0.999
Accuracy for   64 trees with depth  6 on fold  3 is 0.749
Training accuracy: 0.999
Accuracy for   64 trees with depth  7 on fold  3 is 0.739
Training accuracy: 0.999
Accuracy for   64 trees with depth  8 on fold  3 is 0.749
Training accuracy: 0.999
Accuracy for   64 trees with depth  9 on fold  3 is 0.742
Training accuracy: 0.999
Accu

### Test

In [8]:
X_test = df_test.loc[:, df_test.columns != 'Decision']

y_train = df['Decision']
X_train = df.loc[:, df.columns != 'Decision']

rfc = make_rfc(opt_num_trees, opt_tree_depth)
rfc.fit(X_train, y_train)

results = rfc.predict(X_test)
print("Runtime is {:} seconds".format(t.time() - start_time))

Runtime is 1119.8544027805328 seconds


### Format for Output

In [9]:
df_test['Decision'] = results
df_test['id'] = range(1, df_test.shape[0] + 1)
df_test = df_test[['id', 'Decision']]
print(df_test)

df_test.to_csv(r"../data/random_forest_results.csv", index = False)

        id  Decision
0        1         0
1        2         1
2        3         1
3        4         0
4        5         1
...    ...       ...
2495  2496         1
2496  2497         0
2497  2498         1
2498  2499         1
2499  2500         0

[2500 rows x 2 columns]
