# Random Forest

In [24]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import time
from datetime import datetime
import numpy as np
import math

In [45]:
## Import data
training_filename = 'dtrain123.dat'
test_filename = 'dtest123.dat'
filename = 'zipcombo.dat'
data = np.loadtxt(filename)
## define feature vector x and true value y
x = data[:,1:]
y = data[:,0]

In [41]:
def basic_result_tree(x,y,num_tree,runs):

    training_set_errors = np.zeros((len(num_tree), runs))
    test_set_errors = np.zeros((len(num_tree), runs))

    for t in range(len(num_tree)):
        tree = num_tree[t]
        for run in range(runs):
            print("Now doing run ", run+1, "/", runs, " for ",", num_tree=",tree,".........", end='\r')
            X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=True)

            rfc_classifier = RandomForestClassifier(n_estimators=tree,max_depth=None, random_state=0)
            rfc_classifier.fit(X_train, y_train)

            test_labels = rfc_classifier.predict(X_test)
            test_errors = sum(y_test != test_labels) / len(y_test)

            train_labels = rfc_classifier.predict(X_train)
            train_errors = sum(y_train != train_labels) / len(y_train)

            training_set_errors[t, run] = train_errors
            test_set_errors[t, run] = test_errors
    return training_set_errors, test_set_errors

In [42]:
def construct_table(training_set_errors,test_set_errors):
    train_mean=np.mean(training_set_errors,axis=1)
    train_std=np.std(training_set_errors,axis=1)
    test_mean=np.mean(test_set_errors,axis=1)
    test_std=np.std(test_set_errors,axis=1)

    mean_std = []
    for i in range(len(train_mean)):
        data_t = []
        colomn_1 = "{0:.4f} +- {1:.4f}".format(train_mean[i]*100,train_std[i]*100)
        data_t.append(colomn_1)
        colomn_2 = "{0:.4f} +- {1:.4f}".format(test_mean[i]*100,test_std[i]*100)    
        data_t.append(colomn_2)
        mean_std.append(data_t)
    return mean_std

In [46]:
num_tree = np.array([10,100,500,1000])
runs = 20        
startTime = datetime.now()
training_set_ntr_errors, test_set_ntr_errors=basic_result_tree(x,y,num_tree,runs)
time_tree_ntr_basic = datetime.now() - startTime
print("Time taken: ", time_tree_ntr_basic)

Time taken:  0:18:13.635669  , num_tree= 1000 .........


In [47]:
# to construct table
Tree_mean_std=construct_table(training_set_ntr_errors,test_set_ntr_errors)
pd.DataFrame(data=Tree_mean_std,index=num_tree,columns=['train_mean_std(%)','test_mean_std(%)'])

Unnamed: 0,train_mean_std(%),test_mean_std(%)
10,0.1116 +- 0.0438,6.4167 +- 0.4464
100,0.0000 +- 0.0000,3.6989 +- 0.3512
500,0.0000 +- 0.0000,3.3118 +- 0.3576
1000,0.0000 +- 0.0000,3.5806 +- 0.4617


## Add Cross Validation

In [48]:
def cross_validation_tree(x,y,num_tree,k):
    """
    This function performs a k-fold cross validation on X, using a kernel of "kernel_choice" with parameter d.
    :param X: the observations array
    :param y: the labels vector
    :param kernel_choice: Depending on the kernel choice, can be {'Polynomial', 'Gaussian'}
    :param d: the parameter of the kernel
    :param k: the number of splits, i.e. the k parameter in k-fold Cross Validation
    :return: the mean of test error across the k runs of the CV process and its standard deviation
    """
    kf = KFold(n_splits=k, shuffle=True)
    mistake_arr = np.zeros(k)
    i = 0
    
    for train_index, cv_index in kf.split(x):
        # Spit the matrix using the indices gained by the CV method and construct X and Y arrays
        X_train = x[train_index]
        X_cv = x[cv_index]
        y_train = y[train_index]
        y_cv = y[cv_index]
    
        # We are only interested in the alphas and not the MSE on the training set
        rfc_classifier = RandomForestClassifier(n_estimators=tree,max_depth=None, random_state=0)
        rfc_classifier.fit(X_train, y_train)
        
        predicted_labels = rfc_classifier.predict(X_cv)
        mistakes = sum(y_cv != predicted_labels)
        mistake_arr[i] = mistakes / len(y_cv)
        i += 1
        
    return mistake_arr.mean()

In [49]:
def cv_process_tree(x,y,num_tree,runs):
    """
    This function performs 5-fold cross validation, multiple times (specified by runs argument) across the different
    values of d specified in d_arr using the kernel specified in kernel_choice
    :param d_arr: an array of d values
    :param runs: The number of runs to repeat the CV process
    :param kernel_choice: Depending on the kernel choice, can be {'Polynomial', 'Gaussian'}
    :param calculate_confusions: Whether or not to also calculate confusions on the test set
    :return: the array of d_stars, the test_errors and the confusions found
    """
    tree_stars = np.zeros(runs)
    test_errors = np.zeros(runs)
    
    for run in range(runs):
        single_confusion_mtx = np.zeros(shape = (10,10))
        # In each run we will iterate through the d array and use all possible values of d
        # Allocate 80/20 percent for training and test set
        X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=True)

        CV_means = np.zeros(len(num_tree))
        for i in range(len(num_tree)):
            print("Now doing run ", run+1, "/", runs, " for d=", num_tree[i], ".........", end='\r')
            mistake = cross_validation_tree(X_train, y_train,num_tree[i],k=5)
            CV_means[i] = mistake
            
        # Train in whole 80% now with c_star
        tree_stars[run] = num_tree[CV_means.argmin()]
        # Train on whole of 80%
        rfc_classifier = RandomForestClassifier(n_estimators=tree,max_depth=None, random_state=0)
        rfc_classifier.fit(X_train, y_train)
        
        predicted_labels = rfc_classifier.predict(X_test)
        
        test_error = sum(y_test != predicted_labels) / len(y_test)
        test_errors[run] = test_error
    return tree_stars,test_errors

In [50]:
runs = 20
num_tree = np.array([10,100,500,1000])
startTime = datetime.now()
tree_stars_array, test_errors_array = cv_process_tree(x,y,num_tree,runs)
time_pp_cv = datetime.now() - startTime
print("Time taken: ", time_pp_cv)
print("Mean c*: ", tree_stars_array.mean(), " with std: ", np.std(tree_stars_array))
print("Mean test error(%): ", test_errors_array.mean()*100, " with std(%): ", np.std(test_errors_array)*100)

Time taken:  1:26:53.404987 d= 1000 .........
Mean c*:  382.0  with std:  363.4501341312175
Mean test error(%):  3.2580645161290325  with std(%):  0.28570602700185843
