In [35]:
import numpy as np

train = np.genfromtxt('train_2008.csv', delimiter=',')
X_train = train[1:, :-1]
y_train = train[1:, -1]
X_test = np.genfromtxt('test_2008.csv', delimiter=',')[1:,:]

In [36]:
def classification_err(y, real_y):
    """
    This function returns the classification error between two equally-sized vectors of 
    labels; this is the fraction of samples for which the labels differ.
    
    Inputs:
        y: (N, ) shaped array of predicted labels
        real_y: (N, ) shaped array of true labels
    Output:
        Scalar classification error
    """
    misclassified = 0
    for i in range (len(y)):
        if y[i] != real_y[i]:
            misclassified += 1
    return misclassified/len(y)

In [38]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
num_folds = 10
kf = KFold(n_splits=num_folds)


for min_leaf in range (1, 25):
    for depth in range (25, 60, 5):
        # Iterate through cross-validation folds:
        i = 1
        train_acc = 0
        test_acc = 0
        for train_index, test_index in kf.split(X_train):

            # Print out test indices:
            print('Fold ', i, ' of ', num_folds, ' test indices:', test_index)

            # Training and testing data points for this fold:
            x_train_subset, x_test_subset = X_train[train_index], X_train[test_index]
            y_train_subset, y_test_subset = y_train[train_index], y_train[test_index]



            trees_subset = RandomForestClassifier(n_estimators=200, 
                                       max_depth=depth, 
                                       min_samples_leaf = min_leaf)
            trees_subset.fit(x_train_subset, y_train_subset)
            y_subset_predict = trees_subset.predict(x_test_subset)
            y_train_subset_predict = trees_subset.predict(x_train_subset)
            test_acc += (1 - classification_err(y_subset_predict, y_test_subset))
            train_acc += (1- classification_err(y_train_subset_predict, y_train_subset))
            i += 1
        print ("depth: ", depth)
        print ("min_leaf: ", min_leaf)
        print ("test acc: ", test_acc/num_folds)
        print ("train acc: ", train_acc/num_folds)

KeyboardInterrupt: 

In [None]:
from sklearn.ensemble import RandomForestClassifier

trees = RandomForestClassifier(n_estimators=200, 
                               max_depth=40, 
                               min_samples_leaf = 1)
trees.fit(X_train, y_train)

In [21]:

from sklearn.ensemble import RandomForestClassifier

trees = RandomForestClassifier(n_estimators=200, 
                               max_depth=40, 
                               min_samples_leaf = 1)
trees.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=40, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [22]:

print("model accuracy: ", trees.score(X_train, y_train))

y_predict = trees.predict_proba(X_test)
#print(y_predict)


model accuracy:  0.9999690723243695


In [16]:
output = []
output.append(["id", "target"])
for i in range(len(y_predict)):
    #output.append([i, y_predict[i][0]])
    output.append([i, y_predict[i][1]])

In [17]:
print(output[:10])

[['id', 'target'], [0, 0.505], [1, 0.185], [2, 0.175], [3, 0.405], [4, 0.245], [5, 0.08], [6, 0.255], [7, 0.615], [8, 0.02]]


In [18]:
import csv
with open('voter_pred_2008.csv', mode='w') as voter_file:
    voter_writer = csv.writer(voter_file, delimiter=",")
    for i in output:
        voter_writer.writerow(i)

In [19]:
def dectree_max_depth(tree):
    n_nodes = tree.node_count
    children_left = tree.children_left
    children_right = tree.children_right

    def walk(node_id):
        if (children_left[node_id] != children_right[node_id]):
            left_max = 1 + walk(children_left[node_id])
            right_max = 1 + walk(children_right[node_id])
            return max(left_max, right_max)
        else: # leaf
            return 1

    root_node_id = 0
    return walk(root_node_id)

In [20]:
[dectree_max_depth(t.tree_) for t in trees.estimators_]

[59,
 51,
 50,
 53,
 51,
 47,
 43,
 50,
 50,
 43,
 49,
 47,
 47,
 48,
 48,
 47,
 57,
 51,
 54,
 56,
 51,
 55,
 56,
 48,
 49,
 47,
 51,
 48,
 51,
 46,
 51,
 41,
 43,
 49,
 46,
 53,
 52,
 46,
 59,
 50,
 53,
 61,
 46,
 46,
 48,
 47,
 53,
 50,
 49,
 49,
 49,
 53,
 49,
 46,
 54,
 50,
 49,
 51,
 54,
 47,
 58,
 45,
 47,
 47,
 49,
 52,
 64,
 48,
 45,
 49,
 54,
 45,
 50,
 51,
 49,
 51,
 58,
 45,
 46,
 50,
 44,
 49,
 53,
 45,
 53,
 58,
 47,
 49,
 54,
 45,
 46,
 55,
 51,
 48,
 54,
 51,
 45,
 46,
 50,
 46,
 46,
 46,
 55,
 52,
 56,
 53,
 53,
 44,
 49,
 57,
 49,
 54,
 50,
 42,
 49,
 51,
 53,
 51,
 51,
 53,
 52,
 48,
 52,
 53,
 53,
 53,
 48,
 52,
 52,
 48,
 52,
 54,
 50,
 47,
 48,
 54,
 45,
 51,
 44,
 45,
 55,
 50,
 43,
 63,
 52,
 55,
 46,
 48,
 49,
 49,
 48,
 47,
 49,
 52,
 45,
 50,
 47,
 46,
 49,
 48,
 49,
 61,
 54,
 59,
 50,
 56,
 44,
 47,
 48,
 50,
 49,
 49,
 55,
 53,
 54,
 63,
 55,
 49,
 47,
 48,
 49,
 58,
 53,
 48,
 55,
 54,
 59,
 54,
 50,
 54,
 47,
 48,
 51,
 50,
 52,
 45,
 44,
 46,
 54,
 51]