In [1]:
import numpy as np

train = np.genfromtxt('train_2008.csv', delimiter=',')
X_train = train[1:, :-1]
y_train = train[1:, -1]
X_test = np.genfromtxt('test_2008.csv', delimiter=',')[1:,:]

In [2]:
def classification_err(y, real_y):
    """
    This function returns the classification error between two equally-sized vectors of 
    labels; this is the fraction of samples for which the labels differ.
    
    Inputs:
        y: (N, ) shaped array of predicted labels
        real_y: (N, ) shaped array of true labels
    Output:
        Scalar classification error
    """
    misclassified = 0
    for i in range (len(y)):
        if y[i] != real_y[i]:
            misclassified += 1
    return misclassified/len(y)

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
num_folds = 10
kf = KFold(n_splits=num_folds)


for min_leaf in range (1000, 1500, 50):
    for depth in range (25, 65, 5):
        # Iterate through cross-validation folds:
        i = 1
        train_acc = 0
        test_acc = 0
        for train_index, test_index in kf.split(X_train):

            # Print out test indices:
            print('Fold ', i, ' of ', num_folds, ' test indices:', test_index)

            # Training and testing data points for this fold:
            x_train_subset, x_test_subset = X_train[train_index], X_train[test_index]
            y_train_subset, y_test_subset = y_train[train_index], y_train[test_index]



            trees_subset = RandomForestClassifier(n_estimators=200, 
                                       max_depth=depth, 
                                       min_samples_leaf = min_leaf)
            trees_subset.fit(x_train_subset, y_train_subset)
            y_subset_predict = trees_subset.predict(x_test_subset)
            y_train_subset_predict = trees_subset.predict(x_train_subset)
            test_acc += (1 - classification_err(y_subset_predict, y_tWe est_subset))
            train_acc += (1- classification_err(y_train_subset_predict, y_train_subset))
            i += 1
        print ("depth: ", depth)
        print ("min_leaf: ", min_leaf)
        print ("test acc: ", test_acc/num_folds)
        print ("train acc: ", train_acc/num_folds)

Fold  1  of  10  test indices: [   0    1    2 ... 6464 6465 6466]
Fold  2  of  10  test indices: [ 6467  6468  6469 ... 12931 12932 12933]
Fold  3  of  10  test indices: [12934 12935 12936 ... 19398 19399 19400]
Fold  4  of  10  test indices: [19401 19402 19403 ... 25865 25866 25867]
Fold  5  of  10  test indices: [25868 25869 25870 ... 32332 32333 32334]
Fold  6  of  10  test indices: [32335 32336 32337 ... 38799 38800 38801]
Fold  7  of  10  test indices: [38802 38803 38804 ... 45266 45267 45268]
Fold  8  of  10  test indices: [45269 45270 45271 ... 51732 51733 51734]
Fold  9  of  10  test indices: [51735 51736 51737 ... 58198 58199 58200]
Fold  10  of  10  test indices: [58201 58202 58203 ... 64664 64665 64666]
depth:  25
min_leaf:  1000
test acc:  0.7446299734582448
train acc:  0.7446525165986271
Fold  1  of  10  test indices: [   0    1    2 ... 6464 6465 6466]
Fold  2  of  10  test indices: [ 6467  6468  6469 ... 12931 12932 12933]
Fold  3  of  10  test indices: [12934 12935 129

Fold  2  of  10  test indices: [ 6467  6468  6469 ... 12931 12932 12933]
Fold  3  of  10  test indices: [12934 12935 12936 ... 19398 19399 19400]
Fold  4  of  10  test indices: [19401 19402 19403 ... 25865 25866 25867]
Fold  5  of  10  test indices: [25868 25869 25870 ... 32332 32333 32334]
Fold  6  of  10  test indices: [32335 32336 32337 ... 38799 38800 38801]
Fold  7  of  10  test indices: [38802 38803 38804 ... 45266 45267 45268]
Fold  8  of  10  test indices: [45269 45270 45271 ... 51732 51733 51734]
Fold  9  of  10  test indices: [51735 51736 51737 ... 58198 58199 58200]
Fold  10  of  10  test indices: [58201 58202 58203 ... 64664 64665 64666]
depth:  35
min_leaf:  1050
test acc:  0.7446299734582448
train acc:  0.7446387708941599
Fold  1  of  10  test indices: [   0    1    2 ... 6464 6465 6466]
Fold  2  of  10  test indices: [ 6467  6468  6469 ... 12931 12932 12933]
Fold  3  of  10  test indices: [12934 12935 12936 ... 19398 19399 19400]
Fold  4  of  10  test indices: [19401 194

Fold  3  of  10  test indices: [12934 12935 12936 ... 19398 19399 19400]
Fold  4  of  10  test indices: [19401 19402 19403 ... 25865 25866 25867]
Fold  5  of  10  test indices: [25868 25869 25870 ... 32332 32333 32334]
Fold  6  of  10  test indices: [32335 32336 32337 ... 38799 38800 38801]
Fold  7  of  10  test indices: [38802 38803 38804 ... 45266 45267 45268]
Fold  8  of  10  test indices: [45269 45270 45271 ... 51732 51733 51734]
Fold  9  of  10  test indices: [51735 51736 51737 ... 58198 58199 58200]
Fold  10  of  10  test indices: [58201 58202 58203 ... 64664 64665 64666]
depth:  45
min_leaf:  1100
test acc:  0.7446145103377871
train acc:  0.744614715911342
Fold  1  of  10  test indices: [   0    1    2 ... 6464 6465 6466]
Fold  2  of  10  test indices: [ 6467  6468  6469 ... 12931 12932 12933]
Fold  3  of  10  test indices: [12934 12935 12936 ... 19398 19399 19400]
Fold  4  of  10  test indices: [19401 19402 19403 ... 25865 25866 25867]
Fold  5  of  10  test indices: [25868 2586

Fold  4  of  10  test indices: [19401 19402 19403 ... 25865 25866 25867]
Fold  5  of  10  test indices: [25868 25869 25870 ... 32332 32333 32334]
Fold  6  of  10  test indices: [32335 32336 32337 ... 38799 38800 38801]
Fold  7  of  10  test indices: [38802 38803 38804 ... 45266 45267 45268]
Fold  8  of  10  test indices: [45269 45270 45271 ... 51732 51733 51734]
Fold  9  of  10  test indices: [51735 51736 51737 ... 58198 58199 58200]
Fold  10  of  10  test indices: [58201 58202 58203 ... 64664 64665 64666]
depth:  55
min_leaf:  1150
test acc:  0.7446145103377871
train acc:  0.744614715911342
Fold  1  of  10  test indices: [   0    1    2 ... 6464 6465 6466]
Fold  2  of  10  test indices: [ 6467  6468  6469 ... 12931 12932 12933]
Fold  3  of  10  test indices: [12934 12935 12936 ... 19398 19399 19400]
Fold  4  of  10  test indices: [19401 19402 19403 ... 25865 25866 25867]
Fold  5  of  10  test indices: [25868 25869 25870 ... 32332 32333 32334]
Fold  6  of  10  test indices: [32335 3233

KeyboardInterrupt: 

In [4]:
import numpy as np

## Importing the MNIST dataset using Keras
train = np.genfromtxt('train_2008.csv', delimiter=',')
X_train = train[1:, :-1]
y_train = train[1:, -1]
X_test = np.genfromtxt('test_2008.csv', delimiter=',')[1:,:]

from sklearn.ensemble import RandomForestClassifier

tree = RandomForestClassifier(n_estimators=10000, max_depth=55, min_samples_leaf=1)
tree.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=55, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10000, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [8]:

# print("model accuracy: ", tree.score(X_train, y_train))
print(1)
y_predict = tree.predict_proba(X_test)
print(y_predict)



1
[[0.5047     0.4953    ]
 [0.8465     0.1535    ]
 [0.82546287 0.17453713]
 ...
 [0.7144     0.2856    ]
 [0.9102     0.0898    ]
 [0.7383     0.2617    ]]


In [9]:
output = []
output.append(["id", "target"])
for i in range(len(y_predict)):
    #output.append([i, y_predict[i][0]])
    output.append([i, y_predict[i][1]])
    
import csv
with open('voter_pred_2008daniel2.csv', mode='w') as voter_file:
    voter_writer = csv.writer(voter_file, delimiter=",")
    for i in output:
        voter_writer.writerow(i)

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
num_folds = 10
kf = KFold(n_splits=num_folds)
y_tests = []
i = 1
for train_index, test_index in kf.split(X_train):
        # Print out test indices:
        print('Fold ', i, ' of ', num_folds, ' test indices:', test_index)

        # Training and testing data points for this fold:
        x_train_subset, x_test_subset = X_train[train_index], X_train[test_index]
        y_train_subset, y_test_subset = y_train[train_index], y_train[test_index]

        trees_subset = RandomForestClassifier(n_estimators=200, 
                                   max_depth=55, 
                                   min_samples_leaf = 1)
        trees_subset.fit(x_train_subset, y_train_subset)
        curr_y_test = trees_subset.predict_proba(X_test)
        y_tests.append(curr_y_test)
        i += 1
        

Fold  1  of  10  test indices: [   0    1    2 ... 6464 6465 6466]
Fold  2  of  10  test indices: [ 6467  6468  6469 ... 12931 12932 12933]
Fold  3  of  10  test indices: [12934 12935 12936 ... 19398 19399 19400]
Fold  4  of  10  test indices: [19401 19402 19403 ... 25865 25866 25867]
Fold  5  of  10  test indices: [25868 25869 25870 ... 32332 32333 32334]
Fold  6  of  10  test indices: [32335 32336 32337 ... 38799 38800 38801]
Fold  7  of  10  test indices: [38802 38803 38804 ... 45266 45267 45268]
Fold  8  of  10  test indices: [45269 45270 45271 ... 51732 51733 51734]
Fold  9  of  10  test indices: [51735 51736 51737 ... 58198 58199 58200]
Fold  10  of  10  test indices: [58201 58202 58203 ... 64664 64665 64666]


In [24]:
final_y_test = np.mean(y_tests, axis=0)
print(final_y_test)

[[0.501      0.499     ]
 [0.8425     0.1575    ]
 [0.82981447 0.17018553]
 ...
 [0.7125     0.2875    ]
 [0.905      0.095     ]
 [0.7175     0.2825    ]]


In [25]:
final_y_test[0][1]

0.499

In [27]:
output = []
output.append(["id", "target"])
for i in range(len(final_y_test)):
    #output.append([i, y_predict[i][0]])
    output.append([i, final_y_test[i][1]])
    
import csv
with open('voter_pred_2008daniel3.csv', mode='w') as voter_file:
    voter_writer = csv.writer(voter_file, delimiter=",")
    for i in output:
        voter_writer.writerow(i)