In [7]:
import csv
import winsound  # standard library, but only available on Windows

import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold

In [2]:
# Load both CSV files (header row skipped) and transpose them, so that each
# row of the resulting array is one CSV column and each column is one voter.
train = np.loadtxt('train_2008.csv', skiprows = 1, delimiter = ',').transpose()
test = np.loadtxt('test_2008.csv', skiprows = 1, delimiter = ',').transpose()

# The last CSV column of the training file is the target; all others are features.
X_train, y_train = train[:-1], train[-1]
X_test = test

# Columns to drop before fitting:
#   * column 0 is the voter id: clearly no correlation with the target
#   * columns holding the same value for every voter: no correlation either
# To reduce computation time, we should find more columns to delete.
#
# Every sample of a column is compared against that column's first value.
# (Looping over range(len(X_train)) here would only inspect as many samples
# as there are feature columns, because X_train is transposed.)
constant_cols = [
    i for i in range(len(X_train))
    if np.all(X_train[i] == X_train[i][0])
]
stupid_cols = sorted({0, *constant_cols})

# Remove the same column indices from both sets, then transpose back to the
# (n_samples, n_features) layout.
X_train = np.delete(X_train, stupid_cols, 0).transpose()
X_test = np.delete(X_test, stupid_cols, 0).transpose()

In [4]:
# Inspect the loaded test matrix; it was transposed on load, so each printed
# row is one CSV column.
print(test)

[[0.0000e+00 1.0000e+00 2.0000e+00 ... 1.5997e+04 1.5998e+04 1.5999e+04]
 [1.1000e+01 1.1000e+01 1.1000e+01 ... 1.1000e+01 1.1000e+01 1.1000e+01]
 [2.0080e+03 2.0080e+03 2.0080e+03 ... 2.0080e+03 2.0080e+03 2.0080e+03]
 ...
 [0.0000e+00 0.0000e+00 0.0000e+00 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [0.0000e+00 0.0000e+00 0.0000e+00 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [0.0000e+00 0.0000e+00 0.0000e+00 ... 0.0000e+00 0.0000e+00 0.0000e+00]]


In [None]:
def write_sub(y_test, fname):
    '''
    Write predictions to a submission .csv file.

    y_test: ordered y values calculated for X_test
    fname: str for filename (including ".csv")

    The file starts with the header line "id,target", followed by one
    "<index>,<value rounded to 6 decimals>" line per entry of y_test.

    Entry i is written with id i, so it is VERY important that y_test
    be in the same order as X_test.
    '''
    # The context manager closes (and flushes) the file even when formatting
    # or writing one of the values raises.
    with open(fname, 'w') as f:
        f.write('id,target\n')
        for i, value in enumerate(y_test):
            f.write(str(i) + ',' + str(round(value, 6)) + '\n')
def squared_loss(y, real_y):
    """
    Mean of the squared differences between predictions and true labels.

    Inputs:
        y: (N, ) shaped array of predicted labels
        real_y: (N, ) shaped array of true labels
    Output:
        Sum of (y[i] - real_y[i])**2 over the N entries of y, divided by N
    """
    total = 0
    for idx, predicted in enumerate(y):
        residual = predicted - real_y[idx]
        total += residual ** 2

    return total / len(y)
        

In [None]:
def docrossval(clf, num_folds):
    '''
    K-fold cross-validation of clf on the notebook-level X_train / y_train.

    This is the best way to do it but it is so slow.
    Divides X_train into num_folds folds; each fold is used once as the
    validation set while clf is fit on the remaining folds.

    clf: estimator exposing fit(X, y) and predict(X); it is refit on every fold
    num_folds: number of folds passed to KFold

    returns (tr_err, val_err)
    tr_err: average training error for all folds
    val_err: average validation error for all folds
    '''
    kf = KFold(n_splits=num_folds)
    tr_err = 0.0   # running sum of the per-fold training errors
    val_err = 0.0  # running sum of the per-fold validation errors

    for train_index, test_index in kf.split(X_train):
        # Training and validation data points for this fold:
        x_tr, x_val = X_train[train_index], X_train[test_index]
        y_tr, y_val = y_train[train_index], y_train[test_index]

        clf.fit(x_tr, y_tr)
        tr_err += squared_loss(clf.predict(x_tr), y_tr)
        val_err += squared_loss(clf.predict(x_val), y_val)

    # Turn the accumulated sums into the per-fold averages promised above.
    return (tr_err / num_folds, val_err / num_folds)



# This is how we should do validation, but it takes forever

In [None]:
'''Do not run this block unless you want to wait a millenium'''
# Sweep min_samples_leaf, scoring every value with docrossval().
min_samps = [5, 10] # adjust this to run through multiple values of min_samples_leaf using docrossval()
num_folds = 10
train_error = []
test_error = []
for samp in min_samps:
    # A fresh forest per setting; the hyper-parameter goes in at construction.
    clf = RandomForestRegressor(n_estimators = 10, min_samples_leaf = samp)
    fold_tr_err, fold_val_err = docrossval(clf, num_folds)
    train_error.append(fold_tr_err)
    test_error.append(fold_val_err)

# Training curve in green, validation curve in the default colour.
plt.plot(min_samps, train_error, color = 'g', label = 'training')
plt.plot(min_samps, test_error, label = 'validation')
plt.xlabel('min_samples_leaf')
plt.ylabel('squared loss from docrossval')
plt.legend()
plt.show()

# Model 1

In [6]:
# Fit the first submission model on the full training set.
clf = RandomForestRegressor(n_estimators = 100, min_samples_leaf = 50)
clf.fit(X_train, y_train)

# A bare expression in the middle of a cell is never displayed, so the
# training loss is printed explicitly.
print('training squared loss:', squared_loss(clf.predict(X_train), y_train))

# Predict on the test set and write the submission file.
y = clf.predict(X_test)
write_sub(y, 'attempt1.csv')

KeyboardInterrupt: 

In [None]:
# Sweep min_samples_leaf on one fixed hold-out split (much cheaper than K-fold).
min_samps = [5 * i for i in range(1, 20)]
num_estimators = 100
val_fraction = .1 # the number here defines what percent of the data points are for validation

# Below we are creating a single training and validation set. It is built
# once, before the sweep, so that every min_samples_leaf value is scored on
# the same rows; the fixed seed makes the split repeatable across kernel
# restarts.
rng = np.random.RandomState(0)
split_index = int(len(X_train) * val_fraction)
p = rng.permutation(len(X_train))
shufX = X_train[p]
shufy = y_train[p]
x_val, x_tr = shufX[:split_index], shufX[split_index:]
y_val, y_tr = shufy[:split_index], shufy[split_index:]

tr_errs = []
val_errs = []

for samp in min_samps:
    clf = RandomForestRegressor(n_estimators = num_estimators, min_samples_leaf = samp)
    clf.fit(x_tr, y_tr)

    tr_errs.append(squared_loss(clf.predict(x_tr), y_tr))
    val_errs.append(squared_loss(clf.predict(x_val), y_val))

# Training curve in green, validation curve in the default colour.
plt.plot(min_samps, tr_errs, color = 'g', label = 'training')
plt.plot(min_samps, val_errs, label = 'validation')
plt.xlabel('min_samples_leaf')
plt.ylabel('squared loss')
plt.legend()
plt.show()