In [44]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn import svm
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

def eval_tree_based_model_max_depth(clf, max_depth, X_train, y_train, X_test, y_test):
    """
    This function evaluates the given classifier (either a decision tree or random forest) at all of the 
    maximum tree depth parameters in the vector max_depth, using the given training and testing
    data. It returns two vector, with the training and testing classification errors.
    
    Inputs:
        clf: either a decision tree or random forest classifier object
        max_depth: a (T, ) vector of all the max_depth stopping condition parameters 
                            to test, where T is the number of parameters to test
        X_train: (N, D) matrix of training samples.
        y_train: (N, ) vector of training labels.
        X_test: (N, D) matrix of test samples
        y_test: (N, ) vector of test labels
    Output:
        train_err: (T, ) vector of classification errors on the training data
        test_err: (T, ) vector of classification errors on the test data
    """
    training_errors = []
    test_errors = []
    # evaluates the tree classifier for the desired number of max_tree_depth values values 
    for i in range(len(max_depth)):
        print(max_depth[i])
        clf.set_params(n_estimators = max_depth[i])
        clf.fit(X_train, y_train)
        y_train_predict = clf.predict_proba(X_train)[:, 1]
        y_test_predict = clf.predict_proba(X_test)[:,1]
        
        training_errors.append(roc_auc_score(y_train, y_train_predict))
        test_errors.append(roc_auc_score( y_test, y_test_predict))
    return np.array(training_errors), np.array(test_errors)

def classification_err(y, real_y):
    """
    This function returns the classification error between two equally-sized vectors of 
    labels; this is the fraction of samples for which the labels differ.
    
    Inputs:
        y: (N, ) shaped array of predicted labels
        real_y: (N, ) shaped array of true labels
    Output:
        Scalar classification error
    """
    return sum(y != real_y)/len(y)



In [57]:
data = np.loadtxt(open("train_2008.csv", "rb"), delimiter=",", skiprows=1)
test_data = np.loadtxt(open("test_2008.csv", "rb"), delimiter=",", skiprows=1)

kf = KFold(n_splits = 10, shuffle = True) 

X = data[:, 3:382]
y = data[:, 382]

X_test = data[:, 3:382]



In [54]:
bad_indices = []
for i in range(np.shape(X)[1]):
    num_less = X[:,i] < 0
    if sum(num_less)/len(num_less) > 0.2:
        bad_indices.append(i + 1)
print(len(bad_indices))
print(bad_indices)

# np.delete(X, bad_indices, axis = 1)

264
[7, 10, 11, 12, 23, 24, 25, 26, 38, 42, 48, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 164, 167, 168, 169, 170, 171, 172, 173, 174, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 212, 213, 214, 215, 216, 217, 218, 219, 221, 222, 224, 225, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 244, 245, 246, 247, 253, 264, 265, 266, 269, 270, 271, 272, 273, 274, 275, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 2

array([[  1., 201.,   0., ...,   0.,   0.,   0.],
       [  1., 201.,   0., ...,   0.,   0.,   0.],
       [  1.,   1.,   0., ...,   0.,   0.,   0.],
       ...,
       [  1.,   1.,   0., ...,   0.,   0.,   0.],
       [  1., 201.,   0., ...,   0.,   0.,   0.],
       [  1., 201.,   0., ...,   0.,   0.,   0.]])

In [55]:
all_predictions = np.zeros(10)
for train_index, test_index in kf.split(X):
    # print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_val = X[train_index], X[test_index]
    y_train, y_val = y[train_index], y[test_index]
    
#     #fprint(np.shape(y_test))
#     scaler = StandardScaler()
#     # Fit on training set only.
#     scaler.fit(X_train)
#     # Apply transform to both the training set and the test set.
#     X_train = scaler.transform(X_train)
#     X_val = scaler.transform(X_val)
    
#     pca = PCA(n_components = 250)
#     pca.fit(X_train)
#     X_train = pca.transform(X_train)
#     X_val = pca.transform(X_val)
    
    clf = RandomForestClassifier(min_samples_leaf = 101, n_estimators = 175)
#   clf = svm.SVC(probability = True)
    clf.fit(X_train, y_train)
#     print(clf.classes_)
    
    
    predictions = clf.predict_proba(X_val)[:, 1]
    print(sum(np.floor(2.0*predictions) != np.array(y_val))/len(predictions))
    print(roc_auc_score(y_val, predictions))
    importances = clf.feature_importances_
    indices = np.argsort(importances)[::-1]
    # Print the feature ranking
    print("Feature ranking:")
    for f in range(X_train.shape[1]):
        print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
   
#     max_depth = np.arange(80, 200, 10)
#     train_err, test_err = eval_tree_based_model_max_depth(clf, max_depth, X_train, 
#                                                             y_train, X_val, y_val)

#     plt.figure()
#     plt.plot(max_depth, test_err, label='Testing error')
#     plt.plot(max_depth, train_err, label='Training error')
#     plt.xlabel('Maximum Tree Depth')
#     plt.ylabel('Classification error')
#     plt.title('Decision Tree with Gini Impurity and Maximum Tree Depth')
#     plt.legend(loc=0, shadow=True, fontsize='x-large')
#     plt.show()

#    print('Test error minimized at max_depth = %i' % max_depth[np.argmax(test_err)])




0.2308643884335859
0.7765839864714341
0.23689500541209216
0.7791557897699353
0.2393691046853255
0.7649458586081999
0.24076078552651925
0.7538410838576104
0.23581258698005259
0.7518471156042908
0.23318385650224216
0.767855987913515
0.23766816143497757
0.7776027196976829
0.2466749149396845
0.7674700551677873
0.22827095576863593
0.7598086832753338
0.2262604392205382
0.7778255414710147
