In [1]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn import svm
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import Imputer

def eval_tree_based_model_max_depth(clf, max_depth, X_train, y_train, X_test, y_test):
    """
    This function evaluates the given classifier (either a decision tree or random forest) at all of the 
    maximum tree depth parameters in the vector max_depth, using the given training and testing
    data. It returns two vector, with the training and testing classification errors.
    
    Inputs:
        clf: either a decision tree or random forest classifier object
        max_depth: a (T, ) vector of all the max_depth stopping condition parameters 
                            to test, where T is the number of parameters to test
        X_train: (N, D) matrix of training samples.
        y_train: (N, ) vector of training labels.
        X_test: (N, D) matrix of test samples
        y_test: (N, ) vector of test labels
    Output:
        train_err: (T, ) vector of classification errors on the training data
        test_err: (T, ) vector of classification errors on the test data
    """
    training_errors = []
    test_errors = []
    # evaluates the tree classifier for the desired number of max_tree_depth values values 
    for i in range(len(max_depth)):
        print(max_depth[i])
        clf.set_params(min_samples_leaf = max_depth[i])
        clf.fit(X_train, y_train)
        y_train_predict = clf.predict_proba(X_train)[:, 1]
        y_test_predict = clf.predict_proba(X_test)[:,1]
        
        training_errors.append(roc_auc_score(y_train, y_train_predict))
        test_errors.append(roc_auc_score( y_test, y_test_predict))
    return np.array(training_errors), np.array(test_errors)

def classification_err(y, real_y):
    """
    This function returns the classification error between two equally-sized vectors of 
    labels; this is the fraction of samples for which the labels differ.
    
    Inputs:
        y: (N, ) shaped array of predicted labels
        real_y: (N, ) shaped array of true labels
    Output:
        Scalar classification error
    """
    return sum(y != real_y)/len(y)



  from numpy.core.umath_tests import inner1d


In [2]:
data = np.loadtxt(open("train_2008.csv", "rb"), delimiter=",", skiprows=1)
test_data = np.loadtxt(open("test_2008.csv", "rb"), delimiter=",", skiprows=1)

# kf = KFold(n_splits = 10, shuffle = True) 

X = data[:, 3:382]
y = data[:, 382]
print(y)
X_test = test_data[:, 3:382]



[0. 0. 0. ... 0. 0. 0.]


In [3]:
sum(X[:, 10]<0)

64663

In [4]:
X[X < 0] = -1
print(X.shape)
print(X_test.shape)
bad_indices = []
real_bad_indices = []
for i in range(np.shape(X)[1]):
    num_less = X[:,i] < 0
    #bad_indices.append(float(sum(num_less))/float(len(num_less)))
    if sum(num_less)/len(num_less) == 1:
        real_bad_indices.append(i)
print(real_bad_indices)
# print(bad_indices[45])


X = np.delete(X, real_bad_indices, axis = 1)
np.delete(X_test, real_bad_indices,axis = 1)
print(X.shape)
print(X_test.shape)
imp = Imputer(missing_values=-1, strategy='mean')
X = imp.fit_transform(X)
imp = Imputer(missing_values=-1, strategy='mean')
X_test = imp.fit_transform(X_test)
print(X.shape)
print(X_test.shape)

(64667, 379)
(16000, 379)
[9, 11, 126, 127, 128, 132, 133, 134]
(64667, 379)
(16000, 379)
(64667, 371)
(16000, 366)


In [5]:
print(X.shape)
print(X[:, 10])
print(X[:, 8])

(64667, 371)
[39202259. 41152903. 30422918. ... 14715449. 35674287. 38686082.]
[16.          6.         15.         ... 16.         12.
 11.05314957]


In [6]:
#all_predictions = np.zeros(10)
# for train_index, test_index in kf.split(X):
#     # print("TRAIN:", train_index, "TEST:", test_index)
#     X_train, X_val = X[train_index], X[test_index]
#     y_train, y_val = y[train_index], y[test_index]
    
#     #fprint(np.shape(y_test))
#     scaler = StandardScaler()
#     # Fit on training set only.
#     scaler.fit(X_train)
#     # Apply transform to both the training set and the test set.
#     X_train = scaler.transform(X_train)
#     X_val = scaler.transform(X_val)
    
#     pca = PCA(n_components = 250)
#     pca.fit(X_train)
#     X_train = pca.transform(X_train)
#     X_val = pca.transform(X_val)
    
clf = RandomForestClassifier(n_estimators = 190, min_samples_leaf = 20)
#   clf = svm.SVC(probability = True)
#    clf.fit(X_train, y_train)
#     print(clf.classes_)
clf.fit(X, y)
    
#     predictions = clf.predict_proba(X_val)[:, 1]
#     print(sum(np.floor(2.0*predictions) != np.array(y_val))/len(predictions))
#     print(roc_auc_score(y_val, predictions))
#     importances = clf.feature_importances_
#     indices = np.argsort(importances)[::-1]
    # Print the feature ranking
    # print("Feature ranking:")
#     for f in range(X_train.shape[1]):
#         print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
#         #print(X_train[:, indices[f]])


   
#     max_depth = np.arange(10, 200, 20)
#     train_err, test_err = eval_tree_based_model_max_depth(clf, max_depth, X_train, 
#                                                             y_train, X_val, y_val)

#     plt.figure()
#     plt.plot(max_depth, test_err, label='Testing error')
#     plt.plot(max_depth, train_err, label='Training error')
#     plt.xlabel('Maximum Tree Depth')
#     plt.ylabel('Classification error')
#     plt.title('Decision Tree with Gini Impurity and Maximum Tree Depth')
#     plt.legend(loc=0, shadow=True, fontsize='x-large')
#     plt.show()

#     print('Test error minimized at max_depth =', np.argmax(test_err), max_depth[np.argmax(test_err)])

results = clf.predict_proba(X_test)


ValueError: Number of features of the model must match the input. Model n_features is 371 and input n_features is 366 

In [None]:
with open('output_1.csv', 'w') as output:
    for i in range(len(test_data)):
        output.write(str(test_data[i][0]) + ',' + str(results[i]) + '\n')

