In [1]:
import json, re
import pandas as pd
import numpy as np

from sklearn import preprocessing
from math import sqrt
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_curve, auc
from sklearn.feature_extraction import DictVectorizer as DV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import roc_auc_score as AUC

In [2]:
# Functions to load the dataset

import numpy as np

def read_data(file_name):
#     This function is taken from:
#     https://github.com/benhamner/BioResponse/blob/master/Benchmarks/csv_io.py

    f = open(file_name)
    #ignore header
    f.readline()
    samples = []
    target = []
    for line in f:
        line = line.strip().split(",")
        sample = [float(x) for x in line]
        samples.append(sample)
    return samples

def load():

# Convenience function to load all data as numpy arrays.

    print "Loading data..."
    filename_train = 'Train_Vectorizer.csv'
    filename_test = 'Test_Vectorizer.csv'

    train = read_data("Train_Vectorizer.csv")
    y_train = np.array([x[0] for x in train])
    X_train = np.array([x[1:] for x in train])
    X_test = np.array(read_data("Test_Vectorizer.csv"))
    print "Finished loading"
    return X_train, y_train, X_test

if __name__ == '__main__':

    X_train, y_train, X_test = load()

Loading data...
Finished loading


In [79]:
"""
Blending {RandomForests, ExtraTrees, GradientBoosting} + stretching to
[0,1]. The blending scheme is related to the idea Jose H. Solorzano
presented here:
http://www.kaggle.com/c/bioresponse/forums/t/1889/question-about-the-process-of-ensemble-learning/10950#post10950
'''You can try this: In one of the 5 folds, train the models, then use
the results of the models as 'variables' in logistic regression over
the validation data of that fold'''. Or at least this is the
implementation of my understanding of that idea :-)
The predictions are saved in test.csv. 
Note: if you increase the number of estimators of the classifiers,
e.g. n_estimators=1000, you get a better score/rank on the private
test set.
Originally published on Github by Emanuele Olivetti.
"""

from __future__ import division
import numpy as np
from sklearn.cross_validation import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

def logloss(attempt, actual, epsilon=1.0e-15):
    """log loss function
    """
    attempt = np.clip(attempt, epsilon, 1.0-epsilon)
    return - np.mean(actual * np.log(attempt) + (1.0 - actual) * np.log(1.0 - attempt))


if __name__ == '__main__':

    np.random.seed(0) # seed to shuffle the train set

    n_folds = 10
    verbose = True
    shuffle = False

    X, y, X_submission = load()

    if shuffle:
        idx = np.random.permutation(y.size)
        X = X[idx]
        y = y[idx]

    skf = list(StratifiedKFold(y, n_folds))

    clfs = [RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini')]
#             RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
#             ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
#             ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
#             GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50)]

    print "Creating train and test sets for blending and predicting."
    
    dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
    dataset_blend_test = np.zeros((X_submission.shape[0], len(clfs)))
    
    for j, clf in enumerate(clfs):
        print j, clf
        dataset_blend_test_j = np.zeros((X_submission.shape[0], len(skf)))
        for i, (train, test) in enumerate(skf):
            print "Fold", i
            X_train = X[train]
            y_train = y[train]
            X_test = X[test]
            y_test = y[test]
            
            
            clf.fit(X_train, y_train)
            y_submission = clf.predict_proba(X_test)[:,1]

#     print "Saving Results."
#     np.savetxt(fname='test.csv', X=y_submission, fmt='%0.9f')

Loading data...
Finished loading
Creating train and test sets for blending and predicting.
0 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Fold 8
Fold 9


In [2]:
from sklearn.ensemble import RandomForestClassifier
import scipy

# Basic CSV IO

def read_data(file_name):
    f = open(file_name)
    #ignore header
    f.readline()
    samples = []
    target = []
    for line in f:
        line = line.strip().split(",")
        sample = [float(x) for x in line]
        samples.append(sample)
    return samples

def write_delimited_file(file_path, data,header=None, delimiter=","):
    f_out = open(file_path,"w")
    if header is not None:
        f_out.write(delimiter.join(header) + "\n")
    for line in data:
        if isinstance(line, str):
            f_out.write(line + "\n")
        else:
            f_out.write(delimiter.join(line) + "\n")
    f_out.close()

In [3]:
train = read_data("Train_Vectorizer.csv")

y = [x[0] for x in train]
X = [x[1:] for x in train]

test = read_data("Train_Vectorizer.csv")

# rf = RandomForestClassifier(n_estimators=100, min_samples_leaf=1)
# rf.fit(train[:300], target[:300])

#     predicted_probs = rf.predict_proba(test)
#     predicted_probs = ["%f" % x[1] for x in predicted_probs]
#     write_delimited_file("rf_benchmark.csv",
#                                 predicted_probs)

In [4]:
X = np.asarray(X)
# test = np.array(test)
y = np.asarray(y)

In [5]:
from sklearn.cross_validation import KFold

def run_cv(X,y,clf_class,**kwargs):
    # Construct a kfolds object
    kf = KFold(len(y),n_folds=5,shuffle=True)
    y_pred = y.copy()
    
    # Iterate through folds
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        # Initialize a classifier with key word arguments
        clf = clf_class(**kwargs)
        clf.fit(X_train,y_train)
        y_pred[test_index] = clf.predict(X_test)
    return y_pred

In [8]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RF

def accuracy(y_true,y_pred):
    # NumPy interprets True and False as 1. and 0.
    return np.mean(y_true == y_pred)

# print "Support vector machines:"
# print "%.3f" % accuracy(y, run_cv(X,y,SVC))
print "Random forest:"
print "%.3f" % accuracy(y, run_cv(X,y,RF))

Random forest:
0.339


In [57]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.ensemble import ExtraTreesRegressor as ETR

def accuracy(y_true,y_pred):
    # NumPy interprets True and False as 1. and 0.
    return np.mean(y_true == y_pred)

# print "Support vector machines:"
# print "%.3f" % accuracy(y, run_cv(X,y,SVC))
print "Exra Trees Regressor:"
print "%.3f" % accuracy(y, run_cv(X,y, ETR))

Exra Trees Regressor:
0.142


In [6]:
def run_prob_cv(X, y, clf_class, **kwargs):
    
    kf = KFold(len(y), n_folds=5, shuffle=True)
    y_prob = np.zeros((len(y),2))
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        clf = clf_class(**kwargs)
        clf.fit(X_train,y_train)
        # Predict probabilities, not classes
        y_prob[test_index] = clf.predict_proba(X_test)
        
    return y_prob

In [65]:
from sklearn.ensemble import ExtraTreesClassifier

ETCclf = ExtraTreesClassifier()
ETCclf = ETCclf.fit(X, y)

ETCclf.feature_importances_

array([  4.82786590e-01,   2.06205856e-01,   3.72183982e-02,
         1.79121330e-01,   0.00000000e+00,   1.97616675e-04,
         1.53880316e-04,   1.42607518e-03,   5.16060819e-04,
         7.97389748e-04,   1.69738535e-03,   8.01689892e-04,
         1.08577640e-04,   1.37059431e-03,   5.92559545e-04,
         8.96050786e-05,   6.22377319e-04,   1.58737557e-03,
         1.35469276e-03,   4.76317872e-05,   9.42847523e-04,
         3.23210704e-03,   2.59393996e-03,   6.86137260e-04,
         8.40160552e-04,   5.52449883e-03,   2.34483461e-03,
         2.81611381e-04,   1.29975447e-03,   6.03047879e-03,
         4.68549659e-04,   1.83753720e-06,   6.25498416e-04,
         1.11743497e-03,   1.05039715e-03,   1.74484240e-03,
         1.32522512e-03,   8.37641047e-04,   2.05878749e-03,
         3.46961004e-03,   3.94630216e-04,   1.16264367e-04,
         2.85570128e-03,   1.26299707e-04,   7.29255053e-04,
         6.45614855e-04,   1.64138688e-03,   2.71505506e-04,
         4.64590654e-03,

In [104]:
from treeinterpreter import treeinterpreter as ti
from sklearn.feature_selection import SelectFromModel

model = SelectFromModel(ETCclf, prefit=True)
X_new = model.transform(X)
X_new.shape               

(647054, 4)

In [117]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RF

def accuracy(y_true,y_pred):
    # NumPy interprets True and False as 1. and 0.
    return np.mean(y_true == y_pred)

# print "Support vector machines:"
# print "%.3f" % accuracy(y, run_cv(X,y,SVC))
print "Random forest:"
print "%.3f" % accuracy(y, run_cv(X_new,y,RF))

Random forest:
0.314


In [31]:
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.40, random_state = 1234)
rf = RandomForestRegressor(n_estimators=10, max_depth=1000, min_samples_leaf=1, min_samples_split=2, n_jobs=-1)

rf.fit(X, y)
rf_test_mse = mean_squared_error(y_test, rf.predict(X_test))
rf_train_mse = mean_squared_error(y_train, rf.predict(X_train))

print "train MSE, %.4f, test MSE: %.4f" % (train_mse, test_mse)

train MSE, 1089.1301, test MSE: 1077.8334


In [27]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# fit estimator
est = GradientBoostingClassifier(n_estimators=100, max_depth=3)
est.fit(X_train, y_train)

# predict class labels
pred = est.predict(X_test)

# score on test data (accuracy)
acc = est.score(X_test, y_test)
print 'ACC: %.4f' % acc

In [None]:
from sklearn.ensemble import ExtraTreesClassifier


# Build a forest and compute the feature importances
forest = ExtraTreesClassifier(n_estimators=250,
                              random_state=1234)

forest.fit(X, y)
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
       color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()



# # Plot the feature importances of the forest
# plt.figure()
# plt.title("Feature importances")
# plt.bar(range(10), importances[indices][0:10],
#        color="r", align="center", yerr=std[indices][0:10])
# plt.xticks(range(10), indices)
# plt.xlim([-1, 10])
# plt.show()

In [28]:
import numpy as np

def plot_data(figsize=(8, 5)):
    fig = plt.figure(figsize=figsize)
    gt = plt.plot(x_plot, ground_truth(x_plot), alpha=0.4, label='ground truth')

    # plot training and testing data
    plt.scatter(X_train, y_train, s=10, alpha=0.4)
    plt.scatter(X_test, y_test, s=10, alpha=0.4, color='red')
    plt.xlim((0, 10))
    plt.ylabel('y')
    plt.xlabel('x')