In [64]:
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
import xgboost as xgb

# train_test_split is taken from sklearn.model_selection, the same module that
# provides GridSearchCV; the older sklearn.cross_validation module was
# deprecated in scikit-learn 0.18 and removed in 0.20.
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Base seed for the random train/validation splits drawn in cross_validate.
SEED = 42

In [65]:
def cross_validate(x, y, k, model):
    ''' Averages the model's score over k random hold-out splits.

    Every round draws a fresh random split that holds out 1/k of the rows
    (seeded with i * SEED so repeated runs give the same splits), refits
    `model` on the remaining rows and scores it on the held-out rows.
    The splits are drawn independently of each other, so held-out sets can
    overlap between rounds: this is repeated random sub-sampling rather than
    a strict partition of the data into k folds.

    Args:
        x: features, in any form accepted by train_test_split and model.fit.
        y: targets aligned with x.
        k: number of rounds; 1/k of the data is held out in each. Must be >= 2.
        model: estimator exposing fit(x, y) and score(x, y). It is refit in
            place each round and is left fitted on the last training split.

    Returns:
        The mean of model.score over the k held-out sets. The value is also
        printed. (For scikit-learn style classifiers score is normally mean
        accuracy, i.e. higher is better - it is not an error rate.)

    Raises:
        ValueError: if k is smaller than 2.
    '''
    if k < 2:
        raise ValueError('k must be at least 2, got %r' % (k,))

    holdout_fraction = 1.0 / k
    total_score = 0.
    for i in range(k):
        # Split the data into a training part and a held-out validation part.
        x_train, x_cv, y_train, y_cv = train_test_split(
            x, y, test_size=holdout_fraction, random_state=i * SEED)

        model.fit(x_train, y_train)

        # Score on the held-out rows only; model.score runs the prediction
        # itself, so no separate predict call is needed.
        total_score += model.score(x_cv, y_cv)

    mean_score = total_score / k
    print(mean_score)
    return mean_score

In [66]:
def fillEmptys(data_frame):
    ''' Fills all missing values in the data with the mean of the respective feature.

    The frame is modified in place. Missing cells are located with a boolean
    mask and written through .loc, so the fill works for any row index (not
    only the default 0..n-1 labels) and does not rely on chained indexing.
    The column mean is computed once per column; pandas skips missing values
    when averaging, so it is the mean of the values that are present.

    Args:
        data_frame: pandas DataFrame to fill. Columns that contain missing
            values must support .mean().

    Returns:
        The number of cells that were missing (and therefore filled).
    '''
    filled = 0
    for feature in data_frame:
        column = data_frame[feature]
        missing = column.isnull()
        n_missing = int(missing.sum())
        if n_missing == 0:
            # Nothing to fill; also avoids calling mean() on columns that
            # do not need it.
            continue
        data_frame.loc[missing, feature] = column.mean()
        filled += n_missing
    return filled

In [67]:
def plot_feature(id, feature, title):
    ''' Shows a titled scatter plot of `feature` (y axis) against `id` (x axis). '''
    # Draw on the current pyplot axes, then display the figure.
    axes = plt.gca()
    axes.scatter(id, feature)
    axes.set_title(title)
    plt.show()

In [68]:
def optimize_xgb(train):
    ''' Grid-searches XGBClassifier hyper parameters with 5-fold cross validation.

    Args:
        train: DataFrame holding the features plus an 'id' column (dropped
            before fitting) and the target column 'Y'.

    Returns:
        The best estimator found by the search. The best parameters and the
        best cross-validated score are printed as well.
    '''
    # param_grid is a list of single-key dicts, so GridSearchCV explores each
    # grid on its own: one hyper parameter is varied at a time while the others
    # stay at the XGBClassifier defaults (this is not a joint search).
    # max_depth and n_estimators start at 1: a zero-round ensemble has no trees,
    # and XGBoost treats max_depth=0 as "no depth limit" rather than a shallow
    # tree (see the XGBoost parameter reference).
    tune_params = [
        {'max_depth': list(range(1, 40))},
        {'learning_rate': list(np.linspace(0.01, 0.05, 5))},
        {'n_estimators': list(range(1, 300))},
        {'gamma': list(np.linspace(0.0, 0.1, 10))},
    ]

    features = train.drop(['id', 'Y'], axis=1)
    gs = GridSearchCV(xgb.XGBClassifier(), tune_params, cv=5)
    gs.fit(features, train['Y'])

    print(gs.best_params_)
    print(gs.best_score_)
    return gs.best_estimator_

In [None]:
# Read the training and test tables.
train = pd.read_csv('/Users/anthonydepalatis/Documents/Workspace/school/MidtermProject379K/data/train.csv')
test = pd.read_csv('/Users/anthonydepalatis/Documents/Workspace/school/MidtermProject379K/data/test.csv')

# Optional visual check of every feature against the row id:
#for feature in train:
#    if feature != 'id':
#        plot_feature(train['id'], train[feature], feature)

# Model inputs: every column except the row id and the target 'Y'.
train_features = train.drop(['id', 'Y'], axis=1)
train_labels = train['Y']

# Report the cross-validated score of the chosen configuration.
gbm = xgb.XGBClassifier(max_depth=20, n_estimators=78)
cross_validate(train_features, train_labels, 5, gbm)

# Optimize hyper parameters.
#gbm = optimize_xgb(train)

# Make final predictions: refit on the full training table, then keep the
# second column of predict_proba for every test row.
gbm.fit(train_features, train_labels)
test_features = test.drop(['id'], axis=1)
test_preds = gbm.predict_proba(test_features)[:, 1]
submission = pd.DataFrame({"Id": test['id'], "Y": test_preds})
submission.to_csv(
    '/Users/anthonydepalatis/Documents/Workspace/school/MidtermProject379K/submission.csv',
    index=False,
)