In [50]:
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
import xgboost as xgb

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

SEED = 42

In [51]:
def cross_validate(x, y, k, model):
    ''' Estimates a model's hold-out score by averaging over k random splits.

    Each round holds out a random 1/k fraction of the data (seeded with
    i * SEED so runs are repeatable), fits `model` on the remainder and scores
    it on the held-out rows. The splits are drawn independently of each other,
    so hold-out sets may overlap: this is repeated random sub-sampling rather
    than a strict k-fold partition.

    Args:
        x: Feature matrix accepted by train_test_split and model.fit.
        y: Labels aligned with the rows of x.
        k: Number of rounds; also fixes the hold-out fraction at 1/k.
        model: Estimator exposing fit(x, y) and score(x, y). It is refit on
            every round, so it is left fitted on the last training split.

    Returns:
        The mean of model.score over the k rounds. The value is also printed,
        as before.
    '''
    holdout_fraction = float(1) / k
    total_score = 0.
    for i in range(k):
        # Split the data into training and hold-out sets for this round.
        x_train, x_cv, y_train, y_cv = train_test_split(
            x, y, test_size=holdout_fraction, random_state=i*SEED)

        model.fit(x_train, y_train)

        # model.score is whatever metric the estimator defines (presumably mean
        # accuracy for the classifiers used in this notebook), i.e. a score,
        # not an error. No separate predict() call is needed to obtain it.
        total_score += model.score(x_cv, y_cv)
    mean_score = total_score / k
    # Single-argument call form prints identically under Python 2 and 3.
    print(mean_score)
    return mean_score

In [52]:
def plot_features(data):
    ''' Shows one scatter plot per entry of `data`, each plotted against data['id'].

    Args:
        data: Container that yields its keys when iterated and supports
            data[key] lookup (presumably a pandas DataFrame with an 'id'
            column -- confirm against the caller).
    '''
    # 'id' is the x-axis of every plot, so it is never plotted against itself.
    names_to_plot = [name for name in data if name != 'id']
    for name in names_to_plot:
        plt.scatter(data['id'], data[name])
        plt.title(name)
        plt.show()

In [53]:
def optimize_xgb(train):
    ''' Finds the optimal hyperparameters for an XGB model.

    WARNING: EXTREMELY LONG RUN-TIME

    The parameter grid is a list of single-key dictionaries, which
    GridSearchCV explores one dictionary at a time: each hyperparameter is
    swept on its own while the others stay at the XGBClassifier defaults,
    rather than searching the full cross-product.

    Args:
        train: DataFrame containing the feature columns plus 'id' and the
            label column 'Y'.

    Returns:
        The best estimator found by the search. The best parameters and best
        cross-validated score are printed.
    '''
    # Both integer sweeps start at 1: a candidate with zero boosting rounds or
    # zero tree depth is degenerate and only wastes cross-validation fits.
    tune_params = [{'max_depth': list(range(1, 40))},
                   {'learning_rate': list(np.linspace(0.01, 0.05, 5))},
                   {'n_estimators': list(range(1, 300))},
                   {'gamma': list(np.linspace(0.0, 0.1, 10))}]
    features = train.drop(['id', 'Y'], axis=1)
    gs = GridSearchCV(xgb.XGBClassifier(), tune_params, cv=5)
    gs.fit(features, train['Y'])
    # Single-argument call form prints identically under Python 2 and 3.
    print(gs.best_params_)
    print(gs.best_score_)
    return gs.best_estimator_

In [54]:
def encode_features(train, test, features):
    ''' One-hot encodes the named columns of the training and testing data.

    The encoder is fitted on train and test stacked together (with missing
    values replaced by the combined column means) so that category values
    appearing in either set are known to it, then applied to each set
    separately.

    Args:
        train: DataFrame of training features.
        test: DataFrame of testing features. Column positions are looked up in
            `train` only, so this presumably needs the same columns in the same
            order -- confirm against the caller.
        features: Names of the columns in `train` to one-hot encode. The list
            is not modified.

    Returns:
        (train_encoded, test_encoded) as produced by the encoder's transform.
        NOTE(review): train and test are transformed as passed in, without the
        mean fill used for fitting; the caller appears to impute beforehand.
    '''
    # Resolve names to positions in a fresh list. Overwriting `features` in
    # place would leave the caller holding integers and break a second call.
    categorical_positions = [train.columns.get_loc(name) for name in features]

    enc = OneHotEncoder(categorical_features=categorical_positions, handle_unknown='ignore')
    # Build the stacked frame once instead of once for fillna and once for mean.
    combined = pd.concat([train, test])
    enc.fit(combined.fillna(combined.mean()))

    train_encoded = enc.transform(train)
    test_encoded = enc.transform(test)
    return train_encoded, test_encoded

In [55]:
def stack_model(model, train, test, new_feature):
    ''' Stacks the predictions from the given model onto the given training and testing data as 'new_feature'.

    The model is fitted on every column of `train` except 'id' and 'Y', and
    the second column of its predict_proba output is stored as a new column on
    both frames.

    Args:
        model: Estimator exposing fit(X, y) and predict_proba(X).
        train: DataFrame with feature columns plus 'id' and the label 'Y'.
            Modified in place.
        test: DataFrame with the same feature columns plus 'id'. Modified in
            place.
        new_feature: Name of the column that receives the predictions.

    Returns:
        The same (train, test) objects that were passed in, each with the new
        column added.

    NOTE(review): the training-side values are predictions for the very rows
    the model was fitted on, so they are in-sample; out-of-fold predictions
    may be preferable for stacking.
    '''
    train_features = train.drop(['id', 'Y'], axis=1)
    test_features = test.drop(['id'], axis=1)

    model.fit(train_features, train['Y'])

    # Assign the raw arrays. Wrapping them in a DataFrame first gives them a
    # fresh 0..n-1 index, and column assignment then aligns on index labels,
    # producing NaN or misplaced values whenever the frame's index is not 0..n-1.
    train[new_feature] = model.predict_proba(train_features)[:, 1]
    test[new_feature] = model.predict_proba(test_features)[:, 1]
    return train, test

In [59]:
# Single place to repoint the notebook at another checkout of the project.
PROJECT_DIR = '/Users/anthonydepalatis/Documents/Workspace/school/MidtermProject379K'

train = pd.read_csv(PROJECT_DIR + '/data/train.csv')
test = pd.read_csv(PROJECT_DIR + '/data/test.csv')

# Fill all empty cells with the mean of their column (each file uses its own means).
train = train.fillna(train.mean())
test = test.fillna(test.mean())

# Separate labels, features, and indices
x_train = train.drop(['id', 'Y'], axis=1)
x_test = test.drop(['id'], axis=1)
Y = train['Y']

# One-hot encode the categorical feature columns.
x_train, x_test = encode_features(x_train, x_test, ['F1', 'F4', 'F5', 'F7', 'F8', 'F12', 'F13', 'F15', 'F17', 'F20', 'F24'])

# Baselines with default hyper params. cross_validate prints the mean of
# model.score over the hold-out splits (the estimator's own score metric, not AUC).

# xgb baseline.
gbm = xgb.XGBClassifier()
cross_validate(x_train, Y, 5, gbm)

# Logistic regression baseline.
clf = LogisticRegression()
cross_validate(x_train, Y, 5, clf)

# Random forest baseline.
clf = RandomForestClassifier()
cross_validate(x_train, Y, 5, clf)

# --- Stacking experiments (currently disabled) ---

# Stack xgb with default hyper params.
#print('\n------Beginning parameter stacking...------\n')
#train, test = stack_model(gbm, train, test, 'xgb1')
#cross_validate(train.drop(['id', 'Y'], axis=1), train['Y'], 5, xgb.XGBClassifier())

# Stack xgb with optimized n_estimators.
#gbm = xgb.XGBClassifier(n_estimators=78)
#train, test = stack_model(gbm, train, test, 'xgb2')

# Stack log reg with default hyper params.
#train, test = stack_model(LogisticRegression(), train, test, 'log_reg1')
#cross_validate(train.drop(['id', 'Y'], axis=1), Y, 5, xgb.XGBClassifier())

# Stack random forest with default hyper params.
#train, test = stack_model(RandomForestClassifier(), train, test, 'rand1')
#cross_validate(train.drop(['id', 'Y'], axis=1), Y, 5, xgb.XGBClassifier())

# Initialize final model.
gbm = xgb.XGBClassifier()
# Single-argument call form prints identically under Python 2 and 3.
print('\nFinal model\'s score:')
cross_validate(x_train, Y, 5, gbm)

# Make final predictions: refit on all training rows, then write the
# positive-class probability for every test row.
gbm.fit(x_train, Y)
test_preds = gbm.predict_proba(x_test)[:,1]
submission = pd.DataFrame({"Id":test['id'], "Y":test_preds})
submission.to_csv(PROJECT_DIR + '/submission.csv', index=False)

0.93684
0.93414
0.93552

Final model's score:
0.93684
