# Hedge Fund X
This notebook tackles Hedge Fund X's competition challenge: the Financial Modeling challenge.
It is used to evaluate models and tune parameters to find the most suitable model.
To explore the data set, see the other notebook.

In [38]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn import metrics
from xgboost import XGBClassifier

import datetime

In [6]:
# Read the competition files: the unlabelled test rows and the labelled training rows.
df_test = pd.read_csv("../input/hedge_fund_x/test.csv")
df = pd.read_csv("../input/hedge_fund_x/train.csv")
# Preview the first five training rows as the cell output.
df.head()

Unnamed: 0,data_id,period,c1,c2,c3,c4,c5,c6,c7,c8,...,c80,c81,c82,c83,c84,c85,c86,c87,c88,target
0,2,train1,0.65557,-2.2e-05,-0.000539,-0.001075,0.0,0.0,0.21339,0.0,...,-0.023358,-0.017041,0.0,0.060697,0.0,0.0,0.0,-0.000202,-0.14022,1
1,3,train1,1.64643,-0.000292,-0.008367,0.009497,0.0,0.0,0.0,0.0,...,-0.059429,-0.009109,0.0,0.021645,0.0,0.0,0.0,-0.004382,0.455767,0
2,5,train1,-0.74301,0.004642,-0.000647,-0.00329,0.0,0.0,0.0,0.0,...,0.001796,-0.000104,0.0,-0.024718,0.0,0.0,0.219566,0.072711,1.15558,0
3,7,train1,0.02977,-0.006343,-0.000635,-0.002516,0.0,0.0,0.160313,0.0,...,-0.005501,0.045308,0.0,-0.148852,0.0,0.0,0.0,-0.101181,-0.954553,0
4,10,train1,-0.660243,0.012591,-0.002098,-0.022264,0.0,0.0,0.0,0.0,...,0.029034,-0.005847,0.0,-0.007073,0.0,0.0,0.0,-0.004842,0.436002,0


In [7]:
# Keep only the rows of the first period; the 'period' column is dropped so the
# frame has the same columns as the test dataset (plus 'target').
is_train1 = df['period'] == 'train1'
train1_df = df[is_train1].drop('period', axis=1)

## Training
### Model definition
Define multiple models to test
_NOTE:_ Work in progress
_TODO:_ review all pipeline

In [8]:
# Baseline gradient-boosted classifier; only the three settings below are
# overridden, everything else keeps the library default.
clf_xgb = XGBClassifier(
    max_depth=7,
    n_estimators=150,
    reg_lambda=10)

### Parameter tuning

### Evaluate model
Compare the performance of the models.

In [31]:
def print_val_score(scores, label):
    """Print the mean and standard deviation of one cross-validation metric.

    Parameters
    ----------
    scores : mapping
        Maps a metric label to an array-like of per-fold values that exposes
        ``.mean()`` and ``.std()`` (e.g. the dict returned by ``cross_validate``).
    label : str
        Key to look up in ``scores``; it is also used as the printed prefix.
    """
    fold_values = scores[label]
    # print(...) with a single pre-formatted argument behaves the same as a
    # Python 2 print statement and as the Python 3 print function, so this
    # cell runs unchanged on either kernel.
    print("{}: {:.2f} (+/- {:.2f})".format(
        label,
        fold_values.mean(),
        fold_values.std()))
def evaluate_model(est, train_df):
    """Cross-validate ``est`` on ``train_df`` and print accuracy / log-loss summaries.

    Every column except ``target`` is used as a feature and ``target`` is the
    label. Cross-validation is run with ``cv=10`` and, for both the train and
    the test folds, the mean and standard deviation of each metric are printed.

    Parameters
    ----------
    est : estimator
        Passed to ``cross_validate`` as ``estimator``.
    train_df : pandas.DataFrame
        Must contain a ``target`` column.
    """
    # NOTE(review): only 'target' is excluded here, so an identifier column such
    # as data_id (if present in train_df) is fed to the model -- confirm intended.
    feature_cols = [name for name in train_df.columns if name != 'target']
    features = train_df[feature_cols].values
    labels = train_df['target'].values

    # The 'log_loss' entry uses the 'neg_log_loss' scorer, so the printed
    # log-loss figures are negated losses.
    metric_map = {'acc': 'accuracy', 'log_loss': 'neg_log_loss'}
    cv_scores = cross_validate(estimator=est, X=features, y=labels, cv=10,
                               scoring=metric_map, return_train_score=True)

    for key in ('train_acc', 'test_acc', 'train_log_loss', 'test_log_loss'):
        print_val_score(cv_scores, key)

### Baseline performance
We treat this as the baseline for our optimization: if we score higher than this, we are getting better.

In [33]:
# Cross-validate the baseline XGBoost classifier on the first training period.
evaluate_model(est=clf_xgb, train_df=train1_df)

train_acc: 0.92 (+/- 0.00)
test_acc: 0.82 (+/- 0.10)
train_log_loss: -0.33 (+/- 0.01)
test_log_loss: -0.44 (+/- 0.09)


### Use model to predict next period

In [76]:
# Rebuild the first-period frame, then fit the baseline on all of its rows.
train1_df = df[df['period'] == 'train1'].drop('period', axis=1)
# Features are every column of df other than 'period' and 'target'.
selected_cols = [col for col in df.columns if col != 'period' and col != 'target']
y_train = train1_df['target'].values
X_train = train1_df[selected_cols].values

clf_xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=7,
       min_child_weight=1, missing=None, n_estimators=150, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=10,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [78]:
def evaluate_period(train_df, est=None):
    """Score the rows of one period with an already fitted classifier.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Rows to score. Must contain a ``target`` column. The ``period`` and
        ``target`` columns are excluded from the features; every other column
        is used, in the frame's column order.
    est : fitted classifier, optional
        Object exposing ``predict`` and ``predict_proba``. When omitted, the
        notebook-level ``clf_xgb`` is used, so existing one-argument calls
        behave exactly as before.

    Returns
    -------
    tuple
        ``(y_pred, y_pred_proba, y_test)``: the predicted labels, column 1 of
        ``predict_proba``, and the values of the ``target`` column.
    """
    if est is None:
        # Resolved at call time so the function still follows whatever model
        # the notebook currently has bound to clf_xgb.
        est = clf_xgb

    selected_cols = [col for col in train_df.columns if col not in ['period', 'target']]
    X_test = train_df[selected_cols].values
    y_test = train_df['target'].values
    y_pred = est.predict(X_test)
    y_pred_proba = est.predict_proba(X_test)[:, 1]
    return y_pred, y_pred_proba, y_test

In [79]:
def print_pred_score(y_pred, y_pred_proba, y_test):
    """Print the accuracy and the log loss of a set of predictions.

    Parameters
    ----------
    y_pred : array-like
        Predicted labels, compared with ``y_test`` via ``metrics.accuracy_score``.
    y_pred_proba : array-like
        Predicted probabilities, compared with ``y_test`` via ``metrics.log_loss``.
    y_test : array-like
        True labels.
    """
    # print(...) with one pre-formatted argument is valid on both Python 2 and
    # Python 3. The message text is kept exactly as it was so the printed
    # output still matches previously recorded runs.
    print("Accuracy scrore: {}".format(metrics.accuracy_score(y_test, y_pred)))
    print("Log loss scrore: {}".format(metrics.log_loss(y_test, y_pred_proba)))

In [80]:
# Rows of the second period, with the 'period' column removed.
train2_df = df[df['period'] == 'train2'].drop('period', axis=1)

In [81]:
# Score the fitted model on the period it was trained on, then on the next period.
for period_df in (train1_df, train2_df):
    print_pred_score(*evaluate_period(period_df))

Accuracy scrore: 0.9168
Log loss scrore: 0.339864600846
Accuracy scrore: 0.498525
Log loss scrore: 0.751101043463


Naive XGBoost doesn't do well on the next period. It looks like the behaviour of the data differs from one period to another. Let's try the other periods.

In [82]:
# Slice out the remaining labelled periods, removing the 'period' column from each.
train3_df = df[df['period'] == 'train3'].drop('period', axis=1)
train4_df = df[df['period'] == 'train4'].drop('period', axis=1)
train5_df = df[df['period'] == 'train5'].drop('period', axis=1)

In [83]:
# Score the model fitted on train1 against each of the later periods in turn.
for period_df in (train3_df, train4_df, train5_df):
    print_pred_score(*evaluate_period(period_df))

Accuracy scrore: 0.48865
Log loss scrore: 0.77315410604
Accuracy scrore: 0.50055
Log loss scrore: 0.759808796312
Accuracy scrore: 0.50835
Log loss scrore: 0.750535000322


The accuracy score is about 0.5, which means we are no better than random guessing!

### Plot learning curves
Train the model multiple times on training sets of different sizes to check for overfitting or underfitting.

In [2]:
from sklearn.model_selection import learning_curve

def plot_learning_curve(X_train, y_train, est, title):
    """Plot, save and show a learning curve for ``est``.

    ``learning_curve`` is run with ``cv=10`` on ten training-set sizes spaced
    evenly between 10% and 100% of the data. The mean score per size is drawn
    for the training and validation folds, each with a shaded band of one
    standard deviation either side.

    Parameters
    ----------
    X_train : array-like
        Feature matrix passed to ``learning_curve`` as ``X``.
    y_train : array-like
        Labels passed to ``learning_curve`` as ``y``.
    est : estimator
        Estimator passed to ``learning_curve``.
    title : str
        Title of the figure.

    Side effects
    ------------
    Writes a timestamped PNG under ``./figures`` (the directory is created if
    it does not exist) and displays the figure.
    """
    # Local import keeps this cell self-contained without touching the
    # notebook's import cell.
    import os

    train_sizes, train_scores, test_scores = learning_curve(estimator=est,
                       X=X_train,
                       y=y_train,
                       train_sizes=np.linspace(0.1, 1.0, 10),
                       cv=10,
                       n_jobs=1)

    # Scores have one row per training size and one column per fold, so
    # axis=1 aggregates across folds.
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    plt.plot(train_sizes, train_mean,
             color='blue', marker='o',
             markersize=5, label='training accuracy')

    plt.fill_between(train_sizes,
                     train_mean + train_std,
                     train_mean - train_std,
                     alpha=0.15, color='blue')

    plt.plot(train_sizes, test_mean,
             color='green', linestyle='--',
             marker='s', markersize=5,
             label='validation accuracy')

    plt.fill_between(train_sizes,
                     test_mean + test_std,
                     test_mean - test_std,
                     alpha=0.15, color='green')

    plt.grid()
    plt.title(title)
    plt.xlabel('Number of training samples')
    plt.ylabel('Accuracy')
    plt.legend(loc='lower right')
    plt.ylim([0.2, 1.0])
    plt.tight_layout()

    # NOTE(review): the file name always says "xgboost", whatever `est` is.
    figure_path = './figures/learning_curve_xgboost_{:%Y%m%d-%H%M}.png'.format(datetime.datetime.now())
    # savefig does not create missing directories, so make sure the target
    # folder exists before writing (isdir/makedirs also works on Python 2).
    figure_dir = os.path.dirname(figure_path)
    if not os.path.isdir(figure_dir):
        os.makedirs(figure_dir)
    plt.savefig(figure_path, dpi=300)
    plt.show()

In [30]:
# Draw the learning curve of the XGBoost classifier on the current training matrices.
plot_learning_curve(X_train=X_train, y_train=y_train, est=clf_xgb, title='xgb')

<matplotlib.figure.Figure at 0x7f64fe86bd50>

## Prediction
Prepare the train & test data for the prediction model

In [24]:
# Build the final matrices from every row of df (all periods, not just one).
# Features are all columns of df except 'period' and 'target'.
selected_cols = [col for col in df.columns if col not in ['period', 'target']]
X_train = df[selected_cols].values
y_train = df['target'].values
# NOTE(review): the test frame is used as-is; this assumes its columns are the
# same as selected_cols and in the same order -- confirm against test.csv.
X_test = df_test.values
# The fitted model can only score the test set if both matrices have the same
# number of feature columns, so fail early if they do not.
assert X_train.shape[1] == X_test.shape[1], 'train/test feature count mismatch'
# print(...) with one pre-formatted argument runs on both Python 2 and 3.
# The second message used to be labelled "Train" although it reports X_test.
print('Train data shape: {}'.format(X_train.shape))
print('Test data shape: {}'.format(X_test.shape))

Train data shape: (560000, 89)
Train data shape: (361500, 89)


Fit the chosen model on all the training data (the code above only evaluated it with cross-validation) and make predictions.

In [25]:
# Refit the chosen estimator on the full training matrices.
chosen_est = clf_xgb
chosen_est.fit(X_train, y_train)

# Column 1 of predict_proba is what gets submitted for each test row.
test_proba = chosen_est.predict_proba(X_test)
predictions = test_proba[:, 1]

# Pair each prediction with its data_id and write a timestamped CSV to the
# current working directory.
submission = pd.DataFrame({'data_id': df_test['data_id'], 'target': predictions})
submission.to_csv(
    "submit_{:%Y%m%d-%H%M}.csv".format(datetime.datetime.now()),
    index=False)