# Hedge Fund X
This notebook tackles Hedge Fund X's competition challenge: the Financial Modeling challenge.
It is used to evaluate models and tune their parameters in order to find the most suitable model.
For data-set exploration, see the other notebook.

In [360]:
%matplotlib inline
import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer, StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [361]:
# Load the competition train and test sets; paths are relative to the notebook.
train_csv_path = "../input/hedge_fund_x/train.csv"
test_csv_path = "../input/hedge_fund_x/test.csv"
df = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)
# Preview the first five training rows (head() defaults to n=5).
df.head()

Unnamed: 0,data_id,period,c1,c2,c3,c4,c5,c6,c7,c8,...,c80,c81,c82,c83,c84,c85,c86,c87,c88,target
0,2,train1,0.65557,-2.2e-05,-0.000539,-0.001075,0.0,0.0,0.21339,0.0,...,-0.023358,-0.017041,0.0,0.060697,0.0,0.0,0.0,-0.000202,-0.14022,1
1,3,train1,1.64643,-0.000292,-0.008367,0.009497,0.0,0.0,0.0,0.0,...,-0.059429,-0.009109,0.0,0.021645,0.0,0.0,0.0,-0.004382,0.455767,0
2,5,train1,-0.74301,0.004642,-0.000647,-0.00329,0.0,0.0,0.0,0.0,...,0.001796,-0.000104,0.0,-0.024718,0.0,0.0,0.219566,0.072711,1.15558,0
3,7,train1,0.02977,-0.006343,-0.000635,-0.002516,0.0,0.0,0.160313,0.0,...,-0.005501,0.045308,0.0,-0.148852,0.0,0.0,0.0,-0.101181,-0.954553,0
4,10,train1,-0.660243,0.012591,-0.002098,-0.022264,0.0,0.0,0.0,0.0,...,0.029034,-0.005847,0.0,-0.007073,0.0,0.0,0.0,-0.004842,0.436002,0


## Training
### Model definition
Define multiple models to test.
_NOTE:_ Work in progress.
_TODO:_ review the whole pipeline.

## Remove noise
Some columns contain a lot of noise, so the models are trained with those columns removed.

In [362]:
# Find the columns that contain exact zeros and mark them for exclusion.
# NOTE(review): `train1_df` is not defined in the visible part of this notebook;
# it presumably comes from an earlier session or a removed cell -- confirm and
# define it explicitly so the notebook survives Restart & Run All.
is_zero = train1_df == 0.0
zero_count = is_zero.sum(axis=0).sort_values(ascending=False)
# Fraction of zero rows per column, restricted to columns with at least one zero.
n_rows = train1_df.shape[0]
has_zero_ratio = zero_count[zero_count > 0].div(n_rows)
# No threshold is applied to the ratio: every column with any zero is excluded.
# In the recorded output this also includes 'target'.
excluded_cols = has_zero_ratio.index
print(excluded_cols.values)

['c8' 'c37' 'c34' 'c45' 'c23' 'c18' 'c6' 'c62' 'c28' 'c85' 'c46' 'c60'
 'c55' 'c65' 'c5' 'c32' 'c41' 'c54' 'c86' 'c70' 'c63' 'c82' 'c7' 'c19'
 'c58' 'c52' 'c17' 'c10' 'c13' 'c84' 'c75' 'c42' 'target']


### Predict period

In [363]:
# Multi-class XGBoost classifier used below to predict the `period` label.
clf_xgb_period = XGBClassifier(
    objective='multi:softmax',
    max_depth=7,
    n_estimators=150,
    reg_lambda=100,
    nthread=10,
)

In [364]:
# Train on the full data set. `train_df` is an alias of `df`, not a copy.
# For faster experiments, a subsample (the first 1000 rows of each 40000-row
# chunk) can be used instead:
#   train_df = pd.concat([df[x*40000: x*40000 + 1000] for x in range(0,14)])
train_df = df
print(train_df.shape)

(560000, 91)


In [390]:
# Columns to leave out of the feature matrices: everything in `excluded_cols`
# plus the identifier and the period label itself.
# Built in one expression (instead of appending to an existing list) so the
# cell works on a fresh kernel and re-running it does not add duplicates.
excluded_cols_period = list(excluded_cols) + ['period', 'data_id']
print(excluded_cols_period)

['c8', 'c37', 'c34', 'c45', 'c23', 'c18', 'c6', 'c62', 'c28', 'c85', 'c46', 'c60', 'c55', 'c65', 'c5', 'c32', 'c41', 'c54', 'c86', 'c70', 'c63', 'c82', 'c7', 'c19', 'c58', 'c52', 'c17', 'c10', 'c13', 'c84', 'c75', 'c42', 'target', 'period', 'data_id']


In [391]:
# Feature matrix / labels for the period classifier: keep every column that is
# not excluded, and use `period` as the label to predict.
excluded_lookup = set(excluded_cols_period)
selected_cols = [name for name in train_df.columns if name not in excluded_lookup]
features_df = train_df[selected_cols]
X_train = features_df.values
y_train = train_df['period'].values
print(X_train.shape)

(560000, 56)


In [None]:
# Fit the period classifier on the feature matrix and `period` labels built in
# the previous cell.
clf_xgb_period.fit(X_train, y_train, verbose=True)

In [None]:
# Persist the fitted period classifier so it can be reloaded without retraining.
# NOTE: joblib files are pickle-based -- only load them from trusted sources.
joblib.dump(clf_xgb_period, 'clf_xgb_period_40000_14_all_param.pkl')

### Predict period for data

In [394]:
# Turn the `period` strings into integer codes; the fitted encoders (`le`, `oh`)
# are reused later to encode the periods predicted for the test set.
period_values = df['period']
le = LabelEncoder()
le_period = le.fit_transform(period_values)
oh = OneHotEncoder()
# Spot-check a few encoded values from the middle of the data.
print(le_period[90000:90005])

[7 7 7 7 7]


In [395]:
# One-hot encode the integer period codes. The encoder expects a 2-D column
# vector, and its sparse result is converted to a dense array.
period_codes_2d = le_period.reshape(-1, 1)
oh_period = oh.fit_transform(period_codes_2d).toarray()
print(oh_period)
print(oh_period.shape)

[[ 1.  0.  0. ...,  0.  0.  0.]
 [ 1.  0.  0. ...,  0.  0.  0.]
 [ 1.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]
(560000, 14)


In [396]:
# Sanity check: one row per training sample, one column per period.
print(oh_period.shape)

(560000, 14)


In [397]:
# Wrap the dense one-hot array in a DataFrame (integer column names 0..13) so it
# can be concatenated with the training frame.
oh_period_df = pd.DataFrame(data=oh_period)
print(oh_period_df.head())

    0    1    2    3    4    5    6    7    8    9    10   11   12   13
0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
1  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
2  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
3  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
4  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0


In [398]:
# Append the one-hot period columns to the training data. Concatenating along
# axis=1 aligns rows on the index.
frames_to_join = [df, oh_period_df]
enc_df = pd.concat(frames_to_join, axis=1)
print(enc_df.head())

   data_id  period        c1        c2        c3        c4   c5   c6  \
0        2  train1  0.655570 -0.000022 -0.000539 -0.001075  0.0  0.0   
1        3  train1  1.646430 -0.000292 -0.008367  0.009497  0.0  0.0   
2        5  train1 -0.743010  0.004642 -0.000647 -0.003290  0.0  0.0   
3        7  train1  0.029770 -0.006343 -0.000635 -0.002516  0.0  0.0   
4       10  train1 -0.660243  0.012591 -0.002098 -0.022264  0.0  0.0   

         c7   c8 ...     4    5    6    7    8    9   10   11   12   13  
0  0.213390  0.0 ...   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
1  0.000000  0.0 ...   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
2  0.000000  0.0 ...   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
3  0.160313  0.0 ...   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
4  0.000000  0.0 ...   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  

[5 rows x 105 columns]


In [399]:
# Feature matrix / labels for the target model: the non-excluded columns plus
# the one-hot period columns, with `target` as the label.
excluded_lookup = set(excluded_cols_period)
selected_cols = [name for name in enc_df.columns if name not in excluded_lookup]
features_df = enc_df[selected_cols]
X_train = features_df.values
y_train = enc_df['target'].values
print(X_train.shape)

(560000, 70)


In [400]:
# Target model: XGBoost classifier with the default objective, fitted on the
# period-augmented feature matrix built in the previous cell.
clf_xgb = XGBClassifier(
    max_depth=7,
    n_estimators=150,
    reg_lambda=100,
)
clf_xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=7,
       min_child_weight=1, missing=None, n_estimators=150, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=100,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [401]:
# Persist the fitted target model so it can be reloaded without retraining.
# NOTE: joblib files are pickle-based -- only load them from trusted sources.
# NOTE(review): the recorded cell output shows a different file name than the
# one used here, so that output looks stale -- re-run to refresh it.
joblib.dump(clf_xgb, 'clf_xgb_40000_14_all_param.pkl')

['clf_xgb_40000_14.pkl']

## Try to predict test

In [404]:
# Test-set feature matrix for the period classifier, using the same exclusion
# list as for training.
excluded_lookup = set(excluded_cols_period)
selected_cols = [name for name in df_test.columns if name not in excluded_lookup]
test_features_df = df_test[selected_cols]
X_test = test_features_df.values

In [405]:
# Predict a period label for every test row with the fitted period classifier.
X_test_period = clf_xgb_period.predict(X_test)
print(X_test_period)

['train8' 'train2' 'train1' ..., 'train11' 'train14' 'train2']


In [406]:
# Encode the predicted periods with the encoders fitted on the training data,
# so the one-hot columns line up with those used to train the target model.
test_period_codes_2d = le.transform(X_test_period).reshape(-1, 1)
X_test_period_enc = oh.transform(test_period_codes_2d).toarray()
print(X_test_period_enc)

[[ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 1.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  1. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]


In [407]:
# Sanity check: one row per test sample, one column per period.
print(X_test_period_enc.shape)

(361500, 14)


In [408]:
# Wrap the one-hot test periods in a DataFrame so they can be concatenated with
# the test frame.
X_test_period_enc_df = pd.DataFrame(data=X_test_period_enc)
print(X_test_period_enc_df.head())
print(X_test_period_enc_df.shape)

    0    1    2    3    4    5    6    7    8    9    10   11   12   13
0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0
1  0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
2  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
3  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
4  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
(361500, 14)


In [409]:
# Append the one-hot predicted-period columns to the test data. Concatenating
# along axis=1 aligns rows on the index.
test_frames_to_join = [df_test, X_test_period_enc_df]
X_test_enc_df = pd.concat(test_frames_to_join, axis=1)
print(X_test_enc_df.shape)

(361500, 103)


In [410]:
# Test-set feature matrix for the target model: the non-excluded columns plus
# the one-hot predicted-period columns.
excluded_lookup = set(excluded_cols_period)
selected_cols = [name for name in X_test_enc_df.columns if name not in excluded_lookup]
test_features_df = X_test_enc_df[selected_cols]
X_test = test_features_df.values
print(X_test.shape)

(361500, 70)


In [411]:
# Keep only the second probability column returned by predict_proba
# (presumably the probability of target == 1 -- confirm against clf_xgb.classes_).
class_probabilities = clf_xgb.predict_proba(X_test)
predictions = class_probabilities[:, 1]
print(predictions)

[ 0.25347605  0.5180108   0.48309475 ...,  0.44581366  0.47977474
  0.47478169]


In [412]:
# Sanity check: one prediction per test row.
print(predictions.shape)

(361500,)


In [413]:
# Write the submission file: one row per test `data_id` with its predicted
# probability. The file name carries a timestamp with minute resolution.
submission_columns = {'data_id': df_test['data_id'], 'target': predictions}
submission = pd.DataFrame(submission_columns)
submission_path = "submit_{:%Y%m%d-%H%M}.csv".format(datetime.datetime.now())
submission.to_csv(submission_path, index=False)

## Prediction
Prepare the train and test data for the prediction model.

In [130]:
# Build train/test feature matrices for the target model from the raw columns
# only (no one-hot period columns).
# NOTE(review): `train1_df` is not defined in the visible part of this notebook
# -- confirm where it comes from and define it explicitly so this cell survives
# Restart & Run All.
# NOTE(review): the recorded output shows 57 feature columns here versus 56 for
# the period model -- check whether `data_id` is being used as a feature.
# Alternative column selection kept for reference:
#   selected_cols = [col for col in train1_df.columns if col not in ['period', 'target']]
selected_cols = [col for col in train1_df.columns if col not in excluded_cols]
X_train = df[selected_cols].values
y_train = df['target'].values
X_test = df_test[selected_cols].values
print('Train data shape: {}'.format(X_train.shape))
# Fixed label: this line reports the test matrix, not the train matrix.
print('Test data shape: {}'.format(X_test.shape))

Train data shape: (560000, 57)
Train data shape: (361500, 57)


Fit the chosen model on all the training data (the code above evaluates models with cross-validation) and make predictions.

In [131]:
# Refit the chosen model on X_train / y_train and write a submission file.
# clone() returns a new, unfitted estimator with clf_xgb's hyper-parameters, so
# fitting here no longer overwrites the already-trained `clf_xgb` in place
# (a plain assignment only creates an alias).
chosen_est = clone(clf_xgb)
chosen_est.fit(X_train, y_train)
# Keep only the second probability column returned by predict_proba
# (presumably the probability of target == 1 -- confirm against classes_).
predictions = chosen_est.predict_proba(X_test)[:, 1]
submission = pd.DataFrame({'data_id': df_test['data_id'], 'target': predictions})
# The file name carries a timestamp with minute resolution.
submission.to_csv("submit_{:%Y%m%d-%H%M}.csv".format(datetime.datetime.now()), index=False)