In [1]:
import numpy as np
import os
import pandas as pd
import sys
sys.path.append('..')
from src.tools import numerai_api

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import log_loss
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import GridSearchCV



In [2]:
import logging
# Logging configuration: DEBUG-level root logger with timestamped records.
log_fmt = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=log_fmt, level=logging.DEBUG)
logger = logging.getLogger()
# Point the first root handler at stdout so log records are written to the
# same stream as print() output.
_first_handler = logger.handlers[0]
_first_handler.stream = sys.stdout

In [3]:
# The round is pinned to a fixed value; the commented-out line is the
# alternative that queries `numerai_api.get_current_round()` instead.
#round_number = numerai_api.get_current_round()
round_number = 92

In [4]:
# Raw data lives in ../data/raw; the pickle's file name is keyed by round_number.
raw_data_path = os.path.join(os.path.pardir, 'data', 'raw')
_raw_data_name = '{}_numerai_raw.pkl'.format(round_number)
raw_data_file = os.path.join(raw_data_path, _raw_data_name)

In [9]:
try:
    df = pd.read_pickle(raw_data_file)
except FileNotFoundError:
    get_data = os.path.join(os.path.pardir, 'src', 'data', 'get_raw_data_binary.py')
    !python $get_data
    df = pd.read_pickle(raw_data_file)


In [10]:
# Boolean row masks for each data split.
is_train = df['data_type'] == 'train'
is_validation = df['data_type'] == 'validation'
needs_prediction = df['data_type'].isin(['validation', 'test', 'live'])

# Feature matrices: every row that needs a prediction, and validation only.
df_prediction = df.loc[needs_prediction, 'feature1':'feature50']
df_validation_predict = df.loc[is_validation, 'feature1':'feature50']
df_validation_target = df.loc[is_validation, 'target']

# Training rows with `era` retained, so folds can be grouped by era.
train_rows = df.loc[is_train, :]
X_train_era = train_rows.drop(['data_type', 'target'], axis=1)
y_train_era = train_rows.loc[:, ['era', 'target']]

# Plain training inputs and labels without the grouping column.
X_train = X_train_era.drop('era', axis=1)
y_train = y_train_era['target']

The manual per-fold loop below is now done with `cross_val_score` instead.

# Fit and score the model on each of the ten pre-built folds; the `era`
# column is dropped from the features before fitting and scoring.
for fold in range(10):
    fold_train_X = X_split_train[fold].drop('era', axis=1)
    fold_train_y = y_split_train[fold]['target']
    log_reg_model.fit(fold_train_X, fold_train_y)

    fold_test_X = X_split_test[fold].drop('era', axis=1)
    fold_test_y = y_split_test[fold]['target']
    score = log_reg_model.score(fold_test_X, fold_test_y)
    print(score)

In [17]:
# Era-grouped 10-fold split: each training row's `era` is its group label.
# `cv` is rebuilt from `gkf` in later cells before it is used.
lg = LogisticRegression()
gkf = GroupKFold(n_splits=10)
cv = gkf.split(X_train_era, y_train_era, groups=X_train_era['era'])
# NOTE(review): the disabled call below would not run as written --
# `cross_val_score` is not among the imports in the first cell, `X_train` no
# longer has an `era` column, and `y_train` is already the `target` Series.
#cv_log_loss = cross_val_score(lg, X_train.drop('era', axis=1), y_train['target'], cv=cv, scoring='neg_log_loss') 

NameError: name 'gkf' is not defined

In [11]:
def create_submission(df_predict_feat, model, filename='predictions.csv', filter_=np.empty(0)):
    """Write a two-column (`id`, `probability`) submission CSV.

    Parameters
    ----------
    df_predict_feat : pandas.DataFrame
        Feature rows to score; its index supplies the `id` column.
    model : object
        Fitted classifier exposing `predict_proba`; column 1 of its output is
        written as `probability`.
    filename : str
        Destination CSV path.
    filter_ : array-like, optional
        Positions (or a boolean mask) of feature columns to drop before
        predicting. Empty or None keeps every column.
    """
    # Test the length, not `.any()`: an index array such as [0] contains only
    # falsy values, so `.any()` would silently skip dropping column 0.
    if filter_ is not None and len(filter_) > 0:
        df_predict_feat = df_predict_feat.drop(df_predict_feat.columns[filter_], axis=1)

    submission = pd.DataFrame()
    submission['id'] = df_predict_feat.index
    submission['probability'] = model.predict_proba(df_predict_feat)[:, 1]
    # The id is already a column, so the positional index is not written.
    submission.to_csv(filename, index=False)

In [12]:
def get_validation_log_loss(model, df, filter_=np.empty(0)):
    """Return the log loss of `model` on the validation rows of `df`.

    Parameters
    ----------
    model : object
        Fitted classifier exposing `predict_proba`.
    df : pandas.DataFrame
        Frame with `data_type`, `target` and `feature1`..`feature50` columns.
    filter_ : array-like, optional
        Positions (or a boolean mask) of feature columns to drop before
        predicting. Empty or None keeps every column.

    Returns
    -------
    float
        `log_loss` of the predicted class probabilities against `target`.
    """
    is_validation = df['data_type'] == 'validation'
    df_validation_predict = df.loc[is_validation, 'feature1':'feature50']
    # Test the length, not `.any()`: an index array such as [0] contains only
    # falsy values, so `.any()` would silently skip dropping column 0.
    if filter_ is not None and len(filter_) > 0:
        df_validation_predict = df_validation_predict.drop(
            df_validation_predict.columns[filter_], axis=1)

    df_validation_target = df.loc[is_validation, 'target']
    validation_prediction = model.predict_proba(df_validation_predict)
    return log_loss(df_validation_target, validation_prediction)

def check_consistency(model, df, filter_=np.empty(0)):
    """Return the fraction of validation eras whose log loss is below 0.693.

    Each validation era is scored separately with `get_validation_log_loss`.
    The threshold 0.693 is approximately ln(2), the log loss of always
    predicting 0.5.

    Parameters
    ----------
    model : object
        Fitted classifier exposing `predict_proba`.
    df : pandas.DataFrame
        Frame with `data_type`, `era`, `target` and feature columns.
    filter_ : array-like, optional
        Forwarded unchanged to `get_validation_log_loss`.

    Returns
    -------
    float
        Eras passed divided by the number of validation eras (0.0 if none).
    """
    validation_eras = df.loc[df['data_type'] == 'validation', 'era'].unique()
    if len(validation_eras) == 0:
        return 0.0

    eras_passed = 0
    for era in validation_eras:
        loss = get_validation_log_loss(model, df.loc[df['era'] == era, :], filter_)
        if loss < 0.693:
            eras_passed += 1

    # Divide by the eras actually present rather than a hard-coded 12, so the
    # ratio stays correct when the number of validation eras changes.
    return eras_passed / len(validation_eras)



In [27]:
# Baseline: logistic regression with default settings, fit on the training rows.
lg_baseline = LogisticRegression()
logger.info('Fitting model')
lg_baseline.fit(X_train, y_train)
logger.info('Model fit')

# Write predictions for the validation/test/live rows, then score the model
# on the validation rows only.
create_submission(df_prediction, lg_baseline, 'predictions_baseline.csv')
baseline_logloss = get_validation_log_loss(lg_baseline, df)
baseline_logloss

2018-01-28 11:03:01,172 - INFO - Fitting model
2018-01-28 11:03:17,190 - INFO - Model fit
2018-01-28 11:03:17,191 - INFO - Creating submission
2018-01-28 11:03:17,849 - INFO - Submission created


"\nsubmission = pd.DataFrame()\nsubmission['id'] = df_prediction.index\nsubmission['probability'] = lg_baseline.predict_proba(df_prediction)[:,1]\nsubmission.to_csv('predictions_baseline.csv',index=None)\n"

In [30]:
# Rebuild the era-grouped splits (`gkf` comes from an earlier cell) and hand
# them to LogisticRegressionCV as its cross-validation folds.
cv = gkf.split(X_train_era, y_train_era, groups=X_train_era['era'])
lg_group = LogisticRegressionCV(cv=cv)
logger.info('Starting model fit')
lg_group.fit(X_train, y_train)
logger.info('Model fitted')
# Score on the validation rows, the same way as the baseline model.
group_logloss = get_validation_log_loss(lg_group, df)
group_logloss

2018-01-28 11:03:17,990 - INFO - Starting model fit
2018-01-28 11:06:18,937 - INFO - Model fitted


In [32]:
# Difference between the two validation log losses as a percentage of
# group_logloss; a negative value means group_logloss is the smaller one.
(group_logloss-baseline_logloss)/group_logloss*100

-0.05782708551779613

In [18]:
# Log-loss differences between neighbouring n_estimators settings (150 vs 175,
# then 175 vs 200); the values are copied from the results notes below.
print(0.6926237830311198 - 0.6926199800815499)
print(0.6926199800815499 - 0.6926260055809711)

3.802949569919889e-06
-6.025499421191682e-06


Validation log loss for each `RandomForestClassifier` setting tried (lower is better):

0.7809569239067203 default
0.6926539978867465 n_estimators=1000,max_leaf_nodes=15
0.6926281272932799 n_estimators=500,max_leaf_nodes=15
0.6984105280907695 n_estimators=100,max_leaf_nodes=None
0.6928226448632782 n_estimators=100,max_leaf_nodes=5

0.6926490053669208 n_estimators=50,max_leaf_nodes=15
0.6926264881596373 n_estimators=100,max_leaf_nodes=15
0.6926237830311198 n_estimators=150,max_leaf_nodes=15
0.6926199800815499 n_estimators=175,max_leaf_nodes=15
0.6926260055809711 n_estimators=200,max_leaf_nodes=15


0.6925970274264839 n_estimators=175,max_leaf_nodes=20
0.6925456965520028 n_estimators=175,max_leaf_nodes=30
0.6925077743059121 n_estimators=175,max_leaf_nodes=50
0.6924796040625568 n_estimators=175,max_leaf_nodes=100
0.6924269351737405 n_estimators=175,max_leaf_nodes=150
0.6923994850550886 n_estimators=175,max_leaf_nodes=200
0.692395593293289  n_estimators=175,max_leaf_nodes=205
0.6924060245108058 n_estimators=175,max_leaf_nodes=210
0.6924183696339382 n_estimators=175,max_leaf_nodes=225
0.6924205980793621 n_estimators=175,max_leaf_nodes=250
0.6924420819324049 n_estimators=175,max_leaf_nodes=300
0.692472160869273  n_estimators=175,max_leaf_nodes=400


0.6924482343001823 n_estimators=175,max_leaf_nodes=205 entropy

0.6925234826212867 n_estimators=175,max_leaf_nodes=205,max_features=log2
0.692395593293289  n_estimators=175,max_leaf_nodes=205
0.6924785583536479 n_estimators=175,max_leaf_nodes=205,max_features=8
0.6923817293453017 n_estimators=175,max_leaf_nodes=205,max_features=10
0.6924414990011737 n_estimators=175,max_leaf_nodes=205,max_features=11
0.692452599084798  n_estimators=175,max_leaf_nodes=205,max_features=12
0.692434921786358  n_estimators=175,max_leaf_nodes=205,max_features=15
0.6924986626987714 n_estimators=175,max_leaf_nodes=205,max_features=20
0.6925193101183662 n_estimators=175,max_leaf_nodes=205,max_features=30
0.6924359465554766 n_estimators=175,max_leaf_nodes=205,max_features=50

0.6924264481698214 n_estimators=175,max_leaf_nodes=205,max_features=10, bootstrap=False
0.6923817293453017 n_estimators=175,max_leaf_nodes=205,max_features=10, oob_score=True

0.6924813705374833 n_estimators=175,max_leaf_nodes=205,max_features=10,max_depth=5
0.6924325504443605 n_estimators=175,max_leaf_nodes=205,max_features=10,max_depth=10
0.6923817293453017 n_estimators=175,max_leaf_nodes=205,max_features=10,max_depth=50
0.6923830374594487 n_estimators=175,max_leaf_nodes=205,max_features=10,max_depth=30
0.6923817293453017 n_estimators=175,max_leaf_nodes=205,max_features=10,max_depth=40
0.6923817293453017 n_estimators=175,max_leaf_nodes=205,max_features=10,max_depth=35
0.6923817293453017 n_estimators=175,max_leaf_nodes=205,max_features=10,max_depth=32
0.6923817293453017 n_estimators=175,max_leaf_nodes=205,max_features=10,max_depth=31

0.6924227331958277 n_estimators=175,max_leaf_nodes=205,max_features=10,min_samples_split=0.01
0.6923817293453017 n_estimators=175,max_leaf_nodes=205,max_features=10,min_samples_split=3
0.6924284376896439 n_estimators=175,max_leaf_nodes=205,max_features=10,min_samples_split=50
0.6923971988691803 n_estimators=175,max_leaf_nodes=205,max_features=10,min_samples_split=20
0.6924490732428077 n_estimators=175,max_leaf_nodes=205,max_features=10,min_samples_split=10

0.6922934463211274 top 15 max 10
0.6930808753582083 top 4
0.69265250301624   top 7 no max features
0.6924978160188072 top 11
0.6923629534794612 top 22 max 10
0.6923870823834799 top 22 no max features
0.6923336618872348 top 19 max 10
0.6923747980103881 top 17 max 10
0.6923650167342608 top 16 max 10
0.6923704226959646 top 14

In [13]:
# Random forest on all features, using the n_estimators / max_leaf_nodes /
# max_features combination recorded in the notes above; random_state is fixed
# so the fit is repeatable.
rfc_baseline = RandomForestClassifier(n_jobs=-1,
                             n_estimators=175, 
                             max_leaf_nodes=205,
                             max_features=10,
                             random_state=21)

logger.info('Starting model fit')
rfc_baseline.fit(X_train, y_train)
logger.info('Model fitted')

# Validation log loss of the full-feature forest.
rfc_baseline_logloss = get_validation_log_loss(rfc_baseline, df)
rfc_baseline_logloss

2018-01-29 23:14:29,632 - INFO - Starting model fit
2018-01-29 23:16:42,560 - INFO - Model fitted


0.6923817293453017

In [14]:
# Column positions whose importance in rfc_baseline falls below 0.020; those
# columns are dropped from the training features.
_low_importance = rfc_baseline.feature_importances_ < 0.020
filter_ = np.where(_low_importance)[0]
_dropped_columns = X_train.columns[filter_]
X_train_important = X_train.drop(_dropped_columns, axis=1)
# Show how many features remain, then list them.
print(len(X_train_important.columns))
X_train_important.columns

15


Index(['feature1', 'feature6', 'feature9', 'feature11', 'feature15',
       'feature21', 'feature25', 'feature27', 'feature28', 'feature31',
       'feature34', 'feature36', 'feature41', 'feature42', 'feature46'],
      dtype='object')

In [None]:
# Same forest configuration as rfc_baseline, refit on the retained
# (important) feature columns only.
rfc_top15 = RandomForestClassifier(n_jobs=-1,
                                   n_estimators=175,
                                   max_leaf_nodes=205,
                                   max_features=10,
                                   random_state=21)

logger.info('Starting model fit')
# Fit the estimator created above. This cell used to call `rfc.fit`, but no
# `rfc` is created here, so a fresh kernel raised NameError.
rfc_top15.fit(X_train_important, y_train)
logger.info('Model fitted')

# Keep the old name as an alias for cells that still refer to `rfc`.
rfc = rfc_top15

# `filter_` drops the same columns from the validation features that were
# dropped from the training features.
rfc_logloss = get_validation_log_loss(rfc_top15, df, filter_)
rfc_logloss

In [187]:
# `rfc` must be the forest fitted on X_train_important, because `filter_`
# drops the same columns from the validation features before predicting.
check_consistency(rfc, df, filter_)

0.8333333333333334

In [15]:
import random

def create_original_submission(df, model, amount, filename='predictions.csv', filter_=np.empty(0)):
    """Write a submission CSV with a random offset added to the predictions.

    Predicts the validation, test and live rows of `df`, shifts every
    probability by one seeded uniform draw from [-amount, amount], prints the
    validation log loss and per-era consistency of the shifted predictions,
    and writes `id`/`probability` to `filename`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with `data_type`, `era`, `target` and `feature1`..`feature50`
        columns; its index supplies the `id` column.
    model : object
        Fitted classifier exposing `predict_proba`.
    amount : float
        Half-width of the interval the offset is drawn from.
    filename : str
        Destination CSV path.
    filter_ : array-like, optional
        Positions (or a boolean mask) of feature columns to drop before
        predicting. Empty or None keeps every column.
    """
    # Reseed the global `random` generator so the offset is repeatable.
    random.seed(21)

    # Copy the selection so the columns added below are written to an
    # independent frame, never to a view of `df` (avoids SettingWithCopy).
    # Every column is addressed by label, so no positional reordering is needed.
    to_predict = df['data_type'].isin(['validation', 'test', 'live'])
    submission = df.loc[to_predict, :].copy()

    df_predict_feat = submission.loc[:, 'feature1':'feature50']
    # Test the length, not `.any()`: an index array such as [0] contains only
    # falsy values, so `.any()` would silently skip dropping column 0.
    if filter_ is not None and len(filter_) > 0:
        df_predict_feat = df_predict_feat.drop(df_predict_feat.columns[filter_], axis=1)

    # A single draw is made, so the same offset is applied to every row.
    offset = random.uniform(-amount, amount)
    submission['probability'] = model.predict_proba(df_predict_feat)[:, 1] + offset

    validation_data = submission.loc[submission['data_type'] == 'validation', :]
    validation_log_loss = log_loss(validation_data['target'], validation_data['probability'])
    print("Logloss: {}".format(validation_log_loss))

    # An era passes when its own log loss is below 0.693 (approximately ln 2).
    validation_eras = validation_data['era'].unique()
    eras_passed = 0
    for era in validation_eras:
        era_data = validation_data.loc[validation_data['era'] == era, :]
        era_log_loss = log_loss(era_data['target'], era_data['probability'])
        if era_log_loss < 0.693:
            eras_passed += 1

    # Divide by the eras actually present rather than a hard-coded 12.
    consistency = eras_passed / len(validation_eras) if len(validation_eras) else 0.0
    print("Consistency: {}".format(consistency))

    submission['id'] = submission.index
    submission.loc[:, ['id', 'probability']].to_csv(filename, index=False)


In [322]:
# Submission from the reduced-feature forest; 0.012 bounds the random offset
# added to the predicted probabilities.
create_original_submission(df, rfc_top15, 0.012, 'predictions_RF_top15_rand.csv', filter_)

Logloss: 0.6924171968616661
Consistency: 0.8333333333333334


In [16]:
# Era-grouped 10-fold splits, rebuilt here so this cell does not depend on
# `gkf`/`cv` from earlier cells.
gkf = GroupKFold(n_splits=10)
cv = gkf.split(X_train_era, y_train_era, groups=X_train_era['era'])
# Narrow grid around the hand-tuned n_estimators / max_leaf_nodes values.
param_grid = { 
    'n_estimators': [155, 160, 165, 170, 175],
    'max_leaf_nodes': [201, 203, 205]
}

rfc_CV = RandomForestClassifier(n_jobs=-1,
                             n_estimators=175, 
                             max_leaf_nodes=205,
                             max_features=10,
                             random_state=21)
# NOTE(review): no `scoring` is passed, so the search presumably ranks
# candidates by the estimator's default score rather than log loss, which is
# the metric used everywhere else in this notebook -- confirm this is intended.
# NOTE(review): n_jobs=-1 is set on both the forest and the search -- confirm
# the nested parallelism is wanted.
CV_rfc = GridSearchCV(n_jobs=-1, estimator=rfc_CV, param_grid=param_grid, cv=cv)
logger.info('Starting model fit')
# Searched on the reduced feature set; the run logged below took roughly 4.5 hours.
CV_rfc.fit(X_train_important, y_train)
logger.info('Model fitted')
CV_rfc.best_params_


2018-01-29 23:16:43,106 - INFO - Starting model fit
2018-01-30 03:48:52,558 - INFO - Model fitted


{'max_leaf_nodes': 203, 'n_estimators': 170}

In [18]:
# Submission from the fitted grid-search object, with the same 0.012 offset
# bound and column filter as the earlier reduced-feature submission.
create_original_submission(df, CV_rfc, 0.012, 'predictions_RF_top15_CV.csv', filter_)

Logloss: 0.6924082459556313
Consistency: 0.8333333333333334
