In [1]:
import numpy as np
import os
import pandas as pd
import pickle
import sys
sys.path.append('..')
from src.tools import numerai_api, utils

from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import log_loss
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import GridSearchCV



In [2]:
import logging
# Set up logging so the INFO-level messages used below to timestamp model fits are shown.
log_fmt = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=log_fmt)
logger = logging.getLogger()
# Point the root logger's first handler at stdout so log lines show up as regular cell output.
# NOTE(review): assumes the root logger has at least one handler exposing `.stream` — confirm on the target kernel.
logger.handlers[0].stream = sys.stdout

In [3]:
# Load the full dataset through the project helper.
# NOTE(review): later cells rely on `data_type`, `era`, `target` and `feature1`..`feature50`
# columns being present — confirm against utils.load_data.
df = utils.load_data()

In [4]:
# Feature rows of every data type that has to be predicted for a submission.
df_prediction = df.loc[df['data_type'].isin(['validation', 'test', 'live']),
                       'feature1':'feature50']

# Validation subset: features and labels.
df_validation_predict = df.loc[df['data_type'].eq('validation'), 'feature1':'feature50']
df_validation_target = df.loc[df['data_type'].eq('validation'), 'target']

# Training subset that still carries `era`, which the grouped cross-validation needs.
X_train_era = df[df['data_type'].eq('train')].drop(['data_type', 'target'], axis=1)
y_train_era = df.loc[df['data_type'].eq('train'), ['era', 'target']]

# Plain feature matrix / label vector used to fit the models.
X_train = X_train_era.drop('era', axis=1)
y_train = y_train_era['target']

The manual per-fold loop below is now handled by `cross_val_score`; it is kept only for reference.

for i in range(0,10):
    
    log_reg_model.fit(X_split_train[i].drop('era', axis=1), y_split_train[i]['target'])
    score = log_reg_model.score(X_split_test[i].drop('era', axis=1), y_split_test[i]['target'])
    print(score)

In [6]:
# Plain logistic regression plus a 10-fold splitter grouped by era.
lg = LogisticRegression()
gkf = GroupKFold(n_splits=10)
# NOTE: `split` returns a one-shot generator — it has to be re-created before every reuse.
cv = gkf.split(X_train_era, y_train_era, groups=X_train_era['era'])
# Disabled experiment: cross-validated negative log loss for `lg`.
# NOTE(review): `cross_val_score` is not imported in the visible import cell, and `X_train` / `y_train`
# as defined above no longer carry an 'era' column / 'target' key — fix before re-enabling.
#cv_log_loss = cross_val_score(lg, X_train.drop('era', axis=1), y_train['target'], cv=cv, scoring='neg_log_loss') 

In [5]:
def create_submission(df_predict_feat, model, filename='predictions.csv', filter_=np.empty(0)):
    """Write a two-column (`id`, `probability`) submission CSV.

    Parameters
    ----------
    df_predict_feat : pandas.DataFrame
        Feature rows to predict; its index supplies the `id` column.
    model : object
        Fitted classifier exposing `predict_proba`; column 1 of its output is
        written as `probability`.
    filename : str
        Destination CSV path.
    filter_ : array-like
        Positional indices (or boolean mask) of the feature columns to drop
        before predicting. Empty (the default) keeps every column.
    """
    submission = pd.DataFrame()
    submission['id'] = df_predict_feat.index

    # Decide by size, not by `.any()`: an index array such as [0] is "all falsy",
    # which would silently skip dropping the first column.
    filter_ = np.asarray(filter_)
    if filter_.size:
        df_predict_feat = df_predict_feat.drop(df_predict_feat.columns[filter_], axis=1)

    submission['probability'] = model.predict_proba(df_predict_feat)[:, 1]
    submission.to_csv(filename, index=False)

In [6]:
def get_validation_log_loss(model, df, filter_=np.empty(0)):
    """Return the log loss of `model` on the validation rows of `df`.

    Parameters
    ----------
    model : object
        Fitted classifier exposing `predict_proba`.
    df : pandas.DataFrame
        Frame with a `data_type` column, a `target` column and the feature
        columns `feature1`..`feature50`; only rows whose `data_type` is
        'validation' are scored.
    filter_ : array-like
        Positional indices (or boolean mask) of the feature columns to drop
        before predicting. Empty (the default) keeps every column.

    Returns
    -------
    float
        `sklearn.metrics.log_loss` of the validation targets against the
        predicted class probabilities.
    """
    is_validation = df['data_type'] == 'validation'
    df_validation_predict = df.loc[is_validation, 'feature1':'feature50']

    # Decide by size, not by `.any()`: an index array such as [0] is "all falsy",
    # which would silently skip dropping the first column.
    filter_ = np.asarray(filter_)
    if filter_.size:
        df_validation_predict = df_validation_predict.drop(
            df_validation_predict.columns[filter_], axis=1)

    df_validation_target = df.loc[is_validation, 'target']
    validation_prediction = model.predict_proba(df_validation_predict)
    return log_loss(df_validation_target, validation_prediction)

def check_consistency(model, df, filter_=np.empty(0), threshold=0.693):
    """Return the fraction of validation eras whose log loss is below `threshold`.

    Parameters
    ----------
    model : object
        Fitted classifier, forwarded to `get_validation_log_loss`.
    df : pandas.DataFrame
        Frame with `data_type` and `era` columns plus whatever
        `get_validation_log_loss` needs.
    filter_ : array-like
        Forwarded unchanged to `get_validation_log_loss`.
    threshold : float
        An era passes when its log loss is strictly below this value.
        The default is just under ln(2) ~= 0.6931, the log loss of always
        predicting 0.5.

    Returns
    -------
    float
        Passed eras divided by the number of validation eras found in `df`
        (0.0 when there are none).
    """
    validation_eras = df.loc[df['data_type'] == 'validation', 'era'].unique()
    # Divide by the eras actually present instead of assuming there are 12.
    if len(validation_eras) == 0:
        return 0.0

    eras_passed = 0
    for era in validation_eras:
        loss = get_validation_log_loss(model, df.loc[df['era'] == era, :], filter_)
        if loss < threshold:
            eras_passed += 1

    return eras_passed / len(validation_eras)



In [33]:
# Baseline: logistic regression on the raw training features.
lg_baseline = LogisticRegression(random_state=21)
logger.info('Fitting model')
lg_baseline.fit(X_train, y_train)
logger.info('Model fit')

# Write a submission for the validation/test/live rows, then score on the validation rows.
create_submission(df_prediction, lg_baseline, 'predictions_baseline.csv')
baseline_logloss = get_validation_log_loss(lg_baseline, df)
baseline_logloss

2018-01-31 16:14:41,187 - INFO - Fitting model
2018-01-31 16:14:54,863 - INFO - Model fit


0.692945879397903

In [36]:

# Fit the scaler on the training features only, then apply the same scaling to the validation rows.
scaler = preprocessing.MaxAbsScaler()
X_train_scaled = scaler.fit_transform(X_train)
validation_features_scaled = scaler.transform(df.loc[df['data_type']=='validation','feature1':'feature50'])

# Logistic regression on the scaled features, using the 'saga' solver.
lg_scaled = LogisticRegression(n_jobs=-1, solver='saga', random_state=21)
logger.info('Fitting model')
lg_scaled.fit(X_train_scaled, y_train)
logger.info('Model fit')

# Scored by hand here because get_validation_log_loss would feed the model unscaled features.
validation_target = df.loc[df['data_type'] == 'validation','target']
validation_prediction = lg_scaled.predict_proba(validation_features_scaled)
lg_scaled_logloss = log_loss(validation_target, validation_prediction)
lg_scaled_logloss

2018-01-31 16:16:22,163 - INFO - Fitting model
2018-01-31 16:16:33,231 - INFO - Model fit


0.6929470574555433

In [None]:
# Positions where the magnitude of a baseline coefficient exceeds 0.2.
# NOTE(review): `np.where` returns a tuple with one index array per dimension of `coef_`; presumably
# only the feature (column) indices are wanted — confirm before using this as a column filter.
lg_important_features = np.where(abs(lg_baseline.coef_) > .2)

0.692945879397903 baseline 
0.6929555810479051 standard scaled 
0.6929555168509783 robust scaled
0.6929470574555433 max abs scaled




In [None]:
# L1-penalised logistic regression.
# The solver is pinned explicitly: the L1 penalty is only supported by some solvers
# ('liblinear', 'saga'), and scikit-learn releases whose default solver is 'lbfgs'
# reject penalty='l1' when no solver is given.
lg_opt = LogisticRegression(n_jobs=-1, random_state=21,
                            penalty='l1', solver='liblinear')
logger.info('Fitting model')
lg_opt.fit(X_train, y_train)
logger.info('Model fit')

lg_opt_logloss = get_validation_log_loss(lg_opt, df)
lg_opt_logloss

2018-01-31 15:39:51,440 - INFO - Fitting model


In [30]:
# Re-create the era-grouped folds; the generator returned by `split` can only be consumed once.
cv = gkf.split(X_train_era, y_train_era, groups=X_train_era['era'])
# Logistic regression with built-in cross-validation over those folds.
# NOTE(review): no `scoring` is passed, so model selection uses the estimator's default metric,
# not the log loss reported below — confirm that is intended.
lg_group = LogisticRegressionCV(cv=cv)
logger.info('Starting model fit')
lg_group.fit(X_train, y_train)
logger.info('Model fitted')
group_logloss = get_validation_log_loss(lg_group, df)
group_logloss

2018-01-28 11:03:17,990 - INFO - Starting model fit
2018-01-28 11:06:18,937 - INFO - Model fitted


In [32]:
# Difference between the grouped-CV and baseline validation log loss, as a percentage of the grouped-CV value.
(group_logloss-baseline_logloss)/group_logloss*100

-0.05782708551779613

In [18]:
# Differences between validation log losses recorded in the notes below for
# n_estimators = 150 vs 175 and 175 vs 200 (max_leaf_nodes=15).
print(0.6926237830311198 - 0.6926199800815499)
print(0.6926199800815499 - 0.6926260055809711)

3.802949569919889e-06
-6.025499421191682e-06


0.7809569239067203 default
0.6926539978867465 n_estimators=1000,max_leaf_nodes=15
0.6926281272932799 n_estimators=500,max_leaf_nodes=15
0.6984105280907695 n_estimators=100,max_leaf_nodes=None
0.6928226448632782 n_estimators=100,max_leaf_nodes=5

0.6926490053669208 n_estimators=50,max_leaf_nodes=15
0.6926264881596373 n_estimators=100,max_leaf_nodes=15
0.6926237830311198 n_estimators=150,max_leaf_nodes=15
0.6926199800815499 n_estimators=175,max_leaf_nodes=15
0.6926260055809711 n_estimators=200,max_leaf_nodes=15


0.6925970274264839 n_estimators=175,max_leaf_nodes=20
0.6925456965520028 n_estimators=175,max_leaf_nodes=30
0.6925077743059121 n_estimators=175,max_leaf_nodes=50
0.6924796040625568 n_estimators=175,max_leaf_nodes=100
0.6924269351737405 n_estimators=175,max_leaf_nodes=150
0.6923994850550886 n_estimators=175,max_leaf_nodes=200
0.692395593293289  n_estimators=175,max_leaf_nodes=205
0.6924060245108058 n_estimators=175,max_leaf_nodes=210
0.6924183696339382 n_estimators=175,max_leaf_nodes=225
0.6924205980793621 n_estimators=175,max_leaf_nodes=250
0.6924420819324049 n_estimators=175,max_leaf_nodes=300
0.692472160869273  n_estimators=175,max_leaf_nodes=400


0.6924482343001823 n_estimators=175,max_leaf_nodes=205 entropy

0.6925234826212867 n_estimators=175,max_leaf_nodes=205,max_features=log2
0.692395593293289  n_estimators=175,max_leaf_nodes=205
0.6924785583536479 n_estimators=175,max_leaf_nodes=205,max_features=8
0.6923817293453017 n_estimators=175,max_leaf_nodes=205,max_features=10
0.6924414990011737 n_estimators=175,max_leaf_nodes=205,max_features=11
0.692452599084798  n_estimators=175,max_leaf_nodes=205,max_features=12
0.692434921786358  n_estimators=175,max_leaf_nodes=205,max_features=15
0.6924986626987714 n_estimators=175,max_leaf_nodes=205,max_features=20
0.6925193101183662 n_estimators=175,max_leaf_nodes=205,max_features=30
0.6924359465554766 n_estimators=175,max_leaf_nodes=205,max_features=50

0.6924264481698214 n_estimators=175,max_leaf_nodes=205,max_features=10, bootstrap=False
0.6923817293453017 n_estimators=175,max_leaf_nodes=205,max_features=10, oob_score=True

0.6924813705374833 n_estimators=175,max_leaf_nodes=205,max_features=10,max_depth=5
0.6924325504443605 n_estimators=175,max_leaf_nodes=205,max_features=10,max_depth=10
0.6923817293453017 n_estimators=175,max_leaf_nodes=205,max_features=10,max_depth=50
0.6923830374594487 n_estimators=175,max_leaf_nodes=205,max_features=10,max_depth=30
0.6923817293453017 n_estimators=175,max_leaf_nodes=205,max_features=10,max_depth=40
0.6923817293453017 n_estimators=175,max_leaf_nodes=205,max_features=10,max_depth=35
0.6923817293453017 n_estimators=175,max_leaf_nodes=205,max_features=10,max_depth=32
0.6923817293453017 n_estimators=175,max_leaf_nodes=205,max_features=10,max_depth=31

0.6924227331958277 n_estimators=175,max_leaf_nodes=205,max_features=10,min_samples_split=0.01
0.6923817293453017 n_estimators=175,max_leaf_nodes=205,max_features=10,min_samples_split=3
0.6924284376896439 n_estimators=175,max_leaf_nodes=205,max_features=10,min_samples_split=50
0.6923971988691803 n_estimators=175,max_leaf_nodes=205,max_features=10,min_samples_split=20
0.6924490732428077 n_estimators=175,max_leaf_nodes=205,max_features=10,min_samples_split=10

0.6922934463211274 top 15 max 10
0.6930808753582083 top 4
0.69265250301624   top 7 no max features
0.6924978160188072 top 11
0.6923629534794612 top 22 max 10
0.6923870823834799 top 22 no max features
0.6923336618872348 top 19 max 10
0.6923747980103881 top 17 max 10
0.6923650167342608 top 16 max 10
0.6923704226959646 top 14

0.6922934463211274 top 15 max 10
0.6922937527006087 max abs scaler
0.6922938995815806 robust scaler
0.692294168860375  standard scaler
0.6922932509521263 standard no mean

0.6922837586171385

0.6924052073894894 PCA reduction to 7
0.6923661805741801 PCA reduction to 
0.6923270568188112


In [7]:
# Random forest on all training features; random_state is fixed so the fit is repeatable.
rfc_baseline = RandomForestClassifier(n_jobs=-1,
                             n_estimators=170, 
                             max_leaf_nodes=203,
                             max_features=10,
                             random_state=21)

logger.info('Starting model fit')
rfc_baseline.fit(X_train, y_train)
logger.info('Model fitted')

# Validation log loss of the forest trained on every feature.
rfc_baseline_logloss = get_validation_log_loss(rfc_baseline, df)
rfc_baseline_logloss

2018-02-03 20:29:04,329 - INFO - Starting model fit
2018-02-03 20:31:19,644 - INFO - Model fitted


0.6924603132117868

In [8]:
# Persist the fitted baseline forest; the context manager guarantees the file
# handle is flushed and closed once the dump finishes.
with open('rfc_baseline_93.pkl', 'wb') as model_file:
    pickle.dump(rfc_baseline, model_file)

In [19]:
# List the notebook directory (`dir` is the Windows shell command, so this cell is platform-specific).
!dir

 Volume in drive D is DATA
 Volume Serial Number is 6E14-EF9D

 Directory of D:\Projects\numerai\learning_numerai\notebooks

31/01/2018  06:10    <DIR>          .
31/01/2018  06:10    <DIR>          ..
28/01/2018  09:46                 0 .gitkeep
29/01/2018  10:36    <DIR>          .ipynb_checkpoints
31/01/2018  06:08            38,073 Data processing testing.ipynb
31/01/2018  03:30            21,369 Numerai API.ipynb
31/01/2018  04:14         9,038,574 predictions_baseline.csv
29/01/2018  08:52         9,056,479 predictions_RF.csv
30/01/2018  06:12         9,058,033 predictions_RF_top15_CV.csv
29/01/2018  08:55         9,057,467 predictions_RF_top15_rand.csv
31/01/2018  06:10         5,030,908 rfc_baseline.pkl
               8 File(s)     41,300,903 bytes
               3 Dir(s)  298,262,204,416 bytes free


In [5]:
# Reload the pickled baseline forest, closing the file handle as soon as it has been read.
# NOTE: only unpickle files this notebook produced — unpickling untrusted data can run arbitrary code.
with open('rfc_baseline_93.pkl', 'rb') as model_file:
    rfc_baseline = pickle.load(model_file)

# Positional indices of the features whose importance is below 0.020; those columns are dropped.
filter_ = np.where(rfc_baseline.feature_importances_ < 0.020)[0]
X_train_important = X_train.drop(X_train.columns[filter_], axis=1)
print(len(X_train_important.columns))
X_train_important.columns

16


Index(['feature1', 'feature2', 'feature6', 'feature9', 'feature11',
       'feature15', 'feature17', 'feature25', 'feature28', 'feature29',
       'feature31', 'feature34', 'feature36', 'feature41', 'feature42',
       'feature46'],
      dtype='object')

In [10]:
# Same forest configuration as the baseline, trained only on the features kept by the importance filter.
rfc_top15 = RandomForestClassifier(n_jobs=-1,
                             n_estimators=170, 
                             max_leaf_nodes=203,
                             max_features=10,
                             random_state=21)

logger.info('Starting model fit')
rfc_top15.fit(X_train_important, y_train)
logger.info('Model fitted')

# `filter_` is passed so the validation features are reduced to the same columns the model was fit on.
rfc_logloss = get_validation_log_loss(rfc_top15, df, filter_)
rfc_logloss

2018-02-03 20:31:20,159 - INFO - Starting model fit
2018-02-03 20:33:29,095 - INFO - Model fitted


0.6924185377721743

In [11]:
# Persist the reduced-feature forest; the context manager guarantees the file
# handle is flushed and closed once the dump finishes.
with open('rfc_top_93.pkl', 'wb') as model_file:
    pickle.dump(rfc_top15, model_file)

In [12]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Scale the selected features without centring. The scaler is fitted on the training
# rows only, and the fit and the transform happen in one pass instead of transforming twice.
scaler = preprocessing.StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(X_train_important)

# Reduce the validation rows to the same columns, then apply the training-set scaling.
validation_features = df.loc[df['data_type']=='validation','feature1':'feature50']
validation_features = validation_features.drop(validation_features.columns[filter_], axis=1)
validation_features_scaled = scaler.transform(validation_features)

# Project both sets onto 15 principal components learned from the training rows.
pca = PCA(n_components=15, random_state=21)
pca.fit(X_train_scaled)
X_pca = pca.transform(X_train_scaled)
val_feat_pca = pca.transform(validation_features_scaled)


In [13]:
# Forest trained on the PCA-projected features; max_features=15 equals the number of PCA components requested above.
rfc_top15_scaled = RandomForestClassifier(n_jobs=-1,
                             n_estimators=170, 
                             max_leaf_nodes=203,
                             max_features=15,
                             random_state=21)


logger.info('Starting model fit')
rfc_top15_scaled.fit(X_pca, y_train)
logger.info('Model fitted')



2018-02-03 20:33:30,327 - INFO - Starting model fit
2018-02-03 20:37:14,969 - INFO - Model fitted


In [14]:
# Score the PCA-based forest by hand: it needs the PCA-projected validation features,
# which get_validation_log_loss does not produce.
validation_target = df.loc[df['data_type'] == 'validation','target']
validation_prediction = rfc_top15_scaled.predict_proba(val_feat_pca)
rfc_scaled_logloss = log_loss(validation_target, validation_prediction)
rfc_scaled_logloss

0.6925541776106379

In [187]:
# `rfc` is not defined anywhere in this notebook (leftover kernel state), so the call fails on a
# fresh kernel. `rfc_top15` is the forest fitted on the unscaled columns that `filter_` selects.
# NOTE(review): confirm this is the model the recorded output was produced with.
check_consistency(rfc_top15, df, filter_)

0.8333333333333334

In [6]:
import random

def create_original_submission(df, model, amount, scaler, filename='predictions.csv', filter_=np.empty(0)):
    """Predict, perturb, score and write a submission CSV.

    Predicts the validation/test/live rows of `df`, shifts every probability
    by one random offset, prints the validation log loss and era consistency
    of the shifted predictions, and writes `id`/`probability` to `filename`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with `data_type`, `era`, `target` and `feature1`..`feature50`
        columns; its index supplies the `id` column.
    model : object
        Fitted classifier exposing `predict_proba`.
    amount : float
        Half-width of the uniform interval the offset is drawn from.
    scaler : object or None
        Fitted transformer exposing `transform`, applied to the features
        before predicting. `None` skips scaling.
    filename : str
        Destination CSV path.
    filter_ : array-like
        Positional indices (or boolean mask) of the feature columns to drop
        before scaling/predicting. Empty (the default) keeps every column.
    """
    random.seed(21)

    # Copy so the `probability` / `id` columns added below are not written into a slice of `df`.
    to_predict = df['data_type'].isin(['validation', 'test', 'live'])
    submission = df.loc[to_predict, :].copy()

    df_predict_feat = submission.loc[:, 'feature1':'feature50']
    # Decide by size, not by `.any()`: an index array such as [0] is "all falsy",
    # which would silently skip dropping the first column.
    filter_ = np.asarray(filter_)
    if filter_.size:
        df_predict_feat = df_predict_feat.drop(df_predict_feat.columns[filter_], axis=1)

    if scaler is not None:
        df_predict_feat = scaler.transform(df_predict_feat)

    # A single offset is drawn per call, so every row is shifted by the same amount.
    # NOTE(review): if per-row noise was intended, this needs one draw per row — confirm.
    offset = random.uniform(-amount, amount)
    submission['probability'] = model.predict_proba(df_predict_feat)[:, 1] + offset

    validation_data = submission.loc[submission['data_type'] == 'validation', :]
    validation_log_loss = log_loss(validation_data['target'], validation_data['probability'])
    print("Logloss: {}".format(validation_log_loss))

    # Consistency: share of validation eras whose own log loss is below 0.693.
    validation_eras = validation_data['era'].unique()
    eras_passed = 0
    for era in validation_eras:
        era_data = validation_data.loc[validation_data['era'] == era, :]
        era_log_loss = log_loss(era_data['target'], era_data['probability'])
        if era_log_loss < 0.693:
            eras_passed += 1

    # Divide by the eras actually present instead of assuming there are 12.
    consistency = eras_passed / len(validation_eras) if len(validation_eras) else 0.0
    print("Consistency: {}".format(consistency))

    submission['id'] = submission.index
    submission = submission.loc[:, ['id', 'probability']]

    submission.to_csv(filename, index=False)


In [322]:
# `create_original_submission` takes the scaler as its fourth positional argument; without it the
# filename would be used as the scaler and `filter_` as the filename. `rfc_top15` was fitted on
# unscaled features, so an identity transformer (FunctionTransformer with no function) is passed.
create_original_submission(df, rfc_top15, 0.012, preprocessing.FunctionTransformer(),
                           'predictions_RF_top15_rand.csv', filter_)

Logloss: 0.6924171968616661
Consistency: 0.8333333333333334


In [7]:
# Scale the selected features without centring. The scaler is fitted on the training
# rows only, and the fit and the transform happen in one pass instead of transforming twice.
scaler = preprocessing.StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(X_train_important)

# Reduce the validation rows to the same columns, then apply the training-set scaling.
validation_features = df.loc[df['data_type']=='validation','feature1':'feature50']
validation_features = validation_features.drop(validation_features.columns[filter_], axis=1)
validation_features_scaled = scaler.transform(validation_features)

In [8]:
# Era-grouped 10-fold splits for the grid search (the generator is one-shot, hence re-created here).
gkf = GroupKFold(n_splits=10)
cv = gkf.split(X_train_era, y_train_era, groups=X_train_era['era'])

# Narrow grid around the hand-tuned forest settings used earlier in the notebook.
param_grid = { 
    'n_estimators': [168, 170, 172,],
    'max_leaf_nodes': [204, 206, 210,],
    'max_features': [10,],
}

# Base estimator; the grid values above override n_estimators / max_leaf_nodes / max_features.
rfc_top15_scaled_CV = RandomForestClassifier(n_jobs=-1,
                             n_estimators=170, 
                             max_leaf_nodes=203,
                             max_features=10,
                             random_state=21)

# The name is rebound from the bare forest to the grid-search wrapper around it.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the estimator's default score
# rather than the log loss reported elsewhere in this notebook — confirm that is intended.
# NOTE(review): n_jobs=-1 is set on both the forest and the search — confirm the nesting is wanted.
rfc_top15_scaled_CV = GridSearchCV(n_jobs=-1, estimator=rfc_top15_scaled_CV, param_grid=param_grid, cv=cv)

logger.info('Starting model fit')
rfc_top15_scaled_CV.fit(X_train_scaled, y_train)
logger.info('Model fitted')

rfc_top15_scaled_CV.best_params_


2018-02-04 12:12:58,705 - INFO - Starting model fit
2018-02-04 15:06:08,423 - INFO - Model fitted


{'max_features': 10, 'max_leaf_nodes': 206, 'n_estimators': 170}

In [None]:
# Show the parameter combination selected by the grid search.
rfc_top15_scaled_CV.best_params_

In [9]:
# Validation log loss of the grid-searched model, scored on the scaled validation features.
validation_target = df.loc[df['data_type'] == 'validation','target']
validation_prediction = rfc_top15_scaled_CV.predict_proba(validation_features_scaled)
rfc_top15_scaled_CV_logloss = log_loss(validation_target, validation_prediction)
rfc_top15_scaled_CV_logloss

0.6924258008731194

0.6924258008731194 features 10 nodes 206 estimators 170

In [14]:
# Write the submission for the grid-searched model; `scaler` is applied to the prediction rows
# inside the helper and `filter_` drops the same feature columns used for training.
create_original_submission(df, rfc_top15_scaled_CV, 0.01, scaler, 'predictions_RF_top15_CV.csv', filter_)

Logloss: 0.6923692745555485
Consistency: 0.8333333333333334
