In [None]:
#####################################
# Libraries
#####################################
# Common libs
import pandas as pd
import numpy as np
import sys
import os
import os.path
import random
from pathlib import Path

from time import time
from itertools import chain

# Image processing
import imageio
import skimage
import skimage.io
import skimage.transform
#from skimage.transform import rescale, resize, downscale_local_mean

# Charts
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import seaborn as sns


# ML
import scipy
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from xgboost import XGBClassifier
#from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
#from sklearn.impute import SimpleImputer
from sklearn.preprocessing import QuantileTransformer,StandardScaler, MinMaxScaler,OneHotEncoder, LabelEncoder, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
#from sklearn.preprocessing import OneHotEncoder
from keras.preprocessing.sequence import TimeseriesGenerator
from keras import optimizers
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Flatten, MaxPool2D, Dropout, BatchNormalization, LSTM, Embedding
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping, ReduceLROnPlateau
from keras.utils import to_categorical
import tensorflow

#####################################
# Settings
#####################################
# Use the seaborn look for charts. Newer Matplotlib releases renamed the
# bundled seaborn styles to 'seaborn-v0_8*'; fall back to the new name when
# the legacy one is no longer registered.
try:
    plt.style.use('seaborn')
except OSError:
    plt.style.use('seaborn-v0_8')
# Set random seeds to make results reproducible (stdlib, NumPy and TensorFlow)
random.seed(42)
np.random.seed(42)
# TensorFlow 2.x exposes the global seed as tensorflow.random.set_seed;
# TensorFlow 1.x only has tensorflow.set_random_seed.
if hasattr(tensorflow, 'random') and hasattr(tensorflow.random, 'set_seed'):
    tensorflow.random.set_seed(42)
else:
    tensorflow.set_random_seed(42)
# NOTE: PYTHONHASHSEED is read when an interpreter starts, so this only
# affects processes launched from here, not hashing in the running kernel.
os.environ['PYTHONHASHSEED'] = '42'
# Improve printed df readability
pd.options.display.float_format = '{:,.4f}'.format
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 200)

In [None]:
from itertools import chain
from kaggle.competitions import twosigmanews
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder
from wordcloud import WordCloud

import datetime
import gc

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.offline as py
import plotly.graph_objs as go
import seaborn as sns
import warnings

# Render matplotlib figures inside the notebook output
%matplotlib inline
# NOTE: this re-seeds NumPy's global RNG and therefore replaces the seed of 42
# that the settings block in the first cell installed.
np.random.seed(2018)
# Set of English stop words taken from the NLTK stopwords corpus
stop = set(stopwords.words('english'))
# Initialise plotly's offline notebook mode
py.init_notebook_mode(connected=True)
# Suppress all Python warnings from here on
warnings.filterwarnings('ignore')

In [None]:
# Switch for the optional analytics steps; kept off during training so the
# kernel does not run out of memory.
# NOTE(review): nothing in the visible part of this notebook reads the flag.
ANALYTICS = False

### Download Data

In [None]:

# Create the competition environment object; the training data is fetched from it in the next cell
env = twosigmanews.make_env()

In [None]:
# Fetch the market and news training frames from the competition environment
(market_train_df, news_train_df) = env.get_training_data()


In [None]:
# Signed difference between close and open price.
# NOTE(review): this column is dropped a few cells further down without being read in between.
market_train_df['price_diff'] = market_train_df['close'] - market_train_df['open']

In [None]:
# Absolute close/open ratio; the next cell uses it to flag rows with an implausible price pair
market_train_df['close_to_open'] =  np.abs(market_train_df['close'] / market_train_df['open'])

In [None]:
# Per-company average open/close price, used as the replacement value for outliers
market_train_df['assetName_mean_open'] = market_train_df.groupby('assetName')['open'].transform('mean')
market_train_df['assetName_mean_close'] = market_train_df.groupby('assetName')['close'].transform('mean')

# Rows whose close/open ratio says the price at least doubled or halved within the day
outlier_rows = (market_train_df['close_to_open'] >= 2) | (market_train_df['close_to_open'] <= 0.5)

# Distance of each price from its company mean, measured before anything is replaced
open_deviation = (market_train_df['assetName_mean_open'] - market_train_df['open']).abs()
close_deviation = (market_train_df['assetName_mean_close'] - market_train_df['close']).abs()
open_is_further = open_deviation > close_deviation

# If the open price is further from the company's mean open price than the
# close price is from the mean close price, replace the open price; otherwise
# replace the close price. Columns are addressed by name and rows by label, so
# the repair does not depend on the column order or on the index being 0..n-1
# (the previous version mixed iterrows() labels with positional .iloc[i, 5]).
fix_open = outlier_rows & open_is_further
fix_close = outlier_rows & ~open_is_further
market_train_df.loc[fix_open, 'open'] = market_train_df.loc[fix_open, 'assetName_mean_open']
market_train_df.loc[fix_close, 'close'] = market_train_df.loc[fix_close, 'assetName_mean_close']

In [None]:
# Remove the temporary helper columns created in the previous cells; 'close_to_open' stays in the frame
market_train_df.drop(['price_diff', 'assetName_mean_open', 'assetName_mean_close'], axis=1, inplace=True)

In [None]:

# Clip the label column (the one get_y reads) to [-0.2, 0.2] so extreme values are capped
market_train_df['returnsOpenNextMktres10'] = market_train_df['returnsOpenNextMktres10'].clip(-0.2, 0.2)

In [None]:
# Symmetric clipping bound per lagged-return column: 0.1 for the columns
# ending in "1", 0.2 for the columns ending in "10".
_clip_bounds = {
    'returnsClosePrevRaw1': 0.1,
    'returnsOpenPrevRaw1': 0.1,
    'returnsClosePrevMktres1': 0.1,
    'returnsOpenPrevMktres1': 0.1,
    'returnsClosePrevRaw10': 0.2,
    'returnsOpenPrevRaw10': 0.2,
    'returnsClosePrevMktres10': 0.2,
    'returnsOpenPrevMktres10': 0.2,
}
for _col, _bound in _clip_bounds.items():
    market_train_df[_col] = market_train_df[_col].clip(-_bound, _bound)



## Modelling








The VM doesn't have enough memory. Temporarily shrinking the dataset. 

In [None]:
# Keep only the last 3,000,000 market rows and the last 6,000,000 news rows
# so the rest of the notebook fits into memory.
market_train_df = market_train_df.tail(3000000)
news_train_df = news_train_df.tail(6000000)



For market data, we extract more features from the original dataset directly. 

In [None]:
def extract_market_features(market_train_df):
    """Downcast the raw market columns to float32 and append ratio features.

    The frame passed in is modified in place and is also returned.

    Added columns:
      * ``close_to_open``  - close / open
      * ``volume_to_mean`` - volume / mean volume of the frame (float32)
      * eight ``returns_*`` columns, each the quotient of two lagged-return
        columns (see ``ratio_features`` below for the exact pairs).

    Division by a zero denominator follows pandas semantics (inf / NaN); no
    clean-up is done here.
    """
    raw_cols = [
        'volume', 'open', 'close',
        'returnsClosePrevRaw1', 'returnsClosePrevRaw10',
        'returnsClosePrevMktres1', 'returnsClosePrevMktres10',
        'returnsOpenPrevRaw1', 'returnsOpenPrevRaw10',
        'returnsOpenPrevMktres1', 'returnsOpenPrevMktres10',
    ]
    # downcast to reduce memory footprint
    for col in raw_cols:
        market_train_df[col] = market_train_df[col].astype(np.float32)

    # Computed once (the same assignment used to be repeated four times)
    market_train_df['close_to_open'] = market_train_df['close'] / market_train_df['open']
    market_train_df['volume_to_mean'] = (market_train_df['volume'] / market_train_df['volume'].mean()).astype(np.float32)

    # new column -> (numerator column, denominator column).
    # The "..._to_close_..." names of the open-based features are historical:
    # their denominators are the *open* 10-step columns, as listed here.
    ratio_features = {
        'returns_close_to_open_prev_raw1': ('returnsClosePrevRaw1', 'returnsOpenPrevRaw1'),
        'returns_close_to_open_prev_raw10': ('returnsClosePrevRaw10', 'returnsOpenPrevRaw10'),
        'returns_close_to_open_prev_mktres1': ('returnsClosePrevMktres1', 'returnsOpenPrevMktres1'),
        'returns_close_to_open_prev_mktres10': ('returnsClosePrevMktres10', 'returnsOpenPrevMktres10'),
        'returns_prev_open_raw1_to_close_raw10': ('returnsOpenPrevRaw1', 'returnsOpenPrevRaw10'),
        'returns_prev_close_raw1_to_close_raw10': ('returnsClosePrevRaw1', 'returnsClosePrevRaw10'),
        'returns_prev_open_mktres1_to_close_mktres10': ('returnsOpenPrevMktres1', 'returnsOpenPrevMktres10'),
        'returns_prev_close_mktres1_to_close_mktres10': ('returnsClosePrevMktres1', 'returnsClosePrevMktres10'),
    }
    for feature, (numerator, denominator) in ratio_features.items():
        market_train_df[feature] = market_train_df[numerator] / market_train_df[denominator]

    return market_train_df



For news data, however, we may have multiple news articles associated with a stock on a given day. We collect a number of key statistics from these articles and aggregate them per stock and day.

In [None]:
def extract_news_features(news_train_df):
    """Aggregate numeric news columns per (time, assetCode).

    Each news row lists several asset codes in its ``assetCodes`` string; the
    row is repeated once per code and the numeric columns are then aggregated
    for every (time, assetCode) pair.

    Parameters
    ----------
    news_train_df : DataFrame with ``time``, ``assetCodes`` (string such as
        "{'AAPL.O', 'AAPL.OQ'}") and the numeric columns named in
        ``news_cols_agg``. The frame is not modified.

    Returns
    -------
    DataFrame indexed by (time, assetCode) with float32 columns named
    ``<column>_<statistic>``, e.g. ``urgency_min`` or ``bodySize_std``.
    """
    spread_stats = ['min', 'max', 'mean', 'std']
    news_cols_agg = {
        'urgency': ['min', 'max', 'count'],
        'takeSequence': ['min', 'max', 'count'],
    }
    for col in ['bodySize', 'wordCount', 'sentenceCount', 'companyCount', 'marketCommentary',
                'relevance', 'sentimentNegative', 'sentimentNeutral', 'sentimentPositive',
                'sentimentWordCount', 'noveltyCount12H', 'noveltyCount24H', 'noveltyCount3D',
                'noveltyCount5D', 'noveltyCount7D', 'volumeCounts12H', 'volumeCounts24H',
                'volumeCounts3D', 'volumeCounts5D', 'volumeCounts7D']:
        news_cols_agg[col] = list(spread_stats)

    # Parse asset codes (str -> list). The result is kept in a local Series
    # so the caller's frame keeps its original string column and the function
    # can be called again on the same frame.
    asset_code_lists = news_train_df['assetCodes'].str.findall(r"'([\w\./]+)'")

    # Expand assetCodes: one entry per (news row, asset code)
    assetCodes_expanded = list(chain(*asset_code_lists))
    if not news_train_df.empty:
        assetCodes_index = news_train_df.index.repeat(asset_code_lists.apply(len))
    else:
        assetCodes_index = news_train_df.index

    assert len(assetCodes_index) == len(assetCodes_expanded)
    df_assetCodes = pd.DataFrame({'level_0': assetCodes_index, 'assetCode': assetCodes_expanded})

    # Create expanded news (repeats every news row once per asset code)
    news_cols = ['time'] + sorted(news_cols_agg.keys())
    news_train_df_expanded = pd.merge(df_assetCodes, news_train_df[news_cols],
                                      left_on='level_0', right_index=True, suffixes=('', '_old'))

    # Free memory (drops the local references only)
    del news_train_df, df_assetCodes

    # Aggregate numerical news features
    news_train_df_aggregated = news_train_df_expanded.groupby(['time', 'assetCode']).agg(news_cols_agg)

    # Free memory
    del news_train_df_expanded

    # Convert to float32 to save memory
    news_train_df_aggregated = news_train_df_aggregated.astype(np.float32)

    # Flatten the (column, statistic) MultiIndex into "column_statistic" names
    news_train_df_aggregated.columns = ['_'.join(col).strip() for col in news_train_df_aggregated.columns.values]

    return news_train_df_aggregated

In [None]:

def merge_market_news(market_train_df, news_train_df):
    """Attach the news columns to the market rows.

    ``news_train_df`` is looked up through its index using the market frame's
    ``time`` and ``assetCode`` columns; market rows without a matching news
    entry are kept and get NaN in the news columns.
    """
    join_keys = ['time', 'assetCode']
    return market_train_df.join(news_train_df, on=join_keys)

In [None]:
def get_y(df):
    """Return ``(y, real)`` built from the ``returnsOpenNextMktres10`` column.

    ``real`` is the one-column frame with the raw values; ``y`` has the same
    shape and holds 1.0 where the value is >= 0 and 0.0 otherwise.
    """
    target = df[['returnsOpenNextMktres10']]
    direction = target.ge(0).astype(float)
    return direction, target

In [None]:
def get_xy(market_train_df, news_train_df,batch_idx, le=None):
    """Build features and labels for the market rows selected by ``batch_idx``.

    Parameters
    ----------
    market_train_df, news_train_df : full market / news frames.
    batch_idx : frame whose index holds the market row labels of the batch
        (it is also handed to get_x, which reads its ``time`` column).
    le : optional ``(le_assetCode, le_assetName)`` pair of existing label
        encoders; forwarded to get_x.

    Returns
    -------
    ``(X, y, le, t, u, real)`` - features, binary labels, label encoders,
    per-row day key, per-row universe flag and the raw label values.
    ``y`` and ``real`` cover exactly the batch rows, in the order of
    ``batch_idx``, so they line up positionally with ``X``.
    """
    X, le, t, u = get_x(market_train_df, news_train_df, batch_idx, le)
    # Labels must come from the same rows as X; taking them from the whole
    # market frame would pair each feature row with an unrelated label.
    y, real = get_y(market_train_df.loc[batch_idx.index])
    return X, y, le, t, u, real

def label_encode(series, min_count):
    """Map every value occurring at least ``min_count`` times to an integer code.

    Codes are assigned in the order of ``series.value_counts()``; values seen
    fewer than ``min_count`` times are left out of the mapping.
    """
    counts = series.value_counts()
    frequent = counts.index[counts >= min_count]
    return dict(zip(frequent, range(len(frequent))))


def get_x(market_train_df, news_train_df,batch_idx,le=None):
    """Build the feature matrix for the market rows selected by ``batch_idx``.

    Steps: select the batch rows, back-fill and standardise the raw numeric
    columns, add the ratio features, join the aggregated news, back-fill and
    standardise all feature columns again, then replace remaining NaN by 0.

    Parameters
    ----------
    market_train_df : full market frame; not modified (the batch rows are copied).
    news_train_df : full news frame; not modified.
    batch_idx : frame indexed by the market row labels of the batch, with a
        ``time`` column that is used to pick the matching news rows.
    le : optional ``(le_assetCode, le_assetName)`` pair; when omitted the two
        mappings are built from the batch with label_encode.

    Returns
    -------
    ``(features, (le_assetCode, le_assetName), time_batch, universe)`` where
    ``features`` holds the 21 float32 columns of ``feature_cols``,
    ``time_batch`` is the day-of-month of each row and ``universe`` is the
    ``universe`` column with NaN replaced by 0.

    Only ``feature_cols`` are returned, so the news aggregates joined below do
    not currently reach the model.
    """
    numeric_cols = ['volume', 'close', 'open', 'returnsClosePrevRaw1', 'returnsOpenPrevRaw1', 'returnsClosePrevMktres1',
                'returnsOpenPrevMktres1', 'returnsClosePrevRaw10', 'returnsOpenPrevRaw10', 'returnsClosePrevMktres10',
                'returnsOpenPrevMktres10']
    # Columns added by extract_market_features
    derived_cols = ['close_to_open', 'volume_to_mean',
                    'returns_close_to_open_prev_raw1', 'returns_close_to_open_prev_raw10',
                    'returns_close_to_open_prev_mktres1', 'returns_close_to_open_prev_mktres10',
                    'returns_prev_open_raw1_to_close_raw10', 'returns_prev_close_raw1_to_close_raw10',
                    'returns_prev_open_mktres1_to_close_mktres10',
                    'returns_prev_close_mktres1_to_close_mktres10']
    feature_cols = numeric_cols + derived_cols

    # Work on a private copy of the batch rows so the caller's frame is untouched
    market_train_df = market_train_df.loc[batch_idx.index].copy()

    # Back-fill gaps within each asset, then standardise over the batch
    market_train_df[numeric_cols] = (market_train_df[['assetCode'] + numeric_cols]
                                     .groupby('assetCode')
                                     .transform(lambda g: g.bfill()))
    scaler = preprocessing.StandardScaler()
    market_train_df[numeric_cols] = scaler.fit_transform(market_train_df[numeric_cols]).astype(np.float32)

    # Keep the news rows whose timestamp equals the timestamp of a batch row
    news_train_df = news_train_df.merge(batch_idx, on=['time'])
    # Split date into before and after 22h (the time used in train data)
    # E.g: 2007-03-07 23:26:39+00:00 -> 2007-03-08 00:00:00+00:00 (next day)
    #      2009-02-25 21:00:50+00:00 -> 2009-02-25 00:00:00+00:00 (current day)
    news_train_df['time'] = (news_train_df['time'] - np.timedelta64(22,'h')).dt.ceil('1D')

    # Round time of market_train_df to 0h of current day
    market_train_df['time'] = market_train_df['time'].dt.floor('1D')
    market_train_df = extract_market_features(market_train_df)
    news_train_df = extract_news_features(news_train_df)
    X = merge_market_news(market_train_df, news_train_df)

    # Second pass over all model features (raw + derived): back-fill per asset, standardise
    X[feature_cols] = (X[['assetCode'] + feature_cols]
                       .groupby('assetCode')
                       .transform(lambda g: g.bfill()))
    scaler = preprocessing.StandardScaler()
    X[feature_cols] = scaler.fit_transform(X[feature_cols]).astype(np.float32)

    # Whatever is still missing becomes 0. Assigning the filled frame back
    # (instead of calling fillna(inplace=True) on a column selection) also
    # works when pandas returns copies from column access.
    fill_cols = feature_cols + ['universe']
    X[fill_cols] = X[fill_cols].fillna(0)
    universe = X['universe']

    # If no label encoders were supplied, build them from this batch
    if le is None:
        le_assetCode = label_encode(X['assetCode'], min_count=10)
        le_assetName = label_encode(X['assetName'], min_count=5)
    else:
        # 'unpack' label encoders
        le_assetCode, le_assetName = le

    # Day of month of every row.
    # NOTE(review): later cells group by this value as "day"; rows from
    # different months that share a day number fall into one group - confirm
    # that this is intended.
    time_batch = X['time'].dt.day.astype(np.int8)

    features = X[feature_cols]
    return features, (le_assetCode, le_assetName),time_batch,universe


We then merge market data and news data, and remove the original dataframes to reduce memory footprint.

Create training and validation datasets and add time and universe to validation dataset in order to calculate custom scores later. 


For LSTM dataset

In [None]:
# Chronological split without shuffling: the last 10% of rows form the test
# set, and the remaining 90% are split 80/20 into training and validation.
market_train_df = market_train_df.sort_values(by=['time'])
_all_rows = market_train_df.index
train_index, test_index = train_test_split(_all_rows, test_size=0.1, shuffle=False, random_state=2018)
train_index, valid_index = train_test_split(train_index, test_size=0.2, shuffle=False, random_state=2018)

# Per split, keep only the (time, assetCode) key of every row
_key_cols = ['time', 'assetCode']
market_train_idx = market_train_df.loc[train_index, _key_cols]
market_val_idx = market_train_df.loc[valid_index, _key_cols]
market_test_idx = market_train_df.loc[test_index, _key_cols]

In [None]:
class LSTM_Generator:
    """Endless batch generator feeding look-back windows to the LSTM.

    ``market`` and ``news`` are the full frames; ``idx`` is the frame of
    (time, assetCode) keys - indexed by market row label - of the split this
    generator may sample from.
    """

    def __init__(self,market, news, idx):
        self.market = market
        self.news = news
        self.idx = idx

    def lstm_batch_lookback(self, batch_size, is_train, look_back, look_back_step):
        """Yield ``(X, y)`` batches forever.

        Each batch is built from up to ``batch_size`` market rows of randomly
        chosen assets and turned into look-back windows by with_look_back.
        ``is_train`` is accepted for interface compatibility and is not used.
        """
        while True:
            # Get market indices of random assets, sorted by assetCode, time.
            batch_index_df = self.get_batch_idx(batch_size)
            X, y, le,t,u,real= self.get_batch(batch_index_df)
            X, y,index = self.with_look_back(X,y,t,u,real,look_back,look_back_step)
            yield X,y

    def get_batch_idx(self, batch_size):
        """Pick random assets until ``batch_size`` rows are collected.

        For each drawn asset the most recent rows of ``self.idx`` are taken
        (only as many as are still missing). Returns the selected part of
        ``self.idx`` sorted by assetCode and time; it can hold fewer than
        ``batch_size`` rows when the split runs out of assets.
        """
        asset_codes = self.idx['assetCode'].unique().tolist()
        asset = np.random.choice(asset_codes)
        asset_codes.remove(asset)
        batch_index_df = self.idx[self.idx.assetCode == asset].tail(batch_size)
        # Repeat until reach batch_size records
        while (batch_index_df.index.size < batch_size) and (len(asset_codes) > 0):
            asset = np.random.choice(asset_codes)
            asset_codes.remove(asset)
            asset_index_df = self.idx[self.idx.assetCode == asset].tail(batch_size - batch_index_df.index.size)
            batch_index_df = pd.concat([batch_index_df, asset_index_df])
        return batch_index_df.sort_values(by=['assetCode', 'time'])

    def get_batch(self, batch_idx):
        """Return get_xy's ``(X, y, le, t, u, real)`` for the given batch rows."""
        X, y, le,t,u,real = get_xy(self.market, self.news, batch_idx)
        return(X, y, le,t,u,real)

    def with_look_back(self, X, y, t, u,real,look_back, look_back_step):
        """Turn row-wise features into look-back windows for the LSTM.

        Window ``i`` holds rows ``i .. i+look_back-1`` of ``X`` sampled every
        ``look_back_step`` rows; its label and index are those of the window's
        last row. ``t``, ``u`` and ``real`` are accepted but not used.

        Returns ``(X_windows, y_windows, index_list)``, or only ``X_windows``
        when ``y`` is None. An empty ``X`` yields empty results.
        """
        # No rows, no windows. Without this guard the clamping below would set
        # the step to 0 and the slice would raise ValueError.
        if len(X) == 0:
            if y is not None:
                return np.array([]), np.array([]), []
            return np.array([])

        # Fix last window in batch, can be not full
        if look_back > len(X):
            look_back = len(X)
            look_back_step = min(look_back_step, look_back)

        x_values = X.values
        y_values = y.values if y is not None else None
        X_processed, y_processed, index_processed = [], [], []
        for i in range(0, len(X) - look_back + 1):
            last = i + look_back - 1
            # Add lookback to X
            X_processed.append(x_values[i:(i + look_back):look_back_step, :])
            if y_values is None:
                continue
            y_processed.append(y_values[last])
            index_processed.append(X.index[last])

        if y is not None:
            return np.array(X_processed), np.array(y_processed), index_processed
        return np.array(X_processed)

    
# One generator per split: all three share the same market/news frames and
# differ only in the row index they sample from.
train_generator = LSTM_Generator(market_train_df, news_train_df,market_train_idx)
val_generator = LSTM_Generator(market_train_df, news_train_df, market_val_idx)
test_generator = LSTM_Generator(market_train_df, news_train_df, market_test_idx)
print('Generators created')

##X,y=next(train_generator.lstm_batch_lookback(20,True,10,2))
##print(X)
##print(y)

In [None]:
# Smoke test: draw one training batch (batch_size=100, look_back=10,
# look_back_step=2) and check the shapes of the returned arrays.
X,y=next(train_generator.lstm_batch_lookback(100,True,10,2))
print(X.shape)
print(y.shape)

In [None]:
from keras.layers.advanced_activations import LeakyReLU
def lstm_128(input_size=21):
    """Build and compile the stacked-LSTM binary classifier.

    Three LSTM layers (128, 64 and 32 units) followed by a single sigmoid
    unit; compiled with Adam, binary cross-entropy and accuracy.

    Parameters
    ----------
    input_size : number of features per time step. The default of 21 matches
        the number of feature columns produced by get_x.

    Returns
    -------
    The compiled Keras ``Sequential`` model. The time dimension is left open
    (``None``), so windows of any length can be fed.
    """
    model = Sequential()
    model.add(LSTM(units=128, return_sequences=True, input_shape=(None, input_size)))
    model.add(LSTM(units=64, return_sequences=True))
    # Last recurrent layer returns only its final state for the dense head
    model.add(LSTM(units=32, return_sequences=False))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
    return model

# Build the network and print its layer summary
model = lstm_128()
model.summary()



In [None]:
# Training configuration.
# batch_size / validation_batch_size: market rows drawn per generator batch
# (each batch is then turned into look-back windows).
# look_back: rows spanned by one window; look_back_step: sampling stride inside the window.
batch_size=1000
validation_batch_size=1000
steps_per_epoch=20
validation_steps=5
epochs=5
look_back=90
look_back_step=10

print(f'epochs:{epochs}, steps per epoch: {steps_per_epoch}, validation steps:{validation_steps}')
print(f'Batch_size:{batch_size}, validation batch size:{validation_batch_size}')

# Fit
# Both generators are endless, so steps_per_epoch / validation_steps decide
# how many batches are consumed per epoch.
training = model.fit_generator(train_generator.lstm_batch_lookback(batch_size=batch_size 
            , is_train=True 
            , look_back=look_back 
            , look_back_step=look_back_step) 
        , epochs=epochs 
        , validation_data=val_generator.lstm_batch_lookback(batch_size=validation_batch_size
            , is_train=False
            , look_back=look_back
            , look_back_step=look_back_step) 
        , steps_per_epoch=steps_per_epoch 
        , validation_steps=validation_steps)


In [None]:
# NOTE(review): t_test and idx are first assigned in later cells, so this
# line fails when the notebook is run top to bottom on a fresh kernel.
print(t_test.loc[idx])

In [None]:
# Predict for the asset of the first test row.
# The slice end is exclusive, so this selects the first 999 test rows.
batch_idx = market_test_idx.index[0:999]
market_sub_idx = market_train_df.loc[batch_idx][['time', 'assetCode']]
asset = market_sub_idx['assetCode'].values[0]
# NOTE(review): asset_idx is not used in the rest of this cell.
asset_idx = market_test_idx[market_test_idx.assetCode == asset]
# All market rows of that asset, indexed by time (the 'time' column is kept as well)
market_df = market_train_df.loc[market_train_df.assetCode == asset].copy().set_index(['time'], drop=False)
# NOTE(review): market_df carries 'time' both as index and as column, which
# pandas may reject as an ambiguous merge key - confirm this line runs.
news_df = news_train_df.merge(market_df, on=['time'])
# NOTE(review): get_x selects rows with market_df.loc[market_sub_idx.index];
# market_df is indexed by time while market_sub_idx carries the original row
# labels, so this lookup looks inconsistent - verify.
X_test, y_test, le_test,t_test,u_test,real= get_xy(market_df,news_df,market_sub_idx)
X_test, y_test,index= test_generator.with_look_back(X_test, y_test,t_test,u_test,real, look_back = 90, look_back_step=10)
# Map the sigmoid output from [0, 1] to a confidence in [-1, 1]
confidence = np.clip(model.predict(X_test)*2-1,-1,1)
# Label the predictions with the dates of market_df from its 90th row onward
y_pred = pd.DataFrame(confidence, index = market_df.iloc[90-1:]['time'].dt.date)

In [None]:
# Score the test split in consecutive chunks and collect, for every predicted
# row, its contribution x_t_i = confidence * return * universe together with
# the day key that the next cell groups by.
x_t_i = []
t_pro = []
test_batch_size = 1000
# Number of chunks needed to cover the whole test split (ceiling division)
n_test_batches = -(-len(market_test_idx) // test_batch_size)
for i in range(0, n_test_batches):
    # Slice ends are exclusive, so the chunk is [i*size, (i+1)*size); the
    # former "-1" silently dropped the last row of every chunk.
    batch_idx = market_test_idx.index[i*test_batch_size:(i+1)*test_batch_size]
    market_sub_idx = market_train_df.loc[batch_idx][['time', 'assetCode']]
    X_test, y_test, le_test,t_test,u_test,real= get_xy(market_train_df,news_train_df,market_sub_idx)
    X_test, y_test,index= test_generator.with_look_back(X_test, y_test,t_test,u_test,real, look_back = 90, look_back_step=10)
    # Map the sigmoid output from [0, 1] to a confidence in [-1, 1]
    confidence = np.clip(model.predict(X_test)*2-1,-1,1)
    y_pred = pd.DataFrame(confidence, index = index)
    for idx in index:
        x_t = y_pred.loc[idx].values * real.loc[idx].values
        x_t = x_t * u_test.loc[idx]
        x_t_i.append(x_t)
        t_pro.append(t_test.loc[idx])

In [None]:
# Put the per-row contributions next to their day key and sum them per day
dataset = pd.DataFrame({'day': t_pro, 'x_t_i': x_t_i}, columns=['day', 'x_t_i'])
x_t = dataset.groupby('day').sum()

In [None]:

# Show the most recently computed prediction frame
print(y_pred)

In [None]:
# Score = mean of the daily sums divided by their standard deviation.
# pandas .std() defaults to the sample standard deviation (ddof=1).
x_t = dataset.groupby('day').sum()
print(x_t)
mean = x_t.mean()
std =  x_t.std()
score = mean / std
print(score)

In [None]:
#batch_idx = market_test_idx.index
#market_sub_idx = market_train_df.loc[batch_idx][['time', 'assetCode']]
# Inspect the row labels that make up the test split
print(market_test_idx.index)

In [None]:
# Score the most recently predicted test batch (y_pred, real, u_test, t_test
# and index are the values left behind by the scoring loop above).
# The labels argument must be the realised returns; it used to receive the
# predictions of a single row instead.
# NOTE: evaluate() is defined in the following cell and has to be run first.
print('The benchmark\'s score on test data: ', evaluate(y_pred.loc[index].values.flatten(), real.loc[index].values.flatten(),
                                                       u_test.loc[index].values.flatten(), t_test.loc[index].values.flatten()))

In [None]:
def evaluate(preds, labels, universe, time):
    """Return mean / standard deviation of the daily sums of preds * labels * universe.

    Parameters
    ----------
    preds : confidence per row.
    labels : realised return per row.
    universe : 0/1 flag per row; rows with 0 do not contribute.
    time : day key per row, used to group the rows into days.

    All four inputs are flattened first and must have the same number of
    elements (column vectors of shape (n, 1) are fine).

    Returns
    -------
    float score. The population standard deviation (ddof=0) is used, as in
    predict_on_test. Defined here as 0.0 when there are no rows or when the
    daily sums have zero standard deviation.
    """
    preds = np.asarray(preds, dtype=float).ravel()
    labels = np.asarray(labels, dtype=float).ravel()
    universe = np.asarray(universe, dtype=float).ravel()
    days = np.asarray(time).ravel()

    x_t_i = preds * labels * universe
    if x_t_i.size == 0:
        return 0.0

    # Sum the per-row contributions within each distinct day
    _, day_ids = np.unique(days, return_inverse=True)
    x_t = np.bincount(day_ids.ravel(), weights=x_t_i)

    std = x_t.std()
    if std == 0:
        return 0.0
    return float(x_t.mean() / std)

In [None]:
def predict_on_test():
    """Predict on the whole test split and print the resulting score.

    Uses the notebook globals ``market_test_idx``, ``market_train_df``,
    ``news_train_df``, ``test_generator`` and ``model``. The score is the
    mean of the daily sums of confidence * return * universe divided by their
    population standard deviation.
    """
    idx = market_test_idx
    # get_xy returns six values and with_look_back three; `real` has to be
    # passed to with_look_back as its fifth positional argument.
    X_test, y_test, le_test, t_test, u_test, real = get_xy(market_train_df, news_train_df, idx)
    X_test, y_test, index = test_generator.with_look_back(X_test, y_test, t_test, u_test, real,
                                                          look_back=90, look_back_step=10)
    # Map the sigmoid output from [0, 1] to a confidence in [-1, 1]
    confidence = model.predict(X_test)*2-1

    # Pick return, universe flag and day key of exactly the rows that got a
    # prediction (the last row of every look-back window).
    returns = real.loc[index].values.flatten()
    universe = u_test.loc[index].values.flatten()
    days = t_test.loc[index].values.flatten()
    x_t_i = confidence.flatten() * returns * universe

    dataset = pd.DataFrame({'day': days, 'x_t_i': x_t_i}, columns=['day', 'x_t_i'])
    x_t = dataset.groupby('day').sum().values.flatten()
    mean = np.mean(x_t)
    std = np.std(x_t)
    score = mean / std
    print(score)
    
# Run the evaluation on the test split
predict_on_test()

In [None]:
def get_xy(market_train_df, news_train_df, le=None):
    """Build model inputs and targets from the market and news frames.

    @param market_train_df: market frame, handed to get_x and then to get_y.
    @param news_train_df: news frame, handed to get_x.
    @param le: optional (assetCode_encoder, assetName_encoder) pair to reuse;
        when None, get_x builds new encoders.
    @return: (X, y, le, t, u, real) - the features, encoders, day labels and universe
        from get_x together with the two values returned by get_y.
    """
    # Forward the caller's encoders; previously `le` was accepted but never passed on,
    # so get_x always rebuilt them.
    X, le, t, u = get_x(market_train_df, news_train_df, le)
    y, real = get_y(market_train_df)
    return X, y, le, t, u, real

def label_encode(series, min_count):
    """Map each sufficiently frequent value of `series` to a consecutive integer code.

    @param series: pandas Series of values to encode.
    @param min_count: minimum number of occurrences a value needs to receive a code.
    @return: dict {value: code}; codes start at 0 and follow the order of
        `series.value_counts()`. Values seen fewer than min_count times are left out.
    """
    counts = series.value_counts()
    kept_values = counts.index[counts >= min_count]
    return dict(zip(kept_values, range(len(kept_values))))


def get_x(market_train_df, news_train_df,le=None):
    """Turn the market and news frames into the scaled feature matrix fed to the model.

    Side effects: the numeric market columns and 'time' of market_train_df, and 'time'
    of news_train_df, are overwritten in place on the frames the caller passed in.

    @param market_train_df: market frame with 'assetCode', a datetime 'time' and the
        numeric columns listed in numeric_cols.
    @param news_train_df: news frame with a datetime 'time' column.
    @param le: optional (assetCode_encoder, assetName_encoder) pair of dicts as produced
        by label_encode; when None, new encoders are built from the merged frame.
    @return: (features, (le_assetCode, le_assetName), time_batch, universe) where
        features is the merged frame restricted to feature_cols (float32, NaN -> 0),
        time_batch is the day-of-month of each merged row (int8) and universe is the
        merged 'universe' column with NaN replaced by 0.
    """
    numeric_cols = ['volume', 'close', 'open', 'returnsClosePrevRaw1', 'returnsOpenPrevRaw1', 'returnsClosePrevMktres1',
                    'returnsOpenPrevMktres1', 'returnsClosePrevRaw10', 'returnsOpenPrevRaw10', 'returnsClosePrevMktres10',
                    'returnsOpenPrevMktres10']
    derived_cols = ['close_to_open', 'volume_to_mean',
                    'returns_close_to_open_prev_raw1', 'returns_close_to_open_prev_raw10',
                    'returns_close_to_open_prev_mktres1', 'returns_close_to_open_prev_mktres10',
                    'returns_prev_open_raw1_to_close_raw10', 'returns_prev_close_raw1_to_close_raw10',
                    'returns_prev_open_mktres1_to_close_mktres10', 'returns_prev_close_mktres1_to_close_mktres10']
    # Model inputs: raw numeric market columns followed by the derived market ratios.
    # The news aggregates (urgency, takeSequence, bodySize, wordCount, sentenceCount,
    # companyCount, marketCommentary, relevance, sentiment*, noveltyCount*, volumeCounts*
    # with _min/_max/_mean/_std/_count suffixes) are currently left out of the inputs.
    feature_cols = numeric_cols + derived_cols

    # Back-fill gaps within each asset, then standardize the raw market columns.
    market_train_df[numeric_cols] = (market_train_df[['assetCode'] + numeric_cols]
                                     .groupby('assetCode')
                                     .transform(lambda g: g.bfill()))
    scaler = preprocessing.StandardScaler()
    market_train_df[numeric_cols] = scaler.fit_transform(market_train_df[numeric_cols]).astype(np.float32)

    # Split news timestamps into before and after 22h (the time used in train data)
    # E.g: 2007-03-07 23:26:39+00:00 -> 2007-03-08 00:00:00+00:00 (next day)
    #      2009-02-25 21:00:50+00:00 -> 2009-02-25 00:00:00+00:00 (current day)
    news_train_df['time'] = (news_train_df['time'] - np.timedelta64(22,'h')).dt.ceil('1D')

    # Round time of market_train_df to 0h of the current day
    market_train_df['time'] = market_train_df['time'].dt.floor('1D')
    market_train_df = extract_market_features(market_train_df)
    news_train_df = extract_news_features(news_train_df)
    X = merge_market_news(market_train_df, news_train_df)

    # Same treatment for the merged frame: per-asset back-fill, standardize, and only
    # then replace whatever is still missing by 0.
    # NOTE(review): the raw numeric columns were already standardized above, so they are
    # standardized twice, and the scalers are re-fitted on every call - confirm intended.
    X[feature_cols] = (X[['assetCode'] + feature_cols]
                       .groupby('assetCode')
                       .transform(lambda g: g.bfill()))
    scaler = preprocessing.StandardScaler()
    X[feature_cols] = scaler.fit_transform(X[feature_cols]).astype(np.float32)
    # One explicit assignment instead of per-column `X[col].fillna(0, inplace=True)`,
    # which acts on a temporary Series and is not guaranteed to write back into X.
    X[feature_cols] = X[feature_cols].fillna(0)
    # Taken as its own Series so the returned universe does not depend on whether an
    # in-place fill on X happens to propagate to an earlier view.
    universe = X['universe'].fillna(0)

    # Reuse the caller's label encoders when given, otherwise build them from X.
    if le is None:
        le_assetCode = label_encode(X['assetCode'], min_count=10)
        le_assetName = label_encode(X['assetName'], min_count=5)
    else:
        # 'unpack' label encoders
        le_assetCode, le_assetName = le

    # Values without a code (unseen or too rare) are encoded as -1.
    X['assetCode'] = X['assetCode'].map(le_assetCode).fillna(-1).astype(np.int16)
    X['assetName'] = X['assetName'].map(le_assetName).fillna(-1).astype(np.int16)
    # NOTE(review): .dt.day is the day of the month, so rows from different months share
    # a label - confirm this is the intended grouping key for the daily score.
    time_batch = X['time'].dt.day.astype(np.int8)

    # Neither column may remain among the inputs; the target column can be absent,
    # so missing labels are ignored instead of swallowing every exception.
    X.drop(columns=['returnsOpenNextMktres10', 'universe'], errors='ignore', inplace=True)
    X['dayofweek'], X['month'] = X.time.dt.dayofweek.astype(np.int8), X.time.dt.month.astype(np.int8)
    X.drop(columns='time', inplace=True)

    # Fix some mixed-type columns
    for bogus_col in ['marketCommentary_min', 'marketCommentary_max']:
        X[bogus_col] = X[bogus_col].astype(np.float32)

    # NOTE(review): assetCode, assetName, dayofweek, month and marketCommentary_* are not
    # in feature_cols, so none of the columns prepared above reach the returned features.
    features = X[feature_cols]
    return features, (le_assetCode, le_assetName),time_batch,universe

In [None]:
def make_predictions(market_obs_df, news_obs_df, predictions_template_df):
    """
    Predict one day's confidences and store them in predictions_template_df['confidenceValue'].
    @param market_obs_df: market_obs_df returned from env
    @param news_obs_df: news_obs_df returned from env
    @param predictions_template_df: predictions_template_df returned from env.
    @return: None. predictions_template_df is updated in place instead.
    """
    # Preprocess the observations into features, day labels and universe
    features, days, universe = get_x_sub(market_obs_df, news_obs_df)
    # Build the LSTM look-back windows; there are no targets at prediction time, hence None.
    # NOTE(review): predict_on_test unpacks four values from with_look_back - confirm
    # what it returns when the targets are None.
    windows = train_generator.with_look_back(features, None, days, universe, look_back=90, look_back_step=10)
    raw_scores = model.predict(windows)
    # *2-1 maps a [0, 1] model output onto [-1, 1]
    confidence_df = pd.DataFrame(raw_scores * 2 - 1, columns=['confidence'])

    # Align the confidences with the template on the index; rows without a prediction get 0
    merged = pd.concat([predictions_template_df, confidence_df], axis=1).fillna(0)
    predictions_template_df.confidenceValue = merged['confidence']

In [None]:
# Ask env for the prediction days; each item is unpacked below into a
# (market observations, news observations, predictions template) triple.
days = env.get_prediction_days()

# For every day: make_predictions fills confidenceValue in the template,
# then the template is handed back to env.
for (market_obs_df, news_obs_df, predictions_template_df) in days:
    make_predictions(market_obs_df, news_obs_df, predictions_template_df)
    env.predict(predictions_template_df)

In [None]:
# Ask env to write the submission file.
env.write_submission_file()