<h1 style="padding-top: 25px;padding-bottom: 25px;text-align: left; padding-left: 10px; background-color: #DDDDDD; 
    color: black;"> <img style="float: left; padding-right: 10px; width: 45px" src="https://raw.githubusercontent.com/Harvard-IACS/2018-CS109A/master/content/styles/iacs.png"> AC295: Advanced Practical Data Science </h1>

## Project: News Analytics for Stock Return Prediction

**Harvard University, Fall 2020**  
**Instructors**: Pavlos Protopapas  

### **Team: $\alpha\beta normal$ $Distri\beta ution$**
#### **Roht Beri, Eduardo Peynetti, Jessica Wijaya, Stuart Neilson**

# Generating Master TFRecords for BERT and fin-BERT




**For multiple targets and models**

## Disks

### Connect Google Drive

In [1]:
from google.colab import drive

# Mount Google Drive at /content/drive; the path constants defined later in
# this notebook all point below this mount point. force_remount=False leaves
# an existing mount untouched.
drive.mount('/content/drive', force_remount=False)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Libraries

### Install Packages

In [2]:
# NOTE(review): only pymongo is version-pinned below. transformers and
# dask[dataframe] are installed unpinned, so a fresh runtime may resolve
# different versions than the ones this notebook was last run with.
!pip3 install transformers
!pip3 install --upgrade pymongo[srv]==3.10.1
!pip3 install dask[dataframe]

Requirement already up-to-date: pymongo[srv]==3.10.1 in /usr/local/lib/python3.6/dist-packages (3.10.1)


### Imports

In [3]:
import os
import ast
import requests
import tarfile
import tempfile
import zipfile
import shutil
import csv
import json
import time
import sys
import subprocess
import logging
import pickle

import pymongo
import bson
import dns

import numpy as np
import pandas as pd
import dask.dataframe as dd
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow_hub as hub

from tensorflow import keras
from tensorflow.python.keras import backend as K
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras import layers

from collections import Counter
from glob import glob
from threading import Thread

from pytz import timezone
from bson import BSON, ObjectId
from pymongo import MongoClient
from tabulate import tabulate
from tqdm.notebook import tqdm, trange
from datetime import datetime, timedelta, date, time
from dateutil.parser import isoparser

from transformers import BertTokenizer, TFBertForSequenceClassification, AutoModel

%matplotlib inline

## Variables

### Useful Constants and Variables

In [4]:
# Set google drive path for pipeline storage
# (TF_MASTER_PATH is concatenated directly with file names further down, so it
# must keep its trailing slash.)
TF_MASTER_PATH = '/content/drive/MyDrive/abnormal-distribution-project-data/TF_RECORDS_MASTER/'
# Contains a `{}` placeholder that has to be filled in (e.g. with str.format)
# before the path is usable.
TF_MODEL_PATH = '/content/drive/MyDrive/abnormal-distribution-project-data/BERT_MODEL_{}/'
TF_HS_PATH = '/content/drive/MyDrive/abnormal-distribution-project-data/fin-BERT_HS/'
TICKERS_PATH = '/content/drive/MyDrive/abnormal-distribution-project-data/components/sp1500.csv'
IND_PATH = '/content/drive/MyDrive/abnormal-distribution-project-data/components/industries.csv'
IND_GROUP_PATH = '/content/drive/MyDrive/abnormal-distribution-project-data/components/industry_groups.csv'
SECTOR_PATH = '/content/drive/MyDrive/abnormal-distribution-project-data/components/sectors.csv'
BERT_HIDDEN_PATH = '/content/drive/MyDrive/abnormal-distribution-project-data/headlines/bert_hs/'

# Pipeline variables
BATCH_SIZE = 128
AUTOTUNE = tf.data.experimental.AUTOTUNE
prefetch = AUTOTUNE

# BERT Model
fin_bert_model = 'ipuneetrathore/bert-base-cased-finetuned-finBERT' 
bert_model = 'bert-base-uncased'
# NOTE(review): these two flags are passed as `do_lower_case` to
# BertTokenizer.from_pretrained in create_TFRecords, so despite their names
# they mean "lower-case the text": False for the cased fin-BERT checkpoint,
# True for the uncased BERT checkpoint.
fin_cased = False
bert_cased = True

In [5]:
# Pickled train / validation / test splits. These are the default inputs of
# create_TFRecords (train_path, valid_path, test_path).
#MASTER_PATH = '/content/drive/MyDrive/abnormal-distribution-project-data/lean/master.p'
TRAIN_PATH = '/content/drive/MyDrive/abnormal-distribution-project-data/lean/train.p'
VALID_PATH = '/content/drive/MyDrive/abnormal-distribution-project-data/lean/valid.p'
TEST_PATH = '/content/drive/MyDrive/abnormal-distribution-project-data/lean/test.p'

In [6]:
# Load the pickled filter list and turn it into a TensorFlow constant.
# filter_train compares every record's `_id` against this constant and drops
# the records whose id appears in it.
# NOTE(security): pickle.load executes arbitrary code from the file; only load
# pickles produced by this project.
data_path = '/content/drive/MyDrive/abnormal-distribution-project-data/lean/ml_filter.p'
with open(data_path,'rb') as pkl_file:
    LM_FILTER_LIST = pickle.load(pkl_file)
# NOTE(review): tf.constant infers the dtype from the pickled values, while the
# `_id` feature is parsed as tf.int64 -- confirm the two dtypes agree.
LM_FILTER_LIST = tf.constant(LM_FILTER_LIST)

## TFRecords

### Utils for Creating TFRecords

In [7]:
# The following functions can be used to convert a value to a type compatible
# with tf.train.Example.
# Credit: https://www.tensorflow.org/tutorials/load_data/tfrecord

def _bytes_feature(value):
  """Returns a bytes_list Feature from a str / bytes / eager string tensor.

  Args:
    value: ``bytes``, ``str`` (encoded as UTF-8 before storing) or an eager
      tensor, whose ``.numpy()`` value is stored.

  Returns:
    tf.train.Feature wrapping a single-element BytesList.
  """
  if isinstance(value, type(tf.constant(0))):
    value = value.numpy() # BytesList won't unpack a string from an EagerTensor.
  if isinstance(value, str):
    # BytesList only accepts bytes under Python 3, so encode text explicitly.
    value = value.encode('utf-8')
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _float_feature(value):
  """Wrap one float / double in a tf.train.Feature holding a FloatList."""
  wrapped = tf.train.FloatList(value=[value])
  return tf.train.Feature(float_list=wrapped)

def _int64_feature(value):
  """Wrap one bool / enum / int / uint in a tf.train.Feature holding an Int64List."""
  wrapped = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=wrapped)

In [8]:
# Function to create TFRecord Sample for writing
def get_tf_record(item, fin_tokenizer, bert_tokenizer, max_length=512):
    """Build one ``tf.train.Example`` from a single dataset row.

    Args:
        item: Mapping-like row (e.g. a pandas Series) providing ``_id``,
            ``ticker``, ``open_date``, ``close_date``, ``text`` and the nine
            integer target columns read below.
        fin_tokenizer: Tokenizer whose output is stored under the ``f_*`` keys.
        bert_tokenizer: Tokenizer whose output is stored under the ``b_*`` keys.
        max_length: Length every token sequence is padded / truncated to.
            Defaults to 512, the value that used to be hard-coded.

    Returns:
        tf.train.Example with the id, ticker, ISO-formatted open/close dates,
        both tokenizations (as raw int32 bytes) and the nine targets.
    """
    text = item['text']

    # Same text, two vocabularies: fin-BERT first, then plain BERT.
    f_input, f_type, f_attention = _tokenize_to_bytes(fin_tokenizer, text, max_length)
    b_input, b_type, b_attention = _tokenize_to_bytes(bert_tokenizer, text, max_length)

    # Create tf.train.Example
    feature = {
        '_id' : _int64_feature(item['_id']),

        'ticker': _bytes_feature(item['ticker'].encode('UTF-8')),
        'open': _bytes_feature(item['open_date'].isoformat().encode('UTF-8')),
        'close': _bytes_feature(item['close_date'].isoformat().encode('UTF-8')),

        'b_input_ids': _bytes_feature(b_input),
        'b_token_type_ids': _bytes_feature(b_type),
        'b_attention_mask': _bytes_feature(b_attention),

        'f_input_ids': _bytes_feature(f_input),
        'f_token_type_ids': _bytes_feature(f_type),
        'f_attention_mask': _bytes_feature(f_attention),

        'ret_c_2': _int64_feature(item['ret_close_2']),
        'ret_c_3': _int64_feature(item['ret_close_3']),
        'res_c_2': _int64_feature(item['res_close_2']),
        'res_c_3': _int64_feature(item['res_close_3']),
        'ret_o_2': _int64_feature(item['ret_open_2']),
        'ret_o_3': _int64_feature(item['ret_open_3']),
        'ret_c_d': _int64_feature(item['dict_ret_close']),
        'ret_o_d': _int64_feature(item['dict_ret_open']),
        'res_c_d': _int64_feature(item['dict_res_close']),
    }
    return tf.train.Example(features=tf.train.Features(feature=feature))


def _tokenize_to_bytes(tokenizer, text, max_length):
    """Tokenize ``text``; return (input_ids, token_type_ids, attention_mask) as raw bytes."""
    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,       # add [CLS], [SEP]
        max_length=max_length,         # max length of the text that can go to BERT (<=512)
        padding='max_length',
        return_attention_mask=True,    # add attention mask to not focus on pad tokens
        truncation='longest_first',
        return_tensors="tf"
    )
    # The readers in this notebook decode these bytes with tf.int32, so pin the
    # element width here. tobytes() replaces the deprecated ndarray.tostring().
    return tuple(
        encoded[key].numpy().astype(np.int32, copy=False).tobytes()
        for key in ('input_ids', 'token_type_ids', 'attention_mask')
    )

In [9]:
# Writes one TFRecord shard from a slice of a dataset split
def write_tfrecord(df, shard_path, fin_tokenizer, bert_tokenizer):
    """Tokenize every row of ``df`` and write the examples to ``shard_path``.

    Rows are converted and written one at a time, so only a single example is
    held in memory. The shard is first written to ``shard_path + '.tmp'`` and
    renamed once complete, so an interrupted run never leaves a truncated file
    under the final name (create_TFRecords skips any shard that already exists).

    Args:
        df: pandas DataFrame slice; each row is passed to get_tf_record.
        shard_path: Destination path of the shard.
        fin_tokenizer: Forwarded to get_tf_record.
        bert_tokenizer: Forwarded to get_tf_record.
    """
    tmp_path = shard_path + '.tmp'

    try:
        # TFRecord writer initialized
        with tf.io.TFRecordWriter(tmp_path) as writer:
            for _, row in tqdm(df.iterrows(), total=len(df)):
                # Tokenize the row, build the Example and write it
                example = get_tf_record(row, fin_tokenizer, bert_tokenizer)
                writer.write(example.SerializeToString())
    except BaseException:
        # Clean up the partial file (also on Ctrl-C), then let the error surface.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

    # Publish the finished shard under its final name.
    os.replace(tmp_path, shard_path)

In [10]:
# Function to create TFRecords for the dataset
def create_TFRecords(
    train = False, valid=False, test = False,
    train_path=TRAIN_PATH, valid_path=VALID_PATH, 
    test_path=TEST_PATH, path=TF_MASTER_PATH,
    max_num_records=100000
):
    """Convert the pickled splits into sharded TFRecord files.

    Shards are named ``<split>_<NN>.records`` inside ``path``. A shard file
    that already exists is skipped without inspecting its contents, which lets
    an interrupted run be resumed.

    Args:
        train, valid, test: Which splits to convert.
        train_path, valid_path, test_path: Pickle files holding the DataFrame
            of each split.
        path: Output directory; file names are appended directly, so it must
            end with a path separator.
        max_num_records: Maximum number of rows per shard (default 100000,
            the value that used to be hard-coded).

    Raises:
        ValueError: If ``max_num_records`` is not positive.
    """
    if max_num_records <= 0:
        raise ValueError("max_num_records must be a positive integer")

    # Setup folders (also creates missing parents; no error if already present)
    os.makedirs(path, exist_ok=True)

    # Setup paths
    shard_path = path + "{}_{:02d}.records"

    # Splits to process, as (name, pickle path) pairs
    splits = []
    if train:
        splits.append(('train', train_path))
    if valid:
        splits.append(('valid', valid_path))
    if test:
        splits.append(('test', test_path))

    # Nothing requested: skip the tokenizer download entirely
    if not splits:
        return

    # Initialize BERT tokenizers. fin_cased / bert_cased are used as the
    # `do_lower_case` flag of the respective tokenizer.
    fin_tokenizer = BertTokenizer.from_pretrained(fin_bert_model, do_lower_case=fin_cased)
    bert_tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=bert_cased)

    for split_name, data_path in tqdm(splits):

        # Read the data into pandas
        # NOTE(security): pickle.load executes arbitrary code from the file;
        # only point this at pickles produced by this project.
        with open(data_path,'rb') as pkl_file:
            df = pickle.load(pkl_file)

        num_records = df.shape[0]
        # Ceiling division: a trailing partial shard still gets its own file
        num_shards = -(-num_records // max_num_records)

        print('Processing {} data'.format(split_name))
        for j in trange(num_shards):
            # Path for the shard
            temp_path = shard_path.format(split_name, j)

            # Check if the shard already exists
            if os.path.exists(temp_path):
                continue

            # Start and the end of the shard
            start = j * max_num_records
            end = min(num_records, (j+1)*max_num_records)

            # Create the TFRecord from the dataframe slice
            write_tfrecord(df.iloc[start : end], temp_path, fin_tokenizer, bert_tokenizer)

### Create TFRecords Dataset

In [11]:
#shutil.rmtree('/content/drive/MyDrive/abnormal-distribution-project-data/headlines/tf_records_monthly_fin')

In [12]:
# Create TFRecords for Stocks
create_TFRecords(train=True, valid=True, test=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=40.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Processing train data


HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))


Processing valid data


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Processing test data


HBox(children=(FloatProgress(value=0.0, max=18.0), HTML(value='')))





### Read TFRecords Dataset

In [13]:
# Check data integrity
tick_path = TF_MASTER_PATH + '*.records'

files = glob(tick_path)
i = 0  # number of shard files that failed the check
for file in tqdm(files):
    raw_dataset = tf.data.TFRecordDataset(file)
    try:
        # Smoke test only: just the first 10 records of every shard are parsed.
        for raw_record in raw_dataset.take(10):
            example = tf.train.Example()
            example.ParseFromString(raw_record.numpy())
            # Indexing a protobuf map with a missing key silently creates the
            # entry, so test membership to really verify that `_id` is there.
            if '_id' not in example.features.feature:
                raise ValueError("record has no '_id' feature")
    except Exception as err:
        # Exception (not a bare except) so Ctrl-C still interrupts the scan.
        # The file is deliberately not deleted here; remove it by hand.
        print('file in problem: ', file, repr(err))
        i += 1
    finally:
        del raw_dataset

if not files:
    print("No TFRecord files found for pattern:", tick_path)
elif i:
    print(f"{i} files should be deleted and rerun Create TFRecords")
else:
    print("TFRecords are in good shape")

HBox(children=(FloatProgress(value=0.0, max=37.0), HTML(value='')))


TFRecords are in good shape


In [14]:
# Open a single training shard as a dataset of serialized examples; the bare
# last expression displays the dataset repr.
filenames = [TF_MASTER_PATH + 'train_02.records']
raw_dataset = tf.data.TFRecordDataset(filenames)
raw_dataset

<TFRecordDatasetV2 shapes: (), types: tf.string>

In [15]:
# Parse the first serialized record back into a tf.train.Example and print it
# in full (the token features are raw bytes, so the output is long).
for raw_record in raw_dataset.take(1):
    example = tf.train.Example()
    example.ParseFromString(raw_record.numpy())
    print(example)

features {
  feature {
    key: "_id"
    value {
      int64_list {
        value: 1584485
      }
    }
  }
  feature {
    key: "b_attention_mask"
    value {
      bytes_list {
        value: "\001\000\000\000\001\000\000\000\001\000\000\000\001\000\000\000\001\000\000\000\001\000\000\000\001\000\000\000\001\000\000\000\001\000\000\000\001\000\000\000\001\000\000\000\001\000\000\000\001\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\00

In [16]:
# Feature spec used to parse the serialized examples: a scalar int64 id,
# scalar strings for the metadata and raw token bytes, scalar int64 targets.
features = {'_id': tf.io.FixedLenFeature([], tf.int64)}
features.update({
    key: tf.io.FixedLenFeature([], tf.string)
    for key in (
        'ticker', 'open', 'close',
        'b_input_ids', 'b_token_type_ids', 'b_attention_mask',
        'f_input_ids', 'f_token_type_ids', 'f_attention_mask',
    )
})
features.update({
    key: tf.io.FixedLenFeature([], tf.int64)
    for key in (
        'ret_c_2', 'ret_c_3', 'res_c_2', 'res_c_3', 'ret_o_2',
        'ret_o_3', 'ret_c_d', 'ret_o_d', 'res_c_d',
    )
})

def _parse_image_function(example_proto):
  """Parse one serialized tf.train.Example with the module-level `features` spec.

  (The name is a leftover; the records hold tokenized text, not images.)
  """
  parsed = tf.io.parse_single_example(example_proto, features)
  return parsed

parsed_image_dataset = raw_dataset.map(_parse_image_function)
parsed_image_dataset

<MapDataset shapes: {_id: (), b_attention_mask: (), b_input_ids: (), b_token_type_ids: (), close: (), f_attention_mask: (), f_input_ids: (), f_token_type_ids: (), open: (), res_c_2: (), res_c_3: (), res_c_d: (), ret_c_2: (), ret_c_3: (), ret_c_d: (), ret_o_2: (), ret_o_3: (), ret_o_d: (), ticker: ()}, types: {_id: tf.int64, b_attention_mask: tf.string, b_input_ids: tf.string, b_token_type_ids: tf.string, close: tf.string, f_attention_mask: tf.string, f_input_ids: tf.string, f_token_type_ids: tf.string, open: tf.string, res_c_2: tf.int64, res_c_3: tf.int64, res_c_d: tf.int64, ret_c_2: tf.int64, ret_c_3: tf.int64, ret_c_d: tf.int64, ret_o_2: tf.int64, ret_o_3: tf.int64, ret_o_d: tf.int64, ticker: tf.string}>

In [17]:
# Inspect the first parsed record only (the loop breaks after one iteration).
# `i` is assigned but not used in this cell.
i = 0
for data in parsed_image_dataset:
    # The token ids were stored as raw bytes; decode them back to int32 values.
    input_ids = tf.io.decode_raw(data['f_input_ids'], tf.int32)
    print(data['res_c_d'].numpy())
    print(data['_id'])
    print(data['ticker'])
    print(input_ids.numpy()[:10])
    print(input_ids.numpy().shape)
    break

2
tf.Tensor(1584485, shape=(), dtype=int64)
tf.Tensor(b'tm', shape=(), dtype=string)
[  101  1798  2381  1111   158   119   156   119 12983 15689]
(512,)


## Pipeline

### Utils for BERT Pipeline

In [18]:
# Function to parse data features
def _parse_features_function(example):
    """Parse one serialized tf.train.Example into a dict of scalar tensors.

    The spec holds an int64 `_id`, string features for the metadata and the
    raw token bytes, and int64 features for the nine targets.
    """
    string_keys = (
        'ticker', 'open', 'close',
        'b_input_ids', 'b_token_type_ids', 'b_attention_mask',
        'f_input_ids', 'f_token_type_ids', 'f_attention_mask',
    )
    int_keys = (
        'ret_c_2', 'ret_c_3', 'res_c_2', 'res_c_3', 'ret_o_2',
        'ret_o_3', 'ret_c_d', 'ret_o_d', 'res_c_d',
    )

    tf_records_features = {'_id': tf.io.FixedLenFeature([], tf.int64)}
    for key in string_keys:
        tf_records_features[key] = tf.io.FixedLenFeature([], tf.string)
    for key in int_keys:
        tf_records_features[key] = tf.io.FixedLenFeature([], tf.int64)

    return tf.io.parse_single_example(example, tf_records_features)

In [19]:
# Structure the data for Sentiment & Hidden Layer Generation
def structure_data_fin_HS(data):
    """Split a parsed record into fin-BERT model inputs and record metadata.

    Returns:
        ((input_ids, token_type_ids, attention_mask),
         (_id, ticker, close, ret_c_3, res_c_3))
    """
    def _as_int32(key):
        # Token features are stored as raw bytes; decode them as int32 values.
        return tf.io.decode_raw(data[key], tf.int32)

    model_inputs = (
        _as_int32('f_input_ids'),
        _as_int32('f_token_type_ids'),
        _as_int32('f_attention_mask'),
    )
    metadata = (
        data['_id'],
        data['ticker'],
        data['close'],
        data['ret_c_3'],
        data['res_c_3'],
    )
    return (model_inputs, metadata)

In [20]:
# Structure the data for training returns with fin-BERT Models with different targets
def _structure_fin_target(data, target_key):
    """Decode the fin-BERT token features of a parsed record and pair them with one target.

    Args:
        data: Dict produced by _parse_features_function.
        target_key: Name of the int64 target feature to use as the label.

    Returns:
        ((input_ids, token_type_ids, attention_mask), target)
    """
    input_ids = tf.io.decode_raw(data['f_input_ids'], tf.int32)
    attention_mask = tf.io.decode_raw(data['f_attention_mask'], tf.int32)
    token_type_ids = tf.io.decode_raw(data['f_token_type_ids'], tf.int32)

    return ((input_ids, token_type_ids, attention_mask), data[target_key])


def structure_data_fin_ret_c_2(data):
    """fin-BERT inputs paired with the ``ret_c_2`` target."""
    return _structure_fin_target(data, 'ret_c_2')


def structure_data_fin_ret_c_3(data):
    """fin-BERT inputs paired with the ``ret_c_3`` target."""
    return _structure_fin_target(data, 'ret_c_3')


def structure_data_fin_res_c_2(data):
    """fin-BERT inputs paired with the ``res_c_2`` target."""
    return _structure_fin_target(data, 'res_c_2')


def structure_data_fin_res_c_3(data):
    """fin-BERT inputs paired with the ``res_c_3`` target."""
    return _structure_fin_target(data, 'res_c_3')


def structure_data_fin_ret_o_2(data):
    """fin-BERT inputs paired with the ``ret_o_2`` target."""
    return _structure_fin_target(data, 'ret_o_2')


def structure_data_fin_ret_o_3(data):
    """fin-BERT inputs paired with the ``ret_o_3`` target."""
    return _structure_fin_target(data, 'ret_o_3')


def structure_data_fin_ret_c_d(data):
    """fin-BERT inputs paired with the ``ret_c_d`` target."""
    return _structure_fin_target(data, 'ret_c_d')


def structure_data_fin_ret_o_d(data):
    """fin-BERT inputs paired with the ``ret_o_d`` target."""
    return _structure_fin_target(data, 'ret_o_d')


def structure_data_fin_res_c_d(data):
    """fin-BERT inputs paired with the ``res_c_d`` target."""
    return _structure_fin_target(data, 'res_c_d')

In [21]:
# Structure the data for training returns with normal BERT Models with different targets
def _structure_bert_target(data, target_key):
    """Decode the BERT token features of a parsed record and pair them with one target.

    Args:
        data: Dict produced by _parse_features_function.
        target_key: Name of the int64 target feature to use as the label.

    Returns:
        ((input_ids, token_type_ids, attention_mask), target)
    """
    input_ids = tf.io.decode_raw(data['b_input_ids'], tf.int32)
    attention_mask = tf.io.decode_raw(data['b_attention_mask'], tf.int32)
    token_type_ids = tf.io.decode_raw(data['b_token_type_ids'], tf.int32)

    return ((input_ids, token_type_ids, attention_mask), data[target_key])


def structure_data_bert_ret_c_2(data):
    """BERT inputs paired with the ``ret_c_2`` target."""
    return _structure_bert_target(data, 'ret_c_2')


def structure_data_bert_ret_c_3(data):
    """BERT inputs paired with the ``ret_c_3`` target."""
    return _structure_bert_target(data, 'ret_c_3')


def structure_data_bert_res_c_2(data):
    """BERT inputs paired with the ``res_c_2`` target."""
    return _structure_bert_target(data, 'res_c_2')


def structure_data_bert_res_c_3(data):
    """BERT inputs paired with the ``res_c_3`` target."""
    return _structure_bert_target(data, 'res_c_3')


def structure_data_bert_ret_o_2(data):
    """BERT inputs paired with the ``ret_o_2`` target."""
    return _structure_bert_target(data, 'ret_o_2')


def structure_data_bert_ret_o_3(data):
    """BERT inputs paired with the ``ret_o_3`` target."""
    return _structure_bert_target(data, 'ret_o_3')


def structure_data_bert_ret_c_d(data):
    """BERT inputs paired with the ``ret_c_d`` target."""
    return _structure_bert_target(data, 'ret_c_d')


def structure_data_bert_ret_o_d(data):
    """BERT inputs paired with the ``ret_o_d`` target."""
    return _structure_bert_target(data, 'ret_o_d')


def structure_data_bert_res_c_d(data):
    """BERT inputs paired with the ``res_c_d`` target."""
    return _structure_bert_target(data, 'res_c_d')

In [22]:
# Remove the items from the training list where LM score is 0
def filter_train(x):
    """Predicate for ``Dataset.filter``: keep a record unless its id is in LM_FILTER_LIST.

    Args:
        x: Parsed record dict; only ``x['_id']`` is read.

    Returns:
        Scalar boolean tensor, True when the id does not appear in
        LM_FILTER_LIST.
    """
    record_id = x['_id']

    # `_id` is parsed as int64 while the constant's dtype was inferred from the
    # pickled values; tf.equal needs both operands to share a dtype.
    excluded_ids = tf.cast(LM_FILTER_LIST, record_id.dtype)

    # tf.count_nonzero is not part of the TF2 namespace; reduce_any states the
    # membership test directly.
    is_excluded = tf.reduce_any(tf.equal(excluded_ids, record_id))

    return tf.logical_not(is_excluded)

def filter_not_train(x):
    """Pass-through predicate: keeps every record (``x`` is not inspected)."""
    return True

In [25]:
def generate_pipeline(
    train=False, valid=False, test=False, model=0, path=TF_MASTER_PATH, 
    batch_size=BATCH_SIZE
):
    """Build the batched tf.data pipeline for one split and one model variant.

    Args:
        train, valid, test: Split selector; the first flag that is set wins
            (train, then valid, then test).
        model: Index into the ``models`` list below. 0 selects the hidden-state
            variant, which is always read in deterministic order and unfiltered.
        path: Directory holding the ``<split>_*.records`` shards; the file
            pattern is appended directly, so it must end with a path separator.
        batch_size: Number of records per batch.

    Returns:
        Batched tf.data.Dataset yielding the tuples produced by the selected
        ``structure_data_*`` function.

    Raises:
        ValueError: If none of ``train`` / ``valid`` / ``test`` is set.
    """
    print("Generating Pipeline....")

    models = [
        structure_data_fin_HS,
        structure_data_fin_ret_c_2,
        structure_data_fin_ret_c_3,
        structure_data_fin_res_c_2,
        structure_data_fin_res_c_3,
        structure_data_fin_ret_o_2,
        structure_data_fin_ret_o_3,
        structure_data_fin_ret_c_d,
        structure_data_fin_ret_o_d,
        structure_data_fin_res_c_d,
        structure_data_bert_ret_c_2,
        structure_data_bert_ret_c_3,
        structure_data_bert_res_c_2,
        structure_data_bert_res_c_3,
        structure_data_bert_ret_o_2,
        structure_data_bert_ret_o_3,
        structure_data_bert_ret_c_d,
        structure_data_bert_ret_o_d,
        structure_data_bert_res_c_d,
    ]

    # Choose the function to structure data based on the training model
    func = models[model]

    if train:
        tfrecords_pattern_path = path + "train_*.records"
        deter = False
        tf_filter = filter_train
    elif valid:
        tfrecords_pattern_path = path + "valid_*.records"
        deter = True
        tf_filter = filter_not_train
    elif test:
        tfrecords_pattern_path = path + "test_*.records"
        deter = True
        tf_filter = filter_not_train
    else:
        # Previously this fell through and failed later on an unbound local.
        raise ValueError(
            "generate_pipeline needs one of train=True, valid=True or test=True"
        )

    # Model 0 overrides the split settings: fixed order and no filtering.
    if model==0:
        deter = True
        tf_filter = filter_not_train

    options = tf.data.Options()
    options.experimental_deterministic = deter

    shard_files = tf.io.matching_files(tfrecords_pattern_path)

    if deter:
        shards = tf.data.Dataset.from_tensor_slices(shard_files)
    else:
        # `shuffle` is a boolean flag (it used to receive the file count).
        shards = tf.data.Dataset.list_files(shard_files, shuffle=True)

    dataset = shards.interleave(tf.data.TFRecordDataset, cycle_length=12)
    dataset = dataset.with_options(options)
    dataset = dataset.map(_parse_features_function, num_parallel_calls=AUTOTUNE)
    dataset = dataset.filter(tf_filter)
    dataset = dataset.map(func, num_parallel_calls=AUTOTUNE)
    dataset = dataset.batch(batch_size)
    # Caching / prefetching (.cache().prefetch(prefetch)) is left disabled, as before.

    return dataset

### Test Pipeline

In [26]:
# Build the pipeline for the training shards with model=0, i.e. the
# structure_data_fin_HS layout, which generate_pipeline reads in deterministic
# order and without the training filter.
tickers_data = generate_pipeline(train=True, model=0)
tickers_data

Generating Pipeline....


<BatchDataset shapes: (((None, None), (None, None), (None, None)), ((None,), (None,), (None,), (None,), (None,))), types: ((tf.int32, tf.int32, tf.int32), (tf.int64, tf.string, tf.string, tf.int64, tf.int64))>

In [27]:
# Look at the first batch. With model=0 each element is
# ((input_ids, token_type_ids, attention_mask), (_id, ticker, close, ret_c_3, res_c_3)),
# so item[1][0] is the batch of record ids and item[0][0][0] is the first
# record's input_ids.
for item in tickers_data.take(1):
    print('answer: ', item[1][0].numpy())
    print()
    print('input_ids: ',item[0][0][0].numpy().shape)

answer:  [  5527674   3598616   1584485   1928292   3180964   1943174   6637246
   6835372   5512765   4822943   5421536   5390262   5883440   6358192
   1403201  21814131   5831780   2212387    473154  -8027757   1967932
   3736303   1370883   6701098   6195790   2392907   6049518  21513451
   2354194   1384899  21645633     67849   5974031   4842722   6052629
   6814999   6169225   3608525  -7980309   1336312   1773415   4872663
   2132120  17095156    470375  21710949   4863014   4008847   5440820
    799596   4834864   4323615   4112422   6369833   7330333  18101795
   5594892   2446025   2402910   4768464   7409936    342504   4641107
   1411718   5498986   6142216   4971675   6844288   3961283   4114045
   4171295   4237615   5786439  18437598   1124284   2123639   4543146
  21590053 -15782663   6641848   1247424   4958889   1301173   1580975
   6342832   7787391   2197533   5551381     24156  26478194   4062489
  21952940     90781   5411903   5122753   2522180   7760972   48294

In [28]:
# Print the whole first batch (every tensor of the element), which produces a
# long output.
for item in tickers_data.take(1):
    print(item)

((<tf.Tensor: shape=(128, 512), dtype=int32, numpy=
array([[  101,  2859, 12532, ...,     0,     0,     0],
       [  101, 13359, 16631, ...,     0,     0,     0],
       [  101,  1798,  2381, ...,     0,     0,     0],
       ...,
       [  101,   142, 19673, ...,     0,     0,     0],
       [  101, 10017,  3349, ...,     0,     0,     0],
       [  101,  7443,  8607, ...,     0,     0,     0]], dtype=int32)>, <tf.Tensor: shape=(128, 512), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)>, <tf.Tensor: shape=(128, 512), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>), (<tf.Tensor: shape=(128,), dtype=int64, num