<h1 style="padding-top: 25px;padding-bottom: 25px;text-align: left; padding-left: 10px; background-color: #DDDDDD; 
    color: black;"> <img style="float: left; padding-right: 10px; width: 45px" src="https://raw.githubusercontent.com/Harvard-IACS/2018-CS109A/master/content/styles/iacs.png"> AC295: Advanced Practical Data Science </h1>

## Project: News Analytics for Stock Return Prediction

**Harvard University, Fall 2020**  
**Instructors**: Pavlos Protopapas  

### **Team: $\alpha\beta normal$ $Distri\beta ution$**
#### **Rohit Beri, Eduardo Peynetti, Jessica Wijaya, Stuart Neilson**

## Creating TFRecords Datasets & BERT Pipeline for Tiingo News Dataset

## Disks

### Connect Google Drive

In [1]:
# Mount Google Drive; the data paths configured later in this notebook live under /content/drive.
from google.colab import drive

# force_remount=False keeps an existing mount as it is instead of remounting.
drive.mount('/content/drive', force_remount=False)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Libraries

### Install Packages

In [2]:
# Install the third-party packages imported below (transformers, pymongo, dask).
# pymongo is pinned to 3.10.1 with the [srv] extra; the connection string used later is a mongodb+srv:// URI.
# NOTE(review): transformers and dask are not pinned, so a fresh run may pull different versions — consider pinning.
!pip3 install transformers
!pip3 install --upgrade pymongo[srv]==3.10.1
!pip3 install dask[dataframe]

Requirement already up-to-date: pymongo[srv]==3.10.1 in /usr/local/lib/python3.6/dist-packages (3.10.1)


### Imports

In [3]:
import os
import ast
import requests
import tarfile
import tempfile
import zipfile
import shutil
import csv
import json
import time
import sys
import subprocess
import logging

import pymongo
import bson
import dns

import numpy as np
import pandas as pd
import dask.dataframe as dd
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow_hub as hub

from tensorflow import keras
from tensorflow.python.keras import backend as K
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras import layers

from collections import Counter
from glob import glob
from threading import Thread

from bson import BSON, ObjectId
from pymongo import MongoClient
from tabulate import tabulate
from tqdm.notebook import tqdm, trange
from datetime import datetime, timedelta, date, time

from transformers import BertTokenizer, TFBertModel, TFBertForSequenceClassification

%matplotlib inline

## Variables

### Useful Constants and Variables

In [4]:
# Set google drive path for pipeline storage
PATH = '/content/drive/MyDrive/abnormal-distribution-project-data/headlines/'
TICKERS_PATH = '/content/drive/MyDrive/abnormal-distribution-project-data/components/sp1500.csv'
IND_PATH = '/content/drive/MyDrive/abnormal-distribution-project-data/components/industries.csv'
IND_GROUP_PATH = '/content/drive/MyDrive/abnormal-distribution-project-data/components/industry_groups.csv'
SECTOR_PATH = '/content/drive/MyDrive/abnormal-distribution-project-data/components/sectors.csv'
BERT_HIDDEN_PATH = '/content/drive/MyDrive/abnormal-distribution-project-data/headlines/bert_hs/'

# Helpful constants
START = datetime(2000,1,1)
END = datetime.now()
WINDOW = 365
AUTOTUNE = tf.data.experimental.AUTOTUNE

# Mongo Atlas credentials & host name.
# The password is never stored in the notebook: it is read from the
# MONGO_PASSWORD environment variable, with an interactive prompt as fallback.
# NOTE(review): a password was previously committed in this cell and in its
# saved output; it should be rotated on the Atlas side.
from getpass import getpass
from urllib.parse import quote_plus

PASSWORD = os.environ.get('MONGO_PASSWORD') or getpass('MongoDB Atlas password: ')
DBNAME = 'abnormalDistribution'
COLLECTION = 'tiingo'
_HOST_TEMPLATE = (
    'mongodb+srv://abnormal-distribution:{password}'
    '@cluster0.friwl.mongodb.net/{dbname}?retryWrites=true&w=majority'
)
# Percent-escape the password so reserved URI characters cannot break the connection string
HOST = _HOST_TEMPLATE.format(password=quote_plus(PASSWORD), dbname=DBNAME)
# Show the connection string with the password masked so it cannot leak into saved output
print(_HOST_TEMPLATE.format(password='********', dbname=DBNAME))

# Pipeline variables
batch_size = 512
prefetch = AUTOTUNE

mongodb+srv://abnormal-distribution:********@cluster0.friwl.mongodb.net/abnormalDistribution?retryWrites=true&w=majority


### Get Tickers and Sectors

In [5]:
# Read the ticker universe (every cell of the CSV, flattened) and the `sector` column
tickers = pd.read_csv(TICKERS_PATH).values.flatten()
sectors = pd.read_csv(SECTOR_PATH).sector.values

#industry = pd.read_csv(IND_PATH).industry.values
#ind_group = pd.read_csv(IND_GROUP_PATH).industry_group.values

# Keep both arrays sorted so positional slices (start / num_tickers) are reproducible
tickers, sectors = np.sort(tickers), np.sort(sectors)

## TFRecords

### Utils for Creating TFRecords

In [6]:
# The following functions can be used to convert a value to a type compatible
# with tf.train.Example.
# Credit: https://www.tensorflow.org/tutorials/load_data/tfrecord

def _bytes_feature(value):
  """Returns a bytes_list Feature from a string / byte.

  Args:
    value: bytes, a Python str (encoded as UTF-8 before wrapping), or an
      eager tensor whose `.numpy()` value is handed to BytesList.

  Returns:
    A tf.train.Feature wrapping a single-element BytesList.
  """
  if isinstance(value, type(tf.constant(0))):
    value = value.numpy() # BytesList won't unpack a string from an EagerTensor.
  if isinstance(value, str):
    value = value.encode('utf-8') # BytesList accepts bytes only, not str.
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _float_feature(value):
  """Wrap a single float / double in a tf.train.Feature holding a FloatList."""
  float_list = tf.train.FloatList(value=[value])
  return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
  """Wrap a single bool / enum / int / uint in a tf.train.Feature holding an Int64List."""
  int64_list = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=int64_list)

In [7]:
# Function to extract news from Mongo
def extract_news(news_collection, ticker='AAPL', tags = False,
                 fields={'description': 1, '_id': 1}):
    """Run the news query for one ticker or one tag and return the result of find().

    Args:
        news_collection: collection-like object exposing `find`.
        ticker: ticker symbol (lower-cased for the query) or, when `tags` is
            truthy, the tag value matched exactly as given.
        tags: when truthy, select documents whose `tickers` equals the empty
            list and whose `tags` matches `ticker`.
        fields: projection passed to `find` unchanged.

    Returns:
        Whatever `news_collection.find` returns; the call is made with
        no_cursor_timeout=True.
    """
    if tags:
        # Tag mode: articles without any ticker that carry the requested tag
        query = {'tickers': { '$eq': [] }, 'tags': ticker}
    else:
        # Ticker mode: the symbol is lower-cased before matching
        query = {'tickers': ticker.lower()}

    return news_collection.find(query, fields, no_cursor_timeout=True)

In [8]:
# Creates shard for the given ticker
def thread_tfrecord(ticker, tags, fields,
                    collection, shard_path, 
                    tokenizer):
    """Write one TFRecord shard with the tokenized descriptions of a ticker/tag.

    Args:
        ticker: ticker symbol or tag name, forwarded to `extract_news`.
        tags: forwarded to `extract_news` to select tag mode.
        fields: Mongo projection forwarded to `extract_news`.
        collection: news collection to query.
        shard_path: path of the TFRecord file to create.
        tokenizer: object exposing `encode_plus` (a BertTokenizer in this notebook).

    Each written example holds four bytes features: `_id` (the ObjectId's
    binary form) and the raw buffers of `input_ids`, `token_type_ids` and
    `attention_mask`, each padded/truncated to 512 tokens.
    """
    # Extract news from mongo
    results = extract_news(
        collection,
        ticker, 
        tags,
        fields
    )

    try:
        # TFRecord writer initialized
        with tf.io.TFRecordWriter(shard_path) as writer:

            print(f"Creating TFRecords for {ticker}")

            # Tokenize the record and write to TFRecords
            for item in tqdm(results):

                doc_id = item['_id'].binary

                # Skip articles without usable text (missing key, None, "" or [])
                description = item.get('description')
                if not description:
                    continue

                description_token = tokenizer.encode_plus(
                    description, 
                    add_special_tokens = True, # add [CLS], [SEP]
                    max_length = 512, # max length of the text that can go to BERT (<=512)
                    padding='max_length',
                    return_attention_mask = True, # add attention mask to not focus on pad tokens
                    truncation='longest_first',
                    return_tensors="tf"
                )

                # tobytes() yields the same buffer as the deprecated tostring().
                # NOTE(review): the reader decodes these buffers as tf.int32 —
                # confirm the tokenizer returns int32 tensors.
                description_input = description_token['input_ids'].numpy().tobytes()
                description_type = description_token['token_type_ids'].numpy().tobytes()
                description_attention = description_token['attention_mask'].numpy().tobytes()

                # Create tf.train.Example
                feature={
                    '_id' : _bytes_feature(doc_id),
                    'input_ids': _bytes_feature(description_input),
                    'token_type_ids': _bytes_feature(description_type),
                    'attention_mask': _bytes_feature(description_attention), 
                }
                features=tf.train.Features(feature=feature)
                example = tf.train.Example(features=features)

                # Write the TFRecord
                writer.write(example.SerializeToString())
    finally:
        # extract_news requests no_cursor_timeout=True, so the cursor must be
        # closed explicitly, also when tokenizing or writing fails.
        results.close()


In [9]:
# Function to create TFRecords for the dataset
def create_TFRecords(tickers = ['AAPL'], tags=False ,host=HOST, 
                     dbname=DBNAME, collection=COLLECTION, path=PATH):
    """
    Creates one TFRecords shard per ticker/tag whose elements are the
    document _id and the BERT-tokenized description.

    Args:
        tickers: ticker symbols, or tag names when `tags` is truthy.
        tags: write to `tf_records_new/tags/` and query by tag instead of ticker.
        host, dbname, collection: MongoDB connection string, database and
            collection to read the news from.
        path: folder under which `tf_records_new/` is created.

    A shard whose file already exists is skipped (even if it is incomplete),
    so the function can be re-run to resume. Shards are written by worker
    threads, at most 5 running at a time.
    """
    max_parallel = 5

    # Make sure both output folders exist, even if the tree was only partly created before
    for sub_dir in ('tags', 'tickers'):
        os.makedirs(path + 'tf_records_new/' + sub_dir, exist_ok=True)

    target_dir = 'tags' if tags else 'tickers'
    done_path = path + 'tf_records_new/' + target_dir + '/*.records'
    shard_path = path + 'tf_records_new/' + target_dir + '/{}.records'

    # Files already done
    records_done = set(glob(done_path))

    # Initialize BERT Tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',do_lower_case=True)

    # Data fields to be extracted
    fields={'description': 1, '_id': 1}

    # Setup connection to Mongo; closed in `finally` even if a step below fails
    client = pymongo.MongoClient(host=host)
    try:
        news_collection = client[dbname][collection]

        # One (not yet started) worker per shard that still has to be written
        pending = []
        for ticker in tqdm(tickers):
            file_path = shard_path.format(ticker)
            if file_path in records_done:
                continue
            pending.append(
                Thread(
                    target=thread_tfrecord,
                    args = (
                        ticker, tags, fields, 
                        news_collection, file_path, 
                        tokenizer
                    )
                )
            )

        # Run the workers in groups of `max_parallel`, waiting for each group to finish
        for first in range(0, len(pending), max_parallel):
            group = pending[first:first + max_parallel]
            for worker in group:
                worker.start()
            for worker in group:
                worker.join()
    finally:
        client.close()


### Create TFRecords Dataset

In [None]:
# Create TFRecords for Stocks: one shard per entry of `tickers`.
# Shards whose file already exists are skipped, so this cell can be re-run to resume.
create_TFRecords(tickers)

In [None]:
# Create TFRecords for Sectors: tags=True queries by tag and writes under tf_records_new/tags/.
create_TFRecords(sectors, tags=True)

### Read TFRecords Dataset

In [None]:
# Check data integrity: read the first 10 records of every shard and delete
# the shards that cannot be read or whose _id is not a valid ObjectId.
tick_path = PATH + 'tf_records_new/tickers/*.records'
tag_path = PATH + 'tf_records_new/tags/*.records'

files = glob(tag_path) + glob(tick_path)
i = 0
for file in tqdm(files):
    raw_dataset = tf.data.TFRecordDataset(file)
    try:
        for raw_record in raw_dataset.take(10):
            example = tf.train.Example()
            example.ParseFromString(raw_record.numpy())
            example = ObjectId(
                example.features.feature['_id'].bytes_list.value[0]
            )
    except Exception as err:
        # `Exception` (not a bare except) so that interrupting the cell
        # does not end up deleting a healthy shard.
        os.remove(file)
        print('removed file: ', file)
        print('reason: ', repr(err))
        i += 1

if i:
    print(f"{i} files were deleted, please rerun Create TFRecords")
else:
    print("TFRecords are in good shape")

In [13]:
# Open a single ticker shard (ALGN) as a raw TFRecord dataset for manual inspection.
filenames = ['/content/drive/MyDrive/abnormal-distribution-project-data/headlines/tf_records_new/tickers/ALGN.records']
raw_dataset = tf.data.TFRecordDataset(filenames)
raw_dataset

<TFRecordDatasetV2 shapes: (), types: tf.string>

In [14]:
# Parse the first serialized record back into a tf.train.Example and print it as-is.
for raw_record in raw_dataset.take(1):
    example = tf.train.Example()
    example.ParseFromString(raw_record.numpy())
    print(example)

features {
  feature {
    key: "_id"
    value {
      bytes_list {
        value: "_\274\206\370\204i\031\330\300[\"Z"
      }
    }
  }
  feature {
    key: "attention_mask"
    value {
      bytes_list {
        value: "\001\000\000\000\001\000\000\000\001\000\000\000\001\000\000\000\001\000\000\000\001\000\000\000\001\000\000\000\001\000\000\000\001\000\000\000\001\000\000\000\001\000\000\000\001\000\000\000\001\000\000\000\001\000\000\000\001\000\000\000\001\000\000\000\001\000\000\000\001\000\000\000\001\000\000\000\001\000\000\000\001\000\000\000\001\000\000\000\001\000\000\000\001\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000

In [15]:
# Create a dictionary describing the features: every field is one scalar byte string.
features = {
    key: tf.io.FixedLenFeature([], tf.string)
    for key in ('_id', 'input_ids', 'token_type_ids', 'attention_mask')
}

def _parse_image_function(example_proto):
  """Parse one serialized tf.train.Example with the `features` spec above."""
  parsed = tf.io.parse_single_example(example_proto, features)
  return parsed

parsed_image_dataset = raw_dataset.map(_parse_image_function)
parsed_image_dataset

<MapDataset shapes: {_id: (), attention_mask: (), input_ids: (), token_type_ids: ()}, types: {_id: tf.string, attention_mask: tf.string, input_ids: tf.string, token_type_ids: tf.string}>

In [16]:
# Decode the first parsed record: print its ObjectId, the first ten token ids
# and the length of the token vector.
for data in parsed_image_dataset.take(1):
    input_ids = tf.io.decode_raw(data['input_ids'], tf.int32)
    token_ids = input_ids.numpy()  # convert once, reuse for both prints
    print(ObjectId(data['_id'].numpy()))
    print(token_ids[:10])
    print(token_ids.shape)

5fbc86f8846919d8c05b225a
[  101  6207  3274  1010 14255 18684  2099  1998  3604 23221]
(512,)


## Pipeline

### Utils for BERT Pipeline

In [17]:
# Function to parse data features
def _parse_features_function(example):
    """Parse one serialized tf.train.Example into a dict of scalar string tensors.

    The returned dict has the keys `_id`, `input_ids`, `token_type_ids`
    and `attention_mask`.
    """
    field_names = ('_id', 'input_ids', 'token_type_ids', 'attention_mask')
    tf_records_features = {
        name: tf.io.FixedLenFeature([], tf.string) for name in field_names
    }
    return tf.io.parse_single_example(example, tf_records_features)


# Structure the data for training
def structure_data(data):
    """Turn a parsed record into `((input_ids, token_type_ids, attention_mask), _id)`.

    The three token fields are decoded from raw bytes as tf.int32; `_id` is
    passed through untouched.
    """
    def _as_int32(key):
        return tf.io.decode_raw(data[key], tf.int32)

    model_inputs = (
        _as_int32('input_ids'),
        _as_int32('token_type_ids'),
        _as_int32('attention_mask'),
    )
    return (model_inputs, data['_id'])

In [18]:
# Function to build pipeline
def build_pipeline(ticker=None, tags=False):
    """Build the batched tf.data pipeline for the shard of one ticker or tag.

    Args:
        ticker: name used to locate `<ticker>.records`.
        tags: read from `tf_records_new/tags/` when truthy, else from
            `tf_records_new/tickers/`.

    Returns:
        A dataset of `((input_ids, token_type_ids, attention_mask), _id)`
        batches of size `batch_size`, read in deterministic order.
    """
    sub_dir = "tags" if tags else "tickers"
    tfrecords_pattern_path = PATH + f"tf_records_new/{sub_dir}/{ticker}.records"

    # Keep element order deterministic
    options = tf.data.Options()
    options.experimental_deterministic = True

    shard_files = tf.data.Dataset.from_tensor_slices(
        tf.io.matching_files(tfrecords_pattern_path)
    )

    # Caching / prefetching is intentionally left disabled:
    #   .cache().prefetch(prefetch)
    return (
        shard_files
        .interleave(tf.data.TFRecordDataset, cycle_length=1)
        .with_options(options)
        .map(_parse_features_function, num_parallel_calls=AUTOTUNE)
        .map(structure_data, num_parallel_calls=AUTOTUNE)
        .batch(batch_size)
    )

In [19]:
def generate_multiple_pipelines(start, num_tickers=12, tags=False, 
                                tickers=tickers, sectors=sectors):
    """Build one pipeline per name in a slice of the ticker or sector list.

    Args:
        start: index of the first name to use.
        num_tickers: maximum number of names to take from `start`.
        tags: slice `sectors` (tag mode) when truthy, otherwise `tickers`.
        tickers, sectors: the name lists to slice.

    Returns:
        `(pipelines, names)` where `names` is the selected slice and
        `pipelines[k]` is the dataset built for `names[k]`.
    """
    universe = sectors if tags else tickers

    # Clamp both ends of the slice to the length of the list
    first = min(len(universe), start)
    last = min(len(universe), first + num_tickers)
    selected = universe[first:last]

    print("Generating Pipeline....")

    pipelines = [build_pipeline(name, tags) for name in tqdm(selected)]

    return pipelines, selected

### Test Pipeline

In [None]:
# Build pipelines for the first sectors (num_tickers defaults to 12); `tags` receives the sector names used.
tags_data, tags = generate_multiple_pipelines(0, tags=True)

In [21]:
# Peek at the first batch of the first sector pipeline
for item in tags_data[0].take(1):
    model_inputs, id_batch = item
    print('answer: ', id_batch[0].numpy())
    print()
    print('input_ids: ',model_inputs[0][0].numpy().shape)
print(tags)

answer:  b'_\xbc\x94^\x84i\x19\xd8\xc0d\x1e\x96'

input_ids:  (512,)
['Basic Materials' 'Communication Services' 'Consumer Cyclical'
 'Consumer Defensive' 'Energy' 'Financial Services' 'Healthcare'
 'Industrials' 'Misc' 'Real Estate' 'Technology' 'Utilities']


In [None]:
# Build pipelines for the first tickers (num_tickers defaults to 12); `ticks` receives the symbols used.
tickers_data, ticks = generate_multiple_pipelines(0, tags=False)

In [23]:
# Peek at the first batch of the first ticker pipeline
for item in tickers_data[0].take(1):
    model_inputs, id_batch = item
    print('answer: ', id_batch[0].numpy())
    print()
    print('input_ids: ',model_inputs[0][0].numpy().shape)
print(ticks)

answer:  b'_\xbc\x81\xe8\x84i\x19\xd8\xc0Z\xbd0'

input_ids:  (512,)
['A' 'AA' 'AAAGY' 'AAL' 'AAMRQ' 'AAP' 'AAPL' 'AAXN' 'AB' 'ABB' 'ABBV'
 'ABC']


## BERT Model

### Utils to build model to get BERT Hidden Layers & BERT Sentiment

In [24]:
# Build Model to get hidden layers
def get_BERT_hidden():
    """Build a Keras model that maps tokenized text to a frozen-BERT feature vector.

    The model takes `[input_ids, token_type_ids, attention_mask]`, each of
    shape (512,) and dtype int32, and returns the flattened element at
    index 1 of the `bert-base-uncased` output.
    """
    def _token_input():
        return layers.Input(shape=(512,), dtype=tf.int32)

    # Three inputs, created in the order the model expects them
    input_ids, token_type_ids, attention_mask = (
        _token_input(), _token_input(), _token_input()
    )

    # Pre-trained encoder, frozen
    encoder = TFBertModel.from_pretrained('bert-base-uncased', return_dict=True)
    encoder.trainable = False

    encoder_out = encoder(
        input_ids, 
        token_type_ids=token_type_ids, 
        attention_mask=attention_mask
    )

    # Index 1 is used as the pooled output — TODO confirm against the installed transformers version
    pooled = layers.Flatten()(encoder_out[1])

    return Model(
        inputs=[input_ids, token_type_ids, attention_mask], 
        outputs=pooled
    )

In [25]:
# Build Model to get sentiment
def get_BERT_sentiment():
    """Build a Keras model that maps tokenized text to softmax class probabilities.

    The model takes `[input_ids, token_type_ids, attention_mask]`, each of
    shape (512,) and dtype int32, runs a frozen
    `TFBertForSequenceClassification` loaded from `bert-base-uncased` and
    applies a softmax to its logits.

    NOTE(review): `bert-base-uncased` presumably ships no fine-tuned
    classification head, in which case these probabilities come from an
    untrained head — confirm before using them as a sentiment signal.
    """
    def _token_input():
        return layers.Input(shape=(512,), dtype=tf.int32)

    # Three inputs, created in the order the model expects them
    input_ids, token_type_ids, attention_mask = (
        _token_input(), _token_input(), _token_input()
    )

    # Pre-trained classifier, frozen
    classifier = TFBertForSequenceClassification.from_pretrained(
        'bert-base-uncased', 
        return_dict=True,
        output_hidden_states=True
    )
    classifier.trainable = False

    classifier_out = classifier(
        input_ids, 
        token_type_ids=token_type_ids, 
        attention_mask=attention_mask
    )

    # The classifier returns logits; turn them into probabilities
    probabilities = layers.Softmax()(classifier_out.logits)

    return Model(
        inputs=[input_ids, token_type_ids, attention_mask], 
        outputs=probabilities
    )

In [26]:
# Function for threading model
def thread_model(model_hs, model_sentiment, 
                 input, id, path_results):
    """Predict one batch with both models and save the result as a CSV file.

    Args:
        model_hs: object whose `predict(input)` returns one feature row per sample.
        model_sentiment: object whose `predict(input)` returns a 2-D array;
            column 1 is stored as the sentiment value.
        input: batch passed unchanged to both `predict` calls.
        id: per-sample identifiers, written to the `_id` column.
        path_results: destination CSV path.
    """
    hidden_states = model_hs.predict(input)
    sentiment_probs = model_sentiment.predict(input)

    columns = {
        '_id': id,
        'bert_features': list(hidden_states),
        'bert_sentiment': list(sentiment_probs[:,1]),
    }

    pd.DataFrame(columns).to_csv(path_results, index=False)

In [27]:
# Function to generate and save BERT Representations
def generate_BERT_features(start, num_tickers=100, tags=False, tickers=tickers, 
                           sectors=sectors, path=BERT_HIDDEN_PATH):
    """Run both BERT models over the TFRecord shards and save one CSV per batch.

    Args:
        start: index of the first ticker/sector to process.
        num_tickers: maximum number of tickers/sectors to process.
        tags: process sectors (tag shards) when truthy, otherwise tickers.
        tickers, sectors: name lists forwarded to `generate_multiple_pipelines`.
        path: root output folder; results go to
            `<path>/<tags|tickers>/<name>/<batch>_bert_tiingo.csv`.

    Batches whose CSV already exists are not predicted again, so the
    function can be re-run to resume. Predictions run in worker threads, at
    most 2 at a time.
    NOTE(review): both workers call `predict` on the same two Keras models
    concurrently — confirm this is safe for the installed TensorFlow version.
    """
    max_parallel = 2

    def _run_all(workers):
        # Start the whole group, then wait for every worker to finish
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()

    K.clear_session()

    model_hs = get_BERT_hidden()
    model_sentiment = get_BERT_sentiment()

    # Make sure both output roots exist (parents included)
    for sub_dir in ('tags', 'tickers'):
        os.makedirs(path + sub_dir, exist_ok=True)

    path = path + ('tags/' if tags else 'tickers/')
    path_results = path + '{}/{}_bert_tiingo.csv'

    # Forward the caller's lists; they were previously accepted but ignored
    train, ticks = generate_multiple_pipelines(
        start, num_tickers, tags, tickers=tickers, sectors=sectors
    )

    # Run prediction loop for each ticker
    for tick, dataset in tqdm(zip(ticks, train), total=len(ticks)):

        # Create directories if they don't exist
        os.makedirs(path + tick, exist_ok=True)

        threads = []

        print('Processing: ', tick)
        # Run predictor for each batch
        for batch_num, (model_inputs, doc_ids) in enumerate(tqdm(dataset)):

            # File path to save prediction outcomes
            temp_path = path_results.format(tick, batch_num)

            # Thread prediction only if batch prediction doesn't exist
            if not os.path.exists(temp_path):
                threads.append(
                    Thread(
                        target=thread_model, 
                        args = (model_hs, model_sentiment, 
                                model_inputs, doc_ids, temp_path)
                    )
                )

            # Run maximum of `max_parallel` threads at a time
            if len(threads) == max_parallel:
                _run_all(threads)
                threads = []

        # Run remaining threads if any
        if threads:
            _run_all(threads)

### Get and Save BERT Hidden Layers & Sentiment

In [None]:
# BERT Features for Stocks
# Requests up to 2000 tickers from index 0; the slice end is clamped to len(tickers) inside generate_multiple_pipelines.
generate_BERT_features(start=0, num_tickers=2000)

In [None]:
# BERT Features for Sectors
# Uses the default num_tickers=100; the slice end is clamped to len(sectors).
generate_BERT_features(start=0, tags=True)

## MongoDB Update with BERT Features

### Utils to Update MongoDB

In [30]:
def data_update(news_collection, file, new_path):
    """Push the BERT results of one batch CSV into MongoDB.

    Args:
        news_collection: pymongo collection holding the news documents.
        file: CSV with the columns `_id`, `bert_features` and
            `bert_sentiment`, as written by `thread_model`.
        new_path: destination passed to `shutil.copy2` once every row of the
            file matched a document.

    Returns:
        True when the number of matched documents equals the number of rows
        (the file is then copied to `new_path`), False otherwise.
    """
    # Read the data
    batch = pd.read_csv(file)

    # Get the number of updates
    count = batch.shape[0]

    # One update per row:
    #   `_id` is the text form of a bytes literal -> ObjectId
    #   `bert_features` is a whitespace-separated vector in brackets -> list of floats
    operations = [
        pymongo.UpdateOne(
            {"_id": ObjectId(ast.literal_eval(raw_id))},
            {"$set": 
             {
                 "bert_HS": [float(v) for v in features.strip('[]').split()], 
                 "bert_sentiment": float(sentiment) 
             }
            }
        )
        for raw_id, features, sentiment in zip(
            batch['_id'], batch['bert_features'], batch['bert_sentiment']
        )
    ]

    # Send the whole file in one bulk request instead of one round trip per
    # row; bulk_write rejects an empty request list, so guard that case.
    if operations:
        matched = news_collection.bulk_write(operations, ordered=False).matched_count
    else:
        matched = 0

    # Check if update was successful and copy the DataRecords to processed folder
    success = matched == count
    if success:
        shutil.copy2(file, new_path)

    return success

In [31]:
def update_mongo(start=0, num_tickers=100, tags=False, tickers=tickers, 
                 sectors=sectors, host=HOST, dbname=DBNAME, 
                 collection=COLLECTION, path=BERT_HIDDEN_PATH):
    """Upload the saved BERT batch CSVs of a slice of tickers/sectors to MongoDB.

    Args:
        start: index of the first ticker/sector to process.
        num_tickers: maximum number of tickers/sectors to process.
        tags: process sectors (under `<path>/tags/`) when truthy, otherwise
            tickers (under `<path>/tickers/`).
        tickers, sectors: name lists to slice.
        host, dbname, collection: MongoDB connection string, database and
            collection to update.
        path: root folder holding the batch CSV files.

    For every name, each `<name>/*_bert_tiingo.csv` file without a copy of
    the same file name in `processed/<name>/` is handed to `data_update` in
    a worker thread (at most 50 running at a time). An index on
    `bert_sentiment` is created at the end.
    """
    max_parallel = 50

    def _run_all(workers):
        # Start the whole group, then wait for every worker to finish
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()

    # Create Directory to move files which have been processed
    for sub_dir in ('tags', 'tickers'):
        os.makedirs(path + sub_dir + '/processed', exist_ok=True)

    # Set the paths and the tickers
    universe = sectors if tags else tickers
    path = path + ('tags/' if tags else 'tickers/')
    path_data = path + '{}/*_bert_tiingo.csv'
    path_results = path + 'processed/{}'
    start = min(len(universe), start)
    stop = min(len(universe), start+num_tickers)
    selected = universe[start: stop]

    # Setup connection to Mongo; closed in `finally` even if an update fails
    client = pymongo.MongoClient(host=host)
    try:
        news_collection = client[dbname][collection]

        print('Processing Tickers.....')
        # Create threads for data update
        for tick in tqdm(selected):
            # Set temp path to ticker data
            temp_path = path_data.format(tick)
            temp_path_results = path_results.format(tick)
            os.makedirs(temp_path_results, exist_ok=True)

            # File names already processed. Comparing base names avoids
            # str.lstrip, which strips a character set rather than a prefix
            # and could eat digits of the batch number.
            processed_names = {
                os.path.basename(done)
                for done in glob(temp_path_results + '/*_bert_tiingo.csv')
            }

            threads = []
            print("Generating Threads for: ", tick)
            for file in tqdm(glob(temp_path)):

                # Check if file already processed
                if os.path.basename(file) in processed_names:
                    continue

                # Generate Thread
                threads.append(
                    Thread(
                        target=data_update, 
                        args=(news_collection, file, temp_path_results)
                    )
                )

                if len(threads) >= max_parallel:
                    _run_all(threads)
                    threads = []

            if threads:
                _run_all(threads)

        # Create Index in mongoDb on bert_sentiment
        news_collection.create_index("bert_sentiment")
    finally:
        client.close()

### Update MongoDB with Hidden Layers and Sentiment

In [None]:
# Update Mongo with sector BERT features
# Uses the default num_tickers=100; update_mongo clamps the slice end to len(sectors).
update_mongo(start=0, tags=True)

In [None]:
# Update Mongo with stock BERT features
# Requests up to 2000 tickers from index 0; update_mongo clamps the slice end to len(tickers).
update_mongo(start=0, num_tickers=2000)

## Mongo BERT Feature Extraction for Time Series Processing

### Mongo Feature Extraction Utils

In [16]:
def get_BERT_features_from_mongo(
    ticker, start=None, end=None, tags=False,
    fields={
        "publishedDate": 1, 
        "bert_HS": 1, 
        "bert_sentiment": 1, 
        "_id": 0
    },
    count=False, path=BERT_HIDDEN_PATH, host=HOST, 
    dbname=DBNAME, collection=COLLECTION):
    """Fetch (or count) the articles of a ticker/tag that already have BERT features.

    Args:
        ticker: ticker symbol (lower-cased for the query) or, when `tags` is
            truthy, the tag name matched exactly as given.
        start, end: datetime bounds for `publishedDate`; default to
            1900-01-01 and the current time. Both are compared in ISO format.
        tags: select articles without tickers that carry the tag `ticker`.
        fields: Mongo projection. The hidden-state field is `bert_HS`, the
            name `data_update` writes it under.
        count: return the number of matching documents instead of a cursor.
        path: unused; kept for interface compatibility.
        host, dbname, collection: MongoDB connection string, database and
            collection to query.

    Returns:
        An int when `count` is truthy, otherwise a cursor sorted by
        `publishedDate`. In the cursor case the client is left open, because
        the cursor reads through it.
    """
    # Setup the dates
    start = (datetime(1900,1,1) if start is None else start).isoformat()
    end = (datetime.now() if end is None else end).isoformat()

    # Construct Query
    query = {
        "publishedDate": {"$gte": start, "$lte": end},
        "bert_sentiment": { "$exists": True }
    }
    if tags:
        # The tag is matched as given, the same way extract_news selects the
        # tag articles that receive BERT features.
        query["tickers"] = []
        query["tags"] = ticker
    else:
        query["tickers"] = ticker.lower()

    # Setup connection to Mongo
    client = pymongo.MongoClient(host=host)
    news_collection = client[dbname][collection]

    # Run Query
    if count:
        try:
            data = news_collection.count_documents(query)
        finally:
            # Nothing lazy is returned here, so the client can be released
            client.close()
    else:
        data = news_collection.find(query,fields).sort("publishedDate")

    return data

### MongoDB BERT Features Extraction

In [17]:
# Count the AAPL articles that already have a bert_sentiment field (count=True returns a number, not a cursor).
get_BERT_features_from_mongo("AAPL", count=True)

220521

## Scrap

In [8]:
# Scratch: notebook-level Mongo handles used by the ad-hoc count queries in this section.
client = pymongo.MongoClient(HOST)
db = client[DBNAME]
news_collection = db[COLLECTION]

In [14]:
# Count 'aapl' articles published between 1900-01-01 and now that already have bert_sentiment.
start = datetime(1900,1,1).isoformat()
end = datetime.now().isoformat()
query = {
    "tickers": 'aapl',
    "publishedDate": {"$gte": start, "$lte": end},
    "bert_sentiment": { "$exists": True }
}
data = news_collection.count_documents(query)

In [13]:
# Show the most recently computed count.
# NOTE(review): the execution counts in this section are out of order, so the value shown depends on which query ran last.
data

220521

In [None]:
# Count 'aapl' articles published between 1900-01-01 and now that do NOT have bert_sentiment yet.
start = datetime(1900,1,1).isoformat()
end = datetime.now().isoformat()
query = {
    "tickers": 'aapl',
    "publishedDate": {"$gte": start, "$lte": end},
    "bert_sentiment": { "$exists": False }
}
data = news_collection.count_documents(query)

In [15]:
# Show the most recently computed count.
data

12700

In [None]:
# Count every document in the collection that already has bert_sentiment.
# No date window is applied, so no start/end bounds are needed here.
query = {
    "bert_sentiment": { "$exists": True }
}
data = news_collection.count_documents(query)
data

In [None]:
# news_collection.create_index("bert_sentiment")