<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [16]:
import os
import datetime
import numpy as np
import re
import warnings
import pandas as pd
import calendar
import pickle
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
import dask.dataframe as dd
from dask.diagnostics import ProgressBar 
from dask.multiprocessing import get
import spacy

warnings.filterwarnings("ignore")

In [17]:
def load_data(data_path):
    """Read the raw news and stock-return inputs from a directory.

    Parameters
    ----------
    data_path : str
        Directory holding ``news_reuters.csv`` and ``stockReturns.json``.
        A trailing path separator is optional.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(stock_news_df, stock_price_df)``. The news frame is read without a
        header row and gets the column names ``tickers``, ``company``,
        ``date``, ``headline``, ``first_sent`` and ``priority``. The price
        frame is whatever ``pandas.read_json`` produces for the returns file.
    """
    # os.path.join keeps "inputs/" working exactly as before and also accepts
    # "inputs", which plain string concatenation turned into "inputsnews_...".
    news_file = os.path.join(data_path, 'news_reuters.csv')
    returns_file = os.path.join(data_path, 'stockReturns.json')

    stock_news_df = pd.read_csv(
        news_file, header=None,
        names=['tickers', 'company', 'date', 'headline', 'first_sent', 'priority'])
    stock_price_df = pd.read_json(returns_file)

    return stock_news_df, stock_price_df

In [75]:
def _price_change_signal(change):
    """Map a price change to "stay" (strictly between -1 and 1), "up" or "down"."""
    if -1 < change < 1:
        return "stay"
    # The thresholds are symmetric: exactly -1 is "down", so exactly 1 must be
    # "up" (the previous `x > 1` test sent a change of exactly 1 to "down").
    return "up" if change >= 1 else "down"


def transform_stock_price(price_df, duration):
    """Reshape one duration column of the returns table into long format.

    Parameters
    ----------
    price_df : pandas.DataFrame
        Frame indexed by ticker with one column per duration; each cell of
        ``price_df[duration]`` is expanded with ``pd.Series`` (e.g. a
        ``{date: change}`` mapping). The index is expected to be unnamed so
        that ``reset_index`` yields ``level_0`` / ``level_1``.
    duration : str
        Name of the column to reshape; also used as the suffix of the
        resulting ``price_change_<duration>`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``tickers``, ``date`` (cast to int64) and
        ``price_change_<duration>``. When ``duration == 'short'`` an extra
        ``signal`` column holds "up", "down" or "stay".
    """
    change_col = 'price_change' + '_' + duration

    # One row per ticker, one column per date -> stack into (ticker, date) rows.
    transform_df = price_df[duration].apply(pd.Series)
    transform_df = transform_df.stack().rename(change_col).reset_index()
    transform_df = transform_df.rename(columns={'level_0': 'tickers', 'level_1': 'date'})
    transform_df['date'] = transform_df['date'].astype('int64')

    # Only the short horizon is turned into a classification label.
    if duration == 'short':
        transform_df['signal'] = transform_df[change_col].map(_price_change_signal)
    return transform_df

def combine_stock_news(news_df, price_df):
    """Join the news rows with the reshaped price changes of every duration.

    Each column of ``price_df`` is passed through ``transform_stock_price``
    and inner-joined onto the running result on ``date`` and ``tickers``, so
    only news rows with a price entry for every duration are kept. The input
    ``news_df`` is copied and left unmodified.

    Returns the merged ``pandas.DataFrame``.
    """
    merged = news_df.copy()

    for horizon in price_df.columns:
        merged = merged.merge(
            transform_stock_price(price_df, horizon),
            how='inner',
            on=['date', 'tickers'],
        )

    return merged

def to_csv(data_path):
    """Load the raw inputs, merge them and save the result as CSV.

    Parameters
    ----------
    data_path : str
        Directory that contains the input files read by ``load_data`` and
        that receives ``news_price_df.csv``. A trailing path separator is
        optional.

    The frame is written with pandas' default settings, so the index is
    included as the first column. Returns ``None``.
    """
    news_df, price_df = load_data(data_path)
    combined_df = combine_stock_news(news_df, price_df)

    # os.path.join gives the same path as before for "inputs/" and also works
    # when the directory is passed without a trailing separator.
    combined_df.to_csv(os.path.join(data_path, "news_price_df.csv"))
    
    
def cleanup_text(sent):
    """Normalise a news headline/sentence before tokenisation.

    The steps are applied in order:

    1. collapse runs of whitespace and trim;
    2. expand the literal abbreviation ``U.S.`` to ``United States``;
    3. drop ``CORRECTED-`` markers;
    4. drop a leading run of capitals/digits/spaces that ends on a word
       boundary, plus an optional dash (e.g. ``UPDATE 1-``);
    5. drop a leading lone non-word character followed by a space;
    6. drop a trailing ``- word`` suffix;
    7. drop a trailing apostrophe-prefixed run of capitals/digits 1-9;
    8. remove dollar signs;
    9. remove ``<month name or abbreviation> <digits>`` dates;
    10. lower-case and trim.

    Parameters
    ----------
    sent : str
        Raw text.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    month_names = list(calendar.month_name)[1:] + list(calendar.month_abbr)[1:]
    month_pattern = '|'.join(month_names)

    sent = re.sub(r'\s+', ' ', sent).strip()
    # The dots must be escaped: an unescaped 'U.S.' also matches e.g. "UPS "
    # and would rewrite it (and swallow the following space).
    sent = re.sub(r'U\.S\.', 'United States', sent)
    sent = re.sub(r'CORRECTED-', '', sent)
    sent = re.sub(r'^(\W?[A-Z\s\d]+\b-?)', '', sent)
    sent = re.sub(r'^ ?\W ', '', sent)
    sent = re.sub(r'(\s*-+\s*[A-Za-z]+)$', '', sent)
    # NOTE(review): the class below is 1-9, so a trailing run containing 0 is
    # not stripped - confirm whether 0-9 was intended.
    sent = re.sub(r"(\'+[A-Z1-9]+\'*)$", '', sent)
    sent = re.sub(r"[$]+", '', sent)
    sent = re.sub(r'({}) \d+'.format(month_pattern), '', sent)

    return sent.lower().strip()

def spacy_tokenize(df, col):
    """Clean and tokenise a text column, returning filtered lemmas per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the text column.
    col : str
        Name of the column whose values are passed to ``cleanup_text``.

    Returns
    -------
    list of list of str
        One list per row of ``df`` (in row order) holding ``token.lemma_`` for
        every token that is not punctuation, whitespace or a stop word, and
        whose lemma contains no digit or dollar sign.

    Any exception raised by the pipeline propagates unchanged.
    """
    nlp = English()
    stop_words = construct_stop_words()
    docs = [cleanup_text(sent) for sent in df[col].tolist()]

    def token_filter(token, stop_words):
        """Return True for tokens that should be kept."""
        if token.is_punct or token.is_space or token.is_stop:
            return False
        # Check the extended stop-word set explicitly instead of relying only
        # on the vocabulary's is_stop flag picking up the added words.
        return token.lower_ not in stop_words

    # 0-9, not 1-9: lemmas such as "0" or "00" are numeric too.
    digit_or_dollar = re.compile(r'[$0-9]')

    filtered_tokens = []
    for doc in nlp.pipe(docs):
        lemmas = [tok.lemma_ for tok in doc if token_filter(tok, stop_words)]
        filtered_tokens.append(
            [lemma for lemma in lemmas if not digit_or_dollar.search(lemma)])

    return filtered_tokens



def construct_stop_words():
    """Extend the imported ``STOP_WORDS`` set and return it.

    Adds "uk", "united", "states" and "america" to the module-level set in
    place, so the additions are visible to every other user of that set.
    Calling the function repeatedly has no further effect.
    """
    STOP_WORDS.update(("uk", "united", "states", "america"))
    return STOP_WORDS

In [66]:
data_path = "inputs/"
news_df, price_df = load_data(data_path)

combined_df = combine_stock_news(news_df, price_df)
# Save the frame that was just built. Calling to_csv(data_path) here would
# read both input files and redo the merge only to write the same result.
combined_df.to_csv(data_path + "news_price_df.csv")

In [67]:
# (rows, columns) of the merged news/price frame.
combined_df.shape

(52298, 10)

In [68]:
# One list of filtered lemmas per headline, in the row order of combined_df.
filtered_tokens = spacy_tokenize(combined_df, 'headline')

In [69]:
# Work on a copy so combined_df stays as merged; add the token lists and the
# number of tokens kept for each headline.
combined_df1 = combined_df.assign(headlines_tokens=filtered_tokens)
combined_df1['head_tok_len'] = combined_df1['headlines_tokens'].map(len)

In [70]:
# Class balance of the up/down/stay label derived from the short-term change.
combined_df1['signal'].value_counts()

down    42596
stay     7866
up       1836
Name: signal, dtype: int64

In [76]:
# Load the packaged 'en_core_web_lg' pipeline; the verb-counting loop below
# reads token.pos_ and token.lemma_ from the documents it produces.
nlp = spacy.load('en_core_web_lg')


In [77]:
# Clean every headline up front so the pipeline only sees normalised text.
sent_list = combined_df['headline'].tolist()
cleaned_sent = [cleanup_text(headline) for headline in sent_list]

In [80]:
# Count how often each verb lemma occurs across all cleaned headlines,
# ignoring stop words, punctuation and whitespace tokens.
pos_count = {}
for doc in nlp.pipe(cleaned_sent):
    for token in doc:
        skip_token = token.is_stop or token.is_punct or token.is_space
        if not skip_token and token.pos_ == 'VERB':
            pos_count[token.lemma_] = pos_count.get(token.lemma_, 0) + 1

In [81]:
# Show the most frequent verb lemmas. The label is built from the same number
# as the slice, so it can no longer claim "top 10" while listing 50 entries.
top_n = 50
top_verbs = sorted(pos_count.items(), key=lambda kv: kv[1], reverse=True)[:top_n]
print("top {} VERBs {}".format(top_n, top_verbs))


top 10 VERBs [('say', 3767), ('buy', 1495), ('rise', 1022), ('see', 1016), ('sell', 904), ('pay', 881), ('fall', 803), ('launch', 751), ('set', 694), ('win', 685), ('hit', 674), ('announce', 670), ('cut', 665), ('raise', 659), ('seek', 623), ('hire', 532), ('expect', 525), ('boost', 523), ('take', 519), ('beat', 517), ('settle', 503), ('open', 490), ('offer', 475), ('expand', 454), ('end', 448), ('plan', 442), ('face', 438), ('approve', 431), ('report', 430), ('lead', 412), ('drop', 406), ('weigh', 404), ('sue', 402), ('add', 394), ('get', 394), ('invest', 362), ('help', 347), ('lose', 345), ('join', 326), ('feed', 301), ('lift', 297), ('drive', 296), ('look', 294), ('reject', 288), ('start', 287), ('extend', 285), ('jump', 284), ('update', 280), ('ask', 277), ('agree', 269)]
