In [2]:
import tensorflow as tf
import pandas as pd
import numpy as np
from requests_html import HTMLSession
from bs4 import BeautifulSoup
from datetime import datetime,timedelta
import sys
import re 
import nums_from_string
import pickle
from email.utils import formataddr
from newsapi.newsapi_client import NewsApiClient
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [3]:
def fetchSession(url):
    """Issue a GET request for `url` through a new HTMLSession.

    Returns the response object produced by `HTMLSession.get`.
    """
    return HTMLSession().get(url)

def getTrades(r):
    """Return the rows of the first table on the page, minus the first row.

    r: response whose `.html.find` locates elements by selector.
    Raises IndexError when the page contains no `table` element.
    """
    all_rows = r.html.find('table')[0].find('tr')
    # the leading row is dropped (presumably the column header row)
    return all_rows[1:]

def value_to_ints(value):
    """Convert a range string such as '$1,001 - $15,000' into [low, high].

    Commas, dollar signs and dashes are stripped; what is left is split once
    on a double space (the gap left behind by ' - ') and both halves are
    parsed as integers.

    Raises ValueError if the string does not yield exactly two integers.
    """
    stripped = value.translate(str.maketrans('', '', ',$-'))
    low, high = map(int, stripped.split('  ', 1))
    return [low, high]

def getTicker(t):
    """Extract the text inside the first pair of square brackets in `t`.

    Returns the bracketed text (e.g. 'AAPL' for '... [AAPL]'), or '' when
    `t` contains no bracketed section.
    """
    # Raw string: '\[' in a normal string literal is an invalid escape
    # sequence and triggers a warning on newer Python versions.
    found = re.search(r'\[(.*?)\]', t)
    return found.group(1) if found else ''

def getYahooInfo(ticker):
    """Scrape the first two tables of the Yahoo Finance quote page for `ticker`.

    Returns a pair of dicts (left_table, right_table), each mapping a label
    cell's text to the text of the cell that follows it. Returns (-1, -1)
    when the page has fewer than two tables, which is how an invalid ticker
    is detected.
    """
    url = 'https://finance.yahoo.com/quote/{}'.format(ticker)
    response = fetchSession(url)

    tables = response.html.find('table')
    # Fewer than two tables (including none at all) means there is nothing
    # to parse; the previous `== 1` check crashed on a page with no tables.
    if len(tables) < 2:
        return -1, -1

    def cells_to_dict(table):
        # <td> cells alternate label, value, label, value, ...
        texts = [cell.text for cell in table.find('td')]
        return dict(zip(texts[0::2], texts[1::2]))

    # Each table is parsed on its own so that a shorter table does not
    # truncate the other one.
    return cells_to_dict(tables[0]), cells_to_dict(tables[1])

def isStock(right_table):
    """Return True when the first key of `right_table` is 'Market Cap'.

    An empty table returns False instead of raising IndexError.
    """
    first_key = next(iter(right_table), None)
    return first_key == 'Market Cap'

def getMktCap(right_table):
    """Look up the 'Market Cap' entry of `right_table` (KeyError if absent)."""
    market_cap = right_table['Market Cap']
    return market_cap

def getOpen(left_table):
    """Look up the 'Open' entry of `left_table` (KeyError if absent)."""
    open_value = left_table['Open']
    return open_value

def parseToMillions(mkt_cap):
    """Express a market-cap string such as '1.5B' as a number of millions.

    The last character selects the scale: 'B' multiplies by 1,000 and 'T'
    by 1,000,000; any other suffix leaves the number as parsed.
    The number itself is the first value returned by
    `nums_from_string.get_nums`.
    """
    scale_by_suffix = {'B': 1000, 'T': 1000000}
    suffix = mkt_cap[-1:]
    amount = nums_from_string.get_nums(mkt_cap)[0]
    # an unrecognised suffix keeps the value untouched (factor of 1)
    return amount * scale_by_suffix.get(suffix, 1)

def cleanQuery(trade):
    """Build a news search query from a trade description.

    Every run of non-alphanumeric characters collapses to one space, the
    text is cut at the first occurrence of 'Common', and 'Stock' is appended.
    """
    alnum_only = re.sub('[^0-9a-zA-Z]+', ' ', trade)
    before_common, _, _ = alnum_only.partition('Common')
    return before_common + 'Stock'

def getTradesNews(t, key_path):
    """Fetch up to three news articles relevant to the trade description `t`.

    t: trade description; it is turned into a search query with cleanQuery.
    key_path: path of a text file holding the NewsAPI key.

    Returns a list of {'title': ..., 'url': ...} dicts, or -1 when no
    article was found.
    """
    with open(key_path) as f:
        # Strip surrounding whitespace: a key file normally ends with a
        # newline, which must not be sent as part of the key.
        key = f.read().strip()
    newsapi = NewsApiClient(api_key=key)

    search = cleanQuery(t)
    try:
        articles = newsapi.get_everything(
            q=search, language='en', sort_by='relevancy'
        )['articles'][:3]
    except IndexError:
        return -1
    if not articles:
        return -1
    return [
        {'title': article['title'], 'url': article['url']}
        for article in articles
    ]

def writeTradeToFile(trade, path):
    """Write the entries of `trade` to `path`, one per line.

    trade: mapping of field name to value.
    path: destination file; it is overwritten.

    The 'Yahoo!' entry is written as its value only; every other entry is
    written as 'key : value'. A blank line terminates the file.
    """
    with open(path, 'w') as f:
        for key, item in trade.items():
            # f-strings format `item` as one value; the former
            # `'%s\n' % (item)` failed or misformatted when item was a tuple.
            if key == 'Yahoo!':
                f.write(f'{item}\n')
            else:
                f.write(f'{key} : {item}\n')
        f.write('\n')

def getStockSearchQueries():
    """Scrape the Senate stock disclosures page and build news search queries.

    Returns a list of dicts with keys 'trade' (the raw description),
    'ticker' and 'query' (the cleaned search string), one per distinct
    ticker that Yahoo Finance reports as a stock.

    Exits the process with status 1 when the disclosures table cannot be
    found (e.g. the website is down).
    """
    r = fetchSession('https://sec.report/Senate-Stock-Disclosures')
    # if website is down
    try:
        trades = getTrades(r)
    except IndexError:
        sys.exit(1)

    # Descriptions containing any of these refer to company debt or funds,
    # not common stock (a fund can occasionally list a market cap instead
    # of an NAV, so the Yahoo check alone is not enough).
    excluded_keywords = ('Notes', 'Matures', 'Fund')

    search_queries = []
    found_tickers = set()

    # only every other row is read, starting with the first
    for row in trades[::2]:
        cells = row.find('td')
        # the description is taken from the second cell; skip short rows
        if len(cells) < 2:
            continue
        trade = cells[1].text

        ticker = getTicker(trade)
        if ticker == '' or ticker in found_tickers:
            continue

        # The former `('Notes' or 'Matures' or 'Fund') in trade` only ever
        # tested 'Notes'; every keyword is checked here.
        if any(keyword in trade for keyword in excluded_keywords):
            continue

        left_table, right_table = getYahooInfo(ticker)
        # invalid ticker given
        if left_table == -1:
            continue
        if not isStock(right_table):
            continue

        found_tickers.add(ticker)
        search_queries.append(
            {
                'trade' : trade,
                'ticker' : ticker,
                'query' : cleanQuery(trade)
            }
        )

    return search_queries

In [4]:
# Flip to True to re-scrape the disclosures and refresh the cached lists.
getNewQueries = False
if getNewQueries:
    search_queries = getStockSearchQueries()
    # one output file per field, written in the same order as before
    for list_path, field in (
        ('articles/query_list.txt', 'query'),
        ('articles/trade_list.txt', 'trade'),
        ('articles/ticker_list.txt', 'ticker'),
    ):
        with open(list_path, 'w') as f:
            f.writelines(q[field] + '\n' for q in search_queries)

In [5]:
# Load the cached search queries, one per line, without surrounding whitespace.
with open('articles/query_list.txt', 'r') as f:
    queries = [line.strip() for line in f]

In [6]:
key_path = '../res/news/news_key.txt'
with open(key_path) as f:
    # strip the trailing newline a key file normally ends with
    key = f.read().strip()
newsapi = NewsApiClient(api_key=key)

# Collect, for every query, the articles whose title mentions the company.
relevant_articles = []
for search in queries:

    articles = newsapi.get_everything(
        q=search, language='en', sort_by='relevancy'
        )['articles']

    # drop the last word of the query (the 'Stock' suffix that cleanQuery
    # appends) to get the words of the company name
    company_name_list = search.split(' ')[:-1]

    # Try the full name first, then ever shorter prefixes of it. An article
    # that matched a longer prefix is not considered again.
    remaining = articles
    for i in range(len(company_name_list), 0, -1):
        match = ' '.join(company_name_list[:i])
        unmatched = []
        # Partition into a new list instead of calling articles.remove()
        # while iterating, which made the loop skip the following article.
        for a in remaining:
            title = a['title']
            # articles without a title can never match
            if title and match in title:
                relevant_articles.append(
                    {
                        'title' : title,
                        'url' : a['url']
                    }
                )
            else:
                unmatched.append(a)
        remaining = unmatched

In [7]:
# Show the title/url pairs collected by the title-matching loop.
relevant_articles

[{'title': 'With 64% ownership of the shares, Verizon Communications Inc. (NYSE:VZ) is heavily dominated by institutional owners',
  'url': 'https://finance.yahoo.com/news/64-ownership-shares-verizon-communications-120133196.html'},
 {'title': 'Improved Earnings Required Before Verizon Communications Inc. (NYSE:VZ) Shares Find Their Feet',
  'url': 'https://finance.yahoo.com/news/improved-earnings-required-verizon-communications-120058233.html'},
 {'title': "Can Verizon Communications Inc.'s (NYSE:VZ) ROE Continue To Surpass The Industry Average?",
  'url': 'https://finance.yahoo.com/news/verizon-communications-inc-nyse-vz-110027099.html'},
 {'title': "Verizon Communications Inc.'s (NYSE:VZ) Prospects Need A Boost To Lift Shares",
  'url': 'https://finance.yahoo.com/news/verizon-communications-inc-nyse-vz-120114299.html'},
 {'title': 'Is it Worthy to Invest Your Hard Earned Money in Verizon Communications (VZ)?',
  'url': 'https://finance.yahoo.com/news/worthy-invest-hard-earned-money-

In [8]:
# Save just the headline text of every relevant article to CSV.
headlines = [article['title'] for article in relevant_articles]
df_headlines = pd.DataFrame(headlines)
df_headlines.to_csv('newdata/headlines.csv')

In [9]:
# Choose between the best saved model and the latest checkpoint.
useBest = True

model_path = (
    'bestmodel/sentiment_model.h5'
    if useBest
    else 'checkpoint_model/sentiment_model.h5'
)
sentiment_model = tf.keras.models.load_model(model_path)

In [10]:
# Print the layer-by-layer summary of the loaded model.
sentiment_model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 120, 12)           120000    
                                                                 
 dropout_24 (Dropout)        (None, 120, 12)           0         
                                                                 
 gru_8 (GRU)                 (None, 120, 4)            216       
                                                                 
 dropout_25 (Dropout)        (None, 120, 4)            0         
                                                                 
 global_average_pooling1d_8   (None, 4)                0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dropout_26 (Dropout)        (None, 4)                 0         
                                                      

In [11]:
remove_stopwords = False
trunc_type = 'post'
max_length = 120

# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load a tokenizer pickle that comes from a trusted source.
with open('tokenizer/tokenizer.pickle', 'rb') as f:
    tokenizer = pickle.load(f)

# Stopwords are only loaded when they are going to be removed. A set gives
# constant-time membership tests in the filtering step below.
stopwords = set()
if remove_stopwords:
    stopwords_path = '../res/stopwords/stopwords.txt'
    with open(stopwords_path, 'r') as f:
        # 'a' is added by hand because single-character lines are skipped
        stopwords.add('a')
        for l in f:
            if len(l.strip()) > 1:
                stopwords.add(l.strip())

# Score every headline and print it next to the predicted class index.
for a in relevant_articles:
    headline = a['title']
    if remove_stopwords:
        model_input = ' '.join(
            w for w in headline.split() if w not in stopwords
        )
    else:
        model_input = headline
    sequences = tokenizer.texts_to_sequences([model_input])
    padded = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type)
    # index of the highest-scoring output of the model
    sentiment = np.argmax(sentiment_model.predict(padded))
    # the original headline is printed even when stopwords were removed
    print(headline + '    SENTIMENT: ' + str(sentiment) + '\n')

With 64% ownership of the shares, Verizon Communications Inc. (NYSE:VZ) is heavily dominated by institutional owners    SENTIMENT: 1

Improved Earnings Required Before Verizon Communications Inc. (NYSE:VZ) Shares Find Their Feet    SENTIMENT: 1

Can Verizon Communications Inc.'s (NYSE:VZ) ROE Continue To Surpass The Industry Average?    SENTIMENT: 1

Verizon Communications Inc.'s (NYSE:VZ) Prospects Need A Boost To Lift Shares    SENTIMENT: 1

Is it Worthy to Invest Your Hard Earned Money in Verizon Communications (VZ)?    SENTIMENT: 1

Verizon Falls to 11-Year Low After Stumble on Subscriber Growth    SENTIMENT: 1

Verizon stock headed for its biggest gain of the year, remains highest-yielding Dow component    SENTIMENT: 0

Verizon: It's Time To Be Greedy    SENTIMENT: 1

Verizon: Blood In The Streets    SENTIMENT: 1

Verizon: A Highly Lucrative Buying Opportunity    SENTIMENT: 1

What The Market Gets Wrong About Verizon - A Contrarian View    SENTIMENT: 1

Lo And Behold Verizon At 10

### Should the user only see <b>recent</b> articles that are classified as good or bad, and not neutral ones?
- This could cut down on visible errors (e.g. the model is more likely to mislabel a good/bad article as neutral than to label a bad article as good).
- How recent should the articles be? From the past 2 weeks?