In [1]:
import newspaper 
import bs4 as bs
import requests
import spacy
import re
import pandas as pd
import numpy as np
from keras.preprocessing.sequence import pad_sequences


Using TensorFlow backend.


In [2]:
nlp = spacy.load('en')

In [3]:
def tokenize(sent):
    return [x.strip() for x in re.split('(\W+)?', sent) if x.strip()]

In [4]:
# Top Tech Companies
ticker_tech_comp = ['GOOG', "AMZN", "FB", "AAPL", "MSFT"]
tech_comp = ["google","amazon","facebook","apple","microsoft"]

In [5]:
# Prevents caching so that all articles are released
techcrunch_paper = newspaper.build('https://techcrunch.com/', memoize_articles=False) # memoize looks wrong but its not

In [6]:
for article in techcrunch_paper.articles:
    print(article.url)

http://techcrunch.cn/2017/06/13/emil-michael-ubers-svp-of-business-has-left-the-company/?ncid=rss
http://techcrunch.cn/2017/06/13/emil-michael-ubers-svp-of-business-has-left-the-company/
http://techcrunch.cn/2017/06/13/emil-michael-ubers-svp-of-business-has-left-the-company/feed/
http://techcrunch.cn/2017/06/13/comscore-cord-cutters-tend-to-have-lower-incomes-than-pay-tv-subscribers-watch-less-television/?ncid=rss
http://techcrunch.cn/2017/06/13/comscore-cord-cutters-tend-to-have-lower-incomes-than-pay-tv-subscribers-watch-less-television/
http://techcrunch.cn/2017/06/13/comscore-cord-cutters-tend-to-have-lower-incomes-than-pay-tv-subscribers-watch-less-television/feed/
http://techcrunch.cn/2017/06/05/toyotas-flying-car-project-takes-a-tentative-test-flight/?ncid=rss
http://techcrunch.cn/2017/06/05/toyotas-flying-car-project-takes-a-tentative-test-flight/
http://techcrunch.cn/2017/06/05/toyotas-flying-car-project-takes-a-tentative-test-flight/feed/
http://techcrunch.cn/2017/06/02/us-ap

In [7]:
# Create a dictionary from the tech company tickers, with none as the value of the keys
article_dict = dict.fromkeys(ticker_tech_comp, None)

To determine if an article is talking about a specific company, I run through the company urls which contains the article's title. If the company name is in the title, then that means that the article is about that company

In [8]:
# Fill the dictionary with the first related news article 
for article in techcrunch_paper.articles:
    for company, company_ticker in zip(tech_comp, ticker_tech_comp):
        # Appears to extract english and japanese articles, we only want english
        if company in article.url and "jp" not in article.url: 
            if(article_dict[company_ticker] == None):
                article_dict[company_ticker] = article

In [9]:
article_dict

{'AAPL': <newspaper.article.Article at 0x11d978630>,
 'AMZN': <newspaper.article.Article at 0x11d98bbe0>,
 'FB': <newspaper.article.Article at 0x11d99b278>,
 'GOOG': <newspaper.article.Article at 0x124e20470>,
 'MSFT': <newspaper.article.Article at 0x11d992e10>}

In [10]:
# Download the articles and parse through them so we can use the text
for key in article_dict:
    article_dict[key].download()
    article_dict[key].parse()

Building prefix dict from /Users/Angus/anaconda/lib/python3.6/site-packages/jieba/dict.txt ...
Loading model from cache /var/folders/l7/r9r7m7qn5mz1d267mf3jqfgm0000gn/T/jieba.cache
Loading model cost 1.5296659469604492 seconds.
Prefix dict has been built succesfully.


Next we want to create a vocabularly list for each of the words used in the article.
We will put this in a dictionary with an index associated with each of the vocab keys

Vocab for each of the the articles individually (Need to ask Henry)
----
Should I be using the spacy vectors

# Create a dictionary from the tech company tickers, with none as the value of the keys
vocab_dict = dict.fromkeys(ticker_tech_comp, None)


# Tokenize each of the articles and create a sorted unique vocabulary list for each company
for key in vocab_dict:
    vocab_dict[key] = sorted(set(tokenize(tech_dict[key].text)))
    vocab_dict[key] = dict((vocab, i + 1) for i, vocab in enumerate(vocab_dict[key]))


Vocab for all of the the articles (Need to ask Henry)
----
Only concern is I have to retrain every time i scrape new articles?

In [11]:
# Run through each article and append new vocab in vocab list
vocab = set()
for ticker in ticker_tech_comp:
    vocab |= set((tokenize(article_dict[ticker].text)))

  return _compile(pattern, flags).split(string, maxsplit)


In [12]:
# Put in a dictionary with an index for each of the vocabulary
# Reserve 0 for masking via pad_sequences
vocab = sorted(vocab)
vocab_id = dict((word, i + 1) for i, word in enumerate(vocab))
vocab_size = len(vocab) + 1

In [13]:
# Get the max article length for tokenize -> Spacy does different parsing so lengths may differ
article_len = []
for ticker in ticker_tech_comp:
    article_len.append(len(tokenize(article_dict[ticker].text)))

article_maxlen = max(article_len)

  return _compile(pattern, flags).split(string, maxsplit)


In [14]:
def vectorize_stories(article, vocab_id, article_maxlen):
    word_id = []
    for word in article:
        word_id.append(vocab_id[word]) 
    return pad_sequences([word_id], maxlen=article_maxlen)

In [15]:
# Create a dictionary from the tech company tickers, with none as the value of the keys
article_vectorized_dict = dict.fromkeys(ticker_tech_comp, None)

for ticker in ticker_tech_comp:
    article_vectorized_dict[ticker] = vectorize_stories(tokenize(article_dict[ticker].text), vocab_id, article_maxlen)

article_vectorized_dict

  return _compile(pattern, flags).split(string, maxsplit)


{'AAPL': array([[ 55, 724,  39, ..., 547, 556,  10]], dtype=int32),
 'AMZN': array([[  0,   0,   0, ..., 400, 627,  10]], dtype=int32),
 'FB': array([[  0,   0,   0, ..., 707, 558,  10]], dtype=int32),
 'GOOG': array([[  0,   0,   0, ..., 851,  93, 175]], dtype=int32),
 'MSFT': array([[ 0,  0,  0, ..., 52, 22, 78]], dtype=int32)}

In [16]:
#df = pd.read_csv('final_stock_features_dfs/{}.csv'.format('FB'), index_col=0)
df = pd.read_csv('final_stock_features_dfs/{}.csv'.format('FB'))

df

Unnamed: 0,Perc_Change_1d,100_mov_avg,Open_Prev_1d,High_Prev_1d,Low_Prev_1d,Close_Prev_1d,Volume_Prev_1d
0,0.006053,38.000000,,,,,
1,-0.109861,38.115000,,,,38.00,0.0
2,-0.089039,36.753333,42.05,45.00,38.00,38.23,580587742.0
3,0.032258,35.315000,36.53,36.66,33.00,34.03,168309831.0
4,0.032188,34.652000,32.61,33.59,30.94,31.00,102053826.0
5,-0.033909,34.381667,31.37,32.50,31.36,32.00,73721135.0
6,-0.096208,34.028571,32.95,33.21,31.77,33.03,50275879.0
7,-0.022538,33.380000,32.90,32.95,31.11,31.91,37189630.0
8,0.050018,32.803333,31.48,31.69,28.65,28.84,78060799.0
9,-0.063514,32.483000,28.70,29.55,27.86,28.19,57267867.0


In [17]:
numStockDates = len(df)

In [18]:
df_fb = pd.DataFrame(article_vectorized_dict['FB'])
df_fb = df_fb.append(list(article_vectorized_dict['FB'])*(numStockDates-1),ignore_index=True)

In [37]:
final_df = pd.concat([df, df_fb], axis=1)



In [38]:
final_df = final_df.iloc[2:]
final_df.dropna(inplace=True)



In [39]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(final_df.drop(["Perc_Change_1d"], axis=1), final_df["Perc_Change_1d"], test_size=0.33, random_state=42)

In [44]:
# Split the x_train and x_test into the articles and the financial features to be put into the model
# Training Set
x_train_stock_features = x_train.iloc[:,:6]
x_train_articles = x_train.iloc[:,6:]
# Testing Set
x_test_stock_features = x_test.iloc[:,:6]
x_test_articles = x_test.iloc[:,6:]


In [48]:
from sklearn import preprocessing
x_train_stock_features_normalized = preprocessing.scale(x_train_stock_features) 
x_test_stock_features_normalized = preprocessing.scale(x_test_stock_features) 


In [51]:
x_train_stock_features_normalized

array([[-0.17313751,  0.00461614, -0.0079307 , -0.01365012, -0.03053764,
        -0.20058973],
       [ 0.16216197,  0.14422263,  0.13780722,  0.15867996,  0.13892046,
        -0.74748437],
       [-1.28213343, -1.0821143 , -1.03826383, -1.07341434, -1.02950358,
         3.94658823],
       ..., 
       [ 1.43355598,  1.10444287,  1.07793406,  0.98481194,  0.99588679,
         0.35712927],
       [ 0.49016933,  0.57011409,  0.56850315,  0.53780613,  0.52307778,
        -0.25018732],
       [ 1.42895871,  1.18040137,  1.22080414,  1.19502833,  1.23490639,
        -0.58032435]])

In [49]:
x_test_stock_features_normalized

array([[ 1.634425  ,  1.77502562,  1.76497586,  1.79480739,  1.79039209,
        -0.81690989],
       [ 1.45032209,  1.44523713,  1.43785953,  1.46398065,  1.43698063,
        -0.81104998],
       [-0.20992976, -0.08833121, -0.10271118, -0.08683489, -0.09308612,
        -0.41406481],
       ..., 
       [-0.57916285, -0.58275468, -0.56872931, -0.56220265, -0.55176907,
         0.23442301],
       [-1.22966475, -1.31570442, -1.32984921, -1.31673807, -1.32989656,
        -0.04935758],
       [-1.13190807, -1.30740785, -1.29447831, -1.30525828, -1.28633594,
         0.64068029]])

In [52]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import LSTM
from keras.optimizers import Adam
from keras.layers import recurrent
from keras import layers
from keras.models import Model

EMBED_HIDDEN_SIZE = 50

RNN = recurrent.LSTM

print('Build model...')
# Put articles into an LSTM
articles = layers.Input(shape=(article_maxlen,), dtype='int32')
encoded_articles = layers.Embedding(vocab_size, EMBED_HIDDEN_SIZE)(articles)
encoded_articles = layers.Dropout(0.3)(encoded_articles)
encoded_articles = RNN(EMBED_HIDDEN_SIZE)(encoded_articles)
# Grab Stock Features 
stock_features = layers.Input(shape=(6,))
# Concat the two layers
merged = layers.concatenate([stock_features, encoded_articles])
# Dense with one prediction for the percentage change
pred = layers.Dense(1)(merged)
# Create model
model = Model([articles, stock_features], pred)
model.compile(loss='mean_squared_error',
              optimizer=Adam(lr=0.001),
            )


Build model...


In [None]:
BATCH_SIZE = 32
EPOCHS = 30

print('Training')

model.fit([x_train_articles.values, x_train_stock_features_normalized], 
          y_train.values,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          validation_split=0.05)

loss = model.evaluate([x_test_articles.values, x_train_stock_features_normalized], 
                      y_test.values,
                      batch_size=BATCH_SIZE)

Training
Train on 813 samples, validate on 43 samples
Epoch 1/30

Converting each of the articles using the spacy vectors
----
Better for reusing same training model?

# Create a dictionary from the tech company tickers, with none as the value of the keys
article_nlp_dict = dict.fromkeys(ticker_tech_comp, None)

# Convert each into a spacy processed text
for ticker in ticker_tech_comp:
    article_nlp_dict[ticker] = nlp(tech_dict[ticker].text)


# Get the max article length for Spacy 
article_len_spacy = []
for ticker in ticker_tech_comp:
    article_len_spacy.append(len(article_nlp_dict[ticker]))

article_maxlen_spacy = max(article_len_spacy)

# Find a way to pad the NLP of the articles to the max length
article_vector_dict = dict.fromkeys(ticker_tech_comp, None)
# Vector format for each of the tech companies
for ticker in ticker_tech_comp:
    article_vector_dict[ticker] = [word.vector for word in article_nlp_dict[ticker]]