# Turn Json files into pandas dataframe

In [98]:
#nltk.download() #download text data sets, incl stop words

import pandas as pd
import ijson
import json
import gensim
import re
import numpy as np

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

In [29]:
filename = 'Earnings_Preview_Facebook_Inc_NASDAQFB_moved_up_10_times_out_of_last_17_quarters__The_Independent_Republic.json'

def read_Google_articles(filename):
    """ 
    Converts Google News JSON file into a data frame. Takes in
    a .json file and returns a dataframe using the json's dictionary-like
    structure 
    """
    fin = open(filename)
    mydict = json.load(fin)
    fin.close()

    df = pd.DataFrame.from_dict(mydict, orient = 'index')
    return df

In [31]:
mydf = read_Google_articles(filename)
mydf

Unnamed: 0,0
body,"Facebook, Inc. (NASDAQ:FB) is projected to dec..."
category,Facebook
title,"Earnings Preview: Facebook, Inc. (NASDAQ:FB) m..."


In [4]:
article = mydf.iloc[0][0]
article

u'Facebook, Inc. (NASDAQ:FB) is projected to declare fiscal fourth quarter financial results right after the stock market\u2019s official close on February 01, 2017. The stock added about 1.4 percent in price since last results when it was at $127.17 a share. Based on the most relevant past-periods data, there is an 58.82 percent probability for this firm\u2019s share price to go up following next quarterly results. Earnings reaction history tells us that the equity price moved up 10 times out of last 17 reported quarters. It has beaten earnings-per-share estimates 58.% of the time in its last 12 earnings reports. It fell short of earnings estimates on 4 occasions, and it has met expectations 1 time.\n\nHere\u2019s how traders responded to FB earnings announcements over the past few quarters.\n\nGiven its history, the average earnings announcement surprise was 21.11 percent over the past four quarters. Back on November 2, 2016, it posted earnings per-share earnings at $0.88 which beat 

## Now clean the body/article
#### some things are not cleaned (such as B for billion and s after 2019. Clean later

In [5]:
# kaggle tutorial method: https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-1-for-beginners-bag-of-words
letters_only = re.sub("[^a-zA-Z]", " ", article)

# Insight Jupyter notebook method
# article.decode('utf-8').lower()

# Tokenization
## convert to lower case and split into individual words

In [10]:
# kaggle method
lower_case = letters_only.lower()
words = lower_case.split()

# Insight Jupyter notebook method (lens of results are the same)
re_tokenizer = RegexpTokenizer(r'\w+')
article_tokens = re_tokenizer.tokenize(lower_case)

len(article_tokens)

432

## Stop Words

#### can maybe add more stop words by looking at collection frequency. Currently using stopwords dictionary.

#### 'u' means python represents each word as unicode string

In [11]:
print stopwords.words("english")

[u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your', u'yours', u'yourself', u'yourselves', u'he', u'him', u'his', u'himself', u'she', u'her', u'hers', u'herself', u'it', u'its', u'itself', u'they', u'them', u'their', u'theirs', u'themselves', u'what', u'which', u'who', u'whom', u'this', u'that', u'these', u'those', u'am', u'is', u'are', u'was', u'were', u'be', u'been', u'being', u'have', u'has', u'had', u'having', u'do', u'does', u'did', u'doing', u'a', u'an', u'the', u'and', u'but', u'if', u'or', u'because', u'as', u'until', u'while', u'of', u'at', u'by', u'for', u'with', u'about', u'against', u'between', u'into', u'through', u'during', u'before', u'after', u'above', u'below', u'to', u'from', u'up', u'down', u'in', u'out', u'on', u'off', u'over', u'under', u'again', u'further', u'then', u'once', u'here', u'there', u'when', u'where', u'why', u'how', u'all', u'any', u'both', u'each', u'few', u'more', u'most', u'other', u'some', u'such', u'no', u'nor', u

# Remove Stopwords

#### note that 'no' and 'not' are removed

In [12]:
# Kaggle Tutorial Method
K_cleaned_tokens = [w for w in words if not w in stopwords.words("english")]
# len(words)

# Insight Jupyter notebook method
I_cleaned_tokens = []
stop_words = set(stopwords.words('english'))
for token in article_tokens:
    if token not in stop_words:
        I_cleaned_tokens.append(token)

# Check if methods are the same
len(I_cleaned_tokens) == len(K_cleaned_tokens), len(I_cleaned_tokens)

(True, 262)

# Stemming and Lemmatization

stemming selectively removed the end of words such as to remove tense
lemmatization accounts for variables such as part of speech, meaning, & context

Lemmatization usually creates more tokens, but extra computational power later may not prove worth it

Try with both

In [13]:
porter = PorterStemmer()
snowball = SnowballStemmer('english')
lancaster = LancasterStemmer()
lemmatizer = WordNetLemmatizer()

In [15]:
stemmed_tokens = []
lemmatized_tokens = []

for token in I_cleaned_tokens:
    stemmed_tokens.append(snowball.stem(token))
    lemmatized_tokens.append(lemmatizer.lemmatize(token))

In [169]:
len(lemmatized_tokens), len(stemmed_tokens)

(262, 262)

# Vectorization

## Count Vectorization - bag of words (uses token count)

train the bag of words model

In [101]:
# can do feature(token) reduction later for too rare and too common words

vectorizer = CountVectorizer()

# stem article
stemmed_article = ' '.join(wd for wd in stemmed_tokens)
# perform count-based vectorization
article_vect = vectorizer.fit_transform([stemmed_article])

# Kaggle method 
train_data_features = article_vect.toarray()
train_data_features

array([[ 1,  5,  5,  1,  1,  2,  1,  1,  1,  1,  1,  2,  1,  3,  1,  1,  1,
         1,  8,  1,  2,  1,  6,  1,  1,  2, 16,  2,  1,  1,  6,  4,  2,  3,
         1,  1,  1,  1,  1,  4,  1,  1,  1,  1,  1,  1,  2,  2,  1,  1,  5,
         1,  1,  1,  3,  1,  1,  1,  2,  2,  1,  1,  1,  1,  3,  6,  5,  2,
         4,  1,  1,  9,  1,  2,  1,  8,  3,  1,  1,  2,  1,  1,  1,  4,  1,
         3,  7,  1,  1,  1, 12,  1,  1,  6,  1,  5,  1,  1,  3,  3,  2,  3,
         1,  1,  2,  1,  1,  2,  1]])

Look at the vocabulary

In [94]:
# Look at the vocabulary
vocab = vectorizer.get_feature_names()

# Look at the counts for each word
dist = np.sum(train_data_features, axis = 0)
for tag, count in zip(vocab, dist):
    print count, tag

1 ad
5 analyst
5 announc
1 approach
1 april
2 averag
1 back
1 base
1 beat
1 beaten
1 call
2 came
1 cap
3 chang
1 climb
1 close
1 compani
1 compar
8 consensus
1 cover
2 data
1 date
6 day
1 declar
1 declin
2 drop
16 earn
2 end
1 ep
1 equiti
6 estim
4 expect
2 facebook
3 fb
1 februari
1 fell
1 financi
1 firm
1 fiscal
4 follow
1 four
1 fourth
1 gain
1 given
1 go
1 higher
2 histori
2 inc
1 januari
1 juli
5 last
1 level
1 low
1 made
3 market
1 met
1 month
1 move
2 nasdaq
2 near
1 next
1 novemb
1 occas
1 offici
3 past
6 per
5 percent
2 period
4 posit
1 post
1 present
9 price
1 probabl
2 project
1 public
8 quarter
3 rang
1 reaction
1 rebound
2 recent
1 record
1 releas
1 relev
4 report
1 respond
3 result
7 revenu
1 right
1 send
1 session
12 share
1 short
1 sinc
6 stock
1 street
5 surpris
1 target
1 tell
3 th
3 time
2 top
3 trade
1 trader
1 us
2 versus
1 volum
1 wall
2 week
1 year


In [97]:
# just look at 10 most frequent words for this article
freqs = [(word, article_vect.getcol(idx).sum()) for word, idx in vectorizer.vocabulary_.items()]
print sorted (freqs, key = lambda x: -x[1])[0:10]

[(u'earn', 16), (u'share', 12), (u'price', 9), (u'consensus', 8), (u'quarter', 8), (u'revenu', 7), (u'stock', 6), (u'day', 6), (u'per', 6), (u'estim', 6)]


## Term Frequency - Inverse Document Frequency (tf-idf) Vectorization

if a word occurs frequently in 1 document, it is important, but if it occurs accross many documents, it is less informative and differentiating

Insight tutorial uses NYTimes corpus: https://open.blogs.nytimes.com/2008/10/14/announcing-the-new-york-times-campaign-finance-api/

# Putting it all together
1) put all functions together 
2) changed list to set for stopwords because search is faster

In [79]:
# first preprocess a bunch of articles

def preprocess_article_content(text_df):
    """
    Simple preprocessing pipeline which uses RegExp, sets basic token requirements, and removes stop words.
    Set up to work with df files created from JSONs
    """
    print 'preprocessing article text...'

    # tokenizer, stops, and stemmer
    tokenizer = RegexpTokenizer(r'\w+')
    stop_words = set(stopwords.words('english'))  # can add more stop words to this set
    stemmer = SnowballStemmer('english')

    # process articles
    article_list = []
    for row, article in enumerate(text_df['full_text']):
        cleaned_tokens = []

        letters_only = re.sub("[^a-zA-Z]", " ", article)
        lower_case = letters_only.lower()
        tokens = re_tokenizer.tokenize(lower_case)

    #         tokens = tokenizer.tokenize(article.decode('utf-8').lower())
        for token in tokens:
            if token not in stop_words:
                if len(token) > 0 and len(token) < 20: # removes non words
                    if not token[0].isdigit() and not token[-1].isdigit(): # removes numbers
                        stemmed_tokens = stemmer.stem(token)
                        cleaned_tokens.append(stemmed_tokens)
        # add process article
        article_list.append(' '.join(wd for wd in cleaned_tokens))

    # echo results and return
    print 'preprocessed content for %d articles' % len(article_list)
    return article_list

In [71]:
# Create a df of a new file
filename2 = '2_Bullish_Trades_to_Juice_Your_Facebook_Inc_FB_Stock_Position.json'
comparison_df = read_Google_articles(filename2)
comparison_df

Unnamed: 0,0
body,"Late 2016, I wrote how to catch the Facebook I..."
category,Facebook
title,2 Bullish Trades to Juice Your Facebook Inc (F...


In [72]:
comparison_df.iloc[0][0]

u'Late 2016, I wrote how to catch the Facebook Inc (NASDAQ: ) knife. The trade was pure profits out of thin air. Since then, Facebook stock caught a 13% rally, but it still isn\u2019t expensive. Let\u2019s trade it again.\n\nFundamentally, FB execution has been flawless. It had doubters, but management proved them all wrong. After all, it would take a major gaffe to ruin the potential of a billion users.\n\nTechnically, while this is not an obvious short-term entry point, it should be a good point for a long-term trade. I am not looking to profit from this trade in the next few days. Instead, I am looking out to next year.\n\nThe Trade: Sell the FB Jan 2018 $100 put. This is a bullish trade for which I collect $4.50 per contract to open. To be successful, I need Facebook stock to stay above my sold strike while I hold the position open. Selling naked puts is risky and I only do it if I am willing and able to own FB stock at the strike sold. My breakeven price would be the strike price 

In [81]:
# create new df of articles
combined_articles_df = pd.DataFrame([mydf.iloc[0][0], comparison_df.iloc[0][0]])
combined_articles_df.columns = ['full_text']
combined_articles_df

Unnamed: 0,full_text
0,"Facebook, Inc. (NASDAQ:FB) is projected to dec..."
1,"Late 2016, I wrote how to catch the Facebook I..."


In [82]:
# process articles
processed_article_list = preprocess_article_content(combined_articles_df)

# vectorize the articles and compute count matrix

tf_vectorizer = TfidfVectorizer()
tfidf_article_matrix = tf_vectorizer.fit_transform(processed_article_list)

print tfidf_article_matrix.shape

preprocessing article text...
preprocessed content for 2 articles
(2, 238)


In [85]:
len(processed_article_list)

2

## Ngrams
Data is currently treated as unigrams, they may be updated to bigrams later, any larger increases computational costs drastically

# Modeling
## Random Forest

In [99]:
FB_jan25_26 = [131.479996, 132.779999]

# Try random forest on bag of words
forest = RandomForestClassifier(n_estimators=100)
# fit the forest to the training set, using the bag of words features
forest = forest.fit(train_data_features, FB_jan25_26)

NameError: name 'train' is not defined

In [100]:
train_data_features

array([[ 1,  5,  5,  1,  1,  2,  1,  1,  1,  1,  1,  2,  1,  3,  1,  1,  1,
         1,  8,  1,  2,  1,  6,  1,  1,  2, 16,  2,  1,  1,  6,  4,  2,  3,
         1,  1,  1,  1,  1,  4,  1,  1,  1,  1,  1,  1,  2,  2,  1,  1,  5,
         1,  1,  1,  3,  1,  1,  1,  2,  2,  1,  1,  1,  1,  3,  6,  5,  2,
         4,  1,  1,  9,  1,  2,  1,  8,  3,  1,  1,  2,  1,  1,  1,  4,  1,
         3,  7,  1,  1,  1, 12,  1,  1,  6,  1,  5,  1,  1,  3,  3,  2,  3,
         1,  1,  2,  1,  1,  2,  1]])