# Time price prediction

# Random Forest

### TO DO: integrate all data, split into test/train cases, separate functions into their own .py files

In [2]:
import pandas as pd
import ijson
import json
import gensim
import re
import numpy as np
import os
import nltk.data

# from os import walk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

### Test articles from 2 days

In [8]:
Jan25_article = 'Earnings_Preview_Facebook_Inc_NASDAQFB_moved_up_10_times_out_of_last_17_quarters__The_Independent_Republic.json'
Jan26_article = 'Facebook_Inc_FB_Hires_Former_Google_Exec_to_Lead_Oculus.json'

### Create dataframe from JSON files

In [9]:
articlelist = [Jan25_article, Jan26_article]
articlelist

['Earnings_Preview_Facebook_Inc_NASDAQFB_moved_up_10_times_out_of_last_17_quarters__The_Independent_Republic.json',
 'Facebook_Inc_FB_Hires_Former_Google_Exec_to_Lead_Oculus.json']

In [10]:
def read_Google_articles(articlelist, path):
    """ 
    Converts Google News JSON file into a data frame. Takes in
    a .json file and returns a dataframe using the json's dictionary-like
    structure 
    """
    
    with open(path + articlelist[0],'r') as first:
        firstdict = json.load(first)
        combined_df = pd.DataFrame.from_dict(firstdict, orient = 'index')
        combined_df = combined_df.T
    
    for article in articlelist:
        with open(path + article, 'r') as fin:
            mydict = json.load(fin)
        current_df = pd.DataFrame.from_dict(mydict, orient = 'index')
        current_df = current_df.T
        combined_df = combined_df.append(current_df, ignore_index=True)
    
    # USE CONCAT WITH .APPEND DOESN'T WORK!!!
#     final_df = pd.concat([combined_df, current_df])
        
    return combined_df

# Test on 2 files

In [11]:
path = ''
all_articles_df = read_Google_articles(articlelist, path)
all_articles_df

Unnamed: 0,body,category,title
0,"Facebook, Inc. (NASDAQ:FB) is projected to dec...",Facebook,"Earnings Preview: Facebook, Inc. (NASDAQ:FB) m..."
1,"Facebook, Inc. (NASDAQ:FB) is projected to dec...",Facebook,"Earnings Preview: Facebook, Inc. (NASDAQ:FB) m..."
2,Facebook Inc’s (NASDAQ: ) virtual reality divi...,Facebook 1-26-17,Facebook Inc (FB) Hires Former Google Exec to ...


### Preprocess these articles

In [12]:
def preprocess_article_content(text_df):
    """
    Simple preprocessing pipeline which uses RegExp, sets basic token requirements, and removes stop words.
    Set up to work with df files created from JSONs
    """
    print 'preprocessing article text...'

    # tokenizer, stops, and stemmer
    tokenizer = RegexpTokenizer(r'\w+')
    stop_words = set(stopwords.words('english'))  # can add more stop words to this set
    stemmer = SnowballStemmer('english')

    # process articles
    article_list = []
    for row, article in enumerate(text_df['body']):
        cleaned_tokens = []

        letters_only = re.sub("[^a-zA-Z]", " ", article)
        lower_case = letters_only.lower()
        tokens = tokenizer.tokenize(lower_case)

    #         tokens = tokenizer.tokenize(article.decode('utf-8').lower())
        for token in tokens:
            if token not in stop_words:
                if len(token) > 0 and len(token) < 20: # removes non words
                    if not token[0].isdigit() and not token[-1].isdigit(): # removes numbers
                        stemmed_tokens = stemmer.stem(token)
                        cleaned_tokens.append(stemmed_tokens)
        # add process article
        article_list.append(' '.join(wd for wd in cleaned_tokens))

    # echo results and return
    print 'preprocessed content for %d articles' % len(article_list)
    return article_list

In [13]:
cleaned_tokenized_stemmed_lemmatized_articles = preprocess_article_content(all_articles_df)

preprocessing article text...
preprocessed content for 3 articles


In [14]:
len(cleaned_tokenized_stemmed_lemmatized_articles)

3

## Vectorize the bag of words (token count)

In [15]:
# can do feature(token) reduction later for too rare and too common words

vectorizer = CountVectorizer()

# perform count-based vectorization
article_vect = vectorizer.fit_transform(cleaned_tokenized_stemmed_lemmatized_articles)

# Kaggle method 
train_data_features = article_vect.toarray()


## Take a look at the vocabulary

In [16]:
# Look at the vocabulary
vocab = vectorizer.get_feature_names()

# Look at the counts for each word
dist = np.sum(train_data_features, axis = 0)
for tag, count in zip(vocab, dist):
    print count, tag

2 ad
1 adr
1 afford
1 alphabet
10 analyst
2 android
12 announc
1 answer
2 approach
2 april
1 around
1 augment
4 averag
2 back
1 bad
7 barra
2 base
2 beat
2 beaten
1 began
1 begin
1 beij
1 belief
1 billion
1 bought
1 brendan
2 bring
2 call
4 came
2 cap
2 ceo
1 ces
6 chang
1 china
1 chines
1 claim
2 climb
2 close
4 compani
2 compar
1 compel
1 complaint
1 comput
16 consensus
3 consum
1 continu
1 control
1 copyright
1 corp
2 cover
1 critic
4 data
2 date
12 day
1 decemb
2 declar
2 declin
1 detail
3 develop
1 devic
4 divis
4 drop
32 earn
3 effort
1 electron
4 end
2 ep
2 equiti
12 estim
1 excit
8 expect
1 experienc
8 facebook
1 fade
1 familiar
9 fb
2 februari
1 feel
2 fell
2 financi
2 firm
2 fiscal
1 fit
8 follow
2 founder
2 four
2 fourth
3 gain
1 game
1 gave
2 giant
2 given
2 go
1 googl
2 headset
2 help
1 high
2 higher
2 hire
4 histori
7 hugo
1 hype
1 imag
6 inc
2 includ
2 industri
1 innov
1 irib
2 januari
2 join
2 juli
1 kickstart
2 known
14 last
1 lawsuit
4 lead
1 leav
2 level
1 like
1 lim

In [17]:
def data_directory_crawl(path, ticker):
    """
    Crawls through a given parent directory to create a dataframe of articles and their body for the given company ticker
    """

    mypath = path + ticker + '/'
    company_articles_combined_days=pd.DataFrame()

    for directory in os.listdir(mypath):
    #     print directory
        f = []
        d = []
        for (dirpath, dirnames, filenames) in os.walk(mypath + directory):
            f.extend(filenames)
            d.extend(dirnames)

        company_articles_combined_days = company_articles_combined_days.append(read_Google_articles(f, mypath + directory + '/'))
    
    return company_articles_combined_days

# Fit a Random Forest Model to Data

In [19]:
FB_jan25_26 = np.asarray([131.48, 132.78, 133], dtype="|S6")

# Try random forest on bag of words
forest = RandomForestClassifier(n_estimators=100)
# fit the forest to the training set, using the bag of words features
forest = forest.fit(train_data_features, FB_jan25_26)

# Creating a Submission

## Iterate through all days for given company to create dataframe

In [20]:
mypath = '/home/daisyz/Dropbox/finance_prediction/data/'
ticker = 'AAPL'

company_articles_combined_days = data_directory_crawl(mypath, ticker)
company_articles_combined_days

Unnamed: 0,body,category,title
0,On today’s episode of the Zacks Friday Finish ...,Apple Inc,"Tech Earnings Preview: Will AAPL, MSFT, GOOGL,..."
1,On today’s episode of the Zacks Friday Finish ...,Apple Inc,"Tech Earnings Preview: Will AAPL, MSFT, GOOGL,..."
2,U.S. stock futures are pointed lower this morn...,Apple Inc,"Monday’s Vital Data: Apple Inc. (AAPL), Rite A..."
3,While Apple (NASDAQ:AAPL) requests for concess...,Apple Inc,"Apple, India reported nearing manufacturing deal"
4,Apple (AAPL) : In a disclosure report filed on...,Apple Inc,Reilly Financial Advisors Cuts Position in App...
5,Apple (AAPL) will release their earnings for t...,Apple Inc,Apple Earnings Strategy: When To Buy AAPL Shares
6,Leading the Apple Inc. (NASDAQ: ) rumor mill t...,Apple Inc,Wednesday Apple Rumors: AAPL May Use Flexible ...
7,Apple is down 0.07% at $119.92 a share followi...,Apple Inc,Apple slips after being downgraded at Barclays...
8,Cowen maintains Outperform on Qualcomm (NASDAQ...,Apple Inc,Draconian Apple (AAPL) Case Suggests Limited Q...
9,The first day of public trading with President...,Apple Inc,3 Stocks to Watch on Tuesday: Apple Inc. (AAPL...


In [21]:
# create dictionary for each company. Each entry is a combined, cleaned
# string of all articles

days = set(company_articles_combined_days.category)

tokenized_daily_articles = dict()

for day in days:
    indiv_day_articles = company_articles_combined_days.loc[(company_articles_combined_days.category == day)]
    tokenized_daily_articles[day] = preprocess_article_content(indiv_day_articles)

preprocessing article text...
preprocessed content for 31 articles
preprocessing article text...
preprocessed content for 32 articles
preprocessing article text...
preprocessed content for 30 articles
preprocessing article text...
preprocessed content for 29 articles


In [26]:
tokenized_daily_articles.keys()


[u'Apple Inc 1-27-17',
 u'Apple Inc',
 u'Apple Inc 1-26-17',
 u'Apple Inc 1-30-17']

In [27]:
# Do this for every key in the dictionary
# can do feature(token) reduction later for too rare and too common words
# reshape df for word2vec

vectorizer = CountVectorizer()

# perform count-based vectorization
article_vect = vectorizer.fit_transform(tokenized_daily_articles['Apple Inc'])

# Kaggle method 
train_data_features = article_vect.toarray()

In [1]:
def preprocess_article_content_word2vec(text_df, remove_stopwords = False):
    """
    Simple preprocessing pipeline which uses RegExp, sets basic token requirements, and removes stop words.
    Set up to work with df files created from JSONs
    """
    print 'preprocessing article text...'

    # tokenizer, stops, and stemmer
    tokenizer = RegexpTokenizer(r'\w+')
    stemmer = SnowballStemmer('english')
    
    if remove_stopwords:
        stop_words = set(stopwords.words('english'))  # can add more stop words to this set
    else:
        stop_words = set()
        
    # process articles
    article_list = []
    for row, article in enumerate(text_df['body']):
        cleaned_tokens = []

        letters_only = re.sub("[^a-zA-Z]", " ", article)
        lower_case = letters_only.lower()
        tokens = tokenizer.tokenize(lower_case)

    #         tokens = tokenizer.tokenize(article.decode('utf-8').lower())
        for token in tokens:
            if token not in stop_words:
                if len(token) > 0 and len(token) < 20: # removes non words
                    if not token[0].isdigit() and not token[-1].isdigit(): # removes numbers
                        stemmed_tokens = stemmer.stem(token)
                        cleaned_tokens.append(stemmed_tokens)
        # add process article
        article_list.append(' '.join(wd for wd in cleaned_tokens))

    # echo results and return
    print 'preprocessed content for %d articles' % len(article_list)
    return article_list