# Time price prediction

# Random Forest

### TODO: integrate all data, split it into train/test sets, and move the functions into their own .py files

In [1]:
import pandas as pd
import ijson
import json
import gensim
import re
import numpy as np

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

### Test articles from 2 days

In [33]:
# JSON file names of the two test articles (variables are named for Jan 25 and Jan 26).
# The paths are relative, so the files are looked up in the current working directory.
Jan25_article = 'Earnings_Preview_Facebook_Inc_NASDAQFB_moved_up_10_times_out_of_last_17_quarters__The_Independent_Republic.json'
Jan26_article = 'Facebook_Inc_FB_Hires_Former_Google_Exec_to_Lead_Oculus.json'

### Create dataframe from JSON files

In [6]:
# Collect the article file names into the list that is passed to read_Google_articles
articlelist = [Jan25_article, Jan26_article]
# Bare expression: echoes the list as the notebook cell output
articlelist

['Earnings_Preview_Facebook_Inc_NASDAQFB_moved_up_10_times_out_of_last_17_quarters__The_Independent_Republic.json',
 'Facebook_Inc_FB_Hires_Former_Google_Exec_to_Lead_Oculus.json']

In [142]:
def read_Google_articles(articlelist):
    """
    Converts Google News JSON files into a single data frame.

    Takes in a list of paths to .json files, each holding one article as a
    flat dictionary (e.g. body / category / title), and returns a dataframe
    with one row per article and one column per dictionary key. Rows appear
    in the same order as ``articlelist``.

    Each per-article frame keeps its own index label, so the row labels of
    the result repeat (one identical label per article). An empty
    ``articlelist`` returns an empty dataframe.
    """
    article_frames = []
    for article in articlelist:
        with open(article, 'r') as fin:
            mydict = json.load(fin)
        # orient='index' puts the dictionary keys on the rows; transpose so
        # that the keys become columns and the article becomes a single row
        article_frames.append(pd.DataFrame.from_dict(mydict, orient='index').T)

    # pd.concat raises on an empty sequence, so handle "no articles" here
    if not article_frames:
        return pd.DataFrame()

    # USE CONCAT -- .APPEND DOESN'T WORK!!!
    # Concatenate every article at once; the previous version only kept the
    # first and the last article of the list.
    final_df = pd.concat(article_frames)

    return final_df

In [144]:
# Build a dataframe from the JSON files named in articlelist
all_articles_df = read_Google_articles(articlelist)
# Bare expression: displays the dataframe as the notebook cell output
all_articles_df

Unnamed: 0,body,category,title
0,"Facebook, Inc. (NASDAQ:FB) is projected to dec...",Facebook,"Earnings Preview: Facebook, Inc. (NASDAQ:FB) m..."
0,Facebook Inc’s (NASDAQ: ) virtual reality divi...,Facebook 1-26-17,Facebook Inc (FB) Hires Former Google Exec to ...


### Preprocess these articles

In [149]:
def preprocess_article_content(text_df):
    """
    Simple preprocessing pipeline which uses RegExp, sets basic token requirements, and removes stop words.
    Set up to work with df files created from JSONs

    For every entry of ``text_df['body']`` the text is reduced to letters
    only, lower-cased, tokenized, stripped of English stop words and of
    tokens with 20 or more characters, and stemmed with the Snowball stemmer.

    Takes in a dataframe with a 'body' column of article strings and returns
    a list with one string per article: the stemmed tokens joined by single
    spaces, in the same order as the rows of ``text_df``.
    """
    # single-argument print(...) behaves identically on Python 2 and Python 3
    print('preprocessing article text...')

    # tokenizer, stops, and stemmer
    tokenizer = RegexpTokenizer(r'\w+')
    stop_words = set(stopwords.words('english'))  # can add more stop words to this set
    stemmer = SnowballStemmer('english')

    # process articles
    article_list = []
    for article in text_df['body']:
        # Every non-letter (digits, punctuation, non-ASCII) becomes a space,
        # so the tokens below can only consist of the letters a-z; no
        # separate "starts/ends with a digit" filter is needed afterwards.
        letters_only = re.sub("[^a-zA-Z]", " ", article)
        tokens = tokenizer.tokenize(letters_only.lower())

        cleaned_tokens = [
            stemmer.stem(token)
            for token in tokens
            # overly long tokens are treated as non words
            if token not in stop_words and len(token) < 20
        ]

        # add processed article
        article_list.append(' '.join(cleaned_tokens))

    # echo results and return
    print('preprocessed content for %d articles' % len(article_list))
    return article_list

In [155]:
# NOTE: despite "lemmatized" in the variable name, preprocess_article_content
# only stems the tokens (SnowballStemmer); no lemmatizer is applied.
cleaned_tokenized_stemmed_lemmatized_articles = preprocess_article_content(all_articles_df)

preprocessing article text...
preprocessed content for 2 articles


In [156]:
# Sanity check: number of preprocessed article strings that came back
len(cleaned_tokenized_stemmed_lemmatized_articles)

2

## Vectorize the bag of words (token count)

In [159]:
# can do feature(token) reduction later for too rare and too common words

# Bag-of-words vectorizer, constructed with its default settings
vectorizer = CountVectorizer()

# perform count-based vectorization: learn the vocabulary from the cleaned
# article strings and count each term's occurrences per article
article_vect = vectorizer.fit_transform(cleaned_tokenized_stemmed_lemmatized_articles)

# Kaggle method: convert the result to a plain NumPy array for the model
train_data_features = article_vect.toarray()


## Take a look at the vocabulary

In [161]:
# Look at the vocabulary
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever one this installation provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = vectorizer.get_feature_names_out()
else:
    vocab = vectorizer.get_feature_names()

# Look at the counts for each word: summing over axis 0 (the articles)
# gives the total number of occurrences of every vocabulary term
dist = np.sum(train_data_features, axis=0)
for tag, count in zip(vocab, dist):
    # a single pre-formatted argument prints the same on Python 2 and Python 3
    print('%d %s' % (count, tag))

1 ad
1 adr
1 afford
1 alphabet
5 analyst
2 android
7 announc
1 answer
1 approach
1 april
1 around
1 augment
2 averag
1 back
1 bad
7 barra
1 base
1 beat
1 beaten
1 began
1 begin
1 beij
1 belief
1 billion
1 bought
1 brendan
2 bring
1 call
2 came
1 cap
2 ceo
1 ces
3 chang
1 china
1 chines
1 claim
1 climb
1 close
3 compani
1 compar
1 compel
1 complaint
1 comput
8 consensus
3 consum
1 continu
1 control
1 copyright
1 corp
1 cover
1 critic
2 data
1 date
6 day
1 decemb
1 declar
1 declin
1 detail
3 develop
1 devic
4 divis
2 drop
16 earn
3 effort
1 electron
2 end
1 ep
1 equiti
6 estim
1 excit
4 expect
1 experienc
6 facebook
1 fade
1 familiar
6 fb
1 februari
1 feel
1 fell
1 financi
1 firm
1 fiscal
1 fit
4 follow
2 founder
1 four
1 fourth
2 gain
1 game
1 gave
2 giant
1 given
1 go
1 googl
2 headset
2 help
1 high
1 higher
2 hire
2 histori
7 hugo
1 hype
1 imag
4 inc
2 includ
2 industri
1 innov
1 irib
1 januari
2 join
1 juli
1 kickstart
2 known
9 last
1 lawsuit
4 lead
1 leav
1 level
1 like
1 limitless

# Fit a Random Forest Model to Data

In [172]:
# Target values, one per article row, stored as 6-byte strings (dtype "|S6")
# so that the classifier treats each value as a distinct class label.
# Presumably these are the FB share prices for Jan 25 and Jan 26 -- TODO confirm.
# NOTE(review): predicting a continuous price with a classifier means every
# distinct price becomes its own class; a regressor may be the better fit -- confirm intent.
FB_jan25_26 = np.asarray([131.48, 132.78], dtype="|S6")

# Try random forest on bag of words
# n_estimators=100 -> the forest is built from 100 trees
forest = RandomForestClassifier(n_estimators=100)
# fit the forest to the training set, using the bag of words features
# (train_data_features rows) as inputs and FB_jan25_26 as the labels
forest = forest.fit(train_data_features, FB_jan25_26)