In [2]:
import pandas as pd
import ijson
import json
import gensim
import re
import numpy as np
import os
import nltk.data

# from os import walk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

In [3]:
def read_Google_articles(articlelist, path):
    """ 
    Converts Google News JSON file into a data frame. Takes in
    a .json file and returns a dataframe using the json's dictionary-like
    structure 
    """
    
    with open(path + articlelist[0],'r') as first:
        firstdict = json.load(first)
        combined_df = pd.DataFrame.from_dict(firstdict, orient = 'index')
        combined_df = combined_df.T
    
    for article in articlelist:
        with open(path + article, 'r') as fin:
            mydict = json.load(fin)
        current_df = pd.DataFrame.from_dict(mydict, orient = 'index')
        current_df = current_df.T
        combined_df = combined_df.append(current_df, ignore_index=True)
    
    # USE CONCAT WITH .APPEND DOESN'T WORK!!!
#     final_df = pd.concat([combined_df, current_df])
        
    return combined_df

In [4]:
def data_directory_crawl(path, ticker):
    """
    Crawls through a given parent directory to create a dataframe of articles and their body for the given company ticker
    """

    mypath = path + ticker + '/'
    company_articles_combined_days=pd.DataFrame()

    for directory in os.listdir(mypath):
    #     print directory
        f = []
        d = []
        for (dirpath, dirnames, filenames) in os.walk(mypath + directory):
            f.extend(filenames)
            d.extend(dirnames)

        company_articles_combined_days = company_articles_combined_days.append(read_Google_articles(f, mypath + directory + '/'))
    
    return company_articles_combined_days

In [51]:
def preprocess_article_content_wordlist(text_df, remove_stopwords = False):
    """
    Simple preprocessing pipeline which uses RegExp, sets basic token requirements, and removes stop words.
    Set up to work with df files created from JSONs
    """
    print 'preprocessing article text...'

#     # tokenizer, stops, and stemmer
#     tokenizer = RegexpTokenizer(r'\w+')
#     stemmer = SnowballStemmer('english')
    
    if remove_stopwords:
        stop_words = set(stopwords.words('english'))  # can add more stop words to this set
    else:
        stop_words = set()
        
    # process articles
    article_list = []
    for row, article in enumerate(text_df['body']):
        cleaned_tokens = []

        letters_only = re.sub("[^a-zA-Z]", " ", article)
        lower_case_words = letters_only.lower().split()
#         tokens = tokenizer.tokenize(lower_case)
        lower_case_words = [w for w in lower_case_words if not w in stop_words]

#         for token in tokens:
#             if token not in stop_words:
#                 if len(token) > 0 and len(token) < 20: # removes non words
#                     if not token[0].isdigit() and not token[-1].isdigit(): # removes numbers
#                         stemmed_tokens = stemmer.stem(token)
#                         cleaned_tokens.append(stemmed_tokens)
        # add process article
#         article_list.append(' '.join(wd for wd in cleaned_tokens))

    # echo results and return
    print 'preprocessed content for %d words' % len(lower_case_words)
    return lower_case_words

# Creating a Submission
## Iterate through all days for a given company to create a dataframe

In [12]:
mypath = '/home/daisyz/Dropbox/finance_prediction/data/'
ticker = 'AAPL'

company_articles_combined_days = data_directory_crawl(mypath, ticker)
company_articles_combined_days

Unnamed: 0,body,category,title
0,Apple (AAPL) CEO Tim Cook says his company had...,Apple Inc 2-1-17,"Qualcomm Lawsuit Was 'Last Resort,' Apple CEO ..."
1,Apple (AAPL) CEO Tim Cook says his company had...,Apple Inc 2-1-17,"Qualcomm Lawsuit Was 'Last Resort,' Apple CEO ..."
2,Check out which companies are making headlines...,Apple Inc 2-1-17,"Early movers: AAPL, ANTM, TUP, PBI, MPC, & more"
3,"Yesterday, I wrote a somewhat controversial ar...",Apple Inc 2-1-17,Apple: Bulls Have Taken Over
4,Check out which companies are making headlines...,Apple Inc 2-1-17,"After-hours buzz: AAPL, EA, AMD & more"
5,The following companies are expected to report...,Apple Inc 2-1-17,"After-Hours Earnings Report for January 31, 20..."
6,ConocoPhillips ( COP ) has raised the company'...,Apple Inc 2-1-17,"Daily Dividend Report: COP, GLW, AAPL, AFL, CO..."
7,"In early trading on Wednesday, shares of Apple...",Apple Inc 2-1-17,"Nasdaq 100 Movers: CTXS, AAPL"
8,Looking at options trading activity among comp...,Apple Inc 2-1-17,"Notable Wednesday Option Activity: AAPL, DIS, ..."
9,Apple Inc. (NASDAQ: ) — This major provider of...,Apple Inc 2-1-17,Trade of the Day: Apple Inc. (AAPL) Stock No L...


In [120]:
# create dictionary for each company. Each entry is a combined, cleaned
# string of all articles

days = set(company_articles_combined_days.category)
daily_articles_body = pd.DataFrame(columns = ('category', 'full_text'))

for j, day in enumerate(days):
    indiv_day_articles = company_articles_combined_days.loc[(company_articles_combined_days.category == day)]
    all_body = []
    for i in range(0,len(indiv_day_articles)):
        all_body.append(indiv_day_articles.body.iloc[i])
#         print all_body
#     d = [day, all_body]
#     print d
#     temp_df = pd.DataFrame(d, columns = ('category', 'full_text'))
#     print(temp_df)
#     daily_articles_body = pd.DataFrame(all_body, index = day, columns = ('category', 'full_text'))

In [137]:
daily_listed_articles = company_articles_combined_days.groupby('category')['body'].apply(list)
pd.DataFrame(daily_listed_articles)

Unnamed: 0_level_0,body
category,Unnamed: 1_level_1
Apple Inc,[On today’s episode of the Zacks Friday Finish...
Apple Inc 1-26-17,[On today’s episode of the Zacks Friday Finish...
Apple Inc 1-27-17,[On today’s episode of the Zacks Friday Finish...
Apple Inc 1-30-17,"[On Tuesday, January 31st, Apple (AAPL) will r..."
Apple Inc 1-31-17,[Apple (NASDAQ:AAPL) reported earnings on Tues...
Apple Inc 2-1-17,[Apple (AAPL) CEO Tim Cook says his company ha...


In [100]:
# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [41]:
# Define a function to split dictionary into parsed sentences
def word_dict_to_sentences(dictionary_entry, tokenizer, remove_stopwords = False):
    """ Function takes dictionary with combined strings from all articles from 
    multiple days (row = day, words = singular words combined from all articles. 
    """
    
# 1. Use NLTK tokenizer to split the paragraph into sentences.
    raw_sentences = tokenizer.tokenize(dictionary_entry.strip())
    
# 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
#         skip an empty sentence
        if len(raw_sentence) > 0:
            sentences.append(raw_sentence)
            
    # return a list of lists (List of sentences, each sentence is a list of words)
    return sentences

In [43]:
sentences = []
path = mypath = '/home/daisyz/Dropbox/finance_prediction/data/AAPL/'

print "Parsing sentences"
for key in indiv_day_articles:
    indiv_day_articles[key] = word_dict_to_sentences(indiv_day_articles.body.iloc[0], tokenizer)

Parsing sentences


ValueError: Length of values does not match length of index

In [46]:
indiv_day_articles

Unnamed: 0,body,category,title
0,"On Tuesday, January 31st, Apple (AAPL) will re...",Apple Inc 1-30-17,Bartosiak: Trading Apple's (AAPL) Earnings wit...
1,"On Tuesday, January 31st, Apple (AAPL) will re...",Apple Inc 1-30-17,Bartosiak: Trading Apple's (AAPL) Earnings wit...
2,Chicago Equity Partners Llc decreased its stak...,Apple Inc 1-30-17,Chicago Equity Partners LLC Position in Apple ...
3,"In the annals of business relationships, the o...",Apple Inc 1-30-17,How Apple vs. Qualcomm Fits Business Strategy
4,Apple (AAPL) will release their earnings for t...,Apple Inc 1-30-17,Apple Earnings Strategy: When To Buy AAPL Shares
5,"There are 3 sell ratings, 8 hold ratings, 37 b...",Apple Inc 1-30-17,Analyst Activity – Drexel Hamilton Reiterates ...
6,Leading the Apple Inc. (NASDAQ: ) rumor mill t...,Apple Inc 1-30-17,Thursday Apple Rumors: AAPL May Be Planning a ...
7,Leading the Apple Inc. (NASDAQ: ) rumor mill t...,Apple Inc 1-30-17,Wednesday Apple Rumors: AAPL May Use Flexible ...
8,Apple Inc. (NASDAQ: ) dropped a bombshell on o...,Apple Inc 1-30-17,"Apple Inc. (AAPL) Suit Against Qualcomm, Inc. ..."
9,When Apple Inc. (NASDAQ: ) designed the iPhone...,Apple Inc 1-30-17,Apple Inc. (AAPL) iPhone Takes a Hit From ... ...
