#NEWSWATCH SUPERFILTERS

Motivation: We aim to conduct a text analysis of breaking news headlines and their associated reports, which our team pulled from newswatch and entered into a spreadsheet. Every article in our database resulted in a tangible price movement in the associated stock and was available through newswatch immediately upon release. We will seek to identify keywords, phrases, and article tags that recur across a variety of news headlines, with the intention of using the results to tailor newswatch filters so that market-moving news reaches us the moment it is released.

Specifically, we will start by investigating headlines in several key areas:
    - Biotech
    - M/A
    - Corporate Activity
    - Enforcement Agency Activity
    - Patent Law
    
We will then seek to further categorize the headlines within each of these groups.

In [69]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
from collections import Counter
import codecs

import nltk
from nltk.collocations import *
from nltk.tokenize import word_tokenize
from nltk import FreqDist
from nltk.corpus import stopwords

#nltk.download('stopwords')
#nltk.download("genesis")
#nltk.download('punkt')

##Read in Headline Spreadsheet

In [112]:
def get_spreadsheet(sheet):
    """Load the headline spreadsheet and normalise its list and text columns.

    The CSV is read with ``pd.read_csv`` and five columns are rewritten:

    * ``Tags``: each cell is split on ``', '`` and every piece is split on
      whitespace, giving a list of tag lists.
    * ``Vendors``: each cell is split on ``', '`` into a list of names.
    * ``Headline``, ``Text``, ``Clean Text``: each cell is converted with
      ``str``, lower-cased, and stripped of wire-service boilerplate
      ('dj: press release: ', 'press release: ', '*dj', a stand-alone 'dj');
      'top-line', 'phase N' and 'u.s.' are collapsed to single tokens.

    Args:
        sheet: anything ``pd.read_csv`` accepts; the file must contain the
            five columns named above.

    Returns:
        The DataFrame with those columns replaced. A missing ``Tags`` or
        ``Vendors`` cell becomes an empty list; a missing text cell becomes
        the string 'nan' (the result of ``str()`` on the missing value).
    """
    # Local import so this cell does not depend on the notebook's import cell.
    import re

    news_df = pd.read_csv(sheet)

    # Longer patterns must run before their own substrings, otherwise
    # 'press release: ' eats the tail of 'dj: press release: ' and 'dj' eats
    # the tail of '*dj', leaving stray ': ' or '*' behind.
    replacements = (
        ('dj: press release: ', ''),
        ('press release: ', ''),
        ('*dj', ''),
        ('top-line', 'topline'),
        ('phase 1', 'phase1'),
        ('phase 2', 'phase2'),
        ('phase 3', 'phase3'),
        ('u.s.', 'us'),
    )
    # Only drop 'dj' when it stands alone; a plain substring replace would
    # also mangle words such as 'adjusted'.
    standalone_dj = re.compile(r'\bdj\b')

    def clean(value):
        cleaned = str(value).lower()
        for old, new in replacements:
            cleaned = cleaned.replace(old, new)
        return standalone_dj.sub('', cleaned)

    def split_cell(value):
        # Missing cells are read as NaN (a float), which has no .split().
        return [] if pd.isnull(value) else str(value).split(', ')

    # Build each column in one go instead of writing list objects cell by cell
    # with .loc, which is slow and fragile for list-valued cells.
    tags = [[group.split() for group in split_cell(cell)]
            for cell in news_df['Tags']]
    vendors = [split_cell(cell) for cell in news_df['Vendors']]
    news_df['Tags'] = pd.Series(tags, index=news_df.index)
    news_df['Vendors'] = pd.Series(vendors, index=news_df.index)

    for column in ('Headline', 'Text', 'Clean Text'):
        news_df[column] = [clean(cell) for cell in news_df[column]]

    return news_df

Subset the dataframe and analyze its tags, headlines, and text.

###Most common tags by group

In [101]:
def subset_df(head_type, sub_type=None, df=None):
    """Return the rows of a headline frame that match a type (and sub type).

    Args:
        head_type: value the 'Headline Type' column must equal.
        sub_type: optional value the 'Headline Sub Type' column must equal;
            when None, only 'Headline Type' is filtered on.
        df: frame to filter. When None, the notebook-level ``news_df`` is
            used, so existing two-argument calls keep working.

    Returns:
        The matching rows as a DataFrame.
    """
    if df is None:
        # Fall back to the global frame loaded by the notebook.
        df = news_df
    mask = df['Headline Type'] == head_type
    if sub_type is not None:
        mask = mask & (df['Headline Sub Type'] == sub_type)
    return df[mask]

def tag_report(df, source):
    """Count the most common tags on articles carried by a given vendor.

    For every row, each entry of ``df.Vendors`` whose name contains ``source``
    selects the tag list at the same position in that row's ``df.Tags``.

    Args:
        df: frame with list-valued 'Vendors' and 'Tags' columns.
        source: substring to look for in each vendor name.

    Returns:
        A list of up to 35 ``(tag, count)`` pairs, most common first, or the
        string 'No articles from <source>!!' when no vendor matched.
    """
    vendors_by_row = df.Vendors
    tags_by_row = df.Tags

    # Collect the tag group sitting at the same position as each matching vendor.
    matched_groups = []
    for row_label in vendors_by_row.index:
        for position, vendor_name in enumerate(vendors_by_row[row_label]):
            if source in vendor_name:
                matched_groups.append(tags_by_row[row_label][position])

    # Nothing matched: report it instead of returning an empty count.
    if len(matched_groups) == 0:
        return 'No articles from %s!!' % source

    # Flatten the groups and tally every tag.
    tally = Counter(tag for group in matched_groups for tag in group)
    return tally.most_common(35)

##Bigram & Trigram Collocations, Keywords by Group

In [133]:
def make_soup(df, txt_path='txtsoup.txt', head_path='headsoup.txt'):
    """Dump a frame's clean text and headlines to two line-per-article files.

    Args:
        df: frame with 'Clean Text' and 'Headline' columns.
        txt_path: file that receives one 'Clean Text' value per line.
        head_path: file that receives one 'Headline' value per line.
            Both default to the original file names in the current working
            directory; pass explicit paths to control where the files go.

    Returns:
        None. Existing files at either path are overwritten.
    """
    outputs = ((txt_path, df['Clean Text']), (head_path, df.Headline))
    for path, values in outputs:
        with open(path, 'w') as handle:
            handle.writelines("%s\n" % value for value in values)

In [71]:
def keywords(soup):
    """Build a frequency distribution of the keywords in a text file.

    The file is tokenised with ``nltk.tokenize.word_tokenize``; tokens of
    three characters or fewer are dropped, the rest are lower-cased, and
    English stopwords are removed.

    Args:
        soup: path of the text file to read.

    Returns:
        An ``nltk.FreqDist`` mapping each remaining token to its count.
    """
    # Context manager so the handle is closed even if reading fails.
    with open(soup, 'r') as fp:
        raw_text = fp.read()

    # A set gives constant-time membership tests for every token.
    stops = set(nltk.corpus.stopwords.words('english'))
    lowered = (token.lower()
               for token in nltk.tokenize.word_tokenize(raw_text)
               if len(token) > 3)
    return FreqDist(word for word in lowered if word not in stops)

In [75]:
def get_bigrams(soup, n, top_n=50):
    """Return the highest-PMI bigram collocations found in a text file.

    Args:
        soup: file id handed to ``nltk.corpus.genesis.words``.
            NOTE(review): presumably this has to be an absolute path, since a
            relative one would be resolved against the corpus directory —
            confirm.
        n: minimum number of occurrences a bigram needs to be kept.
        top_n: maximum number of bigrams to return (50, the original
            hard-coded cap, by default).

    Returns:
        A list of up to ``top_n`` two-word tuples ranked by pointwise mutual
        information.
    """
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    # find collocations
    finder = BigramCollocationFinder.from_words(nltk.corpus.genesis.words(soup))
    # only bigrams that appear n+ times
    finder.apply_freq_filter(n)
    # A frozenset avoids rescanning the whole stopword list for every token.
    ignored_words = frozenset(nltk.corpus.stopwords.words('english'))
    # filter out short tokens and stopwords
    finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
    return finder.nbest(bigram_measures.pmi, top_n)

def get_trigrams(soup, n, top_n=50):
    """Return the highest-PMI trigram collocations found in a text file.

    Args:
        soup: file id handed to ``nltk.corpus.genesis.words``.
            NOTE(review): presumably this has to be an absolute path, since a
            relative one would be resolved against the corpus directory —
            confirm.
        n: minimum number of occurrences a trigram needs to be kept.
        top_n: maximum number of trigrams to return (50, the original
            hard-coded cap, by default).

    Returns:
        A list of up to ``top_n`` three-word tuples ranked by pointwise
        mutual information.
    """
    trigram_measures = nltk.collocations.TrigramAssocMeasures()
    finder = TrigramCollocationFinder.from_words(nltk.corpus.genesis.words(soup))
    # only trigrams that appear n+ times
    finder.apply_freq_filter(n)
    # A frozenset avoids rescanning the whole stopword list for every token.
    ignored_words = frozenset(nltk.corpus.stopwords.words('english'))
    # filter out short tokens and stopwords
    finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
    return finder.nbest(trigram_measures.pmi, top_n)

##Results

In [126]:
# Load and normalise the spreadsheet, keep only the 'M/A' / 'Late Stage' rows,
# then count the most common tags on articles whose vendor name contains 'Dow Jones'.
# news_df must stay a notebook-level name: subset_df reads it as a global.
news_df = get_spreadsheet('HeadlineSpreadsheet.csv')
subsub = subset_df('M/A', 'Late Stage')
tag_report(subsub, 'Dow Jones')

[('.NAMERICA', 20),
 ('N/DJIV', 20),
 ('P/SGN', 20),
 ('P/AEQI', 20),
 ('N/CNW', 20),
 ('P/ABO', 20),
 ('N/DJPT', 20),
 ('N/WED', 20),
 ('N/WER', 20),
 ('R/NME', 20),
 ('N/DN', 20),
 ('N/DJWI', 20),
 ('N/DJG', 20),
 ('N/DJN', 20),
 ('N/DJI', 20),
 ('S.DJ', 19),
 ('R/US', 19),
 ('.US', 19),
 ('P/WMMI', 19),
 ('N/DJEN', 16),
 ('N/DJRT', 16),
 ('.R1', 16),
 ('N/DJGS', 16),
 ('P/AEI', 16),
 ('.CORP', 15),
 ('.MA', 15),
 ('.RESTRUCT', 15),
 ('N/CAC', 15),
 ('N/TNM', 15),
 ('I/XDJGI', 14),
 ('N/DJGV', 14),
 ('N/DJGP', 14),
 ('N/DJPN', 14),
 ('I/XRUS', 14),
 ('M/MMR', 13)]

In [134]:
# Write the subset's clean text and headlines to txtsoup.txt / headsoup.txt.
make_soup(subsub)
# NOTE(review): make_soup writes to relative file names, while the paths below
# are absolute and machine-specific — presumably the notebook was run from
# that directory; confirm before running elsewhere.
head_soup = '/Users/titans_bball30/Desktop/Trlmprojects/headsoup.txt'
text_soup = '/Users/titans_bball30/Desktop/Trlmprojects/txtsoup.txt'
# Keyword frequencies for the headline file.
keywords(head_soup)

FreqDist({'acquire': 11, 'agreement': 10, 'group': 6, 'announce': 5, 'definitive': 5, 'inc.': 5, 'announces': 4, 'partners': 4, 'merger': 4, 'acquired': 4, ...})

In [138]:
# Headline bigrams that occur at least 2 times, ranked by PMI.
get_bigrams(head_soup, 2)

[('real', 'estate'),
 ('per', 'share'),
 ('announce', 'merger'),
 ('announces', 'definitive'),
 ('definitive', 'merger'),
 ('announce', 'definitive'),
 ('merger', 'agreement'),
 ('definitive', 'agreement')]

In [145]:
# Article-text trigrams that occur at least 7 times, ranked by PMI.
get_trigrams(text_soup, 7)



[('volume', 'weighted', 'average'),
 ('prudential', 'regulation', 'authority'),
 ('http', '://', 'www'),
 ('bass', 'pro', 'shops'),
 ('obtain', 'free', 'copies'),
 ('ocean', 'city', 'home'),
 ('year', 'ended', 'december'),
 ('electro', 'rent', 'corporation'),
 ('contain', 'important', 'information'),
 ('low', 'speed', 'business'),
 ('please', 'visit', 'www'),
 ('city', 'home', 'bank'),
 ('great', 'plains', 'energy'),
 ('customary', 'closing', 'conditions'),
 ('chief', 'executive', 'officer'),
 ('apollo', 'education', 'group'),
 ('exclusive', 'financial', 'advisor'),
 ('joint', 'proxy', 'statement'),
 ('earnings', 'per', 'share'),
 ('additional', 'information', 'regarding'),
 ('apollo', 'global', 'management'),
 ('definitive', 'proxy', 'statement'),
 ('definitive', 'merger', 'agreement'),
 ('baker', 'hughes', 'shareholders'),
 ('chief', 'financial', 'officer'),
 ('per', 'baxalta', 'share'),
 ('proposed', 'business', 'combination'),
 ('combined', 'proxy', 'statement')]

In [142]:
# Echo the headline file path currently in use.
head_soup

'/Users/titans_bball30/Desktop/Trlmprojects/headsoup.txt'