In [59]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
import pandas as pd
import re
import spacy
import os
import string

## Preprocessing Outline

1. Text preprocessing on articles
    - Tokenize articles
    - remove stopwords
    - Lemmatize
2. Break articles into chunks of 20 words
3. Create training data
    - get chunks of text, len 20
    - pre-process chunks
    - join labeled data
    - extract features

## Reading in Data

In [2]:
# os.listdir returns entries in arbitrary order; sorting makes the article
# order (and everything derived from it) identical on every run.
files = sorted(
    file for file in os.listdir('data/articles/')
    if file != ".DS_Store"  # skip the macOS Finder metadata file
)

In [3]:
# Accumulator for the raw article texts (one string per file).
text_dat = list()

In [4]:
# Characters that are kept in each article; built once instead of per file.
printable = set(string.printable)

# Start from an empty list so that re-running this cell does not append
# every article a second time.
text_dat = []
for file in files:
    fname = os.path.join('data/articles/', file)
    # `with` closes each file as soon as it has been read; previously only
    # the last handle was closed, and nothing was closed if a read failed.
    with open(fname, encoding='utf8', errors='replace') as f:
        article = f.read()
    # removing nonsense chars (anything outside string.printable)
    article = ''.join(ch for ch in article if ch in printable)
    text_dat.append(article)

## Tokenizing

In [20]:
def is_digit(string):
    """Return True if ``string`` parses as a finite number, otherwise False.

    Parameters
    ----------
    string : object
        Candidate token, normally a ``str``. (The parameter name shadows the
        ``string`` module inside this function only.)

    Returns
    -------
    bool
        True when ``float(string)`` succeeds and the result is finite.
        Tokens such as ``"nan"``, ``"inf"`` or ``"Infinity"`` are accepted by
        ``float`` but are not numbers in running text, so they return False.
        Inputs that ``float`` cannot handle at all (e.g. ``None``) also
        return False instead of raising.
    """
    import math  # local import keeps this cell self-contained

    try:
        value = float(string)
    except (TypeError, ValueError):
        return False
    return math.isfinite(value)

In [66]:
# function that tokenizes and cleans a single article
def clean_txt(article):
    """Turn one article string into a list of cleaned tokens.

    Steps, in order:
      1. Split on whitespace (punctuation stays attached to its word).
      2. Drop tokens that exactly equal an NLTK English stopword; the
         comparison is case-sensitive.
      3. Lemmatize every remaining token with ``WordNetLemmatizer``.
      4. Merge a numeric token that is immediately followed by the token
         ``'percent'`` into a single ``"<number> percent"`` token.

    Parameters
    ----------
    article : str
        Raw article text.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    article_tokens = article.split()

    # removing stopwords
    stopwords = set(nltk.corpus.stopwords.words('english'))
    tokens_nostop = [tk for tk in article_tokens if tk not in stopwords]

    # lemmatize
    lemmatize = WordNetLemmatizer()
    tokens_lem = [lemmatize.lemmatize(word) for word in tokens_nostop]

    # make numbers followed by percent one word
    # Built into a new list rather than popping from the list being iterated.
    merged = []
    pos = 0
    total = len(tokens_lem)
    while pos < total:
        token = tokens_lem[pos]
        followed_by_percent = pos + 1 < total and tokens_lem[pos + 1] == 'percent'
        if followed_by_percent and is_digit(token):
            merged.append(token + " " + 'percent')
            pos += 2  # skip the 'percent' token that was just absorbed
        else:
            merged.append(token)
            pos += 1

    return merged

In [67]:
# One cleaned token list per raw article, in the same order as text_dat.
cleaned_lst = list(map(clean_txt, text_dat))

In [68]:
# Sanity check: number of cleaned token lists (one per entry in text_dat).
len(cleaned_lst)

730

# Labeling data

In [69]:
# header=None: the label files are read without a header row, so columns are
# addressed by integer position. ceo.csv and percentage.csv are decoded as
# ISO-8859-1; companies.csv uses the pandas default encoding.
ceos, percentages = (
    pd.read_csv(path, encoding="ISO-8859-1", header=None)
    for path in ('data/labels/ceo.csv', 'data/labels/percentage.csv')
)
companies = pd.read_csv('data/labels/companies.csv', header=None)

In [70]:
# Collapse any run of whitespace inside column 2 to a single space and strip
# leading/trailing whitespace, then preview the result.
ceos[2] = ceos[2].map(lambda name: ' '.join(name.split()))
ceos.head()

Unnamed: 0,0,1,2
0,Tom,Horton,Tom Horton
1,Patti,Hart,Patti Hart
2,Jamie,Dimon,Jamie Dimon
3,Steve,Cohen,Steve Cohen
4,Tim,Cook,Tim Cook


### Subsets of Lists

In [234]:
# Cut every article into consecutive, non-overlapping chunks of 20 tokens;
# the last chunk of an article may be shorter.
word_subsets = [
    tokens[start:start + 20]
    for tokens in cleaned_lst
    for start in range(0, len(tokens), 20)
]

In [235]:
# Rebuild a plain-text string for each token chunk, in the same order as
# word_subsets.
detokenizer = TreebankWordDetokenizer()
detokenized_subsets = list(map(detokenizer.detokenize, word_subsets))

In [236]:
# Map each detokenized chunk string to its token list. Chunks that produce
# the same string share one key; the last such chunk's tokens are kept.
tokens_and_subsets = {
    text: tokens
    for text, tokens in zip(detokenized_subsets, word_subsets)
}

In [237]:
# Peek at the first (chunk text, token list) pair to check the mapping.
next(iter(tokens_and_subsets.items()))

('(Reuters) - The Federal Reserve\'s vow keep interest rate near zero "considerable time" likely remain place now, U.S. central bank',
 ['(Reuters)',
  '-',
  'The',
  'Federal',
  "Reserve's",
  'vow',
  'keep',
  'interest',
  'rate',
  'near',
  'zero',
  '"considerable',
  'time"',
  'likely',
  'remain',
  'place',
  'now,',
  'U.S.',
  'central',
  'bank'])

## Matching CEOs

In [238]:
# unique ceos
# Sorted because set iteration order for strings can differ between
# interpreter runs, which would change the order of matches downstream.
# Empty or non-string entries are dropped: '' is a substring of every chunk
# and would be reported as a match everywhere.
ceo_lst = sorted(
    name for name in set(ceos[2])
    if isinstance(name, str) and name
)

In [239]:
# For every chunk, collect the CEO names that occur in it as a substring.
# Chunks without any hit get the placeholder list ['None'].
ceo_matches = []
for chunk_text in detokenized_subsets:
    found = [name for name in ceo_lst if name in chunk_text]
    ceo_matches.append(found or ['None'])

Some chunks of text contain more than one CEO match.
<br>
Solution: iterate over the list of matches; when a chunk has more than one match, duplicate its text once per match so that every row carries exactly one CEO.

In [240]:
# Build two parallel lists: the chunk text and the CEO label list for it.
# A chunk with several matches is repeated once per match, each time paired
# with a one-element list; any other chunk is paired with its match list
# as it stands.
ceo_text = []
unique_ceos = []
for idx, matches in enumerate(ceo_matches):
    chunk_text = detokenized_subsets[idx]
    if len(matches) > 1:
        labels = [[name] for name in matches]
    else:
        labels = [matches]
    ceo_text.extend([chunk_text] * len(labels))
    unique_ceos.extend(labels)

In [241]:
# Number of (text, CEO) rows after repeating multi-match chunks.
len(ceo_text)

464524

In [242]:
# Should equal len(ceo_text): the two lists are filled in lockstep.
len(unique_ceos)

464524

## CEO Training Data

In [243]:
# One row per (chunk text, CEO label) pair; 'ceos' still holds lists here.
df = pd.DataFrame(dict(text=ceo_text, ceos=unique_ceos))

In [244]:
# Inspect the column dtypes of the freshly built frame.
df.dtypes

text    object
ceos    object
dtype: object

In [245]:
# Turn each list in 'ceos' into a plain string by joining its elements with
# '' (a one-element list simply yields that element).
df['ceoStr'] = df['ceos'].map(''.join)

In [254]:
# Attach the token list recorded for each row's text; .get gives None for
# any text that is not a key of tokens_and_subsets.
df['text_tokens'] = list(map(tokens_and_subsets.get, df['text']))

In [257]:
# Binary target: 1 when ceoStr holds anything other than the 'None'
# placeholder, 0 otherwise.
df['label'] = df['ceoStr'].ne('None').astype('int64')

In [258]:
# Preview the first rows of the assembled training frame.
df.head()

Unnamed: 0,text,ceos,ceoStr,text_tokens,label
0,(Reuters) - The Federal Reserve's vow keep int...,[Federal Reserve],Federal Reserve,"[(Reuters), -, The, Federal, Reserve's, vow, k...",1
1,set take slow steady approach first rate rise ...,[None],,"[set, take, slow, steady, approach, first, rat...",0
2,official want remove it. But others feel still...,[None],,"[official, want, remove, it., But, others, fee...",0
3,"market rate hike still way off. ""I think 'cons...",[None],,"[market, rate, hike, still, way, off., ""I, thi...",0
4,Fed President John Williams told Market News I...,[None],,"[Fed, President, John, Williams, told, Market,...",0
