In [10]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer
import spacy
import uuid

nlp = spacy.load('en_core_web_md')

In [11]:
data = pd.read_csv('../data/interim/news_annotated.csv')
data['section_id'] = data.apply(lambda row: uuid.uuid4(), axis=1)
display(data.head())

Unnamed: 0,date,section,keyword,sentiment,category,section_id
0,jan_2002,Marie Thibaut spent 15 years as an administrat...,"Enron, 401(k), stock, shares, Enron, matched, ...","e, e, p, p, e, p, p, p, p, n, n, n, n, n","c, e, c, c, c, e, c, c, e, n, f, e, f, e",efc7958b-3425-453c-aec6-3e553e9a8d12
1,jan_2002,That sorry tale has been repeated thousands of...,"Enron, stocks, retirement money, shares, 401(k...","n, n, e, e, e, e, e, e, n, n, e, e","c, c, e, c, e, c, e, c, c, c, e, e",b0a4be0d-e3c0-4371-b709-c5d9a4d670af
2,jan_2002,Pension-reform advocates say a little paternal...,"employees, company stock, Enron, accountants, ...","e, e, n, n, e, n","e, c, c, f, e, c",a9c4cf47-9390-46c5-aaed-4c68bb3dfa8e
3,jan_2002,Where were the auditors? People ask that quest...,"auditors, corporate collapse, accounting firms...","n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, e...","f, f, f, f, f, f, c, f, f, f, f, c, f, f, f, f...",0a8103e6-f450-4e66-a82b-d53f320136fd
4,jan_2002,The full story of the Enron debacle--and what ...,"Enron, debacle, Andersen, audit, crisis, accou...","n, n, n, e, n, n, e, n, n, e, e, e, n, e, e, e...","c, c, f, f, f, f, f, f, f, f, f, f, c, f, f, f...",b2ed16fd-eb0a-4b16-ab08-f6569d328d9a


In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)
MAX_LEN = 72

In [37]:
def tokenize_sent_split(review_text, MAX_LEN=MAX_LEN):
    """
    Tokenizes review. If beyond MAX_LEN, splits review
    into chunks at sentence boundaries where possible,
    returning list of tokenized chunks
    """
    tokens = tokenizer.tokenize(review_text)

    if len(tokens) > MAX_LEN:
        doc = nlp(review_text) # Use SpaCy because can identify sentence boundaries
        all_chunks = [] # Return groups of sentences that are below max_len
        chunk = []
        for sent in doc.sents:
            tokens = tokenizer.tokenize(sent.text) # Go back to BERT tokenizer because that's what the model is trained on
            if len(tokens) > MAX_LEN: # Really long single sentence (unlikely). Just split in the middle.
                all_chunks += [chunk]
                all_chunks += [tokens[:MAX_LEN]]
                chunk = tokens[MAX_LEN:]
            elif len(chunk) + len(tokens) < MAX_LEN: # Multiple sentences still below max_len.
                chunk += tokens
            else: # Adding another sentence would go past max_len.
                all_chunks += [chunk]
                chunk = tokens
        all_chunks += [chunk]
        return all_chunks
    
    else:
        return [tokens]

In [47]:
def get_tags(text, keywords, sentiments, index, chunk_max_length = MAX_LEN):
    '''
    Args:
        text: String of original text
        kws: String of comma seperated kws
        sents: String of comma seperated sentiment tags
        idx: text id, used to match chunks back together
        MAX_LEN: Where to chunk input beyond a certain number of tokens
        lower: Whether to lowercase the input
    '''

    chunks = tokenize_sent_split(text)

    possible = []
    observed = []
    
    keywords = keywords.lower()
    
    keywords = keywords.split(", ")
        
    sentiments = [sentiment.upper().strip() for sentiment in sentiments.split(",")]
    
    if len(sentiments) != len(keywords):
        print("Length mismatch - number of sentiments does not equal number of keywords")
        print(text, keywords, sentiments, index, "", sep="\n")
        return []
    
    for text_tokens in chunks:
        tags = np.array(["O" for token in text_tokens])
        sentiments_output = np.array(["O" for token in text_tokens])
        for i in range(len(keywords)):
            keyword = keywords[i]
            if keyword=='':
                continue
            else:
                keyword = tokenizer.tokenize(keyword)
                possible += [" ".join(keyword)]

                for j in range(len(text_tokens)-len(keyword)+1):
                    text_window = text_tokens[j:j+len(keyword)]
                    if text_window == keyword:
                        tags[j:j+len(keyword)] = "I"
                        tags[j] = "B"
                        sentiments_output[j] = sentiments[i]
                        observed += [" ".join(keyword)]
        
        tags = tags.tolist()
        sentiments_output = sentiments_output.tolist()
        output_keyword_tags_sentiments = [tags, text_tokens, sentiments_output]

    missing = set(possible) - set(observed)
    if missing:
        print("The following keywords were provided but not found in the text: ")
        print(index, missing)
        #print(text)
        print()
        
    return output_keyword_tags_sentiments

In [44]:
print(get_tags('The Enron scandal rocked Wall Street and sent stock prices of other energy companies plunging.', 'Enron, scandal, stock prices, energy companies', 'n, n, n, e', '6a345b'))

[['O', 'B', 'I', 'B', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'O', 'O', 'B', 'I', 'O', 'O'], ['the', 'en', '##ron', 'scandal', 'rocked', 'wall', 'street', 'and', 'sent', 'stock', 'prices', 'of', 'other', 'energy', 'companies', 'plunging', '.'], ['O', 'N', 'O', 'N', 'O', 'O', 'O', 'O', 'O', 'N', 'O', 'O', 'O', 'E', 'O', 'O', 'O']]


In [48]:
#test_df = data.iloc[:1]
data['tags'] = data.apply(lambda row: get_tags(row['section'], row['keyword'], row['sentiment'].lower(), row['section_id']), axis=1)
display(data.head())

The following keywords were provided but not found in the text: 
f5352f4c-c4d9-494c-b62f-202045b8cde8 {'security and exchange commission'}

The following keywords were provided but not found in the text: 
0c34ea4a-b5c9-483b-975f-bb92b50077c0 {"en ##ron ' s"}

The following keywords were provided but not found in the text: 
6822b7f0-90dc-46b3-a414-be60e741fe14 {"en ##ron ' s"}

The following keywords were provided but not found in the text: 
2b2d9d40-e4cd-4d08-bf4e-d1fea8373e1a {'forward', "en ##ron ' s"}

Length mismatch - number of sentiments does not equal number of keywords
Analyzing Enron can be deeply frustrating. “It’s very difficult for us on Wall Street with as little information as we have,” says Fleischer, who is a big bull. (The same is true for Enron’s competitors, but “wholesale operations” are usually a smaller part of their business, and they trade at far lower multiples.) “Enron is a big black box,” gripes another analyst. Without having access to each and every one of 

Unnamed: 0,date,section,keyword,sentiment,category,section_id,tags
0,jan_2002,Marie Thibaut spent 15 years as an administrat...,"Enron, 401(k), stock, shares, Enron, matched, ...","e, e, p, p, e, p, p, p, p, n, n, n, n, n","c, e, c, c, c, e, c, c, e, n, f, e, f, e",efc7958b-3425-453c-aec6-3e553e9a8d12,"[[O, O, O, O, O, O, O, O, O, O, O, B, I, O, B,..."
1,jan_2002,That sorry tale has been repeated thousands of...,"Enron, stocks, retirement money, shares, 401(k...","n, n, e, e, e, e, e, e, n, n, e, e","c, c, e, c, e, c, e, c, c, c, e, e",b0a4be0d-e3c0-4371-b709-c5d9a4d670af,"[[O, B, I, O, O, O, O, O, O, O, O, O, O, O, O,..."
2,jan_2002,Pension-reform advocates say a little paternal...,"employees, company stock, Enron, accountants, ...","e, e, n, n, e, n","e, c, c, f, e, c",a9c4cf47-9390-46c5-aaed-4c68bb3dfa8e,"[[O, O, O, O, O, O, B, I, O, O, O, O, O, O, O,..."
3,jan_2002,Where were the auditors? People ask that quest...,"auditors, corporate collapse, accounting firms...","n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, e...","f, f, f, f, f, f, c, f, f, f, f, c, f, f, f, f...",0a8103e6-f450-4e66-a82b-d53f320136fd,"[[O, O, O, O, O, O, O, B, I, I, O, O, O, O, O,..."
4,jan_2002,The full story of the Enron debacle--and what ...,"Enron, debacle, Andersen, audit, crisis, accou...","n, n, n, e, n, n, e, n, n, e, e, e, n, e, e, e...","c, c, f, f, f, f, f, f, f, f, f, f, c, f, f, f...",b2ed16fd-eb0a-4b16-ab08-f6569d328d9a,"[[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O,..."


In [49]:
data.to_clipboard()

In [None]:
### Notes for tomorrow

# Strip quotes out of both keywords and the article text
# Decide what to do about the contraction "Enron's" because it's marked as a keyword several times 
# but is never picked up as a keyword properly because of the apostrophe