In [1]:
import pandas as pd
import numpy as np
import re
import string
import time
import nltk
import spacy
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import stopwords
# import stanfordnlp # Run 'stanfordnlp.download('en')' once to be able to use the POS processor (size: 1.96 GB)
from spacy import displacy

# Create a neural pipeline with the Stanford POS processor (would be used to capitalize appropriate words for NER)
# If you change the default dir, make sure you specify the models dir in the command below (if not you can leave it out)
# NOTE: the pipeline below is currently commented out, so `stf_nlp` is undefined and truecase_snlp()
# cannot be called until it is re-enabled (together with the stanfordnlp import).
# stf_nlp = stanfordnlp.Pipeline(
#     models_dir='/Users/baci/Documents/Fall 2021/CSCI 4308/Leeds Project/SNLP Resources', processors='tokenize,mwt,pos')

# Load spaCy's small English model for Named Entity Recognition and add the 'merge_entities' pipe
# so that entities are merged (the name NER is used later to process each document)
NER = spacy.load('en_core_web_sm')
NER.add_pipe('merge_entities')

# Cache the English stopword list as a set so membership tests are faster
stop_words = set(stopwords.words('english'))

**Read in Data Files**

In [4]:
# Raw TV show closed-captioning text (one CSV file; path is relative to the notebook's working directory)
df_corpora = pd.read_csv('../../Data/CorporaData/2012/CNBC.Text.2012.1.csv')
# Replace the column labels with the names used by every later cell
# (presumably the CSV has exactly three columns — confirm against the data file)
df_corpora.columns = ['URL', 'Title', 'RawText']

**Functions To Truecase Words In The Text**

In [51]:
# True case text using the stanfordnlp POS processor (not used yet)
def truecase_snlp(row):
    """Capitalize selected words of ``row`` based on the stanfordnlp pipeline's POS output.

    The text is run through the module-level ``stf_nlp`` pipeline; every word whose
    ``upos`` is "PROPN" or "NNS" is capitalized, all other words are kept as returned
    by the pipeline, and the words of all sentences are joined with single spaces.

    NOTE(review): "NNS" looks like a Penn Treebank tag rather than a universal POS
    value, so it may never match ``upos`` — confirm which tag set was intended.
    NOTE(review): ``stf_nlp`` is only created by a commented-out line in this
    notebook, so calling this function as-is raises NameError.

    Parameters
    ----------
    row : str
        Text to truecase.

    Returns
    -------
    str
        The space-joined words with the selected ones capitalized.
    """
    doc = stf_nlp(row)
    pieces = []
    for sentence in doc.sentences:
        for token in sentence.words:
            if token.upos in ["PROPN", "NNS"]:
                pieces.append(token.text.capitalize())
            else:
                pieces.append(token.text)
    return ' '.join(pieces)

# True case text using nltk POS tagger
def truecase_nltk(row):
    """Truecase ``row`` with the nltk tokenizer and POS tagger.

    Every token is lower-cased before tagging; tokens tagged "NN" or "NNS" are
    capitalized, all other tokens stay lower-cased, and the first token is always
    capitalized. The tokens are joined with single spaces and the space that the
    join leaves in front of ``. , ' ! ? : ;`` is removed again.

    Parameters
    ----------
    row : str
        Text to truecase.

    Returns
    -------
    str
        The truecased text, or an empty string when ``row`` contains no tokens.
    """
    words = nltk.word_tokenize(row)
    if not words:
        # Nothing to capitalize; indexing the first word below would raise IndexError
        return ''
    tagged_words = nltk.pos_tag([word.lower() for word in words])
    # Capitalize based on POS tags
    capitalized_words = [w.capitalize() if t in ("NN", "NNS") else w for (w, t) in tagged_words]
    # Capitalize first word in sentence
    capitalized_words[0] = capitalized_words[0].capitalize()
    # Raw string keeps the same pattern without relying on an invalid "\." string escape
    return re.sub(r" (?=[\.,'!?:;])", "", ' '.join(capitalized_words))

**<span style="font-size:2em;">Gather Topics Included in Text Segment</span>**

**Function to generate tuples from list of Topics**

In [52]:
def generate_tuple(row):
    """Build (Entity, Frequency) tuples from a list of topic strings.

    Each item is split on whitespace; for every token that is numeric, a tuple is
    emitted whose first element is the part of the item that precedes that token
    (minus the one separating character) and whose second element is the token
    itself, kept as a string.

    The position of each token is tracked while scanning, so a numeric token that
    also occurs inside an earlier word (e.g. "3M 3") or that is the very first
    token no longer produces a wrongly sliced entity.

    Parameters
    ----------
    row : iterable of str
        Topic strings, e.g. ``['APPLE INC 5', 'GOOGLE 3']``.

    Returns
    -------
    list of tuple(str, str)
        One ``(entity, frequency)`` tuple per numeric token found.
    """
    list_of_tuples = []
    for item in row:
        cursor = 0
        for token in item.split():
            # Locate this exact token, searching only past the previous one
            token_start = item.find(token, cursor)
            cursor = token_start + len(token)
            if token.isnumeric():
                # Take everything up to the number as a single string (dropping the separator);
                # clamp at 0 so a leading number yields an empty entity instead of slicing with -1
                entity_end = max(token_start - 1, 0)
                list_of_tuples.append((item[:entity_end], token))  # Create a tuple (Entity, Frequency)
    return list_of_tuples

**Generate list of Topics mentioned from the Raw Text and create tuples from them**

In [53]:
# Marker that precedes the '; '-separated topic listing inside the raw text
TOPICS_MARKER = 'TOPICS: TOPIC FREQUENCY '


def _split_topic_items(text):
    """Return the '; '-separated pieces that follow TOPICS_MARKER in ``text``.

    Returns an empty list when the marker is absent. (Slicing from the result of
    ``str.find`` would otherwise start at an arbitrary offset of unrelated text,
    because ``find`` returns -1 when nothing is found.)
    """
    _, marker, listing = text.partition(TOPICS_MARKER)
    return listing.split('; ') if marker else []


# Grab topic and frequency, then create an (Entity, Frequency) tuple from each
df_corpora['Topics'] = df_corpora['RawText'].apply(_split_topic_items).apply(generate_tuple)

**<span style="font-size:2em;">Preprocessing and Cleaning (for Spacy)</span>**

**Remove Title and HTML Tags**

In [None]:
# Remove title (removes everything before and including [[TITLE.END]]).
# Works on df_corpora and writes 'RawText_Preprocessed', the frame and column name
# that the truecasing and NER cells further down read.
TITLE_END_MARKER = '[[TITLE.END]] '


def _strip_title(text):
    """Return ``text`` without its title prefix; text lacking the marker is returned unchanged."""
    _, marker, body = text.partition(TITLE_END_MARKER)
    return body if marker else text


df_corpora['RawText_Preprocessed'] = df_corpora['RawText'].apply(_strip_title)

# Strip html using regex
# NOTE(review): the regex runs on each whitespace-separated word, so a tag that contains a
# space (e.g. one with attributes) is not matched — confirm whether such tags occur in the data.
df_corpora['RawText_Preprocessed'] = df_corpora['RawText_Preprocessed'].apply(
    lambda text: ' '.join(re.sub('<[^<]+?>', '', word) for word in text.split()))

**True Case Text to improve accuracy in Spacy's NER (stanfordnlp)**  
Purpose: spaCy's NER recognizes lower-cased entities less reliably, so the text is truecased first (run this before removing punctuation)

In [55]:
# start = time.time()
# df_corpora['RawText_Preprocessed'] = df_corpora['RawText_Preprocessed'].apply(
#     lambda text: truecase_snlp(text))
# end = time.time()
# print('Time taken to run snlp POS processor on text: {} seconds'.format(end - start))

**True Case Text (nltk)**

In [56]:
# Truecase every preprocessed document with the nltk-based helper and report the wall-clock time
start = time.time()
df_corpora['RawText_Preprocessed'] = df_corpora['RawText_Preprocessed'].apply(truecase_nltk)
end = time.time()
print(f'Time taken to run nltk POS tagger on text: {end - start} seconds')

Time taken to run nltk POS tagger on text: 149.37974905967712 seconds


**Remove Punctuation and Stopwords**

In [None]:
# Define which punctuation to remove so as to not mess with timestamps yet (i.e. dont remove square brackets or colons)
# Raw string: same characters as before, without relying on the invalid "\," string escape
punctuation = r'''(){};'"\,<>/@#%^&*_~'''
# Build the translation table once instead of once per word
punctuation_table = str.maketrans('', '', punctuation)
# May or may not use this depending on whether or not it affects spacy NER accuracy
# (operates on df_corpora['RawText_Preprocessed'], the column the NER cell reads)
df_corpora['RawText_Preprocessed'] = df_corpora['RawText_Preprocessed'].apply(
    lambda text: ' '.join(word.translate(punctuation_table) for word in text.split()))

# Remove stopwords; a lower-case 'am' is always kept
# (presumably so that AM/PM time markers survive — confirm)
df_corpora['RawText_Preprocessed'] = df_corpora['RawText_Preprocessed'].apply(
    lambda text: ' '.join(word for word in text.split() if word.lower() not in stop_words or word in ['am']))

**<span style="font-size:2em;">Spacy Named Entity Recognition</span>**

**Generating Spacy Named Entity Recognition Output (with merged entities)**

In [58]:
# Run the spaCy pipeline on every preprocessed document and keep a (text, label) pair
# for each detected entity; the elapsed wall-clock time is printed afterwards
start = time.time()
df_corpora['Spacy_Entities'] = df_corpora['RawText_Preprocessed'].apply(
    lambda document: list(map(lambda ent: (ent.text, ent.label_), NER(document).ents)))
end = time.time()
print(f'Time to detect spacy entities: {end - start} seconds')

Time to detect spacy entities: 1324.9350650310516 seconds


**Organizing Spacy Output**

In [59]:
# With Merged Entities: split the (text, label) pairs into one column per entity family
# Organizations -> label 'ORG'
df_corpora['NER_Organizations'] = df_corpora['Spacy_Entities'].apply(
    lambda pairs: [name for name, label in pairs if label == 'ORG'])

# People -> label 'PERSON'
df_corpora['NER_Persons'] = df_corpora['Spacy_Entities'].apply(
    lambda pairs: [name for name, label in pairs if label == 'PERSON'])

# Everything that is neither a person nor an organization
df_corpora['NER_Other'] = df_corpora['Spacy_Entities'].apply(
    lambda pairs: [name for name, label in pairs if label != 'PERSON' and label != 'ORG'])

**<span style="font-size:2em;">Save Condensed Dataframe To CSV</span>**

In [60]:
# Write the dataframe (all columns, without the index) to CSV.
# NOTE(review): the input file read earlier is 'CNBC.Text.2012.1.csv', but this output is named
# 'Clean.Bloomberg.Text.2021.1.csv' — confirm the network/year in the file name is intended.
df_corpora.to_csv('../../Data/CleanData/Clean.Bloomberg.Text.2021.1.csv', index=False)