In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import string
import os
from os.path import exists
import collections
import time

**<span style="font-size:2em;">Read in Cleaned Firm Name Data</span>**

**CRSP Firm Name Data (for testing)**

In [2]:
# Load the cleaned firm-name table; the next cell reads its 'done', 'regexes'
# and 'names.firms.1' columns to build the list of search terms.
firms_df = pd.read_csv('../../Data/Diego/conames_clean.csv')

In [3]:
# Build the search-term list from rows flagged done == 1: use the hand-written
# regex when one is present (a string), otherwise fall back to the firm name.
all_firms = [
    regex if isinstance(regex, str) else fallback_name
    for done_flag, regex, fallback_name in zip(
        firms_df['done'], firms_df['regexes'], firms_df['names.firms.1'])
    if done_flag == 1
]

**<span style="font-size:2em;">Helper Functions</span>**

**Function to Generate Tuples From Topic Data**

In [4]:
def generate_tuple(row):
    """Convert topic entries into (topic, frequency) tuples.

    Parameters
    ----------
    row : iterable of str
        Topic entries, each a topic followed by a numeric frequency,
        e.g. ``'ELECTIONS 12'``.

    Returns
    -------
    list of tuple of (str, str)
        One ``(topic, frequency)`` pair for every whitespace-separated numeric
        token. ``topic`` is the text of the entry before that token (minus the
        single separating character) and ``frequency`` is the token itself,
        kept as a string.
    """
    list_of_tuples = []
    for item in row:
        cursor = 0  # end position of the previously located token
        for token in item.split():
            # Locate this exact token, searching only past the previous one.
            # A plain item.find(token) would match the first occurrence of the
            # digits anywhere in the entry (e.g. the "19" inside "COVID19"),
            # cutting the topic in the wrong place.
            start = item.find(token, cursor)
            cursor = start + len(token)
            if token.isnumeric():
                # Take everything up to the number as a single string; max()
                # keeps a number at the very start from producing a -1 slice.
                list_of_tuples.append((item[:max(start - 1, 0)], token))  # (Topic, Frequency)
    return list_of_tuples

**Row-Wise Function that matches Firm Names in Raw Data (1 to 1)**

The matching function was updated because some text preprocessing was removed from RawText: the segment markers now appear as `[[TIME.START]]` and `[[TIME.END]]` instead of `TIME START` and `TIME END`.

In [5]:
# def add_hits(row, firm_list):
#     loi = []
#     for segment in str(row).split('[[TIME.START]]'):
#         for firm in firm_list:
#             if ' {} '.format(firm.strip()) in segment:
#                 loi.append((firm.split('(')[0].strip(), segment.split('[[TIME.END]]')[0].strip(), segment.lower().count(' {} '.format(firm.strip()))))
#     if not loi:
#         return 'No Hits'
#     else:
#         return ', '.join(['{}/{}/{}'.format(tpl[0], tpl[1], tpl[2]) for tpl in loi])

In [6]:
# Compiled search patterns keyed by firm entry, shared across add_hits calls so
# each pattern is built once rather than once per segment of every row.
_FIRM_PATTERN_CACHE = {}


def _firm_pattern(firm):
    """Return the compiled, case-insensitive search pattern for one firm entry."""
    pattern = _FIRM_PATTERN_CACHE.get(firm)
    if pattern is None:
        if '*' in firm:
            # A '*' marks a prefix entry: anchor only the start of the word.
            source = r'\b{}'.format(firm.strip('*'))
        else:
            source = r'\b{}\b'.format(firm)
        # Match case-insensitively via the flag instead of lowercasing the
        # pattern text: lowercasing rewrites escapes such as \W, \S, \D and \B
        # into \w, \s, \d and \b, which changes what the pattern matches.
        pattern = re.compile(source, re.IGNORECASE)
        _FIRM_PATTERN_CACHE[firm] = pattern
    return pattern


def add_hits(row, firm_list):
    """Find firm mentions in each time-stamped segment of a transcript.

    Parameters
    ----------
    row : object
        Transcript text; converted with ``str()`` and split into segments on
        ``'[[TIME.START]]'``.
    firm_list : list of str
        Firm entries, interpreted as regular expressions. An entry containing
        ``'*'`` is matched as a word prefix; any other entry must match
        between word boundaries.

    Returns
    -------
    str
        ``'No Hits'`` when nothing matches, otherwise a ``', '``-joined list
        of ``firm/segment-time/count`` items, where the segment time is the
        text before ``'[[TIME.END]]'`` in the segment and count is the number
        of matches in that segment.
    """
    patterns = [(firm, _firm_pattern(firm)) for firm in firm_list]
    loi = []
    for segment in str(row).split('[[TIME.START]]'):
        timestamp = segment.split('[[TIME.END]]')[0].strip()
        for firm, pattern in patterns:
            # A single scan gives both "is there a hit" and the hit count.
            count = len(pattern.findall(segment))
            if count:
                loi.append((firm, timestamp, count))
    if not loi:
        return 'No Hits'
    return ', '.join('{}/{}/{}'.format(*hit) for hit in loi)

**Row-Wise Function that adds the Un-processed Firm Name to the Hits**

In [7]:
def add_ogs(row, firms, firms_og):
    """Append the un-processed firm name to every matched firm in ``row``.

    ``row`` is converted with ``str()`` and split on commas. Each piece is
    looked up (ignoring leading whitespace) in ``firms`` and the entry at the
    same position in ``firms_og`` is appended in parentheses. Returns
    ``'No Hits'`` as soon as a piece equal to ``'No Hits'`` is met; raises
    ``ValueError`` when a piece is not present in ``firms``.
    """
    labelled = []
    for piece in str(row).split(','):
        if piece == 'No Hits':
            return 'No Hits'
        original_name = firms_og[firms.index(piece.lstrip())]
        labelled.append('{} ({})'.format(piece, original_name))
    return ', '.join(labelled)

**<span style="font-size:2em;">Create tuples of Corpora Files and Spacy Files</span>**

**Define Path to Files**

In [8]:
# Root directory of the raw corpora; the listing cell below walks one level of
# sub-folders beneath it.
corpora_data = '../../Data/CorporaData'

**Iterate through Directories and Match Corresponding Files**

In [9]:
# Collect the path of every corpora file found one folder level below
# corpora_data. Entries are sorted because os.listdir returns them in
# arbitrary order, which would make the processing order vary between runs.
files = []
for folder in sorted(os.listdir(corpora_data)):
    folder_path = os.path.join(corpora_data, folder)
    # Skip macOS/Jupyter artefacts and any stray file sitting next to the
    # folders (os.listdir on a non-directory would raise).
    if folder in ('.DS_Store', '.ipynb_checkpoints') or not os.path.isdir(folder_path):
        continue
    for corpora_file in sorted(os.listdir(folder_path)):
        if corpora_file not in ('.DS_Store', '.ipynb_checkpoints'):
            files.append(os.path.join(folder_path, corpora_file))

**Process Raw Text for Search**

In [14]:
# Markers that delimit the sections of each RawText entry.
TOPICS_MARKER = 'TOPICS: TOPIC FREQUENCY '
TITLE_END_MARKER = '[[TITLE.END]] '


def _extract_topics(text):
    """Return the (topic, frequency) tuples listed after the topics marker, or [] when it is absent."""
    _, marker, topic_text = str(text).partition(TOPICS_MARKER)
    # Without the marker there is no topic list; slicing at find() + len()
    # would start from an arbitrary offset because find() returns -1.
    return generate_tuple(topic_text.split('; ')) if marker else []


def _extract_body(text):
    """Return the text after the title-end marker and before the topics marker."""
    body = str(text).partition(TOPICS_MARKER)[0]
    _, marker, after_title = body.partition(TITLE_END_MARKER)
    # Keep the text untouched when there is no title marker to strip.
    return after_title if marker else body


for file in files:
    start = time.time()
    # Derive the output name from the file name itself rather than from a fixed
    # position in the path: the index of the file name within the path changes
    # whenever the data directory is moved or nested differently.
    # Assumes names shaped like 'FOXNEWS.Text.2021.1.csv' (source.Text.year.part).
    name_parts = os.path.basename(file).split('.')
    source, year = name_parts[0], name_parts[2]
    out_dir = '../../OutputRaw/{}'.format(source)
    out_file = '{}/{}_{}_output.csv'.format(out_dir, source, year)
    # NOTE(review): the part number is not included in the output name, so
    # several parts for the same source and year would overwrite each other -
    # confirm this is intended.
    # Optional guard to skip files that were already processed:
    # if exists(out_file):
    #     continue
    os.makedirs(out_dir, exist_ok=True)
    df = pd.read_csv(file)
    df.columns = ['URL', 'Title', 'RawText']
    df['Topics'] = df['RawText'].apply(_extract_topics)
    df['RawText_preprocessed'] = df['RawText'].apply(_extract_body)
    del df['RawText']
    df.to_csv(out_file, index=False)
    end = time.time()
    print('File {} done in {} seconds'.format(file, end-start))

File ../Data/CorporaData/2021/FOXNEWS.Text.2021.1.csv done in 2.6922757625579834 seconds
File ../Data/CorporaData/2021/MSNBC.Text.2021.1.csv done in 3.061405897140503 seconds
File ../Data/CorporaData/2021/CNBC.Text.2021.1.csv done in 0.961961030960083 seconds
File ../Data/CorporaData/2021/FBC.Text.2021.1.csv done in 1.0513768196105957 seconds
File ../Data/CorporaData/2021/CNN.Text.2021.1.csv done in 2.3217687606811523 seconds
File ../Data/CorporaData/2021/Bloomberg.Text.2021.1.csv done in 0.8401088714599609 seconds


In [11]:
# Root directory of the preprocessed CSVs; the listing cell below walks one
# level of sub-folders beneath it.
output_path = '../../OutputRaw'

In [12]:
# Collect the path of every preprocessed output file found one folder level
# below output_path. Entries are sorted because os.listdir returns them in
# arbitrary order.
output_files = []
for folder in sorted(os.listdir(output_path)):
    folder_path = os.path.join(output_path, folder)
    # Skip macOS/Jupyter artefacts and any stray file sitting next to the
    # folders (os.listdir on a non-directory would raise).
    if folder in ('.DS_Store', '.ipynb_checkpoints') or not os.path.isdir(folder_path):
        continue
    for file in sorted(os.listdir(folder_path)):
        # '.DS_Store' must be excluded here as well, otherwise it is later
        # handed to pd.read_csv as if it were an output file.
        if file not in ('.DS_Store', '.ipynb_checkpoints'):
            output_files.append(os.path.join(folder_path, file))

**Match Data and Record**

In [15]:
for output_file in output_files:
    start = time.time()
    dataframe = pd.read_csv(output_file)

    # A file that already carries the match column was handled in an earlier
    # run; report it and move on.
    if 'Matched Organizations' in dataframe.columns:
        print(output_file)
        continue

    # Carry over the identifying columns, renaming 'Title' to 'Show Title'.
    final_df = pd.DataFrame({
        'URL': dataframe['URL'],
        'Show Title': dataframe['Title'],
        'Topics': dataframe['Topics'],
    })

    print('Matching...')
    final_df['Matched Organizations'] = dataframe['RawText_preprocessed'].apply(
        add_hits, args=(all_firms,))
    print('Done.')

    # The timer stops before the write, so the reported time excludes it.
    end = time.time()
    # The results replace the preprocessed file in place.
    final_df.to_csv(output_file, index=False)
    print('File {} done in {} seconds'.format(output_file, end-start))