In [1]:
import pandas as pd
import numpy as np
import re
import collections
import os
import time

**<span style="font-size:2em;">Read in Cleaned Firm Name Data</span>**

**CRSP Firm Name Data (for testing)**

In [2]:
# Load the cleaned firm-name table. Both lists below follow the row order of
# this sheet, so position i in one refers to the same row as position i in
# the other.
firms_df = pd.read_excel('../Data/FirmNameData/cleaned_firms.xlsx')
# Lower-cased cleaned names: the keys that organizations are matched against.
all_firms = list(map(lambda firm: str(firm).lower(), firms_df['Cleaned_Name'].tolist()))
# Original names as strings, reported next to each matched key.
all_firms_og = list(map(str, firms_df['Name'].tolist()))

**<span style="font-size:2em;">Helper Functions</span>**

**Function to Generate Tuples From Topic Data**

In [3]:
def generate_tuple(row):
    """Build (topic, frequency) tuples from a list of topic strings.

    Each item in ``row`` is split on whitespace. For every token that is
    numeric (``str.isnumeric``), one tuple is emitted: its first element is
    the text of the item before that token, minus the single separator
    character preceding it, and its second element is the token itself
    (kept as a string).

    Parameters
    ----------
    row : iterable of str
        Topic strings such as ``'STOCK MARKET 12'``.

    Returns
    -------
    list of tuple(str, str)
        ``(topic, frequency)`` pairs in the order encountered. Items without
        a numeric token contribute nothing.
    """
    list_of_tuples = []
    for item in row:
        cursor = 0  # search offset: end of the previously located token
        for token in item.split():
            # Locate the token at its own position. Searching from the start
            # of the item would stop at an earlier word that merely contains
            # the same digits (e.g. '19' inside 'COVID19').
            start = item.find(token, cursor)
            cursor = start + len(token)
            if token.isnumeric():
                # Drop the one separator character before the number; clamp
                # at 0 so a leading number yields '' instead of slicing with
                # a negative index.
                topic = item[:max(start - 1, 0)]
                list_of_tuples.append((topic, token))  # (Topic, Frequency)
    return list_of_tuples

**Row-Wise Function That Matches spaCy Organizations to Firms (1 to 1)**

In [4]:
def add_hits(row, firms):
    """Return the organizations in ``row`` that appear in ``firms``.

    ``row`` is a ';'-separated string of organization names. Each name is
    lower-cased and kept when it is a member of ``firms``. The kept names are
    joined with ', '; when none is kept the string 'No Matches' is returned.
    """
    lowered = (candidate.lower() for candidate in row.split(';'))
    matched = [name for name in lowered if name in firms]
    return ', '.join(matched) if matched else 'No Matches'

In [5]:
def add_ogs(row, firms, firms_og):
    """Annotate each matched firm name with its original name.

    Parameters
    ----------
    row : str
        Comma-separated matched names (the format returned by ``add_hits``),
        or the sentinel ``'No Matches'``.
    firms : list of str
        Lookup names; each matched name is located with ``list.index``.
    firms_og : list of str
        Original names, indexed by the position found in ``firms``.

    Returns
    -------
    str
        ``'name (Original Name)'`` entries joined by ``', '``, or
        ``'No Matches'`` when the sentinel is encountered.

    Raises
    ------
    ValueError
        If a name in ``row`` is not present in ``firms``.
    """
    org_hits = []
    for org in row.split(','):
        if org == 'No Matches':
            return 'No Matches'
        # Pieces after the first carry the space from the ', ' separator.
        # Strip it once and use the stripped name for both the lookup and the
        # label, so the re-joined output is not double-spaced.
        name = org.lstrip()
        org_hits.append('{} ({})'.format(name, firms_og[firms.index(name)]))
    return ', '.join(org_hits)

**<span style="font-size:2em;">Create Tuples of Corpora Files and spaCy Files</span>**

**Define Path to Files**

In [6]:
# Root folder whose sub-folders contain the corpora files.
# NOTE: the file-matching cell splits the joined path on '/' and takes a fixed
# index, so it depends on the depth of this path.
corpora_data = '../Data/CorporaData'
# Folder containing the metadata files that are paired with corpora files by
# a key derived from the corpora file name.
meta_data = '../Data/MetaData'

**Iterate through Directories and Match Corresponding Files**

In [7]:
# Pair every corpora file with each metadata file whose name contains the same
# search key. Each entry is (key with '.' replaced by '_', corpora path,
# metadata path).
file_tuples = []
# The metadata listing does not change during the walk, so read it once.
meta_data_files = os.listdir(meta_data)
for folder in os.listdir(corpora_data):
    folder_path = os.path.join(corpora_data, folder)
    # Only directories can be listed; this also skips stray files such as
    # '.DS_Store' in the corpora root.
    if folder == '.DS_Store' or not os.path.isdir(folder_path):
        continue
    for corpora_file in os.listdir(folder_path):
        if corpora_file in ('.DS_Store', '.ipynb_checkpoints'):
            continue
        tv_corpora = os.path.join(folder_path, corpora_file)
        # The key comes from the file name itself. Using it directly, rather
        # than re-splitting the joined path at a fixed index, keeps this
        # independent of the root path's depth and of the path separator.
        network = corpora_file
        name_parts = network.split('.')
        # Key = first and third dot-separated parts of the file name.
        search_key = '.'.join((name_parts[0], name_parts[2]))
        for meta_data_file in meta_data_files:
            if search_key in meta_data_file:
                file_tuples.append((
                    search_key.replace('.', '_'),
                    tv_corpora,
                    os.path.join(meta_data, meta_data_file),
                ))

**<span style="font-size:2em;">Functions to Match Entities in Files</span>**  

**Function to Merge spaCy and Corpora Data**

In [8]:
def _split_topic_items(raw_text, marker='TOPICS: TOPIC FREQUENCY '):
    """Return the '; '-separated pieces that follow ``marker`` in ``raw_text``.

    ``raw_text`` is converted with ``str`` first. When the marker is absent an
    empty list is returned, so text without a topics section yields no topics
    instead of being sliced at an unrelated offset.
    """
    text = str(raw_text)
    start = text.find(marker)
    if start == -1:
        return []
    return text[start + len(marker):].split('; ')


def merge(corpora, spacy):
    """Combine a corpora frame and an entity frame on their shared 'Title'.

    Parameters
    ----------
    corpora : pandas.DataFrame
        Three columns, renamed in place to 'URL', 'Title', 'RawText'. A
        'Topics' column is added in place.
    spacy : pandas.DataFrame
        Four columns, renamed in place to 'Title', 'Organizations',
        'Persons', 'Other'. The last three are rewritten in place.

    Returns
    -------
    pandas.DataFrame
        Result of ``pd.merge(corpora, spacy, on='Title')`` (pandas' default
        join type).

    Notes
    -----
    Both input frames are modified; pass copies if the originals are needed.
    """
    # Format Dataframes
    corpora.columns = ['URL', 'Title', 'RawText']
    spacy.columns = ['Title', 'Organizations', 'Persons', 'Other']
    # Grab the topic/frequency section and pass it to generate_tuple()
    corpora['Topics'] = corpora['RawText'].apply(_split_topic_items).apply(generate_tuple)
    # Clean entity data: values are stringified and underscores become spaces
    for column in ('Organizations', 'Persons', 'Other'):
        spacy[column] = spacy[column].apply(lambda value: str(value).replace('_', ' '))

    return pd.merge(corpora, spacy, on='Title')

**Match Data and Record**

In [9]:
def match(dataframe, year, working_dir, firms=all_firms, firms_og=all_firms_og):
    """Match each row's organizations against the firm list and save the result.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide 'URL' and 'Organizations' columns; 'Organizations' holds
        ';'-separated names (the format ``add_hits`` splits on).
    year : str
        Prefix of the output file name ``<year>_hits.csv``.
    working_dir : str
        Directory the CSV is written to.
    firms : list of str, optional
        Lookup names to match against. Defaults to ``all_firms``.
    firms_og : list of str, optional
        Original names, positionally aligned with ``firms``. Defaults to
        ``all_firms_og``. Pass it together with a custom ``firms``.

    Returns
    -------
    pandas.DataFrame
        Columns 'URL', 'Matched Organizations', 'Hits', 'Hits Frequency'.
        The same frame is written to ``<working_dir>/<year>_hits.csv``.
        A row without matches has 'No Matches' in the middle two columns, and
        its frequency counter holds that sentinel once.
    """
    final_df = pd.DataFrame({'URL': dataframe['URL']})
    # Match Data (using the firm lists passed in, not the module globals)
    final_df['Matched Organizations'] = dataframe['Organizations'].apply(
        lambda list_of_orgs: add_hits(list_of_orgs, firms))
    final_df['Hits'] = final_df['Matched Organizations'].apply(
        lambda list_of_hits: add_ogs(list_of_hits, firms, firms_og))
    # Compute Frequency for each entity. Count the ', '-separated entries of
    # the whole string (indexing the string first would count one character).
    # Entries are stripped so stray spaces around a separator do not create
    # distinct keys.
    final_df['Hits Frequency'] = final_df['Hits'].apply(
        lambda hits: collections.Counter(hit.strip() for hit in hits.split(', ')))
    # Save File
    final_df.to_csv(os.path.join(working_dir, year + '_hits.csv'), index=False)
    return final_df

**Iterate through files and Call Necessary Functions**

In [10]:
# Process every (key, corpora path, metadata path) entry: merge the two CSV
# files, match organizations to firms, and write the result under
# '../Output/<first part of key>/'.
for tpl in file_tuples:
    key, corpora_path, meta_data_path = tpl
    # Set up working directory; the key has the form '<name>_<year>'.
    key_parts = key.split('_')
    working_dir = '../Output/' + key_parts[0]
    year = key_parts[1]
    # Creates missing parent folders too and is a no-op when the directory
    # already exists.
    os.makedirs(working_dir, exist_ok=True)
    # Call functions
    merged = merge(pd.read_csv(corpora_path), pd.read_csv(meta_data_path))
    # stats holds match()'s return value for the most recently processed entry.
    stats = match(merged, year, working_dir)