In [23]:
import pandas as pd
import numpy as np
import os

**Define Paths to Output Folders**

In [24]:
# Root folders of the two result sets compared in this notebook: `data` feeds
# the "Spacy Match" counts and `raw_data` feeds the "Raw Match" counts.
data, raw_data = '../../Output', '../../OutputRaw'

**Define Helper Function to Assemble Lists of Files**

In [25]:
def get_files(path):
    """Collect the paths of every file stored one folder level below ``path``.

    Parameters
    ----------
    path : str
        Directory whose sub-folders contain the files to list.

    Returns
    -------
    list of str
        ``path/<folder>/<file>`` for each file found, ordered by folder name
        and then by file name.

    Notes
    -----
    Top-level entries named ``.DS_Store``, ``FirmNameStats.txt``,
    ``OutputStats.txt`` or ``.ipynb_checkpoints`` are skipped, as is any other
    top-level entry that is not a directory. Inside each folder, entries named
    ``.DS_Store`` or ``.ipynb_checkpoints`` are skipped.
    """
    skipped_folders = {'.DS_Store', 'FirmNameStats.txt', 'OutputStats.txt', '.ipynb_checkpoints'}
    skipped_files = {'.DS_Store', '.ipynb_checkpoints'}

    ret = []
    # os.listdir returns entries in arbitrary order; sorting makes the result
    # reproducible across runs and machines.
    for folder in sorted(os.listdir(path)):
        folder_path = os.path.join(path, folder)
        # Stray files next to the folders would make the inner listdir raise
        # NotADirectoryError, so only real directories are descended into.
        if folder in skipped_folders or not os.path.isdir(folder_path):
            continue
        for file in sorted(os.listdir(folder_path)):
            if file not in skipped_files:
                ret.append(os.path.join(folder_path, file))
    return ret

**<span style="font-size:2em;">Helper Functions</span>**

**Function to Get Unique Firms from each file**

In [26]:
def get_list(org_list):
    """Return the distinct entries found in ``org_list``, in first-seen order.

    Parameters
    ----------
    org_list : iterable of str
        Strings that each hold one or more names separated by ``', '``.

    Returns
    -------
    list of str
        Every name obtained by splitting the strings on ``', '``, with
        repeats removed and the order of first appearance preserved.
    """
    # dict keys keep insertion order and give constant-time membership checks,
    # avoiding a scan of the result list for every entry.
    return list(dict.fromkeys(
        ent
        for string in org_list
        for ent in string.split(', ')
    ))

**Function That Produces a Spillover List (A - B)**

In [27]:
def find_difference(list1, list2):
    """Return the items of ``list1`` that do not occur in ``list2``.

    The order of ``list1`` is kept, and an item repeated in ``list1`` is
    repeated in the result.
    """
    return [candidate for candidate in list1 if candidate not in list2]

**Function That Returns a List of Firm Names with Two or More Words (e.g., Under Armour)**

In [28]:
def get_bigrams(spill):
    """Select the names in ``spill`` made of more than one whitespace-separated word.

    Returns a new list; the order of ``spill`` is preserved.
    """
    def has_several_words(name):
        return len(name.split()) >= 2

    return list(filter(has_several_words, spill))

**<span style="font-size:2em;">Iterate Through File List and Compare Differences in Matches</span>**

**Get Lists of Output Files**

In [29]:
# Build both file lists with the same helper so the two folders are filtered
# by identical rules.
raw_files, files = get_files(raw_data), get_files(data)

**Iterate Through and Produce Lists of Unique Firm Names across each Output Folder**

In [30]:
def _firms_per_file(paths):
    """Return one list of unique firm names for each CSV file in ``paths``."""
    per_file = []
    for csv_path in paths:
        matched = pd.read_csv(csv_path)['Matched Organizations']
        # Drop empty cells before converting: str(NaN) is 'nan', which would
        # otherwise be counted as a firm name.
        per_file.append(get_list([str(org) for org in matched.dropna()]))
    return per_file


# Each folder is read on its own. Indexing both file lists with one counter
# raised IndexError when the raw folder held fewer files and silently skipped
# the extra raw files when it held more.
og_list = _firms_per_file(files)
raw_list = _firms_per_file(raw_files)

**Remove Duplicates From Both Lists**

In [31]:
def _unique_in_order(groups):
    """Flatten ``groups`` (a list of lists), keeping only each item's first occurrence."""
    # dict keys preserve insertion order and make the duplicate check
    # constant-time instead of a scan over everything collected so far.
    return list(dict.fromkeys(item for group in groups for item in group))


# Merge the per-file name lists into one duplicate-free list per run.
raw = _unique_in_order(raw_list)
ogs = _unique_in_order(og_list)

print('Unique Firms Found in Raw Match: {}\nUnique Firms Found in Spacy Match: {}'.format(len(raw), len(ogs)))

Unique Firms Found in Raw Match: 3214
Unique Firms Found in Spacy Match: 1273


**Find The Firm Names that Appear in Raw Match, but not in Spacy Match**

In [32]:
# Names present in `raw` that are absent from `ogs`.
spillover = find_difference(list1=raw, list2=ogs)

**Get Bigram and Unigram Firm Names From Spillover List**  

In [33]:
bgs = get_bigrams(spillover)  # These multi-word firms are much less likely to be confused with anything other than a company
# Filter rather than subtract sets so `unigrams` keeps the order of
# `spillover`; a list built from a set of strings changes order between runs.
multi_word = set(bgs)
unigrams = [firm for firm in dict.fromkeys(spillover) if firm not in multi_word]
# Guard the ratio: an empty spillover list would otherwise raise ZeroDivisionError.
bigram_share = round(len(bgs) * 100 / len(spillover), 1) if spillover else 0.0
print('Percentage of Spillover List that are Two-Word Firm Names: {}%'.format(bigram_share))

Percentage of Spillover List that are Two-Word Firm Names: 67.4%
