In [36]:
import pandas as pd
from random import random, seed
from stop_words import get_stop_words

In [37]:
# Fix the RNG state so repeated runs generate the same name variations.
seed(15)

# Probability knobs for the name-variation generator. Each value is compared
# against a draw from random().
lc_threshold = 0.25          # case draw below this -> name is fully lowercased
formcase_threshold = 0.5     # width of the next band of the case draw -> case_formalize()
capitalize_threshold = 0.5   # inside case_formalize: draw above this -> str.capitalize()
suffix_threshold = 0.4       # separate draw below this -> a trailing suffix may be removed

In [18]:
# English stop-word list from the stop_words package; case_formalize checks
# tokens against it to decide which words are lowercased instead of capitalized.
stopwords = get_stop_words('en')

In [19]:
# Load the company table and collect each distinct company name once.
filepath = 'corpwatch_api_tables_csv/companies.csv'
# The file is parsed as tab-separated despite its .csv extension.
df = pd.read_csv(filepath, delimiter='\t')
# read_csv converts blank cells (and markers such as "NA") to NaN, which is a
# float; drop those so every entry is a str and the later .lower() /
# .capitalize() calls cannot fail on a missing name.
unique_companies = df['company_name'].dropna().unique().tolist()

In [30]:
# Trailing tokens that may be removed from a company name. Matching is done on
# the lowercased name, so every entry is normalised to lowercase. Repeats are
# skipped ('CO' and 'Co' both become 'co') while first-seen order is kept: the
# first matching suffix wins, so '& co' has to be tried before 'co'.
removable_suffixes = []
for raw_suffix in ['& CO', 'corp', 'CO', 'LTD', 'INC', 'Co', 'LLC']:
    lowered_suffix = raw_suffix.lower()
    if lowered_suffix not in removable_suffixes:
        removable_suffixes.append(lowered_suffix)

print(removable_suffixes)

In [34]:
# Creates case variations for COMPANY NAMES
def case_formalize(company, stop_words=None, threshold=None):
    """Return a randomly chosen "formal" casing of ``company``.

    A single draw from ``random()`` picks the style:

    * draw > ``threshold``: ``str.capitalize()`` is applied to the whole name
      (first character upper-cased, the rest lower-cased);
    * otherwise: every whitespace-separated token is capitalized, except stop
      words that are not the first token, which are lower-cased.

    Args:
        company: Company name as a string.
        stop_words: Optional collection of stop words to use instead of the
            notebook-level ``stopwords`` list. Entries are expected to be
            lowercase.
        threshold: Optional cut-off for the draw, used instead of the
            notebook-level ``capitalize_threshold``.

    Returns:
        The re-cased name. In the per-token branch, runs of whitespace collapse
        to single spaces because the name is split and re-joined.
    """
    # Resolve the notebook-level defaults at call time so re-running the
    # configuration cells takes effect without redefining this function.
    if stop_words is None:
        stop_words = stopwords
    if threshold is None:
        threshold = capitalize_threshold

    if random() > threshold:
        return company.capitalize()

    # Compare the lowercased token: source names are frequently upper-case, so
    # an exact match against a lowercase stop-word list would never succeed.
    # (Assumes the stop-word list holds lowercase entries.)
    return ' '.join(
        token.lower() if index > 0 and token.lower() in stop_words else token.capitalize()
        for index, token in enumerate(company.split())
    )
    

In [38]:
# Build one randomly varied spelling per company name.
# NOTE: the order of the random() draws (case draw, the draw inside
# case_formalize when it is called, then the suffix draw) determines what a
# given seed produces, so keep that order when editing this loop.
processed_companies = []
for company in unique_companies:

    # One draw selects the casing:
    #   [0, lc_threshold)                                -> all lowercase
    #   [lc_threshold, lc_threshold + formcase_threshold) -> case_formalize()
    #   remainder                                         -> name left as-is
    chance = random()
    if chance < lc_threshold:
        ncompany = company.lower()
    elif chance < lc_threshold + formcase_threshold:
        ncompany = case_formalize(company)
    else:
        ncompany = company

    # A second draw decides whether to try removing a trailing suffix.
    if random() < suffix_threshold:
        lowered = ncompany.lower()
        for suffix in removable_suffixes:
            # Require a space in front of the suffix so only a whole trailing
            # word is removed. A bare endswith(suffix) also matches names that
            # merely end in those letters (e.g. 'Costco', 'Zinc') and would cut
            # into the word itself.
            if lowered.endswith(' ' + suffix):
                # Drop the suffix and its separator, plus any extra whitespace
                # left behind by names with repeated spaces.
                ncompany = ncompany[:-(len(suffix) + 1)].rstrip()
                break

    processed_companies.append(ncompany)



In [39]:
# Preview only the first entries; echoing the whole list floods the notebook output.
processed_companies[:40]

['CORPORATE INCOME FUND SEVENTY NINTH SHORT TERM SERIES',
 'K Tron International Inc',
 'aar corp',
 'Abbott laboratories',
 'abel noser',
 'Abercrombie & Fitch',
 'aberdeen idaho mining co',
 'ABRAHAM & CO INC',
 'merchanthouse securities inc',
 'Abrams Industries Inc',
 'Worlds Com Inc',
 'Ametech inc',
 'Accel International Corp',
 'ACE HARDWARE CORP',
 'ACETO',
 'acmat',
 'ACME ELECTRIC CORP',
 'Acme United Corp',
 'LIBERTY ACORN TRUST',
 'affiliated computer services',
 'Adams resources & energy inc',
 'Relm wireless',
 'ADAMS EXPRESS',
 'prospera financial services',
 'ADMINISTRATIVE DATA MANAGEMENT',
 'ADMINISTRATIVE SYSTEMS',
 'Advanced Computer Techniques Corp',
 'ADVANCED DIGITAL SYSTEMS INC',
 'advanced micro devices inc',
 'Alliance gaming corp',
 'Aei Securities',
 'AERO SYSTEMS ENGINEERING INC',
 'aeroflex inc',
 'Aerosonic',
 'ing vp bond portfolio',
 'Ing Vp Money Market Portfolio',
 'Ing variable funds',
 'afa protective systems',
 'Lord abbett affiliated fund',
 'Lumi

In [41]:
# Persist the variations, one name per line. The encoding is pinned so that
# non-ASCII names are written the same way regardless of the platform's default
# encoding. An empty list produces an empty file rather than a lone blank line.
with open('companies_list.txt', 'w', encoding='utf-8') as f:
    f.writelines(name + '\n' for name in processed_companies)