In [None]:
# Project: Scrape peer lists from def14a files of S&P 500 companies
# Author: Yijing Yuan
# Date: 07/06/2020

In [1]:
import os
import re
import csv
import pandas as pd
from fuzzywuzzy import fuzz

In [78]:
# Load the Compustat constituent list (S&P1500 according to the original note)
company_info = pd.read_stata('C:\\Users\\yuanx\\Desktop\\Summer Research\\Compustat-Constituent-List-2008-2019.dta')
company_info_name = list(company_info['co_conm'])
company_info_cik = list(company_info['co_cik'])

# Unique raw names, alphabetically sorted
company_info_name_unique = sorted(dict.fromkeys(company_info_name))

# Dictionary of company name -> cik.
# The pairs are taken row by row from the data, so every name is paired with
# the cik found on its own row (first occurrence wins). Zipping the sorted
# unique names against the unsorted cik column would pair names with ciks
# belonging to other rows.
kic = {}
for raw_name, raw_cik in zip(company_info_name, company_info_cik):
    kic.setdefault(raw_name, raw_cik)


def _clean_sp_name(name):
    """Return `name` with share-class/status tags removed, whitespace
    collapsed and HOLDING/HOLDINGS abbreviated to HLDG."""
    # Get rid of "-CL A" and similar tags
    for tag in ('-CL A', '-CL B', '-OLD', '-REDH'):
        name = re.sub(tag, '', name)
    # Trailing "/XX" two-letter code and trailing "-OHIO"
    name = re.sub(r'\/[A-Z]{2}\s*$', '', name)
    name = re.sub(r'-OHIO$', '', name)
    # Get rid of repeated and trailing spaces
    name = re.sub(r'\s+', ' ', name)
    name = re.sub(r'\s+$', '', name)
    # Standardize "holding/holdings"
    name = re.sub(r'\bHOLDINGS\b', 'HLDG', name)
    name = re.sub(r'\bHOLDING\b', 'HLDG', name)
    return name


SP_names = []      # cleaned names, one per unique raw name (same order)
SP_stand = []      # cleaned names made of letters/whitespace only
SP_special = []    # cleaned names containing any other character
for raw_name in company_info_name_unique:
    cleaned = _clean_sp_name(raw_name)
    SP_names.append(cleaned)
    # Make the cleaned spelling resolvable in kic as well; raw-name keys that
    # already exist are left untouched.
    kic.setdefault(cleaned, kic[raw_name])
    # Note: the "+" inside the character class is a literal plus sign, so a
    # name containing "+" is still classified as standard.
    if re.search(r'[^a-zA-Z\s+]+', cleaned) is not None:
        SP_special.append(cleaned)
    else:
        SP_stand.append(cleaned)

print(len(SP_names))

2178


In [3]:
# Names that do not end with one of the suffixes INC / CORP / CO / LTD
SP_nonstand = [
    sp_name
    for sp_name in SP_names
    if re.search(r'(.*?)\s+(INC|CORP|CO|LTD)$', sp_name) is None
]
SP_nonstand

['99 CENTS ONLY STORES',
 'ABBOTT LABORATORIES',
 'ABERCROMBIE & FITCH',
 'ACADIA REALTY TRUST',
 'ACCENTURE PLC',
 'ADIENT PLC',
 'ADVANCED MICRO DEVICES',
 'AECOM',
 'AEROJET ROCKETDYNE HLDG',
 'AES CORP (THE)',
 'AFFILIATED COMPUTER SERVICES',
 'ALLEGION PLC',
 'ALLERGAN PLC',
 'ALLSCRIPTS HEALTHCARE SOLTNS',
 'AMCOR PLC',
 'AMERICAN AXLE & MFG HLDG',
 'AMERICAN CAMPUS COMMUNITIES',
 'AMERICAN EQTY INVT LIFE HLDG',
 'AMERICAN GREETINGS',
 'AMERICAN INTERNATIONAL GROUP',
 'AMERICAN MEDICAL SYSTMS HLDS',
 'AMERICAN PHYSICIANS CAPITAL',
 'AMERICAN PUBLIC EDUCATION',
 'AMERICAN SCIENCE ENGINEERING',
 'AMERIS BANCORP',
 'ANALOG DEVICES',
 'ANDEAVOR',
 'AON PLC',
 'APPLIED SIGNAL TECHNOLOGY',
 'APTIV PLC',
 'ARMADA HOFFLER PROPERTIES',
 'ARRIS INTERNATIONAL PLC',
 'ARROWHEAD PHARMACEUTICALS',
 'ASSOCIATED BANC-CORP',
 'ATWOOD OCEANICS',
 'AUTOMATIC DATA PROCESSING',
 'AVON PRODUCTS',
 'BANCORPSOUTH BANK',
 'BANK OZK',
 'BERKSHIRE HATHAWAY',
 'BIO REFERENCE LABS',
 'BOB EVANS FARMS',
 'BOS

In [4]:
# Get all text chunks after "peer" appears
def get_chunks(file_path):
    """Return the last text chunk following the word "peer" that looks like a peer list.

    The file is read, non-ASCII characters are replaced by a space, and runs
    of newlines / spaces are collapsed. Every occurrence of the lowercase
    string "peer" starts a candidate chunk that runs to 100 characters past
    the next occurrence, capped at 2000 characters (the final occurrence
    simply runs for up to 2000 characters). A candidate qualifies when it
    contains at least two "Inc." strings.

    Parameters
    ----------
    file_path : str
        Path of the text file to scan.

    Returns
    -------
    str
        The last qualifying chunk in the file, or "" when none qualifies.
    """
    # Undecodable bytes become U+FFFD, which the non-ASCII cleanup below turns
    # into a space like any other non-ASCII character.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
        text = f.read()
    # clean unicode in text
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    # clean spaces in text
    text = re.sub(r'(\n)+', r'\1', text)
    text = re.sub(r'( )+', r'\1', text)

    searchKey = 'peer'
    starts = [found.start() for found in re.finditer(searchKey, text)]
    # Markers counted inside a chunk; a peer list is assumed to contain at
    # least two of them.
    search = [r"Inc\."]     # , r"Corp\.", r"Ltd\.", "Johnson & Johnson"

    sub_text = ""
    for idx, start in enumerate(starts):
        if idx + 1 < len(starts):
            end = min(starts[idx + 1] + 100, start + 2000)
        else:
            # Last occurrence: there is no following "peer", so only the
            # 2000-character cap applies.
            end = start + 2000
        chunk = text[start:end]
        count = sum(len(re.findall(key, chunk)) for key in search)
        if count >= 2:
            sub_text = chunk
    return sub_text

In [5]:
# Get list of peer groups from chunks
def clean_chunk(text):
    """Normalize a raw text chunk so company names can be pattern-matched.

    Steps, in order: collapse whitespace, drop stand-alone numbers, replace
    every character other than letters, digits, "*", "&" and "." with a
    space, turn a period followed by whitespace into a single space, split
    CamelCase joins, and finally drop a trailing "*" footnote when it
    contains "no longer part of our peer group".

    Parameters
    ----------
    text : str
        Raw chunk of text.

    Returns
    -------
    str
        The cleaned text.
    """
    # Get rid of spaces
    text = " ".join(text.split())
    # Get rid of numbers but not numbers in words
    text = re.sub(r'\b[0-9]+\b\s*', '', text)
    # Get rid of special characters except *, & and .
    text = re.sub(r"[^a-zA-Z0-9\*\&\.]+", ' ', text)
    # Drop a period that ends a word but keep the separation between the two
    # words: deleting the whitespace as well would glue "Inc. and" into
    # "Incand", which no longer reads as a company suffix.
    text = re.sub(r"\.\s", " ", text)
    # Handle Camelcases
    text = " ".join(re.split(r'(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])', text))
    # Get rid of comments (unrelated company names in comments)
    # (ASSUME ALL COMMENTS ARE AT THE END AND AFTER "*")
    # (ASSUME THERE ARE MORE THAN ONE "*")
    # (Check if sentence like "no longer part of our peer group" )
    cut = text.rfind("*")
    if cut != -1 and "no longer part of our peer group" in text[cut:]:
        text = text[:cut]
    return text

In [6]:
# Find company names
def NameFind(text):
    """Extract company-like names that end in a corporate suffix.

    A name starts with a capitalized token, may continue with further
    capitalized or all-caps words, and ends with one of Inc, Corp, Co, Ltd,
    Corporation, Company, Incorporated or Incorporation.

    Returns the distinct matches as a list, in order of first appearance.
    """
    company_re = re.compile(r'''((?:[A-Z][A-Z0-9a-z\&\-\.]+(?:\s\&)*)(?=\s[A-Z])(?:\s[A-Z][a-z]+|\s[A-Z]+)*?
           \s(?:Inc|Corp|Co|Ltd|Corporation|Company|Incorporated|Incorporation)\b)''', re.VERBOSE)
    # dict keys keep insertion order, which removes repeats without reordering
    first_seen = dict.fromkeys(company_re.findall(text))
    return list(first_seen)

def RemoveName(ls, text):
    """Return `text` with every occurrence of each string in `ls` deleted.

    The strings are removed one after another, in list order.
    """
    remaining = text
    for found_name in ls:
        remaining = remaining.replace(found_name, "")
    return remaining

In [7]:
def Stand(text):
    """Abbreviate long-form corporate words in `text`.

    Whole-word, case-sensitive replacements applied in this order:
    Incorporated/Incorporation -> Inc, Company -> Co, Corporation -> Corp,
    Holdings/Holding -> HLDG.
    """
    abbreviations = (
        (r'\bIncorporated\b', 'Inc'),
        (r'\bIncorporation\b', 'Inc'),
        (r'\bCompany\b', 'Co'),
        (r'\bCorporation\b', 'Corp'),
        (r'\bHoldings\b', 'HLDG'),
        (r'\bHolding\b', 'HLDG'),
    )
    for long_form, short_form in abbreviations:
        text = re.sub(long_form, short_form, text)
    return text

In [8]:
# Find additional S&P companies
def GetPeerList(text):
    """Build the list of peer names found in a cleaned text chunk.

    First the suffix-based names returned by NameFind are collected and
    removed from the text. The remainder is then searched for the names in
    SP_stand (substring search) and SP_special (fuzzy token-set match with a
    score above 90).

    Parameters
    ----------
    text : str
        Cleaned chunk of text.

    Returns
    -------
    list of str
        NameFind matches followed by the matched SP names, without repeats.
    """
    peerList = NameFind(text)
    newText = RemoveName(peerList, text)
    newTextUp = newText.upper()
    # Stand() matches Title-case words ("Holdings", "Company", ...), so it has
    # to run BEFORE upper-casing; on upper-case text it would change nothing.
    # The value does not depend on the candidate name, hence computed once.
    standTextUp = Stand(newText).upper()
    for name in SP_stand:
        # Search the plain and the standardized text: the SP names carry the
        # HLDG abbreviation, which only the standardized text contains.
        if name not in peerList and (name in newTextUp or name in standTextUp):
            peerList.append(name)
    for name in SP_special:
        if name not in peerList and fuzz.token_set_ratio(standTextUp, name) > 90:
            peerList.append(name)
    return peerList

In [9]:
def get_name_time(text):
    """Split a proxy file path into company name and filing date.

    The last path component is expected to look like
    "<name>_def14a_<YYYY-MM-DD>..."; both "\\" and "/" are accepted as the
    separator in front of it.

    Parameters
    ----------
    text : str
        Path of the def14a file, including at least one directory separator.

    Returns
    -------
    tuple of (str, str)
        The company name and the date string.

    Raises
    ------
    ValueError
        If the path does not follow the expected naming pattern.
    """
    found = re.match(r'.*[\\/](.*)_def14a_(\d{4}-\d{2}-\d{2})', text)
    if found is None:
        raise ValueError(
            "path does not match '<name>_def14a_<YYYY-MM-DD>': {!r}".format(text))
    return found.group(1), found.group(2)

In [24]:
def simpleName(Uppertext):
    """Remove the corporate suffix words INC, CORP, CO, LTD and PLC.

    Only whole words are removed (together with the whitespace in front of
    them); the same letters inside a longer word, e.g. the "CO" in "COSTCO",
    are left alone.

    Parameters
    ----------
    Uppertext : str
        Upper-case company name.

    Returns
    -------
    str
        The name without the suffix words.
    """
    # The alternatives are grouped so that the leading (^|\s) and the trailing
    # \b apply to every suffix, not just to the first and last one.
    Uppertext = re.sub(r'(?:^|\s)(?:INC|CORP|CO|LTD|PLC)\b', '', Uppertext)
    return Uppertext

In [40]:
def get_FineScore(Name1, Name2):
    """Score the similarity of two company names with fuzz.ratio.

    Each name is upper-cased, passed through simpleName and stripped of
    spaces before the comparison.
    """
    def _squash(name):
        # upper-case -> simpleName -> no spaces
        return simpleName(name.upper()).replace(" ", "")

    return fuzz.ratio(_squash(Name1), _squash(Name2))

In [66]:
def get_match(company):
    """Match a peer name against SP_names and look up the cik.

    An exact member of SP_names is returned with score 100. Otherwise every
    SP name is scored with fuzz.token_set_ratio and the top scorers are kept:

    * one top scorer: its score is used when it is at least 90, else the
      get_FineScore of the pair;
    * several top scorers at 100: all of them are returned;
    * several top scorers below 100: they are re-ranked with get_FineScore
      and the best one (or all that tie) is returned.

    Parameters
    ----------
    company : str
        Peer name to look up.

    Returns
    -------
    list
        [match, score, cik]. `match` and `cik` are single values for a unique
        match and parallel lists when several names tie. For an exact match,
        `match` is `company` itself.
    """
    # NOTE(review): the exact lookup is case-sensitive; confirm whether names
    # in another letter case are meant to go through the fuzzy path.
    if company in SP_names:
        return [company, 100, kic.get(company)]

    scores = {name: fuzz.token_set_ratio(company, name) for name in SP_names}
    # Computed once; evaluating max() inside the comprehension would rescan
    # all scores for every single name.
    best = max(scores.values())
    match_ls = [name for name, value in scores.items() if value == best]

    if len(match_ls) == 1:
        match = match_ls[0]
        score = best if best >= 90 else get_FineScore(company, match)
        return [match, score, kic.get(match)]

    if best == 100:
        # Several perfect token-set matches: report them all
        return [match_ls, best, [kic.get(name) for name in match_ls]]

    # Several imperfect ties: break them with the finer score
    scores1 = {name: get_FineScore(company, name) for name in match_ls}
    best1 = max(scores1.values())
    match_ls1 = [name for name, value in scores1.items() if value == best1]
    if len(match_ls1) == 1:
        match = match_ls1[0]
        return [match, best1, kic.get(match)]
    return [match_ls1, best1, [kic.get(name) for name in match_ls1]]

In [111]:
# Loop through files in the folder

# Folder holding one sub-folder per company, each containing its def14a files
PROXY_DIR = 'C:\\Users\\yuanx\\Desktop\\Summer Research\\Proxy'

# Paths are built with os.path.join and folders are listed by path, so the
# working directory is no longer changed while looping.
comp_ls = os.listdir(PROXY_DIR)
# NOTE(review): only the first company folder is processed here; drop the
# slice to run over every company.
comp_ls = comp_ls[:1]

file_chunks = []   # one dict per (file, peer) pair, written to csv later
for comp in comp_ls:
    comp_path = os.path.join(PROXY_DIR, comp)
    for file in os.listdir(comp_path):
        file_path = os.path.join(comp_path, file)
        chunk_text = get_chunks(file_path)
        search_text = clean_chunk(chunk_text)
        peerList = GetPeerList(search_text)
        co, dt = get_name_time(file_path)
        kicNum = kic.get(co)
        totalPeers = len(peerList)
        if not peerList:
            # Keep a placeholder row so files without peers still show up
            file_chunks.append({'date': dt, 'cik': kicNum, 'conm': co, 'peer_group(proxy)': '',
                                'peer_group(dic)': '', 'proxy_cik': '', 'match': '',
                                '#peers': totalPeers})
        for peerName in peerList:
            peerName = Stand(peerName)
            SP_match, score, match_kic = get_match(peerName)
            file_chunks.append({'date': dt, 'cik': kicNum, 'conm': co, 'peer_group(proxy)': peerName,
                                'peer_group(dic)': SP_match, 'proxy_cik': match_kic, 'match': score,
                                '#peers': totalPeers})

In [112]:
# Write to csv file
fields = ['date', 'cik', 'conm', 'peer_group(proxy)', 'peer_group(dic)', 'proxy_cik', 'match', '#peers']
output_path = os.path.join("C:\\Users\\yuanx\\Desktop\\Summer Research", '1.csv')
# newline='' hands line-ending control to the csv writer, so the '\n'
# terminator requested below is written as-is instead of being translated by
# the text layer.
with open(output_path, 'w', newline='') as csv_file:
    writer = csv.DictWriter(csv_file, lineterminator='\n', fieldnames=fields)
    writer.writeheader()
    writer.writerows(file_chunks)