In [341]:
import pickle as pkl
import pandas as pd
import matplotlib
import os
import re

import collections
import datetime
import time

import geopandas as gpd

import numpy as np

import rasterio

import spacy
  
nlp = spacy.load('en_core_web_lg')
from sklearn.metrics.pairwise import cosine_similarity

import itertools

# Get Full IG Dataset

In [3]:
file = "../../data/companyData/infogroup2010s.csv"

In [4]:
import dask.dataframe as dd

df = dd.read_csv(file, assume_missing=True, 
                 dtype={'parent_number': 'object','parent_employee_size_code': 'object',
                       'parent_sales_volume_code': 'object',
                       'abi': 'object','zipcode': 'object'}, low_memory = False)
df = df[df.business_status_code == 1.0]


  import pandas.util.testing as tm


In [20]:
abiRents = df[['abi','parent_number']].drop_duplicates().compute(num_workers = 25)

In [22]:
abiRents.to_csv("../../data/abiRents.csv")

In [None]:
differentParents.to_csv("../../data/differentParents2010s.csv")

In [None]:
hq = df[['abi','ticker','company','archive_version_year','state','city',
         'address_line_1','zipcode',
         'latitude','longitude']].drop_duplicates().compute(num_workers = 100)

In [None]:
hqsOnly = hq[['abi','company']].drop_duplicates()

In [None]:
hqsOnly.company.value_counts()

In [None]:
print(hq.shape,hqsOnly.shape)

Some of the abi numbers seem to be duplicated; it looks like they might be primarily for different government agencies.

In [None]:
hqsOnly.company.value_counts()[hqsOnly.company.value_counts() > 10].index

In [None]:
toDiscard = hqsOnly.company.value_counts()[hqsOnly.company.value_counts() > 1].index
for company in toDiscard:
    print(company)


In [None]:
toDiscard

In [None]:
hqsOnly = hqsOnly[~hqsOnly.company.isin(toDiscard)]
hq      = hq[~hq.company.isin(toDiscard)]

In [None]:
hq.shape

At this point, we have a unique record of every company - hq here. Some of these may well be duplicate entries for a given company, for the cases in which we have a company that has multiple hq.

Let's stash it so that we don't have to go through the above ^^ again.

In [None]:
hqsOnly.to_csv("../../data/ig2010s_uniqueHQs.csv")

In [None]:
hq.to_csv("../../data/ig2010s_uniqueHQs_multLocations.csv")

## Grab Compustat Data

First filter down to the companies for whom we have the supply chain information.

In [342]:
c_links = pd.read_csv("../../data/companyData/compustatSCLinked.csv")

c_links['year'] = c_links.srcdate.astype('str').str.slice(0,4).astype('int64')

c_links = c_links[c_links.year > 2009]

relevant_gvkeys = c_links.gvkey.append(c_links.cgvkey).drop_duplicates()

print(c_links.head(),relevant_gvkeys.shape)

     gvkey                        conm  cgvkey                       cconm  \
80    1013  ADC TELECOMMUNICATIONS INC    9899                    AT&T INC   
81    1013  ADC TELECOMMUNICATIONS INC    2136  VERIZON COMMUNICATIONS INC   
281   1094                  ACETO CORP   31673      AMERISOURCEBERGEN CORP   
282   1094                  ACETO CORP   31673      AMERISOURCEBERGEN CORP   
283   1094                  ACETO CORP    7171               MCKESSON CORP   

                       cnms   srcdate  cid  sid    ctype   salecs     scusip  \
80                     AT&T  20100930   16    0  COMPANY  300.000  000886309   
81   VERIZON COMMUNICATIONS  20100930   13    0  COMPANY  146.000  000886309   
281  AmerisourceBergen Corp  20160630   13    0  COMPANY   78.193  004446100   
282  AmerisourceBergen Corp  20170630   13    0  COMPANY   76.598  004446100   
283           McKesson Corp  20170630   19    0  COMPANY   70.215  004446100   

       stic     ccusip ctic  year  
80   ADCT.1  0

In [343]:
c_links.gvkey.unique().shape[0]

2479

Get the company dataset and check.

The legal name and the given name are slightly different, but basically the same modulo punctuation and case.

In [363]:
c_addresses = pd.read_csv("../../data/companyData/compustatAddresses.csv", 
                dtype={'parent_number': 'object'})[['fyear',
                'gvkey',
                'conm',
                'add1',
                'city',
                'state',
                'idbflag',
                'addzip',
               'naics']].drop_duplicates().rename(columns = {'fyear': 'year'})
c_addresses = c_addresses[(c_addresses.year > 2009) & (c_addresses.year < 2020) & \
                          c_addresses.gvkey.isin(relevant_gvkeys)]

  exec(code_obj, self.user_global_ns, self.user_ns)


In [364]:
c_addresses.year.value_counts()

2012.0    3042
2013.0    3020
2011.0    3018
2014.0    2968
2010.0    2927
2015.0    2849
2016.0    2766
2017.0    2662
2018.0    2522
2019.0    2391
Name: year, dtype: int64

We can later, potentially, subset this to focus on firms in: ag, mining, construction, manufacturing, wholesale and retail, and transportation.

In [365]:
'''c_addresses = c_addresses[(c_addresses.naics.astype('str').str.slice(0,2).isin(['11','21','22','23','31','32',
                                                         '33','42','44','45','48','49']))]
'''
chq = c_addresses[['gvkey','conm','add1','city','state','addzip','idbflag']].drop_duplicates()




We're starting with the compustat north america dataset. Not all of the HQs are in North America, so we can filter some of the information down to match with Infogroup.

In [366]:
chq.idbflag.value_counts()

D    2832
B     752
Name: idbflag, dtype: int64

In [367]:
canadian = ['ON', 'AB','QC', 'BC', 'NS', 'NF', 'SK', 'MB', 'NB']

chq.state.unique()

array(['MN', 'TX', 'AZ', 'UT', 'IL', 'NY', 'CA', nan, 'FL', 'CT', 'NC',
       'WA', 'ON', 'ID', 'OH', 'GA', 'PA', 'MA', 'OK', 'VT', 'CO', 'DE',
       'NJ', 'IN', 'MD', 'PR', 'HI', 'QC', 'WI', 'NE', 'BC', 'MO', 'VA',
       'AB', 'RI', 'LA', 'MI', 'TN', 'DC', 'KY', 'AR', 'OR', 'AL', 'IA',
       'KS', 'NV', 'MS', 'NH', 'SD', 'SC', 'MT', 'ME', 'NF', 'NS', 'VI',
       'NM', 'WY', 'SK', 'ND', 'WV', 'MB'], dtype=object)

In [368]:
chq = chq[~(chq.state.isin(canadian)) & ~chq.state.isna()]

In [369]:
chq.addzip.str.len().value_counts()

5     2651
10     329
Name: addzip, dtype: int64

In [370]:
chq['addzip'] = chq.addzip.astype('str').str.slice(0,5)

In [371]:
chq[chq.idbflag == 'D'].addzip.str.len().value_counts()

5    2589
Name: addzip, dtype: int64

In [372]:
chq[chq.idbflag == "B"].addzip.value_counts()

77002    12
10036     5
95054     5
10017     5
80202     5
         ..
01864     1
78259     1
75243     1
02903     1
63105     1
Name: addzip, Length: 299, dtype: int64

In [373]:
print(chq.head(),chq.shape)

    gvkey                         conm  \
12   1013   ADC TELECOMMUNICATIONS INC   
13   1045  AMERICAN AIRLINES GROUP INC   
68   1075   PINNACLE WEST CAPITAL CORP   
79   1076            PROG HOLDINGS INC   
91   1078          ABBOTT LABORATORIES   

                                      add1           city state addzip idbflag  
12                  13625 Technology Drive   Eden Prairie    MN  55344       D  
13                         1 Skyview Drive     Fort Worth    TX  76155       B  
68  400 North Fifth Street, P.O. Box 53999        Phoenix    AZ  85072       D  
79                     256 West Data Drive         Draper    UT  84020       D  
91       100 Abbott Park Road, Abbott Park  North Chicago    IL  60064       B   (2980, 7)


In [374]:
chq.rename(columns = {'conm': 'company','addzip': 'cstatZipcode'},inplace = True)
chq.head()

Unnamed: 0,gvkey,company,add1,city,state,cstatZipcode,idbflag
12,1013,ADC TELECOMMUNICATIONS INC,13625 Technology Drive,Eden Prairie,MN,55344,D
13,1045,AMERICAN AIRLINES GROUP INC,1 Skyview Drive,Fort Worth,TX,76155,B
68,1075,PINNACLE WEST CAPITAL CORP,"400 North Fifth Street, P.O. Box 53999",Phoenix,AZ,85072,D
79,1076,PROG HOLDINGS INC,256 West Data Drive,Draper,UT,84020,D
91,1078,ABBOTT LABORATORIES,"100 Abbott Park Road, Abbott Park",North Chicago,IL,60064,B


In [375]:
chq = chq[chq.gvkey.isin(relevant_gvkeys)]

In [376]:
chq.shape

(2980, 7)

In [377]:
chq.to_csv("../../data/chq.csv")

# Breakpoint
We can start from here and just clean everything from here.


Headquarters:

In [61]:
chq     = pd.read_csv("../../data/chq.csv",dtype={'cstatZipcode': 'object'}).drop(columns = {'Unnamed: 0'})

chq.head()

Unnamed: 0,gvkey,company,add1,city,state,cstatZipcode,idbflag
0,1013,ADC TELECOMMUNICATIONS INC,13625 Technology Drive,Eden Prairie,MN,55344,D
1,1045,AMERICAN AIRLINES GROUP INC,1 Skyview Drive,Fort Worth,TX,76155,B
2,1075,PINNACLE WEST CAPITAL CORP,"400 North Fifth Street, P.O. Box 53999",Phoenix,AZ,85072,D
3,1078,ABBOTT LABORATORIES,"100 Abbott Park Road, Abbott Park",North Chicago,IL,60064,B
4,1094,ACETO CORP,4 Tri Harbor Court,Port Washington,NY,11050,D


In [378]:
hqsOnly     = pd.read_csv("../../data/ig2010s_uniqueHQs.csv").drop(columns = {'Unnamed: 0'})
hqsWithYear = pd.read_csv("../../data/ig2010s_uniqueHQs_multLocations.csv",dtype={'zipcode': 'object'})[['abi','company',
                                                                             'archive_version_year',
                                                                             'state','city','zipcode','address_line_1']]

print(hqsOnly.head())
print(hqsWithYear.head())

hqsWithYear = hqsWithYear[hqsWithYear.archive_version_year <= 2020]

hqsWithYear['last_year'] = hqsWithYear.groupby(['abi'])['archive_version_year'].transform(max)

print(hqsWithYear.shape)

lastHQs = hqsWithYear[hqsWithYear.archive_version_year == hqsWithYear.last_year][['abi','company','state','city','zipcode','address_line_1']]

print(lastHQs.shape)

lastHQs.head()

     abi                         company
0   7609            SOLITRON DEVICES INC
1  21311   WESTERN STATES ENVELOPE & LBL
2  29603                THIELE KAOLIN CO
3  71340                 TRI STAFF GROUP
4  77743  NATIONAL TECHNICAL SYSTEMS INC
     abi                         company  archive_version_year state  \
0   7609            SOLITRON DEVICES INC                2010.0    FL   
1  21311   WESTERN STATES ENVELOPE & LBL                2010.0    WI   
2  29603                THIELE KAOLIN CO                2010.0    GA   
3  71340                 TRI STAFF GROUP                2010.0    CA   
4  77743  NATIONAL TECHNICAL SYSTEMS INC                2010.0    CA   

              city zipcode            address_line_1  
0  WEST PALM BEACH   33407  3301 ELECTRONICS WAY # C  
1           BUTLER   53007           4480 N 132ND ST  
2     SANDERSVILLE   31082             520 KAOLIN RD  
3        SAN DIEGO   92122   6336 GREENWICH DR # 100  
4        CALABASAS   91302  24007 VENTURA BLVD

Unnamed: 0,abi,company,state,city,zipcode,address_line_1
17,158329,CARIDIAN BCT INC,CO,LAKEWOOD,80401,14143 DENVER WEST PKWY # 200
20,211946,FAMILY BRANDS INTL LLC,TN,LENOIR CITY,37771,1001 ELM HILL RD
23,258574,JADE CORP,PA,HUNTINGDON VLY,19006,3063 PHILMONT AVE
24,262493,MONTANA METAL PRODUCTS LLC,IL,DES PLAINES,60018,25 HOWARD AVE
43,455154,O I CORP,TX,COLLEGE STATION,77845,151 GRAHAM RD


In [379]:
lastHQs.to_csv("../../data/companyData/lastHQs.csv")

Only two of these company names appear 2x, which is good. There are ~20,000 companies in this sample.

Let's go through a little bit of a process here:
- Find the exact matches.
- Get a similarity measure between ; ideally something vectorized / something in matrix math.
- Find the top 10 matches for the remaining ones.
- Do some mix and match and see if there's any threshold at which matches become similar ``enough'' to say this is okay and good to go.


We might be able to use the fact that all of the addresses should be the same after some given point, as the compustat addresses are only the most recent ones. 

Let's try a few different ways to match these up.

First, let's find the exact matches.

Make a generic cleaning function that strips out all company names, any punctuation in the name, and makes everything lower case.

In [3]:
def cleanText(text):
    text = text.strip().\
    sub(" CORP","").replace(" CO","").replace(" INC","").\
    replace(" LTD","").replace(" -CL A","").\
    replace(" -LP","").replace(" LP","").\
    replace("-OLD","").replace(" LLC","").\
    replace(", LLC.","").replace(" L.L.C.","").replace(" L.P.","").\
    replace(" L.TD","").replace(" L.L.C.","").replace(" -CL B","").\
    replace(" -CL B","").replace(" -CL i","").replace(" -CL","").\
    replace("-REDH","").replace(" CP","").\
    replace("-ADR","").replace(" PLC","").lower().replace(r'[^\w\s]+', '').\
    replace('-lp','').replace('-spn','').replace('hldg','').replace(' intl','').\
    replace('holdings','').replace('holding','').replace('prtnr','').replace('group','').\
    replace(" med ", " medical ").replace(" tradng ", " trading ").replace("gen ", "general ").\
    replace(" mtr ", " motors ").replace(" motor ", " motors ").replace("-", " ").\
    replace("/", " ").replace("'", " ").replace("&", " ").replace(" a g ", " ").\
    replace(" ag ", " ").replace("  adr ", " ").replace(" adr ", " ").replace("  cp ", " ").\
    replace(" cp ", " ").replace(" plc ", " ").replace(" intl ", " ").replace(" ent ", " ").\
    replace(" nv ", " ").replace(" n.v. ", " ").replace(" worldwide ", " ").\
    replace(" wldwide ", " ").replace(" banc"," bank").replace("^banc","bank")
    
    return text

In [2]:
chq['company']               = list(map(cleanText, chq.company))
lastHQs['company']           = list(map(cleanText, lastHQs.company))

chq.rename(columns = {'city': 'cstatCity',
                     'state': 'cstatState',
                     'add1': 'cstatadd1'}, inplace = True)

chq['cstatCity']  = chq.cstatCity.str.lower()
chq['cstatState'] = chq.cstatState.str.lower()
chq['cstatadd1']  = chq.cstatadd1.str.lower()

lastHQs['city']            = lastHQs.city.str.lower()
lastHQs['state']           = lastHQs.state.str.lower()
lastHQs['address_line_1']  = lastHQs.address_line_1.str.lower()

NameError: name 'chq' is not defined

NAICS names do not match up between compustat and infogroup so they're not helpful.

In [382]:
chq.head()

Unnamed: 0,gvkey,company,cstatadd1,cstatCity,cstatState,cstatZipcode,idbflag
12,1013,adc telecommunications,13625 technology drive,eden prairie,mn,55344,D
13,1045,american airlines,1 skyview drive,fort worth,tx,76155,B
68,1075,pinnacle west capital,"400 north fifth street, p.o. box 53999",phoenix,az,85072,D
79,1076,prog,256 west data drive,draper,ut,84020,D
91,1078,abbott laboratories,"100 abbott park road, abbott park",north chicago,il,60064,B


In [383]:
lastHQs.head()

Unnamed: 0,abi,company,state,city,zipcode,address_line_1
17,158329,caridian bct,co,lakewood,80401,14143 denver west pkwy # 200
20,211946,family brands,tn,lenoir city,37771,1001 elm hill rd
23,258574,jade,pa,huntingdon vly,19006,3063 philmont ave
24,262493,montana metal products,il,des plaines,60018,25 howard ave
43,455154,o i,tx,college station,77845,151 graham rd


## Match on company name directly

In [384]:
nameMerge = chq.merge(lastHQs)
nameMerge.shape

(2243, 12)

In [385]:
nameMerge.head()

Unnamed: 0,gvkey,company,cstatadd1,cstatCity,cstatState,cstatZipcode,idbflag,abi,state,city,zipcode,address_line_1
0,1013,adc telecommunications,13625 technology drive,eden prairie,mn,55344,D,7523129,mn,eden prairie,55344,13625 technology dr
1,1045,american airlines,1 skyview drive,fort worth,tx,76155,B,7501711,tx,fort worth,76155,1 skyview dr
2,1075,pinnacle west capital,"400 north fifth street, p.o. box 53999",phoenix,az,85072,D,4554051,az,phoenix,85004,400 n 5th st frnt
3,1078,abbott laboratories,"100 abbott park road, abbott park",north chicago,il,60064,B,4352373,il,abbott park,60064,100 abbott park rd
4,1094,aceto,4 tri harbor court,port washington,ny,11050,D,433140944,tx,houston,77010,1221 mckinney st # 3275


In [387]:
nameMerge.to_csv("../../data/companyData/nameMerge.csv")

In [388]:
sum(nameMerge.cstatState == nameMerge.state)/nameMerge.shape[0]

0.9549710209540794

In [389]:
sum(nameMerge.cstatZipcode.str.slice(0,5) == nameMerge.zipcode.str.slice(0,5))/nameMerge.shape[0]

0.8818546589389211

In [390]:
sum(nameMerge.cstatZipcode.str.slice(0,1) == nameMerge.zipcode.str.slice(0,1))/nameMerge.shape[0]

0.9621043245653144

In [391]:
nameMerge[nameMerge.cstatCity != nameMerge.city][50:100]

Unnamed: 0,gvkey,company,cstatadd1,cstatCity,cstatState,cstatZipcode,idbflag,abi,state,city,zipcode,address_line_1
438,9922,sparton,5612 johnson lake road,de leon springs,fl,32130,D,8383333,il,schaumburg,60173,425 n martingale rd # 1000
442,10115,stryker,2825 airview boulevard,kalamazoo,mi,49002,B,143964906,mi,portage,49002,2825 airview blvd
444,10156,sunoco,"1735 market street, suite ll",philadelphia,pa,19103,D,464706795,tx,dallas,75225,8111 westchester dr # 600
445,12892,sunoco,"8111 westchester drive, suite 400",dallas,tx,75225,D,4556221,pa,philadelphia,19103,1818 market st # 1500
450,10195,superior industries,"26600 telegraph road, suite 400",southfield,mi,48033,D,413582201,mn,morris,56267,315 state highway 28
485,10860,ameren,1901 chouteau avenue,saint louis,mo,63103,B,967282336,mo,st louis,63103,1901 chouteau ave
488,10884,chiquita brands,550 south caldwell street,charlotte,nc,28202,D,200053361,fl,dania beach,33004,1855 griffin rd # c436
490,10903,unitedhealth,"unitedhealth group center, 9900 bren road east",minnetonka,mn,55343,B,4373189,mn,hopkins,55343,9900 bren rd e # 300w
494,10984,sprint,6200 sprint parkway,overland park,ks,66251,B,400038995,tx,richmond,77469,2141 preston st
496,11060,vf,1551 wewatta street,denver,co,80202,D,7538861,co,greenwood vlg,80111,8505 e orchard rd


Now focus down onto the companies that have not been matched.

In [392]:
chqUnmatched = chq[~chq.company.isin(nameMerge.company)].reset_index()
chqUnmatched.shape

(769, 8)

In [393]:
igUnmatched  = lastHQs[~lastHQs.company.isin(nameMerge.company)].reset_index()
igUnmatched.shape

(57570, 7)

In [394]:
lastHQs.shape

(59805, 6)

# Find Distance

Two distance measures here. Look at top 5 matches and pull the distance measure and matches as well.

### Levenshtein

In [395]:
from Levenshtein import distance as levenshtein_distance

Find LD between the unmatched compustat companies and the unmatched IG ones. 

In [396]:
companyArrayCStat = []

company = chqUnmatched.company[0]
start = time.time()
for company in chqUnmatched.company:
    thisCompany = []
    for ig in igUnmatched.company:
        thisCompany.append(levenshtein_distance(company,ig))
    
    companyArrayCStat.append([thisCompany])

allLD = np.concatenate(companyArrayCStat)


In [397]:
igUnmatched.head()

Unnamed: 0,index,abi,company,state,city,zipcode,address_line_1
0,17,158329,caridian bct,co,lakewood,80401,14143 denver west pkwy # 200
1,20,211946,family brands,tn,lenoir city,37771,1001 elm hill rd
2,23,258574,jade,pa,huntingdon vly,19006,3063 philmont ave
3,24,262493,montana metal products,il,des plaines,60018,25 howard ave
4,43,455154,o i,tx,college station,77845,151 graham rd


Now pull the closest companies in IG to those in CStat. We'll first populate a dataframe with the name, address, city, state, and zip of each unmatched company in compustat, then we'll use the LD to find the same information for the closest company in IG.



There's some legacy code in here that finds the top 5 closest companies; but it doesn't populate the dataframe.

In [398]:
singleLargestLV   = (allLD).argsort(axis=-1)[:, :1]

# legacy
# n = 5
# largestElementsLV = (allLD).argsort(axis=-1)[:, :n]


companyMatches = pd.DataFrame()
companyMatches['cstatCompanies'] = chqUnmatched.company
companyMatches['cstatadd1']      = chqUnmatched.cstatadd1
companyMatches['cstatCity']      = chqUnmatched.cstatCity
companyMatches['cstatState']     = chqUnmatched.cstatState
companyMatches['cstatZip']       = chqUnmatched.cstatZipcode

for i in range(0,companyMatches.shape[0]):
    # print(list(np.array(hq.company)[largestElements[i]]))
    companyMatches.at[i,'misspelling']         = allLD[i,:][singleLargestLV[i]]
    companyMatches.at[i,'percMisspelled']      = companyMatches.misspelling[i]/len(companyMatches.cstatCompanies[i])
    companyMatches.at[i,'levCompany']          = igUnmatched.company[singleLargestLV[i]].iloc[0]
    
    companyMatches.at[i,'closestMatchIG_add']      = igUnmatched.address_line_1[singleLargestLV[i]].iloc[0]
    companyMatches.at[i,'closestMatchIG_city']     = igUnmatched.city[singleLargestLV[i]].iloc[0]
    companyMatches.at[i,'closestMatchIG_zipcode']  = igUnmatched.zipcode[singleLargestLV[i]].iloc[0]
    companyMatches.at[i,'closestMatchIG_state']    = igUnmatched.state[singleLargestLV[i]].iloc[0]

    
    # companyMatches.at[i,'closestMatchIG']      = np.array(igUnmatched.company)[largestElementsLV[i]]
    # companyMatches.at[i,'LevSim']              = np.array(allLD[i,:][largestElementsLV[i]], dtype=object)

In [399]:
companyMatches.head()

Unnamed: 0,cstatCompanies,cstatadd1,cstatCity,cstatState,cstatZip,misspelling,percMisspelled,levCompany,closestMatchIG_add,closestMatchIG_city,closestMatchIG_zipcode,closestMatchIG_state
0,prog,256 west data drive,draper,ut,84020,1.0,0.2,prg,856 us highway 206 # b12,hillsborough,8844,nj
1,beam,222 west merchandise mart plaza,chicago,il,60654,1.0,0.25,team,13131 dairy ashford rd # 600,sugar land,77478,tx
2,american greetings,one american road,cleveland,oh,44144,1.0,0.052632,american greetings,1 american way,westlake,44145,oh
3,american software,"470 east paces ferry road, n.e.",atlanta,ga,30305,1.0,0.055556,american software,470 e paces ferry rd ne,atlanta,30305,ga
4,apa,"2000 post oak boulevard, suite 100",houston,tx,77056,1.0,0.333333,spa,2701 park dr,cleveland,44120,oh


Now get the embeddings and the cosine similarity between them.

In [400]:
def getMatrix(companyEmbeddings):
    companyArray = []
    
    for companies in companyEmbeddings:
        companyArray.append([companies.vector])
    
    companyArray = np.concatenate(companyArray)
    
    return(companyArray)
        

In [401]:
chqUnmatchedList = list(map(nlp, chqUnmatched.company))
allCompaniesIG   = list(map(nlp, igUnmatched.company))


cstat = getMatrix(chqUnmatchedList)
ig    = getMatrix(allCompaniesIG)

allSimilarities = cosine_similarity(cstat,ig)

allSimilarities.shape

(769, 57570)

In [402]:
outfile =  '../../data/allCompaniesIG_embeddings.pkl'
with open(outfile, 'wb') as pickle_file:
    pkl.dump(ig, pickle_file)
    
outfile =  '../../data/allCompaniesCStat_embeddings.pkl'
with open(outfile, 'wb') as pickle_file:
    pkl.dump(cstat, pickle_file)    

Each row n here has the similarity between the nth company name in compustat and the IG company corresp to that column.

In [403]:
allSimilarities[0:5,:]

array([[ 0.01691091, -0.01222062,  0.06702404, ...,  0.10824974,
         0.08654009,  0.1449641 ],
       [-0.02306337,  0.06756251,  0.13372983, ...,  0.29612526,
         0.07285859,  0.17976503],
       [ 0.04563743,  0.33906224,  0.24495935, ...,  0.3366109 ,
         0.37406623,  0.39949286],
       [-0.0358867 ,  0.3554296 ,  0.18157756, ...,  0.43370304,
         0.36172885,  0.3919765 ],
       [ 0.12749049,  0.01153258,  0.11958653, ...,  0.08740962,
         0.21394141,  0.13486943]], dtype=float32)

Find indices of companies in IG most similar to each company in CStat.

In [404]:
singleLargestCos   = (-allSimilarities).argsort(axis=-1)[:, :1]

# legacy - largest n
# largestElementsCos = (-allSimilarities).argsort(axis=-1)[:, :n]


Add the cosine similarity measures to the similarity dataset.

In [405]:
for i in range(0,companyMatches.shape[0]):
    # print(list(np.array(hq.company)[largestElements[i]]))
    companyMatches.at[i,'cosSimilarity']        = allSimilarities[i,:][singleLargestCos[i]]
    companyMatches.at[i,'cosSimilarityCompany'] = igUnmatched.company[singleLargestCos[i]].iloc[0]
    # companyMatches.at[i,'closestMatchCosine']   = np.array(igUnmatched.company)[largestElementsCos[i]]
    # companyMatches.at[i,'cosineSim']            = np.array(allSimilarities[i,:][largestElementsCos[i]], dtype=object)
    
    
    companyMatches.at[i,'costMatchIG_add']     = igUnmatched.address_line_1[singleLargestCos[i]].iloc[0]
    companyMatches.at[i,'cosMatchIG_city']     = igUnmatched.city[singleLargestCos[i]].iloc[0]
    companyMatches.at[i,'cosMatchIG_zipcode']  = igUnmatched.zipcode[singleLargestCos[i]].iloc[0]
    companyMatches.at[i,'cosMatchIG_state']    = igUnmatched.state[singleLargestCos[i]].iloc[0]
    

In [406]:
sum((companyMatches.levCompany == companyMatches.cosSimilarityCompany))

171

## Start Matching

### Take 1: Match + Zip or City

Now find the company matches: ABI - gvkey link.

Start with ones where the names both match.

If the cities or zipcodes match on one of the closest companies (LD or cos), it seems like it is good to go.


Do this in steps to start, at least. First find the companies where both match and either zip or city match. Then find companies where only one matches.

In [407]:
bothMatch_cityZip = companyMatches[(companyMatches.levCompany == companyMatches.cosSimilarityCompany) & \
              ((companyMatches.cstatCity == companyMatches.closestMatchIG_city) | \
              (companyMatches.cstatZip == companyMatches.closestMatchIG_zipcode))]
bothMatch_cityZip.shape

bothMatch_cityZip.to_csv("../../data/companyData/bothMatch_cityZip.csv")

In [408]:
bothMatch_cityZip['igCompanies'] = bothMatch_cityZip.levCompany
companiesToCheck                 = bothMatch_cityZip[['cstatCompanies','igCompanies',
                                    'cstatadd1','cstatCity','cstatZip',
                                    'closestMatchIG_add','closestMatchIG_city','closestMatchIG_zipcode']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [409]:
companyMatchesBoth = list(bothMatch_cityZip.cstatCompanies.unique())
len(companyMatchesBoth)

138

Grab the single company match versions.

In [410]:
oneMatch_cityZipOnly = companyMatches[-(companyMatches.cstatCompanies.isin(companyMatchesBoth)) & \
              (((companyMatches.cstatCity == companyMatches.closestMatchIG_city) | \
              (companyMatches.cstatZip == companyMatches.closestMatchIG_zipcode)) | \
              ((companyMatches.cstatCity == companyMatches.cosMatchIG_city) | \
              (companyMatches.cstatZip == companyMatches.cosMatchIG_zipcode)))].reset_index(drop=True)

oneMatch_cityZipOnly['igCompanies'] = ''

for i in range(0,oneMatch_cityZipOnly.shape[0]):
    if ((oneMatch_cityZipOnly.cstatCity[i] == oneMatch_cityZipOnly.closestMatchIG_city[i]) | \
              (oneMatch_cityZipOnly.cstatZip[i] == oneMatch_cityZipOnly.closestMatchIG_zipcode[i])):
        oneMatch_cityZipOnly.loc[i,'igCompanies'] = oneMatch_cityZipOnly.levCompany[i]
    else:
        oneMatch_cityZipOnly.loc[i,'igCompanies'] = oneMatch_cityZipOnly.cosSimilarityCompany[i]

# oneMatch_cityZipOnly.to_csv("../../data/companyData/oneMatch_cityZipOnly.csv")

In [411]:
companiesToCheck.shape

(138, 8)

In [412]:
oneMatch_cityZipOnly.head()

Unnamed: 0,cstatCompanies,cstatadd1,cstatCity,cstatState,cstatZip,misspelling,percMisspelled,levCompany,closestMatchIG_add,closestMatchIG_city,closestMatchIG_zipcode,closestMatchIG_state,cosSimilarity,cosSimilarityCompany,costMatchIG_add,cosMatchIG_city,cosMatchIG_zipcode,cosMatchIG_state,igCompanies
0,beam,222 west merchandise mart plaza,chicago,il,60654,1.0,0.25,team,13131 dairy ashford rd # 600,sugar land,77478,tx,1.0,beam suntory,222 merchandise mart plz #1600,chicago,60654,il,beam suntory
1,advanced oxygeneral technology,"653 vt route 12a, po box 189",randolph,vt,5060,6.0,0.2,advanced oxygeneral tech,653 vt route 12a,randolph,5060,vt,1.0,advanced technology,101 parker dr,andover,44003,oh,advanced oxygeneral tech
2,southern gas,"ten peachtree place, n.e.",atlanta,ga,30309,2.0,0.166667,southern its,9101 w sahara ave # 105,las vegas,89117,nv,0.92152,gas south,3625 cumberland blvd se # 1500,atlanta,30339,ga,gas south
3,brt apartments,"60 cutter mill road, suite 303",great neck,ny,11021,6.0,0.428571,pjt partners,280 park ave # 15w,new york,10017,ny,0.624561,brt realty trust,60 cuttermill rd # 303,great neck,11021,ny,brt realty trust
4,barry (r g),13405 yarmouth road n.w.,pickerington,oh,43147,6.0,0.545455,berryllege,2277 martha berry hwy nw,mt berry,30149,ga,0.835597,r g barry,13405 yarmouth dr,pickerington,43147,oh,r g barry


In [413]:
companiesToCheck = companiesToCheck.append(oneMatch_cityZipOnly[['cstatCompanies','igCompanies',
                                    'cstatadd1','cstatCity','cstatZip',
                                    'closestMatchIG_add','closestMatchIG_city','closestMatchIG_zipcode']]).\
                                    drop_duplicates()

companiesToCheck.shape

(372, 8)

In [414]:
companiesToCheck[0:50]

Unnamed: 0,cstatCompanies,igCompanies,cstatadd1,cstatCity,cstatZip,closestMatchIG_add,closestMatchIG_city,closestMatchIG_zipcode
3,american software,american software,"470 east paces ferry road, n.e.",atlanta,30305,470 e paces ferry rd ne,atlanta,30305
5,apco oil and gas,apco oil gas,"one williams center, 35th floor",tulsa,74172,1 one williams ctr # 35,tulsa,74172
9,constellation energy grp,constellation energy,100 constellation way,baltimore,21202,100 constellation way,baltimore,21202
23,central natural res,central natural resources,"1044 main street, suite 502",kansas city,64105,1044 main st # 502,kansas city,64105
32,cracker barrel old ctry stor,cracker barrel olduntry str,305 hartmann drive,lebanon,37087,305 hartman dr,lebanon,37087
37,dun bradstreet holdngs,dun bradstreet,101 jfk parkway,short hills,7078,103 john f kennedy pkwy,short hills,7078
39,electro scientific inds,electro scientific industries,13900 nw science park drive,portland,97229,13900 nw science park dr,portland,97229
42,federal mogul,federal mogul,27300 west 11 mile road,southfield,48034,26555 northwestern hwy,southfield,48033
43,federal national mortga assn,federal national mortgage assn,"midtown center, 1100 15th street, nw",washington,20005,1100 15th st nw,washington,20005
46,forest laboratories,forest laboratories,909 third avenue,new york,10022,909 3rd ave # 23,new york,10022


2x check that there are no duplicates here.

In [415]:
duplicates = companiesToCheck.cstatCompanies.value_counts().index[companiesToCheck.cstatCompanies.value_counts() > 1]

companiesToCheck[companiesToCheck.cstatCompanies.isin(duplicates)]

Unnamed: 0,cstatCompanies,igCompanies,cstatadd1,cstatCity,cstatZip,closestMatchIG_add,closestMatchIG_city,closestMatchIG_zipcode


In [416]:
chqStillUnmatched = chqUnmatched[-chqUnmatched.company.isin(companiesToCheck.cstatCompanies)].reset_index(drop=True)
igStillUnmatched  = igUnmatched[-igUnmatched.company.isin(companiesToCheck.igCompanies)].reset_index(drop=True)

# companyMatches['cstatCompanies'] = chqUnmatched.company
print(chqUnmatched.shape, chqStillUnmatched.shape, companiesToCheck.shape)
print(igStillUnmatched.shape,igUnmatched.shape)

(769, 8) (397, 8) (372, 8)
(57204, 7) (57570, 7)


In [417]:
companiesToCheck.to_csv("../../data/companyData/companiesToCheck_cityZip.csv")

## Take 2
Match remaining ones on first word of name

In [418]:
chqUnmatched.company[0].split(' ')[0]

'prog'

In [419]:
chqUnmatched.company[0]

'prog '

Get the edit distance for the first words of the company names.

In [420]:
companyArrayCStat = []

start = time.time()
for company in chqStillUnmatched.company:
    thisCompany = []
    for ig in igStillUnmatched.company:
        thisCompany.append(levenshtein_distance(company.split(' ')[0],ig.split(' ')[0]))
    
    companyArrayCStat.append([thisCompany])

allLD = np.concatenate(companyArrayCStat)


And the cosine distance.

In [421]:
chqStillUnmatchedFirstCo = []
igStillUnmatchedFirstCo = []

for company in chqStillUnmatched.company:
    chqStillUnmatchedFirstCo.append(company.split(' ')[0])
    
for company in igStillUnmatched.company:
    igStillUnmatchedFirstCo.append(company.split(' ')[0])

In [422]:
chqUnmatchedList = list(map(nlp, chqStillUnmatchedFirstCo))
allCompaniesIG = list(map(nlp, igStillUnmatchedFirstCo))


cstat = getMatrix(chqUnmatchedList)
ig    = getMatrix(allCompaniesIG)

allSimilarities = cosine_similarity(cstat,ig)

allSimilarities.shape

# largestElementsCos = (-allSimilarities).argsort(axis=-1)[:, :n]
singleLargestCos   = (-allSimilarities).argsort(axis=-1)[:, :1]

In [423]:
igStillUnmatched.head()

Unnamed: 0,index,abi,company,state,city,zipcode,address_line_1
0,17,158329,caridian bct,co,lakewood,80401,14143 denver west pkwy # 200
1,20,211946,family brands,tn,lenoir city,37771,1001 elm hill rd
2,23,258574,jade,pa,huntingdon vly,19006,3063 philmont ave
3,24,262493,montana metal products,il,des plaines,60018,25 howard ave
4,43,455154,o i,tx,college station,77845,151 graham rd


In [424]:
singleLargestLV   = (allLD).argsort(axis=-1)[:, :1]
# largestElementsLV = (allLD).argsort(axis=-1)[:, :n]


companyMatches2 = pd.DataFrame()
companyMatches2['cstatCompanies'] = chqStillUnmatched.company
companyMatches2['cstatadd1']      = chqStillUnmatched.cstatadd1
companyMatches2['cstatCity']      = chqStillUnmatched.cstatCity
companyMatches2['cstatState']     = chqStillUnmatched.cstatState
companyMatches2['cstatZip']       = chqStillUnmatched.cstatZipcode


for i in range(0,companyMatches2.shape[0]):
    # print(list(np.array(hq.company)[largestElements[i]]))
    companyMatches2.at[i,'misspelling']         = allLD[i,:][singleLargestLV[i]]
    companyMatches2.at[i,'levCompany']          = igStillUnmatched.company[singleLargestLV[i]].iloc[0]
    
    companyMatches2.at[i,'closestMatchIG_add']      = igStillUnmatched.address_line_1[singleLargestLV[i]].iloc[0]
    companyMatches2.at[i,'closestMatchIG_city']     = igStillUnmatched.city[singleLargestLV[i]].iloc[0]
    companyMatches2.at[i,'closestMatchIG_zipcode']  = igStillUnmatched.zipcode[singleLargestLV[i]].iloc[0]
    companyMatches2.at[i,'closestMatchIG_state']    = igStillUnmatched.state[singleLargestLV[i]].iloc[0]

    
    companyMatches2.at[i,'cosSimilarity']        = allSimilarities[i,:][singleLargestCos[i]]
    companyMatches2.at[i,'cosSimilarityCompany'] = igStillUnmatched.company[singleLargestCos[i]].iloc[0]
   
    companyMatches2.at[i,'costMatchIG_add']     = igStillUnmatched.address_line_1[singleLargestCos[i]].iloc[0]
    companyMatches2.at[i,'cosMatchIG_city']     = igStillUnmatched.city[singleLargestCos[i]].iloc[0]
    companyMatches2.at[i,'cosMatchIG_zipcode']  = igStillUnmatched.zipcode[singleLargestCos[i]].iloc[0]
    companyMatches2.at[i,'cosMatchIG_state']    = igStillUnmatched.state[singleLargestCos[i]].iloc[0]

    

In [425]:
companyMatches2.head()

Unnamed: 0,cstatCompanies,cstatadd1,cstatCity,cstatState,cstatZip,misspelling,levCompany,closestMatchIG_add,closestMatchIG_city,closestMatchIG_zipcode,closestMatchIG_state,cosSimilarity,cosSimilarityCompany,costMatchIG_add,cosMatchIG_city,cosMatchIG_zipcode,cosMatchIG_state
0,prog,256 west data drive,draper,ut,84020,1.0,pro lift indl equipment,12001 plantside dr,louisville,40299,ky,0.54259,techno serve,1120 19th st nw # 8,washington,20036,dc
1,american greetings,one american road,cleveland,oh,44144,0.0,american realty investors,1603 lbj fwy # 800,dallas,75234,tx,1.0,american central bancorp,3300 hedley rd,springfield,62711,il
2,apa,"2000 post oak boulevard, suite 100",houston,tx,77056,0.0,apa the engineered wood assn,7011 s 19th st,tacoma,98466,wa,1.0,apa the engineered wood assn,7011 s 19th st,tacoma,98466,wa
3,bard (c.r.),730 central avenue,murray hill,nj,7974,0.0,bard capital partners,222 greystone rd,evergreen,80439,co,1.0,bard capital partners,222 greystone rd,evergreen,80439,co
4,earthstone energy,"633 17th street, suite 2320",denver,co,80202,0.0,earthstone energy,1400 woodloch forest dr # 300,the woodlands,77380,tx,0.0,caridian bct,14143 denver west pkwy # 200,lakewood,80401,co


In [426]:
companyMatches2.shape

(397, 17)

Find if city or zip match here.

In [427]:
match2_cityZips = companyMatches2[(((companyMatches2.cstatCity == companyMatches2.closestMatchIG_city) | \
              (companyMatches2.cstatZip == companyMatches2.closestMatchIG_zipcode)) | \
              ((companyMatches2.cstatCity == companyMatches2.cosMatchIG_city) | \
              (companyMatches2.cstatZip == companyMatches2.cosMatchIG_zipcode)))].reset_index(drop=True)


In [428]:
match2_cityZips

Unnamed: 0,cstatCompanies,cstatadd1,cstatCity,cstatState,cstatZip,misspelling,levCompany,closestMatchIG_add,closestMatchIG_city,closestMatchIG_zipcode,closestMatchIG_state,cosSimilarity,cosSimilarityCompany,costMatchIG_add,cosMatchIG_city,cosMatchIG_zipcode,cosMatchIG_state
0,colorado interstate gas,"western pipelines, p.o. box 1087",colorado springs,co,80944,0.0,colorado publishing house,235 s nevada ave,colorado springs,80903,co,1.0,colorado retail venture svc,816 broadway st,sterling,80751,co
1,comtech telecommun,"68 south service road, suite 230",melville,ny,11747,0.0,comtech telecommunications crp,68 s service rd # 230,melville,11747,ny,0.0,caridian bct,14143 denver west pkwy # 200,lakewood,80401,co
2,winthrop realty trust,"7 bulfinch place, suite 500",boston,ma,2114,0.0,winthrop capital partners,7 bulfinch pl # 500,boston,2114,ma,1.0,winthrop capital partners,7 bulfinch pl # 500,boston,2114,ma
3,intl flavors fragrances,521 west 57th street,new york,ny,10019,0.0,intl development envrmnt,1173 2nd ave # 327,new york,10065,ny,1.0,intl matex tank terminals,321 saint charles ave # 200,new orleans,70130,la
4,anixter,2301 patriot boulevard,glenview,il,60026,0.0,anixter international,2301 patriot blvd,glenview,60026,il,0.0,caridian bct,14143 denver west pkwy # 200,lakewood,80401,co
5,monmouth re investment,"bell works complex, suite 1405, 101 crawfords ...",holmdel,nj,7733,0.0,monmouth real est invstmnt crp,101 crawfords corner rd # 1405,holmdel,7733,nj,1.0,monmouth real est invstmnt crp,101 crawfords corner rd # 1405,holmdel,7733,nj
6,tampa electric,"teco plaza, 702 north franklin street",tampa,fl,33602,0.0,tampa hillsborough lbrry systm,900 n ashley dr,tampa,33602,fl,1.0,tampa state bankshares,326 main,tampa,67483,ks
7,virginia electric power,120 tredegar street,richmond,va,23219,0.0,virginia mason memorial,2811 tieton dr,yakima,98902,wa,1.0,virginia secretary of cmmnwlth,1111 e broad st # 4,richmond,23219,va
8,westmorelandal,"9540 south maroon circle, suite 300",englewood,co,80112,2.0,westmoreland mining,9540 maroon cir # 300,englewood,80112,co,0.0,caridian bct,14143 denver west pkwy # 200,lakewood,80401,co
9,tjxs (the),770 cochituate road,framingham,ma,1701,1.0,tjx,770 cochituate rd # 1,framingham,1701,ma,0.0,caridian bct,14143 denver west pkwy # 200,lakewood,80401,co


In [429]:
match2_cityZips['igCompanies'] = ''

for i in range(0,match2_cityZips.shape[0]):
    if ((match2_cityZips.cstatCity[i] == match2_cityZips.closestMatchIG_city[i]) | \
              (match2_cityZips.cstatZip[i] == match2_cityZips.closestMatchIG_zipcode[i])):
        match2_cityZips.loc[i,'igCompanies'] = match2_cityZips.levCompany[i]
    else:
        match2_cityZips.loc[i,'igCompanies'] = match2_cityZips.cosSimilarityCompany[i]


In [430]:
match2_cityZips[['cstatCompanies','igCompanies','closestMatchIG_add','costMatchIG_add']].shape

(42, 4)

In [431]:
match2_cityZips[['cstatCompanies','igCompanies','closestMatchIG_add','costMatchIG_add']].to_csv("../../data/companyData/match2_cityZips.csv")

## Take 3
Try the addresses here.

Let's try something similar:
- Find top 10 most similar addresses by cos sim
- Find top 10 most similar addresses by LD
- Find unique union of these two
- Record LD and cos sim for each
- Filter for totally dissimilar ones
- ``Explode'' the dataset so we have cstat company, address
- Find first word LD and cos sim
- Find total LD and cos sim

In [432]:
n = 10

In [433]:
chqStillUnmatched['cstatadd1']     = chqStillUnmatched.cstatadd1.astype(str)
igStillUnmatched['address_line_1'] = igStillUnmatched.address_line_1.astype(str)

In [434]:
addressArrayCStat = []

start = time.time()
for address in chqStillUnmatched.cstatadd1:
    thisAddress = []
    for ig in igStillUnmatched.address_line_1:
        thisAddress.append(levenshtein_distance(str(address),str(ig)))
    
    addressArrayCStat.append([thisAddress])

allLD = np.concatenate(addressArrayCStat)

singleLargestLV   = (allLD).argsort(axis=-1)[:, :1]
largestElementsLV = (allLD).argsort(axis=-1)[:, :n]


In [435]:
chqUnmatchedList = list(map(nlp, chqStillUnmatched['cstatadd1']))
allCompaniesIG   = list(map(nlp, igStillUnmatched['address_line_1']))


cstat = getMatrix(chqUnmatchedList)
ig    = getMatrix(allCompaniesIG)

allSimilarities = cosine_similarity(cstat,ig)

allSimilarities.shape

singleLargestCos   = (-allSimilarities).argsort(axis=-1)[:, :1]
largestElementsCos = (-allSimilarities).argsort(axis=-1)[:, :n]

In [436]:
companyMatches3 = pd.DataFrame()
companyMatches3['cstatCompanies'] = chqStillUnmatched.company

Let's grab the set of the 10 closest addresses by LD and cos.

In [451]:
companyMatches3.head()

Unnamed: 0,cstatCompanies,closestAdds_indices
0,prog,
1,american greetings,
2,apa,
3,bard (c.r.),
4,earthstone energy,


Find the unique values in here.

In [452]:
set(largestElementsLV[i]).union(set(largestElementsCos[i]))  

{922,
 2060,
 3401,
 7689,
 9732,
 11623,
 12672,
 15080,
 16088,
 23186,
 26696,
 27546,
 32508,
 34245,
 35339,
 36426,
 40142,
 47201,
 51065,
 52646}

In [455]:
companyMatches3.head()

Unnamed: 0,cstatCompanies,closestAdds_indices
0,prog,"{12672, 9732, 34245, 26696, 3401, 7689, 35339,..."
1,american greetings,"{2498, 26437, 49286, 41165, 41295, 50578, 3618..."
2,apa,"{42176, 36233, 20942, 8337, 10582, 3993, 21212..."
3,bard (c.r.),"{20352, 34050, 24964, 53968, 39123, 51028, 555..."
4,earthstone energy,"{24516, 9669, 28361, 37579, 8981, 16471, 37852..."


In [454]:
companyMatches3['closestAdds_indices'] = ''


for i in range(0,companyMatches3.shape[0]):
    
    # find all the closest LV and cos addresses and put them in a 
    companyMatches3.at[i,'closestAdds_indices']  = set(largestElementsLV[i]).union(set(largestElementsCos[i]))    

And explode it so one index per line.

In [456]:
companyMatches3_indices = companyMatches3.explode('closestAdds_indices').reset_index(drop=True)

Now get the companies, the cosine similarities, and the levenshtein distances.

In [457]:
companyMatches3_indices.closestAdds_indices

0       12672
1        9732
2       34245
3       26696
4        3401
        ...  
7251    56947
7252    27062
7253    53303
7254    53305
7255    32443
Name: closestAdds_indices, Length: 7256, dtype: object

Get the embeddings for the cosine similarity.

In [458]:
for i in range(0,companyMatches3_indices.shape[0]):
    companyMatches3_indices.at[i,'igCompanies']  = igStillUnmatched.company[companyMatches3_indices.closestAdds_indices[i]] # .iloc[0]

    companyMatches3_indices.at[i,'lv']           = levenshtein_distance(companyMatches3_indices.cstatCompanies[i],companyMatches3_indices.igCompanies[i])
    companyMatches3_indices.at[i,'percMisspelled']       = companyMatches3_indices.lv[i]/len(companyMatches3_indices.cstatCompanies[i])

    # companyMatches3.at[i,'add_percMisspelled'] = companyMatches2.add_misspelling[i]/len(companyMatches2['cstatadd1'][i])


Get the company embeddings here.

In [459]:
cMatches  = companyMatches3_indices[['cstatCompanies']].drop_duplicates()
igMatches = companyMatches3_indices[['igCompanies']].drop_duplicates()



In [460]:
def getVector(text):
    embedding = nlp(text)
    
    return(embedding.vector)

In [461]:
cStatEmbeddings = list(map(getVector, cMatches.cstatCompanies))
igEmbeddings    = list(map(getVector, igMatches.igCompanies))

In [462]:
igMatches['igEmbedding']       = igEmbeddings
cMatches['cstatEmbedding']     = cStatEmbeddings

In [463]:
companyMatches3_indices = companyMatches3_indices.merge(igMatches).merge(cMatches)

In [464]:
companyMatches3_indices.shape

(7256, 7)

Loop through and get the cosine similarity.

In [465]:
for i in range(0,companyMatches3_indices.shape[0]):
    companyMatches3_indices.at[i,'cosSim']  = cosine_similarity([companyMatches3_indices.igEmbedding[i]],
                                                                [companyMatches3_indices.cstatEmbedding[i]])


In [466]:
filtered = companyMatches3_indices[((companyMatches3_indices.percMisspelled < 0.4) | \
                        (companyMatches3_indices.cosSim > 0.6))]

In [470]:
filtered[['cstatCompanies','igCompanies','percMisspelled','cosSim']].to_csv("../../data/companyData/companyMatches3_indices.csv")

In [469]:
filtered.shape

(85, 8)

In [467]:
filtered.head()

Unnamed: 0,cstatCompanies,closestAdds_indices,igCompanies,lv,percMisspelled,igEmbedding,cstatEmbedding,cosSim
214,monmouth re investment,22335,monmouth real est invstmnt crp,12.0,0.545455,"[-0.283336, 0.14570001, -0.0871573, -0.2300073...","[-0.057172, 0.07114, 0.002073332, -0.32842332,...",0.604113
319,american greetings,31069,american united mutual ins,14.0,0.736842,"[-0.46551502, 0.14741275, 0.06110751, -0.03591...","[-0.46316501, 0.059975, -0.18294, -0.28495002,...",0.674053
321,american greetings,20519,american greetings,1.0,0.052632,"[-0.46316501, 0.059975, -0.18294, -0.28495002,...","[-0.46316501, 0.059975, -0.18294, -0.28495002,...",1.0
331,intl business machines,21249,international business,14.0,0.636364,"[-0.1470675, 0.483495, 0.28637648, -0.17117998...","[-0.11614334, 0.24550134, 0.11403233, -0.64854...",0.712628
566,lumen technologies,35266,stealthbits technologies,10.0,0.555556,"[-0.23206, -0.049808, 0.158955, -0.269525, 0.0...","[0.08609001, 0.17962201, 0.191066, -0.13955499...",0.677055


# Combine all these things

In [471]:
dset1 = pd.read_csv("../../data/companyData/companiesToCheck_cityZip.csv")[['cstatCompanies','igCompanies','delete']]

In [472]:
dset2 = pd.read_csv("../../data/companyData/match2_cityZips.csv")[['cstatCompanies','igCompanies','delete']]

In [474]:
dset3 = pd.read_csv("../../data/companyData/companyMatches3_indices.csv")[['cstatCompanies','igCompanies','delete']]

In [475]:
dset1[['cstatCompanies','igCompanies','delete']].head()

Unnamed: 0,cstatCompanies,igCompanies,delete
0,american software,american software,
1,apco oil and gas,apco oil gas,
2,constellation energy grp,constellation energy,
3,central natural res,central natural resources,
4,cracker barrel old ctry stor,cracker barrel olduntry str,


In [476]:
dset2[['cstatCompanies','igCompanies','delete']].head()

Unnamed: 0,cstatCompanies,igCompanies,delete
0,colorado interstate gas,colorado publishing house,
1,comtech telecommun,comtech telecommunications crp,1.0
2,winthrop realty trust,winthrop capital partners,1.0
3,intl flavors fragrances,intl development envrmnt,1.0
4,anixter,anixter international,


In [477]:
dset3[['cstatCompanies','igCompanies','delete']].head()

Unnamed: 0,cstatCompanies,igCompanies,delete
0,first data,location based technologies,1.0
1,albany,albany international,
2,kbs reit ii,kbs legacy partners apt reit,
3,ring energy,eqm technologies energy,1.0
4,timios national,national alliance mental ill,1.0


In [478]:
cleanMerge = pd.read_csv("../../data/companyData/nameMerge.csv")[['company']].\
    rename(columns = {'company': 'cstatCompanies'})
cleanMerge['igCompanies'] = cleanMerge['cstatCompanies']

cleanMerge['delete'] = ''

print(cleanMerge.head())


           cstatCompanies             igCompanies delete
0  adc telecommunications  adc telecommunications       
1      american airlines       american airlines        
2   pinnacle west capital   pinnacle west capital       
3     abbott laboratories     abbott laboratories       
4                   aceto                   aceto       


In [479]:
all = pd.concat([dset1,dset2,dset3,cleanMerge])
allFiltered = all[~(all.delete == 1.0)].drop_duplicates()

In [480]:
cleanMerge.head()

Unnamed: 0,cstatCompanies,igCompanies,delete
0,adc telecommunications,adc telecommunications,
1,american airlines,american airlines,
2,pinnacle west capital,pinnacle west capital,
3,abbott laboratories,abbott laboratories,
4,aceto,aceto,


In [481]:
allFiltered.shape

(2618, 3)

In [482]:
chq     = pd.read_csv("../../data/chq.csv",dtype={'cstatZipcode': 'object'}).drop(columns = {'Unnamed: 0'})

chq['cstatCompanies'] = list(map(cleanText, chq.company))
chqToMatch = chq[['cstatCompanies','gvkey']]

In [483]:
ig = pd.read_csv("../../data/companyData/lastHQs.csv")
ig['igCompanies'] = list(map(cleanText, ig.company))

igToMatch = ig[['igCompanies','abi']]

In [484]:
igToMatch.head()

Unnamed: 0,igCompanies,abi
0,caridian bct,158329
1,family brands,211946
2,jade,258574
3,montana metal products,262493
4,o i,455154


Put these all together.

In [485]:
gvKey_abiLinkingTable = allFiltered.merge(chqToMatch).merge(igToMatch).drop_duplicates()

In [486]:
gvKey_abiLinkingTable.to_csv('../../data/companyData/linkingTable.csv')

# Create the SC Linking Table for 2010s

In [487]:
c_links = pd.read_csv("../../data/companyData/compustatSCLinked.csv")

c_links['year'] = c_links.srcdate.astype('str').str.slice(0,4).astype('int64')

c_links = c_links[c_links.year > 2009][['year','gvkey','cgvkey','salecs']].\
    rename(columns = {'cgvkey': 'customer_gvkey','gvkey': 'supplier_gvkey'})




c_links.head()

Unnamed: 0,year,supplier_gvkey,customer_gvkey,salecs
80,2010,1013,9899,300.0
81,2010,1013,2136,146.0
281,2016,1094,31673,78.193
282,2017,1094,31673,76.598
283,2017,1094,7171,70.215


In [488]:
gvKey_abiLinkingTable = pd.read_csv('../../data/companyData/linkingTable.csv')


base_columns = gvKey_abiLinkingTable.columns 
customer_columns = "customer_" + base_columns
supplier_columns = "supplier_" + base_columns



#########################
# merge in customer information
gvKey_abiLinkingTable.columns = customer_columns

print(c_links.shape)
c_linksMerge1 = c_links.merge(gvKey_abiLinkingTable, on ='customer_gvkey')
print(c_links.shape,c_linksMerge1.shape)



#########################
# and merge in supplier 
gvKey_abiLinkingTable.columns = supplier_columns

print(c_links.shape)
c_linksMerge2 = c_linksMerge1.merge(gvKey_abiLinkingTable, on ='supplier_gvkey')
print(c_links.shape,c_linksMerge2.shape)

(34473, 4)
(34473, 4) (24295, 9)
(34473, 4)
(34473, 4) (20544, 14)


In [489]:
c_linksMerge2.to_csv("../../data/companyData/clinks_IG_selected.csv")

In [490]:
gvKey_abiLinkingTable

Unnamed: 0,supplier_Unnamed: 0,supplier_cstatCompanies,supplier_igCompanies,supplier_delete,supplier_gvkey,supplier_abi
0,0,american software,american software,,1562,4378204
1,1,apco oil and gas,apco oil gas,,1682,544813678
2,2,constellation energy grp,constellation energy,,1995,506384064
3,3,central natural res,central natural resources,,2852,312712631
4,4,cracker barrel old ctry stor,cracker barrel olduntry str,,3570,852053057
...,...,...,...,...,...,...
2654,2654,cdti advanced materials,cdti advanced materials,,282553,967328568
2655,2655,futurefuel,futurefuel,,287462,679546432
2656,2656,lyondellbasell industries nv,lyondellbasell industries nv,,294524,200051589
2657,2657,doriang,doriang,,317264,435494175


This is probably because: (1) companies are not in North America, or (2) companies are not in the physical goods industries we're interested in. We can verify this though: look at c_links where both the customer and supplier are in the dataset of interest.

In [491]:
c_linkTest = c_links[c_links.customer_gvkey.isin(chq.gvkey.unique()) & \
                     c_links.supplier_gvkey.isin(chq.gvkey.unique())]
print("Percent of firms with a match: ", c_linksMerge2.shape[0]/c_linkTest.shape[0])

Percent of firms with a match:  0.8768620086217935


It's entirely possible that we have too small of a sample from the 2010s alone. Let's just try it though and see how it goes.

First, make a sample with the companies on three years of either side of when it reports another customer.

In [492]:
scTableCustomers = c_linksMerge2.copy()[['year','customer_gvkey','customer_abi']]
scTableSuppliers = c_linksMerge2.copy()[['year','supplier_gvkey','supplier_abi']]

In [493]:
scTableCustomers.head()

Unnamed: 0,year,customer_gvkey,customer_abi
0,2010,9899,460637358
1,2010,2136,7564776
2,2010,9899,460637358
3,2011,9899,460637358
4,2012,9899,460637358


In [494]:
def makeThreeEitherSide(df): 
    yrPlus1 = df.copy(); yrPlus1['year'] += 1
    yrPlus2 = df.copy(); yrPlus2['year'] += 1
    yrPlus3 = df.copy(); yrPlus3['year'] += 1
    
    yrMinus1 = df.copy(); yrMinus1['year'] -= 1
    yrMinus2 = df.copy(); yrMinus2['year'] -= 1
    yrMinus3 = df.copy(); yrMinus3['year'] -= 1
    
    all = pd.concat([yrPlus1,yrPlus2,yrPlus3,yrMinus1,yrMinus2,yrMinus3])
    
    return(all)

In [495]:
allCustomerData = makeThreeEitherSide(scTableCustomers)
allCustomerData.columns = ['year','gvkey','abi']


allSupplierData = makeThreeEitherSide(scTableSuppliers)
allSupplierData.columns = ['year','gvkey','abi']

In [496]:
allSupplierData.year

0        2011
1        2011
2        2011
3        2012
4        2013
         ... 
20539    2014
20540    2015
20541    2016
20542    2017
20543    2018
Name: year, Length: 123264, dtype: int64

In [497]:
allAbi = allCustomerData.abi.append(allSupplierData.abi).drop_duplicates()

In [498]:
hqsOnly = pd.read_csv("../../data/ig2010s_uniqueHQs.csv").drop(columns = {'Unnamed: 0'})

In [499]:
hqsOnly.head()

Unnamed: 0,abi,company
0,7609,SOLITRON DEVICES INC
1,21311,WESTERN STATES ENVELOPE & LBL
2,29603,THIELE KAOLIN CO
3,71340,TRI STAFF GROUP
4,77743,NATIONAL TECHNICAL SYSTEMS INC


In [500]:
hq = pd.read_csv("../../data/ig2010s_uniqueHQs_multLocations.csv").\
    drop(columns = {'Unnamed: 0'}).\
    rename(columns = {'archive_version_year': 'year'})

hq['year'] = hq.year.astype('int64')

hqRelevant = hq[hq.abi.isin(allAbi)]

In [501]:
hqRelevant[hqRelevant.abi == 71340]

Unnamed: 0,abi,ticker,company,year,state,city,address_line_1,zipcode,latitude,longitude


In [502]:
allSupplierData = allSupplierData.merge(hqRelevant).drop_duplicates()
allCustomerData = allCustomerData.merge(hqRelevant).drop_duplicates()

In [503]:
allCustomerData.to_csv("../../data/companyData/allCustomerData.csv")
allSupplierData.to_csv("../../data/companyData/allSupplierData.csv")

In [504]:
allSupplierData.shape

(9612, 11)

In [505]:
hqsOnly = pd.read_csv("../../data/ig2010s_uniqueHQs.csv")

In [506]:
allAbi.isin(hqsOnly.abi)

0        True
1        True
12       True
14       True
19       True
         ... 
20517    True
20525    True
20527    True
20529    True
20532    True
Name: abi, Length: 2390, dtype: bool

In [507]:
hq.head()

Unnamed: 0,abi,ticker,company,year,state,city,address_line_1,zipcode,latitude,longitude
0,7609,SODI,SOLITRON DEVICES INC,2010,FL,WEST PALM BEACH,3301 ELECTRONICS WAY # C,33407,26.7412,-80.06694
1,21311,,WESTERN STATES ENVELOPE & LBL,2010,WI,BUTLER,4480 N 132ND ST,53007,43.09799,-88.07399
2,29603,,THIELE KAOLIN CO,2010,GA,SANDERSVILLE,520 KAOLIN RD,31082,32.96893,-82.81953
3,71340,,TRI STAFF GROUP,2010,CA,SAN DIEGO,6336 GREENWICH DR # 100,92122,32.85445,-117.18594
4,77743,,NATIONAL TECHNICAL SYSTEMS INC,2010,CA,CALABASAS,24007 VENTURA BLVD # 200,91302,34.15562,-118.65163
