In [2]:
import pickle as pkl
import pandas as pd
import matplotlib
import os
import re

import collections
import datetime
import time

import geopandas as gpd

import numpy as np

import rasterio

import spacy
  
nlp = spacy.load('en_core_web_lg')
from sklearn.metrics.pairwise import cosine_similarity

import itertools

# Get Full IG Dataset

In [3]:
file = "../../data/companyData/infogroup2010s.csv"

In [4]:
import dask.dataframe as dd

df = dd.read_csv(file, assume_missing=True, 
                 dtype={'parent_number': 'object','parent_employee_size_code': 'object',
                       'parent_sales_volume_code': 'object',
                       'abi': 'object','zipcode': 'object'}, low_memory = False)
df = df[df.business_status_code == 1.0]


  import pandas.util.testing as tm


In [20]:
abiRents = df[['abi','parent_number']].drop_duplicates().compute(num_workers = 25)

In [22]:
abiRents.to_csv("../../data/abiRents.csv")

In [None]:
differentParents.to_csv("../../data/differentParents2010s.csv")

In [None]:
hq = df[['abi','ticker','company','archive_version_year','state','city',
         'address_line_1','zipcode',
         'latitude','longitude']].drop_duplicates().compute(num_workers = 100)

In [None]:
hqsOnly = hq[['abi','company']].drop_duplicates()

In [None]:
hqsOnly.company.value_counts()

In [None]:
print(hq.shape,hqsOnly.shape)

Some of the abi numbers seem to be duplicated; it looks like they might be primarily for different government agencies.

In [None]:
hqsOnly.company.value_counts()[hqsOnly.company.value_counts() > 10].index

In [None]:
toDiscard = hqsOnly.company.value_counts()[hqsOnly.company.value_counts() > 1].index
for company in toDiscard:
    print(company)


In [None]:
toDiscard

In [None]:
hqsOnly = hqsOnly[~hqsOnly.company.isin(toDiscard)]
hq      = hq[~hq.company.isin(toDiscard)]

In [None]:
hq.shape

At this point, we have a unique record of every company - hq here. Some of these may well be duplicate entries for a given company, for the cases in which we have a company that has multiple hq.

Let's stash it so that we don't have to go through the above ^^ again.

In [None]:
hqsOnly.to_csv("../../data/ig2010s_uniqueHQs.csv")

In [None]:
hq.to_csv("../../data/ig2010s_uniqueHQs_multLocations.csv")

## Grab Compustat Data

First filter down to the companies for whom we have the supply chain information.

In [27]:
c_links = pd.read_csv("../../data/companyData/compustatSCLinked.csv")

c_links['year'] = c_links.srcdate.astype('str').str.slice(0,4).astype('int64')

c_links = c_links[c_links.year > 2009]

relevant_gvkeys = c_links.gvkey.append(c_links.cgvkey).drop_duplicates()

print(c_links.head(),relevant_gvkeys.shape)

     gvkey                        conm  cgvkey                       cconm  \
80    1013  ADC TELECOMMUNICATIONS INC    9899                    AT&T INC   
81    1013  ADC TELECOMMUNICATIONS INC    2136  VERIZON COMMUNICATIONS INC   
281   1094                  ACETO CORP   31673      AMERISOURCEBERGEN CORP   
282   1094                  ACETO CORP   31673      AMERISOURCEBERGEN CORP   
283   1094                  ACETO CORP    7171               MCKESSON CORP   

                       cnms   srcdate  cid  sid    ctype   salecs     scusip  \
80                     AT&T  20100930   16    0  COMPANY  300.000  000886309   
81   VERIZON COMMUNICATIONS  20100930   13    0  COMPANY  146.000  000886309   
281  AmerisourceBergen Corp  20160630   13    0  COMPANY   78.193  004446100   
282  AmerisourceBergen Corp  20170630   13    0  COMPANY   76.598  004446100   
283           McKesson Corp  20170630   19    0  COMPANY   70.215  004446100   

       stic     ccusip ctic  year  
80   ADCT.1  0

In [28]:
c_links.gvkey.unique().shape[0]

2479

Get the company dataset and check.

The legal name and the given name are slightly different, but basically the same modulo punctuation and case.

In [29]:
c_addresses = pd.read_csv("../../data/companyData/compustatAddresses.csv", 
                dtype={'parent_number': 'object'})[['fyear',
                'gvkey',
                'conm',
                'add1',
                'city',
                'state',
                'idbflag',
                'addzip',
               'naics']].drop_duplicates().rename(columns = {'fyear': 'year'})
c_addresses = c_addresses[(c_addresses.year > 2009) & (c_addresses.year < 2020)]

  exec(code_obj, self.user_global_ns, self.user_ns)


In [30]:
c_addresses.year.value_counts()

2012.0    11836
2013.0    11829
2014.0    11637
2011.0    11527
2015.0    11469
2016.0    11329
2018.0    11302
2019.0    11275
2017.0    11243
2010.0    10855
Name: year, dtype: int64

Subset this to focus on firms in: ag, mining, construction, manufacturing, wholesale and retail, and transportation.

In [31]:
c_addresses = c_addresses[(c_addresses.naics.astype('str').str.slice(0,2).isin(['11','21','22','23','31','32',
                                                         '33','42','44','45','48','49']))]

chq = c_addresses[['gvkey','conm','add1','city','state','addzip','idbflag']].drop_duplicates()

We're starting with the compustat north america dataset. Not all of the HQs are in North America, so we can filter some of the information down to match with Infogroup.

In [32]:
chq.idbflag.value_counts()

D    6960
B    1121
Name: idbflag, dtype: int64

In [33]:
canadian = ['ON', 'AB','QC', 'BC', 'NS', 'NF', 'SK', 'MB', 'NB']

chq.state.unique()

array(['IL', 'MN', 'TX', 'SC', 'AZ', 'NY', 'CT', 'FL', 'CA', nan, 'ON',
       'PA', 'NC', 'AL', 'WA', 'HI', 'NJ', 'MA', 'OH', 'NV', 'UT', 'OK',
       'WI', 'AR', 'CO', 'IA', 'DE', 'RI', 'AB', 'GA', 'MD', 'ME', 'VA',
       'IN', 'SD', 'OR', 'QC', 'BC', 'KY', 'MO', 'LA', 'VT', 'TN', 'MI',
       'DC', 'ID', 'ND', 'MS', 'KS', 'NS', 'NH', 'NM', 'NE', 'MT', 'WY',
       'NF', 'SK', 'MB', 'WV', 'NB'], dtype=object)

In [34]:
chq = chq[~(chq.state.isin(canadian)) & ~chq.state.isna()]

In [35]:
chq.addzip.str.len().value_counts()

5     4628
10     481
Name: addzip, dtype: int64

In [36]:
chq['addzip'] = chq.addzip.astype('str').str.slice(0,5)

In [37]:
chq[chq.idbflag == 'D'].addzip.str.len().value_counts()

5    4689
Name: addzip, dtype: int64

In [38]:
chq[chq.idbflag == "B"].addzip.value_counts()

77002    11
80202     5
94080     5
02139     5
10022     4
         ..
01760     1
97124     1
85251     1
18940     1
33394     1
Name: addzip, Length: 326, dtype: int64

In [39]:
print(chq.head(),chq.shape)

    gvkey                         conm  \
1    1004                     AAR CORP   
12   1013   ADC TELECOMMUNICATIONS INC   
13   1045  AMERICAN AIRLINES GROUP INC   
25   1050      CECO ENVIRONMENTAL CORP   
59   1072                     AVX CORP   

                                        add1          city state addzip  \
1   One AAR Place, 1100 North Wood Dale Road     Wood Dale    IL  60191   
12                    13625 Technology Drive  Eden Prairie    MN  55344   
13                           1 Skyview Drive    Fort Worth    TX  76155   
25     14651 North Dallas Parkway, Suite 500        Dallas    TX  75254   
59                         One AVX Boulevard  Fountain Inn    SC  29644   

   idbflag  
1        D  
12       D  
13       B  
25       D  
59       D   (5109, 7)


In [50]:
chq.rename(columns = {'conm': 'company','addzip': 'cstatZipcode'},inplace = True)
chq.head()

Unnamed: 0,gvkey,company,add1,city,state,cstatZipcode,idbflag
0,1013,ADC TELECOMMUNICATIONS INC,13625 Technology Drive,Eden Prairie,MN,55344,D
1,1045,AMERICAN AIRLINES GROUP INC,1 Skyview Drive,Fort Worth,TX,76155,B
2,1075,PINNACLE WEST CAPITAL CORP,"400 North Fifth Street, P.O. Box 53999",Phoenix,AZ,85072,D
3,1078,ABBOTT LABORATORIES,"100 Abbott Park Road, Abbott Park",North Chicago,IL,60064,B
4,1094,ACETO CORP,4 Tri Harbor Court,Port Washington,NY,11050,D


In [51]:
chq = chq[chq.gvkey.isin(relevant_gvkeys)]

In [52]:
chq.shape

(2101, 7)

In [53]:
chq.to_csv("../../data/chq.csv")

# Breakpoint
We can start from here and just clean everything from here.


Headquarters:

In [61]:
chq     = pd.read_csv("../../data/chq.csv",dtype={'cstatZipcode': 'object'}).drop(columns = {'Unnamed: 0'})

chq.head()

Unnamed: 0,gvkey,company,add1,city,state,cstatZipcode,idbflag
0,1013,ADC TELECOMMUNICATIONS INC,13625 Technology Drive,Eden Prairie,MN,55344,D
1,1045,AMERICAN AIRLINES GROUP INC,1 Skyview Drive,Fort Worth,TX,76155,B
2,1075,PINNACLE WEST CAPITAL CORP,"400 North Fifth Street, P.O. Box 53999",Phoenix,AZ,85072,D
3,1078,ABBOTT LABORATORIES,"100 Abbott Park Road, Abbott Park",North Chicago,IL,60064,B
4,1094,ACETO CORP,4 Tri Harbor Court,Port Washington,NY,11050,D


In [62]:
hqsOnly     = pd.read_csv("../../data/ig2010s_uniqueHQs.csv").drop(columns = {'Unnamed: 0'})
hqsWithYear = pd.read_csv("../../data/ig2010s_uniqueHQs_multLocations.csv",dtype={'zipcode': 'object'})[['abi','company',
                                                                             'archive_version_year',
                                                                             'state','city','zipcode','address_line_1']]

print(hqsOnly.head())
print(hqsWithYear.head())

hqsWithYear = hqsWithYear[hqsWithYear.archive_version_year <= 2020]

hqsWithYear['last_year'] = hqsWithYear.groupby(['abi'])['archive_version_year'].transform(max)

print(hqsWithYear.shape)

lastHQs = hqsWithYear[hqsWithYear.archive_version_year == hqsWithYear.last_year][['abi','company','state','city','zipcode','address_line_1']]

print(lastHQs.shape)

lastHQs.head()

     abi                         company
0   7609            SOLITRON DEVICES INC
1  21311   WESTERN STATES ENVELOPE & LBL
2  29603                THIELE KAOLIN CO
3  71340                 TRI STAFF GROUP
4  77743  NATIONAL TECHNICAL SYSTEMS INC
     abi                         company  archive_version_year state  \
0   7609            SOLITRON DEVICES INC                2010.0    FL   
1  21311   WESTERN STATES ENVELOPE & LBL                2010.0    WI   
2  29603                THIELE KAOLIN CO                2010.0    GA   
3  71340                 TRI STAFF GROUP                2010.0    CA   
4  77743  NATIONAL TECHNICAL SYSTEMS INC                2010.0    CA   

              city zipcode            address_line_1  
0  WEST PALM BEACH   33407  3301 ELECTRONICS WAY # C  
1           BUTLER   53007           4480 N 132ND ST  
2     SANDERSVILLE   31082             520 KAOLIN RD  
3        SAN DIEGO   92122   6336 GREENWICH DR # 100  
4        CALABASAS   91302  24007 VENTURA BLVD

Unnamed: 0,abi,company,state,city,zipcode,address_line_1
17,158329,CARIDIAN BCT INC,CO,LAKEWOOD,80401,14143 DENVER WEST PKWY # 200
20,211946,FAMILY BRANDS INTL LLC,TN,LENOIR CITY,37771,1001 ELM HILL RD
23,258574,JADE CORP,PA,HUNTINGDON VLY,19006,3063 PHILMONT AVE
24,262493,MONTANA METAL PRODUCTS LLC,IL,DES PLAINES,60018,25 HOWARD AVE
43,455154,O I CORP,TX,COLLEGE STATION,77845,151 GRAHAM RD


In [241]:
lastHQs.to_csv("../../data/companyData/lastHQs.csv")

Only two of these company names appear 2x, which is good. There are ~20,000 companies in this sample.

Let's go through a little bit of a process here:
- Find the exact matches.
- Get a similarity measure between ; ideally something vectorized / something in matrix math.
- Find the top 10 matches for the remaining ones.
- Do some mix and match and see if there's any threshold at which matches become similar ``enough'' to say this is okay and good to go.


We might be able to use the fact that all of the addresses should be the same after some given point, as the compustat addresses are only the most recent ones. 

Let's try a few different ways to match these up.

First, let's find the exact matches.

Make a generic cleaning function that strips out all company names, any punctuation in the name, and makes everything lower case.

In [63]:
def cleanText(text):
    text = text.\
    replace(" CORP","").replace(" CO","").replace(" INC","").\
    replace(" LTD","").replace(" -CL A","").\
    replace(" -LP","").replace(" LP","").\
    replace("-OLD","").replace(" LLC","").\
    replace(", LLC.","").replace(" L.L.C.","").replace(" L.P.","").\
    replace(" L.TD","").replace(" L.L.C.","").replace(" -CL B","").\
    replace(" -CL B","").replace(" -CL i","").replace(" -CL","").\
    replace("-REDH","").replace(" CP","").\
    replace("-ADR","").replace(" PLC","").lower().replace(r'[^\w\s]+', '').\
    replace('-lp','').replace('-spn','').replace('hldg','').replace(' intl','').\
    replace('holdings','').replace('holding','').replace('prtnr','').replace('group','').\
    replace(" med ", " medical ").replace(" tradng ", " trading ").replace("gen ", "general ").\
    replace(" mtr ", " motors ").replace(" motor ", " motors ").replace("-", " ").\
    replace("/", " ").replace("'", " ").replace("&", " ").replace(" a g ", " ").\
    replace(" ag ", " ").replace("  adr ", " ").replace(" adr ", " ").replace("  cp ", " ").\
    replace(" cp ", " ").replace(" plc ", " ").replace(" intl ", " ").replace(" ent ", " ").\
    replace(" nv ", " ").replace(" n.v. ", " ").replace(" worldwide ", " ").\
    replace(" wldwide ", " ")
    
    return text

In [64]:
chq['company']               = list(map(cleanText, chq.company))
lastHQs['company']           = list(map(cleanText, lastHQs.company))

chq.rename(columns = {'city': 'cstatCity',
                     'state': 'cstatState',
                     'add1': 'cstatadd1'}, inplace = True)

chq['cstatCity']  = chq.cstatCity.str.lower()
chq['cstatState'] = chq.cstatState.str.lower()
chq['cstatadd1']  = chq.cstatadd1.str.lower()

lastHQs['city']            = lastHQs.city.str.lower()
lastHQs['state']           = lastHQs.state.str.lower()
lastHQs['address_line_1']  = lastHQs.address_line_1.str.lower()

NAICS names do not match up between compustat and infogroup so they're not helpful.

In [65]:
chq.head()

Unnamed: 0,gvkey,company,cstatadd1,cstatCity,cstatState,cstatZipcode,idbflag
0,1013,adc telecommunications,13625 technology drive,eden prairie,mn,55344,D
1,1045,american airlines,1 skyview drive,fort worth,tx,76155,B
2,1075,pinnacle west capital,"400 north fifth street, p.o. box 53999",phoenix,az,85072,D
3,1078,abbott laboratories,"100 abbott park road, abbott park",north chicago,il,60064,B
4,1094,aceto,4 tri harbor court,port washington,ny,11050,D


In [66]:
lastHQs.head()

Unnamed: 0,abi,company,state,city,zipcode,address_line_1
17,158329,caridian bct,co,lakewood,80401,14143 denver west pkwy # 200
20,211946,family brands,tn,lenoir city,37771,1001 elm hill rd
23,258574,jade,pa,huntingdon vly,19006,3063 philmont ave
24,262493,montana metal products,il,des plaines,60018,25 howard ave
43,455154,o i,tx,college station,77845,151 graham rd


## Match on company name directly

In [218]:
nameMerge = chq.merge(lastHQs)
nameMerge.shape

(1603, 12)

In [219]:
nameMerge.head()

Unnamed: 0,gvkey,company,cstatadd1,cstatCity,cstatState,cstatZipcode,idbflag,abi,state,city,zipcode,address_line_1
0,1013,adc telecommunications,13625 technology drive,eden prairie,mn,55344,D,7523129,mn,eden prairie,55344,13625 technology dr
1,1045,american airlines,1 skyview drive,fort worth,tx,76155,B,7501711,tx,fort worth,76155,1 skyview dr
2,1075,pinnacle west capital,"400 north fifth street, p.o. box 53999",phoenix,az,85072,D,4554051,az,phoenix,85004,400 n 5th st frnt
3,1078,abbott laboratories,"100 abbott park road, abbott park",north chicago,il,60064,B,4352373,il,abbott park,60064,100 abbott park rd
4,1094,aceto,4 tri harbor court,port washington,ny,11050,D,433140944,tx,houston,77010,1221 mckinney st # 3275


In [220]:
nameMerge.to_csv("../../data/companyData/nameMerge.csv")

In [69]:
sum(nameMerge.cstatState == nameMerge.state)/nameMerge.shape[0]

0.9563318777292577

In [70]:
sum(nameMerge.cstatZipcode.str.slice(0,5) == nameMerge.zipcode.str.slice(0,5))/nameMerge.shape[0]

0.7554585152838428

In [71]:
sum(nameMerge.cstatZipcode.str.slice(0,1) == nameMerge.zipcode.str.slice(0,1))/nameMerge.shape[0]

0.8284466625077979

In [72]:
nameMerge[nameMerge.cstatCity != nameMerge.city][50:100]

Unnamed: 0,gvkey,company,cstatadd1,cstatCity,cstatState,cstatZipcode,idbflag,abi,state,city,zipcode,address_line_1
439,12392,skywest,444 south river road,saint george,ut,84790,D,4716585,ut,st george,84790,444 s river rd
440,12395,toll brothers,1140 virginia drive,fort washington,pa,19034,D,121181911,pa,horsham,19044,250 gibraltar rd
443,12482,clearfield,"7050 winnetka avenue north, suite 100, brookly...",minneapolis,mn,55428,D,501829584,mn,brooklyn park,55428,7050 winnetka ave n # 100
470,13431,berry,"16000 north dallas parkway, suite 500",dallas,tx,75248,D,425673043,ks,wichita,67219,2402 e 37th st n
474,13634,exide technologies,"13000 deerfield parkway, building 200",milton,ga,30004,D,400648903,ga,alpharetta,30004,13000 deerfield pkwy # 200
479,13782,multi color,"4053 clough woods drive, batavia",cincinnati,oh,45103,D,9277914,oh,batavia,45103,4053 clough woods dr
501,14417,micronetics,201 riverneck road,chelmsford,ma,1824,D,4254223,nh,hudson,3051,26 hampshire dr
522,15274,walter energy,"3000 riverchase galleria, suite 1700",birmingham,al,35244,D,441471463,al,hoover,35244,3000 riverchase galleria #1700
524,15459,tredegar,1100 boulders parkway,richmond,va,23225,D,200105807,va,n chesterfield,23225,1100 boulders pkwy # 200
528,15708,allergan,2525 dupont drive,irvine,ca,92612,D,172439986,nj,madison,7940,5 giralda farms


Now focus down onto the companies that have not been matched.

In [73]:
chqUnmatched = chq[~chq.company.isin(nameMerge.company)].reset_index()
chqUnmatched.shape

(523, 8)

In [74]:
igUnmatched  = lastHQs[~lastHQs.company.isin(nameMerge.company)].reset_index()
igUnmatched.shape

(58209, 7)

In [75]:
lastHQs.shape

(59805, 6)

# Find Distance

Two distance measures here. Look at top 5 matches and pull the distance measure and matches as well.

### Levenshtein

In [76]:
from Levenshtein import distance as levenshtein_distance

Find LD between the unmatched compustat companies and the unmatched IG ones. 

In [77]:
companyArrayCStat = []

company = chqUnmatched.company[0]
start = time.time()
for company in chqUnmatched.company:
    thisCompany = []
    for ig in igUnmatched.company:
        thisCompany.append(levenshtein_distance(company,ig))
    
    companyArrayCStat.append([thisCompany])

allLD = np.concatenate(companyArrayCStat)


In [78]:
igUnmatched.head()

Unnamed: 0,index,abi,company,state,city,zipcode,address_line_1
0,17,158329,caridian bct,co,lakewood,80401,14143 denver west pkwy # 200
1,20,211946,family brands,tn,lenoir city,37771,1001 elm hill rd
2,23,258574,jade,pa,huntingdon vly,19006,3063 philmont ave
3,24,262493,montana metal products,il,des plaines,60018,25 howard ave
4,43,455154,o i,tx,college station,77845,151 graham rd


Now pull the closest companies in IG to those in CStat. We'll first populate a dataframe with the name, address, city, state, and zip of each unmatched company in compustat, then we'll use the LD to find the same information for the closest company in IG.



There's some legacy code in here that finds the top 5 closest companies; but it doesn't populate the dataframe.

In [79]:
singleLargestLV   = (allLD).argsort(axis=-1)[:, :1]

# legacy
# n = 5
# largestElementsLV = (allLD).argsort(axis=-1)[:, :n]


companyMatches = pd.DataFrame()
companyMatches['cstatCompanies'] = chqUnmatched.company
companyMatches['cstatadd1']      = chqUnmatched.cstatadd1
companyMatches['cstatCity']      = chqUnmatched.cstatCity
companyMatches['cstatState']     = chqUnmatched.cstatState
companyMatches['cstatZip']       = chqUnmatched.cstatZipcode

for i in range(0,companyMatches.shape[0]):
    # print(list(np.array(hq.company)[largestElements[i]]))
    companyMatches.at[i,'misspelling']         = allLD[i,:][singleLargestLV[i]]
    companyMatches.at[i,'percMisspelled']      = companyMatches.misspelling[i]/len(companyMatches.cstatCompanies[i])
    companyMatches.at[i,'levCompany']          = igUnmatched.company[singleLargestLV[i]].iloc[0]
    
    companyMatches.at[i,'closestMatchIG_add']      = igUnmatched.address_line_1[singleLargestLV[i]].iloc[0]
    companyMatches.at[i,'closestMatchIG_city']     = igUnmatched.city[singleLargestLV[i]].iloc[0]
    companyMatches.at[i,'closestMatchIG_zipcode']  = igUnmatched.zipcode[singleLargestLV[i]].iloc[0]
    companyMatches.at[i,'closestMatchIG_state']    = igUnmatched.state[singleLargestLV[i]].iloc[0]

    
    # companyMatches.at[i,'closestMatchIG']      = np.array(igUnmatched.company)[largestElementsLV[i]]
    # companyMatches.at[i,'LevSim']              = np.array(allLD[i,:][largestElementsLV[i]], dtype=object)

In [80]:
companyMatches.head()

Unnamed: 0,cstatCompanies,cstatadd1,cstatCity,cstatState,cstatZip,misspelling,percMisspelled,levCompany,closestMatchIG_add,closestMatchIG_city,closestMatchIG_zipcode,closestMatchIG_state
0,beam,222 west merchandise mart plaza,chicago,il,60654,1.0,0.25,team,13131 dairy ashford rd # 600,sugar land,77478,tx
1,apa,"2000 post oak boulevard, suite 100",houston,tx,77056,1.0,0.333333,lpa,5301 california ave # 100,irvine,92617,ca
2,apco oil and gas,"one williams center, 35th floor",tulsa,ok,74172,3.0,0.1875,apco oil gas,1 one williams ctr # 35,tulsa,74172,ok
3,southern gas,"ten peachtree place, n.e.",atlanta,ga,30309,2.0,0.166667,southern its,9101 w sahara ave # 105,las vegas,89117,nv
4,constellation energy grp,100 constellation way,baltimore,md,21202,3.0,0.125,constellation energy,100 constellation way,baltimore,21202,md


Now get the embeddings and the cosine similarity between them.

In [81]:
def getMatrix(companyEmbeddings):
    companyArray = []
    
    for companies in companyEmbeddings:
        companyArray.append([companies.vector])
    
    companyArray = np.concatenate(companyArray)
    
    return(companyArray)
        

In [82]:
chqUnmatchedList = list(map(nlp, chqUnmatched.company))
allCompaniesIG   = list(map(nlp, igUnmatched.company))


cstat = getMatrix(chqUnmatchedList)
ig    = getMatrix(allCompaniesIG)

allSimilarities = cosine_similarity(cstat,ig)

allSimilarities.shape

(523, 58209)

In [83]:
outfile =  '../../data/allCompaniesIG_embeddings.pkl'
with open(outfile, 'wb') as pickle_file:
    pkl.dump(ig, pickle_file)
    
outfile =  '../../data/allCompaniesCStat_embeddings.pkl'
with open(outfile, 'wb') as pickle_file:
    pkl.dump(cstat, pickle_file)    

Each row n here has the similarity between the nth company name in compustat and the IG company corresp to that column.

In [84]:
allSimilarities[0:5,:]

array([[-0.02306337,  0.06756251,  0.13372983, ...,  0.29612526,
         0.07285859,  0.17976503],
       [ 0.12749049,  0.01153258,  0.11958653, ...,  0.08740962,
         0.21394141,  0.13486943],
       [-0.09821712,  0.2702324 ,  0.09386543, ...,  0.37443492,
         0.15091573,  0.33140045],
       [-0.11109462,  0.30866355,  0.15008008, ...,  0.4353319 ,
         0.19054101,  0.48522738],
       [ 0.01174485,  0.2304165 ,  0.21849704, ...,  0.2877023 ,
         0.19672713,  0.29067034]], dtype=float32)

Find indices of companies in IG most similar to each company in CStat.

In [85]:
singleLargestCos   = (-allSimilarities).argsort(axis=-1)[:, :1]

# legacy - largest n
# largestElementsCos = (-allSimilarities).argsort(axis=-1)[:, :n]


Add the cosine similarity measures to the similarity dataset.

In [86]:
for i in range(0,companyMatches.shape[0]):
    # print(list(np.array(hq.company)[largestElements[i]]))
    companyMatches.at[i,'cosSimilarity']        = allSimilarities[i,:][singleLargestCos[i]]
    companyMatches.at[i,'cosSimilarityCompany'] = igUnmatched.company[singleLargestCos[i]].iloc[0]
    # companyMatches.at[i,'closestMatchCosine']   = np.array(igUnmatched.company)[largestElementsCos[i]]
    # companyMatches.at[i,'cosineSim']            = np.array(allSimilarities[i,:][largestElementsCos[i]], dtype=object)
    
    
    companyMatches.at[i,'costMatchIG_add']     = igUnmatched.address_line_1[singleLargestCos[i]].iloc[0]
    companyMatches.at[i,'cosMatchIG_city']     = igUnmatched.city[singleLargestCos[i]].iloc[0]
    companyMatches.at[i,'cosMatchIG_zipcode']  = igUnmatched.zipcode[singleLargestCos[i]].iloc[0]
    companyMatches.at[i,'cosMatchIG_state']    = igUnmatched.state[singleLargestCos[i]].iloc[0]
    

In [87]:
sum((companyMatches.levCompany == companyMatches.cosSimilarityCompany))

118

## Start Matching

### Take 1: Match + Zip or City

Now find the company matches: ABI - gvkey link.

Start with ones where the names both match.

If the cities or zipcodes match on one of the closest companies (LD or cos), it seems like it is good to go.


Do this in steps to start, at least. First find the companies where both match and either zip or city match. Then find companies where only one matches.

In [101]:
bothMatch_cityZip = companyMatches[(companyMatches.levCompany == companyMatches.cosSimilarityCompany) & \
              ((companyMatches.cstatCity == companyMatches.closestMatchIG_city) | \
              (companyMatches.cstatZip == companyMatches.closestMatchIG_zipcode))]
bothMatch_cityZip.shape

bothMatch_cityZip.to_csv("../../data/companyData/bothMatch_cityZip.csv")

In [102]:
bothMatch_cityZip['igCompanies'] = bothMatch_cityZip.levCompany
companiesToCheck                 = bothMatch_cityZip[['cstatCompanies','igCompanies',
                                    'cstatadd1','cstatCity','cstatZip',
                                    'closestMatchIG_add','closestMatchIG_city','closestMatchIG_zipcode']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [103]:
companyMatchesBoth = list(bothMatch_cityZip.cstatCompanies.unique())
len(companyMatchesBoth)

95

Grab the single company match versions.

In [104]:
oneMatch_cityZipOnly = companyMatches[-(companyMatches.cstatCompanies.isin(companyMatchesBoth)) & \
              (((companyMatches.cstatCity == companyMatches.closestMatchIG_city) | \
              (companyMatches.cstatZip == companyMatches.closestMatchIG_zipcode)) | \
              ((companyMatches.cstatCity == companyMatches.cosMatchIG_city) | \
              (companyMatches.cstatZip == companyMatches.cosMatchIG_zipcode)))].reset_index(drop=True)

oneMatch_cityZipOnly['igCompanies'] = ''

for i in range(0,oneMatch_cityZipOnly.shape[0]):
    if ((oneMatch_cityZipOnly.cstatCity[i] == oneMatch_cityZipOnly.closestMatchIG_city[i]) | \
              (oneMatch_cityZipOnly.cstatZip[i] == oneMatch_cityZipOnly.closestMatchIG_zipcode[i])):
        oneMatch_cityZipOnly.loc[i,'igCompanies'] = oneMatch_cityZipOnly.levCompany[i]
    else:
        oneMatch_cityZipOnly.loc[i,'igCompanies'] = oneMatch_cityZipOnly.cosSimilarityCompany[i]

# oneMatch_cityZipOnly.to_csv("../../data/companyData/oneMatch_cityZipOnly.csv")

In [105]:
companiesToCheck.shape

(95, 8)

In [106]:
oneMatch_cityZipOnly.head()

Unnamed: 0,cstatCompanies,cstatadd1,cstatCity,cstatState,cstatZip,misspelling,percMisspelled,levCompany,closestMatchIG_add,closestMatchIG_city,closestMatchIG_zipcode,closestMatchIG_state,cosSimilarity,cosSimilarityCompany,costMatchIG_add,cosMatchIG_city,cosMatchIG_zipcode,cosMatchIG_state,igCompanies
0,beam,222 west merchandise mart plaza,chicago,il,60654,1.0,0.25,team,13131 dairy ashford rd # 600,sugar land,77478,tx,1.0,beam suntory,222 merchandise mart plz #1600,chicago,60654,il,beam suntory
1,southern gas,"ten peachtree place, n.e.",atlanta,ga,30309,2.0,0.166667,southern its,9101 w sahara ave # 105,las vegas,89117,nv,0.92152,gas south,3625 cumberland blvd se # 1500,atlanta,30339,ga,gas south
2,barry (r g),13405 yarmouth road n.w.,pickerington,oh,43147,6.0,0.545455,baby togs,100 w 33rd st # 1012,new york,10001,ny,0.835597,r g barry,13405 yarmouth dr,pickerington,43147,oh,r g barry
3,bausch lomb s,400 somerset corporate blvd.,bridgewater,nj,8807,9.0,0.529412,bush o donnell,353 marshall ave # m,st louis,63119,mo,0.752105,bausch health,400 somerset corporate blvd,bridgewater,8807,nj,bausch health
4,officemax,263 shuman boulevard,naperville,il,60563,1.0,0.111111,office max,263 shuman blvd,naperville,60563,il,0.426793,steelcase,901 44th st se,grand rapids,49508,mi,office max


In [107]:
companiesToCheck = companiesToCheck.append(oneMatch_cityZipOnly[['cstatCompanies','igCompanies',
                                    'cstatadd1','cstatCity','cstatZip',
                                    'closestMatchIG_add','closestMatchIG_city','closestMatchIG_zipcode']]).\
                                    drop_duplicates()

companiesToCheck.shape

(252, 8)

In [108]:
companiesToCheck[0:50]

Unnamed: 0,cstatCompanies,igCompanies,cstatadd1,cstatCity,cstatZip,closestMatchIG_add,closestMatchIG_city,closestMatchIG_zipcode
2,apco oil and gas,apco oil gas,"one williams center, 35th floor",tulsa,74172,1 one williams ctr # 35,tulsa,74172
4,constellation energy grp,constellation energy,100 constellation way,baltimore,21202,100 constellation way,baltimore,21202
25,electro scientific inds,electro scientific industries,13900 nw science park drive,portland,97229,13900 nw science park dr,portland,97229
28,federal mogul,federal mogul,27300 west 11 mile road,southfield,48034,26555 northwestern hwy,southfield,48033
29,forest laboratories,forest laboratories,909 third avenue,new york,10022,909 3rd ave # 23,new york,10022
45,kimball international,kimball international,1600 royal street,jasper,47546,1600 royal st,jasper,47546
46,snyders lance,snyder s lance,13515 ballantyne corporate place,charlotte,28277,13515 ballantyne corporate pl,charlotte,28277
49,lowe ss,lowe s,1000 lowe's boulevard,mooresville,28117,1000 lowes blvd,mooresville,28117
58,national presto inds,national presto industries,3925 north hastings way,eau claire,54703,3925 n hastings way,eau claire,54703
59,nike,nike,one bowerman drive,beaverton,97005,1 sw bowerman dr,beaverton,97005


2x check that there are no duplicates here.

In [109]:
duplicates = companiesToCheck.cstatCompanies.value_counts().index[companiesToCheck.cstatCompanies.value_counts() > 1]

companiesToCheck[companiesToCheck.cstatCompanies.isin(duplicates)]

Unnamed: 0,cstatCompanies,igCompanies,cstatadd1,cstatCity,cstatZip,closestMatchIG_add,closestMatchIG_city,closestMatchIG_zipcode


In [110]:
chqStillUnmatched = chqUnmatched[-chqUnmatched.company.isin(companiesToCheck.cstatCompanies)].reset_index(drop=True)
igStillUnmatched  = igUnmatched[-igUnmatched.company.isin(companiesToCheck.igCompanies)].reset_index(drop=True)

# companyMatches['cstatCompanies'] = chqUnmatched.company
print(chqUnmatched.shape, chqStillUnmatched.shape, companiesToCheck.shape)
print(igStillUnmatched.shape,igUnmatched.shape)

(523, 8) (271, 8) (252, 8)
(57958, 7) (58209, 7)


In [111]:
companiesToCheck.to_csv("../../data/companyData/companiesToCheck_cityZip.csv")

## Take 2
Match remaining ones on first word of name

In [112]:
chqUnmatched.company[0].split(' ')[0]

'beam'

Get the edit distance for the first words of the company names.

In [113]:
companyArrayCStat = []

start = time.time()
for company in chqStillUnmatched.company:
    thisCompany = []
    for ig in igStillUnmatched.company:
        thisCompany.append(levenshtein_distance(company.split(' ')[0],ig.split(' ')[0]))
    
    companyArrayCStat.append([thisCompany])

allLD = np.concatenate(companyArrayCStat)


And the cosine distance.

In [114]:
chqStillUnmatchedFirstCo = []
igStillUnmatchedFirstCo = []

for company in chqStillUnmatched.company:
    chqStillUnmatchedFirstCo.append(company.split(' ')[0])
    
for company in igStillUnmatched.company:
    igStillUnmatchedFirstCo.append(company.split(' ')[0])

In [115]:
chqUnmatchedList = list(map(nlp, chqStillUnmatchedFirstCo))
allCompaniesIG = list(map(nlp, igStillUnmatchedFirstCo))


cstat = getMatrix(chqUnmatchedList)
ig    = getMatrix(allCompaniesIG)

allSimilarities = cosine_similarity(cstat,ig)

allSimilarities.shape

# largestElementsCos = (-allSimilarities).argsort(axis=-1)[:, :n]
singleLargestCos   = (-allSimilarities).argsort(axis=-1)[:, :1]

In [116]:
igStillUnmatched.head()

Unnamed: 0,index,abi,company,state,city,zipcode,address_line_1
0,17,158329,caridian bct,co,lakewood,80401,14143 denver west pkwy # 200
1,20,211946,family brands,tn,lenoir city,37771,1001 elm hill rd
2,23,258574,jade,pa,huntingdon vly,19006,3063 philmont ave
3,24,262493,montana metal products,il,des plaines,60018,25 howard ave
4,43,455154,o i,tx,college station,77845,151 graham rd


In [171]:
singleLargestLV   = (allLD).argsort(axis=-1)[:, :1]
# largestElementsLV = (allLD).argsort(axis=-1)[:, :n]


companyMatches2 = pd.DataFrame()
companyMatches2['cstatCompanies'] = chqStillUnmatched.company
companyMatches2['cstatadd1']      = chqStillUnmatched.cstatadd1
companyMatches2['cstatCity']      = chqStillUnmatched.cstatCity
companyMatches2['cstatState']     = chqStillUnmatched.cstatState
companyMatches2['cstatZip']       = chqStillUnmatched.cstatZipcode


for i in range(0,companyMatches2.shape[0]):
    # print(list(np.array(hq.company)[largestElements[i]]))
    companyMatches2.at[i,'misspelling']         = allLD[i,:][singleLargestLV[i]]
    companyMatches2.at[i,'levCompany']          = igStillUnmatched.company[singleLargestLV[i]].iloc[0]
    
    companyMatches2.at[i,'closestMatchIG_add']      = igStillUnmatched.address_line_1[singleLargestLV[i]].iloc[0]
    companyMatches2.at[i,'closestMatchIG_city']     = igStillUnmatched.city[singleLargestLV[i]].iloc[0]
    companyMatches2.at[i,'closestMatchIG_zipcode']  = igStillUnmatched.zipcode[singleLargestLV[i]].iloc[0]
    companyMatches2.at[i,'closestMatchIG_state']    = igStillUnmatched.state[singleLargestLV[i]].iloc[0]

    
    companyMatches2.at[i,'cosSimilarity']        = allSimilarities[i,:][singleLargestCos[i]]
    companyMatches2.at[i,'cosSimilarityCompany'] = igStillUnmatched.company[singleLargestCos[i]].iloc[0]
   
    companyMatches2.at[i,'costMatchIG_add']     = igStillUnmatched.address_line_1[singleLargestCos[i]].iloc[0]
    companyMatches2.at[i,'cosMatchIG_city']     = igStillUnmatched.city[singleLargestCos[i]].iloc[0]
    companyMatches2.at[i,'cosMatchIG_zipcode']  = igStillUnmatched.zipcode[singleLargestCos[i]].iloc[0]
    companyMatches2.at[i,'cosMatchIG_state']    = igStillUnmatched.state[singleLargestCos[i]].iloc[0]

    

In [172]:
companyMatches2.head()

Unnamed: 0,cstatCompanies,cstatadd1,cstatCity,cstatState,cstatZip,misspelling,levCompany,closestMatchIG_add,closestMatchIG_city,closestMatchIG_zipcode,closestMatchIG_state,cosSimilarity,cosSimilarityCompany,costMatchIG_add,cosMatchIG_city,cosMatchIG_zipcode,cosMatchIG_state
0,apa,"2000 post oak boulevard, suite 100",houston,tx,77056,11.0,apache,2000 post oak blvd # 100,houston,77056,tx,0.821982,altus midstream,2000 post oak blvd # 100,houston,77056,tx
1,bard (c.r.),730 central avenue,murray hill,nj,7974,3.0,c r bard,730 central ave,new providence,7974,nj,0.906581,c r bard,730 central ave,new providence,7974,nj
2,earthstone energy,"633 17th street, suite 2320",denver,co,80202,12.0,environmental biotech usa,4693 19th street ct e,bradenton,34203,fl,0.733674,kennedy childs pc,633 17th st # 2200,denver,80202,co
3,belden blake,"1001 fannin street, suite 800",houston,tx,77002,10.0,enervest,1001 fannin st # 800,houston,77002,tx,0.726687,enervest,1001 fannin st # 800,houston,77002,tx
4,bemis,2301 industrial drive,neenah,wi,54956,3.0,amcor,2301 industrial dr,neenah,54956,wi,0.749667,amcor,2301 industrial dr,neenah,54956,wi


In [173]:
companyMatches2.shape

(271, 17)

Find if city or zip match here.

In [174]:
match2_cityZips = companyMatches2[(((companyMatches2.cstatCity == companyMatches2.closestMatchIG_city) | \
              (companyMatches2.cstatZip == companyMatches2.closestMatchIG_zipcode)) | \
              ((companyMatches2.cstatCity == companyMatches2.cosMatchIG_city) | \
              (companyMatches2.cstatZip == companyMatches2.cosMatchIG_zipcode)))].reset_index(drop=True)


In [175]:
match2_cityZips

Unnamed: 0,cstatCompanies,cstatadd1,cstatCity,cstatState,cstatZip,misspelling,levCompany,closestMatchIG_add,closestMatchIG_city,closestMatchIG_zipcode,closestMatchIG_state,cosSimilarity,cosSimilarityCompany,costMatchIG_add,cosMatchIG_city,cosMatchIG_zipcode,cosMatchIG_state
0,apa,"2000 post oak boulevard, suite 100",houston,tx,77056,11.0,apache,2000 post oak blvd # 100,houston,77056,tx,0.821982,altus midstream,2000 post oak blvd # 100,houston,77056,tx
1,earthstone energy,"633 17th street, suite 2320",denver,co,80202,12.0,environmental biotech usa,4693 19th street ct e,bradenton,34203,fl,0.733674,kennedy childs pc,633 17th st # 2200,denver,80202,co
2,belden blake,"1001 fannin street, suite 800",houston,tx,77002,10.0,enervest,1001 fannin st # 800,houston,77002,tx,0.726687,enervest,1001 fannin st # 800,houston,77002,tx
3,bemis,2301 industrial drive,neenah,wi,54956,3.0,amcor,2301 industrial dr,neenah,54956,wi,0.749667,amcor,2301 industrial dr,neenah,54956,wi
4,entrprize,"800 nicollet mall, suite 2690",minneapolis,mn,55402,6.0,entrx,800 nicollet mall # 2690,minneapolis,55402,mn,0.827987,entrx,800 nicollet mall # 2690,minneapolis,55402,mn
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160,petrologistics,4111 east 37th st. north,wichita,ks,67220,8.0,koch industries,4111 e 37th st n,wichita,67220,ks,0.866980,portageunty dist library,10482 south st,garrettsville,44231,oh
161,inergy midstream,"700 louisiana street, suite 2060",houston,tx,77002,11.0,hicks thomas llp,700 louisiana st # 2000,houston,77002,tx,0.722648,tc energy,700 louisiana st # 700,houston,77002,tx
162,cempra,"6320 quadrangle drive, suite 360",chapel hill,nc,27517,10.0,ifg,6320 quadrangle dr # 300,chapel hill,27517,nc,0.614257,ifg,6320 quadrangle dr # 300,chapel hill,27517,nc
163,holly energy partners,"2828 north harwood, suite 1300",dallas,tx,75201,10.0,aspen advisors,2828 n harwood st # 1700,dallas,75201,tx,0.709495,aspen advisors,2828 n harwood st # 1700,dallas,75201,tx


In [176]:
match2_cityZips['igCompanies'] = ''

for i in range(0,match2_cityZips.shape[0]):
    if ((match2_cityZips.cstatCity[i] == match2_cityZips.closestMatchIG_city[i]) | \
              (match2_cityZips.cstatZip[i] == match2_cityZips.closestMatchIG_zipcode[i])):
        match2_cityZips.loc[i,'igCompanies'] = match2_cityZips.levCompany[i]
    else:
        match2_cityZips.loc[i,'igCompanies'] = match2_cityZips.cosSimilarityCompany[i]


In [177]:
match2_cityZips[['cstatCompanies','igCompanies','closestMatchIG_add','costMatchIG_add']].shape

(165, 4)

In [179]:
match2_cityZips[['cstatCompanies','igCompanies','closestMatchIG_add','costMatchIG_add']].to_csv("../../data/companyData/match2_cityZips.csv")

## Take 3
Try the addresses here.

Let's try something similar:
- Find top 10 most similar addresses by cos sim
- Find top 10 most similar addresses by LD
- Find unique union of these two
- Record LD and cos sim for each
- Filter for totally dissimilar ones
- ``Explode'' the dataset so we have cstat company, address
- Find first word LD and cos sim
- Find total LD and cos sim

In [126]:
n = 10

In [127]:
chqStillUnmatched['cstatadd1']     = chqStillUnmatched.cstatadd1.astype(str)
igStillUnmatched['address_line_1'] = igStillUnmatched.address_line_1.astype(str)

In [128]:
addressArrayCStat = []

start = time.time()
for address in chqStillUnmatched.cstatadd1:
    thisAddress = []
    for ig in igStillUnmatched.address_line_1:
        thisAddress.append(levenshtein_distance(str(address),str(ig)))
    
    addressArrayCStat.append([thisAddress])

allLD = np.concatenate(addressArrayCStat)

singleLargestLV   = (allLD).argsort(axis=-1)[:, :1]
largestElementsLV = (allLD).argsort(axis=-1)[:, :n]


In [129]:
chqUnmatchedList = list(map(nlp, chqStillUnmatched['cstatadd1']))
allCompaniesIG   = list(map(nlp, igStillUnmatched['address_line_1']))


cstat = getMatrix(chqUnmatchedList)
ig    = getMatrix(allCompaniesIG)

allSimilarities = cosine_similarity(cstat,ig)

allSimilarities.shape

singleLargestCos   = (-allSimilarities).argsort(axis=-1)[:, :1]
largestElementsCos = (-allSimilarities).argsort(axis=-1)[:, :n]

In [141]:
companyMatches3 = pd.DataFrame()
companyMatches3['cstatCompanies'] = chqStillUnmatched.company

Let's grab the set of the 10 closest addresses by LD and cos.

In [146]:
companyMatches3.head()

Unnamed: 0,cstatCompanies,closestAdds_indices
0,apa,"{42753, 36739, 8397, 36946, 24724, 44758, 5058..."
1,bard (c.r.),"{3904, 28225, 4232, 19467, 54670, 56593, 11281..."
2,earthstone energy,"{32192, 21633, 44611, 16836, 9740, 15245, 1665..."
3,belden blake,"{48384, 45892, 46918, 15500, 45390, 47644, 512..."
4,bemis,"{33728, 44613, 55946, 11148, 21849, 2460, 1698..."


Find the unique values in here.

In [143]:
set(largestElementsLV[i]).union(set(largestElementsCos[i]))  

{4018,
 8397,
 8997,
 10098,
 10605,
 10666,
 13469,
 21482,
 24724,
 25588,
 36701,
 36739,
 36946,
 40419,
 42753,
 44758,
 48802,
 50586}

In [145]:
for i in range(0,companyMatches3.shape[0]):
    
    # find all the closest LV and cos addresses and put them in a 
    companyMatches3.at[i,'closestAdds_indices']  = set(largestElementsLV[i]).union(set(largestElementsCos[i]))    

And explode it so one index per line.

In [147]:
companyMatches3_indices = companyMatches3.explode('closestAdds_indices').reset_index(drop=True)

Now get the companies, the cosine similarities, and the levenshtein distances.

In [148]:
companyMatches3_indices.closestAdds_indices

0       42753
1       36739
2        8397
3       36946
4       24724
        ...  
4951    53613
4952    16242
4953    16947
4954    26804
4955     5496
Name: closestAdds_indices, Length: 4956, dtype: object

Get the embeddings for the cosine similarity.

In [149]:
for i in range(0,companyMatches3_indices.shape[0]):
    companyMatches3_indices.at[i,'igCompanies']  = igStillUnmatched.company[companyMatches3_indices.closestAdds_indices[i]] # .iloc[0]

    companyMatches3_indices.at[i,'lv']           = levenshtein_distance(companyMatches3_indices.cstatCompanies[i],companyMatches3_indices.igCompanies[i])
    companyMatches3_indices.at[i,'percMisspelled']       = companyMatches3_indices.lv[i]/len(companyMatches3_indices.cstatCompanies[i])

    # companyMatches3.at[i,'add_percMisspelled'] = companyMatches2.add_misspelling[i]/len(companyMatches2['cstatadd1'][i])


Get the company embeddings here.

In [150]:
cMatches  = companyMatches3_indices[['cstatCompanies']].drop_duplicates()
igMatches = companyMatches3_indices[['igCompanies']].drop_duplicates()



In [151]:
def getVector(text):
    embedding = nlp(text)
    
    return(embedding.vector)

In [152]:
cStatEmbeddings = list(map(getVector, cMatches.cstatCompanies))
igEmbeddings    = list(map(getVector, igMatches.igCompanies))

In [153]:
igMatches['igEmbedding']       = igEmbeddings
cMatches['cstatEmbedding']     = cStatEmbeddings

In [154]:
companyMatches3_indices = companyMatches3_indices.merge(igMatches).merge(cMatches)

In [155]:
companyMatches3_indices.shape

(4956, 7)

Loop through and get the cosine similarity.

In [156]:
for i in range(0,companyMatches3_indices.shape[0]):
    companyMatches3_indices.at[i,'cosSim']  = cosine_similarity([companyMatches3_indices.igEmbedding[i]],
                                                                [companyMatches3_indices.cstatEmbedding[i]])


In [165]:
filtered = companyMatches3_indices[((companyMatches3_indices.percMisspelled < 0.4) | \
                        (companyMatches3_indices.cosSim > 0.6))]

In [166]:
filtered[['cstatCompanies','igCompanies','percMisspelled','cosSim']].to_csv("../../data/companyData/companyMatches3_indices.csv")

In [167]:
filtered.head()

Unnamed: 0,cstatCompanies,closestAdds_indices,igCompanies,lv,percMisspelled,igEmbedding,cstatEmbedding,cosSim
24,us well services,33746,us well svc,5.0,0.3125,"[-0.2185, 0.34304667, 0.10306334, -0.20798667,...","[-0.157454, 0.20863998, -0.15231887, -0.126730...",0.773974
128,earthstone energy,9049,nacel energy,10.0,0.555556,"[-0.26681, 0.391915, 0.329485, 0.092695, -0.18...","[-0.26681, 0.391915, 0.329485, 0.092695, -0.18...",1.0
130,earthstone energy,28709,bonanza creek energy,12.0,0.666667,"[-0.29143432, 0.39298332, 0.31564334, -0.22690...","[-0.26681, 0.391915, 0.329485, 0.092695, -0.18...",0.668672
162,triangle petroleum,1207,american oil gas,16.0,0.888889,"[-0.253985, -0.009545499, 0.3099225, 0.06152, ...","[-0.06825501, -0.25275052, 0.225445, 0.1408265...",0.621715
269,checkpoint therapeutics,29193,protara therapeutics,10.0,0.434783,"[0.261135, 0.08454, 0.055645, -0.306765, -0.25...","[0.11764501, 0.108727, -0.015774999, -0.093939...",0.75626


# Combine all these things

In [194]:
dset1 = pd.read_csv("../../data/companyData/companiesToCheck_cityZip.csv")[['cstatCompanies','igCompanies','delete']]

In [195]:
dset2 = pd.read_csv("../../data/companyData/match2_cityZips.csv")[['cstatCompanies','igCompanies','delete']]

In [196]:
dset3 = pd.read_csv("../../data/companyData/companyMatches3_indices.csv")[['cstatCompanies','igCompanies','delete']]

In [184]:
dset1[['cstatCompanies','igCompanies','delete']].head()

Unnamed: 0,cstatCompanies,igCompanies,delete
0,apco oil and gas,apco oil gas,
1,constellation energy grp,constellation energy,
2,electro scientific inds,electro scientific industries,
3,federal mogul,federal mogul,
4,forest laboratories,forest laboratories,


In [187]:
dset2[['cstatCompanies','igCompanies','delete']].head()

Unnamed: 0,cstatCompanies,igCompanies,delete
0,apa,apache,1.0
1,earthstone energy,kennedy childs pc,1.0
2,belden blake,enervest,1.0
3,bemis,amcor,1.0
4,entrprize,entrx,


In [188]:
dset3[['cstatCompanies','igCompanies','delete']].head()

Unnamed: 0,cstatCompanies,igCompanies,delete
0,us well services,us well svc,
1,earthstone energy,nacel energy,1.0
2,earthstone energy,bonanza creek energy,1.0
3,triangle petroleum,american oil gas,1.0
4,checkpoint therapeutics,protara therapeutics,1.0


In [230]:
cleanMerge = pd.read_csv("../../data/companyData/nameMerge.csv")[['company']].\
    rename(columns = {'company': 'cstatCompanies'})
cleanMerge['igCompanies'] = cleanMerge['cstatCompanies']

cleanMerge['delete'] = ''

print(cleanMerge.head())


           cstatCompanies             igCompanies delete
0  adc telecommunications  adc telecommunications       
1      american airlines       american airlines        
2   pinnacle west capital   pinnacle west capital       
3     abbott laboratories     abbott laboratories       
4                   aceto                   aceto       


In [231]:
all = pd.concat([dset1,dset2,dset3,cleanMerge])
allFiltered = all[~(all.delete == 1.0)].drop_duplicates()

In [232]:
cleanMerge.head()

Unnamed: 0,cstatCompanies,igCompanies,delete
0,adc telecommunications,adc telecommunications,
1,american airlines,american airlines,
2,pinnacle west capital,pinnacle west capital,
3,abbott laboratories,abbott laboratories,
4,aceto,aceto,


In [238]:
allFiltered.shape

(1846, 3)

In [236]:
chq     = pd.read_csv("../../data/chq.csv",dtype={'cstatZipcode': 'object'}).drop(columns = {'Unnamed: 0'})

chq['cstatCompanies'] = list(map(cleanText, chq.company))
chqToMatch = chq[['cstatCompanies','gvkey']]

In [244]:
ig = pd.read_csv("../../data/companyData/lastHQs.csv")
ig['igCompanies'] = list(map(cleanText, ig.company))

igToMatch = ig[['igCompanies','abi']]

In [245]:
igToMatch.head()

Unnamed: 0,igCompanies,abi
0,caridian bct,158329
1,family brands,211946
2,jade,258574
3,montana metal products,262493
4,o i,455154


Put these all together.

In [271]:
gvKey_abiLinkingTable = allFiltered.merge(chqToMatch).merge(igToMatch).drop_duplicates()

In [251]:
gvKey_abiLinkingTable.to_csv('../../data/companyData/linkingTable.csv')

# Create the SC Linking Table for 2010s

In [275]:
c_links = pd.read_csv("../../data/companyData/compustatSCLinked.csv")

c_links['year'] = c_links.srcdate.astype('str').str.slice(0,4).astype('int64')

c_links = c_links[c_links.year > 2009][['year','gvkey','cgvkey','salecs']].\
    rename(columns = {'cgvkey': 'customer_gvkey','gvkey': 'supplier_gvkey'})




c_links.head()

Unnamed: 0,year,supplier_gvkey,customer_gvkey,salecs
80,2010,1013,9899,300.0
81,2010,1013,2136,146.0
281,2016,1094,31673,78.193
282,2017,1094,31673,76.598
283,2017,1094,7171,70.215


In [281]:
gvKey_abiLinkingTable = pd.read_csv('../../data/companyData/linkingTable.csv')


base_columns = gvKey_abiLinkingTable.columns 
customer_columns = "customer_" + base_columns
supplier_columns = "supplier_" + base_columns



#########################
# merge in customer information
gvKey_abiLinkingTable.columns = customer_columns

print(c_links.shape)
c_linksMerge1 = c_links.merge(gvKey_abiLinkingTable, on ='customer_gvkey')
print(c_links.shape,c_linksMerge1.shape)



#########################
# and merge in supplier 
gvKey_abiLinkingTable.columns = supplier_columns

print(c_links.shape)
c_linksMerge2 = c_linksMerge1.merge(gvKey_abiLinkingTable, on ='supplier_gvkey')
print(c_links.shape,c_linksMerge2.shape)

(34473, 4)
(34473, 4) (19410, 9)
(34473, 4)
(34473, 4) (10943, 14)


In [264]:
gvKey_abiLinkingTable

Unnamed: 0,customer_cstatCompanies,customer_igCompanies,customer_delete,customer_gvkey,customer_abi
0,apco oil and gas,apco oil gas,,1682,544813678
1,constellation energy grp,constellation energy,,1995,506384064
2,electro scientific inds,electro scientific industries,,4274,9546995
3,federal mogul,federal mogul,,4600,7513781
4,forest laboratories,forest laboratories,,4843,7514862
...,...,...,...,...,...
1872,cdti advanced materials,cdti advanced materials,,282553,967328568
1873,futurefuel,futurefuel,,287462,679546432
1874,lyondellbasell industries nv,lyondellbasell industries nv,,294524,200051589
1875,doriang,doriang,,317264,435494175


This is probably because: (1) companies are not in North America, or (2) companies are not in the physical goods industries we're interested in. We can verify this though: look at c_links where both the customer and supplier are in the dataset of interest.

In [286]:
c_linkTest = c_links[c_links.customer_gvkey.isin(chq.gvkey.unique()) & \
                     c_links.supplier_gvkey.isin(chq.gvkey.unique())]
c_linkTest.shape

(12276, 4)

It's entirely possible that we have too small of a sample from the 2010s alone. Let's just try it though and see how it goes.