In [2]:
import pickle as pkl
import pandas as pd
import matplotlib
import os
import re

import collections
import datetime
import time

import geopandas as gpd

import numpy as np

import rasterio

import spacy
  
nlp = spacy.load('en_core_web_lg')
from sklearn.metrics.pairwise import cosine_similarity

import itertools

# Get Full IG Dataset

In [None]:
file = "../../data/companyData/infogroup2010s.csv"

In [None]:
import dask.dataframe as dd

df = dd.read_csv(file, assume_missing=True, 
                 dtype={'parent_number': 'object','parent_employee_size_code': 'object',
                       'parent_sales_volume_code': 'object',
                       'abi': 'object','zipcode': 'object'}, low_memory = False)
df = df[df.business_status_code == 1.0]


In [248]:
df.columns

Index(['archive_version_year', 'abi', 'ticker', 'parent_number', 'company',
       'address_line_1', 'city', 'state', 'zipcode',
       'location_employee_size_code', 'location_sales_volume_code',
       'primary_naics_code', 'sic_code', 'business_status_code',
       'parent_employee_size_code', 'parent_sales_volume_code', 'cbsa_code',
       'latitude', 'longitude'],
      dtype='object')

In [249]:
df.head()

Unnamed: 0,archive_version_year,abi,ticker,parent_number,company,address_line_1,city,state,zipcode,location_employee_size_code,location_sales_volume_code,primary_naics_code,sic_code,business_status_code,parent_employee_size_code,parent_sales_volume_code,cbsa_code,latitude,longitude
8,2010.0,7609,SODI,7609,SOLITRON DEVICES INC,3301 ELECTRONICS WAY # C,WEST PALM BEACH,FL,33407,E,,33441302.0,362998.0,1.0,E,,33100.0,26.7412,-80.06694
41,2010.0,21311,,21311,WESTERN STATES ENVELOPE & LBL,4480 N 132ND ST,BUTLER,WI,53007,H,,32222006.0,511216.0,1.0,H,,33340.0,43.09799,-88.07399
58,2010.0,29603,,29603,THIELE KAOLIN CO,520 KAOLIN RD,SANDERSVILLE,GA,31082,G,,32799204.0,145598.0,1.0,G,,0.0,32.96893,-82.81953
207,2010.0,71340,,71340,TRI STAFF GROUP,6336 GREENWICH DR # 100,SAN DIEGO,CA,92122,E,,56132001.0,736103.0,1.0,E,,41740.0,32.85445,-117.18594
216,2010.0,77743,,77743,NATIONAL TECHNICAL SYSTEMS INC,24007 VENTURA BLVD # 200,CALABASAS,CA,91302,C,,54138023.0,382998.0,1.0,C,,31080.0,34.15562,-118.65163


In [250]:
hq = df[['abi','ticker','company','archive_version_year','state','city',
         'address_line_1','zipcode',
         'latitude','longitude']].drop_duplicates().compute(num_workers = 100)

In [251]:
hqsOnly = hq[['abi','company']].drop_duplicates()

In [252]:
hqsOnly.company.value_counts()

GOVERNOR'S OFFICE                56
ATTORNEY GENERAL                 49
SECRETARY OF STATE               43
CHIEF OF STAFF                   43
LIEUTENANT GOVERNOR              39
                                 ..
CANAAN PARTNERS                   1
PRIME BANK                        1
NEW HAVEN FREE PUBLIC LIBRARY     1
MILFORD BANK                      1
USA COMPRESSION PARTNERS LP       1
Name: company, Length: 79777, dtype: int64

In [253]:
print(hq.shape,hqsOnly.shape)

(439546, 10) (84213, 2)


Some of the abi numbers seem to be duplicated; it looks like they might be primarily for different government agencies.

In [254]:
hqsOnly.company.value_counts()[hqsOnly.company.value_counts() > 10].index

Index(['GOVERNOR'S OFFICE', 'ATTORNEY GENERAL', 'SECRETARY OF STATE',
       'CHIEF OF STAFF', 'LIEUTENANT GOVERNOR', 'PRESS SECRETARY',
       'LIEUTENANT GOVERNOR'S OFFICE', 'TRANSPORTATION DEPARTMENT',
       'CORRECTIONS DEPT', 'CORRECTIONS DEPARTMENT',
       'SUPREME COURT CHIEF JUSTICE', 'EDUCATION DEPT', 'SUPREME COURT CLERK',
       'TRANSPORTATION DEPT', 'AGRICULTURE DEPT', 'EMERGENCY MEDICAL SVC',
       'ADJUTANT GENERAL', 'AGRICULTURE DEPARTMENT', 'ELECTIONS DIVISION',
       'EDUCATION DEPARTMENT', 'STATE VETERINARIAN', 'FIRE MARSHAL',
       'STATE LIBRARY', 'STATE TREASURER', 'REVENUE DEPARTMENT',
       'INSURANCE DEPT', 'LABOR DEPT', 'ETHICS COMMISSION',
       'NATURAL RESOURCES DEPT', 'REVENUE DEPT', 'GEOLOGICAL SURVEY',
       'CORPORATIONS DIVISION', 'LABOR DEPARTMENT', 'HEALTH DEPARTMENT',
       'PUBLIC SAFETY DEPT', 'HUMAN SERVICES DEPT', 'RACING COMMISSION',
       'PUBLIC SERVICE COMMISSION', 'SECURITIES DIVISION',
       'OCCUPATIONAL SAFETY & HEALTH', 'ADMI

In [255]:
toDiscard = hqsOnly.company.value_counts()[hqsOnly.company.value_counts() > 1].index
for company in toDiscard:
    print(company)


GOVERNOR'S OFFICE
ATTORNEY GENERAL
SECRETARY OF STATE
CHIEF OF STAFF
LIEUTENANT GOVERNOR
PRESS SECRETARY
LIEUTENANT GOVERNOR'S OFFICE
TRANSPORTATION DEPARTMENT
CORRECTIONS DEPT
CORRECTIONS DEPARTMENT
SUPREME COURT CHIEF JUSTICE
EDUCATION DEPT
SUPREME COURT CLERK
TRANSPORTATION DEPT
AGRICULTURE DEPT
EMERGENCY MEDICAL SVC
ADJUTANT GENERAL
AGRICULTURE DEPARTMENT
ELECTIONS DIVISION
EDUCATION DEPARTMENT
STATE VETERINARIAN
FIRE MARSHAL
STATE LIBRARY
STATE TREASURER
REVENUE DEPARTMENT
INSURANCE DEPT
LABOR DEPT
ETHICS COMMISSION
NATURAL RESOURCES DEPT
REVENUE DEPT
GEOLOGICAL SURVEY
CORPORATIONS DIVISION
LABOR DEPARTMENT
HEALTH DEPARTMENT
PUBLIC SAFETY DEPT
HUMAN SERVICES DEPT
RACING COMMISSION
PUBLIC SERVICE COMMISSION
SECURITIES DIVISION
OCCUPATIONAL SAFETY & HEALTH
ADMINISTRATION DEPT
WORKERS COMPENSATION
INSURANCE DEPARTMENT
ARTS COUNCIL
VOCATIONAL REHABILITATION
CHILD SUPPORT ENFORCEMENT
HUMAN RIGHTS COMMISSION
LAW LIBRARY
FORESTRY DIVISION
HEALTH DEPT
STATE POLICE
EMERGENCY MANAGEMENT AGE

In [256]:
toDiscard

Index(['GOVERNOR'S OFFICE', 'ATTORNEY GENERAL', 'SECRETARY OF STATE',
       'CHIEF OF STAFF', 'LIEUTENANT GOVERNOR', 'PRESS SECRETARY',
       'LIEUTENANT GOVERNOR'S OFFICE', 'TRANSPORTATION DEPARTMENT',
       'CORRECTIONS DEPT', 'CORRECTIONS DEPARTMENT',
       ...
       'ENVIRONMENTAL HEALTH DIV', 'MANCHESTER PUBLIC LIBRARY', 'TSP INC',
       'STERNE AGEE GROUP INC', 'KADEMENOS WISEHART HINES DOLYK',
       'WORK FORCE SVC', 'CULTURAL RESOURCES DEPT', 'COAST DENTAL SVC INC',
       'KENNIE'S MARKETS INC', 'BANCSHARES INC'],
      dtype='object', length=1975)

In [257]:
hqsOnly = hqsOnly[~hqsOnly.company.isin(toDiscard)]
hq      = hq[~hq.company.isin(toDiscard)]

In [258]:
hq.shape

(394920, 10)

At this point, we have a unique record of every company - hq here. Some of these may well be duplicate entries for a given company, for the cases in which we have a company that has multiple hq.

Let's stash it so that we don't have to go through the above ^^ again.

In [259]:
hqsOnly.to_csv("../../data/ig2010s_uniqueHQs.csv")

In [260]:
hq.to_csv("../../data/ig2010s_uniqueHQs_multLocations.csv")

In [3]:
hqsOnly     = pd.read_csv("../../data/ig2010s_uniqueHQs.csv").drop(columns = {'Unnamed: 0'})
hqsWithYear = pd.read_csv("../../data/ig2010s_uniqueHQs_multLocations.csv",dtype={'zipcode': 'object'})[['abi','company',
                                                                             'archive_version_year',
                                                                             'state','city','zipcode','address_line_1']]

In [4]:
hqsOnly.head()

Unnamed: 0,abi,company
0,7609,SOLITRON DEVICES INC
1,21311,WESTERN STATES ENVELOPE & LBL
2,29603,THIELE KAOLIN CO
3,71340,TRI STAFF GROUP
4,77743,NATIONAL TECHNICAL SYSTEMS INC


In [5]:
hqsWithYear.head()

Unnamed: 0,abi,company,archive_version_year,state,city,zipcode,address_line_1
0,7609,SOLITRON DEVICES INC,2010.0,FL,WEST PALM BEACH,33407,3301 ELECTRONICS WAY # C
1,21311,WESTERN STATES ENVELOPE & LBL,2010.0,WI,BUTLER,53007,4480 N 132ND ST
2,29603,THIELE KAOLIN CO,2010.0,GA,SANDERSVILLE,31082,520 KAOLIN RD
3,71340,TRI STAFF GROUP,2010.0,CA,SAN DIEGO,92122,6336 GREENWICH DR # 100
4,77743,NATIONAL TECHNICAL SYSTEMS INC,2010.0,CA,CALABASAS,91302,24007 VENTURA BLVD # 200


In [6]:
hqsWithYear = hqsWithYear[hqsWithYear.archive_version_year <= 2020]

In [7]:
hqsWithYear['zipcode']

0         33407
1         53007
2         31082
3         92122
4         91302
          ...  
394915    17101
394916    92008
394917    46530
394918    06437
394919    90222
Name: zipcode, Length: 394920, dtype: object

In [8]:
hqsWithYear['last_year'] = hqsWithYear.groupby(['abi'])['archive_version_year'].transform(max)

In [9]:
print(hqsWithYear.shape)

lastHQs = hqsWithYear[hqsWithYear.archive_version_year == hqsWithYear.last_year][['abi','company','state','city','zipcode','address_line_1']]

(394920, 8)


In [10]:
lastHQs.shape

(59805, 6)

## Grab Compustat Data

First filter down to the companies for whom we have the supply chain information.

In [11]:
c_links = pd.read_csv("../../data/companyData/compustatSCLinked.csv")

c_links['year'] = c_links.srcdate.astype('str').str.slice(0,4).astype('int64')

c_links = c_links[c_links.year > 2009]

relevant_gvkeys = c_links.gvkey.append(c_links.cgvkey).drop_duplicates()

print(c_links.head(),relevant_gvkeys.shape)

     gvkey                        conm  cgvkey                       cconm  \
80    1013  ADC TELECOMMUNICATIONS INC    9899                    AT&T INC   
81    1013  ADC TELECOMMUNICATIONS INC    2136  VERIZON COMMUNICATIONS INC   
281   1094                  ACETO CORP   31673      AMERISOURCEBERGEN CORP   
282   1094                  ACETO CORP   31673      AMERISOURCEBERGEN CORP   
283   1094                  ACETO CORP    7171               MCKESSON CORP   

                       cnms   srcdate  cid  sid    ctype   salecs     scusip  \
80                     AT&T  20100930   16    0  COMPANY  300.000  000886309   
81   VERIZON COMMUNICATIONS  20100930   13    0  COMPANY  146.000  000886309   
281  AmerisourceBergen Corp  20160630   13    0  COMPANY   78.193  004446100   
282  AmerisourceBergen Corp  20170630   13    0  COMPANY   76.598  004446100   
283           McKesson Corp  20170630   19    0  COMPANY   70.215  004446100   

       stic     ccusip ctic  year  
80   ADCT.1  0

Get the company dataset and check.

The legal name and the given name are slightly different, but basically the same modulo punctuation and case.

In [12]:
c_addresses = pd.read_csv("../../data/companyData/compustatAddresses.csv", 
                dtype={'parent_number': 'object'})[['fyear',
                'gvkey',
                'conm',
                'add1',
                'city',
                'state',
                'idbflag',
                'addzip',
               'naics']].drop_duplicates().rename(columns = {'fyear': 'year'})
c_addresses = c_addresses[(c_addresses.year > 2009) & (c_addresses.year < 2020)]



  exec(code_obj, self.user_global_ns, self.user_ns)


In [13]:
c_addresses.year.value_counts()

2012.0    11836
2013.0    11829
2014.0    11637
2011.0    11527
2015.0    11469
2016.0    11329
2018.0    11302
2019.0    11275
2017.0    11243
2010.0    10855
Name: year, dtype: int64

Subset this to focus on firms in: ag, mining, construction, manufacturing, wholesale and retail, and transportation.

In [14]:
c_addresses = c_addresses[(c_addresses.naics.astype('str').str.slice(0,2).isin(['11','21','22','23','31','32',
                                                         '33','42','44','45','48','49']))]

chq = c_addresses[['gvkey','conm','add1','city','state','addzip','idbflag']].drop_duplicates()

We're starting with the compustat north america dataset. Not all of the HQs are in North America, so we can filter some of the information down to match with Infogroup.

In [15]:
chq.idbflag.value_counts()

D    6960
B    1121
Name: idbflag, dtype: int64

In [16]:
canadian = ['ON', 'AB','QC', 'BC', 'NS', 'NF', 'SK', 'MB', 'NB']

chq.state.unique()

array(['IL', 'MN', 'TX', 'SC', 'AZ', 'NY', 'CT', 'FL', 'CA', nan, 'ON',
       'PA', 'NC', 'AL', 'WA', 'HI', 'NJ', 'MA', 'OH', 'NV', 'UT', 'OK',
       'WI', 'AR', 'CO', 'IA', 'DE', 'RI', 'AB', 'GA', 'MD', 'ME', 'VA',
       'IN', 'SD', 'OR', 'QC', 'BC', 'KY', 'MO', 'LA', 'VT', 'TN', 'MI',
       'DC', 'ID', 'ND', 'MS', 'KS', 'NS', 'NH', 'NM', 'NE', 'MT', 'WY',
       'NF', 'SK', 'MB', 'WV', 'NB'], dtype=object)

In [17]:
chq = chq[~(chq.state.isin(canadian)) & ~chq.state.isna()]

In [18]:
chq.addzip.str.len().value_counts()

5     4628
10     481
Name: addzip, dtype: int64

In [19]:
chq['addzip'] = chq.addzip.astype('str').str.slice(0,5)

In [20]:
chq[chq.idbflag == 'D'].addzip.str.len().value_counts()

5    4689
Name: addzip, dtype: int64

In [21]:
chq[chq.idbflag == "B"].addzip.value_counts()

77002    11
80202     5
94080     5
02139     5
10022     4
         ..
01760     1
97124     1
85251     1
18940     1
33394     1
Name: addzip, Length: 326, dtype: int64

In [22]:
print(chq.head(),chq.shape)

    gvkey                         conm  \
1    1004                     AAR CORP   
12   1013   ADC TELECOMMUNICATIONS INC   
13   1045  AMERICAN AIRLINES GROUP INC   
25   1050      CECO ENVIRONMENTAL CORP   
59   1072                     AVX CORP   

                                        add1          city state addzip  \
1   One AAR Place, 1100 North Wood Dale Road     Wood Dale    IL  60191   
12                    13625 Technology Drive  Eden Prairie    MN  55344   
13                           1 Skyview Drive    Fort Worth    TX  76155   
25     14651 North Dallas Parkway, Suite 500        Dallas    TX  75254   
59                         One AVX Boulevard  Fountain Inn    SC  29644   

   idbflag  
1        D  
12       D  
13       B  
25       D  
59       D   (5109, 7)


In [23]:
chq.rename(columns = {'conm': 'company','addzip': 'cstatZipcode'},inplace = True)
chq.head()

Unnamed: 0,gvkey,company,add1,city,state,cstatZipcode,idbflag
1,1004,AAR CORP,"One AAR Place, 1100 North Wood Dale Road",Wood Dale,IL,60191,D
12,1013,ADC TELECOMMUNICATIONS INC,13625 Technology Drive,Eden Prairie,MN,55344,D
13,1045,AMERICAN AIRLINES GROUP INC,1 Skyview Drive,Fort Worth,TX,76155,B
25,1050,CECO ENVIRONMENTAL CORP,"14651 North Dallas Parkway, Suite 500",Dallas,TX,75254,D
59,1072,AVX CORP,One AVX Boulevard,Fountain Inn,SC,29644,D


Only two of these company names appear 2x, which is good. There are ~20,000 companies in this sample.

Let's go through a little bit of a process here:
- Find the exact matches.
- Get a similarity measure between ; ideally something vectorized / something in matrix math.
- Find the top 10 matches for the remaining ones.
- Do some mix and match and see if there's any threshold at which matches become similar ``enough'' to say this is okay and good to go.


We might be able to use the fact that all of the addresses should be the same after some given point, as the compustat addresses are only the most recent ones. 

Let's try a few different ways to match these up.

First, let's find the exact matches.

Make a generic cleaning function that strips out all company names, any punctuation in the name, and makes everything lower case.

In [24]:
def cleanText(text):
    text = text.\
    replace(" CORP","").replace(" CO","").replace(" INC","").\
    replace(" LTD","").replace(" -CL A","").\
    replace(" -LP","").replace(" LP","").\
    replace("-OLD","").replace(" LLC","").\
    replace(" -CL B","").replace(" -CL i","").replace(" -CL","").\
    replace("-REDH","").replace(" CP","").\
    replace("-ADR","").replace(" PLC","").lower().replace(r'[^\w\s]+', '').\
    replace('-lp','').replace('-spn','').replace('hldg','').replace(' intl','').\
    replace('holdings','').replace('holding','').replace('prtnr','').replace('group','')
    
    
    return text

In [25]:
chq['company']               = list(map(cleanText, chq.company))
lastHQs['company']           = list(map(cleanText, lastHQs.company))

chq.rename(columns = {'city': 'cstatCity',
                     'state': 'cstatState',
                     'add1': 'cstatadd1'}, inplace = True)

chq['cstatCity']  = chq.cstatCity.str.lower()
chq['cstatState'] = chq.cstatState.str.lower()
chq['cstatadd1']  = chq.cstatadd1.str.lower()

lastHQs['city']            = lastHQs.city.str.lower()
lastHQs['state']           = lastHQs.state.str.lower()
lastHQs['address_line_1']  = lastHQs.address_line_1.str.lower()

NAICS names do not match up between compustat and infogroup so they're not helpful.

In [26]:
chq.head()

Unnamed: 0,gvkey,company,cstatadd1,cstatCity,cstatState,cstatZipcode,idbflag
1,1004,aar,"one aar place, 1100 north wood dale road",wood dale,il,60191,D
12,1013,adc telecommunications,13625 technology drive,eden prairie,mn,55344,D
13,1045,american airlines,1 skyview drive,fort worth,tx,76155,B
25,1050,ceco environmental,"14651 north dallas parkway, suite 500",dallas,tx,75254,D
59,1072,avx,one avx boulevard,fountain inn,sc,29644,D


In [27]:
lastHQs.head()

Unnamed: 0,abi,company,state,city,zipcode,address_line_1
17,158329,caridian bct,co,lakewood,80401,14143 denver west pkwy # 200
20,211946,family brands,tn,lenoir city,37771,1001 elm hill rd
23,258574,jade,pa,huntingdon vly,19006,3063 philmont ave
24,262493,montana metal products,il,des plaines,60018,25 howard ave
43,455154,o i,tx,college station,77845,151 graham rd


## Match on company name directly

In [28]:
chq.head()

Unnamed: 0,gvkey,company,cstatadd1,cstatCity,cstatState,cstatZipcode,idbflag
1,1004,aar,"one aar place, 1100 north wood dale road",wood dale,il,60191,D
12,1013,adc telecommunications,13625 technology drive,eden prairie,mn,55344,D
13,1045,american airlines,1 skyview drive,fort worth,tx,76155,B
25,1050,ceco environmental,"14651 north dallas parkway, suite 500",dallas,tx,75254,D
59,1072,avx,one avx boulevard,fountain inn,sc,29644,D


In [29]:
nameMerge = chq.merge(lastHQs)
nameMerge.shape

(3344, 12)

In [30]:
nameMerge.head()

Unnamed: 0,gvkey,company,cstatadd1,cstatCity,cstatState,cstatZipcode,idbflag,abi,state,city,zipcode,address_line_1
0,1004,aar,"one aar place, 1100 north wood dale road",wood dale,il,60191,D,115523672,il,wood dale,60191,1100 n wood dale rd
1,1013,adc telecommunications,13625 technology drive,eden prairie,mn,55344,D,7523129,mn,eden prairie,55344,13625 technology dr
2,1045,american airlines,1 skyview drive,fort worth,tx,76155,B,7501711,tx,fort worth,76155,1 skyview dr
3,1050,ceco environmental,"14651 north dallas parkway, suite 500",dallas,tx,75254,D,596284992,tx,dallas,75254,14651 dallas pkwy # 500
4,1075,pinnacle west capital,"400 north fifth street, p.o. box 53999",phoenix,az,85072,D,4554051,az,phoenix,85004,400 n 5th st frnt


In [31]:
sum(nameMerge.cstatState == nameMerge.state)/nameMerge.shape[0]

0.9455741626794258

In [34]:
sum(nameMerge.cstatZipcode.str.slice(0,5) == nameMerge.zipcode.str.slice(0,5))/nameMerge.shape[0]

0.8609449760765551

In [33]:
sum(nameMerge.cstatZipcode.str.slice(0,1) == nameMerge.zipcode.str.slice(0,1))/nameMerge.shape[0]

0.9518540669856459

In [35]:
nameMerge[nameMerge.cstatCity != nameMerge.city][50:100]

Unnamed: 0,gvkey,company,cstatadd1,cstatCity,cstatState,cstatZipcode,idbflag,abi,state,city,zipcode,address_line_1
414,7620,murphy oil,"9805 katy freeway, suite g-200",houston,tx,77024,D,7525652,ar,el dorado,71730,300 e peach st
424,7849,new jersey resources,1415 wyckoff road,wall,nj,7719,D,878627181,nj,wall township,7727,1415 wyckoff rd
426,7881,newmont,"6900 east layton avenue, suite 700",denver,co,80237,B,7526734,co,greenwood vlg,80111,6363 s fiddlers green cir #800
435,7923,norfolk southern,650 west peachtree street nw,atlanta,ga,30308,D,7526965,va,norfolk,23510,3 commercial pl # 1a
440,7991,terex,"45 glover avenue, 4th floor",norwalk,ct,6850,D,437605223,ct,westport,6880,200 nyala farms rd # 2
445,8073,ocean bio-chem,4041 sw 47th avenue,fort lauderdale,fl,33314,D,443639711,fl,davie,33314,4041 sw 47th ave
499,8901,rpc,"2801 buford highway ne, suite 300",atlanta,ga,30329,D,3485588,ga,brookhaven,30329,2801 buford hwy ne # 300
516,9325,sl industries,"520 fellowship road, suite a-114",mount laurel,nj,8054,D,406295980,nj,mt laurel,8054,520 fellowship rd # a114
519,9372,st jude medical,one st. jude medical drive,st. paul,mn,55117,D,4349056,mn,st paul,55117,1 saint jude medical dr
520,9411,hillshire brands,3500 lacey road,downers grove,il,60515,D,7509318,il,chicago,60607,400 s jefferson st # 1n


Now focus down onto the companies that have not been matched.

In [36]:
chqUnmatched = chq[~chq.company.isin(nameMerge.company)].reset_index()
chqUnmatched.shape

(1800, 8)

In [37]:
igUnmatched  = lastHQs[~lastHQs.company.isin(nameMerge.company)].reset_index()
igUnmatched.shape

(56475, 7)

In [38]:
lastHQs.shape

(59805, 6)

# Find Distance

Two distance measures here. Look at top 5 matches and pull the distance measure and matches as well.

### Levenshtein

In [39]:
from Levenshtein import distance as levenshtein_distance

In [40]:
companyArrayCStat = []

company = chqUnmatched.company[0]
start = time.time()
for company in chqUnmatched.company:
    thisCompany = []
    for ig in igUnmatched.company:
        thisCompany.append(levenshtein_distance(company,ig))
    
    companyArrayCStat.append([thisCompany])

allLD = np.concatenate(companyArrayCStat)


In [41]:
igUnmatched.head()

Unnamed: 0,index,abi,company,state,city,zipcode,address_line_1
0,17,158329,caridian bct,co,lakewood,80401,14143 denver west pkwy # 200
1,20,211946,family brands,tn,lenoir city,37771,1001 elm hill rd
2,23,258574,jade,pa,huntingdon vly,19006,3063 philmont ave
3,24,262493,montana metal products,il,des plaines,60018,25 howard ave
4,43,455154,o i,tx,college station,77845,151 graham rd


In [89]:
n = 5
singleLargestLV   = (allLD).argsort(axis=-1)[:, :1]
largestElementsLV = (allLD).argsort(axis=-1)[:, :n]


companyMatches = pd.DataFrame()
companyMatches['cstatCompanies'] = chqUnmatched.company
companyMatches['cstatadd1']      = chqUnmatched.cstatadd1
companyMatches['cstatCity']      = chqUnmatched.cstatCity
companyMatches['cstatState']     = chqUnmatched.cstatState
companyMatches['cstatZip']       = chqUnmatched.cstatZipcode

for i in range(0,companyMatches.shape[0]):
    # print(list(np.array(hq.company)[largestElements[i]]))
    companyMatches.at[i,'misspelling']         = allLD[i,:][singleLargestLV[i]]
    companyMatches.at[i,'levCompany']          = igUnmatched.company[singleLargestLV[i]].iloc[0]
    
    companyMatches.at[i,'closestMatchIG_add']      = igUnmatched.address_line_1[singleLargestLV[i]].iloc[0]
    companyMatches.at[i,'closestMatchIG_city']     = igUnmatched.city[singleLargestLV[i]].iloc[0]
    companyMatches.at[i,'closestMatchIG_zipcode']  = igUnmatched.zipcode[singleLargestLV[i]].iloc[0]
    companyMatches.at[i,'closestMatchIG_state']    = igUnmatched.state[singleLargestLV[i]].iloc[0]

    
    # companyMatches.at[i,'closestMatchIG']      = np.array(igUnmatched.company)[largestElementsLV[i]]
    # companyMatches.at[i,'LevSim']              = np.array(allLD[i,:][largestElementsLV[i]], dtype=object)

In [90]:
companyMatches.head()

Unnamed: 0,cstatCompanies,cstatadd1,cstatCity,cstatState,cstatZip,misspelling,levCompany,closestMatchIG_add,closestMatchIG_city,closestMatchIG_zipcode,closestMatchIG_state
0,avx,one avx boulevard,fountain inn,sc,29644,1.0,avp,960 knox st # a,torrance,90502,ca
1,spire alabama,605 richard arrington boulevard north,birmingham,al,35203,6.0,ire global,111 congress ave # 4th,austin,78701,tx
2,alabama power,600 north 18th street,birmingham,al,35203,4.0,alstom power,200 great pond dr,windsor,6095,ct
3,tamir biothechnology,"51 jfk parkway, 1st floor west, suite 108",short hills,nj,7078,1.0,tamir biotechnology,11 deerpark dr # 204,monmouth jct,8852,nj
4,petro usa,7325 oswego road,liverpool,ny,13090,1.0,etro usa,41 w 56th st,new york,10019,ny


Now get the embeddings and the cosine similarity between them.

In [91]:
def getMatrix(companyEmbeddings):
    companyArray = []
    
    for companies in companyEmbeddings:
        companyArray.append([companies.vector])
    
    companyArray = np.concatenate(companyArray)
    
    return(companyArray)
        

In [93]:
start = time.time()

time.time() - start

149.55907773971558

In [94]:
outfile =  '../../data/allCompaniesIG_embeddings.pkl'
with open(outfile, 'wb') as pickle_file:
    pkl.dump(allCompaniesIG, pickle_file)
    
outfile =  '../../data/allCompaniesCStat_embeddings.pkl'
with open(outfile, 'wb') as pickle_file:
    pkl.dump(allCompaniesCStat, pickle_file)    

NameError: name 'allCompaniesCStat' is not defined

In [95]:
chqUnmatchedList = list(map(nlp, chqUnmatched.company))
allCompaniesIG = list(map(nlp, igUnmatched.company))


cstat = getMatrix(chqUnmatchedList)
ig    = getMatrix(allCompaniesIG)

allSimilarities = cosine_similarity(cstat,ig)

allSimilarities.shape

Each row n here has the similarity between the nth company name in compustat and the IG company corresp to that column.

In [98]:
allSimilarities[0:5,:]

array([[ 0.27351984, -0.16348277,  0.02695994, ..., -0.06730537,
         0.09521193, -0.00591509],
       [ 0.20687255,  0.05328101,  0.24833122, ...,  0.31417164,
         0.46006292,  0.2882711 ],
       [-0.01470069,  0.27651983,  0.19719386, ...,  0.45301446,
         0.3847386 ,  0.36670434],
       [ 0.24357395, -0.17722532,  0.09829525, ..., -0.095217  ,
         0.07805435, -0.07496908],
       [ 0.12485278,  0.12257036,  0.17973134, ...,  0.23482014,
         0.3301328 ,  0.3126555 ]], dtype=float32)

Find indices of companies in IG most similar to each company in CStat.

In [99]:
largestElementsCos = (-allSimilarities).argsort(axis=-1)[:, :n]
singleLargestCos   = (-allSimilarities).argsort(axis=-1)[:, :1]

Add the cosine similarity measures to the similarity dataset.

In [101]:
for i in range(0,companyMatches.shape[0]):
    # print(list(np.array(hq.company)[largestElements[i]]))
    companyMatches.at[i,'cosSimilarity']        = allSimilarities[i,:][singleLargestCos[i]]
    companyMatches.at[i,'cosSimilarityCompany'] = igUnmatched.company[singleLargestCos[i]].iloc[0]
    # companyMatches.at[i,'closestMatchCosine']   = np.array(igUnmatched.company)[largestElementsCos[i]]
    # companyMatches.at[i,'cosineSim']            = np.array(allSimilarities[i,:][largestElementsCos[i]], dtype=object)
    igUnmatched.company[singleLargestLV[i]].iloc[0]
    companyMatches.at[i,'costMatchIG_add']     = igUnmatched.address_line_1[singleLargestCos[i]].iloc[0]
    companyMatches.at[i,'cosMatchIG_city']     = igUnmatched.city[singleLargestCos[i]].iloc[0]
    companyMatches.at[i,'cosMatchIG_zipcode']  = igUnmatched.zipcode[singleLargestCos[i]].iloc[0]
    companyMatches.at[i,'cosMatchIG_state']    = igUnmatched.state[singleLargestCos[i]].iloc[0]
    

In [102]:
sum((companyMatches.levCompany == companyMatches.cosSimilarityCompany))

321

Now find the company matches: ABI - gvkey link.

Start with ones where the names both match.

In [103]:
bothMatch_cityZip = companyMatches[(companyMatches.levCompany == companyMatches.cosSimilarityCompany) & \
              ((companyMatches.cstatCity == companyMatches.closestMatchIG_city) | \
              (companyMatches.cstatZip == companyMatches.closestMatchIG_zipcode))]
bothMatch_cityZip.shape

bothMatch_cityZip.to_csv("../../data/companyData/bothMatch_cityZip.csv")

In [116]:
bothMatch_cityZip['igCompanies'] = bothMatch_cityZip.levCompany
companiesToCheck                 = bothMatch_cityZip[['cstatCompanies','igCompanies']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [105]:
companyMatchesBoth = list(bothMatch_cityZip.cstatCompanies.unique())
len(companyMatchesBoth)

238

In [111]:
oneMatch_cityZipOnly.cstatCity

7              chicago
14             phoenix
17             atlanta
18          scottsdale
20         new hampton
             ...      
1768         lexington
1771          plymouth
1780    virginia beach
1785         cambridge
1797         cambridge
Name: cstatCity, Length: 362, dtype: object

In [114]:
oneMatch_cityZipOnly = companyMatches[-(companyMatches.cstatCompanies.isin(companyMatchesBoth)) & \
              (((companyMatches.cstatCity == companyMatches.closestMatchIG_city) | \
              (companyMatches.cstatZip == companyMatches.closestMatchIG_zipcode)) | \
              ((companyMatches.cstatCity == companyMatches.cosMatchIG_city) | \
              (companyMatches.cstatZip == companyMatches.cosMatchIG_zipcode)))].reset_index(drop=True)

oneMatch_cityZipOnly['igCompanies'] = ''

for i in range(0,oneMatch_cityZipOnly.shape[0]):
    if ((oneMatch_cityZipOnly.cstatCity[i] == oneMatch_cityZipOnly.closestMatchIG_city[i]) | \
              (oneMatch_cityZipOnly.cstatZip[i] == oneMatch_cityZipOnly.closestMatchIG_zipcode[i])):
        oneMatch_cityZipOnly.loc[i,'igCompanies'] = companyMatches.levCompany[i]
    else:
        oneMatch_cityZipOnly.loc[i,'igCompanies'] = companyMatches.cosSimilarityCompany[i]

# oneMatch_cityZipOnly.to_csv("../../data/companyData/oneMatch_cityZipOnly.csv")

In [119]:
companiesToCheck = companiesToCheck.append(oneMatch_cityZipOnly[['cstatCompanies','igCompanies']]).drop_duplicates()

companiesToCheck.shape

(600, 2)

In [120]:
chqUnmatched.head()

Unnamed: 0,index,gvkey,company,cstatadd1,cstatCity,cstatState,cstatZipcode,idbflag
0,59,1072,avx,one avx boulevard,fountain inn,sc,29644,D
1,272,1224,spire alabama,605 richard arrington boulevard north,birmingham,al,35203,D
2,284,1225,alabama power,600 north 18th street,birmingham,al,35203,D
3,364,1259,tamir biothechnology,"51 jfk parkway, 1st floor west, suite 108",short hills,nj,7078,D
4,395,1272,petro usa,7325 oswego road,liverpool,ny,13090,D


In [127]:
companiesToCheck.head()

Unnamed: 0,cstatCompanies,igCompanies
8,american science engineering,american science & engineering
11,apco oil and gas,apco oil & gas
13,arden,arden
16,arts way mfg,art's way mfg
21,constellation energy grp,constellation energy


In [152]:
chqStillUnmatched = chqUnmatched[-chqUnmatched.company.isin(companiesToCheck.cstatCompanies)].reset_index(drop=True)
igStillUnmatched  = igUnmatched[-igUnmatched.company.isin(companiesToCheck.igCompanies)].reset_index(drop=True)

# companyMatches['cstatCompanies'] = chqUnmatched.company
print(chqUnmatched.shape, chqStillUnmatched.shape, companiesToCheck.shape)
print(igStillUnmatched.shape,igUnmatched.shape)

(1800, 8) (1200, 8) (600, 2)
(55956, 7) (56475, 7)


## Take 2
Match remaining ones on first word of name

In [124]:
chqUnmatched.company[0].split(' ')[0]

'avx'

Get the edit distance for the first words of the company names.

In [131]:
companyArrayCStat = []

start = time.time()
for company in chqStillUnmatched.company:
    thisCompany = []
    for ig in igStillUnmatched.company:
        thisCompany.append(levenshtein_distance(company.split(' ')[0],ig.split(' ')[0]))
    
    companyArrayCStat.append([thisCompany])

allLD = np.concatenate(companyArrayCStat)


And the cosine distance.

In [146]:
chqStillUnmatchedFirstCo = []
igStillUnmatchedFirstCo = []

for company in chqStillUnmatched.company:
    chqStillUnmatchedFirstCo.append(company.split(' ')[0])
    
for company in igStillUnmatched.company:
    igStillUnmatchedFirstCo.append(company.split(' ')[0])

In [149]:
chqUnmatchedList = list(map(nlp, chqStillUnmatchedFirstCo))
allCompaniesIG = list(map(nlp, igStillUnmatchedFirstCo))


cstat = getMatrix(chqUnmatchedList)
ig    = getMatrix(allCompaniesIG)

allSimilarities = cosine_similarity(cstat,ig)

allSimilarities.shape

largestElementsCos = (-allSimilarities).argsort(axis=-1)[:, :n]
singleLargestCos   = (-allSimilarities).argsort(axis=-1)[:, :1]

In [151]:
igStillUnmatched.head()

Unnamed: 0,index,abi,company,state,city,zipcode,address_line_1
1,20,211946,family brands,tn,lenoir city,37771,1001 elm hill rd
2,23,258574,jade,pa,huntingdon vly,19006,3063 philmont ave
3,24,262493,montana metal products,il,des plaines,60018,25 howard ave
4,43,455154,o i,tx,college station,77845,151 graham rd
5,65,536672,ic isaacs &,ny,new york,10018,475 10th ave # 9


In [161]:
n = 5
singleLargestLV   = (allLD).argsort(axis=-1)[:, :1]
largestElementsLV = (allLD).argsort(axis=-1)[:, :n]


companyMatches2 = pd.DataFrame()
companyMatches2['cstatCompanies'] = chqStillUnmatched.company
companyMatches2['cstatadd1']      = chqStillUnmatched.cstatadd1
companyMatches2['cstatCity']      = chqStillUnmatched.cstatCity
companyMatches2['cstatState']     = chqStillUnmatched.cstatState
companyMatches2['cstatZip']       = chqStillUnmatched.cstatZipcode

for i in range(0,companyMatches2.shape[0]):
    # print(list(np.array(hq.company)[largestElements[i]]))
    companyMatches2.at[i,'misspelling']         = allLD[i,:][singleLargestLV[i]]
    companyMatches2.at[i,'levCompany']          = igStillUnmatched.company[singleLargestLV[i]].iloc[0]
    
    companyMatches2.at[i,'closestMatchIG_add']      = igStillUnmatched.address_line_1[singleLargestLV[i]].iloc[0]
    companyMatches2.at[i,'closestMatchIG_city']     = igStillUnmatched.city[singleLargestLV[i]].iloc[0]
    companyMatches2.at[i,'closestMatchIG_zipcode']  = igStillUnmatched.zipcode[singleLargestLV[i]].iloc[0]
    companyMatches2.at[i,'closestMatchIG_state']    = igStillUnmatched.state[singleLargestLV[i]].iloc[0]

    
    companyMatches2.at[i,'cosSimilarity']        = allSimilarities[i,:][singleLargestCos[i]]
    companyMatches2.at[i,'cosSimilarityCompany'] = igStillUnmatched.company[singleLargestCos[i]].iloc[0]
   
    companyMatches2.at[i,'costMatchIG_add']     = igStillUnmatched.address_line_1[singleLargestCos[i]].iloc[0]
    companyMatches2.at[i,'cosMatchIG_city']     = igStillUnmatched.city[singleLargestCos[i]].iloc[0]
    companyMatches2.at[i,'cosMatchIG_zipcode']  = igStillUnmatched.zipcode[singleLargestCos[i]].iloc[0]
    companyMatches2.at[i,'cosMatchIG_state']    = igStillUnmatched.state[singleLargestCos[i]].iloc[0]

    

In [162]:
companyMatches2.head()

Unnamed: 0,cstatCompanies,cstatadd1,cstatCity,cstatState,cstatZip,misspelling,levCompany,closestMatchIG_add,closestMatchIG_city,closestMatchIG_zipcode,closestMatchIG_state,cosSimilarity,cosSimilarityCompany,costMatchIG_add,cosMatchIG_city,cosMatchIG_zipcode,cosMatchIG_state
0,avx,one avx boulevard,fountain inn,sc,29644,1.0,avi foodsystems,2590 elm rd ne,warren,44483,oh,0.452242,protech home medical,1019 town dr,highland heights,41076,ky
1,spire alabama,605 richard arrington boulevard north,birmingham,al,35203,0.0,spire digital,940 n lincoln st # 3,denver,80203,co,1.0,spire digital,940 n lincoln st # 3,denver,80203,co
2,alabama power,600 north 18th street,birmingham,al,35203,0.0,alabama business furnishings,12 20th st s,birmingham,35233,al,1.0,alabama forestymmission,513 madison ave,montgomery,36104,al
3,tamir biothechnology,"51 jfk parkway, 1st floor west, suite 108",short hills,nj,7078,1.0,tamer media,13 broadcast plz sw,albuquerque,87104,nm,0.407618,novatech,4106 charlotte ave,nashville,37209,tn
4,petro usa,7325 oswego road,liverpool,ny,13090,0.0,petro tech oil & gas,1600 airport fwy,bedford,76022,tx,1.0,petro energy,920 10th ave n,onalaska,54650,wi


Find if city or zip match here.

In [163]:
match2_cityZips = companyMatches2[(((companyMatches2.cstatCity == companyMatches2.closestMatchIG_city) | \
              (companyMatches2.cstatZip == companyMatches2.closestMatchIG_zipcode)) | \
              ((companyMatches2.cstatCity == companyMatches2.cosMatchIG_city) | \
              (companyMatches2.cstatZip == companyMatches2.cosMatchIG_zipcode)))].reset_index(drop=True)


In [164]:
match2_cityZips.shape

(90, 17)

In [165]:
match2_cityZips.to_csv("../../data/companyData/match2_cityZips.csv")

## Take 3
Try the addresses here.

In [441]:
spelling.shape

(230, 17)

In [320]:
companyMatches.sort_values(by=['misspelling','cosSimilarity'],inplace = True)

In [345]:
companyMatches.head()

Unnamed: 0,cstatCompanies,cstatadd1,cstatCity,cstatState,cstatZip,misspelling,levCompany,closestMatchIG_add,closestMatchIG_city,closestMatchIG_zipcode,closestMatchIG_state,cosSimilarity,cosSimilarityCompany,costMatchIG_add,cosMatchIG_city,cosMatchIG_zipcode,cosMatchIG_state
0,avx,one avx boulevard,fountain inn,sc,29644,1.0,avp,960 knox st # a,torrance,90502,ca,0.420928,[esm ferolie],2 van riper rd,montvale,7645,nj
1,spire alabama,605 richard arrington boulevard north,birmingham,al,35203,6.0,ire global,111 congress ave # 4th,austin,78701,tx,0.754357,[alabama forestymmission],513 madison ave,montgomery,36104,al
2,alabama power,600 north 18th street,birmingham,al,35203,4.0,alstom power,200 great pond dr,windsor,6095,ct,0.813616,[alabama electric],1728 headland ave,dothan,36303,al
3,tamir biothechnology,"51 jfk parkway, 1st floor west, suite 108",short hills,nj,7078,1.0,tamir biotechnology,11 deerpark dr # 204,monmouth jct,8852,nj,0.78913,[tamir biotechnology],11 deerpark dr # 204,monmouth jct,8852,nj
4,petro usa,7325 oswego road,liverpool,ny,13090,1.0,etro usa,41 w 56th st,new york,10019,ny,0.858537,[petro serve usa],1772 main ave w,west fargo,58078,nd


In [None]:
chq = pd.read_csv("../../data/companyData/compustatChanges_2010s.csv").drop(columns = {'Unnamed: 0'})

In [None]:
# Put copmanies 