In [2]:
import pickle as pkl
import pandas as pd
import matplotlib
import os
import re

import collections
import datetime
import time

import geopandas as gpd

import numpy as np

import rasterio

import spacy
  
nlp = spacy.load('en_core_web_lg')
from sklearn.metrics.pairwise import cosine_similarity

import itertools

# Get Full IG Dataset

In [None]:
file = "../../data/companyData/infogroup2010s.csv"

In [None]:
import dask.dataframe as dd

df = dd.read_csv(file, assume_missing=True, 
                 dtype={'parent_number': 'object','parent_employee_size_code': 'object',
                       'parent_sales_volume_code': 'object',
                       'abi': 'object','zipcode': 'object'}, low_memory = False)
df = df[df.business_status_code == 1.0]


In [None]:
df.columns

In [None]:
df.head()

In [None]:
hq = df[['abi','ticker','company','archive_version_year','state','city',
         'address_line_1','zipcode',
         'latitude','longitude']].drop_duplicates().compute(num_workers = 100)

In [None]:
hqsOnly = hq[['abi','company']].drop_duplicates()

In [None]:
hqsOnly.company.value_counts()

In [None]:
print(hq.shape,hqsOnly.shape)

Some of the abi numbers seem to be duplicated; it looks like they might be primarily for different government agencies.

In [None]:
hqsOnly.company.value_counts()[hqsOnly.company.value_counts() > 10].index

In [None]:
toDiscard = hqsOnly.company.value_counts()[hqsOnly.company.value_counts() > 1].index
for company in toDiscard:
    print(company)


In [None]:
toDiscard

In [None]:
hqsOnly = hqsOnly[~hqsOnly.company.isin(toDiscard)]
hq      = hq[~hq.company.isin(toDiscard)]

In [None]:
hq.shape

At this point, we have a unique record of every company - hq here. Some of these may well be duplicate entries for a given company, for the cases in which we have a company that has multiple hq.

Let's stash it so that we don't have to go through the above ^^ again.

In [None]:
hqsOnly.to_csv("../../data/ig2010s_uniqueHQs.csv")

In [None]:
hq.to_csv("../../data/ig2010s_uniqueHQs_multLocations.csv")

## Grab Compustat Data

First filter down to the companies for whom we have the supply chain information.

In [357]:
c_links.gvkey.unique().shape[0]

2479

In [11]:
c_links = pd.read_csv("../../data/companyData/compustatSCLinked.csv")

c_links['year'] = c_links.srcdate.astype('str').str.slice(0,4).astype('int64')

c_links = c_links[c_links.year > 2009]

relevant_gvkeys = c_links.gvkey.append(c_links.cgvkey).drop_duplicates()

print(c_links.head(),relevant_gvkeys.shape)

     gvkey                        conm  cgvkey                       cconm  \
80    1013  ADC TELECOMMUNICATIONS INC    9899                    AT&T INC   
81    1013  ADC TELECOMMUNICATIONS INC    2136  VERIZON COMMUNICATIONS INC   
281   1094                  ACETO CORP   31673      AMERISOURCEBERGEN CORP   
282   1094                  ACETO CORP   31673      AMERISOURCEBERGEN CORP   
283   1094                  ACETO CORP    7171               MCKESSON CORP   

                       cnms   srcdate  cid  sid    ctype   salecs     scusip  \
80                     AT&T  20100930   16    0  COMPANY  300.000  000886309   
81   VERIZON COMMUNICATIONS  20100930   13    0  COMPANY  146.000  000886309   
281  AmerisourceBergen Corp  20160630   13    0  COMPANY   78.193  004446100   
282  AmerisourceBergen Corp  20170630   13    0  COMPANY   76.598  004446100   
283           McKesson Corp  20170630   19    0  COMPANY   70.215  004446100   

       stic     ccusip ctic  year  
80   ADCT.1  0

Get the company dataset and check.

The legal name and the given name are slightly different, but basically the same modulo punctuation and case.

In [12]:
c_addresses = pd.read_csv("../../data/companyData/compustatAddresses.csv", 
                dtype={'parent_number': 'object'})[['fyear',
                'gvkey',
                'conm',
                'add1',
                'city',
                'state',
                'idbflag',
                'addzip',
               'naics']].drop_duplicates().rename(columns = {'fyear': 'year'})
c_addresses = c_addresses[(c_addresses.year > 2009) & (c_addresses.year < 2020)]



  exec(code_obj, self.user_global_ns, self.user_ns)


In [13]:
c_addresses.year.value_counts()

2012.0    11836
2013.0    11829
2014.0    11637
2011.0    11527
2015.0    11469
2016.0    11329
2018.0    11302
2019.0    11275
2017.0    11243
2010.0    10855
Name: year, dtype: int64

Subset this to focus on firms in: ag, mining, construction, manufacturing, wholesale and retail, and transportation.

In [14]:
c_addresses = c_addresses[(c_addresses.naics.astype('str').str.slice(0,2).isin(['11','21','22','23','31','32',
                                                         '33','42','44','45','48','49']))]

chq = c_addresses[['gvkey','conm','add1','city','state','addzip','idbflag']].drop_duplicates()

We're starting with the compustat north america dataset. Not all of the HQs are in North America, so we can filter some of the information down to match with Infogroup.

In [15]:
chq.idbflag.value_counts()

D    6960
B    1121
Name: idbflag, dtype: int64

In [16]:
canadian = ['ON', 'AB','QC', 'BC', 'NS', 'NF', 'SK', 'MB', 'NB']

chq.state.unique()

array(['IL', 'MN', 'TX', 'SC', 'AZ', 'NY', 'CT', 'FL', 'CA', nan, 'ON',
       'PA', 'NC', 'AL', 'WA', 'HI', 'NJ', 'MA', 'OH', 'NV', 'UT', 'OK',
       'WI', 'AR', 'CO', 'IA', 'DE', 'RI', 'AB', 'GA', 'MD', 'ME', 'VA',
       'IN', 'SD', 'OR', 'QC', 'BC', 'KY', 'MO', 'LA', 'VT', 'TN', 'MI',
       'DC', 'ID', 'ND', 'MS', 'KS', 'NS', 'NH', 'NM', 'NE', 'MT', 'WY',
       'NF', 'SK', 'MB', 'WV', 'NB'], dtype=object)

In [17]:
chq = chq[~(chq.state.isin(canadian)) & ~chq.state.isna()]

In [18]:
chq.addzip.str.len().value_counts()

5     4628
10     481
Name: addzip, dtype: int64

In [19]:
chq['addzip'] = chq.addzip.astype('str').str.slice(0,5)

In [20]:
chq[chq.idbflag == 'D'].addzip.str.len().value_counts()

5    4689
Name: addzip, dtype: int64

In [21]:
chq[chq.idbflag == "B"].addzip.value_counts()

77002    11
80202     5
94080     5
02139     5
10022     4
         ..
01760     1
97124     1
85251     1
18940     1
33394     1
Name: addzip, Length: 326, dtype: int64

In [22]:
print(chq.head(),chq.shape)

    gvkey                         conm  \
1    1004                     AAR CORP   
12   1013   ADC TELECOMMUNICATIONS INC   
13   1045  AMERICAN AIRLINES GROUP INC   
25   1050      CECO ENVIRONMENTAL CORP   
59   1072                     AVX CORP   

                                        add1          city state addzip  \
1   One AAR Place, 1100 North Wood Dale Road     Wood Dale    IL  60191   
12                    13625 Technology Drive  Eden Prairie    MN  55344   
13                           1 Skyview Drive    Fort Worth    TX  76155   
25     14651 North Dallas Parkway, Suite 500        Dallas    TX  75254   
59                         One AVX Boulevard  Fountain Inn    SC  29644   

   idbflag  
1        D  
12       D  
13       B  
25       D  
59       D   (5109, 7)


In [23]:
chq.rename(columns = {'conm': 'company','addzip': 'cstatZipcode'},inplace = True)
chq.head()

Unnamed: 0,gvkey,company,add1,city,state,cstatZipcode,idbflag
1,1004,AAR CORP,"One AAR Place, 1100 North Wood Dale Road",Wood Dale,IL,60191,D
12,1013,ADC TELECOMMUNICATIONS INC,13625 Technology Drive,Eden Prairie,MN,55344,D
13,1045,AMERICAN AIRLINES GROUP INC,1 Skyview Drive,Fort Worth,TX,76155,B
25,1050,CECO ENVIRONMENTAL CORP,"14651 North Dallas Parkway, Suite 500",Dallas,TX,75254,D
59,1072,AVX CORP,One AVX Boulevard,Fountain Inn,SC,29644,D


In [27]:
chq.to_csv("../../data/chq.csv")

## Breakpoint
We can start from here and just clean everything from here.


Headquarters:

In [44]:
chq     = pd.read_csv("../../data/chq.csv",dtype={'cstatZipcode': 'object'}).drop(columns = {'Unnamed: 0'})

chq.head()

Unnamed: 0,gvkey,company,add1,city,state,cstatZipcode,idbflag
0,1004,AAR CORP,"One AAR Place, 1100 North Wood Dale Road",Wood Dale,IL,60191,D
1,1013,ADC TELECOMMUNICATIONS INC,13625 Technology Drive,Eden Prairie,MN,55344,D
2,1045,AMERICAN AIRLINES GROUP INC,1 Skyview Drive,Fort Worth,TX,76155,B
3,1050,CECO ENVIRONMENTAL CORP,"14651 North Dallas Parkway, Suite 500",Dallas,TX,75254,D
4,1072,AVX CORP,One AVX Boulevard,Fountain Inn,SC,29644,D


In [45]:
hqsOnly     = pd.read_csv("../../data/ig2010s_uniqueHQs.csv").drop(columns = {'Unnamed: 0'})
hqsWithYear = pd.read_csv("../../data/ig2010s_uniqueHQs_multLocations.csv",dtype={'zipcode': 'object'})[['abi','company',
                                                                             'archive_version_year',
                                                                             'state','city','zipcode','address_line_1']]

print(hqsOnly.head())
print(hqsWithYear.head())

hqsWithYear = hqsWithYear[hqsWithYear.archive_version_year <= 2020]

hqsWithYear['last_year'] = hqsWithYear.groupby(['abi'])['archive_version_year'].transform(max)

print(hqsWithYear.shape)

lastHQs = hqsWithYear[hqsWithYear.archive_version_year == hqsWithYear.last_year][['abi','company','state','city','zipcode','address_line_1']]

print(lastHQs.shape)

lastHQs.head()

     abi                         company
0   7609            SOLITRON DEVICES INC
1  21311   WESTERN STATES ENVELOPE & LBL
2  29603                THIELE KAOLIN CO
3  71340                 TRI STAFF GROUP
4  77743  NATIONAL TECHNICAL SYSTEMS INC
     abi                         company  archive_version_year state  \
0   7609            SOLITRON DEVICES INC                2010.0    FL   
1  21311   WESTERN STATES ENVELOPE & LBL                2010.0    WI   
2  29603                THIELE KAOLIN CO                2010.0    GA   
3  71340                 TRI STAFF GROUP                2010.0    CA   
4  77743  NATIONAL TECHNICAL SYSTEMS INC                2010.0    CA   

              city zipcode            address_line_1  
0  WEST PALM BEACH   33407  3301 ELECTRONICS WAY # C  
1           BUTLER   53007           4480 N 132ND ST  
2     SANDERSVILLE   31082             520 KAOLIN RD  
3        SAN DIEGO   92122   6336 GREENWICH DR # 100  
4        CALABASAS   91302  24007 VENTURA BLVD

Unnamed: 0,abi,company,state,city,zipcode,address_line_1
17,158329,CARIDIAN BCT INC,CO,LAKEWOOD,80401,14143 DENVER WEST PKWY # 200
20,211946,FAMILY BRANDS INTL LLC,TN,LENOIR CITY,37771,1001 ELM HILL RD
23,258574,JADE CORP,PA,HUNTINGDON VLY,19006,3063 PHILMONT AVE
24,262493,MONTANA METAL PRODUCTS LLC,IL,DES PLAINES,60018,25 HOWARD AVE
43,455154,O I CORP,TX,COLLEGE STATION,77845,151 GRAHAM RD


Only two of these company names appear 2x, which is good. There are ~20,000 companies in this sample.

Let's go through a little bit of a process here:
- Find the exact matches.
- Get a similarity measure between ; ideally something vectorized / something in matrix math.
- Find the top 10 matches for the remaining ones.
- Do some mix and match and see if there's any threshold at which matches become similar ``enough'' to say this is okay and good to go.


We might be able to use the fact that all of the addresses should be the same after some given point, as the compustat addresses are only the most recent ones. 

Let's try a few different ways to match these up.

First, let's find the exact matches.

Make a generic cleaning function that strips out all company names, any punctuation in the name, and makes everything lower case.

In [34]:
def cleanText(text):
    text = text.\
    replace(" CORP","").replace(" CO","").replace(" INC","").\
    replace(" LTD","").replace(" -CL A","").\
    replace(" -LP","").replace(" LP","").\
    replace("-OLD","").replace(" LLC","").\
    replace(" -CL B","").replace(" -CL i","").replace(" -CL","").\
    replace("-REDH","").replace(" CP","").\
    replace("-ADR","").replace(" PLC","").lower().replace(r'[^\w\s]+', '').\
    replace('-lp','').replace('-spn','').replace('hldg','').replace(' intl','').\
    replace('holdings','').replace('holding','').replace('prtnr','').replace('group','')
    
    
    return text

In [46]:
chq['company']               = list(map(cleanText, chq.company))
lastHQs['company']           = list(map(cleanText, lastHQs.company))

chq.rename(columns = {'city': 'cstatCity',
                     'state': 'cstatState',
                     'add1': 'cstatadd1'}, inplace = True)

chq['cstatCity']  = chq.cstatCity.str.lower()
chq['cstatState'] = chq.cstatState.str.lower()
chq['cstatadd1']  = chq.cstatadd1.str.lower()

lastHQs['city']            = lastHQs.city.str.lower()
lastHQs['state']           = lastHQs.state.str.lower()
lastHQs['address_line_1']  = lastHQs.address_line_1.str.lower()

NAICS names do not match up between compustat and infogroup so they're not helpful.

In [47]:
chq.head()

Unnamed: 0,gvkey,company,cstatadd1,cstatCity,cstatState,cstatZipcode,idbflag
0,1004,aar,"one aar place, 1100 north wood dale road",wood dale,il,60191,D
1,1013,adc telecommunications,13625 technology drive,eden prairie,mn,55344,D
2,1045,american airlines,1 skyview drive,fort worth,tx,76155,B
3,1050,ceco environmental,"14651 north dallas parkway, suite 500",dallas,tx,75254,D
4,1072,avx,one avx boulevard,fountain inn,sc,29644,D


In [48]:
lastHQs.head()

Unnamed: 0,abi,company,state,city,zipcode,address_line_1
17,158329,caridian bct,co,lakewood,80401,14143 denver west pkwy # 200
20,211946,family brands,tn,lenoir city,37771,1001 elm hill rd
23,258574,jade,pa,huntingdon vly,19006,3063 philmont ave
24,262493,montana metal products,il,des plaines,60018,25 howard ave
43,455154,o i,tx,college station,77845,151 graham rd


## Match on company name directly

In [49]:
nameMerge = chq.merge(lastHQs)
nameMerge.shape

(3344, 12)

In [50]:
nameMerge.head()

Unnamed: 0,gvkey,company,cstatadd1,cstatCity,cstatState,cstatZipcode,idbflag,abi,state,city,zipcode,address_line_1
0,1004,aar,"one aar place, 1100 north wood dale road",wood dale,il,60191,D,115523672,il,wood dale,60191,1100 n wood dale rd
1,1013,adc telecommunications,13625 technology drive,eden prairie,mn,55344,D,7523129,mn,eden prairie,55344,13625 technology dr
2,1045,american airlines,1 skyview drive,fort worth,tx,76155,B,7501711,tx,fort worth,76155,1 skyview dr
3,1050,ceco environmental,"14651 north dallas parkway, suite 500",dallas,tx,75254,D,596284992,tx,dallas,75254,14651 dallas pkwy # 500
4,1075,pinnacle west capital,"400 north fifth street, p.o. box 53999",phoenix,az,85072,D,4554051,az,phoenix,85004,400 n 5th st frnt


In [51]:
sum(nameMerge.cstatState == nameMerge.state)/nameMerge.shape[0]

0.9455741626794258

In [54]:
sum(nameMerge.cstatZipcode.str.slice(0,5) == nameMerge.zipcode.str.slice(0,5))/nameMerge.shape[0]

0.8609449760765551

In [55]:
sum(nameMerge.cstatZipcode.str.slice(0,1) == nameMerge.zipcode.str.slice(0,1))/nameMerge.shape[0]

0.9518540669856459

In [56]:
nameMerge[nameMerge.cstatCity != nameMerge.city][50:100]

Unnamed: 0,gvkey,company,cstatadd1,cstatCity,cstatState,cstatZipcode,idbflag,abi,state,city,zipcode,address_line_1
414,7620,murphy oil,"9805 katy freeway, suite g-200",houston,tx,77024,D,7525652,ar,el dorado,71730,300 e peach st
424,7849,new jersey resources,1415 wyckoff road,wall,nj,7719,D,878627181,nj,wall township,7727,1415 wyckoff rd
426,7881,newmont,"6900 east layton avenue, suite 700",denver,co,80237,B,7526734,co,greenwood vlg,80111,6363 s fiddlers green cir #800
435,7923,norfolk southern,650 west peachtree street nw,atlanta,ga,30308,D,7526965,va,norfolk,23510,3 commercial pl # 1a
440,7991,terex,"45 glover avenue, 4th floor",norwalk,ct,6850,D,437605223,ct,westport,6880,200 nyala farms rd # 2
445,8073,ocean bio-chem,4041 sw 47th avenue,fort lauderdale,fl,33314,D,443639711,fl,davie,33314,4041 sw 47th ave
499,8901,rpc,"2801 buford highway ne, suite 300",atlanta,ga,30329,D,3485588,ga,brookhaven,30329,2801 buford hwy ne # 300
516,9325,sl industries,"520 fellowship road, suite a-114",mount laurel,nj,8054,D,406295980,nj,mt laurel,8054,520 fellowship rd # a114
519,9372,st jude medical,one st. jude medical drive,st. paul,mn,55117,D,4349056,mn,st paul,55117,1 saint jude medical dr
520,9411,hillshire brands,3500 lacey road,downers grove,il,60515,D,7509318,il,chicago,60607,400 s jefferson st # 1n


Now focus down onto the companies that have not been matched.

In [57]:
chqUnmatched = chq[~chq.company.isin(nameMerge.company)].reset_index()
chqUnmatched.shape

(1800, 8)

In [58]:
igUnmatched  = lastHQs[~lastHQs.company.isin(nameMerge.company)].reset_index()
igUnmatched.shape

(56475, 7)

In [59]:
lastHQs.shape

(59805, 6)

# Find Distance

Two distance measures here. Look at top 5 matches and pull the distance measure and matches as well.

### Levenshtein

In [60]:
from Levenshtein import distance as levenshtein_distance

Find LD between the unmatched compustat companies and the unmatched IG ones. 

In [61]:
companyArrayCStat = []

company = chqUnmatched.company[0]
start = time.time()
for company in chqUnmatched.company:
    thisCompany = []
    for ig in igUnmatched.company:
        thisCompany.append(levenshtein_distance(company,ig))
    
    companyArrayCStat.append([thisCompany])

allLD = np.concatenate(companyArrayCStat)


In [62]:
igUnmatched.head()

Unnamed: 0,index,abi,company,state,city,zipcode,address_line_1
0,17,158329,caridian bct,co,lakewood,80401,14143 denver west pkwy # 200
1,20,211946,family brands,tn,lenoir city,37771,1001 elm hill rd
2,23,258574,jade,pa,huntingdon vly,19006,3063 philmont ave
3,24,262493,montana metal products,il,des plaines,60018,25 howard ave
4,43,455154,o i,tx,college station,77845,151 graham rd


Now pull the closest companies in IG to those in CStat. We'll first populate a dataframe with the name, address, city, state, and zip of each unmatched company in compustat, then we'll use the LD to find the same information for the closest company in IG.



There's some legacy code in here that finds the top 5 closest companies; but it doesn't populate the dataframe.

In [73]:
singleLargestLV   = (allLD).argsort(axis=-1)[:, :1]

# legacy
# n = 5
# largestElementsLV = (allLD).argsort(axis=-1)[:, :n]


companyMatches = pd.DataFrame()
companyMatches['cstatCompanies'] = chqUnmatched.company
companyMatches['cstatadd1']      = chqUnmatched.cstatadd1
companyMatches['cstatCity']      = chqUnmatched.cstatCity
companyMatches['cstatState']     = chqUnmatched.cstatState
companyMatches['cstatZip']       = chqUnmatched.cstatZipcode

for i in range(0,companyMatches.shape[0]):
    # print(list(np.array(hq.company)[largestElements[i]]))
    companyMatches.at[i,'misspelling']         = allLD[i,:][singleLargestLV[i]]
    companyMatches.at[i,'percMisspelled']      = companyMatches.misspelling[i]/len(companyMatches.cstatCompanies[i])
    companyMatches.at[i,'levCompany']          = igUnmatched.company[singleLargestLV[i]].iloc[0]
    
    companyMatches.at[i,'closestMatchIG_add']      = igUnmatched.address_line_1[singleLargestLV[i]].iloc[0]
    companyMatches.at[i,'closestMatchIG_city']     = igUnmatched.city[singleLargestLV[i]].iloc[0]
    companyMatches.at[i,'closestMatchIG_zipcode']  = igUnmatched.zipcode[singleLargestLV[i]].iloc[0]
    companyMatches.at[i,'closestMatchIG_state']    = igUnmatched.state[singleLargestLV[i]].iloc[0]

    
    # companyMatches.at[i,'closestMatchIG']      = np.array(igUnmatched.company)[largestElementsLV[i]]
    # companyMatches.at[i,'LevSim']              = np.array(allLD[i,:][largestElementsLV[i]], dtype=object)

In [74]:
companyMatches.head()

Unnamed: 0,cstatCompanies,cstatadd1,cstatCity,cstatState,cstatZip,misspelling,percMisspelled,levCompany,closestMatchIG_add,closestMatchIG_city,closestMatchIG_zipcode,closestMatchIG_state
0,avx,one avx boulevard,fountain inn,sc,29644,1.0,0.333333,avp,960 knox st # a,torrance,90502,ca
1,spire alabama,605 richard arrington boulevard north,birmingham,al,35203,6.0,0.461538,ire global,111 congress ave # 4th,austin,78701,tx
2,alabama power,600 north 18th street,birmingham,al,35203,4.0,0.307692,alstom power,200 great pond dr,windsor,6095,ct
3,tamir biothechnology,"51 jfk parkway, 1st floor west, suite 108",short hills,nj,7078,1.0,0.05,tamir biotechnology,11 deerpark dr # 204,monmouth jct,8852,nj
4,petro usa,7325 oswego road,liverpool,ny,13090,1.0,0.111111,etro usa,41 w 56th st,new york,10019,ny


Now get the embeddings and the cosine similarity between them.

In [80]:
def getMatrix(companyEmbeddings):
    companyArray = []
    
    for companies in companyEmbeddings:
        companyArray.append([companies.vector])
    
    companyArray = np.concatenate(companyArray)
    
    return(companyArray)
        

In [81]:
chqUnmatchedList = list(map(nlp, chqUnmatched.company))
allCompaniesIG   = list(map(nlp, igUnmatched.company))


cstat = getMatrix(chqUnmatchedList)
ig    = getMatrix(allCompaniesIG)

allSimilarities = cosine_similarity(cstat,ig)

allSimilarities.shape

(1800, 56475)

In [82]:
outfile =  '../../data/allCompaniesIG_embeddings.pkl'
with open(outfile, 'wb') as pickle_file:
    pkl.dump(ig, pickle_file)
    
outfile =  '../../data/allCompaniesCStat_embeddings.pkl'
with open(outfile, 'wb') as pickle_file:
    pkl.dump(cstat, pickle_file)    

Each row n here has the similarity between the nth company name in compustat and the IG company corresp to that column.

In [83]:
allSimilarities[0:5,:]

array([[ 0.27351984, -0.16348277,  0.02695994, ..., -0.06730537,
         0.09521193, -0.00591509],
       [ 0.20687255,  0.05328101,  0.24833122, ...,  0.31417164,
         0.46006292,  0.2882711 ],
       [-0.01470069,  0.27651983,  0.19719386, ...,  0.45301446,
         0.3847386 ,  0.36670434],
       [ 0.24357395, -0.17722532,  0.09829525, ..., -0.095217  ,
         0.07805435, -0.07496908],
       [ 0.12485278,  0.12257036,  0.17973134, ...,  0.23482014,
         0.3301328 ,  0.3126555 ]], dtype=float32)

Find indices of companies in IG most similar to each company in CStat.

In [85]:
singleLargestCos   = (-allSimilarities).argsort(axis=-1)[:, :1]

# legacy - largest n
# largestElementsCos = (-allSimilarities).argsort(axis=-1)[:, :n]


Add the cosine similarity measures to the similarity dataset.

In [86]:
for i in range(0,companyMatches.shape[0]):
    # print(list(np.array(hq.company)[largestElements[i]]))
    companyMatches.at[i,'cosSimilarity']        = allSimilarities[i,:][singleLargestCos[i]]
    companyMatches.at[i,'cosSimilarityCompany'] = igUnmatched.company[singleLargestCos[i]].iloc[0]
    # companyMatches.at[i,'closestMatchCosine']   = np.array(igUnmatched.company)[largestElementsCos[i]]
    # companyMatches.at[i,'cosineSim']            = np.array(allSimilarities[i,:][largestElementsCos[i]], dtype=object)
    
    
    companyMatches.at[i,'costMatchIG_add']     = igUnmatched.address_line_1[singleLargestCos[i]].iloc[0]
    companyMatches.at[i,'cosMatchIG_city']     = igUnmatched.city[singleLargestCos[i]].iloc[0]
    companyMatches.at[i,'cosMatchIG_zipcode']  = igUnmatched.zipcode[singleLargestCos[i]].iloc[0]
    companyMatches.at[i,'cosMatchIG_state']    = igUnmatched.state[singleLargestCos[i]].iloc[0]
    

In [87]:
sum((companyMatches.levCompany == companyMatches.cosSimilarityCompany))

321

## Start Matching

### Take 1: Match + Zip or City

Now find the company matches: ABI - gvkey link.

Start with ones where the names both match.

If the cities or zipcodes match on one of the closest companies (LD or cos), it seems like it is good to go.


Do this in steps to start, at least. First find the companies where both match and either zip or city match. Then find companies where only one matches.

In [132]:
bothMatch_cityZip = companyMatches[(companyMatches.levCompany == companyMatches.cosSimilarityCompany) & \
              ((companyMatches.cstatCity == companyMatches.closestMatchIG_city) | \
              (companyMatches.cstatZip == companyMatches.closestMatchIG_zipcode))]
bothMatch_cityZip.shape

bothMatch_cityZip.to_csv("../../data/companyData/bothMatch_cityZip.csv")

In [133]:
bothMatch_cityZip['igCompanies'] = bothMatch_cityZip.levCompany
companiesToCheck                 = bothMatch_cityZip[['cstatCompanies','igCompanies']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [134]:
companyMatchesBoth = list(bothMatch_cityZip.cstatCompanies.unique())
len(companyMatchesBoth)

238

Grab the single company match versions.

In [135]:
oneMatch_cityZipOnly.head()

Unnamed: 0,cstatCompanies,cstatadd1,cstatCity,cstatState,cstatZip,misspelling,percMisspelled,levCompany,closestMatchIG_add,closestMatchIG_city,closestMatchIG_zipcode,closestMatchIG_state,cosSimilarity,cosSimilarityCompany,costMatchIG_add,cosMatchIG_city,cosMatchIG_zipcode,cosMatchIG_state,igCompanies
0,beam,222 west merchandise mart plaza,chicago,il,60654,1.0,0.25,team,13131 dairy ashford rd # 600,sugar land,77478,tx,1.0,beam suntory,222 merchandise mart plz #1600,chicago,60654,il,beam suntory
1,arizona public service,400 north fifth street,phoenix,az,85072,8.0,0.363636,republic services,18500 n allied way # 100,phoenix,85054,az,0.842057,california public employees,400 q st,sacramento,95811,ca,republic services
2,southern gas,"ten peachtree place, n.e.",atlanta,ga,30309,2.0,0.166667,southern its,9101 w sahara ave # 105,las vegas,89117,nv,0.92152,gas south,3625 cumberland blvd se # 1500,atlanta,30339,ga,gas south
3,a v homes,"4900 north scottsdale road, suite 2000",scottsdale,az,85251,1.0,0.111111,av homes,8601 n scottsdale rd # 225,scottsdale,85253,az,0.712529,aronov new homes,3500 eastern blvd # 100,montgomery,36116,al,av homes
4,balchem,52 sunrise park road,new hampton,ny,10958,1.0,0.125,balchem,52 sunrise park rd,new hampton,10958,ny,0.0,caridian bct,14143 denver west pkwy # 200,lakewood,80401,co,balchem


In [136]:
oneMatch_cityZipOnly = companyMatches[-(companyMatches.cstatCompanies.isin(companyMatchesBoth)) & \
              (((companyMatches.cstatCity == companyMatches.closestMatchIG_city) | \
              (companyMatches.cstatZip == companyMatches.closestMatchIG_zipcode)) | \
              ((companyMatches.cstatCity == companyMatches.cosMatchIG_city) | \
              (companyMatches.cstatZip == companyMatches.cosMatchIG_zipcode)))].reset_index(drop=True)

oneMatch_cityZipOnly['igCompanies'] = ''

for i in range(0,oneMatch_cityZipOnly.shape[0]):
    if ((oneMatch_cityZipOnly.cstatCity[i] == oneMatch_cityZipOnly.closestMatchIG_city[i]) | \
              (oneMatch_cityZipOnly.cstatZip[i] == oneMatch_cityZipOnly.closestMatchIG_zipcode[i])):
        oneMatch_cityZipOnly.loc[i,'igCompanies'] = oneMatch_cityZipOnly.levCompany[i]
    else:
        oneMatch_cityZipOnly.loc[i,'igCompanies'] = oneMatch_cityZipOnly.cosSimilarityCompany[i]

# oneMatch_cityZipOnly.to_csv("../../data/companyData/oneMatch_cityZipOnly.csv")

In [137]:
oneMatch_cityZipOnly.head()

Unnamed: 0,cstatCompanies,cstatadd1,cstatCity,cstatState,cstatZip,misspelling,percMisspelled,levCompany,closestMatchIG_add,closestMatchIG_city,closestMatchIG_zipcode,closestMatchIG_state,cosSimilarity,cosSimilarityCompany,costMatchIG_add,cosMatchIG_city,cosMatchIG_zipcode,cosMatchIG_state,igCompanies
0,beam,222 west merchandise mart plaza,chicago,il,60654,1.0,0.25,team,13131 dairy ashford rd # 600,sugar land,77478,tx,1.0,beam suntory,222 merchandise mart plz #1600,chicago,60654,il,beam suntory
1,arizona public service,400 north fifth street,phoenix,az,85072,8.0,0.363636,republic services,18500 n allied way # 100,phoenix,85054,az,0.842057,california public employees,400 q st,sacramento,95811,ca,republic services
2,southern gas,"ten peachtree place, n.e.",atlanta,ga,30309,2.0,0.166667,southern its,9101 w sahara ave # 105,las vegas,89117,nv,0.92152,gas south,3625 cumberland blvd se # 1500,atlanta,30339,ga,gas south
3,a v homes,"4900 north scottsdale road, suite 2000",scottsdale,az,85251,1.0,0.111111,av homes,8601 n scottsdale rd # 225,scottsdale,85253,az,0.712529,aronov new homes,3500 eastern blvd # 100,montgomery,36116,al,av homes
4,balchem,52 sunrise park road,new hampton,ny,10958,1.0,0.125,balchem,52 sunrise park rd,new hampton,10958,ny,0.0,caridian bct,14143 denver west pkwy # 200,lakewood,80401,co,balchem


In [138]:
companiesToCheck = companiesToCheck.append(oneMatch_cityZipOnly[['cstatCompanies','igCompanies']]).drop_duplicates()

companiesToCheck.shape

(600, 2)

In [139]:
companiesToCheck[0:50]

Unnamed: 0,cstatCompanies,igCompanies
8,american science engineering,american science & engineering
11,apco oil and gas,apco oil & gas
13,arden,arden
16,arts way mfg,art's way mfg
21,constellation energy grp,constellation energy
26,bassett furniture inds,bassett furniture industries
32,non invasive monitor,non-invasive monitoring systs
37,brown forman,brown-forman
41,cagle's,cagle's
42,california water service gp,california water svc


2x check that there are no duplicates here.

In [141]:
duplicates = companiesToCheck.cstatCompanies.value_counts().index[companiesToCheck.cstatCompanies.value_counts() > 1]

companiesToCheck[companiesToCheck.cstatCompanies.isin(duplicates)]

Unnamed: 0,cstatCompanies,igCompanies


In [142]:
chqStillUnmatched = chqUnmatched[-chqUnmatched.company.isin(companiesToCheck.cstatCompanies)].reset_index(drop=True)
igStillUnmatched  = igUnmatched[-igUnmatched.company.isin(companiesToCheck.igCompanies)].reset_index(drop=True)

# companyMatches['cstatCompanies'] = chqUnmatched.company
print(chqUnmatched.shape, chqStillUnmatched.shape, companiesToCheck.shape)
print(igStillUnmatched.shape,igUnmatched.shape)

(1800, 8) (1200, 8) (600, 2)
(55892, 7) (56475, 7)


In [143]:
companiesToCheck.to_csv("../../data/companyData/companiesToCheck_cityZip.csv")

## Take 2
Match remaining ones on first word of name

In [144]:
chqUnmatched.company[0].split(' ')[0]

'avx'

Get the edit distance for the first words of the company names.

In [145]:
companyArrayCStat = []

start = time.time()
for company in chqStillUnmatched.company:
    thisCompany = []
    for ig in igStillUnmatched.company:
        thisCompany.append(levenshtein_distance(company.split(' ')[0],ig.split(' ')[0]))
    
    companyArrayCStat.append([thisCompany])

allLD = np.concatenate(companyArrayCStat)


And the cosine distance.

In [146]:
chqStillUnmatchedFirstCo = []
igStillUnmatchedFirstCo = []

for company in chqStillUnmatched.company:
    chqStillUnmatchedFirstCo.append(company.split(' ')[0])
    
for company in igStillUnmatched.company:
    igStillUnmatchedFirstCo.append(company.split(' ')[0])

In [147]:
chqUnmatchedList = list(map(nlp, chqStillUnmatchedFirstCo))
allCompaniesIG = list(map(nlp, igStillUnmatchedFirstCo))


cstat = getMatrix(chqUnmatchedList)
ig    = getMatrix(allCompaniesIG)

allSimilarities = cosine_similarity(cstat,ig)

allSimilarities.shape

# largestElementsCos = (-allSimilarities).argsort(axis=-1)[:, :n]
singleLargestCos   = (-allSimilarities).argsort(axis=-1)[:, :1]

In [148]:
igStillUnmatched.head()

Unnamed: 0,index,abi,company,state,city,zipcode,address_line_1
0,17,158329,caridian bct,co,lakewood,80401,14143 denver west pkwy # 200
1,20,211946,family brands,tn,lenoir city,37771,1001 elm hill rd
2,23,258574,jade,pa,huntingdon vly,19006,3063 philmont ave
3,24,262493,montana metal products,il,des plaines,60018,25 howard ave
4,43,455154,o i,tx,college station,77845,151 graham rd


In [None]:
Let's try a slightly different tact:
- Find all n companies ith similar first names as 

In [189]:
n = 10

largestElementsLV = (allLD).argsort(axis=-1)[:, :n]
largestElementsLV.shape

(1200, 10)

In [149]:
singleLargestLV   = (allLD).argsort(axis=-1)[:, :1]
# largestElementsLV = (allLD).argsort(axis=-1)[:, :n]


companyMatches2 = pd.DataFrame()
companyMatches2['cstatCompanies'] = chqStillUnmatched.company
companyMatches2['cstatadd1']      = chqStillUnmatched.cstatadd1
companyMatches2['cstatCity']      = chqStillUnmatched.cstatCity
companyMatches2['cstatState']     = chqStillUnmatched.cstatState
companyMatches2['cstatZip']       = chqStillUnmatched.cstatZipcode

for i in range(0,companyMatches2.shape[0]):
    # print(list(np.array(hq.company)[largestElements[i]]))
    companyMatches2.at[i,'misspelling']         = allLD[i,:][singleLargestLV[i]]
    companyMatches2.at[i,'levCompany']          = igStillUnmatched.company[singleLargestLV[i]].iloc[0]
    
    companyMatches2.at[i,'closestMatchIG_add']      = igStillUnmatched.address_line_1[singleLargestLV[i]].iloc[0]
    companyMatches2.at[i,'closestMatchIG_city']     = igStillUnmatched.city[singleLargestLV[i]].iloc[0]
    companyMatches2.at[i,'closestMatchIG_zipcode']  = igStillUnmatched.zipcode[singleLargestLV[i]].iloc[0]
    companyMatches2.at[i,'closestMatchIG_state']    = igStillUnmatched.state[singleLargestLV[i]].iloc[0]

    
    companyMatches2.at[i,'cosSimilarity']        = allSimilarities[i,:][singleLargestCos[i]]
    companyMatches2.at[i,'cosSimilarityCompany'] = igStillUnmatched.company[singleLargestCos[i]].iloc[0]
   
    companyMatches2.at[i,'costMatchIG_add']     = igStillUnmatched.address_line_1[singleLargestCos[i]].iloc[0]
    companyMatches2.at[i,'cosMatchIG_city']     = igStillUnmatched.city[singleLargestCos[i]].iloc[0]
    companyMatches2.at[i,'cosMatchIG_zipcode']  = igStillUnmatched.zipcode[singleLargestCos[i]].iloc[0]
    companyMatches2.at[i,'cosMatchIG_state']    = igStillUnmatched.state[singleLargestCos[i]].iloc[0]

    

In [150]:
companyMatches2.head()

Unnamed: 0,cstatCompanies,cstatadd1,cstatCity,cstatState,cstatZip,misspelling,levCompany,closestMatchIG_add,closestMatchIG_city,closestMatchIG_zipcode,closestMatchIG_state,cosSimilarity,cosSimilarityCompany,costMatchIG_add,cosMatchIG_city,cosMatchIG_zipcode,cosMatchIG_state
0,avx,one avx boulevard,fountain inn,sc,29644,1.0,apx,4931 n 300 w,provo,84604,ut,0.452242,protech home medical,1019 town dr,highland heights,41076,ky
1,spire alabama,605 richard arrington boulevard north,birmingham,al,35203,0.0,spire digital,940 n lincoln st # 3,denver,80203,co,1.0,spire digital,940 n lincoln st # 3,denver,80203,co
2,alabama power,600 north 18th street,birmingham,al,35203,0.0,alabama state senate,11 s union st,montgomery,36130,al,1.0,alabama state banking dept,401 adams ave # 680,montgomery,36104,al
3,tamir biothechnology,"51 jfk parkway, 1st floor west, suite 108",short hills,nj,7078,0.0,tamir biotechnology,11 deerpark dr # 204,monmouth jct,8852,nj,1.0,tamir biotechnology,11 deerpark dr # 204,monmouth jct,8852,nj
4,petro usa,7325 oswego road,liverpool,ny,13090,0.0,petro energy,920 10th ave n,onalaska,54650,wi,1.0,petro skills,2930 s yale ave,tulsa,74114,ok


Find if city or zip match here.

In [152]:
match2_cityZips = companyMatches2[(((companyMatches2.cstatCity == companyMatches2.closestMatchIG_city) | \
              (companyMatches2.cstatZip == companyMatches2.closestMatchIG_zipcode)) | \
              ((companyMatches2.cstatCity == companyMatches2.cosMatchIG_city) | \
              (companyMatches2.cstatZip == companyMatches2.cosMatchIG_zipcode)))].reset_index(drop=True)


In [198]:
match2_cityZips[50:100]

Unnamed: 0,cstatCompanies,cstatadd1,cstatCity,cstatState,cstatZip,misspelling,levCompany,closestMatchIG_add,closestMatchIG_city,closestMatchIG_zipcode,closestMatchIG_state,cosSimilarity,cosSimilarityCompany,costMatchIG_add,cosMatchIG_city,cosMatchIG_zipcode,cosMatchIG_state
50,aterian,"37 east 18th street, 7th floor",new york,ny,10003,0.0,aterian investment partners,11 e 44th st # 1803,new york,10017,ny,0.0,caridian bct,14143 denver west pkwy # 200,lakewood,80401,co
51,stabilis solutions,"11750 katy freeway, suite 900",houston,tx,77079,0.0,stabilis energy,10375 richmond ave # 700,houston,77042,tx,0.0,caridian bct,14143 denver west pkwy # 200,lakewood,80401,co
52,phathom pharma,"100 campus drive, suite 102",florham park,nj,7932,0.0,phathom pharmaceuticals,100 campus dr # 102,florham park,7932,nj,0.0,caridian bct,14143 denver west pkwy # 200,lakewood,80401,co
53,brookfield renewable,"250 vesey street, 15th floor",new york,ny,10281,0.0,brookfield office properties,200 vesey st # 11,new york,10281,ny,1.0,brookfield engineering lab,11 commerce blvd,middleboro,2346,ma
54,pactiv evergreen,1900 west field court,lake forest,il,60045,0.0,pactiv,1900 w field ct,lake forest,60045,il,0.0,caridian bct,14143 denver west pkwy # 200,lakewood,80401,co
55,aarons (the),"400 galleria parkway se, suite 300",atlanta,ga,30339,1.0,aaron's,400 galleria pkwy se # 300,atlanta,30339,ga,0.478533,garys,923 newport center dr,newport beach,92660,ca
56,petco health & wellness,10850 via frontera,san diego,ca,92127,0.0,petco animal supplies,10850 via frontera,san diego,92127,ca,1.0,petco animal supplies,10850 via frontera,san diego,92127,ca
57,latham,787 watervliet shaker road,latham,ny,12110,0.0,latham & watkins llp,355 s grand ave # 100,los angeles,90071,ca,1.0,latham plastics,787 watervliet shaker rd,latham,12110,ny
58,torrid,18501 east san jose avenue,city of industry,ca,91748,2.0,morris,15125 proctor ave,city of industry,91746,ca,0.605563,sizzling wok,999 s washington st,north attleboro,2760,ma
59,warby parker,"233 spring street, 6 th floor east",new york,ny,10013,1.0,darby & darby,7 world trade ctr,new york,10007,ny,0.4135,hardison &chran,7340 six forks rd # 220,raleigh,27615,nc


In [154]:
match2_cityZips.shape

(88, 17)

In [155]:
match2_cityZips.to_csv("../../data/companyData/match2_cityZips.csv")

## Take 3
Try the addresses here.

Let's try something similar:
- Find top 10 most similar addresses by cos sim
- Find top 10 most similar addresses by LD
- Find unique union of these two
- Record LD and cos sim for each
- Filter for totally dissimilar ones
- ``Explode'' the dataset so we have cstat company, address
- Find first word LD and cos sim
- Find total LD and cos sim

In [199]:
n = 10

In [200]:
chqStillUnmatched['cstatadd1']     = chqStillUnmatched.cstatadd1.astype(str)
igStillUnmatched['address_line_1'] = igStillUnmatched.address_line_1.astype(str)

In [201]:
addressArrayCStat = []

start = time.time()
for address in chqStillUnmatched.cstatadd1:
    thisAddress = []
    for ig in igStillUnmatched.address_line_1:
        thisAddress.append(levenshtein_distance(str(address),str(ig)))
    
    addressArrayCStat.append([thisAddress])

allLD = np.concatenate(addressArrayCStat)

singleLargestLV   = (allLD).argsort(axis=-1)[:, :1]
largestElementsLV = (allLD).argsort(axis=-1)[:, :n]


In [202]:
chqUnmatchedList = list(map(nlp, chqStillUnmatched['cstatadd1']))
allCompaniesIG   = list(map(nlp, igStillUnmatched['address_line_1']))


cstat = getMatrix(chqUnmatchedList)
ig    = getMatrix(allCompaniesIG)

allSimilarities = cosine_similarity(cstat,ig)

allSimilarities.shape

singleLargestCos   = (-allSimilarities).argsort(axis=-1)[:, :1]
largestElementsCos = (-allSimilarities).argsort(axis=-1)[:, :n]

In [256]:
companyMatches3 = pd.DataFrame()
companyMatches3['cstatCompanies'] = chqStillUnmatched.company

Let's grab the set of the 10 closest addresses by LD and cos.

In [257]:
companyMatches3.head()

Unnamed: 0,cstatCompanies
0,avx
1,spire alabama
2,alabama power
3,tamir biothechnology
4,petro usa


Find the unique values in here.

In [275]:
for i in range(0,companyMatches3.shape[0]):
    
    # find all the closest LV and cos addresses and put them in a 
    companyMatches3.at[i,'closestAdds_indices']  = set(largestElementsLV[i]).union(set(largestElementsCos[i]))    

And explode it so one index per line.

In [276]:
companyMatches3_indices = companyMatches3.explode('closestAdds_indices').reset_index(drop=True)

Now get the companies, the cosine similarities, and the levenshtein distances.

In [271]:
companyMatches3_indices.closestAdds_indices

Unnamed: 0,cstatCompanies,closestAdds_indices
0,avx,28865
1,avx,55494
2,avx,16393
3,avx,27856
4,avx,38490


Get the embeddings for the cosine similarity.

In [287]:
for i in range(0,companyMatches3_indices.shape[0]):
    companyMatches3_indices.at[i,'igCompanies']  = igStillUnmatched.company[companyMatches3_indices.closestAdds_indices[i]] # .iloc[0]

    companyMatches3_indices.at[i,'lv']           = levenshtein_distance(companyMatches3_indices.cstatCompanies[i],companyMatches3_indices.igCompanies[i])
    companyMatches3_indices.at[i,'percMisspelled']       = companyMatches3_indices.lv[i]/len(companyMatches3_indices.cstatCompanies[i])

    # companyMatches3.at[i,'add_percMisspelled'] = companyMatches2.add_misspelling[i]/len(companyMatches2['cstatadd1'][i])


Get the company embeddings here.

In [290]:
cMatches  = companyMatches3_indices[['cstatCompanies']].drop_duplicates()
igMatches = companyMatches3_indices[['igCompanies']].drop_duplicates()



In [301]:
def getVector(text):
    embedding = nlp(text)
    
    return(embedding.vector)

In [303]:
cStatEmbeddings = list(map(getVector, cMatches.cstatCompanies))
igEmbeddings    = list(map(getVector, igMatches.igCompanies))

In [304]:
igMatches['igEmbedding']       = igEmbeddings
cMatches['cstatEmbedding']     = cStatEmbeddings

In [309]:
companyMatches3_indices = companyMatches3_indices.merge(igMatches).merge(cMatches)

In [311]:
companyMatches3_indices.shape

(22277, 7)

Loop through and get the cosine similarity.

In [313]:
for i in range(0,companyMatches3_indices.shape[0]):
    companyMatches3_indices.at[i,'cosSim']  = cosine_similarity([companyMatches3_indices.igEmbedding[i]],
                                                                [companyMatches3_indices.cstatEmbedding[i]])


In [350]:
filtered = companyMatches3_indices[((companyMatches3_indices.percMisspelled < 0.4) | \
                        (companyMatches3_indices.cosSim > 0.7))]

In [351]:
filtered[['cstatCompanies','igCompanies','percMisspelled','cosSim']].to_csv("../../data/companyData/companyMatches3_indices.csv")

In [352]:
filtered.head()

Unnamed: 0,cstatCompanies,closestAdds_indices,igCompanies,lv,percMisspelled,igEmbedding,cstatEmbedding,cosSim
410,safestitch medical,44186,opko health,16.0,0.888889,"[-0.164405, 0.10554, 0.021776, 0.069895, -0.26...","[-0.10295, -0.06815, 0.017488, 0.149165, -0.06...",0.708606
1266,advanced envir recycl,3209,advanced environmental rcyclng,11.0,0.5,"[-0.20946334, 0.0027800004, 0.07977566, 0.0587...","[-0.104956664, -0.012522667, 0.095026664, 0.04...",0.790612
1538,synthorx,32645,synterra,3.0,0.375,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
2270,golden state water,11145,golden state bank,4.0,0.222222,"[-0.076966666, 0.10898667, 0.33576334, -0.1391...","[0.11373501, 0.075163335, 0.47196665, -0.20447...",0.827425
3121,atlantic city electric,26711,fuji electric of america,18.0,0.818182,"[0.0013964921, 0.16315, 0.182865, -0.4701525, ...","[0.37305912, 0.04321533, 0.33236334, -0.473356...",0.739828


In [None]:
Below 0.6 threshold: 
    northern tech
nac global technologies 
rvl pharmaceuticals
capstone green energy
american res
intl flavors & fragrances
pope resources/de 
sanfilippo john b&son
reneo pharma

757

In [355]:
(chqUnmatched.shape[0] - (filtered.shape[0] + match2_cityZips.shape[0] + companiesToCheck.shape[0]))/chq.shape[0]

0.20414954002740263