In [7]:
import pickle as pkl
import pandas as pd
import matplotlib
import os
import re

import collections
import datetime
import time

import geopandas as gpd

import numpy as np

import rasterio

import spacy
  
nlp = spacy.load('en_core_web_lg')
from sklearn.metrics.pairwise import cosine_similarity

import itertools

# Breakpoint
We can start from here and just clean everything from here.


We are going to try to match every company that is headquartered in the US.

In [81]:
chq     = pd.read_csv("../../data/chq.csv",dtype={'cstatZipcode': 'object'}).drop(columns = {'Unnamed: 0'})

chq.head()

Unnamed: 0,gvkey,companyName,add1,city,state,cstatZipcode
0,1004,AAR CORP,"One AAR Place, 1100 North Wood Dale Road",Wood Dale,IL,60191
1,1082,SERVIDYNE INC,"1945 The Exchange, Suite 325",Atlanta,GA,30339
2,1244,ALCIDE CORP,8561 154th Avenue North East,Redmond,WA,98052
3,1258,CAPCO ENERGY INC,"1800 West Loop South, Suite 1950",Houston,TX,77027
4,1331,ALPINE GROUP INC,One Meadowlands Plaza,East Rutherford,NJ,7073


In [97]:
hqsOnly     = pd.read_csv("../../data/ig_uniqueHQs.csv").drop(columns = {'Unnamed: 0'})
hqsWithYear = pd.read_csv("../../data/ig_uniqueHQs_multLocations.csv",dtype={'zipcode': 'object'})[['abi','company',
                                                                             'archive_version_year',
                                                                             'state','city','zipcode','address_line_1']]

print(hqsOnly.head())
print(hqsWithYear.head())

hqsWithYear = hqsWithYear[hqsWithYear.archive_version_year <= 2020]

hqsWithYear['last_year'] = hqsWithYear.groupby(['abi'])['archive_version_year'].transform(max)

print(hqsWithYear.shape)

lastHQs = hqsWithYear[hqsWithYear.archive_version_year == hqsWithYear.last_year][['abi','company','state','city','zipcode','address_line_1']]

print(lastHQs.shape)

lastHQs.head()

     abi                         company
0   7609            SOLITRON DEVICES INC
1  23077                JENNY LEE BAKERY
2  76547          MASTER PROTECTION CORP
3  77743  NATIONAL TECHNICAL SYSTEMS INC
4  89151         HILLTOP BASIC RESOURCES
     abi                         company  archive_version_year state  \
0   7609            SOLITRON DEVICES INC                2003.0    FL   
1  23077                JENNY LEE BAKERY                2003.0    PA   
2  76547          MASTER PROTECTION CORP                2003.0    FL   
3  77743  NATIONAL TECHNICAL SYSTEMS INC                2003.0    CA   
4  89151         HILLTOP BASIC RESOURCES                2003.0    OH   

              city zipcode             address_line_1  
0  WEST PALM BEACH   33407   3301 ELECTRONICS WAY # C  
1    MC KEES ROCKS   15136             620 ISLAND AVE  
2       FORT MYERS   33907  12800 UNIVERSITY DR # 400  
3        CALABASAS   91302   24007 VENTURA BLVD # 200  
4       CINCINNATI   45202          1 W 4

Unnamed: 0,abi,company,state,city,zipcode,address_line_1
28,435388,MBC HOLDING CO,MN,ST PAUL,55102,882 7TH ST W
40,479535,CMP INDUSTRIES LLC,NY,ALBANY,12207,413 N PEARL ST
67,595777,SEACAT,MS,HOLLANDALE,38748,1616 RICE MILL RD
73,602789,AMI SEMICONDUCTOR INC,ID,POCATELLO,83201,2300 W BUCKSKIN RD
90,651166,OSMONICS INC,MN,HOPKINS,55343,5951 CLEARWATER DR


In [98]:
lastHQs.to_csv("../../data/companyData/lastHQs.csv")

Only two of these company names appear 2x, which is good. There are ~20,000 companies in this sample.

Let's go through a little bit of a process here:
- Find the exact matches.
- Get a similarity measure between ; ideally something vectorized / something in matrix math.
- Find the top 10 matches for the remaining ones.
- Do some mix and match and see if there's any threshold at which matches become similar ``enough'' to say this is okay and good to go.


We might be able to use the fact that all of the addresses should be the same after some given point, as the compustat addresses are only the most recent ones. 

Let's try a few different ways to match these up.

First, let's find the exact matches.

Make a generic cleaning function that strips out all company names, any punctuation in the name, and makes everything lower case.

In [99]:
def cleanText(text):
    text = text.strip().\
    replace(" CORP","").replace(" CO","").replace(" INC","").\
    replace(" LTD","").replace(" -CL A","").\
    replace(" -LP","").replace(" LP","").\
    replace("-OLD","").replace(" LLC","").\
    replace(", LLC.","").replace(" L.L.C.","").replace(" L.P.","").\
    replace(" L.TD","").replace(" L.L.C.","").replace(" -CL B","").\
    replace(" -CL B","").replace(" -CL i","").replace(" -CL","").\
    replace("-REDH","").replace(" CP","").\
    replace("-ADR","").replace(" PLC","").lower().replace(r'[^\w\s]+', '').\
    replace('-lp','').replace('-spn','').replace('hldg','').replace(' intl','').\
    replace('holdings','').replace('holding','').replace('prtnr','').replace('group','').\
    replace(" med ", " medical ").replace(" tradng ", " trading ").replace("gen ", "general ").\
    replace(" mtr ", " motors ").replace(" motor ", " motors ").replace("-", " ").\
    replace("/", " ").replace("'", " ").replace("&", " ").replace(" a g ", " ").\
    replace(" ag ", " ").replace("  adr ", " ").replace(" adr ", " ").replace("  cp ", " ").\
    replace(" cp ", " ").replace(" plc ", " ").replace(" intl ", " ").replace(" ent ", " ").\
    replace(" nv ", " ").replace(" n.v. ", " ").replace(" worldwide ", " ").\
    replace(" wldwide ", " ").replace(" banc"," bank").replace("^banc","bank")
    
    return text

In [100]:
chq['company']               = list(map(cleanText, chq.company))
lastHQs['company']           = list(map(cleanText, lastHQs.company))

chq.rename(columns = {'city': 'cstatCity',
                     'state': 'cstatState',
                     'add1': 'cstatadd1'}, inplace = True)

chq['cstatCity']  = chq.cstatCity.str.lower()
chq['cstatState'] = chq.cstatState.str.lower()
chq['cstatadd1']  = chq.cstatadd1.str.lower()

lastHQs['city']            = lastHQs.city.str.lower()
lastHQs['state']           = lastHQs.state.str.lower()
lastHQs['address_line_1']  = lastHQs.address_line_1.str.lower()

NAICS names do not match up between compustat and infogroup so they're not helpful.

In [101]:
chq.head()

Unnamed: 0,gvkey,company,cstatadd1,cstatCity,cstatState,cstatZipcode
0,1004,aar,"one aar place, 1100 north wood dale road",wood dale,il,60191
1,1010,acf industries,101 clark street,saint charles,mo,63301
2,1013,adc telecommunications,13625 technology drive,eden prairie,mn,55344
3,1019,afa protective systems,155 michael drive,syosset,ny,11791
4,1021,afp imaging,"8 westchester plaza, suite 112",elmsford,ny,10523


In [102]:
lastHQs.head()

Unnamed: 0,abi,company,state,city,zipcode,address_line_1
28,435388,mbc,mn,st paul,55102,882 7th st w
40,479535,cmp industries,ny,albany,12207,413 n pearl st
67,595777,seacat,ms,hollandale,38748,1616 rice mill rd
73,602789,ami semiconductor,id,pocatello,83201,2300 w buckskin rd
90,651166,osmonics,mn,hopkins,55343,5951 clearwater dr


## Match on company name directly

In [103]:
nameMerge = chq.merge(lastHQs)
nameMerge.shape

(8664, 11)

In [104]:
nameMerge.head()

Unnamed: 0,gvkey,company,cstatadd1,cstatCity,cstatState,cstatZipcode,abi,state,city,zipcode,address_line_1
0,1004,aar,"one aar place, 1100 north wood dale road",wood dale,il,60191,115523672,il,wood dale,60191,1100 n wood dale rd
1,1013,adc telecommunications,13625 technology drive,eden prairie,mn,55344,7523129,mn,eden prairie,55344,13625 technology dr
2,1019,afa protective systems,155 michael drive,syosset,ny,11791,317110880,ny,syosset,11791,155 michael dr
3,1037,ammmunications,1900 am drive,quakertown,pa,18951,1917483,pa,quakertown,18951,1900 am dr
4,1038,amc entertainment,920 main st,kansas city,mo,64105,387746266,ks,leawood,66211,11500 ash st


In [105]:
nameMerge.to_csv("../../data/companyData/nameMerge.csv")

In [106]:
sum(nameMerge.cstatState == nameMerge.state)/nameMerge.shape[0]

0.9105493998153278

In [107]:
sum(nameMerge.cstatZipcode.str.slice(0,5) == nameMerge.zipcode.str.slice(0,5))/nameMerge.shape[0]

0.7957063711911357

In [108]:
sum(nameMerge.cstatZipcode.str.slice(0,1) == nameMerge.zipcode.str.slice(0,1))/nameMerge.shape[0]

0.9190904893813481

In [109]:
nameMerge[nameMerge.cstatCity != nameMerge.city][50:100]

Unnamed: 0,gvkey,company,cstatadd1,cstatCity,cstatState,cstatZipcode,abi,state,city,zipcode,address_line_1
241,2951,delphax technologies,"5775 west old shakopee road, suite 80",bloomington,mn,55437,3621109,mn,minneapolis,55437,5775 w old shakopee rd # 80
242,2953,checkpoint systems,101 wolf drive,thorofare,nj,8086,3445046,nj,west deptford,8086,101 wolf dr
243,2955,chefs international,62 broadway,point pleasant beach,nj,8742,877995605,nj,pt pleasant bch,8742,62 broadway
256,3048,ciprico,"7003 west lake street, suite 400",st louis park,mn,55426,3608908,mn,minneapolis,55426,7003 w lake st # 400
258,3062,cintas,"6800 cintas boulevard, po box 625737",cincinnati,oh,45262,407717271,oh,mason,45040,6800 cintas blvd
263,3105,iheartmedia,20880 stone oak parkway,san antonio,tx,78258,426840329,ny,new york,10013,32 avenue of the americas # 1
267,3116,clinical data,"one gateway center, suite 702",newton,ma,2458,490516424,ri,smithfield,2917,2 thurber blvd
269,3126,all american,910 summa drive,elkhart,in,46516,657439147,ga,atlanta,30327,1380 w paces ferry rd nw #2180
274,3164,cole national,4000 luxottica place,mason,oh,45040,7508302,oh,twinsburg,44087,1925 enterprise pkwy
275,3165,ranger industries,3400 82nd way north,st petersburg,fl,33710,988451753,nj,tinton falls,7724,15 park rd


Now focus down onto the companies that have not been matched.

In [110]:
chqUnmatched = chq[~chq.company.isin(nameMerge.company)].reset_index()
chqUnmatched.shape

(11755, 7)

In [111]:
igUnmatched  = lastHQs[~lastHQs.company.isin(nameMerge.company)].reset_index()
igUnmatched.shape

(60595, 7)

In [112]:
lastHQs.shape

(69186, 6)

# Find Distance

Two distance measures here. Look at top 5 matches and pull the distance measure and matches as well.

### Levenshtein

In [113]:
from Levenshtein import distance as levenshtein_distance

Find LD between the unmatched compustat companies and the unmatched IG ones. 

In [114]:
companyArrayCStat = []

company = chqUnmatched.company[0]
start = time.time()
for company in chqUnmatched.company:
    thisCompany = []
    for ig in igUnmatched.company:
        thisCompany.append(levenshtein_distance(company,ig))
    
    companyArrayCStat.append([thisCompany])

allLD = np.concatenate(companyArrayCStat)


In [115]:
igUnmatched.head()

Unnamed: 0,index,abi,company,state,city,zipcode,address_line_1
0,40,479535,cmp industries,ny,albany,12207,413 n pearl st
1,67,595777,seacat,ms,hollandale,38748,1616 rice mill rd
2,73,602789,ami semiconductor,id,pocatello,83201,2300 w buckskin rd
3,103,683474,newport creamery,ri,middletown,2842,208 w main rd
4,125,742908,femco machine,pa,punxsutawney,15767,754 s main street ext


Now pull the closest companies in IG to those in CStat. We'll first populate a dataframe with the name, address, city, state, and zip of each unmatched company in compustat, then we'll use the LD to find the same information for the closest company in IG.



There's some legacy code in here that finds the top 5 closest companies; but it doesn't populate the dataframe.

In [116]:
singleLargestLV   = (allLD).argsort(axis=-1)[:, :1]

# legacy
# n = 5
# largestElementsLV = (allLD).argsort(axis=-1)[:, :n]


companyMatches = pd.DataFrame()
companyMatches['cstatCompanies'] = chqUnmatched.company
companyMatches['cstatadd1']      = chqUnmatched.cstatadd1
companyMatches['cstatCity']      = chqUnmatched.cstatCity
companyMatches['cstatState']     = chqUnmatched.cstatState
companyMatches['cstatZip']       = chqUnmatched.cstatZipcode

for i in range(0,companyMatches.shape[0]):
    # print(list(np.array(hq.company)[largestElements[i]]))
    companyMatches.at[i,'misspelling']         = allLD[i,:][singleLargestLV[i]]
    companyMatches.at[i,'percMisspelled']      = companyMatches.misspelling[i]/len(companyMatches.cstatCompanies[i])
    companyMatches.at[i,'levCompany']          = igUnmatched.company[singleLargestLV[i]].iloc[0]
    
    companyMatches.at[i,'closestMatchIG_add']      = igUnmatched.address_line_1[singleLargestLV[i]].iloc[0]
    companyMatches.at[i,'closestMatchIG_city']     = igUnmatched.city[singleLargestLV[i]].iloc[0]
    companyMatches.at[i,'closestMatchIG_zipcode']  = igUnmatched.zipcode[singleLargestLV[i]].iloc[0]
    companyMatches.at[i,'closestMatchIG_state']    = igUnmatched.state[singleLargestLV[i]].iloc[0]

    
    # companyMatches.at[i,'closestMatchIG']      = np.array(igUnmatched.company)[largestElementsLV[i]]
    # companyMatches.at[i,'LevSim']              = np.array(allLD[i,:][largestElementsLV[i]], dtype=object)

In [117]:
companyMatches.head()

Unnamed: 0,cstatCompanies,cstatadd1,cstatCity,cstatState,cstatZip,misspelling,percMisspelled,levCompany,closestMatchIG_add,closestMatchIG_city,closestMatchIG_zipcode,closestMatchIG_state
0,acf industries,101 clark street,saint charles,mo,63301,1.0,0.071429,acs industries,1 new england way # 1,lincoln,2865,ri
1,afp imaging,"8 westchester plaza, suite 112",elmsford,ny,10523,2.0,0.181818,abc imaging,5290 shawnee rd # 300,alexandria,22312,va
2,alpharma,"440 route 22 east, p.o. box 1399",bridgewater,nj,8807,1.0,0.111111,alpharma,440 us highway 22,bridgewater,8807,nj
3,united dominion industries,2300 one first union center,charlotte,nc,28202,8.0,0.307692,united furniture industries,431 highway 41 e,okolona,38860,ms
4,anr pipeline,700 louisiana street,houston,tx,77002,5.0,0.416667,arizona pipeline,17372 lilac st,hesperia,92345,ca


Now get the embeddings and the cosine similarity between them.

In [118]:
def getMatrix(companyEmbeddings):
    companyArray = []
    
    for companies in companyEmbeddings:
        companyArray.append([companies.vector])
    
    companyArray = np.concatenate(companyArray)
    
    return(companyArray)
        

In [119]:
chqUnmatchedList = list(map(nlp, chqUnmatched.company))
allCompaniesIG   = list(map(nlp, igUnmatched.company))


cstat = getMatrix(chqUnmatchedList)
ig    = getMatrix(allCompaniesIG)

allSimilarities = cosine_similarity(cstat,ig)

allSimilarities.shape

(11755, 60595)

In [120]:
outfile =  '../../data/allCompaniesIG_embeddings.pkl'
with open(outfile, 'wb') as pickle_file:
    pkl.dump(ig, pickle_file)
    
outfile =  '../../data/allCompaniesCStat_embeddings.pkl'
with open(outfile, 'wb') as pickle_file:
    pkl.dump(cstat, pickle_file)    

Each row n here has the similarity between the nth company name in compustat and the IG company corresp to that column.

In [121]:
allSimilarities[0:5,:]

array([[0.6746448 , 0.        , 0.23701075, ..., 0.20649062, 0.25971833,
        0.2667256 ],
       [0.25185254, 0.        , 0.31978977, ..., 0.03964766, 0.37598202,
        0.09968254],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.5100785 , 0.        , 0.2642994 , ..., 0.39772192, 0.3532551 ,
        0.55830365],
       [0.34178126, 0.        , 0.18507437, ..., 0.21167924, 0.21122286,
        0.20400302]], dtype=float32)

Find indices of companies in IG most similar to each company in CStat.

In [122]:
singleLargestCos   = (-allSimilarities).argsort(axis=-1)[:, :1]

# legacy - largest n
# largestElementsCos = (-allSimilarities).argsort(axis=-1)[:, :n]


Add the cosine similarity measures to the similarity dataset.

In [123]:
for i in range(0,companyMatches.shape[0]):
    # print(list(np.array(hq.company)[largestElements[i]]))
    companyMatches.at[i,'cosSimilarity']        = allSimilarities[i,:][singleLargestCos[i]]
    companyMatches.at[i,'cosSimilarityCompany'] = igUnmatched.company[singleLargestCos[i]].iloc[0]
    # companyMatches.at[i,'closestMatchCosine']   = np.array(igUnmatched.company)[largestElementsCos[i]]
    # companyMatches.at[i,'cosineSim']            = np.array(allSimilarities[i,:][largestElementsCos[i]], dtype=object)
    
    
    companyMatches.at[i,'costMatchIG_add']     = igUnmatched.address_line_1[singleLargestCos[i]].iloc[0]
    companyMatches.at[i,'cosMatchIG_city']     = igUnmatched.city[singleLargestCos[i]].iloc[0]
    companyMatches.at[i,'cosMatchIG_zipcode']  = igUnmatched.zipcode[singleLargestCos[i]].iloc[0]
    companyMatches.at[i,'cosMatchIG_state']    = igUnmatched.state[singleLargestCos[i]].iloc[0]
    

In [124]:
sum((companyMatches.levCompany == companyMatches.cosSimilarityCompany))

1278

## Start Matching

### Take 1: Match + Zip or City

Now find the company matches: ABI - gvkey link.

Start with ones where the names both match.

If the cities or zipcodes match on one of the closest companies (LD or cos), it seems like it is good to go.


Do this in steps to start, at least. First find the companies where both match and either zip or city match. Then find companies where only one matches.

In [126]:
bothMatch_cityZip = companyMatches[(companyMatches.levCompany == companyMatches.cosSimilarityCompany) & \
              ((companyMatches.cstatCity == companyMatches.closestMatchIG_city) | \
              (companyMatches.cstatZip == companyMatches.closestMatchIG_zipcode))]
print(bothMatch_cityZip.shape)

bothMatch_cityZip.to_csv("../../data/companyData/bothMatch_cityZip.csv")

(793, 18)


In [127]:
bothMatch_cityZip['igCompanies'] = bothMatch_cityZip.levCompany
companiesToCheck                 = bothMatch_cityZip[['cstatCompanies','igCompanies',
                                    'cstatadd1','cstatCity','cstatZip',
                                    'closestMatchIG_add','closestMatchIG_city','closestMatchIG_zipcode']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [128]:
companyMatchesBoth = list(bothMatch_cityZip.cstatCompanies.unique())
len(companyMatchesBoth)

790

Grab the single company match versions.

In [129]:
oneMatch_cityZipOnly = companyMatches[-(companyMatches.cstatCompanies.isin(companyMatchesBoth)) & \
              (((companyMatches.cstatCity == companyMatches.closestMatchIG_city) | \
              (companyMatches.cstatZip == companyMatches.closestMatchIG_zipcode)) | \
              ((companyMatches.cstatCity == companyMatches.cosMatchIG_city) | \
              (companyMatches.cstatZip == companyMatches.cosMatchIG_zipcode)))].reset_index(drop=True)

oneMatch_cityZipOnly['igCompanies'] = ''

for i in range(0,oneMatch_cityZipOnly.shape[0]):
    if ((oneMatch_cityZipOnly.cstatCity[i] == oneMatch_cityZipOnly.closestMatchIG_city[i]) | \
              (oneMatch_cityZipOnly.cstatZip[i] == oneMatch_cityZipOnly.closestMatchIG_zipcode[i])):
        oneMatch_cityZipOnly.loc[i,'igCompanies'] = oneMatch_cityZipOnly.levCompany[i]
    else:
        oneMatch_cityZipOnly.loc[i,'igCompanies'] = oneMatch_cityZipOnly.cosSimilarityCompany[i]

# oneMatch_cityZipOnly.to_csv("../../data/companyData/oneMatch_cityZipOnly.csv")

In [131]:
oneMatch_cityZipOnly.head()

Unnamed: 0,cstatCompanies,cstatadd1,cstatCity,cstatState,cstatZip,misspelling,percMisspelled,levCompany,closestMatchIG_add,closestMatchIG_city,closestMatchIG_zipcode,closestMatchIG_state,cosSimilarity,cosSimilarityCompany,costMatchIG_add,cosMatchIG_city,cosMatchIG_zipcode,cosMatchIG_state,igCompanies
0,alpharma,"440 route 22 east, p.o. box 1399",bridgewater,nj,8807,1.0,0.111111,alpharma,440 us highway 22,bridgewater,8807,nj,0.0,cmp industries,413 n pearl st,albany,12207,ny,alpharma
1,acmat,30 south road,farmington,ct,6032,1.0,0.166667,acmat,30 south rd,farmington,6032,ct,0.0,cmp industries,413 n pearl st,albany,12207,ny,acmat
2,albertson s,"250 parkcenter boulevard, po box 20",boise,id,83726,1.0,0.090909,albertsons,250 e parkcenter blvd,boise,83706,id,0.659798,schwan s,115 w college dr,marshall,56258,mn,albertsons
3,beam,222 west merchandise mart plaza,chicago,il,60654,1.0,0.25,besam,2140 priest bridge ct # 19,crofton,21114,md,1.0,beam suntory,222 merchandise mart plz #1600,chicago,60654,il,beam suntory
4,american family finl svcs,6000 american parkway,madison,wi,53783,5.0,0.2,american family fitness,4435 waterfront dr # 304,glen allen,23060,va,0.847102,american family insurance,6000 american pkwy,madison,53783,wi,american family insurance


In [132]:
companiesToCheck = companiesToCheck.append(oneMatch_cityZipOnly[['cstatCompanies','igCompanies',
                                    'cstatadd1','cstatCity','cstatZip',
                                    'closestMatchIG_add','closestMatchIG_city','closestMatchIG_zipcode']]).\
                                    drop_duplicates()

companiesToCheck.shape

(2443, 8)

In [133]:
companiesToCheck[0:50]

Unnamed: 0,cstatCompanies,igCompanies,cstatadd1,cstatCity,cstatZip,closestMatchIG_add,closestMatchIG_city,closestMatchIG_zipcode
5,asa gold and precious metals,asa gold precious metals,three canal plaza,portland,4101,3 canal plz,portland,4101
12,adams diversified equity fd,adams diversified equity fund,"500 east pratt street, suite 1300",baltimore,21202,500 e pratt st # 1300,baltimore,21202
23,allen organ,allen organ,"150 locust street, p.o. box 36",macungie,18062,150 locust st,macungie,18062
48,american physicians svc gp,american physicians svc,"1301 south capital of texas highway, suite c-300",austin,78746,1301 s capital of texas hwy,west lake hills,78746
49,american science engineering,american science engineering,829 middlesex turnpike,billerica,1821,829 middlesex tpke,billerica,1821
50,american shared hsptl serv,american shared hospital svc,"601 montgomery street, suite 1112",san francisco,94111,2 embarcadero ctr # 410,san francisco,94111
51,american software,american software,"470 east paces ferry road, n.e.",atlanta,30305,470 e paces ferry rd ne,atlanta,30305
58,anheuser buschs,anheuser busch,one busch place,saint louis,63118,1 busch pl,st louis,63118
61,apco oil and gas,apco oil gas,"one williams center, 35th floor",tulsa,74172,1 one williams ctr # 35,tulsa,74172
70,arts way mfg,art s way mfg,"5556 highway 9, po box 288",armstrong,50514,5556 highway 9,armstrong,50514


2x check that there are no duplicates here.

In [134]:
duplicates = companiesToCheck.cstatCompanies.value_counts().index[companiesToCheck.cstatCompanies.value_counts() > 1]

companiesToCheck[companiesToCheck.cstatCompanies.isin(duplicates)]

Unnamed: 0,cstatCompanies,igCompanies,cstatadd1,cstatCity,cstatZip,closestMatchIG_add,closestMatchIG_city,closestMatchIG_zipcode
7074,ziff davis,ziff davis,28 east 28th street,new york,10016,28 e 28th st,new york,10016
7630,ziff davis,ziff davis,"114 5th avenue, 15th floor",new york,10011,28 e 28th st,new york,10016
8309,magellan midstream s,magellan midstream,"one williams center, p.o. box 22186",tulsa,74121,1 one williams ctr # 2,tulsa,74172
8910,h e equipment services,h e equipment svc,11100 mead rd ste 200,baton rouge,70816,7500 pecue ln,baton rouge,70809
9453,h e equipment services,h e equipment svc,7500 pecue lane,baton rouge,70809,7500 pecue ln,baton rouge,70809
9468,magellan midstream s,magellan midstream,one williams center,tulsa,74172,1 one williams ctr # 2,tulsa,74172
77,pepsiamericas,pepsi americas,"4000 rbc plaza, 60 south sixth street",minneapolis,55402,60 s 6th st # 4000,minneapolis,55402
810,pepsiamericas,pepsi americas,"3800 dain rauscher plaza, 66 south sixth street",minneapolis,55402,60 s 6th st # 4000,minneapolis,55402


In [135]:
chqStillUnmatched = chqUnmatched[-chqUnmatched.company.isin(companiesToCheck.cstatCompanies)].reset_index(drop=True)
igStillUnmatched  = igUnmatched[-igUnmatched.company.isin(companiesToCheck.igCompanies)].reset_index(drop=True)

# companyMatches['cstatCompanies'] = chqUnmatched.company
print(chqUnmatched.shape, chqStillUnmatched.shape, companiesToCheck.shape)
print(igStillUnmatched.shape,igUnmatched.shape)

(11755, 7) (9303, 7) (2443, 8)
(58397, 7) (60595, 7)


In [136]:
companiesToCheck.to_csv("../../data/companyData/companiesToCheck_cityZip.csv")

## Take 2
Match remaining ones on first word of name

In [137]:
chqUnmatched.company[0].split(' ')[0]

'acf'

In [138]:
chqUnmatched.company[0]

'acf industries'

Get the edit distance for the first words of the company names.

In [139]:
companyArrayCStat = []

start = time.time()
for company in chqStillUnmatched.company:
    thisCompany = []
    for ig in igStillUnmatched.company:
        thisCompany.append(levenshtein_distance(company.split(' ')[0],ig.split(' ')[0]))
    
    companyArrayCStat.append([thisCompany])

allLD = np.concatenate(companyArrayCStat)


And the cosine distance.

In [140]:
chqStillUnmatchedFirstCo = []
igStillUnmatchedFirstCo = []

for company in chqStillUnmatched.company:
    chqStillUnmatchedFirstCo.append(company.split(' ')[0])
    
for company in igStillUnmatched.company:
    igStillUnmatchedFirstCo.append(company.split(' ')[0])

In [141]:
chqUnmatchedList = list(map(nlp, chqStillUnmatchedFirstCo))
allCompaniesIG = list(map(nlp, igStillUnmatchedFirstCo))


cstat = getMatrix(chqUnmatchedList)
ig    = getMatrix(allCompaniesIG)

allSimilarities = cosine_similarity(cstat,ig)

allSimilarities.shape

# largestElementsCos = (-allSimilarities).argsort(axis=-1)[:, :n]
singleLargestCos   = (-allSimilarities).argsort(axis=-1)[:, :1]

In [142]:
igStillUnmatched.head()

Unnamed: 0,index,abi,company,state,city,zipcode,address_line_1
0,67,595777,seacat,ms,hollandale,38748,1616 rice mill rd
1,73,602789,ami semiconductor,id,pocatello,83201,2300 w buckskin rd
2,103,683474,newport creamery,ri,middletown,2842,208 w main rd
3,125,742908,femco machine,pa,punxsutawney,15767,754 s main street ext
4,134,827154,mako marine,fl,opa locka,33054,4355 nw 128th st


In [143]:
singleLargestLV   = (allLD).argsort(axis=-1)[:, :1]
# largestElementsLV = (allLD).argsort(axis=-1)[:, :n]


companyMatches2 = pd.DataFrame()
companyMatches2['cstatCompanies'] = chqStillUnmatched.company
companyMatches2['cstatadd1']      = chqStillUnmatched.cstatadd1
companyMatches2['cstatCity']      = chqStillUnmatched.cstatCity
companyMatches2['cstatState']     = chqStillUnmatched.cstatState
companyMatches2['cstatZip']       = chqStillUnmatched.cstatZipcode


for i in range(0,companyMatches2.shape[0]):
    # print(list(np.array(hq.company)[largestElements[i]]))
    companyMatches2.at[i,'misspelling']         = allLD[i,:][singleLargestLV[i]]
    companyMatches2.at[i,'levCompany']          = igStillUnmatched.company[singleLargestLV[i]].iloc[0]
    
    companyMatches2.at[i,'closestMatchIG_add']      = igStillUnmatched.address_line_1[singleLargestLV[i]].iloc[0]
    companyMatches2.at[i,'closestMatchIG_city']     = igStillUnmatched.city[singleLargestLV[i]].iloc[0]
    companyMatches2.at[i,'closestMatchIG_zipcode']  = igStillUnmatched.zipcode[singleLargestLV[i]].iloc[0]
    companyMatches2.at[i,'closestMatchIG_state']    = igStillUnmatched.state[singleLargestLV[i]].iloc[0]

    
    companyMatches2.at[i,'cosSimilarity']        = allSimilarities[i,:][singleLargestCos[i]]
    companyMatches2.at[i,'cosSimilarityCompany'] = igStillUnmatched.company[singleLargestCos[i]].iloc[0]
   
    companyMatches2.at[i,'costMatchIG_add']     = igStillUnmatched.address_line_1[singleLargestCos[i]].iloc[0]
    companyMatches2.at[i,'cosMatchIG_city']     = igStillUnmatched.city[singleLargestCos[i]].iloc[0]
    companyMatches2.at[i,'cosMatchIG_zipcode']  = igStillUnmatched.zipcode[singleLargestCos[i]].iloc[0]
    companyMatches2.at[i,'cosMatchIG_state']    = igStillUnmatched.state[singleLargestCos[i]].iloc[0]

    

In [144]:
companyMatches2.head()

Unnamed: 0,cstatCompanies,cstatadd1,cstatCity,cstatState,cstatZip,misspelling,levCompany,closestMatchIG_add,closestMatchIG_city,closestMatchIG_zipcode,closestMatchIG_state,cosSimilarity,cosSimilarityCompany,costMatchIG_add,cosMatchIG_city,cosMatchIG_zipcode,cosMatchIG_state
0,acf industries,101 clark street,saint charles,mo,63301,0.0,acf,9311 solar dr,tampa,33619,fl,1.0,acf,9311 solar dr,tampa,33619,fl
1,afp imaging,"8 westchester plaza, suite 112",elmsford,ny,10523,1.0,app annie,23 geary st # 8,san francisco,94108,ca,0.40283,cdc publishing,2001 9th ave # 204,vero beach,32960,fl
2,united dominion industries,2300 one first union center,charlotte,nc,28202,0.0,united feather down,3340 dundee rd # 2s3,northbrook,60062,il,1.0,united supermarkets,7830 orlando ave,lubbock,79423,tx
3,anr pipeline,700 louisiana street,houston,tx,77002,0.0,anr,636 shelby st # 3,bristol,37620,tn,1.0,anr,636 shelby st # 3,bristol,37620,tn
4,prog,256 west data drive,draper,ut,84020,1.0,pro serve solutions,5385 main st # 105,williamsville,14221,ny,0.54259,techno serve,1120 19th st nw # 8,washington,20036,dc


In [145]:
companyMatches2.shape

(9303, 17)

Find if city or zip match here.

In [146]:
match2_cityZips = companyMatches2[(((companyMatches2.cstatCity == companyMatches2.closestMatchIG_city) | \
              (companyMatches2.cstatZip == companyMatches2.closestMatchIG_zipcode)) | \
              ((companyMatches2.cstatCity == companyMatches2.cosMatchIG_city) | \
              (companyMatches2.cstatZip == companyMatches2.cosMatchIG_zipcode)))].reset_index(drop=True)


In [147]:
match2_cityZips

Unnamed: 0,cstatCompanies,cstatadd1,cstatCity,cstatState,cstatZip,misspelling,levCompany,closestMatchIG_add,closestMatchIG_city,closestMatchIG_zipcode,closestMatchIG_state,cosSimilarity,cosSimilarityCompany,costMatchIG_add,cosMatchIG_city,cosMatchIG_zipcode,cosMatchIG_state
0,alabama power,600 north 18th street,birmingham,al,35203,0.0,alabama national bankorp,1927 1st ave n,birmingham,35203,al,1.0,alabama forestymmission,513 madison ave,montgomery,36104,al
1,maxum energy logistics,20 horseneck lane,greenwich,ct,06830,0.0,maxum petroleum,20 horseneck ln,greenwich,06830,ct,0.0,seacat,1616 rice mill rd,hollandale,38748,ms
2,arizona public service,"400 north fifth street, p.o. box 53999",phoenix,az,85072,0.0,arizona water,3805 n black canyon hwy,phoenix,85015,az,1.0,arizona materials,3636 s 43rd ave,phoenix,85009,az
3,aviall,"2750 regent boulevard, dallas fort worth airport",dallas,tx,75261,0.0,aviall services,2750 regent blvd,dallas,75261,tx,0.0,seacat,1616 rice mill rd,hollandale,38748,ms
4,bangor hydro electric,p.o. box 932,bangor,me,04401,0.0,bangor savings bank,3 state st,bangor,04401,me,1.0,bangor savings bank,3 state st,bangor,04401,me
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283,dynacast international,"14045 ballantyne corporate place, suite 400",charlotte,nc,28277,0.0,dynacast,14045 ballantyne corporat #400,charlotte,28277,nc,0.0,seacat,1616 rice mill rd,hollandale,38748,ms
284,kofax,15211 laguna canyon road,irvine,ca,92618,0.0,kofax image products,16245 laguna canyon rd,irvine,92618,ca,0.0,seacat,1616 rice mill rd,hollandale,38748,ms
285,brightmail,"301 howard street, suite 1800",san francisco,ca,94105,2.0,brightmark energy,235 pine st # 1100,san francisco,94104,ca,0.0,seacat,1616 rice mill rd,hollandale,38748,ms
286,ensync,"n88 w13901 main street, suite 200",menomonee falls,wi,53051,0.0,ensync energy systems,n88w13901 main st # 200,menomonee falls,53051,wi,0.0,seacat,1616 rice mill rd,hollandale,38748,ms


In [148]:
match2_cityZips['igCompanies'] = ''

for i in range(0,match2_cityZips.shape[0]):
    if ((match2_cityZips.cstatCity[i] == match2_cityZips.closestMatchIG_city[i]) | \
              (match2_cityZips.cstatZip[i] == match2_cityZips.closestMatchIG_zipcode[i])):
        match2_cityZips.loc[i,'igCompanies'] = match2_cityZips.levCompany[i]
    else:
        match2_cityZips.loc[i,'igCompanies'] = match2_cityZips.cosSimilarityCompany[i]


In [149]:
match2_cityZips[['cstatCompanies','igCompanies','closestMatchIG_add','costMatchIG_add']].shape

(288, 4)

In [150]:
match2_cityZips[['cstatCompanies','igCompanies','closestMatchIG_add','costMatchIG_add']].to_csv("../../data/companyData/match2_cityZips.csv")

## Take 3
Try the addresses here.

Let's try something similar:
- Find top 10 most similar addresses by cos sim
- Find top 10 most similar addresses by LD
- Find unique union of these two
- Record LD and cos sim for each
- Filter for totally dissimilar ones
- ``Explode'' the dataset so we have cstat company, address
- Find first word LD and cos sim
- Find total LD and cos sim

In [151]:
n = 10

In [152]:
chqStillUnmatched['cstatadd1']     = chqStillUnmatched.cstatadd1.astype(str)
igStillUnmatched['address_line_1'] = igStillUnmatched.address_line_1.astype(str)

In [153]:
addressArrayCStat = []

start = time.time()
for address in chqStillUnmatched.cstatadd1:
    thisAddress = []
    for ig in igStillUnmatched.address_line_1:
        thisAddress.append(levenshtein_distance(str(address),str(ig)))
    
    addressArrayCStat.append([thisAddress])

allLD = np.concatenate(addressArrayCStat)

singleLargestLV   = (allLD).argsort(axis=-1)[:, :1]
largestElementsLV = (allLD).argsort(axis=-1)[:, :n]


In [154]:
chqUnmatchedList = list(map(nlp, chqStillUnmatched['cstatadd1']))
allCompaniesIG   = list(map(nlp, igStillUnmatched['address_line_1']))


cstat = getMatrix(chqUnmatchedList)
ig    = getMatrix(allCompaniesIG)

allSimilarities = cosine_similarity(cstat,ig)

allSimilarities.shape

singleLargestCos   = (-allSimilarities).argsort(axis=-1)[:, :1]
largestElementsCos = (-allSimilarities).argsort(axis=-1)[:, :n]

In [155]:
companyMatches3 = pd.DataFrame()
companyMatches3['cstatCompanies'] = chqStillUnmatched.company

Let's grab the set of the 10 closest addresses by LD and cos.

In [156]:
companyMatches3.head()

Unnamed: 0,cstatCompanies
0,acf industries
1,afp imaging
2,united dominion industries
3,anr pipeline
4,prog


Find the unique values in here.

In [157]:
set(largestElementsLV[i]).union(set(largestElementsCos[i]))  

{1548,
 2607,
 5830,
 5927,
 7719,
 12973,
 13493,
 17557,
 23940,
 35277,
 36223,
 37646,
 39345,
 41363,
 42023,
 49922,
 55016}

In [158]:
companyMatches3.head()

Unnamed: 0,cstatCompanies
0,acf industries
1,afp imaging
2,united dominion industries
3,anr pipeline
4,prog


In [159]:
companyMatches3['closestAdds_indices'] = ''


for i in range(0,companyMatches3.shape[0]):
    
    # find all the closest LV and cos addresses and put them in a 
    companyMatches3.at[i,'closestAdds_indices']  = set(largestElementsLV[i]).union(set(largestElementsCos[i]))    

And explode it so one index per line.

In [160]:
companyMatches3_indices = companyMatches3.explode('closestAdds_indices').reset_index(drop=True)

Now get the companies, the cosine similarities, and the levenshtein distances.

In [161]:
companyMatches3_indices.closestAdds_indices

0         27776
1          3521
2         42564
3         40519
4         28620
          ...  
172530    26489
172531    39034
172532     3707
172533     6461
172534    24830
Name: closestAdds_indices, Length: 172535, dtype: object

Get the embeddings for the cosine similarity.

In [162]:
for i in range(0,companyMatches3_indices.shape[0]):
    companyMatches3_indices.at[i,'igCompanies']  = igStillUnmatched.company[companyMatches3_indices.closestAdds_indices[i]] # .iloc[0]

    companyMatches3_indices.at[i,'lv']           = levenshtein_distance(companyMatches3_indices.cstatCompanies[i],companyMatches3_indices.igCompanies[i])
    companyMatches3_indices.at[i,'percMisspelled']       = companyMatches3_indices.lv[i]/len(companyMatches3_indices.cstatCompanies[i])

    # companyMatches3.at[i,'add_percMisspelled'] = companyMatches2.add_misspelling[i]/len(companyMatches2['cstatadd1'][i])


Get the company embeddings here.

In [163]:
cMatches  = companyMatches3_indices[['cstatCompanies']].drop_duplicates()
igMatches = companyMatches3_indices[['igCompanies']].drop_duplicates()



In [164]:
def getVector(text):
    embedding = nlp(text)
    
    return(embedding.vector)

In [165]:
cStatEmbeddings = list(map(getVector, cMatches.cstatCompanies))
igEmbeddings    = list(map(getVector, igMatches.igCompanies))

In [166]:
igMatches['igEmbedding']       = igEmbeddings
cMatches['cstatEmbedding']     = cStatEmbeddings

In [167]:
companyMatches3_indices = companyMatches3_indices.merge(igMatches).merge(cMatches)

In [168]:
companyMatches3_indices.shape

(172535, 7)

Loop through and get the cosine similarity.

In [169]:
for i in range(0,companyMatches3_indices.shape[0]):
    companyMatches3_indices.at[i,'cosSim']  = cosine_similarity([companyMatches3_indices.igEmbedding[i]],
                                                                [companyMatches3_indices.cstatEmbedding[i]])


In [170]:
filtered = companyMatches3_indices[((companyMatches3_indices.percMisspelled < 0.4) | \
                        (companyMatches3_indices.cosSim > 0.6))]

In [171]:
filtered[['cstatCompanies','igCompanies','percMisspelled','cosSim']].to_csv("../../data/companyData/companyMatches3_indices.csv")

In [None]:
filtered.shape

In [None]:
filtered.head()

# Combine all these things

In [172]:
dset1 = pd.read_csv("../../data/companyData/companiesToCheck_cityZip.csv")[['cstatCompanies','igCompanies','delete']]

In [173]:
dset2 = pd.read_csv("../../data/companyData/match2_cityZips.csv")[['cstatCompanies','igCompanies','delete']]

In [174]:
dset3 = pd.read_csv("../../data/companyData/companyMatches3_indices.csv")[['cstatCompanies','igCompanies','delete']]

In [175]:
dset1[['cstatCompanies','igCompanies','delete']].head()

Unnamed: 0,cstatCompanies,igCompanies,delete
0,asa gold and precious metals,asa gold precious metals,
1,adams diversified equity fd,adams diversified equity fund,
2,allen organ,allen organ,
3,american physicians svc gp,american physicians svc,
4,american science engineering,american science engineering,


In [176]:
dset2[['cstatCompanies','igCompanies','delete']].head()

Unnamed: 0,cstatCompanies,igCompanies,delete
0,alabama power,alabama national bankorp,1.0
1,maxum energy logistics,maxum petroleum,
2,arizona public service,arizona water,1.0
3,aviall,aviall services,
4,bangor hydro electric,bangor savings bank,1.0


In [177]:
dset3[['cstatCompanies','igCompanies','delete']].head()

Unnamed: 0,cstatCompanies,igCompanies,delete
0,arizona public service,the fort jennings state bank,1.0
1,duke energy ohio,the fort jennings state bank,1.0
2,duke energy ohio,jefferson madison regional lib,1.0
3,craig,gilbert kelly crowley jennett,1.0
4,intl flavors fragrances,international flavors frgrncs,


In [178]:
cleanMerge = pd.read_csv("../../data/companyData/nameMerge.csv")[['company']].\
    rename(columns = {'company': 'cstatCompanies'})
cleanMerge['igCompanies'] = cleanMerge['cstatCompanies']

cleanMerge['delete'] = ''

print(cleanMerge.head())


           cstatCompanies             igCompanies delete
0                     aar                     aar       
1  adc telecommunications  adc telecommunications       
2  afa protective systems  afa protective systems       
3          ammmunications          ammmunications       
4      amc entertainment       amc entertainment        


In [179]:
all = pd.concat([dset1,dset2,dset3,cleanMerge])
allFiltered = all[~(all.delete == 1.0)].drop_duplicates()

In [180]:
cleanMerge.head()

Unnamed: 0,cstatCompanies,igCompanies,delete
0,aar,aar,
1,adc telecommunications,adc telecommunications,
2,afa protective systems,afa protective systems,
3,ammmunications,ammmunications,
4,amc entertainment,amc entertainment,


In [181]:
allFiltered.shape

(10743, 3)

In [182]:
chq     = pd.read_csv("../../data/chq.csv",dtype={'cstatZipcode': 'object'}).drop(columns = {'Unnamed: 0'})

chq['cstatCompanies'] = list(map(cleanText, chq.company))
chqToMatch = chq[['cstatCompanies','gvkey']]

In [183]:
ig = pd.read_csv("../../data/companyData/lastHQs.csv")
ig['igCompanies'] = list(map(cleanText, ig.company))

igToMatch = ig[['igCompanies','abi']]

In [184]:
igToMatch.head()

Unnamed: 0,igCompanies,abi
0,caridian bct,158329
1,family brands,211946
2,jade,258574
3,montana metal products,262493
4,o i,455154


Put these all together.

In [185]:
gvKey_abiLinkingTable = allFiltered.merge(chqToMatch).merge(igToMatch).drop_duplicates()

In [186]:
gvKey_abiLinkingTable.to_csv('../../data/companyData/linkingTable.csv')

---------------------

# Take 4
Find the firms remaining the in the Compustat segments data that don't have a match. Then, find 10 closest matches to each and match by hand.

In [10]:
file = "../../data/companyData/compustat2000s.csv"
compustat = pd.read_csv(file, encoding = 'unicode_escape').drop(columns = {'curncdq'})

# rename to clean this up a little bit
compustat.rename(columns = {'fyearq':'year',
                            'fqtr':  'qtr',
                            'conm':  'companyName',
                           },
                 inplace = True)
compustat[['companyName','gvkey']].drop_duplicates().gvkey.value_counts()

1004      1
120359    1
120513    1
120493    1
120463    1
         ..
28983     1
28981     1
28980     1
28979     1
353444    1
Name: gvkey, Length: 30218, dtype: int64

In [11]:
compustat[['companyName','gvkey']].drop_duplicates().gvkey.value_counts().max()

1

In [14]:
gvKey_abiLinkingTable = pd.read_csv('../../data/companyData/linkingTable.csv').drop(columns = {'Unnamed: 0'})

In [15]:
gvKey_abiLinkingTable.head()

Unnamed: 0,cstatCompanies,igCompanies,delete,gvkey,abi
0,asa gold and precious metals,asa gold precious metals,,1062,402180222
1,adams diversified equity fd,adams diversified equity fund,,1119,397759739
2,allen organ,allen organ,,1283,400700704
3,american physicians svc gp,american physicians svc,,1539,218548014
4,american science engineering,american science engineering,,1554,441435880


In [35]:
c_links = pd.read_csv("../../data/companyData/compustatSCLinked.csv") # pd.read_csv("../../data/companyData/compustatSCLinked.csv")

c_links['year'] = c_links.srcdate.astype('str').str.slice(0,4).astype('int64')

c_links = c_links[c_links.year > 1997].\
    rename(columns = {'cgvkey': 'customer_gvkey',
                      'gvkey': 'supplier_gvkey',
                      'cconm': 'customer_name',
                      'conm': 'supplier_company'})
 
c_links['year'] = pd.to_datetime(c_links.year, format = '%Y').dt.year
c_links.head()



Unnamed: 0,supplier_gvkey,supplier_company,customer_gvkey,customer_name,cnms,srcdate,cid,sid,ctype,salecs,scusip,stic,ccusip,ctic,year
70,1013,ADC TELECOMMUNICATIONS INC,2136,VERIZON COMMUNICATIONS INC,VERIZON COMMUNICATIONS,20021031,10,0,COMPANY,111.056,886309,ADCT.1,92343V104,VZ,2002
71,1013,ADC TELECOMMUNICATIONS INC,2136,VERIZON COMMUNICATIONS INC,VERIZON COMMUNICATIONS,20041031,13,0,COMPANY,104.312,886309,ADCT.1,92343V104,VZ,2004
72,1013,ADC TELECOMMUNICATIONS INC,2136,VERIZON COMMUNICATIONS INC,VERIZON COMMUNICATIONS,20051031,13,0,COMPANY,146.0,886309,ADCT.1,92343V104,VZ,2005
73,1013,ADC TELECOMMUNICATIONS INC,2136,VERIZON COMMUNICATIONS INC,VERIZON COMMUNICATIONS,20061031,13,0,COMPANY,205.0,886309,ADCT.1,92343V104,VZ,2006
74,1013,ADC TELECOMMUNICATIONS INC,2136,VERIZON COMMUNICATIONS INC,VERIZON COMMUNICATIONS,20071031,13,0,COMPANY,236.0,886309,ADCT.1,92343V104,VZ,2007


In [36]:
companies = c_links.supplier_gvkey.append(c_links.customer_gvkey)
companies = companies.drop_duplicates()
companies

70          1013
84          1021
121         1037
129         1048
139         1050
           ...  
118570    243597
118588    271355
118597    270918
118621    209671
118625    326087
Length: 7788, dtype: int64

In [37]:
gvKey_abiLinkingTable.gvkey.shape

(8399,)

In [39]:
unmatched = companies[~companies.isin(gvKey_abiLinkingTable.gvkey)].drop_duplicates()
unmatched.shape

(4288,)

    gvkey                        conm  cgvkey                       cconm  \
70   1013  ADC TELECOMMUNICATIONS INC    2136  VERIZON COMMUNICATIONS INC   
71   1013  ADC TELECOMMUNICATIONS INC    2136  VERIZON COMMUNICATIONS INC   
72   1013  ADC TELECOMMUNICATIONS INC    2136  VERIZON COMMUNICATIONS INC   
73   1013  ADC TELECOMMUNICATIONS INC    2136  VERIZON COMMUNICATIONS INC   
74   1013  ADC TELECOMMUNICATIONS INC    2136  VERIZON COMMUNICATIONS INC   

                      cnms   srcdate  cid  sid    ctype   salecs     scusip  \
70  VERIZON COMMUNICATIONS  20021031   10    0  COMPANY  111.056  000886309   
71  VERIZON COMMUNICATIONS  20041031   13    0  COMPANY  104.312  000886309   
72  VERIZON COMMUNICATIONS  20051031   13    0  COMPANY  146.000  000886309   
73  VERIZON COMMUNICATIONS  20061031   13    0  COMPANY  205.000  000886309   
74  VERIZON COMMUNICATIONS  20071031   13    0  COMPANY  236.000  000886309   

      stic     ccusip ctic  year  
70  ADCT.1  92343V104   VZ 