In [7]:
import pickle as pkl
import pandas as pd
import matplotlib
import os
import re

import collections
import datetime
import time

import geopandas as gpd

import numpy as np

import rasterio

import spacy
  
nlp = spacy.load('en_core_web_lg')
from sklearn.metrics.pairwise import cosine_similarity

import itertools

# Breakpoint
We can start from here and just clean everything from here.


We are going to try to match every company that is headquartered in the US.

In [111]:
chq     = pd.read_csv("../../data/chq.csv",dtype={'cstatZipcode': 'object'}).drop(columns = {'Unnamed: 0'}).\
    rename(columns = {'companyName': 'company'})

chq.head()

Unnamed: 0,gvkey,company,add1,city,state,cstatZipcode
0,1004,AAR CORP,"One AAR Place, 1100 North Wood Dale Road",Wood Dale,IL,60191
1,1082,SERVIDYNE INC,"1945 The Exchange, Suite 325",Atlanta,GA,30339
2,1244,ALCIDE CORP,8561 154th Avenue North East,Redmond,WA,98052
3,1331,ALPINE GROUP INC,One Meadowlands Plaza,East Rutherford,NJ,7073
4,1562,AMERICAN SOFTWARE -CL A,"470 East Paces Ferry Road, NE",Atlanta,GA,30305


In [107]:
len(chq.gvkey.unique())

6250

In [203]:
hqsOnly     = pd.read_csv("../../data/ig_uniqueHQs.csv").drop(columns = {'Unnamed: 0'})
hqsWithYear = pd.read_csv("../../data/ig_uniqueHQs_multLocations.csv",dtype={'zipcode': 'object'})[['abi','company',
                                                                             'archive_version_year',
                                                                             'state','city','zipcode',
                                                                            'address_line_1']]

print(hqsOnly.head())
print(hqsWithYear.head())

hqsWithYear = hqsWithYear[hqsWithYear.archive_version_year <= 2020]

hqsWithYear['last_year'] = hqsWithYear.groupby(['abi'])['archive_version_year'].transform(max)

print(hqsWithYear.shape)

lastHQs = hqsWithYear[hqsWithYear.archive_version_year == hqsWithYear.last_year][['abi','company','state',
                                                                                  'city','zipcode','address_line_1']]

print(lastHQs.shape)

lastHQs.head()

     abi                         company
0   7609            SOLITRON DEVICES INC
1  23077                JENNY LEE BAKERY
2  76547          MASTER PROTECTION CORP
3  77743  NATIONAL TECHNICAL SYSTEMS INC
4  89151         HILLTOP BASIC RESOURCES
     abi                         company  archive_version_year state  \
0   7609            SOLITRON DEVICES INC                2003.0    FL   
1  23077                JENNY LEE BAKERY                2003.0    PA   
2  76547          MASTER PROTECTION CORP                2003.0    FL   
3  77743  NATIONAL TECHNICAL SYSTEMS INC                2003.0    CA   
4  89151         HILLTOP BASIC RESOURCES                2003.0    OH   

              city zipcode             address_line_1  
0  WEST PALM BEACH   33407   3301 ELECTRONICS WAY # C  
1    MC KEES ROCKS   15136             620 ISLAND AVE  
2       FORT MYERS   33907  12800 UNIVERSITY DR # 400  
3        CALABASAS   91302   24007 VENTURA BLVD # 200  
4       CINCINNATI   45202          1 W 4

Unnamed: 0,abi,company,state,city,zipcode,address_line_1
28,435388,MBC HOLDING CO,MN,ST PAUL,55102,882 7TH ST W
40,479535,CMP INDUSTRIES LLC,NY,ALBANY,12207,413 N PEARL ST
67,595777,SEACAT,MS,HOLLANDALE,38748,1616 RICE MILL RD
73,602789,AMI SEMICONDUCTOR INC,ID,POCATELLO,83201,2300 W BUCKSKIN RD
90,651166,OSMONICS INC,MN,HOPKINS,55343,5951 CLEARWATER DR


In [204]:
lastHQs.to_csv("../../data/companyData/sc_lastHQs.csv")

Only two of these company names appear 2x, which is good. There are ~20,000 companies in this sample.

Let's go through a little bit of a process here:
- Find the exact matches.
- Get a similarity measure between ; ideally something vectorized / something in matrix math.
- Find the top 10 matches for the remaining ones.
- Do some mix and match and see if there's any threshold at which matches become similar ``enough'' to say this is okay and good to go.


We might be able to use the fact that all of the addresses should be the same after some given point, as the compustat addresses are only the most recent ones. 

Let's try a few different ways to match these up.

First, let's find the exact matches.

Make a generic cleaning function that strips out all company names, any punctuation in the name, and makes everything lower case.

In [109]:
def cleanText(text):
    text = text.strip().\
    replace(" CORP","").replace(" CO","").replace(" INC","").\
    replace(" LTD","").replace(" -CL A","").\
    replace(" -LP","").replace(" LP","").\
    replace("-OLD","").replace(" LLC","").\
    replace(", LLC.","").replace(" L.L.C.","").replace(" L.P.","").\
    replace(" L.TD","").replace(" L.L.C.","").replace(" -CL B","").\
    replace(" -CL B","").replace(" -CL i","").replace(" -CL","").\
    replace("-REDH","").replace(" CP","").\
    replace("-ADR","").replace(" PLC","").lower().replace(r'[^\w\s]+', '').\
    replace('-lp','').replace('-spn','').replace('hldg','').replace(' intl','').\
    replace('holdings','').replace('holding','').replace('prtnr','').replace('group','').\
    replace(" med ", " medical ").replace(" tradng ", " trading ").replace("gen ", "general ").\
    replace(" mtr ", " motors ").replace(" motor ", " motors ").replace("-", " ").\
    replace("/", " ").replace("'", " ").replace("&", " ").replace(" a g ", " ").\
    replace(" ag ", " ").replace("  adr ", " ").replace(" adr ", " ").replace("  cp ", " ").\
    replace(" cp ", " ").replace(" plc ", " ").replace(" intl ", " ").replace(" ent ", " ").\
    replace(" nv ", " ").replace(" n.v. ", " ").replace(" worldwide ", " ").\
    replace(" wldwide ", " ").replace(" banc"," bank").replace("^banc","bank")
    
    return text

In [112]:
chq['company']               = list(map(cleanText, chq.company))
lastHQs['company']           = list(map(cleanText, lastHQs.company))

chq.rename(columns = {'city': 'cstatCity',
                     'state': 'cstatState',
                     'add1': 'cstatadd1'}, inplace = True)

chq['cstatCity']  = chq.cstatCity.str.lower()
chq['cstatState'] = chq.cstatState.str.lower()
chq['cstatadd1']  = chq.cstatadd1.str.lower()

lastHQs['city']            = lastHQs.city.str.lower()
lastHQs['state']           = lastHQs.state.str.lower()
lastHQs['address_line_1']  = lastHQs.address_line_1.str.lower()

NAICS names do not match up between compustat and infogroup so they're not helpful.

In [113]:
chq.head()

Unnamed: 0,gvkey,company,cstatadd1,cstatCity,cstatState,cstatZipcode
0,1004,aar,"one aar place, 1100 north wood dale road",wood dale,il,60191
1,1082,servidyne,"1945 the exchange, suite 325",atlanta,ga,30339
2,1244,alcide,8561 154th avenue north east,redmond,wa,98052
3,1331,alpine,one meadowlands plaza,east rutherford,nj,7073
4,1562,american software,"470 east paces ferry road, ne",atlanta,ga,30305


In [114]:
lastHQs.head()

Unnamed: 0,abi,company,state,city,zipcode,address_line_1
28,435388,mbc,mn,st paul,55102,882 7th st w
40,479535,cmp industries,ny,albany,12207,413 n pearl st
67,595777,seacat,ms,hollandale,38748,1616 rice mill rd
73,602789,ami semiconductor,id,pocatello,83201,2300 w buckskin rd
90,651166,osmonics,mn,hopkins,55343,5951 clearwater dr


## Match on company name directly

In [115]:
nameMerge = chq.merge(lastHQs)
nameMerge.shape

(3818, 11)

In [116]:
nameMerge.head()

Unnamed: 0,gvkey,company,cstatadd1,cstatCity,cstatState,cstatZipcode,abi,state,city,zipcode,address_line_1
0,1004,aar,"one aar place, 1100 north wood dale road",wood dale,il,60191,115523672,il,wood dale,60191,1100 n wood dale rd
1,1082,servidyne,"1945 the exchange, suite 325",atlanta,ga,30339,5380076,ga,atlanta,30339,1945 the exchange se # 300
2,1244,alcide,8561 154th avenue north east,redmond,wa,98052,495280943,wa,redmond,98052,8561 154th ave ne
3,1331,alpine,one meadowlands plaza,east rutherford,nj,7073,3563459,nj,east rutherford,7073,1 meadowlands plz # 800
4,2230,biomet,"56 east bell drive, po box 587",warsaw,in,46581,8288730,in,warsaw,46582,56 e bell dr


In [117]:
nameMerge.to_csv("../../data/companyData/nameMerge_cLinks.csv")

In [118]:
sum(nameMerge.cstatState == nameMerge.state)/nameMerge.shape[0]

0.9195914091147197

In [119]:
sum(nameMerge.cstatZipcode.str.slice(0,5) == nameMerge.zipcode.str.slice(0,5))/nameMerge.shape[0]

0.8132530120481928

In [120]:
sum(nameMerge.cstatZipcode.str.slice(0,1) == nameMerge.zipcode.str.slice(0,1))/nameMerge.shape[0]

0.927972760607648

In [121]:
nameMerge[nameMerge.cstatCity != nameMerge.city][50:100]

Unnamed: 0,gvkey,company,cstatadd1,cstatCity,cstatState,cstatZipcode,abi,state,city,zipcode,address_line_1
254,24371,geographics,"108 main street, 3rd floor",norwalk,ct,6851,700882711,ga,atlanta,30354,3450 browns mill rd se
261,25099,catalina marketing,200 carillon parkway,st. petersburg,fl,33716,478722135,fl,st petersburg,33716,200 carillon pkwy
267,25433,cholestech,9975 summers ridge road,san diego,ca,92121,404812778,ca,hayward,94545,3347 investment blvd
273,27991,brookstone,one innovation way,merrimack,nh,3054,4600490,nh,nashua,3063,22 cotton rd # 220
281,28762,spartannash,"850 76th street, s.w., po box 8700",grand rapids,mi,49518,402458400,mi,byron center,49315,850 76th st sw
291,29846,pathmark stores,2 paragon drive,montvale,nj,7645,885215905,nj,carteret,7008,200 milik st
297,30883,american uranium mining,highway 268 west,wilkesboro,nc,28697,656692548,tx,the woodlands,77380,9595 six pines dr # 8210
299,31022,sports authority,3383 north state road 7,fort lauderdale,fl,33319,441355401,co,englewood,80110,1050 w hampden ave
309,61718,atari,"475 park avenue south, 12th floor",new york,ny,10016,712379885,ca,sunnyvale,94089,1196 borregas ave
315,62221,columbus mckinnon,205 crosspoint parkway,buffalo,ny,14068,230053498,ny,getzville,14068,205 crosspoint pkwy


Now focus down onto the companies that have not been matched.

In [122]:
chqUnmatched = chq[~chq.company.isin(nameMerge.company)].reset_index()
chqUnmatched.shape

(2500, 7)

In [123]:
igUnmatched  = lastHQs[~lastHQs.company.isin(nameMerge.company)].reset_index()
igUnmatched.shape

(65394, 7)

In [124]:
lastHQs.shape

(69186, 6)

# Find Distance

Two distance measures here. Look at top 5 matches and pull the distance measure and matches as well.

### Levenshtein

In [125]:
from Levenshtein import distance as levenshtein_distance

Find LD between the unmatched compustat companies and the unmatched IG ones. 

In [126]:
companyArrayCStat = []

company = chqUnmatched.company[0]
start = time.time()
for company in chqUnmatched.company:
    thisCompany = []
    for ig in igUnmatched.company:
        thisCompany.append(levenshtein_distance(company,ig))
    
    companyArrayCStat.append([thisCompany])

allLD = np.concatenate(companyArrayCStat)


In [127]:
igUnmatched.head()

Unnamed: 0,index,abi,company,state,city,zipcode,address_line_1
0,28,435388,mbc,mn,st paul,55102,882 7th st w
1,40,479535,cmp industries,ny,albany,12207,413 n pearl st
2,67,595777,seacat,ms,hollandale,38748,1616 rice mill rd
3,73,602789,ami semiconductor,id,pocatello,83201,2300 w buckskin rd
4,90,651166,osmonics,mn,hopkins,55343,5951 clearwater dr


Now pull the closest companies in IG to those in CStat. We'll first populate a dataframe with the name, address, city, state, and zip of each unmatched company in compustat, then we'll use the LD to find the same information for the closest company in IG.



There's some legacy code in here that finds the top 5 closest companies; but it doesn't populate the dataframe.

In [133]:
singleLargestLV   = (allLD).argsort(axis=-1)[:, :1]

# legacy
n = 5
largestElementsLV = (allLD).argsort(axis=-1)[:, :n]


companyMatches = pd.DataFrame()
companyMatches['cstatCompanies'] = chqUnmatched.company
companyMatches['cstatadd1']      = chqUnmatched.cstatadd1
companyMatches['cstatCity']      = chqUnmatched.cstatCity
companyMatches['cstatState']     = chqUnmatched.cstatState
companyMatches['cstatZip']       = chqUnmatched.cstatZipcode

for i in range(0,companyMatches.shape[0]):
    # print(list(np.array(hq.company)[largestElements[i]]))
    companyMatches.at[i,'misspelling']         = allLD[i,:][singleLargestLV[i]]
    companyMatches.at[i,'percMisspelled']      = companyMatches.misspelling[i]/len(companyMatches.cstatCompanies[i])
    companyMatches.at[i,'levCompany']          = igUnmatched.company[singleLargestLV[i]].iloc[0]
    
    companyMatches.at[i,'closestMatchIG_add']      = igUnmatched.address_line_1[singleLargestLV[i]].iloc[0]
    companyMatches.at[i,'closestMatchIG_city']     = igUnmatched.city[singleLargestLV[i]].iloc[0]
    companyMatches.at[i,'closestMatchIG_zipcode']  = igUnmatched.zipcode[singleLargestLV[i]].iloc[0]
    companyMatches.at[i,'closestMatchIG_state']    = igUnmatched.state[singleLargestLV[i]].iloc[0]

    # legacy as well
    companyMatches.at[i,'closestMatchIG']      = np.array(igUnmatched.company)[largestElementsLV[i]]
    companyMatches.at[i,'LevSim']              = np.array(allLD[i,:][largestElementsLV[i]], dtype=object)

In [134]:
companyMatches.head()

Unnamed: 0,cstatCompanies,cstatadd1,cstatCity,cstatState,cstatZip,misspelling,percMisspelled,levCompany,closestMatchIG_add,closestMatchIG_city,closestMatchIG_zipcode,closestMatchIG_state,closestMatchIG,LevSim
0,american software,"470 east paces ferry road, ne",atlanta,ga,30305,1.0,0.055556,american software,470 e paces ferry rd ne,atlanta,30305,ga,"[american software, american mortgage , americ...","[1, 4, 5, 5, 6]"
1,bethlehem,25th and lennox streets,easton,pa,18045,4.0,0.444444,balchem,52 sunrise park rd,new hampton,10958,ny,"[balchem, pet life, merichem, metal tek, zea c...","[4, 5, 5, 5, 5]"
2,burlingtonat factory invs,1830 route 130 north,burlington,nj,8016,4.0,0.16,burlingtonat factory whse,1830 route 130 n,burlington,8016,nj,"[burlingtonat factory whse, burlington brands,...","[4, 11, 12, 12, 13]"
3,dataram old,"777 alexander road, suite 100",princeton,nj,8540,6.0,0.461538,star gold,611 e sherman ave,coeur d alene,83814,id,"[star gold, patriot gold, adams golf, azteca g...","[6, 6, 7, 7, 7]"
4,electro scientific inds,13900 nw science park drive,portland,or,97229,6.0,0.26087,electro scientific industries,13900 nw science park dr,portland,97229,or,"[electro scientific industries, reflect scient...","[6, 9, 10, 10, 10]"


Now get the embeddings and the cosine similarity between them.

In [130]:
def getMatrix(companyEmbeddings):
    companyArray = []
    
    for companies in companyEmbeddings:
        companyArray.append([companies.vector])
    
    companyArray = np.concatenate(companyArray)
    
    return(companyArray)
        

In [131]:
chqUnmatchedList = list(map(nlp, chqUnmatched.company))
allCompaniesIG   = list(map(nlp, igUnmatched.company))


cstat = getMatrix(chqUnmatchedList)
ig    = getMatrix(allCompaniesIG)

allSimilarities = cosine_similarity(cstat,ig)

allSimilarities.shape

(2500, 65394)

In [132]:
outfile =  '../../data/scCompaniesIG_embeddings.pkl'
with open(outfile, 'wb') as pickle_file:
    pkl.dump(ig, pickle_file)
    
outfile =  '../../data/scCompaniesCStat_embeddings.pkl'
with open(outfile, 'wb') as pickle_file:
    pkl.dump(cstat, pickle_file)    

Each row n here has the similarity between the nth company name in compustat and the IG company corresp to that column.

In [135]:
allSimilarities[0:5,:]

array([[ 0.06614351,  0.30165398,  0.        , ...,  0.43370304,
         0.36172885,  0.3919765 ],
       [ 0.20533155,  0.12865698,  0.        , ...,  0.07761187,
         0.4333101 ,  0.1429844 ],
       [-0.03254405,  0.3387761 ,  0.        , ...,  0.36552852,
         0.05581836,  0.28216398],
       [-0.13385464,  0.05125522,  0.        , ...,  0.4190689 ,
         0.03858362,  0.18789534],
       [ 0.06890466,  0.29094046,  0.        , ...,  0.18409653,
         0.25897923,  0.21555442]], dtype=float32)

Find indices of companies in IG most similar to each company in CStat.

In [136]:
singleLargestCos   = (-allSimilarities).argsort(axis=-1)[:, :1]

# legacy - largest n
# largestElementsCos = (-allSimilarities).argsort(axis=-1)[:, :n]


Add the cosine similarity measures to the similarity dataset.

In [137]:
for i in range(0,companyMatches.shape[0]):
    # print(list(np.array(hq.company)[largestElements[i]]))
    companyMatches.at[i,'cosSimilarity']        = allSimilarities[i,:][singleLargestCos[i]]
    companyMatches.at[i,'cosSimilarityCompany'] = igUnmatched.company[singleLargestCos[i]].iloc[0]
    # companyMatches.at[i,'closestMatchCosine']   = np.array(igUnmatched.company)[largestElementsCos[i]]
    # companyMatches.at[i,'cosineSim']            = np.array(allSimilarities[i,:][largestElementsCos[i]], dtype=object)
    
    
    companyMatches.at[i,'costMatchIG_add']     = igUnmatched.address_line_1[singleLargestCos[i]].iloc[0]
    companyMatches.at[i,'cosMatchIG_city']     = igUnmatched.city[singleLargestCos[i]].iloc[0]
    companyMatches.at[i,'cosMatchIG_zipcode']  = igUnmatched.zipcode[singleLargestCos[i]].iloc[0]
    companyMatches.at[i,'cosMatchIG_state']    = igUnmatched.state[singleLargestCos[i]].iloc[0]
    

In [138]:
sum((companyMatches.levCompany == companyMatches.cosSimilarityCompany))

369

## Start Matching

### Take 1: Match + Zip or City

Now find the company matches: ABI - gvkey link.

Start with ones where the names both match.

If the cities or zipcodes match on one of the closest companies (LD or cos), it seems like it is good to go.


Do this in steps to start, at least. First find the companies where both match and either zip or city match. Then find companies where only one matches.

In [139]:
bothMatch_cityZip = companyMatches[(companyMatches.levCompany == companyMatches.cosSimilarityCompany) & \
              ((companyMatches.cstatCity == companyMatches.closestMatchIG_city) | \
              (companyMatches.cstatZip == companyMatches.closestMatchIG_zipcode))]
print(bothMatch_cityZip.shape)

bothMatch_cityZip.to_csv("../../data/companyData/sc_bothMatch_cityZip.csv")

(263, 20)


In [140]:
bothMatch_cityZip['igCompanies'] = bothMatch_cityZip.levCompany
companiesToCheck                 = bothMatch_cityZip[['cstatCompanies','igCompanies',
                                    'cstatadd1','cstatCity','cstatZip',
                                    'closestMatchIG_add','closestMatchIG_city','closestMatchIG_zipcode']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [141]:
companyMatchesBoth = list(bothMatch_cityZip.cstatCompanies.unique())
len(companyMatchesBoth)

263

Grab the single company match versions.

In [142]:
oneMatch_cityZipOnly = companyMatches[-(companyMatches.cstatCompanies.isin(companyMatchesBoth)) & \
              (((companyMatches.cstatCity == companyMatches.closestMatchIG_city) | \
              (companyMatches.cstatZip == companyMatches.closestMatchIG_zipcode)) | \
              ((companyMatches.cstatCity == companyMatches.cosMatchIG_city) | \
              (companyMatches.cstatZip == companyMatches.cosMatchIG_zipcode)))].reset_index(drop=True)

oneMatch_cityZipOnly['igCompanies'] = ''

for i in range(0,oneMatch_cityZipOnly.shape[0]):
    if ((oneMatch_cityZipOnly.cstatCity[i] == oneMatch_cityZipOnly.closestMatchIG_city[i]) | \
              (oneMatch_cityZipOnly.cstatZip[i] == oneMatch_cityZipOnly.closestMatchIG_zipcode[i])):
        oneMatch_cityZipOnly.loc[i,'igCompanies'] = oneMatch_cityZipOnly.levCompany[i]
    else:
        oneMatch_cityZipOnly.loc[i,'igCompanies'] = oneMatch_cityZipOnly.cosSimilarityCompany[i]

# oneMatch_cityZipOnly.to_csv("../../data/companyData/oneMatch_cityZipOnly.csv")

In [143]:
oneMatch_cityZipOnly.head()

Unnamed: 0,cstatCompanies,cstatadd1,cstatCity,cstatState,cstatZip,misspelling,percMisspelled,levCompany,closestMatchIG_add,closestMatchIG_city,...,closestMatchIG_state,closestMatchIG,LevSim,cosSimilarity,cosSimilarityCompany,costMatchIG_add,cosMatchIG_city,cosMatchIG_zipcode,cosMatchIG_state,igCompanies
0,ndchealth,"ndc plaza, 5th floor",atlanta,ga,30329,1.0,0.111111,ndc health,2 national data plz ne,atlanta,...,ga,"[ndc health, nmi health, uw health, uc health,...","[1, 3, 3, 3, 3]",0.0,mbc,882 7th st w,st paul,55102,mn,ndc health
1,nike,one bowerman drive,beaverton,or,97005,1.0,0.2,nite,9595 six pines dr,the woodlands,...,tx,"[nite , nike, ntk , nii , wine ]","[1, 1, 2, 2, 2]",1.0,nike,1 sw bowerman dr,beaverton,97005,or,nike
2,smucker (jm),one strawberry lane,orrville,oh,44667,7.0,0.583333,becker,1701 highway a1a # 204,vero beach,...,fl,"[becker , super kids, sutter , super care, mer...","[7, 7, 7, 7, 7]",0.659204,jm smucker,1 strawberry ln,orrville,44667,oh,jm smucker
3,fastcommmmunications,45472 holiday drive,dulles,va,20166,2.0,0.1,fastmmmmunications,45472 holiday dr,sterling,...,va,"[fastmmmmunications, southcommmmunications, fr...","[2, 4, 5, 5, 5]",0.0,mbc,882 7th st w,st paul,55102,mn,fastmmmmunications
4,postrock energy,210 park avenue,oklahoma city,ok,73102,1.0,0.066667,post rock energy,210 park ave # 2750,oklahoma city,...,ok,"[post rock energy, petro energy, cap rock ener...","[1, 4, 4, 5, 5]",0.694047,eflo energy,333 n sam houston pkwy e # 600,houston,77060,tx,post rock energy


In [144]:
companiesToCheck = companiesToCheck.append(oneMatch_cityZipOnly[['cstatCompanies','igCompanies',
                                    'cstatadd1','cstatCity','cstatZip',
                                    'closestMatchIG_add','closestMatchIG_city','closestMatchIG_zipcode']]).\
                                    drop_duplicates()

companiesToCheck.shape

(759, 8)

In [145]:
companiesToCheck[0:50]

Unnamed: 0,cstatCompanies,igCompanies,cstatadd1,cstatCity,cstatZip,closestMatchIG_add,closestMatchIG_city,closestMatchIG_zipcode
0,american software,american software,"470 east paces ferry road, ne",atlanta,30305,470 e paces ferry rd ne,atlanta,30305
2,burlingtonat factory invs,burlingtonat factory whse,1830 route 130 north,burlington,8016,1830 route 130 n,burlington,8016
4,electro scientific inds,electro scientific industries,13900 nw science park drive,portland,97229,13900 nw science park dr,portland,97229
26,kindercare learning centers,kindercare learning ctr,"650 ne holladay street, suite 1400",portland,97232,650 ne holladay st # 1400,portland,97232
31,schmitt industries or,schmitt industries,2765 n.w. nicolai street,portland,97210,2765 nw nicolai st,portland,97210
36,pacific aerospace electr,pacific aerospace electroncs,430 olds station road,wenatchee,98801,434 olds station rd,wenatchee,98801
52,american greetings,american greetings,one american road,cleveland,44145,1 american way,westlake,44145
60,cagle s,cagle s,1385 collier road nw,atlanta,30318,1385 collier rd nw,atlanta,30318
71,forest laboratories,forest laboratories,909 third avenue,new york,10022,909 3rd ave # 23,new york,10022
72,great atlantic pac tea,great atlantic pacific tea,2 paragon drive,montvale,7645,2 paragon dr,montvale,7645


2x check that there are no duplicates here.

In [146]:
duplicates = companiesToCheck.cstatCompanies.value_counts().index[companiesToCheck.cstatCompanies.value_counts() > 1]

companiesToCheck[companiesToCheck.cstatCompanies.isin(duplicates)]

Unnamed: 0,cstatCompanies,igCompanies,cstatadd1,cstatCity,cstatZip,closestMatchIG_add,closestMatchIG_city,closestMatchIG_zipcode


In [147]:
chqStillUnmatched = chqUnmatched[-chqUnmatched.company.isin(companiesToCheck.cstatCompanies)].reset_index(drop=True)
igStillUnmatched  = igUnmatched[-igUnmatched.company.isin(companiesToCheck.igCompanies)].reset_index(drop=True)

# companyMatches['cstatCompanies'] = chqUnmatched.company
print(chqUnmatched.shape, chqStillUnmatched.shape, companiesToCheck.shape)
print(igStillUnmatched.shape,igUnmatched.shape)

(2500, 7) (1738, 7) (759, 8)
(64638, 7) (65394, 7)


In [148]:
companiesToCheck.to_csv("../../data/companyData/sc_companiesToCheck_cityZip.csv")

## Take 2
Match remaining ones on first word of name

In [149]:
chqUnmatched.company[0].split(' ')[0]

'american'

In [150]:
chqUnmatched.company[0]

'american software '

Get the edit distance for the first words of the company names.

In [151]:
companyArrayCStat = []

start = time.time()
for company in chqStillUnmatched.company:
    thisCompany = []
    for ig in igStillUnmatched.company:
        thisCompany.append(levenshtein_distance(company.split(' ')[0],ig.split(' ')[0]))
    
    companyArrayCStat.append([thisCompany])

allLD = np.concatenate(companyArrayCStat)


And the cosine distance.

In [152]:
chqStillUnmatchedFirstCo = []
igStillUnmatchedFirstCo = []

for company in chqStillUnmatched.company:
    chqStillUnmatchedFirstCo.append(company.split(' ')[0])
    
for company in igStillUnmatched.company:
    igStillUnmatchedFirstCo.append(company.split(' ')[0])

In [153]:
chqUnmatchedList = list(map(nlp, chqStillUnmatchedFirstCo))
allCompaniesIG = list(map(nlp, igStillUnmatchedFirstCo))


cstat = getMatrix(chqUnmatchedList)
ig    = getMatrix(allCompaniesIG)

allSimilarities = cosine_similarity(cstat,ig)

allSimilarities.shape

# largestElementsCos = (-allSimilarities).argsort(axis=-1)[:, :n]
singleLargestCos   = (-allSimilarities).argsort(axis=-1)[:, :1]

In [154]:
igStillUnmatched.head()

Unnamed: 0,index,abi,company,state,city,zipcode,address_line_1
0,40,479535,cmp industries,ny,albany,12207,413 n pearl st
1,67,595777,seacat,ms,hollandale,38748,1616 rice mill rd
2,73,602789,ami semiconductor,id,pocatello,83201,2300 w buckskin rd
3,90,651166,osmonics,mn,hopkins,55343,5951 clearwater dr
4,94,665471,liqui box,oh,worthington,43085,6950 worthington galena rd


In [155]:
singleLargestLV   = (allLD).argsort(axis=-1)[:, :1]
# largestElementsLV = (allLD).argsort(axis=-1)[:, :n]


companyMatches2 = pd.DataFrame()
companyMatches2['cstatCompanies'] = chqStillUnmatched.company
companyMatches2['cstatadd1']      = chqStillUnmatched.cstatadd1
companyMatches2['cstatCity']      = chqStillUnmatched.cstatCity
companyMatches2['cstatState']     = chqStillUnmatched.cstatState
companyMatches2['cstatZip']       = chqStillUnmatched.cstatZipcode


for i in range(0,companyMatches2.shape[0]):
    # print(list(np.array(hq.company)[largestElements[i]]))
    companyMatches2.at[i,'misspelling']         = allLD[i,:][singleLargestLV[i]]
    companyMatches2.at[i,'levCompany']          = igStillUnmatched.company[singleLargestLV[i]].iloc[0]
    
    companyMatches2.at[i,'closestMatchIG_add']      = igStillUnmatched.address_line_1[singleLargestLV[i]].iloc[0]
    companyMatches2.at[i,'closestMatchIG_city']     = igStillUnmatched.city[singleLargestLV[i]].iloc[0]
    companyMatches2.at[i,'closestMatchIG_zipcode']  = igStillUnmatched.zipcode[singleLargestLV[i]].iloc[0]
    companyMatches2.at[i,'closestMatchIG_state']    = igStillUnmatched.state[singleLargestLV[i]].iloc[0]

    
    companyMatches2.at[i,'cosSimilarity']        = allSimilarities[i,:][singleLargestCos[i]]
    companyMatches2.at[i,'cosSimilarityCompany'] = igStillUnmatched.company[singleLargestCos[i]].iloc[0]
   
    companyMatches2.at[i,'costMatchIG_add']     = igStillUnmatched.address_line_1[singleLargestCos[i]].iloc[0]
    companyMatches2.at[i,'cosMatchIG_city']     = igStillUnmatched.city[singleLargestCos[i]].iloc[0]
    companyMatches2.at[i,'cosMatchIG_zipcode']  = igStillUnmatched.zipcode[singleLargestCos[i]].iloc[0]
    companyMatches2.at[i,'cosMatchIG_state']    = igStillUnmatched.state[singleLargestCos[i]].iloc[0]

    

In [156]:
companyMatches2.head()

Unnamed: 0,cstatCompanies,cstatadd1,cstatCity,cstatState,cstatZip,misspelling,levCompany,closestMatchIG_add,closestMatchIG_city,closestMatchIG_zipcode,closestMatchIG_state,cosSimilarity,cosSimilarityCompany,costMatchIG_add,cosMatchIG_city,cosMatchIG_zipcode,cosMatchIG_state
0,bethlehem,25th and lennox streets,easton,pa,18045,0.0,bethlehem area public library,11 w church st,bethlehem,18018,pa,1.0,bethlehem area public library,11 w church st,bethlehem,18018,pa
1,dataram old,"777 alexander road, suite 100",princeton,nj,8540,2.0,atara biotherapeutics,611 gateway blvd # 900,s san francisco,94080,ca,0.0,cmp industries,413 n pearl st,albany,12207,ny
2,engineering measurements,2150 miller drive,longmont,co,80501,0.0,engineering america,1822 buerkle rd,white bear lake,55110,mn,1.0,engineering services div,395 john ireland blvd # 120,st paul,55155,mn
3,genesee,"16 west main street, 600 powers building",rochester,ny,14614,0.0,genesee global,975 john st,west henrietta,14586,ny,1.0,genesee survey svc,3136 winton rd s # 203,rochester,14623,ny
4,intermagnetics general,450 old niskayuna road - 1569,latham,ny,12110,4.0,inergetics,550 broad st # 1212,newark,7102,nj,0.0,cmp industries,413 n pearl st,albany,12207,ny


In [157]:
companyMatches2.shape

(1738, 17)

Find if city or zip match here.

In [158]:
match2_cityZips = companyMatches2[(((companyMatches2.cstatCity == companyMatches2.closestMatchIG_city) | \
              (companyMatches2.cstatZip == companyMatches2.closestMatchIG_zipcode)) | \
              ((companyMatches2.cstatCity == companyMatches2.cosMatchIG_city) | \
              (companyMatches2.cstatZip == companyMatches2.cosMatchIG_zipcode)))].reset_index(drop=True)


In [159]:
match2_cityZips

Unnamed: 0,cstatCompanies,cstatadd1,cstatCity,cstatState,cstatZip,misspelling,levCompany,closestMatchIG_add,closestMatchIG_city,closestMatchIG_zipcode,closestMatchIG_state,cosSimilarity,cosSimilarityCompany,costMatchIG_add,cosMatchIG_city,cosMatchIG_zipcode,cosMatchIG_state
0,genesee,"16 west main street, 600 powers building",rochester,ny,14614,0.0,genesee global,975 john st,west henrietta,14586,ny,1.0,genesee survey svc,3136 winton rd s # 203,rochester,14623,ny
1,tjxs (the),770 cochituate road,framingham,ma,01701,1.0,tjx,770 cochituate rd # 1,framingham,01701,ma,0.0,cmp industries,413 n pearl st,albany,12207,ny
2,monmouth capital,"juniper business plaza, 3499 route 9 north sui...",freehold,nj,07728,0.0,monmouth real estate investmnt,3499 route 9 n # 3c,freehold,07728,nj,1.0,monmouth real est invstmnt crp,101 crawfords corner rd # 1405,holmdel,07733,nj
3,daisytek,"1025 central expressway south, suite 200",allen,tx,75013,0.0,daisytek international,1025 central expy s # 200,allen,75013,tx,0.0,cmp industries,413 n pearl st,albany,12207,ny
4,aviall,"2750 regent boulevard, dallas fort worth airport",dallas,tx,75261,0.0,aviall services,2750 regent blvd,dallas,75261,tx,0.0,cmp industries,413 n pearl st,albany,12207,ny
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,commercehub,"zen building, 201 fuller road, 6th floor",albany,ny,12203,3.0,commerce consumer affairs dept,335 merchant st # 201,honolulu,96813,hi,0.0,cmp industries,413 n pearl st,albany,12207,ny
88,ranger energy services,"10350 richmond, suite 550",houston,tx,77042,0.0,ranger industries,15 park rd,tinton falls,07724,nj,1.0,ranger steel,1225 north loop w # 650,houston,77008,tx
89,liquidia,"419 davis drive, suite 100",morrisville,nc,27560,0.0,liquidia technologies,419 davis dr # 100,morrisville,27560,nc,0.0,cmp industries,413 n pearl st,albany,12207,ny
90,gritstone bio,"5959 horton street, suite 300",emeryville,ca,94608,0.0,gritstone oncology,5858 horton st # 210,emeryville,94608,ca,1.0,gritstone oncology,5858 horton st # 210,emeryville,94608,ca


In [160]:
match2_cityZips['igCompanies'] = ''

for i in range(0,match2_cityZips.shape[0]):
    if ((match2_cityZips.cstatCity[i] == match2_cityZips.closestMatchIG_city[i]) | \
              (match2_cityZips.cstatZip[i] == match2_cityZips.closestMatchIG_zipcode[i])):
        match2_cityZips.loc[i,'igCompanies'] = match2_cityZips.levCompany[i]
    else:
        match2_cityZips.loc[i,'igCompanies'] = match2_cityZips.cosSimilarityCompany[i]


In [161]:
match2_cityZips[['cstatCompanies','igCompanies','closestMatchIG_add','costMatchIG_add']].shape

(92, 4)

In [164]:
match2_cityZips[['cstatCompanies','igCompanies','closestMatchIG_add','costMatchIG_add']].to_csv("../../data/companyData/sc_match2_cityZips.csv")

## Take 3
Try the addresses here.

Let's try something similar:
- Find top 10 most similar addresses by cos sim
- Find top 10 most similar addresses by LD
- Find unique union of these two
- Record LD and cos sim for each
- Filter for totally dissimilar ones
- ``Explode'' the dataset so we have cstat company, address
- Find first word LD and cos sim
- Find total LD and cos sim

In [165]:
n = 10

In [166]:
chqStillUnmatched['cstatadd1']     = chqStillUnmatched.cstatadd1.astype(str)
igStillUnmatched['address_line_1'] = igStillUnmatched.address_line_1.astype(str)

In [167]:
addressArrayCStat = []

start = time.time()
for address in chqStillUnmatched.cstatadd1:
    thisAddress = []
    for ig in igStillUnmatched.address_line_1:
        thisAddress.append(levenshtein_distance(str(address),str(ig)))
    
    addressArrayCStat.append([thisAddress])

allLD = np.concatenate(addressArrayCStat)

singleLargestLV   = (allLD).argsort(axis=-1)[:, :1]
largestElementsLV = (allLD).argsort(axis=-1)[:, :n]


In [168]:
chqUnmatchedList = list(map(nlp, chqStillUnmatched['cstatadd1']))
allCompaniesIG   = list(map(nlp, igStillUnmatched['address_line_1']))


cstat = getMatrix(chqUnmatchedList)
ig    = getMatrix(allCompaniesIG)

allSimilarities = cosine_similarity(cstat,ig)

allSimilarities.shape

singleLargestCos   = (-allSimilarities).argsort(axis=-1)[:, :1]
largestElementsCos = (-allSimilarities).argsort(axis=-1)[:, :n]

In [169]:
companyMatches3 = pd.DataFrame()
companyMatches3['cstatCompanies'] = chqStillUnmatched.company

Let's grab the set of the 10 closest addresses by LD and cos.

In [170]:
companyMatches3.head()

Unnamed: 0,cstatCompanies
0,bethlehem
1,dataram old
2,engineering measurements
3,genesee
4,intermagnetics general


Find the unique values in here.

In [171]:
set(largestElementsLV[i]).union(set(largestElementsCos[i]))  

{317,
 4400,
 8739,
 10481,
 19492,
 23606,
 27447,
 28442,
 43100,
 44751,
 47804,
 48063,
 48588,
 49354,
 51642,
 62632,
 62907}

In [172]:
companyMatches3.head()

Unnamed: 0,cstatCompanies
0,bethlehem
1,dataram old
2,engineering measurements
3,genesee
4,intermagnetics general


In [173]:
companyMatches3['closestAdds_indices'] = ''


for i in range(0,companyMatches3.shape[0]):
    
    # find all the closest LV and cos addresses and put them in a 
    companyMatches3.at[i,'closestAdds_indices']  = set(largestElementsLV[i]).union(set(largestElementsCos[i]))    

And explode it so one index per line.

In [174]:
companyMatches3_indices = companyMatches3.explode('closestAdds_indices').reset_index(drop=True)

Now get the companies, the cosine similarities, and the levenshtein distances.

In [175]:
companyMatches3_indices.closestAdds_indices

0        29440
1         9536
2        60943
3        27536
4        52113
         ...  
31680    34994
31681    17333
31682    24311
31683    11517
31684    44542
Name: closestAdds_indices, Length: 31685, dtype: object

Get the embeddings for the cosine similarity.

In [176]:
for i in range(0,companyMatches3_indices.shape[0]):
    companyMatches3_indices.at[i,'igCompanies']  = igStillUnmatched.company[companyMatches3_indices.closestAdds_indices[i]] # .iloc[0]

    companyMatches3_indices.at[i,'lv']           = levenshtein_distance(companyMatches3_indices.cstatCompanies[i],companyMatches3_indices.igCompanies[i])
    companyMatches3_indices.at[i,'percMisspelled']       = companyMatches3_indices.lv[i]/len(companyMatches3_indices.cstatCompanies[i])

    # companyMatches3.at[i,'add_percMisspelled'] = companyMatches2.add_misspelling[i]/len(companyMatches2['cstatadd1'][i])


Get the company embeddings here.

In [177]:
cMatches  = companyMatches3_indices[['cstatCompanies']].drop_duplicates()
igMatches = companyMatches3_indices[['igCompanies']].drop_duplicates()



In [178]:
def getVector(text):
    embedding = nlp(text)
    
    return(embedding.vector)

In [179]:
cStatEmbeddings = list(map(getVector, cMatches.cstatCompanies))
igEmbeddings    = list(map(getVector, igMatches.igCompanies))

In [180]:
igMatches['igEmbedding']       = igEmbeddings
cMatches['cstatEmbedding']     = cStatEmbeddings

In [181]:
companyMatches3_indices = companyMatches3_indices.merge(igMatches).merge(cMatches)

In [182]:
companyMatches3_indices.shape

(31685, 7)

Loop through and get the cosine similarity.

In [183]:
for i in range(0,companyMatches3_indices.shape[0]):
    companyMatches3_indices.at[i,'cosSim']  = cosine_similarity([companyMatches3_indices.igEmbedding[i]],
                                                                [companyMatches3_indices.cstatEmbedding[i]])


In [184]:
filtered = companyMatches3_indices[((companyMatches3_indices.percMisspelled < 0.4) | \
                        (companyMatches3_indices.cosSim > 0.6))]

In [185]:
filtered[['cstatCompanies','igCompanies','percMisspelled','cosSim']].to_csv("../../data/companyData/sc_companyMatches3_indices.csv")

In [None]:
filtered.shape

In [None]:
filtered.head()

# Combine all these things

In [186]:
dset1 = pd.read_csv("../../data/companyData/sc_companiesToCheck_cityZip.csv")[['cstatCompanies','igCompanies','delete']]

In [187]:
dset2 = pd.read_csv("../../data/companyData/sc_match2_cityZips.csv")[['cstatCompanies','igCompanies','delete']]

In [188]:
dset3 = pd.read_csv("../../data/companyData/sc_companyMatches3_indices.csv")[['cstatCompanies','igCompanies','delete']]

In [189]:
dset1[['cstatCompanies','igCompanies','delete']].head()

Unnamed: 0,cstatCompanies,igCompanies,delete
0,american software,american software,
1,burlingtonat factory invs,burlingtonat factory whse,
2,electro scientific inds,electro scientific industries,
3,kindercare learning centers,kindercare learning ctr,
4,schmitt industries or,schmitt industries,


In [190]:
dset2[['cstatCompanies','igCompanies','delete']].head()

Unnamed: 0,cstatCompanies,igCompanies,delete
0,genesee,genesee survey svc,
1,tjxs (the),tjx,
2,monmouth capital,monmouth real estate investmnt,
3,daisytek,daisytek international,
4,aviall,aviall services,


In [191]:
dset3[['cstatCompanies','igCompanies','delete']].head()

Unnamed: 0,cstatCompanies,igCompanies,delete
0,united road services,child health systems,1.0
1,risk(george) industries,george risk industries,
2,alabama power,the fort jennings state bank,1.0
3,alabama power,firstbanc of alabama,1.0
4,carlisles,carlisle,


In [192]:
cleanMerge = pd.read_csv("../../data/companyData/nameMerge.csv")[['company']].\
    rename(columns = {'company': 'cstatCompanies'})
cleanMerge['igCompanies'] = cleanMerge['cstatCompanies']

cleanMerge['delete'] = ''

print(cleanMerge.head())


           cstatCompanies             igCompanies delete
0                     aar                     aar       
1  adc telecommunications  adc telecommunications       
2  afa protective systems  afa protective systems       
3          ammmunications          ammmunications       
4      amc entertainment       amc entertainment        


In [193]:
all = pd.concat([dset1,dset2,dset3,cleanMerge])
allFiltered = all[~(all.delete == 1.0)].drop_duplicates()

In [194]:
cleanMerge.head()

Unnamed: 0,cstatCompanies,igCompanies,delete
0,aar,aar,
1,adc telecommunications,adc telecommunications,
2,afa protective systems,afa protective systems,
3,ammmunications,ammmunications,
4,amc entertainment,amc entertainment,


In [207]:
allFiltered.shape

(9332, 3)

In [208]:
allFiltered.head()

Unnamed: 0,cstatCompanies,igCompanies,delete
0,american software,american software,
1,burlingtonat factory invs,burlingtonat factory whse,
2,electro scientific inds,electro scientific industries,
3,kindercare learning centers,kindercare learning ctr,
4,schmitt industries or,schmitt industries,


In [209]:
allFiltered.shape

(9332, 3)

In [199]:
chq     = pd.read_csv("../../data/chq.csv",dtype={'cstatZipcode': 'object'}).drop(columns = {'Unnamed: 0'}).\
    rename(columns = {'companyName': 'company'})

chq['cstatCompanies'] = list(map(cleanText, chq.company))
chqToMatch = chq[['cstatCompanies','gvkey']]

In [200]:
chq.head()

Unnamed: 0,gvkey,company,add1,city,state,cstatZipcode,cstatCompanies
0,1004,AAR CORP,"One AAR Place, 1100 North Wood Dale Road",Wood Dale,IL,60191,aar
1,1082,SERVIDYNE INC,"1945 The Exchange, Suite 325",Atlanta,GA,30339,servidyne
2,1244,ALCIDE CORP,8561 154th Avenue North East,Redmond,WA,98052,alcide
3,1331,ALPINE GROUP INC,One Meadowlands Plaza,East Rutherford,NJ,7073,alpine
4,1562,AMERICAN SOFTWARE -CL A,"470 East Paces Ferry Road, NE",Atlanta,GA,30305,american software


In [205]:
ig = pd.read_csv("../../data/companyData/sc_lastHQs.csv")
ig['igCompanies'] = list(map(cleanText, ig.company))

igToMatch = ig[['igCompanies','abi']]

In [206]:
igToMatch.head()

Unnamed: 0,igCompanies,abi
0,mbc,435388
1,cmp industries,479535
2,seacat,595777
3,ami semiconductor,602789
4,osmonics,651166


Put these all together.

In [210]:
gvKey_abiLinkingTable = allFiltered.merge(chqToMatch).merge(igToMatch).drop_duplicates()

In [211]:
gvKey_abiLinkingTable.to_csv('../../data/companyData/sc_linkingTable.csv')

In [212]:
gvKey_abiLinkingTable.head()

Unnamed: 0,cstatCompanies,igCompanies,delete,gvkey,abi
0,american software,american software,,1562,4378204
1,burlingtonat factory invs,burlingtonat factory whse,,2484,849416722
2,electro scientific inds,electro scientific industries,,4274,9546995
3,kindercare learning centers,kindercare learning ctr,,14835,2406528
4,schmitt industries or,schmitt industries,,26520,479790834


In [221]:
c_linksUS = pd.read_csv("../../data/companyData/c_linksUS.csv").drop(columns = {'Unnamed: 0'})
c_linksUS.head()

Unnamed: 0,gvkey,conm,cgvkey,cconm,cnms,srcdate,cid,sid,ctype,salecs,scusip,stic,ccusip,ctic,year
0,1013,ADC TELECOMMUNICATIONS INC,2136,VERIZON COMMUNICATIONS INC,VERIZON COMMUNICATIONS,20021031,10,0,COMPANY,111.056,886309,ADCT.1,92343V104,VZ,2002
1,1013,ADC TELECOMMUNICATIONS INC,2136,VERIZON COMMUNICATIONS INC,VERIZON COMMUNICATIONS,20041031,13,0,COMPANY,104.312,886309,ADCT.1,92343V104,VZ,2004
2,1013,ADC TELECOMMUNICATIONS INC,2136,VERIZON COMMUNICATIONS INC,VERIZON COMMUNICATIONS,20051031,13,0,COMPANY,146.0,886309,ADCT.1,92343V104,VZ,2005
3,1013,ADC TELECOMMUNICATIONS INC,2136,VERIZON COMMUNICATIONS INC,VERIZON COMMUNICATIONS,20061031,13,0,COMPANY,205.0,886309,ADCT.1,92343V104,VZ,2006
4,1013,ADC TELECOMMUNICATIONS INC,2136,VERIZON COMMUNICATIONS INC,VERIZON COMMUNICATIONS,20071031,13,0,COMPANY,236.0,886309,ADCT.1,92343V104,VZ,2007


In [220]:
us_gvkey = c_linksUS.gvkey.append(c_linksUS.cgvkey).drop_duplicates()
sum(us_gvkey.isin(gvKey_abiLinkingTable.gvkey))/len(us_gvkey)

0.7394772572980313

In [222]:
c_linksUS[~(c_linksUS.gvkey.isin(gvKey_abiLinkingTable.gvkey) & c_linksUS.cgvkey.isin(gvKey_abiLinkingTable.gvkey))].shape

(11611, 15)

In [223]:
c_linksUS.shape

(51643, 15)

What fraction of within-US sales do we capture?

In [227]:
c_linksUS[(c_linksUS.gvkey.isin(gvKey_abiLinkingTable.gvkey) \
           & c_linksUS.cgvkey.isin(gvKey_abiLinkingTable.gvkey))].salecs.sum()/c_linksUS.salecs.sum()

0.8935782116435496

In [228]:
c_links_all = pd.read_csv("../../data/companyData/c_links.csv").drop(columns = {'Unnamed: 0'})


In [231]:
c_links_all.head()

Unnamed: 0,year,supplier_gvkey,customer_gvkey,salecs,customer_ind,supplier_ind
0,2002,1013.0,2136.0,111.056,transportUtilities,manu
1,2003,1013.0,2136.0,111.056,transportUtilities,manu
2,2004,1013.0,2136.0,104.312,transportUtilities,manu
3,2005,1013.0,2136.0,146.0,transportUtilities,manu
4,2006,1013.0,2136.0,205.0,transportUtilities,manu


In [232]:
c_links_all[(c_links_all.supplier_gvkey.isin(gvKey_abiLinkingTable.gvkey) \
           & c_links_all.customer_gvkey.isin(gvKey_abiLinkingTable.gvkey))].salecs.sum()/c_links_all.salecs.sum()

0.6815255399716054

---------------------

# Take 4
Find the firms remaining the in the Compustat segments data that don't have a match. Then, find 10 closest matches to each and match by hand.

In [245]:
c_links_co1 = c_linksUS[['conm','gvkey']].rename(columns = {'conm': 'company'})
c_links_co2 = c_linksUS[['cconm','cgvkey']].rename(columns = {'cconm': 'company',
                                                             'cgvkey': 'gvkey'})

remainingCOs = c_links_co1.append(c_links_co2).drop_duplicates()
remainingCOs = remainingCOs[~remainingCOs.gvkey.isin(gvKey_abiLinkingTable.gvkey)].reset_index()
remainingCOs.shape

remainingCOs['company'] = list(map(cleanText, remainingCOs.company))





In [258]:
igToMatch.head()
igToMatch.rename(columns = {'igCompanies': 'company'}, inplace = True)

In [248]:
igRemaining = igToMatch[~igToMatch.abi.isin(gvKey_abiLinkingTable.abi)].reset_index(drop = True)
igRemaining.rename(columns = {'igCompanies': 'company'}, inplace = True)
igRemaining.head()

Unnamed: 0,company,abi
0,mbc,435388
1,cmp industries,479535
2,seacat,595777
3,ami semiconductor,602789
4,osmonics,651166


First do the levenshtein distance.

In [259]:
companyArrayRemaining = []

company = remainingCOs.company[0]
start = time.time()
for company in remainingCOs.company:
    thisCompany = []
    for ig in igToMatch.company:
        thisCompany.append(levenshtein_distance(company,ig))
    
    companyArrayRemaining.append([thisCompany])

allLD = np.concatenate(companyArrayRemaining)


and the cosine similarity

In [265]:
igToMatch

Unnamed: 0,index,company,gvkey
0,12,afp imaging,1021
1,33,aaron s,1076
2,66,activision,1111
3,80,aero systems engineering,1154
4,171,alexander s,1257
...,...,...,...
1559,49902,square,26367
1560,50048,popular,2002
1561,50705,oklahoma gas electric,115687
1562,50914,west penn power,11378


In [266]:
n = 5

chqUnmatchedList = list(map(nlp, remainingCOs.company))
allCompaniesIG = list(map(nlp, igToMatch.company))


cstat = getMatrix(chqUnmatchedList)
ig    = getMatrix(allCompaniesIG)

allSimilarities = cosine_similarity(cstat,ig)

allSimilarities.shape

largestElementsCos = (-allSimilarities).argsort(axis=-1)[:, :n]

In [269]:
# legacy
largestElementsLV = (allLD).argsort(axis=-1)[:, :n]


companyMatches = pd.DataFrame()
companyMatches['cstatCompanies'] = remainingCOs.company

for i in range(0,companyMatches.shape[0]):
    # print(list(np.array(hq.company)[largestElements[i]]))
    '''companyMatches.at[i,'misspelling']      = allLD[i,:][singleLargestLV[i]]
    companyMatches.at[i,'percMisspelled']      = companyMatches.misspelling[i]/len(companyMatches.cstatCompanies[i])
    companyMatches.at[i,'levCompany']          = igUnmatched.company[singleLargestLV[i]].iloc[0]'''

    # legacy as well
    companyMatches.at[i,'closestMatchLV']      = np.array(igToMatch.company)[largestElementsLV[i]]
    companyMatches.at[i,'closestMatchCOS']      = np.array(igToMatch.company)[largestElementsCos[i]]
    companyMatches.at[i,'LevSim']              = np.array(allLD[i,:][largestElementsLV[i]], dtype=object)

Now save this and check them by hand.

In [270]:
companyMatches.to_csv("../../data/companyData/companyMatches.csv")

We don't have a 100\% match for all these companies, but we do have a bigger chunk of them.

In [271]:
chq     = pd.read_csv("../../data/chq.csv",dtype={'cstatZipcode': 'object'}).drop(columns = {'Unnamed: 0'}).\
    rename(columns = {'companyName': 'company'})

chq['cstatCompanies'] = list(map(cleanText, chq.company))
chqToMatch = chq[['cstatCompanies','gvkey']]

ig = pd.read_csv("../../data/companyData/sc_lastHQs.csv")
ig['igCompanies'] = list(map(cleanText, ig.company))

igToMatch = ig[['igCompanies','abi']]

In [273]:
chqToMatch.cstatCompanies.str.strip()

0                      aar
1                servidyne
2                   alcide
3                   alpine
4        american software
               ...        
6245           vital farms
6246            cleanspark
6247    target hospitality
6248     diamond eagle acq
6249    stabilis solutions
Name: cstatCompanies, Length: 6250, dtype: object

In [274]:
chqToMatch['cstatCompanies'] = chqToMatch.cstatCompanies.str.strip()
igToMatch['igCompanies']     = igToMatch.igCompanies.str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [277]:
take2                   = pd.read_csv('../../data/companyData/companyMatches_secondPass.csv')
take2['cstatCompanies'] = take2.cstatCompanies.str.strip()
take2['igCompanies']    = take2.igCompanies.str.strip()
take2 = take2[['cstatCompanies', 'igCompanies']]

In [278]:
gvKey_abiLinkingTable2 = take2.merge(chqToMatch).merge(igToMatch).drop_duplicates()

In [280]:
gvKey_abiLinkingTable.append(gvKey_abiLinkingTable2)

(4680, 5)

In [279]:
gvKey_abiLinkingTable2.shape

(188, 4)

In [283]:
gvKey_abiLinkingTable = gvKey_abiLinkingTable.drop(columns = {'delete'})
gvKey_abiLinkingTable.head()

Unnamed: 0,cstatCompanies,igCompanies,gvkey,abi
0,american software,american software,1562,4378204
1,burlingtonat factory invs,burlingtonat factory whse,2484,849416722
2,electro scientific inds,electro scientific industries,4274,9546995
3,kindercare learning centers,kindercare learning ctr,14835,2406528
4,schmitt industries or,schmitt industries,26520,479790834


In [282]:
gvKey_abiLinkingTable2.head()

Unnamed: 0,cstatCompanies,igCompanies,gvkey,abi
0,activision,activision blizzard,1111,456268317
1,alexander s,j alexander s,1257,467774501
2,alexander s,j alexander s,1257,439536247
3,armatron international,armatron international,1754,681828380
4,bard (c.r.),c r bard,2044,7506074


In [284]:
linkingTable = gvKey_abiLinkingTable.append(gvKey_abiLinkingTable2).drop_duplicates()
linkingTable.shape

(4863, 4)

In [286]:
c_links_all[(c_links_all.supplier_gvkey.isin(linkingTable.gvkey) \
           & c_links_all.customer_gvkey.isin(linkingTable.gvkey))].salecs.sum()/c_links_all.salecs.sum()

0.6885253954111357

In [287]:
c_linksUS[(c_linksUS.gvkey.isin(linkingTable.gvkey) \
           & c_linksUS.cgvkey.isin(linkingTable.gvkey))].salecs.sum()/c_linksUS.salecs.sum()

0.9033502200296648

In [288]:
linkingTable.to_csv('../../data/companyData/sc_linkingTable.csv')