In [2]:
import pickle as pkl
import pandas as pd
import matplotlib
import os
import re

import collections
import datetime
import time

import geopandas as gpd

import numpy as np

import rasterio

import spacy
  
nlp = spacy.load('en_core_web_lg')
from sklearn.metrics.pairwise import cosine_similarity

import itertools

# Get Full IG Dataset

In [None]:
file = "../../data/companyData/infogroup2010s.csv"

In [None]:
import dask.dataframe as dd

df = dd.read_csv(file, assume_missing=True, 
                 dtype={'parent_number': 'object','parent_employee_size_code': 'object',
                       'parent_sales_volume_code': 'object',
                       'abi': 'object','zipcode': 'object'}, low_memory = False)
df = df[df.business_status_code == 1.0]


In [None]:
df.head()

In [6]:
hq = df[['abi','ticker','company','archive_version_year','state','city','zipcode','primary_naics_code','latitude','longitude']].drop_duplicates().compute(num_workers = 100)

In [8]:
hqsOnly = hq[['abi','company']].drop_duplicates()

In [9]:
hqsOnly.company.value_counts()

GOVERNOR'S OFFICE                56
ATTORNEY GENERAL                 49
SECRETARY OF STATE               43
CHIEF OF STAFF                   43
LIEUTENANT GOVERNOR              39
                                 ..
CANAAN PARTNERS                   1
PRIME BANK                        1
NEW HAVEN FREE PUBLIC LIBRARY     1
MILFORD BANK                      1
USA COMPRESSION PARTNERS LP       1
Name: company, Length: 79777, dtype: int64

In [10]:
print(hq.shape,hqsOnly.shape)

(439546, 10) (84213, 2)


Some of the abi numbers seem to be duplicated; it looks like they might be primarily for different government agencies.

In [11]:
hqsOnly.company.value_counts()[hqsOnly.company.value_counts() > 10].index

Index(['GOVERNOR'S OFFICE', 'ATTORNEY GENERAL', 'SECRETARY OF STATE',
       'CHIEF OF STAFF', 'LIEUTENANT GOVERNOR', 'PRESS SECRETARY',
       'LIEUTENANT GOVERNOR'S OFFICE', 'TRANSPORTATION DEPARTMENT',
       'CORRECTIONS DEPT', 'CORRECTIONS DEPARTMENT',
       'SUPREME COURT CHIEF JUSTICE', 'EDUCATION DEPT', 'SUPREME COURT CLERK',
       'TRANSPORTATION DEPT', 'AGRICULTURE DEPT', 'EMERGENCY MEDICAL SVC',
       'ADJUTANT GENERAL', 'AGRICULTURE DEPARTMENT', 'ELECTIONS DIVISION',
       'EDUCATION DEPARTMENT', 'STATE VETERINARIAN', 'FIRE MARSHAL',
       'STATE LIBRARY', 'STATE TREASURER', 'REVENUE DEPARTMENT',
       'INSURANCE DEPT', 'LABOR DEPT', 'ETHICS COMMISSION',
       'NATURAL RESOURCES DEPT', 'REVENUE DEPT', 'GEOLOGICAL SURVEY',
       'CORPORATIONS DIVISION', 'LABOR DEPARTMENT', 'HEALTH DEPARTMENT',
       'PUBLIC SAFETY DEPT', 'HUMAN SERVICES DEPT', 'RACING COMMISSION',
       'PUBLIC SERVICE COMMISSION', 'SECURITIES DIVISION',
       'OCCUPATIONAL SAFETY & HEALTH', 'ADMI

In [12]:
toDiscard = hqsOnly.company.value_counts()[hqsOnly.company.value_counts() > 1].index
for company in toDiscard:
    print(company)


GOVERNOR'S OFFICE
ATTORNEY GENERAL
SECRETARY OF STATE
CHIEF OF STAFF
LIEUTENANT GOVERNOR
PRESS SECRETARY
LIEUTENANT GOVERNOR'S OFFICE
TRANSPORTATION DEPARTMENT
CORRECTIONS DEPT
CORRECTIONS DEPARTMENT
SUPREME COURT CHIEF JUSTICE
EDUCATION DEPT
SUPREME COURT CLERK
TRANSPORTATION DEPT
AGRICULTURE DEPT
EMERGENCY MEDICAL SVC
ADJUTANT GENERAL
AGRICULTURE DEPARTMENT
ELECTIONS DIVISION
EDUCATION DEPARTMENT
STATE VETERINARIAN
FIRE MARSHAL
STATE LIBRARY
STATE TREASURER
REVENUE DEPARTMENT
INSURANCE DEPT
LABOR DEPT
ETHICS COMMISSION
NATURAL RESOURCES DEPT
REVENUE DEPT
GEOLOGICAL SURVEY
CORPORATIONS DIVISION
LABOR DEPARTMENT
HEALTH DEPARTMENT
PUBLIC SAFETY DEPT
HUMAN SERVICES DEPT
RACING COMMISSION
PUBLIC SERVICE COMMISSION
SECURITIES DIVISION
OCCUPATIONAL SAFETY & HEALTH
ADMINISTRATION DEPT
WORKERS COMPENSATION
INSURANCE DEPARTMENT
ARTS COUNCIL
VOCATIONAL REHABILITATION
CHILD SUPPORT ENFORCEMENT
HUMAN RIGHTS COMMISSION
LAW LIBRARY
FORESTRY DIVISION
HEALTH DEPT
STATE POLICE
EMERGENCY MANAGEMENT AGE

In [13]:
toDiscard

Index(['GOVERNOR'S OFFICE', 'ATTORNEY GENERAL', 'SECRETARY OF STATE',
       'CHIEF OF STAFF', 'LIEUTENANT GOVERNOR', 'PRESS SECRETARY',
       'LIEUTENANT GOVERNOR'S OFFICE', 'TRANSPORTATION DEPARTMENT',
       'CORRECTIONS DEPT', 'CORRECTIONS DEPARTMENT',
       ...
       'ENVIRONMENTAL HEALTH DIV', 'MANCHESTER PUBLIC LIBRARY', 'TSP INC',
       'STERNE AGEE GROUP INC', 'KADEMENOS WISEHART HINES DOLYK',
       'WORK FORCE SVC', 'CULTURAL RESOURCES DEPT', 'COAST DENTAL SVC INC',
       'KENNIE'S MARKETS INC', 'BANCSHARES INC'],
      dtype='object', length=1975)

In [14]:
hqsOnly = hqsOnly[~hqsOnly.company.isin(toDiscard)]
hq      = hq[~hq.company.isin(toDiscard)]

In [15]:
hq.shape

(394920, 10)

At this point, we have a unique record of every company - hq here. Some of these may well be duplicate entries for a given company, for the cases in which we have a company that has multiple hq.

Let's stash it so that we don't have to go through the above ^^ again.

In [16]:
hqsOnly.to_csv("../../data/ig2010s_uniqueHQs.csv")

In [17]:
hq.to_csv("../../data/ig2010s_uniqueHQs_multLocations.csv")

In [49]:
hqsOnly     = pd.read_csv("../../data/ig2010s_uniqueHQs.csv").drop(columns = {'Unnamed: 0'})
hqsWithYear = pd.read_csv("../../data/ig2010s_uniqueHQs_multLocations.csv",dtype={'zipcode': 'object'})[['abi','company',
                                                                             'archive_version_year',
                                                                             'state','city','zipcode']]

In [50]:
hqsOnly.head()

Unnamed: 0,abi,company
0,7609,SOLITRON DEVICES INC
1,21311,WESTERN STATES ENVELOPE & LBL
2,29603,THIELE KAOLIN CO
3,71340,TRI STAFF GROUP
4,77743,NATIONAL TECHNICAL SYSTEMS INC


In [51]:
hqsWithYear.head()

Unnamed: 0,abi,company,archive_version_year,state,city,zipcode
0,7609,SOLITRON DEVICES INC,2010.0,FL,WEST PALM BEACH,33407
1,21311,WESTERN STATES ENVELOPE & LBL,2010.0,WI,BUTLER,53007
2,29603,THIELE KAOLIN CO,2010.0,GA,SANDERSVILLE,31082
3,71340,TRI STAFF GROUP,2010.0,CA,SAN DIEGO,92122
4,77743,NATIONAL TECHNICAL SYSTEMS INC,2010.0,CA,CALABASAS,91302


In [52]:
hqsWithYear = hqsWithYear[hqsWithYear.archive_version_year <= 2018]

In [53]:
hqsWithYear['zipcode']

0         33407
1         53007
2         31082
3         92122
4         91302
          ...  
316021    17101
316022    92008
316023    46530
316024    06437
316025    90222
Name: zipcode, Length: 316026, dtype: object

In [54]:
hqsWithYear['last_year'] = hqsWithYear.groupby(['abi'])['archive_version_year'].transform(max)

In [55]:
print(hqsWithYear.shape)

lastHQs = hqsWithYear[hqsWithYear.archive_version_year == hqsWithYear.last_year][['abi','company','zipcode']]

(316026, 7)


In [56]:
lastHQs.zipcode

17        80401
20        37771
23        19006
24        60018
43        77845
          ...  
316021    17101
316022    92008
316023    46530
316024    06437
316025    90222
Name: zipcode, Length: 54200, dtype: object

## Grab Compustat Data

First filter down to the companies for whom we have the supply chain information.

In [89]:
c_links = pd.read_csv("../../data/companyData/compustatSCLinked.csv")

c_links['year'] = c_links.srcdate.astype('str').str.slice(0,4).astype('int64')

c_links = c_links[c_links.year > 2009]

relevant_gvkeys = c_links.gvkey.append(c_links.cgvkey).drop_duplicates()

print(c_links.head(),relevant_gvkeys.shape)

     gvkey                        conm  cgvkey                       cconm  \
80    1013  ADC TELECOMMUNICATIONS INC    9899                    AT&T INC   
81    1013  ADC TELECOMMUNICATIONS INC    2136  VERIZON COMMUNICATIONS INC   
281   1094                  ACETO CORP   31673      AMERISOURCEBERGEN CORP   
282   1094                  ACETO CORP   31673      AMERISOURCEBERGEN CORP   
283   1094                  ACETO CORP    7171               MCKESSON CORP   

                       cnms   srcdate  cid  sid    ctype   salecs     scusip  \
80                     AT&T  20100930   16    0  COMPANY  300.000  000886309   
81   VERIZON COMMUNICATIONS  20100930   13    0  COMPANY  146.000  000886309   
281  AmerisourceBergen Corp  20160630   13    0  COMPANY   78.193  004446100   
282  AmerisourceBergen Corp  20170630   13    0  COMPANY   76.598  004446100   
283           McKesson Corp  20170630   19    0  COMPANY   70.215  004446100   

       stic     ccusip ctic  year  
80   ADCT.1  0

Get the company dataset and check.

The legal name and the given name are slightly different, but basically the same modulo punctuation and case.

In [107]:
c_addresses = pd.read_csv("../../data/companyData/compustatAddresses.csv")[['fyear',
                                                                            'gvkey',
                                                                            'conm',
                                                                            'addzip',
                                                                           'naics']].drop_duplicates().rename(columns = {'fyear': 'year'})

In [108]:
c_addresses.columns

Index(['year', 'gvkey', 'conm', 'addzip', 'naics'], dtype='object')

In [93]:
chq = pd.read_csv("../../data/companyData/compustatChanges_2010s.csv").drop(columns = {'Unnamed: 0'})

In [94]:
print(chq.columns,chq.shape)

Index(['year', 'qtr', 'gvkey', 'companyName', 'tic', 'naics', 'curcdq',
       'incomeChange', 'revenueChange', 'revenueChangeAbsolute', 'costChange',
       'inventoryChange'],
      dtype='object') (382298, 12)


In [95]:
max(chq.year)

2018

Subset this to focus on firms in: ag, mining, construction, manufacturing, wholesale and retail, and transportation.

In [62]:
chq = chq[(chq.naics.astype('str').str.slice(0,2).isin(['11','21','22','23','31','32',
                                                         '33','42','44','45','48','49']))]

In [63]:
print(chq.head(),chq.shape)

   year  qtr  gvkey companyName  tic     naics curcdq  incomeChange  \
0  2010  1.0   1004    AAR CORP  AIR  423860.0    USD      0.213983   
1  2010  2.0   1004    AAR CORP  AIR  423860.0    USD      0.045617   
2  2010  3.0   1004    AAR CORP  AIR  423860.0    USD      0.153198   
3  2010  4.0   1004    AAR CORP  AIR  423860.0    USD     -0.398739   
4  2011  1.0   1004    AAR CORP  AIR  423860.0    USD      0.096386   

   revenueChange  revenueChangeAbsolute  costChange  inventoryChange  
0       0.200565                 81.107    0.178258         0.055631  
1       0.059577                 27.099    0.017636         0.086776  
2       0.166276                 76.160    0.129456         0.169270  
3       0.174283                 85.020    0.136013         0.182304  
4       0.133883                 65.000    0.101523         0.104150   (179948, 12)


In [64]:
chq = chq[['gvkey', 'companyName', 'naics']].drop_duplicates()

In [29]:
print(chq.shape)

chq = chq.merge(c_addresses)

print(chq.shape)

(7743, 3)
(7743, 5)


In [30]:
chq.rename(columns = {'companyName': 'company'},inplace = True)
chq.drop(columns = {'conm'},inplace = True)
chq.head()

Unnamed: 0,gvkey,company,naics,addzip
0,1004,AAR CORP,423860.0,60191
1,1045,AMERICAN AIRLINES GROUP INC,481111.0,76155
2,1050,CECO ENVIRONMENTAL CORP,333413.0,75254
3,1072,AVX CORP,334416.0,29644
4,1075,PINNACLE WEST CAPITAL CORP,2211.0,85072-3999


Only two of these company names appear 2x, which is good. There are ~20,000 companies in this sample.

Let's go through a little bit of a process here:
- Find the exact matches.
- Get a similarity measure between ; ideally something vectorized / something in matrix math.
- Find the top 10 matches for the remaining ones.
- Do some mix and match and see if there's any threshold at which matches become similar ``enough'' to say this is okay and good to go.


We might be able to use the fact that all of the addresses should be the same after some given point, as the compustat addresses are only the most recent ones. 

Let's try a few different ways to match these up.

First, let's find the exact matches.

Make a generic cleaning function that strips out all company names, any punctuation in the name, and makes everything lower case.

In [31]:
def cleanText(text):
    text = text.\
    replace(" CORP","").replace(" CO","").replace(" INC","").\
    replace(" LTD","").replace(" -CL A","").\
    replace(" -LP","").replace(" LP","").\
    replace("-OLD","").replace(" LLC","").\
    replace(" -CL B","").replace(" -CL i","").replace(" -CL","").\
    replace("-REDH","").replace(" CP","").\
    replace("-ADR","").replace(" PLC","").lower().replace(r'[^\w\s]+', '')
    
    
    return text

In [32]:
chq.head()

Unnamed: 0,gvkey,company,naics,addzip
0,1004,AAR CORP,423860.0,60191
1,1045,AMERICAN AIRLINES GROUP INC,481111.0,76155
2,1050,CECO ENVIRONMENTAL CORP,333413.0,75254
3,1072,AVX CORP,334416.0,29644
4,1075,PINNACLE WEST CAPITAL CORP,2211.0,85072-3999


In [33]:
chq['company']               = list(map(cleanText, chq.company))
lastHQs['company']           = list(map(cleanText, lastHQs.company))

NAICS names do not match up between compustat and infogroup so they're not helpful.

In [34]:
chq.head()

Unnamed: 0,gvkey,company,naics,addzip
0,1004,aar,423860.0,60191
1,1045,american airlines group,481111.0,76155
2,1050,ceco environmental,333413.0,75254
3,1072,avx,334416.0,29644
4,1075,pinnacle west capital,2211.0,85072-3999


In [35]:
lastHQs.head()

Unnamed: 0,abi,company,zipcode
17,158329,caridian bct,80401
20,211946,family brands intl,37771
23,258574,jade,19006
24,262493,montana metal products,60018
43,455154,o i,77845


## Match on company name directly

In [36]:
nameMerge = chq.merge(lastHQs)
nameMerge.shape

(3404, 6)

In [37]:
nameMerge.head()

Unnamed: 0,gvkey,company,naics,addzip,abi,zipcode
0,1004,aar,423860.0,60191,115523672,60191
1,1045,american airlines group,481111.0,76155,7501711,76155
2,1050,ceco environmental,333413.0,75254,596284992,75254
3,1075,pinnacle west capital,2211.0,85072-3999,4554051,85004
4,1078,abbott laboratories,334510.0,60064-6400,4352373,60064


In [38]:
nameMerge.zipcode 

0       60191
1       76155
2       75254
3       85004
4       60064
        ...  
3399    33394
3400    46032
3401    06902
3402    02210
3403    75201
Name: zipcode, Length: 3404, dtype: object

In [42]:
nameMerge[nameMerge.addzip != nameMerge.zipcode][0:50]

Unnamed: 0,gvkey,company,naics,addzip,abi,zipcode
3,1075,pinnacle west capital,2211.0,85072-3999,4554051,85004
4,1078,abbott laboratories,334510.0,60064-6400,4352373,60064
5,1094,aceto,424690.0,11050,433140944,77010
6,1104,acme united,332215.0,06484,406179390,6824
11,1209,air products & chemicals,325120.0,18106-5500,4692679,18195
15,1234,atrion,339112.0,75002-4211,441309168,75002
20,1327,skyworks solutions,334413.0,92617,3563285,1801
22,1356,alcoa,3364.0,10022-4608,440350715,15212
23,27638,alcoa,3313.0,15212-5858,440350715,15212
26,1397,american biltrite,322220.0,02481-2097,107630048,2481


Now focus down onto the companies that have not been matched.

In [251]:
chqUnmatched = chq[~chq.company.isin(nameMerge.company)].reset_index()
chqUnmatched.shape

(4201, 4)

In [252]:
igUnmatched  = hq[~hq.company.isin(nameMerge.company)].reset_index()
igUnmatched.shape

(74128, 5)

# Find Distance

Two distance measures here. Look at top 5 matches and pull the distance measure and matches as well.

### Levenshtein

In [180]:
from Levenshtein import distance as levenshtein_distance

In [334]:
companyArrayCStat = []

company = chqUnmatched.company[0]
start = time.time()
for company in chqUnmatched.company:
    thisCompany = []
    for ig in igUnmatched.company:
        thisCompany.append(levenshtein_distance(company,ig))
    
    companyArrayCStat.append([thisCompany])

allLD = np.concatenate(companyArrayCStat)


In [367]:
n = 10
singleLargestLV   = (allLD).argsort(axis=-1)[:, :1]
largestElementsLV = (allLD).argsort(axis=-1)[:, :n]


companyMatches = pd.DataFrame()
companyMatches['cstatCompanies'] = chqUnmatched.company

for i in range(0,companyMatches.shape[0]):
    # print(list(np.array(hq.company)[largestElements[i]]))
    companyMatches.at[i,'misspelling']         = allLD[i,:][singleLargestLV[i]]
    companyMatches.at[i,'levCompany']          = np.array(igUnmatched.company)[singleLargestLV[i]]
    companyMatches.at[i,'closestMatchIG']      = np.array(igUnmatched.company)[largestElementsLV[i]]
    companyMatches.at[i,'LevSim']              = np.array(allLD[i,:][largestElementsLV[i]], dtype=object)

Now get the embeddings and the cosine similarity between them.

In [257]:
def getMatrix(companyEmbeddings):
    companyArray = []
    
    for companies in companyEmbeddings:
        companyArray.append([companies.vector])
    
    companyArray = np.concatenate(companyArray)
    
    return(companyArray)
        

In [184]:
chqUnmatchedList = list(map(nlp, chqUnmatched.company))

In [179]:
start = time.time()
allCompaniesIG = list(map(nlp, igUnmatched.company))

time.time() - start

199.3105640411377

In [None]:
outfile =  '../../data/allCompaniesIG_embeddings.pkl'
with open(outfile, 'wb') as pickle_file:
    pkl.dump(allCompaniesIG, pickle_file)
    
outfile =  '../../data/allCompaniesCStat_embeddings.pkl'
with open(outfile, 'wb') as pickle_file:
    pkl.dump(allCompaniesCStat, pickle_file)    

In [283]:
cstat = getMatrix(chqUnmatchedList)
ig = getMatrix(allCompaniesIG)

In [284]:
allSimilarities = cosine_similarity(cstat,ig)

In [285]:
allSimilarities.shape

(4201, 74128)

Each row n here has the similarity between the nth company name in compustat and the IG company corresp to that column.

In [286]:
allSimilarities[0:5,:]

array([[-0.10962249,  0.20628196, -0.10305265, ...,  0.0833259 ,
        -0.01536614, -0.14353019],
       [ 0.33820218,  0.00172063,  0.2970876 , ...,  0.2250421 ,
         0.16540973,  0.30576435],
       [ 0.3390207 ,  0.08955836,  0.2373584 , ...,  0.3363049 ,
         0.10127196,  0.11704967],
       [ 0.25192523,  0.04667885,  0.19891556, ...,  0.32147086,
         0.18959029,  0.03053621],
       [ 0.36714765, -0.01750274,  0.33334446, ...,  0.3137754 ,
         0.1732849 ,  0.21875949]], dtype=float32)

Find indices of companies in IG most similar to each company in CStat.

In [373]:
n = 10
largestElementsCos = (-allSimilarities).argsort(axis=-1)[:, :n]
singleLargestCos = (-allSimilarities).argsort(axis=-1)[:, :1]

Add the cosine similarity measures to the similarity dataset.

In [374]:
for i in range(0,companyMatches.shape[0]):
    # print(list(np.array(hq.company)[largestElements[i]]))
    companyMatches.at[i,'cosSimilarity']        = allSimilarities[i,:][singleLargestCos[i]]
    companyMatches.at[i,'cosSimilarityCompany'] = np.array(igUnmatched.company)[singleLargestCos[i]]
    companyMatches.at[i,'closestMatchCosine'] = np.array(igUnmatched.company)[largestElementsCos[i]]
    companyMatches.at[i,'cosineSim']          = np.array(allSimilarities[i,:][largestElementsCos[i]], dtype=object)

In [375]:
companyMatches.head()

Unnamed: 0,cstatCompanies,misspelling,levCompany,closestMatchIG,LevSim,cosSimilarity,cosSimilarityCompany,closestMatchCosine,cosineSim
0,avx,1.0,[avp],"[avp, trx, inx, gpx, ckx, acs, anr, box, mnx, ...","[1, 2, 2, 2, 2, 2, 2, 2, 2, 2]",0.420928,[esm ferolie],"[esm ferolie, jff, chj, tkb intl, ilg, kawasak...","[0.42092809081077576, 0.41812896728515625, 0.3..."
1,asm international nv,4.0,[astm international],"[astm international, asa international, nsf in...","[4, 4, 5, 5, 5, 5, 5, 5, 5, 5]",0.715796,[asm america],"[asm america, lyondell basell industries nv, a...","[0.7157955169677734, 0.6497361660003662, 0.644..."
2,agnico eagle mines,8.0,[accolade wines],"[accolade wines, eagle diner, golden eagle int...","[8, 9, 9, 9, 9, 9, 9, 9, 9, 9]",0.78448,[colorado gold mines],"[colorado gold mines, gold crest mines, b4mc g...","[0.7844799160957336, 0.7792075872421265, 0.766..."
3,spire alabama,5.0,[swirecala usa],"[swirecala usa, ire global, prevail bank, viro...","[5, 6, 7, 7, 7, 7, 7, 7, 7, 7]",0.754357,[alabama forestymmission],"[alabama forestymmission, life spire of virgin...","[0.7543568015098572, 0.7075667977333069, 0.663..."
4,alabama power,4.0,[alstom power],"[alstom power, ram power, manpower, solar powe...","[4, 5, 6, 6, 6, 6, 6, 6, 6, 6]",0.813616,[alabama electric],"[alabama electric, vermont electric power, fir...","[0.8136155009269714, 0.7850184440612793, 0.767..."


In [384]:
companyMatches[(companyMatches.levCompany == companyMatches.cosSimilarityCompany) & (companyMatches.misspelling < 10)][400:450]

Unnamed: 0,cstatCompanies,misspelling,levCompany,closestMatchIG,LevSim,cosSimilarity,cosSimilarityCompany,closestMatchCosine,cosineSim
3643,secure energy services,7.0,[energy services],"[energy services, emergency services, elderly ...","[7, 8, 9, 9, 9, 9, 9, 9, 9, 9]",0.915964,[energy services],"[energy services, energy services groupnstr, e...","[0.9159643650054932, 0.9159643054008484, 0.852..."
3656,nxp semiconductors nv,3.0,[nxp semiconductors],"[nxp semiconductors, nxp semiconductors, nxp s...","[3, 3, 4, 7, 8, 9, 9, 9, 10, 11]",0.886324,[nxp semiconductors],"[nxp semiconductors, nxp semiconductors, nxp s...","[0.8863240480422974, 0.8863240480422974, 0.798..."
3662,stonegate agricom,7.0,[stonegate bank],"[stonegate bank, stonegate mortgage, synovate ...","[7, 7, 8, 9, 9, 9, 9, 9, 9, 9]",0.74088,[stonegate bank],"[stonegate bank, stonegate mortgage, stonegate...","[0.7408800721168518, 0.721794605255127, 0.5772..."
3685,gold standard ventures,6.0,[gold standard enterprises],"[gold standard enterprises, lstar ventures, co...","[6, 8, 9, 9, 9, 9, 9, 9, 10, 10]",0.912064,[gold standard enterprises],"[gold standard enterprises, international silv...","[0.9120638370513916, 0.7628298997879028, 0.759..."
3686,global battery metals,7.0,[american battery metals],"[american battery metals, global baristas, alc...","[7, 8, 9, 9, 9, 9, 10, 10, 10, 10]",0.861582,[american battery metals],"[american battery metals, international batter...","[0.8615818023681641, 0.7732982039451599, 0.749..."
3700,china kanghui holdings,8.0,[china trust holdings],"[china trust holdings, china voice holding, ch...","[8, 9, 9, 10, 10, 10, 10, 10, 10, 10]",0.909243,[china trust holdings],"[china trust holdings, yum china holdings, tei...","[0.9092429876327515, 0.84050053358078, 0.81798..."
3705,global brass &pper hldgs,3.0,[global brass &pper holdings],"[global brass &pper holdings, global cash acce...","[3, 11, 12, 12, 12, 12, 13, 13, 13, 13]",0.910043,[global brass &pper holdings],"[global brass &pper holdings, alaskanpper & br...","[0.9100431203842163, 0.8418891429901123, 0.785..."
3707,horizon therapeutics pub,4.0,[horizon therapeutics],"[horizon therapeutics, hotspot therapeutics, p...","[4, 8, 9, 9, 9, 9, 9, 9, 9, 9]",0.838753,[horizon therapeutics],"[horizon therapeutics, turning point therapeut...","[0.8387528657913208, 0.6750178337097168, 0.657..."
3713,pacific biosciences of calif,3.0,[pacific biosciences of ca],"[pacific biosciences of ca, pacific bioscience...","[3, 9, 12, 12, 13, 13, 14, 14, 14, 14]",0.872247,[pacific biosciences of ca],"[pacific biosciences of ca, pacific bioscience...","[0.8722467422485352, 0.8425444960594177, 0.793..."
3719,global pharm holdings group,5.0,[global energy holdings group],"[global energy holdings group, orchard holding...","[5, 8, 10, 10, 11, 11, 11, 11, 11, 11]",0.854724,[global energy holdings group],"[global energy holdings group, global diversif...","[0.8547235727310181, 0.8481877446174622, 0.843..."


In [360]:
companyMatches.sort_values(by=['misspelling'],inplace = True)

In [361]:
companyMatches.to_csv("../../data/companyData/closestMatch.csv")

In [355]:
companyMatches.head()

Unnamed: 0,cstatCompanies,misspelling,closestMatchIG,LevSim,cosSimilarity,closestMatchCosine,cosineSim
3649,reald,1.0,"[real d, deal, recall, rebold, recall, rand, n...","[1, 2, 2, 2, 2, 2, 2, 2, 2, 2]",-0.038573,"[real d, deal, recall, rebold, recall, rand, n...","[-0.03857274726033211, -0.11090588569641113, -..."
716,heron lake bioenergy,1.0,"[heron lake bio energy, abengoa bioenergy, geo...","[1, 8, 9, 9, 9, 9, 10, 10, 10, 10]",0.828559,"[heron lake bio energy, abengoa bioenergy, geo...","[0.8285592794418335, 0.5588446259498596, 0.378..."
288,west pharmaceutical svsc,1.0,"[west pharmaceutical svc, osi pharmaceuticals,...","[1, 7, 7, 7, 7, 7, 7, 7, 8, 8]",0.827309,"[west pharmaceutical svc, osi pharmaceuticals,...","[0.827308714389801, 0.42986994981765747, 0.575..."
2647,seracare life sciences,1.0,"[sera care life sciences, miraca life sciences...","[1, 4, 5, 6, 6, 7, 7, 7, 7, 7]",0.826967,"[sera care life sciences, miraca life sciences...","[0.8269668221473694, 0.9999998211860657, 0.854..."
20,arts way mfg,1.0,"[art's way mfg, arti flex mfg, ro man mfg, eas...","[1, 5, 5, 6, 6, 6, 6, 7, 7, 7]",0.826265,"[art's way mfg, arti flex mfg, ro man mfg, eas...","[0.8262650966644287, 0.4649088978767395, 0.572..."
