In [None]:
import pickle as pkl
import pandas as pd
import matplotlib
import os
import re

import collections
import datetime
import time

import geopandas as gpd

import numpy as np

import rasterio

import spacy
  
nlp = spacy.load('en_core_web_lg')
from sklearn.metrics.pairwise import cosine_similarity

import itertools

# Get Full IG Dataset

In [None]:
file = "../../data/companyData/infogroup2010s.csv"

In [None]:
import dask.dataframe as dd

# Identifier / code columns that are read as strings instead of being
# type-inferred.
stringColumns = ['parent_number', 'parent_employee_size_code',
                 'parent_sales_volume_code', 'abi']

# Lazy dask read of the Infogroup CSV; nothing is loaded until .head() /
# .compute() is called on the result.
df = dd.read_csv(
    file,
    assume_missing=True,
    dtype={column: 'object' for column in stringColumns},
    low_memory=False,
)

# Keep only the rows whose business_status_code equals 1.0.
hasStatusOne = df.business_status_code == 1.0
df = df[hasStatusOne]


In [None]:
df.head()

In [None]:
# Distinct (abi, company) pairs; .compute() turns the lazy dask frame into
# an in-memory pandas DataFrame.
abiCompanyPairs = df[['abi', 'company']]
hq = abiCompanyPairs.drop_duplicates().compute(num_workers=100)

In [None]:
hq.shape

Some of the abi numbers seem to be duplicated; it looks like they might be primarily for different government agencies.

In [None]:
# Company names that occur more than 10 times in hq (counted once, then filtered).
companyNameCounts = hq.company.value_counts()
companyNameCounts[companyNameCounts > 10].index

In [None]:
# Boolean mask of the rows whose company name occurs more than once in hq.
# The duplicated names are computed inline because `toDiscard` is only
# defined in a later cell; referencing it here raised NameError on a fresh
# kernel (Restart & Run All).
hq.company.isin(hq.company.value_counts().loc[lambda counts: counts > 1].index)

In [None]:
# Names that appear on more than one row of hq.
nameCounts = hq.company.value_counts()
toDiscard = nameCounts[nameCounts > 1].index

# Print every name that is about to be removed, one per line.
for duplicatedName in toDiscard:
    print(duplicatedName)

# Drop all rows carrying one of those names.
hasDuplicatedName = hq.company.isin(toDiscard)
hq = hq[~hasDuplicatedName]

At this point, we have a unique record of every company - hq here. Some of these may well be duplicate entries for a given company, for the cases in which we have a company that has multiple hq.

Let's stash it so that we don't have to go through the above ^^ again.

In [None]:
# Stash the de-duplicated names. index=False keeps the row index out of the
# file: the index carries no information here, and writing it makes the
# reloaded frame pick up a stray "Unnamed: 0" column.
hq.to_csv("../../data/ig2010s_uniqueHQs.csv", index=False)

In [None]:
# Reload the stashed names. `abi` is an identifier and was forced to a string
# ('object') column when the raw data was read, so keep it a string here
# instead of letting read_csv infer a numeric dtype.
hq = pd.read_csv("../../data/ig2010s_uniqueHQs.csv", dtype={'abi': 'object'})

## Grab Compustat Data

Get the company dataset and check.

The legal name and the given name are slightly different, but basically the same modulo punctuation and case.

In [None]:
# Load the Compustat extract, then drop the unnamed first column
# (presumably a saved row index -- confirm against the file).
compustatRaw = pd.read_csv("../../data/companyData/compustatChanges_2010s.csv")
chq = compustatRaw.drop(columns=['Unnamed: 0'])

In [None]:
print(chq.columns,chq.shape)

Subset this to focus on firms in: ag, mining, utilities, construction, manufacturing, wholesale and retail, and transportation (two-digit NAICS prefixes 11, 21, 22, 23, 31-33, 42, 44-45, 48-49).

In [None]:
# Two-digit NAICS sector prefixes to keep.
keptNaicsPrefixes = ['11', '21', '22', '23', '31', '32',
                     '33', '42', '44', '45', '48', '49']

# Compare the first two characters of the NAICS code, read as text.
naicsPrefix = chq.naics.astype('str').str[:2]
chq = chq[naicsPrefix.isin(keptNaicsPrefixes)]

In [None]:
print(chq.head(),chq.shape)

In [None]:
# One row per distinct (gvkey, companyName) pair, with the name column
# renamed to `company` so it lines up with the IG frame.
chq = (
    chq[['gvkey', 'companyName']]
    .drop_duplicates()
    .rename(columns={'companyName': 'company'})
)

Only two of these company names appear 2x, which is good. There are ~20,000 companies in this sample.

Let's go through a little bit of a process here:
- Find the exact matches.
- Get a similarity measure between the remaining Compustat names and the IG names; ideally something vectorized / something in matrix math.
- Find the top 10 matches for the remaining ones.
- Do some mix and match and see if there's any threshold at which matches become similar "enough" to say this is okay and good to go.


We might be able to use the fact that all of the addresses should be the same after some given point, as the compustat addresses are only the most recent ones. 

In [None]:
chq.shape

In [None]:
chq.head()

Let's try a few different ways to match these up.

First, let's find the exact matches.

In [None]:
# Exact matches: inner join of the Compustat and IG frames on the column
# name(s) the two frames have in common.
easyMerge = pd.merge(left=chq, right=hq, how="inner")

mergeShape = easyMerge.shape
mergePreview = easyMerge.head()
print(mergeShape, mergePreview)

In [None]:
# Compustat rows whose name did not get an exact match above. reset_index()
# renumbers the rows from 0 and keeps the old labels in an `index` column.
exactlyMatched = chq.company.isin(easyMerge.company)
chqUnmatched = chq.loc[~exactlyMatched].reset_index()
chqUnmatched.shape

Make a generic cleaning function that strips out common corporate suffixes and share-class tags (CORP, INC, LTD, -CL A, ...), any punctuation in the name, and makes everything lower case.

In [None]:
# Corporate-form suffixes and share-class / status tags to drop from a name.
# Matched case-insensitively and only as whole tokens (trailing \b), so
# "ACME CO" loses " CO" while "ACME COMPUTERS" is left intact.
_COMPANY_SUFFIX_RE = re.compile(
    r"(?:\s-CL(?:\s[ABI])?|\s-LP|-OLD|-REDH|-ADR"
    r"|\s(?:CORP|CO|INC|LTD|LP|LLC|CP|PLC))\b",
    flags=re.IGNORECASE,
)

# Any run of characters that are neither word characters nor whitespace.
_PUNCTUATION_RE = re.compile(r"[^\w\s]+")


def cleanText(text):
    """Normalise a company name for fuzzy matching.

    Steps, in order:
      1. remove corporate suffixes and share-class / status tags
         (CORP, CO, INC, LTD, LP, LLC, CP, PLC, -LP, -OLD, -REDH, -ADR,
         -CL, -CL A, -CL B, -CL I) when they appear as whole tokens;
      2. lower-case the result;
      3. strip punctuation;
      4. collapse the whitespace left behind into single spaces.

    Differences from the previous implementation:
      * punctuation is now actually removed (``str.replace`` treated the
        regex as a literal string and never matched anything);
      * suffixes no longer match inside longer words (" CO" used to turn
        "ABC COMPUTERS" into "ABCMPUTERS");
      * non-string input (e.g. NaN from a CSV) returns "" instead of raising.

    Parameters
    ----------
    text : str
        Raw company name.

    Returns
    -------
    str
        The cleaned, lower-case name; "" when ``text`` is not a string.
    """
    # Missing names come through as float NaN / None; treat them as empty.
    if not isinstance(text, str):
        return ""

    withoutSuffixes = _COMPANY_SUFFIX_RE.sub("", text)
    withoutPunctuation = _PUNCTUATION_RE.sub("", withoutSuffixes.lower())

    # Removing tokens can leave doubled or trailing spaces behind.
    return " ".join(withoutPunctuation.split())

In [None]:
# Overwrite the name column of both frames with the cleaned names.
chqUnmatched['company'] = [cleanText(rawName) for rawName in chqUnmatched.company]
hq['company'] = [cleanText(rawName) for rawName in hq.company]

In [None]:
# One spaCy Doc per unmatched Compustat name. nlp.pipe streams the texts
# through the pipeline in batches and yields the Docs in input order, which
# avoids the per-call overhead of invoking nlp() once per name.
allCompaniesCStat = list(nlp.pipe(chqUnmatched.company))

In [None]:
# One spaCy Doc per IG name. nlp.pipe streams the texts through the pipeline
# in batches and yields the Docs in input order, which avoids the per-call
# overhead of invoking nlp() once per name.
allCompaniesIG = list(nlp.pipe(hq.company))

In [None]:
# (destination path, object to pickle) pairs, written in this order.
embeddingDumps = (
    ('../../data/allCompaniesIG_embeddings.pkl', allCompaniesIG),
    ('../../data/allCompaniesCStat_embeddings.pkl', allCompaniesCStat),
)

# Pickle each list of spaCy Docs to its own file.
for outfile, embeddings in embeddingDumps:
    with open(outfile, 'wb') as pickle_file:
        pkl.dump(embeddings, pickle_file)

In [None]:
def getMatrix(companyEmbeddings):
    """Stack the ``.vector`` of every item into one array, one row per item.

    Parameters
    ----------
    companyEmbeddings : iterable
        Objects exposing a ``.vector`` attribute; all vectors must have the
        same shape.

    Returns
    -------
    numpy.ndarray
        Array whose i-th row is the vector of the i-th item.

    Raises
    ------
    ValueError
        If ``companyEmbeddings`` is empty or the vectors differ in shape.
    """
    rowVectors = [embedding.vector for embedding in companyEmbeddings]
    return np.stack(rowVectors, axis=0)
        

In [None]:
# Embedding matrices (one row per company name): Compustat first, then IG.
cstat, ig = (
    getMatrix(embeddings)
    for embeddings in (allCompaniesCStat, allCompaniesIG)
)

In [None]:
ig.shape

In [None]:
# Pairwise cosine similarity: rows follow cstat, columns follow ig.
allSimilarities = cosine_similarity(X=cstat, Y=ig)

Each row n here has the similarity between the nth company name in Compustat and the IG company corresponding to that column.

In [None]:
allSimilarities[0:5,:]

Find indices of companies in IG most similar to each company in CStat.

In [None]:
# Number of candidate IG matches to keep per Compustat name.
n = 50

# Sorting the negated scores in ascending order ranks each row's columns from
# most to least similar; keep the first n column indices of every row.
descendingOrder = np.argsort(-allSimilarities, axis=1)
largestElements = descendingOrder[:, :n]

In [None]:
# Result table, starting with the cleaned Compustat names (same row index
# as chqUnmatched).
companyMatches = pd.DataFrame({'cstatCompanies': chqUnmatched.company})

In [None]:
companyMatches

In [None]:
print(companyMatches.shape[0])

In [None]:
# IG names as an array, built once. The original rebuilt this array from
# hq.company on every loop iteration.
igNames = np.array(hq.company)

# Index the names with the whole (rows, n) index matrix at once, then split
# the result into one 1-D array of candidate names per Compustat row.
closestNames = list(igNames[largestElements])

# Assign the column in one step as an object column (each cell holds an array
# of names, as before). Writing an array into a single cell of a not-yet-
# existing column via `.at` is fragile across pandas versions. Rows are paired
# positionally, which matches the old label-based loop because the frame's
# index runs 0..len-1.
companyMatches['closestMatch'] = pd.Series(
    closestNames, index=companyMatches.index, dtype=object
)

In [None]:
companyMatches

In [None]:
companyMatches.to_csv("../../data/companyData/closestMatch.csv")

In [None]:
# Other ideas here: try out the Levenshtein distance; try to match just on the first word