In [1]:
import pickle as pkl
import pandas as pd
import matplotlib
import os
import re

import collections
import datetime
import time

import geopandas as gpd

import numpy as np

import rasterio

# Get Full Infogroup (IG) Dataset

In [2]:
# Location of the raw Infogroup ("IG") 2010s CSV, given relative to the notebook's working directory.
file = "../../data/companyData/infogroup2010s.csv"

In [3]:
import dask.dataframe as dd

# Identifier / code columns that are read as strings ('object') instead of
# being type-inferred.
string_columns = ['parent_number', 'parent_employee_size_code',
                  'parent_sales_volume_code', 'abi']

# assume_missing=True makes dask treat integer-looking columns without an
# explicit dtype as floats, so missing values do not break partitions
# (this is why the preview shows values such as 2010.0).
df = dd.read_csv(
    file,
    assume_missing=True,
    dtype={column: 'object' for column in string_columns},
    low_memory=False,
)

# Keep only the rows whose business_status_code equals 1.0.
df = df[df['business_status_code'] == 1.0]


  import pandas.util.testing as tm


In [4]:
# Preview the first rows of the filtered frame to sanity-check the parsed columns.
df.head()

Unnamed: 0,archive_version_year,abi,ticker,company,city,state,zipcode,location_employee_size_code,location_sales_volume_code,primary_sic_code,primary_naics_code,sic_code,business_status_code,company_holding_status,parent_employee_size_code,parent_sales_volume_code,cbsa_code,latitude,longitude
8,2010.0,7609,SODI,SOLITRON DEVICES INC,WEST PALM BEACH,FL,33407.0,E,,367401.0,33441302.0,362998.0,1.0,1.0,E,,33100.0,26.7412,-80.06694
41,2010.0,21311,,WESTERN STATES ENVELOPE & LBL,BUTLER,WI,53007.0,H,,267201.0,32222006.0,511216.0,1.0,,H,,33340.0,43.09799,-88.07399
58,2010.0,29603,,THIELE KAOLIN CO,SANDERSVILLE,GA,31082.0,G,,329598.0,32799204.0,145598.0,1.0,,G,,0.0,32.96893,-82.81953
207,2010.0,71340,,TRI STAFF GROUP,SAN DIEGO,CA,92122.0,E,,736304.0,56132001.0,736103.0,1.0,,E,,41740.0,32.85445,-117.18594
216,2010.0,77743,,NATIONAL TECHNICAL SYSTEMS INC,CALABASAS,CA,91302.0,C,,873402.0,54138023.0,382998.0,1.0,1.0,C,,31080.0,34.15562,-118.65163


In [5]:
# Materialize the distinct (abi, company) pairs from the lazy dask frame.
# NOTE(review): num_workers is hard-coded to 100 — confirm this suits the
# machine the notebook runs on.
hq = (
    df[['abi', 'company']]
    .drop_duplicates()
    .compute(num_workers=100)
)

In [6]:
# (rows, columns) of the deduplicated (abi, company) table.
hq.shape

(84213, 2)

Some company names appear under more than one abi number; it looks like these are primarily generic names shared by different government agencies.

In [19]:
# Company names that occur in more than 10 rows of hq.
# The counts are computed once and reused, rather than calling
# value_counts() twice (once for the values, once for the mask).
company_counts = hq.company.value_counts()
company_counts[company_counts > 10].index

Index(['GOVERNOR'S OFFICE', 'ATTORNEY GENERAL', 'SECRETARY OF STATE',
       'CHIEF OF STAFF', 'LIEUTENANT GOVERNOR', 'PRESS SECRETARY',
       'LIEUTENANT GOVERNOR'S OFFICE', 'TRANSPORTATION DEPARTMENT',
       'CORRECTIONS DEPT', 'CORRECTIONS DEPARTMENT',
       'SUPREME COURT CHIEF JUSTICE', 'EDUCATION DEPT', 'SUPREME COURT CLERK',
       'TRANSPORTATION DEPT', 'AGRICULTURE DEPT', 'EMERGENCY MEDICAL SVC',
       'ADJUTANT GENERAL', 'AGRICULTURE DEPARTMENT', 'ELECTIONS DIVISION',
       'EDUCATION DEPARTMENT', 'STATE VETERINARIAN', 'FIRE MARSHAL',
       'STATE LIBRARY', 'STATE TREASURER', 'REVENUE DEPARTMENT',
       'INSURANCE DEPT', 'LABOR DEPT', 'ETHICS COMMISSION',
       'NATURAL RESOURCES DEPT', 'REVENUE DEPT', 'GEOLOGICAL SURVEY',
       'CORPORATIONS DIVISION', 'LABOR DEPARTMENT', 'HEALTH DEPARTMENT',
       'PUBLIC SAFETY DEPT', 'HUMAN SERVICES DEPT', 'RACING COMMISSION',
       'PUBLIC SERVICE COMMISSION', 'SECURITIES DIVISION',
       'OCCUPATIONAL SAFETY & HEALTH', 'ADMI

In [20]:
# Company names that occur in more than one row of hq, i.e. names that do not
# identify a single record. The counts are computed once and reused, rather
# than calling value_counts() twice.
company_counts = hq.company.value_counts()
toDiscard = company_counts[company_counts > 1].index

# Print every repeated name, one per line, for manual inspection.
for company in toDiscard:
    print(company)

GOVERNOR'S OFFICE
ATTORNEY GENERAL
SECRETARY OF STATE
CHIEF OF STAFF
LIEUTENANT GOVERNOR
PRESS SECRETARY
LIEUTENANT GOVERNOR'S OFFICE
TRANSPORTATION DEPARTMENT
CORRECTIONS DEPT
CORRECTIONS DEPARTMENT
SUPREME COURT CHIEF JUSTICE
EDUCATION DEPT
SUPREME COURT CLERK
TRANSPORTATION DEPT
AGRICULTURE DEPT
EMERGENCY MEDICAL SVC
ADJUTANT GENERAL
AGRICULTURE DEPARTMENT
ELECTIONS DIVISION
EDUCATION DEPARTMENT
STATE VETERINARIAN
FIRE MARSHAL
STATE LIBRARY
STATE TREASURER
REVENUE DEPARTMENT
INSURANCE DEPT
LABOR DEPT
ETHICS COMMISSION
NATURAL RESOURCES DEPT
REVENUE DEPT
GEOLOGICAL SURVEY
CORPORATIONS DIVISION
LABOR DEPARTMENT
HEALTH DEPARTMENT
PUBLIC SAFETY DEPT
HUMAN SERVICES DEPT
RACING COMMISSION
PUBLIC SERVICE COMMISSION
SECURITIES DIVISION
OCCUPATIONAL SAFETY & HEALTH
ADMINISTRATION DEPT
WORKERS COMPENSATION
INSURANCE DEPARTMENT
ARTS COUNCIL
VOCATIONAL REHABILITATION
CHILD SUPPORT ENFORCEMENT
HUMAN RIGHTS COMMISSION
LAW LIBRARY
FORESTRY DIVISION
HEALTH DEPT
STATE POLICE
EMERGENCY MANAGEMENT AGE

At this point, `hq` holds one row per unique (abi, company) pair. Some of these rows may still be duplicate entries for the same company, e.g. when a company has multiple headquarters. Note that nothing above removes the names in `toDiscard` from `hq`.

Let's stash it so that we don't have to go through the above ^^ again.

In [None]:
# Cache the deduplicated (abi, company) table so the dask pass above need not be rerun.
# NOTE(review): to_csv writes the index by default, so reloading this file yields an extra
# unnamed index column unless it is read with index_col=0 — confirm that is intended.
hq.to_csv("../../data/ig2010s_uniqueHQs.csv")

Load the Compustat company dataset and inspect it.

The legal name and the given name are slightly different, but basically the same modulo punctuation and case.

In [None]:
# Load the Compustat address extract for the 2010s.
# NOTE(review): no dtype is given, so `addzip` is type-inferred; if it holds purely numeric
# ZIP codes, leading zeros would be lost — confirm before using it for matching.
chq = pd.read_csv("../../data/companyData/compustatAddresses2010s.csv")

In [None]:
# List the available columns before selecting the ones needed for matching.
chq.columns

In [None]:
# Reduce the Compustat frame to the identifier, name and ZIP columns, drop
# repeated rows, and rename `conm` to `company` so the name column matches
# the one in `hq`. Written as a single chain (no inplace=True) so the cell
# is one assignment with no separate in-place mutation step.
chq = (
    chq[['gvkey', 'conm', 'addzip']]
    .drop_duplicates()
    .rename(columns={'conm': 'company'})
)

Only two of these company names appear 2x, which is good. There are ~20,000 companies in this sample.

Let's go through a little bit of a process here:
- Find the exact matches.
- Get a similarity measure between the remaining Compustat and Infogroup company names; ideally something vectorized, i.e. expressible as matrix math.
- Find the top 10 matches for the remaining ones.
- Spot-check the candidate matches and see whether there is a similarity threshold above which matches are similar "enough" to accept as good to go.


We might be able to use the fact that all of the addresses should be the same after some given point, as the compustat addresses are only the most recent ones. 

In [None]:
# Preview the trimmed Compustat frame.
chq.head()

In [None]:
# Exact-name matches between the Compustat frame (chq) and the Infogroup
# frame (hq). The join key and join type are spelled out instead of relying
# on merge() inferring them from the shared column names, so the cell keeps
# working if either frame later gains another common column.
easyMerge = chq.merge(hq, on='company', how='inner')

# Report the number of matches, then show the first rows with the notebook's
# rich table display (last expression) rather than print()-ing a DataFrame.
print(easyMerge.shape)
easyMerge.head()

In [None]:
Let's try a few different ways to match these up.

First, let's find the exact matches.

In [None]:
Next, let's print out the e