# Initial Set-Up

In [16]:
import pickle
import pandas as pd
import matplotlib
import os
import re

import collections
import datetime
import time

import geopandas as gpd

import numpy as np

import rasterio

from difflib import get_close_matches

from fuzzywuzzy import process
from fuzzywuzzy import fuzz

import dask.dataframe as dd

from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import haversine_distances

from math import radians

from scipy import stats

import multiprocessing
from multiprocessing import Pool

from sklearn.neighbors import KernelDensity

import gc

# Infogroup

We'll do two separate things in this file.
1. Get the headquarters
2. Find the weights by company location. 


In [2]:
file = "../../data/companyData/infogroup_firmHQs.csv"

In [None]:
import dask.dataframe as dd

df = dd.read_csv(file, assume_missing=True, 
                 dtype={'abi': 'object','zipcode': 'object'}, low_memory = False)

df = df # [df.business_status_code == 1.0]
df.head()

In [31]:
df.archive_version_year.min().compute()

1998.0

In [32]:
df.business_status_code.value_counts().compute()

1.0    781431
Name: business_status_code, dtype: int64

In [33]:
df.shape[0].compute()

781431

In [None]:
# abiRents = df[['abi','parent_number']].drop_duplicates().compute(num_workers = 25)

# abiRents.to_csv("../../data/abiRents.csv")

abiRents = pd.read_csv("../../data/abiRents.csv")

In [None]:
# differentParents.to_csv("../../data/differentParents2010s.csv")

In [34]:
hq = df[['abi','ticker','company','archive_version_year','state','city',
         'address_line_1','zipcode',
         'latitude','longitude']].drop_duplicates().compute(num_workers = 100)

hq.to_csv("../../data/tempHQs.csv")

In [None]:
hq = pd.read_csv("../../data/tempHQs.csv").drop(columns = {'Unnamed: 0'})

In [36]:
hqsOnly = hq[['abi','company']].drop_duplicates()

In [37]:
hqsOnly.company.value_counts()

FIRST NATIONAL BANK            87
GOVERNOR'S OFFICE              63
FIRST STATE BANK               53
ATTORNEY GENERAL               51
TRANSPORTATION DEPT            45
                               ..
DORCHESTER MINERALS LP          1
CRESCENT HEALTHCARE INC         1
CITY CAPITAL CORP               1
RELIANT TRANSPORTATION INC      1
USA COMPRESSION PARTNERS LP     1
Name: company, Length: 111646, dtype: int64

In [38]:
print(hq.shape,hqsOnly.shape)

(781431, 10) (119142, 2)


Some of the abi numbers seem to be duplicated; it looks like they might be primarily for different government agencies.

In [39]:
hqsOnly.company.value_counts()[hqsOnly.company.value_counts() > 10].index

Index(['FIRST NATIONAL BANK', 'GOVERNOR'S OFFICE', 'FIRST STATE BANK',
       'ATTORNEY GENERAL', 'TRANSPORTATION DEPT', 'CHIEF OF STAFF',
       'SECRETARY OF STATE', 'CORRECTIONS DEPT', 'LIEUTENANT GOVERNOR',
       'PRESS SECRETARY', 'EDUCATION DEPT', 'TRANSPORTATION DEPARTMENT',
       'LIEUTENANT GOVERNOR'S OFFICE', 'AGRICULTURE DEPT',
       'CORRECTIONS DEPARTMENT', 'SUPREME COURT CLERK',
       'SUPREME COURT CHIEF JUSTICE', 'STATE LIBRARY', 'ADJUTANT GENERAL',
       'EMERGENCY MEDICAL SVC', 'STATE VETERINARIAN', 'AGRICULTURE DEPARTMENT',
       'ELECTIONS DIVISION', 'FIRE MARSHAL', 'EDUCATION DEPARTMENT',
       'PEOPLES BANK', 'HEALTH DEPT', 'STATE TREASURER', 'GEOLOGICAL SURVEY',
       'INSURANCE DEPT', 'FIRST FEDERAL SAVINGS & LOAN', 'REVENUE DEPT',
       'LABOR DEPT', 'FARMERS & MERCHANTS BANK', 'ETHICS COMMISSION',
       'CORPORATIONS DIVISION', 'REVENUE DEPARTMENT', 'NATURAL RESOURCES DEPT',
       'PUBLIC SAFETY DEPT', 'FARMERS STATE BANK', 'HUMAN SERVICES DEPT',
  

Filter out the ones with duplicated HQs.

In [40]:
toDiscard = hqsOnly.company.value_counts()[hqsOnly.company.value_counts() > 1].index
for company in toDiscard:
    print(company)


FIRST NATIONAL BANK
GOVERNOR'S OFFICE
FIRST STATE BANK
ATTORNEY GENERAL
TRANSPORTATION DEPT
CHIEF OF STAFF
SECRETARY OF STATE
CORRECTIONS DEPT
LIEUTENANT GOVERNOR
PRESS SECRETARY
EDUCATION DEPT
TRANSPORTATION DEPARTMENT
LIEUTENANT GOVERNOR'S OFFICE
AGRICULTURE DEPT
CORRECTIONS DEPARTMENT
SUPREME COURT CLERK
SUPREME COURT CHIEF JUSTICE
STATE LIBRARY
ADJUTANT GENERAL
EMERGENCY MEDICAL SVC
STATE VETERINARIAN
AGRICULTURE DEPARTMENT
ELECTIONS DIVISION
FIRE MARSHAL
EDUCATION DEPARTMENT
PEOPLES BANK
HEALTH DEPT
STATE TREASURER
GEOLOGICAL SURVEY
INSURANCE DEPT
FIRST FEDERAL SAVINGS & LOAN
REVENUE DEPT
LABOR DEPT
FARMERS & MERCHANTS BANK
ETHICS COMMISSION
CORPORATIONS DIVISION
REVENUE DEPARTMENT
NATURAL RESOURCES DEPT
PUBLIC SAFETY DEPT
FARMERS STATE BANK
HUMAN SERVICES DEPT
LABOR DEPARTMENT
HEALTH DEPARTMENT
RACING COMMISSION
SECURITIES DIVISION
WORKERS COMPENSATION
ARTS COUNCIL
COMMERCE DEPT
PUBLIC SERVICE COMMISSION
LAW LIBRARY
OCCUPATIONAL SAFETY & HEALTH
VOCATIONAL REHABILITATION
CITIZENS 

DIVISION-CMNTY BEHAVIORAL HLTH
BUREAU OF MOTOR VEHICLES
FIRST AMERICAN BANCSHARES INC
BANK UNITED FINANCIAL CORP
VIRGINIA BEACH PUBLIC LIBRARY
VILLAGE BANK
GREEN SHIFT CORP
COUNTY BANK
RICHARDS GROUP
OFFICE OF INFO TECHNOLOGY
CENTENNIAL BANCSHARES INC
SAVINGS BANK
INN SEASON RESORTS
NATERRA LAND
METROPOLITAN LIBRARY SYSTEM
HALE-HALSELL CO
KANBAY INTERNATIONAL INC
TRANSPORTATION-PUBLIC TRANS
LIQUOR CONTROL DEPT
ROSS CO
INVISA INC
MC KEE FOODS CORP
MTM RECOGNITION
CORRECTIONS CORP OF AMERICA
EMMAUS LIFE SCIENCES INC
BRIDGETECH HOLDINGS INTL INC
QUEPASA CORP
PRO-FAC COOPERATIVE INC
AUTOCAM CORP
MARION COUNTY LIBRARY SYSTEM
PLANNING DIV
DEWMAR INTERNATIONAL BMC INC
APPLIED NANOSCIENCE INC
HUNTSMAN CORP
HILITE INTERNATIONAL INC
VIVA INTERNATIONAL INC
WILCOHESS LLC
EQUITY RESIDENTIAL
INSYN Q INC
GUARDIAN TECHNOLOGIES INTL INC
BEST CLEANERS
ALCOHOL & DRUG ABUSE OFC
FIRST TEXAS BANCORP INC
BROWN SMITH WALLACE LLC
DAKOTA MINNESOTA & EASTERN RR
ROLLS-ROYCE NORTH AMERICA INC
TRI-COUNTY NATIONAL B

In [41]:
hqsOnly = hqsOnly[~hqsOnly.company.isin(toDiscard)]
hq      = hq[~hq.company.isin(toDiscard)]

In [42]:
hq.shape

(667047, 10)

At this point, we have a unique record of every company - hq here. Some of these may well be duplicate entries for a given company, for the cases in which we have a company that has multiple hq.

Many of these companies are likely sole proprietorships.

Let's stash it so that we don't have to go through the above ^^ again.

In [43]:
hqsOnly.to_csv("../../data/ig_uniqueHQs.csv")

In [44]:
hq.to_csv("../../data/ig_uniqueHQs_multLocations.csv")

In [None]:
hqsOnly   = pd.read_csv("../../data/ig_uniqueHQs.csv").drop(columns = {'Unnamed: 0'})

hq        = pd.read_csv("../../data/ig_uniqueHQs_multLocations.csv").drop(columns = {'Unnamed: 0'})

In [None]:
hqsOnly.shape

In [None]:
hq.shape

In [None]:
hq.head()

# Shortlist of IG Cos
Focus on companies for whom we have a gvkey-ig link.

In [14]:
gvKey_abiLinkingTable = pd.read_csv('../../data/companyData/sc_linkingTable.csv').drop(columns = ['Unnamed: 0']).\
    drop_duplicates()
    
gvKey_abiLinkingTable.head()

Unnamed: 0,cstatCompanies,igCompanies,gvkey,abi
0,american software,american software,1562,4378204
1,burlingtonat factory invs,burlingtonat factory whse,2484,849416722
2,electro scientific inds,electro scientific industries,4274,9546995
3,kindercare learning centers,kindercare learning ctr,14835,2406528
4,schmitt industries or,schmitt industries,26520,479790834


0       004378204
1       849416722
2       009546995
3       002406528
4       479790834
          ...    
4858    424894115
4859    003464351
4860    507844918
4861    420377676
4862    950563882
Name: abi, Length: 4863, dtype: object

In [24]:
allIGCos = gvKey_abiLinkingTable.abi.astype('str').str.pad(width = 9, side = 'left', fillchar = '0').unique()



allIGCos[0:5]

array(['004378204', '849416722', '009546995', '002406528', '479790834'],
      dtype=object)

In [25]:
aCos = allIGCos[0:5]
firstCos

array(['004378204', '849416722', '009546995', '002406528', '479790834'],
      dtype=object)

In [27]:
with open('../../data/companyData/cos.txt', 'w') as f:
    for line in allIGCos:
        f.write(f"{line}\n")

In [36]:
parents = subset.parent_number.unique()

In [38]:
subsetTest = df[df.parent_number.isin(parents)].compute(workers = 100)
subsetTest.shape

  return func(*args, **kwargs)
  return func(*args, **kwargs)
  return func(*args, **kwargs)


(15001054, 6)

In [41]:
len(subsetTest.parent_number.unique())

5556

# Find Fraction of Employees at Locations

The code commented out below lets us scale if we do the full infogroup file.

In [4]:
df1 = dd.read_csv( "../../data/companyData/allIGData.csv", assume_missing=True, 
                 dtype={'parent_number': 'object','parent_employee_size_code': 'object',
                       'parent_sales_volume_code': 'object',
                       'abi': 'object'})[['archive_version_year', 'parent_number',
                       'employee_size_location','latitude', 'longitude','zipcode']]

df1 = df1[~df1.parent_number.isna()]

In [5]:
df1.columns

Index(['archive_version_year', 'parent_number', 'employee_size_location',
       'latitude', 'longitude', 'zipcode'],
      dtype='object')

In [6]:
df2 = dd.read_csv( "../../data/companyData/allIG_pre2003.csv", assume_missing=True, 
                 dtype={'parent_number': 'object','parent_employee_size_code': 'object',
                       'parent_sales_volume_code': 'object',
                       'abi': 'object'})[['archive_version_year', 'parent_number',
                       'employee_size_location','latitude', 'longitude','zipcode']]

df2 = df2[~df2.parent_number.isna()]

df2.columns

Index(['archive_version_year', 'parent_number', 'employee_size_location',
       'latitude', 'longitude', 'zipcode'],
      dtype='object')

In [9]:
df = df1.append(df2)

# df = df[df.parent_number.isin(parents)].compute(workers = 100)
df = df.compute(workers = 100)
df.head()

  return func(*args, **kwargs)
  return func(*args, **kwargs)
  return func(*args, **kwargs)


Unnamed: 0,archive_version_year,parent_number,employee_size_location,latitude,longitude,zipcode
0,2003.0,7508146,125.0,33.24433,-87.49823,35404.0
3,2003.0,441297108,168.0,33.54118,-84.55948,30213.0
7,2003.0,222904740,170.0,40.80478,-81.5323,44647.0
8,2003.0,212928204,250.0,39.95779,-91.36099,62301.0
10,2003.0,7530124,700.0,42.84868,-87.92959,53154.0


In [10]:
df.shape

(93875584, 6)

In [82]:
df.to_csv('../../data/companyData/onlyParentsOfInterest.csv')

In [11]:
df.to_csv('../../data/companyData/allCos.csv')

In [20]:
df = dd.read_csv('../../data/companyData/allCos.csv',
                                dtype={'parent_number': 'object'}).drop(columns = 'Unnamed: 0')

Find all employees by parent number by year.

In [14]:
totalParents = df[['archive_version_year','parent_number','employee_size_location']].\
    groupby(['archive_version_year','parent_number']).sum()


totalParents.to_csv('../../data/companyData/allCosSummed.csv')

In [18]:
'''del employeesByParents
del df'''
gc.collect()

97

In [13]:
employeesByParents = totalParents

In [19]:
employeesByParents = dd.read_csv('../../data/companyData/allCosSummed.csv',
                                dtype={'parent_number': 'object'}).\
                                rename(columns = {'employee_size_location': 'allEmployees'})
employeesByParents.head()

Unnamed: 0,archive_version_year,parent_number,allEmployees
0,1997.0,/,2.0
1,1997.0,000007609,0.0
2,1997.0,000015578,646.0
3,1997.0,000023077,93.0
4,1997.0,000076547,1064.0


In [21]:
toCalculate = df.merge(employeesByParents)

In [7]:
toCalculate.head()

Unnamed: 0,archive_version_year,parent_number,employee_size_location,latitude,longitude,zipcode,allEmployees
0,2003.0,212928204,250.0,39.95779,-91.36099,62301.0,3122.0
1,2003.0,212928204,64.0,41.41058,-81.77999,44130.0,3122.0
2,2003.0,212928204,150.0,30.88473,-87.79219,36507.0,3122.0
3,2003.0,212928204,25.0,43.48623,-83.94214,48604.0,3122.0
4,2003.0,212928204,425.0,39.84488,-75.18354,8086.0,3122.0


In [22]:
toCalculate['locationFracOfEmployees'] = toCalculate['employee_size_location']/toCalculate['allEmployees']

In [23]:
fractions = toCalculate[['archive_version_year','parent_number','latitude','longitude','zipcode','locationFracOfEmployees']].\
    groupby(['archive_version_year','parent_number','latitude','longitude','zipcode']).sum().compute(workers = 100)

In [27]:
fractions = fractions.reset_index()
fractions.head()

Unnamed: 0,archive_version_year,parent_number,latitude,longitude,zipcode,locationFracOfEmployees
0,2003.0,7609,26.7412,-80.06694,33407.0,0.0
1,2003.0,100537,33.86163,-79.75452,29560.0,0.002385
2,2003.0,100537,35.58317,-77.59319,27828.0,0.357782
3,2003.0,100537,35.61509,-77.39556,27834.0,0.001789
4,2003.0,100537,35.97062,-77.7912,27804.0,0.017889


In [28]:
np.sum(fractions.locationFracOfEmployees > 0.10)/fractions.shape[0]

0.02453451864208095

In [29]:
fractions = fractions[fractions.locationFracOfEmployees > 0.10]

In [30]:
fractions.shape

(1557414, 1)

In [31]:
fractions.to_csv('../../data/companyData/fractionEmployees_byEstablishment.csv')

In [2]:
fractions = dd.read_csv('../../data/companyData/fractionEmployees_byEstablishment.csv')

In [5]:
fractions.compute(workers = 100).shape

(42066221, 7)

In [3]:
fractions.archive_version_year.min().compute()

1997.0

In [29]:
fractions.shape

(42066221, 6)

# Measures of Concentration
try duranton overman for one company

In [2]:
df = dd.read_csv('../../data/companyData/onlyParentsOfInterest.csv',
                                dtype={'parent_number': 'object'}).drop(columns = 'Unnamed: 0')

employeesByParents = dd.read_csv('../../data/companyData/totalParents.csv',
                                dtype={'parent_number': 'object'}).\
                                rename(columns = {'employee_size_location': 'allEmployees'})


toCalculate = df.merge(employeesByParents)

In [3]:
industries = pd.read_csv("../../data/companyData/gvkeyIndustries.csv").drop(columns = ['Unnamed: 0']).\
    drop_duplicates()
linkingTable = pd.read_csv('../../data/companyData/sc_linkingTable.csv').drop(columns = ['Unnamed: 0']).\
    drop_duplicates().drop(columns = {'cstatCompanies','igCompanies'})

abiIndustries = industries.merge(linkingTable)
abiIndustries['abi'] = abiIndustries['abi'].astype('str').str.pad(width = 9, side = 'left', fillchar = '0')
abiIndustries.drop(columns = {'gvkey'}, inplace = True)

abiIndustries.head()

Unnamed: 0,indGroup,abi
0,transportUtilities,7501711
1,transportUtilities,4554051
2,manu,4352373
3,manu,4692679
4,manu,9318528


In [4]:
data = toCalculate[['archive_version_year','parent_number','latitude','longitude','employee_size_location']].\
    groupby(['archive_version_year','parent_number','latitude','longitude']).sum().compute(workers = 100).\
    reset_index().rename(columns = {'archive_version_year': 'year', 'parent_number': 'abi'})

data['latitude_radians']  = [radians(_) for _ in data.latitude]

data['longitude_radians'] = [radians(_) for _ in data.longitude]

data = data.merge(abiIndustries)

In [5]:
data.indGroup.unique()

array(['manu', 'wholesale', 'mining', 'services', 'finance',
       'construction', 'retail', 'transportUtilities', nan, 'agForFish'],
      dtype=object)

In [24]:
kernelDistance = 10

start = time.time()


subset     = data[(data.year == 2003.0) & \
                  (data.indGroup == 'construction') ].reset_index(drop = True)
print(time.time() - start)


employment = subset.employee_size_location.to_numpy()
print(time.time() - start)


points     = subset[['longitude_radians','latitude_radians']].to_numpy()
print(time.time() - start)


distMatrix = haversine_distances(points, points) * 6371000/1000
print(time.time() - start)


#### get the distance values only
indices    = np.triu_indices_from(distMatrix,k = 1)
distances  = np.asarray(distMatrix[indices])
print(time.time() - start)

kernel = stats.gaussian_kde(distances, bw_method = 'silverman')
print(time.time() - start)

f   = kernel.covariance_factor()
bw  = f * distances.std()
print(time.time() - start)



normd      = (distMatrix.diagonal(1) - kernelDistance)/bw.clip(0)

0.7021150588989258
0.7023732662200928
0.7029361724853516
2.5213959217071533
2.5718510150909424
2.635136127471924
2.6440091133117676


In [44]:
normd      = np.ravel((distMatrix - 10)/bw).clip(0)
# kernelDist = kernel.evaluate(normd.clip(0))
len(normd)

19439281

In [63]:
X

array([[4124.63458   ],
       [3545.58350545],
       [4434.72081969],
       [3883.85003992],
       [4693.398886  ]])

In [61]:
np.exp(kde.score_samples(X))

array([0.39894228, 0.39894228, 0.39894228, 0.39894228, 0.39894228])

In [66]:
start = time.time()

X = distances[0:500000].reshape(-1, 1) # np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
kde = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(X)

scores = kde.score_samples(X)




time.time() - start

KeyboardInterrupt: 

In [54]:

start = time.time()

with multiprocessing.Pool() as pool:
    kernelDist = pool.map(kernel.evaluate, normd[0:10000])
    pool.close()
    
print(time.time() - start)

# first = employment[1:]*employment[:-1]

# 1/(bw*np.dot(employment[1:],employment[:-1]))*np.dot(first,kernelDist)

3221.1379981040955


ValueError: shapes (4408,) and (10000,1) not aligned: 4408 (dim 0) != 10000 (dim 0)

In [41]:
employmentSum = 0
for i in range(0,len(employment) - 1):
    # print(i)
    
    employmentSum += np.dot(employment[i:(len(employment)-1)],employment[(i+1):])
    
employmentSum    
    

13352792198.0

In [29]:
start = time.time()
kernelDist = kernel.evaluate(normd[0:1000])
print(time.time() - start)


''''''

77.89801979064941


'first = employment[1:]*employment[:-1]\n\n1/(bw*np.dot(employment[1:],employment[:-1]))*np.dot(first,kernelDist)'

In [20]:
np.sum(data[(data.year == 2003.0)].employee_size_location == 2)


35314

Below code tries to do it for one company. The tricky bit here is that we need more than one company to estimate the kde, so for companies that have only one establishment in the dataset, we might have to take them out and manually assign them a value later on.

In [230]:
subset     = data[(data.archive_version_year == 2003.0) & \
                  (data.parent_number == '000100537') & \
                 (data.employee_size_location > 100)].reset_index(drop = True)
employment = subset.employee_size_location.to_numpy()

points     = subset[['longitude_radians','latitude_radians']].to_numpy()

# distMatrix = haversine_distances(points,points)
distMatrix = haversine_distances(points, points) * 6371000/1000

#### get the distance values only
indices    = np.triu_indices_from(distMatrix,k = 1)
distances  = np.asarray(distMatrix[indices])

In [232]:
# get the kde

kernel = stats.gaussian_kde(distances, bw_method = 'silverman')

f   = kernel.covariance_factor()
bw  = f * distances.std()
bw

ValueError: `dataset` input should have multiple elements.

In [188]:
normd      = np.ravel((distMatrix - 10)/bw).clip(0)
kernelDist = kernel.evaluate(normd.clip(0))

# put into a matrix and 
kernelDistMatrix = np.reshape(kernelDist,distMatrix.shape)

distArray   = kernelDistMatrix.diagonal(1)

distArray

1/(bw*np.dot(employment[1:],employment[:-1]))*np.dot(first,distArray)

array([0.00220754, 0.00212267, 0.00213167, 0.00218307, 0.00211793])

In [189]:
first = employment[1:]*employment[:-1]
first

array([ 2400.,  1800.,    90.,  2400., 76800.])

In [190]:
1/(bw*np.dot(employment[1:],employment[:-1]))*np.dot(first,distArray)

3.8638673662379256e-05

In [None]:
data = data.reset_index()
data.to_csv('../../data/companyData/totalEmployees_byEstablishment.csv')