In [48]:
import pickle as pkl
import pandas as pd
import matplotlib
import os
import re

import collections
import datetime
import time

import geopandas as gpd

import numpy as np

import rasterio

import spacy
  
nlp = spacy.load('en_core_web_lg')
from sklearn.metrics.pairwise import cosine_similarity

import itertools

# Get Data

## Changes from year to year

In [49]:
changes = pd.read_csv("../../data/companyData/compustatChanges_2010s.csv").drop(columns = ['Unnamed: 0'])

changes.head()

Unnamed: 0,year,qtr,gvkey,companyName,tic,naics,curcdq,incomeChange,revenueChange,revenueChangeAbsolute,costChange,inventoryChange
0,2010,1.0,1004,AAR CORP,AIR,423860.0,USD,0.213983,0.200565,81.107,0.178258,0.055631
1,2010,2.0,1004,AAR CORP,AIR,423860.0,USD,0.045617,0.059577,27.099,0.017636,0.086776
2,2010,3.0,1004,AAR CORP,AIR,423860.0,USD,0.153198,0.166276,76.16,0.129456,0.16927
3,2010,4.0,1004,AAR CORP,AIR,423860.0,USD,-0.398739,0.174283,85.02,0.136013,0.182304
4,2011,1.0,1004,AAR CORP,AIR,423860.0,USD,0.096386,0.133883,65.0,0.101523,0.10415


In [50]:
industries = changes[['gvkey','naics']].drop_duplicates()
industries['naics'] = industries.naics.astype('str').str.slice(0,2)

## SC Linking Table for 2010s

In [51]:
c_links = pd.read_csv("../../data/companyData/compustatSCLinked.csv")

c_links['year'] = c_links.srcdate.astype('str').str.slice(0,4).astype('int64')

c_links = c_links[c_links.year > 2009][['year','gvkey','cgvkey','salecs']].\
    rename(columns = {'cgvkey': 'customer_gvkey','gvkey': 'supplier_gvkey'})


print(c_links.shape)

c_links.head()

(34473, 4)


Unnamed: 0,year,supplier_gvkey,customer_gvkey,salecs
80,2010,1013,9899,300.0
81,2010,1013,2136,146.0
281,2016,1094,31673,78.193
282,2017,1094,31673,76.598
283,2017,1094,7171,70.215


In [52]:
industries.columns = ['customer_gvkey','customer_naics']

c_links = c_links.merge(industries)

industries.columns = ['supplier_gvkey','supplier_naics']

c_links = c_links.merge(industries)
c_links.head()

Unnamed: 0,year,supplier_gvkey,customer_gvkey,salecs,customer_naics,supplier_naics
0,2010,2497,9899,461.6,51,23
1,2011,2497,9899,692.065,51,23
2,2012,2497,9899,670.822,51,23
3,2013,2497,9899,778.462,51,23
4,2014,2497,9899,968.479,51,23


In [53]:
c_links.shape

(31778, 6)

Now see if it's common to have one in and one out of the industries of interest.

In [43]:
ofInterest = ['11','21','22','23','31','32','33','42','44','45','48','49']

c_linksCut = c_links[~(c_links.customer_naics.isin(ofInterest) & c_links.supplier_naics.isin(ofInterest))]

In [45]:
c_linksCut['relat'] = c_linksCut.customer_naics + "_" + c_linksCut.supplier_naics
c_linksCut.relat.value_counts()[0:10]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


44_53    2094
45_53    1175
51_51    1119
51_53     870
52_53     658
51_33     541
32_53     513
33_53     498
33_51     465
62_53     285
Name: relat, dtype: int64

In [47]:
c_linksCut.supplier_naics.value_counts()[0:10]

53    7204
51    2651
33     969
54     892
56     556
52     459
32     188
61     156
62     123
23     112
Name: supplier_naics, dtype: int64

## Compustat and ABI Linking

In [54]:
gvKey_abiLinkingTable = pd.read_csv('../../data/companyData/linkingTable.csv').drop(columns = ['Unnamed: 0'])


base_columns = gvKey_abiLinkingTable.columns 
customer_columns = "customer_" + base_columns
supplier_columns = "supplier_" + base_columns


gvKey_abiLinkingTable.head()

Unnamed: 0,cstatCompanies,igCompanies,delete,gvkey,abi
0,american software,american software,,1562,4378204
1,apco oil and gas,apco oil gas,,1682,544813678
2,constellation energy grp,constellation energy,,1995,506384064
3,central natural res,central natural resources,,2852,312712631
4,cracker barrel old ctry stor,cracker barrel olduntry str,,3570,852053057


In [55]:
hasMatch = gvKey_abiLinkingTable.gvkey.unique()

In [56]:
sum(c_links.supplier_gvkey.isin(hasMatch) | c_links.customer_gvkey.isin(hasMatch))

29402

# Merge

In [57]:

#########################
# merge in customer information
gvKey_abiLinkingTable.columns = customer_columns

print(c_links.shape)
c_linksMerge1 = c_links.merge(gvKey_abiLinkingTable, on ='customer_gvkey')
print(c_links.shape,c_linksMerge1.shape)



#########################
# and merge in supplier 
gvKey_abiLinkingTable.columns = supplier_columns

print(c_links.shape)
c_linksMerge2 = c_linksMerge1.merge(gvKey_abiLinkingTable, on ='supplier_gvkey')
print(c_links.shape,c_linksMerge2.shape)

(31778, 6)
(31778, 6) (24214, 10)
(31778, 6)
(31778, 6) (20496, 14)


In [58]:
c_linksMerge2.to_csv("../../data/companyData/clinks_IG_selected.csv")

In [59]:
gvKey_abiLinkingTable

Unnamed: 0,supplier_cstatCompanies,supplier_igCompanies,supplier_delete,supplier_gvkey,supplier_abi
0,american software,american software,,1562,4378204
1,apco oil and gas,apco oil gas,,1682,544813678
2,constellation energy grp,constellation energy,,1995,506384064
3,central natural res,central natural resources,,2852,312712631
4,cracker barrel old ctry stor,cracker barrel olduntry str,,3570,852053057
...,...,...,...,...,...
2654,cdti advanced materials,cdti advanced materials,,282553,967328568
2655,futurefuel,futurefuel,,287462,679546432
2656,lyondellbasell industries nv,lyondellbasell industries nv,,294524,200051589
2657,doriang,doriang,,317264,435494175


This is probably because: (1) companies are not in North America, or (2) companies are not in the physical goods industries we're interested in. We can verify this though: look at c_links where both the customer and supplier are in the dataset of interest.

In [63]:
chq     = pd.read_csv("../../data/chq.csv",dtype={'cstatZipcode': 'object'}).drop(columns = {'Unnamed: 0'})

c_linkTest = c_links[c_links.customer_gvkey.isin(chq.gvkey.unique()) & \
                     c_links.supplier_gvkey.isin(chq.gvkey.unique())]

print("Percent of firms with a match: ", c_linksMerge2.shape[0]/c_linkTest.shape[0])

Percent of firms with a match:  0.8772095013909694


It's entirely possible that we have too small of a sample from the 2010s alone. Let's just try it though and see how it goes.

First, make a sample with the companies on three years of either side of when it reports another customer.

In [64]:
scTableCustomers = c_linksMerge2.copy()[['year','customer_gvkey','customer_abi']].drop_duplicates()
scTableSuppliers = c_linksMerge2.copy()[['year','supplier_gvkey','supplier_abi']].drop_duplicates()

In [70]:
scTableCustomers.head()

Unnamed: 0,year,customer_gvkey,customer_abi
0,2010,9899,460637358
1,2011,9899,460637358
2,2012,9899,460637358
3,2013,9899,460637358
4,2014,9899,460637358


In [71]:
def makeThreeEitherSide(df): 
    yrPlus1 = df.copy(); yrPlus1['year'] += 1
    yrPlus2 = df.copy(); yrPlus2['year'] += 1
    yrPlus3 = df.copy(); yrPlus3['year'] += 1
    
    yrMinus1 = df.copy(); yrMinus1['year'] -= 1
    yrMinus2 = df.copy(); yrMinus2['year'] -= 1
    yrMinus3 = df.copy(); yrMinus3['year'] -= 1
    
    all = pd.concat([yrPlus1,yrPlus2,yrPlus3,yrMinus1,yrMinus2,yrMinus3])
    
    return(all)

In [72]:
allCustomerData = makeThreeEitherSide(scTableCustomers)
allCustomerData.columns = ['year','gvkey','abi']


allSupplierData = makeThreeEitherSide(scTableSuppliers)
allSupplierData.columns = ['year','gvkey','abi']

In [73]:
allSupplierData.year

0        2011
1        2012
2        2013
3        2014
4        2015
         ... 
20491    2014
20492    2015
20493    2016
20494    2017
20495    2018
Name: year, Length: 51912, dtype: int64

In [74]:
allAbi = allCustomerData.abi.append(allSupplierData.abi).drop_duplicates()

In [75]:
hqsOnly = pd.read_csv("../../data/ig2010s_uniqueHQs.csv").drop(columns = {'Unnamed: 0'})

In [76]:
hqsOnly.head()

Unnamed: 0,abi,company
0,7609,SOLITRON DEVICES INC
1,21311,WESTERN STATES ENVELOPE & LBL
2,29603,THIELE KAOLIN CO
3,71340,TRI STAFF GROUP
4,77743,NATIONAL TECHNICAL SYSTEMS INC


In [77]:
hq = pd.read_csv("../../data/ig2010s_uniqueHQs_multLocations.csv").\
    drop(columns = {'Unnamed: 0'}).\
    rename(columns = {'archive_version_year': 'year'})

hq['year'] = hq.year.astype('int64')

hqRelevant = hq[hq.abi.isin(allAbi)]

In [78]:
hqRelevant[hqRelevant.abi == 71340]

Unnamed: 0,abi,ticker,company,year,state,city,address_line_1,zipcode,latitude,longitude


In [79]:
allSupplierData = allSupplierData.merge(hqRelevant).drop_duplicates()
allCustomerData = allCustomerData.merge(hqRelevant).drop_duplicates()

In [81]:
allSupplierData.head()

Unnamed: 0,year,gvkey,abi,ticker,company,state,city,address_line_1,zipcode,latitude,longitude
0,2011,2497,482985413,MTZ,MAS TEC INC,FL,CORAL GABLES,800 S DOUGLAS RD # 12,33134,25.7639,-80.25634
6,2012,2497,482985413,MTZ,MAS TEC INC,FL,CORAL GABLES,800 S DOUGLAS RD # 12,33134,25.7639,-80.25634
12,2013,2497,482985413,MTZ,MAS TEC INC,FL,CORAL GABLES,800 S DOUGLAS RD # 1200,33134,25.76375,-80.25635
18,2014,2497,482985413,MTZ,MAS TEC INC,FL,CORAL GABLES,800 S DOUGLAS RD # 1200,33134,25.7639,-80.25634
24,2015,2497,482985413,MTZ,MAS TEC INC,FL,CORAL GABLES,800 S DOUGLAS RD # 1200,33134,25.76375,-80.25635


In [326]:
allCustomerData.to_csv("../../data/companyData/allCustomerData.csv")
allSupplierData.to_csv("../../data/companyData/allSupplierData.csv")

In [80]:
allSupplierData.shape

(9592, 11)

In [335]:
hqsOnly = pd.read_csv("../../data/ig2010s_uniqueHQs.csv")