In [2]:
import pickle as pkl
import pandas as pd
import matplotlib
import os
import re

import collections
import datetime
import time

import geopandas as gpd

import numpy as np

import rasterio

import spacy

import gc

import geopy.distance

nlp = spacy.load('en_core_web_lg')
from sklearn.metrics.pairwise import cosine_similarity

import itertools

# Compustat and ABI Linking

In [3]:
# Load the gvkey <-> abi linking table. 'Unnamed: 0' is dropped right away
# (presumably the row index saved by an earlier to_csv -- TODO confirm).
gvKey_abiLinkingTable = (
    pd.read_csv('../../data/companyData/linkingTable.csv')
    .drop(columns = ['Unnamed: 0'])
)

# Column labels of the linking table, plus customer_/supplier_ prefixed copies.
base_columns = gvKey_abiLinkingTable.columns
customer_columns, supplier_columns = (
    prefix + base_columns for prefix in ("customer_", "supplier_")
)

# Unique gvkeys that appear in the linking table.
hasMatch = gvKey_abiLinkingTable['gvkey'].unique()

gvKey_abiLinkingTable.head()


Unnamed: 0,cstatCompanies,igCompanies,delete,gvkey,abi
0,asa gold and precious metals,asa gold precious metals,,1062,402180222
1,adams diversified equity fd,adams diversified equity fund,,1119,397759739
2,allen organ,allen organ,,1283,400700704
3,american physicians svc gp,american physicians svc,,1539,218548014
4,american science engineering,american science engineering,,1554,441435880


In [4]:
# Unique abi identifiers from the linking table, as strings.
lines = gvKey_abiLinkingTable['abi'].astype('str').unique()

print(lines)

# Write one abi per line.
with open('../../data/companyData/igCompanies.txt', 'w') as abi_file:
    abi_file.writelines(abi + '\n' for abi in lines)

['402180222' '397759739' '400700704' ... '739118540' '227688843'
 '488766353']


---------------------------------

# Get all change data together
Get the linking table and merge the abi labels into the change df. 

Then, merge the location data into the change data and get as complete a record of companies as possible given the HQ data.

In [5]:
# Re-load the linking table so this section can run on its own.
gvKey_abiLinkingTable = (
    pd.read_csv('../../data/companyData/linkingTable.csv')
    .drop(columns = ['Unnamed: 0'])
)

# Quarterly Compustat changes with controls.
changes = (
    pd.read_csv("../../data/companyData/compustatChanges_withControls.csv")
    .drop(columns = ['Unnamed: 0'])
)
print(changes.shape, changes.head())

# Inner-join on gvkey to attach the abi identifier, then drop the state/city
# columns (hq columns with the same names are merged in later).
changesABI = pd.merge(changes, gvKey_abiLinkingTable, on = 'gvkey')
changesABI = changesABI.drop(columns = ['state', 'city'])
print(changesABI.shape, changesABI.head())

(649614, 59)    gvkey  datadate  year  qtr              companyName curcdq      assets  \
0   8515  19990930  1999    3                  PHI INC    USD         NaN   
1  12405  19990930  1999    3               AVESIS INC    USD         NaN   
2  24474  19990930  1999    3  TECHNOLOGY SOLUTIONS CO    USD  362.960666   
3  26830  19990930  1999    3   NAMIBIAN MINERALS CORP    USD  111.299129   
4  30448  19990331  1970    1   AMERICAN HOMESTAR CORP    USD         NaN   

   cash  costGoodsSold  totalInv  ...  sic2            indGroup  earliestYear  \
0   NaN            NaN       NaN  ...    45  transportUtilities          1981   
1   NaN       2.470251       NaN  ...    64             finance          1984   
2   NaN      36.229854  0.000000  ...    73            services          1990   
3   NaN       7.938061  4.269385  ...    14              mining          1994   
4   NaN            NaN       NaN  ...    24                manu          1993   

   ageTercile sizeTercile  profitTerc

Now merge in the hq information.

In [6]:
# Province codes used to screen out Canadian-headquartered rows.
canadian = ['ON', 'AB','QC', 'BC', 'NS', 'NF', 'SK', 'MB', 'NB']

# Keep rows whose state is present and not a listed Canadian province.
is_us_located = changes.state.notna() & ~changes.state.isin(canadian)

# Build the filtered frame and the truncated zip in one non-mutating step.
# Assigning a column on a boolean-filtered slice (the previous form) raises
# SettingWithCopyWarning; .loc + .assign returns a fresh frame instead.
# The zip is cast to str and cut to its first five characters, so missing
# values become the literal string 'nan'.
changes = changes.loc[is_us_located].assign(
    addzip = lambda frame: frame.addzip.astype('str').str.slice(0,5)
)

changes.state.unique()

array(['LA', 'AZ', 'IL', 'TX', 'IA', 'GA', 'CA', 'PA', 'MO', 'SC', 'NC',
       'UT', 'NV', 'MA', 'MN', 'OH', 'CO', 'FL', 'NJ', 'NY', 'WI', 'IN',
       'VA', 'MD', 'CT', 'MI', 'KS', 'TN', 'DE', 'WA', 'OR', 'HI', 'KY',
       'RI', 'ME', 'NM', 'VT', 'AL', 'AR', 'ID', 'WY', 'OK', 'PR', 'NE',
       'SD', 'DC', 'WV', 'MS', 'ND', 'MT', 'NH', 'AK', 'GU', 'VI'],
      dtype=object)

In [7]:
# Preview the first rows of the gvkey/abi-merged frame.
changesABI.head()

Unnamed: 0,gvkey,datadate,year,qtr,companyName,curcdq,assets,cash,costGoodsSold,totalInv,...,sizeTercile,profitTercile,datacqtr,datafqtr,fyr,DATE,cstatCompanies,igCompanies,delete,abi
0,8515,19990930,1999,3,PHI INC,USD,,,,,...,1.0,1.0,1999Q3,1999Q3,12,1970-01-01 00:00:00.019990930,phi,phi,,4103925
1,8515,19991231,1999,4,PHI INC,USD,332.308866,,,55.535281,...,1.0,2.0,1999Q4,1999Q4,12,1970-01-01 00:00:00.019991231,phi,phi,,4103925
2,8515,20000331,2000,1,PHI INC,USD,330.571463,,68.550922,56.906485,...,1.0,1.0,2000Q1,2000Q1,12,1970-01-01 00:00:00.020000331,phi,phi,,4103925
3,8515,20000630,2000,2,PHI INC,USD,314.349477,,72.109204,57.186407,...,1.0,1.0,2000Q2,2000Q2,12,1970-01-01 00:00:00.020000630,phi,phi,,4103925
4,8515,20000930,2000,3,PHI INC,USD,315.826957,,76.488119,60.317248,...,1.0,1.0,2000Q3,2000Q3,12,1970-01-01 00:00:00.020000930,phi,phi,,4103925


In [8]:
# Headquarters location records, keyed by abi and year
# (archive_version_year is renamed to year and cast to int64 for the join).
hq = (
    pd.read_csv("../../data/ig_uniqueHQs_multLocations.csv")
    .drop(columns = ['Unnamed: 0'])
    .rename(columns = {'archive_version_year': 'year'})
    .astype({'year': 'int64'})
)

# Inner-join the hq records onto the Compustat/abi frame.
# The keys are spelled out: with no `on`, merge joins on every shared column
# name, so an extra column in either file would silently change the join.
# abi and year are the only columns the two frames share here.
igChanges = changesABI.merge(hq, on = ['abi', 'year'])
print(igChanges.shape, igChanges.head())


hq.head()

(271847, 69)    gvkey  datadate  year  qtr companyName curcdq      assets  cash  \
0   8515  19990930  1999    3     PHI INC    USD         NaN   NaN   
1   8515  19991231  1999    4     PHI INC    USD  332.308866   NaN   
2   8515  20000331  2000    1     PHI INC    USD  330.571463   NaN   
3   8515  20000630  2000    2     PHI INC    USD  314.349477   NaN   
4   8515  20000930  2000    3     PHI INC    USD  315.826957   NaN   

   costGoodsSold   totalInv  ...  delete      abi  ticker  \
0            NaN        NaN  ...     NaN  4103925    PHII   
1            NaN  55.535281  ...     NaN  4103925    PHII   
2      68.550922  56.906485  ...     NaN  4103925    PHII   
3      72.109204  57.186407  ...     NaN  4103925    PHII   
4      76.488119  60.317248  ...     NaN  4103925    PHII   

                     company state      city address_line_1 zipcode  latitude  \
0  PETROLEUM HELICOPTERS INC    LA  METAIRIE     PO BOX 578   70004  29.97589   
1  PETROLEUM HELICOPTERS INC    LA  M

Unnamed: 0,abi,ticker,company,year,state,city,address_line_1,zipcode,latitude,longitude
0,7609,SODI,SOLITRON DEVICES INC,1998,FL,WEST PALM BEACH,3301 ELECTRONICS WAY # C,33407,26.7412,-80.06694
1,15578,,BRIDGEPORT MACHINES INC,1998,CT,BRIDGEPORT,500 LINDLEY ST,6606,41.19809,-73.19549
2,23077,,JENNY LEE BAKERY,1998,PA,MC KEES ROCKS,620 ISLAND AVE,15136,40.47235,-80.06152
3,76547,,MASTER PROTECTION CORP,1998,CA,SANTA MONICA,520 BROADWAY # 650,90401,34.01618,-118.49206
4,77743,,NATIONAL TECHNICAL SYSTEMS INC,1998,CA,CALABASAS,24007 VENTURA BLVD # 200,91302,34.15562,-118.65163


In [9]:
# Earliest year that survived the hq merge.
igChanges['year'].min()

1999

In [10]:
# Persist the hq-merged frame. The index is written too, which is why the
# later read of this file drops 'Unnamed: 0'.
igChanges.to_csv(path_or_buf = "../../data/companyData/igData.csv")

In [11]:
# List the columns of the hq-merged frame.
igChanges.columns

Index(['gvkey', 'datadate', 'year', 'qtr', 'companyName', 'curcdq', 'assets',
       'cash', 'costGoodsSold', 'totalInv', 'netIncome', 'opInc_afDep',
       'opInc_befDep', 'totalRevenue', 'costat', 'priceClose', 'add1',
       'addzip', 'earningsPerShare', 'sales', 'otherCosts', 'shares',
       'assetsLast', 'netIncomeLast', 'totalRevenueLast', 'costGoodsSoldLast',
       'totalInvLast', 'opInc_afDepLast', 'opInc_befDepLast', 'priceCloseLast',
       'cashLast', 'earningsPerShareLast', 'sharesLast', 'salesLast',
       'otherCostsLast', 'incomeChange', 'revenueChange', 'costChange',
       'inventoryChange', 'opInc_afDepChange', 'opInc_befDepChange',
       'priceCloseChange', 'assetsPrev', 'fyearq', 'assetsLagged',
       'netIncomeLagged', 'roa_lagged', 'sic2', 'indGroup', 'earliestYear',
       'ageTercile', 'sizeTercile', 'profitTercile', 'datacqtr', 'datafqtr',
       'fyr', 'DATE', 'cstatCompanies', 'igCompanies', 'delete', 'abi',
       'ticker', 'company', 'state', 'city', 'a

In [12]:
# (rows, columns) of the hq-merged frame.
igChanges.shape

(271847, 69)

At this point, we have zip information in the following forms (from most to least examples):
    - changes: all compustat companies, from the compustat address system
    - igChanges: subset of compustat companies, from the ig merge
    - subset of compustat companies that have SC information and survived the ig merge
    
We could potentially also look at the subset of compustat companies for which we have SC information, using the compustat address system.

For now: follow similar trajectory as before but add in weather data for all cstat companies and all ig-merged companies.

First: pull all zips that are mentioned in changes and igChanges and use this to get the weather data.



In [13]:
# Keep only rows with a usable zip. addzip was cast to str earlier, so a
# missing value can appear as the literal string 'nan' as well as a real NaN.
changes = changes[(~changes.addzip.isna()) & (changes.addzip != 'nan')]

# All zips mentioned in either the full Compustat frame or the ig-merged frame.
# Series.append was removed in pandas 2.0; pd.concat stacks the two columns
# the same way.
relevantZips = pd.concat([changes.addzip.astype('int64'), igChanges.zipcode]).unique()

# Rename/drop without inplace=True: mutating a filtered slice in place raises
# SettingWithCopyWarning, while rebinding the name gives the same frame.
changes = changes.rename(columns = {'addzip': 'zipcode'}).\
    drop(columns = ['datadate','costat', 'add1', 'city',  'state'])

In [14]:
# Number of distinct zip codes collected above.
len(relevantZips)

5048

In [15]:
# Save the zip list as a pickle.
outfile = '../../data/companyData/relevantZips.pkl'
with open(outfile, mode = 'wb') as zip_handle:
    zip_handle.write(pkl.dumps(relevantZips))

------------------------------------------------

## create the original weather with lags dataset
At this point, the `g` dataset is the fraction by establishment, for all suppliers and customers in the dataset.

In [16]:
def _read_lagged_weather(csv_path):
    """Read a lagged-weather CSV, drop the 'Unnamed: 0' and yearQtr columns,
    and store year, qtr and zipcode as categoricals."""
    key_dtypes = {'year': 'category', 'qtr': 'category', 'zipcode': 'category'}
    return pd.read_csv(csv_path).drop(columns = {"Unnamed: 0", 'yearQtr'}).astype(key_dtypes)


# Weather-by-establishment frame.
g = pd.read_csv("../../data/companyData/weatherByEstablishment.csv")
g = g.drop(columns = {"Unnamed: 0"})

allWeather_withLags = _read_lagged_weather("../../data/companyData/allWeather_withLags_allZips.csv")

# Quarterly stats by zip: ZIP becomes zipcode, and qtr is the second character
# of `quarter` parsed as a float (presumably a 'Q1'-style label -- TODO confirm).
averages = (
    pd.read_csv("../../data/companyData/quarterlyStatsByZip.csv")
    .drop(columns = {"Unnamed: 0"})
    .rename(columns = {'ZIP': 'zipcode'})
    .assign(qtr = lambda frame: frame.quarter.str.slice(1,2).astype('float'))
    .drop(columns = {'quarter'})
    .astype({'qtr': 'category', 'zipcode': 'category'})
)

allWeather_withLags2 = _read_lagged_weather("../../data/companyData/allWeather_withLags_new.csv")

# Disabled thunderstorm load, kept as a bare string literal. It is the last
# expression in the cell, so its repr is what the cell displays.
'''thunderstorms_withLags = pd.read_csv("../../data/companyData/thunderstorms_withLags.csv").\
    drop(columns = {"Unnamed: 0", 'yearQtr'}).astype({'year':       'category',
                           'qtr':        'category',
                           'zipcode':    'category'})'''

'thunderstorms_withLags = pd.read_csv("../../data/companyData/thunderstorms_withLags.csv").    drop(columns = {"Unnamed: 0", \'yearQtr\'}).astype({\'year\':       \'category\',\n                           \'qtr\':        \'category\',\n                           \'zipcode\':    \'category\'})'

Create direct effects database. Merge weather to full cstat database.

Merge weather to the ig-cstat database.

In [17]:
# Re-load the hq-merged frame saved earlier.
igChanges = (
    pd.read_csv("../../data/companyData/igData.csv")
    .drop(columns = ['Unnamed: 0'])
)

# Disabled step (not executed): merge per-zip fractions into igChanges.
# fractions_byZip = pd.read_csv("../../data/companyData/fractions_byZip.csv").drop(columns = {'Unnamed: 0'})
# fractions_byZip = fractions_byZip[fractions_byZip.gvkey.isin(list(igChanges.gvkey.unique())) & \
#                                   fractions_byZip.zipcode.isin(list(igChanges.zipcode.unique())) ]
#
# fractions_byZip = fractions_byZip.groupby(['year','zipcode','gvkey']).sum().reset_index()
#
# print(igChanges.shape)
#
# igChanges = igChanges.merge(fractions_byZip)

print(igChanges.shape)

(271847, 69)


In [18]:
# Earliest year in the re-loaded frame, before any weather merge.
igChanges['year'].min()

1999

In [19]:
# Attach weather step by step. With no `on`, each merge joins on whatever
# column names the two frames share. The first three are inner joins (the
# merge default), so rows without a weather match are dropped.
igChangesWithWeather = igChanges.merge(allWeather_withLags)
igChangesWithWeather = igChangesWithWeather.merge(allWeather_withLags2)
igChangesWithWeather = igChangesWithWeather.merge(averages)
# The establishment-level frame is left-joined, so unmatched rows are kept.
igChangesWithWeather = igChangesWithWeather.merge(g, how = 'left')
igChangesWithWeather.shape

(239599, 435)

In [20]:
# Earliest year left after the weather merges.
igChangesWithWeather['year'].min()

2001

In [21]:
# Preview the first rows of the weather-merged frame.
igChangesWithWeather.head()

Unnamed: 0,gvkey,datadate,year,qtr,companyName,curcdq,assets,cash,costGoodsSold,totalInv,...,empWt_lag2_temp_zipQuarter95,empWt_lag3_temp_zipQuarter95,empWt_precip_zipQuarter95,empWt_lag1_precip_zipQuarter95,empWt_lag2_precip_zipQuarter95,empWt_lag3_precip_zipQuarter95,empWt_days90Plus,empWt_lag1_days90Plus,empWt_lag2_days90Plus,empWt_lag3_days90Plus
0,8515,20030331,2003,1,PHI INC,USD,524.003551,,67.936692,52.543325,...,0.0,0.0,0.0,0.27439,0.27439,0.0,0.0,7.981707,73.262195,51.341463
1,25874,20030331,2003,1,PETROQUEST ENERGY INC,USD,198.075923,,4.340718,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,5.0,62.0,29.0
2,28564,20030331,2003,1,STONE ENERGY CORP,USD,1840.969152,,29.874514,0.0,...,0.25,0.25,0.25,0.75,0.178571,0.0,0.0,0.714286,51.821429,17.642857
3,8515,20040331,2004,1,PHI INC,USD,520.557973,,68.98805,53.825001,...,0.0,0.0,0.0,0.0,0.251572,0.0,7.484277,17.962264,69.113208,40.415094
4,25874,20040331,2004,1,PETROQUEST ENERGY INC,USD,243.365831,,4.63376,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,45.0,38.0


In [22]:
# Print every column name on its own line (the default repr truncates the list).
for column_name in igChangesWithWeather.columns.tolist():
    print(column_name)

gvkey
datadate
year
qtr
companyName
curcdq
assets
cash
costGoodsSold
totalInv
netIncome
opInc_afDep
opInc_befDep
totalRevenue
costat
priceClose
add1
addzip
earningsPerShare
sales
otherCosts
shares
assetsLast
netIncomeLast
totalRevenueLast
costGoodsSoldLast
totalInvLast
opInc_afDepLast
opInc_befDepLast
priceCloseLast
cashLast
earningsPerShareLast
sharesLast
salesLast
otherCostsLast
incomeChange
revenueChange
costChange
inventoryChange
opInc_afDepChange
opInc_befDepChange
priceCloseChange
assetsPrev
fyearq
assetsLagged
netIncomeLagged
roa_lagged
sic2
indGroup
earliestYear
ageTercile
sizeTercile
profitTercile
datacqtr
datafqtr
fyr
DATE
cstatCompanies
igCompanies
delete
abi
ticker
company
state
city
address_line_1
zipcode
latitude
longitude
precip_annual_50
precip_annual_95
precip_annual_99
precip_zip_50
precip_zip_95
precip_zip_99
precip_zipQuarter_50
precip_zipQuarter_95
precip_zipQuarter_99
temp_annual_50
temp_annual_95
temp_annual_99
temp_zip_50
temp_zip_95
temp_zip_99
temp_zipQuarter_

In [23]:
# Preview the weather-by-establishment frame.
g.head()

Unnamed: 0,gvkey,year,qtr,empMx_precip_annual_50,empMx_precip_annual_95,empMx_precip_annual_99,empMx_precip_zip_50,empMx_precip_zip_95,empMx_precip_zip_99,empMx_precip_zipQuarter_50,...,empWt_lag2_temp_zipQuarter95,empWt_lag3_temp_zipQuarter95,empWt_precip_zipQuarter95,empWt_lag1_precip_zipQuarter95,empWt_lag2_precip_zipQuarter95,empWt_lag3_precip_zipQuarter95,empWt_days90Plus,empWt_lag1_days90Plus,empWt_lag2_days90Plus,empWt_lag3_days90Plus
0,3937,2001,1,13.0,2.0,1.0,30.0,4.0,1.0,30.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.853659,9.512195
1,3937,2001,2,15.0,4.0,1.0,34.0,5.0,2.0,34.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.658537,0.0,0.0,5.853659
2,3937,2001,3,22.0,5.0,0.0,37.0,10.0,1.0,37.0,...,0.0,0.0,0.0,0.0,0.0,0.0,6.95122,3.658537,0.0,0.0
3,3937,2001,4,4.0,0.0,0.0,12.0,1.0,0.0,12.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.95122,3.658537,0.0
4,3937,2002,1,19.0,7.0,0.0,26.0,10.0,2.0,26.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.95122,3.658537


In [24]:
# (rows, columns) of the weather-merged frame.
igChangesWithWeather.shape

(239599, 435)

In [25]:
# Persist the weather-merged frame. The index is written too, which is why
# the later read of this file drops 'Unnamed: 0'.
igChangesWithWeather.to_csv(path_or_buf = "../../data/companyData/igWithWeather.csv")

In [63]:
# Distinct industry-group labels in the weather-merged frame.
igChangesWithWeather['indGroup'].unique()

array(['transportUtilities', 'mining', 'services', 'manu', 'finance',
       'retail', 'construction', 'wholesale', nan, 'agForFish'],
      dtype=object)

# Supplier Focus
Subset to focus on firms who are listed as suppliers, in the years of focus.

In [53]:
# Re-load the weather-merged frame from disk.
igChangesWithWeather = (
    pd.read_csv("../../data/companyData/igWithWeather.csv")
    .drop(columns = ['Unnamed: 0'])
)

In [54]:
# Supplier list; supplier_gvkey is renamed to gvkey to match the firm frame.
suppliersOnly = (
    pd.read_csv("../../data/companyData/suppliers.csv")
    .drop(columns = ['Unnamed: 0'])
    .rename(columns = {'supplier_gvkey': 'gvkey'})
)

In [55]:
# Columns of the re-loaded weather-merged frame.
igChangesWithWeather.columns

Index(['gvkey', 'datadate', 'year', 'qtr', 'companyName', 'curcdq', 'assets',
       'cash', 'costGoodsSold', 'totalInv',
       ...
       'empWt_lag4_precip_zipQuarter_99', 'empWt_lag4_temp_annual_50',
       'empWt_lag4_temp_annual_95', 'empWt_lag4_temp_annual_99',
       'empWt_lag4_temp_zip_50', 'empWt_lag4_temp_zip_95',
       'empWt_lag4_temp_zip_99', 'empWt_lag4_temp_zipQuarter_50',
       'empWt_lag4_temp_zipQuarter_95', 'empWt_lag4_temp_zipQuarter_99'],
      dtype='object', length=304)

In [56]:
# Columns of the supplier list.
suppliersOnly.columns

Index(['year', 'gvkey'], dtype='object')

In [57]:
# Inner-join the supplier list onto the weather-merged frame.
# The keys are spelled out: with no `on`, merge joins on every shared column
# name, so an extra column in suppliers.csv would silently change the join.
# year and gvkey are the only columns suppliersOnly has.
# NOTE(review): if suppliers.csv repeats a (year, gvkey) pair, this join
# duplicates the matching firm-quarters -- confirm the file is de-duplicated.
allSupplierData = suppliersOnly.merge(igChangesWithWeather, on = ['year', 'gvkey'])

In [58]:
# (rows, columns) of the supplier-only frame.
allSupplierData.shape

(66089, 304)

In [59]:
# Preview the first rows of the supplier-only frame.
allSupplierData.head()

Unnamed: 0,year,gvkey,datadate,qtr,companyName,curcdq,assets,cash,costGoodsSold,totalInv,...,empWt_lag4_precip_zipQuarter_99,empWt_lag4_temp_annual_50,empWt_lag4_temp_annual_95,empWt_lag4_temp_annual_99,empWt_lag4_temp_zip_50,empWt_lag4_temp_zip_95,empWt_lag4_temp_zip_99,empWt_lag4_temp_zipQuarter_50,empWt_lag4_temp_zipQuarter_95,empWt_lag4_temp_zipQuarter_99
0,2009,1013.0,20091231,4,ADC TELECOMMUNICATIONS INC,USD,1600.631743,669.076288,189.162302,148.430711,...,0.1221,1.251933,0.0,0.0,17.621897,0.0,0.0,33.45177,4.816036,0.0
1,2010,1013.0,20100331,1,ADC TELECOMMUNICATIONS INC,USD,1603.590912,531.947264,188.959304,146.398754,...,0.0,0.0,0.0,0.0,4.65423,0.0,0.0,26.566594,4.232104,0.521041
2,2010,1013.0,20100630,2,ADC TELECOMMUNICATIONS INC,USD,1681.842764,533.910791,209.428053,138.232024,...,0.16269,13.250325,0.0,0.0,44.34577,3.288937,1.563124,30.442516,3.147939,0.738395
3,2010,1013.0,20100930,3,ADC TELECOMMUNICATIONS INC,USD,1742.356995,612.217809,218.134351,125.728575,...,1.042082,35.792191,0.16269,0.0,61.427766,2.365293,0.16269,23.215184,0.16269,0.0
4,2004,1050.0,20040331,1,CECO ENVIRONMENTAL CORP,USD,53.955953,,15.469965,6.11378,...,0.0,0.0,0.0,0.0,8.500669,0.0,0.0,24.016064,7.543507,0.0


In [60]:
# Persist the supplier-only frame (the index is written as well).
allSupplierData.to_csv(path_or_buf = "../../data/companyData/allSupplierData.csv")