In [2]:
import pickle as pkl
import pandas as pd
import matplotlib
import os
import re

import collections
import datetime
import time

import geopandas as gpd

import numpy as np

import rasterio

import spacy

import gc

import geopy.distance

# Load the spaCy 'en_core_web_lg' pipeline once, at import time.
# NOTE(review): `nlp` is not referenced by any cell visible in this section — confirm it is used further down.
nlp = spacy.load('en_core_web_lg')
from sklearn.metrics.pairwise import cosine_similarity

import itertools

# Stocks

In [6]:
# Read the company-level panel; the CSV carries a saved row index as its
# first column ('Unnamed: 0'), which is discarded so a fresh RangeIndex is used.
igChanges = pd.read_csv("../../data/companyData/igData.csv").drop(columns=["Unnamed: 0"])
igChanges.head()

Unnamed: 0,gvkey,datadate,year,qtr,companyName,curcdq,assets,cash,costGoodsSold,totalInv,...,company,state,city,address_line_1,zipcode,latitude,longitude,parent_employee_size_code,location_employee_size_code,employeesAtLocation
0,8515,20030331,2003,1,PHI INC,USD,524.003551,,67.936692,52.543325,...,PETROLEUM HELICOPTERS INC,LA,LAFAYETTE,2001 SE EVANGELINE TRWY,70508,30.19255,-91.99479,10.0,10.0,1.0
1,8515,20030630,2003,2,PHI INC,USD,510.945781,,67.929738,53.962642,...,PETROLEUM HELICOPTERS INC,LA,LAFAYETTE,2001 SE EVANGELINE TRWY,70508,30.19255,-91.99479,10.0,10.0,1.0
2,8515,20030930,2003,3,PHI INC,USD,518.038412,,73.764198,55.502832,...,PETROLEUM HELICOPTERS INC,LA,LAFAYETTE,2001 SE EVANGELINE TRWY,70508,30.19255,-91.99479,10.0,10.0,1.0
3,8515,20031231,2003,4,PHI INC,USD,518.515573,,74.28658,55.505099,...,PETROLEUM HELICOPTERS INC,LA,LAFAYETTE,2001 SE EVANGELINE TRWY,70508,30.19255,-91.99479,10.0,10.0,1.0
4,8515,20040331,2004,1,PHI INC,USD,520.557973,,68.98805,53.825001,...,PETROLEUM HELICOPTERS INC,LA,LAFAYETTE,2002 SE EVANGELINE TRWY,70508,30.16299,-92.02032,10.0,10.0,1.0


In [5]:
# Load the pickled returns table and keep only the three columns used below.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
with open('../../data/stockReturns.pkl', 'rb') as f:
    stocks = pkl.load(f).loc[:, ['date', 'gvkey', 'RET']]

In [7]:
# Preview the first five rows of the returns table.
stocks.head(n=5)

Unnamed: 0,date,gvkey,RET
0,2000-01-03,1690.0,0.088754
1,2000-01-04,1690.0,-0.08431
2,2000-01-05,1690.0,0.014634
3,2000-01-06,1690.0,-0.086538
4,2000-01-07,1690.0,0.047368


In [8]:
# Count return rows that have no company identifier.
# The vectorized Series.sum() avoids iterating millions of rows in Python;
# int() keeps the displayed result a plain Python integer, as before.
int(stocks.gvkey.isna().sum())

2194269

In [9]:
# Keep returns dated after 2008 that carry a company identifier, then derive
# the calendar quarter/year used as merge keys and cast gvkey to an integer.
# Everything is built in one chain that returns a new frame, so no column is
# ever assigned onto a filtered slice (which raised SettingWithCopyWarning and
# left it ambiguous whether a view or a copy was being modified).
stocks = (
    stocks
    .loc[lambda d: (d.date.dt.year > 2008) & d.gvkey.notna()]
    .assign(
        qtr=lambda d: d.date.dt.quarter,
        year=lambda d: d.date.dt.year,
        # Explicit 64-bit width: plain `int` maps to a platform-dependent size,
        # while the company-controls key it is merged against is int64.
        gvkey=lambda d: d.gvkey.astype("int64"),
    )
)
stocks.shape

(7565684, 5)

In [10]:
# List every column of igChanges so the control variables selected below can be checked against it.
igChanges.columns

Index(['gvkey', 'datadate', 'year', 'qtr', 'companyName', 'curcdq', 'assets',
       'cash', 'costGoodsSold', 'totalInv', 'netIncome', 'opInc_afDep',
       'opInc_befDep', 'totalRevenue', 'costat', 'priceClose', 'add1',
       'addzip', 'assetsLast', 'netIncomeLast', 'totalRevenueLast',
       'costGoodsSoldLast', 'totalInvLast', 'opInc_afDepLast',
       'opInc_befDepLast', 'priceCloseLast', 'cashLast', 'incomeChange',
       'revenueChange', 'costChange', 'inventoryChange', 'opInc_afDepChange',
       'opInc_befDepChange', 'priceCloseChange', 'assetsPrev', 'assetsLagged',
       'netIncomeLagged', 'roa_lagged', 'famafrench', 'sic2', 'indGroup',
       'earliestYear', 'ageTercile', 'sizeTercile', 'profitTercile',
       'datacqtr', 'datafqtr', 'fyr', 'DATE', 'cstatCompanies', 'igCompanies',
       'delete', 'abi', 'ticker', 'company', 'state', 'city', 'address_line_1',
       'zipcode', 'latitude', 'longitude', 'parent_employee_size_code',
       'location_employee_size_code', 'emplo

In [11]:
# Company-quarter identifiers plus the control variables (industry, age/size/
# profit terciles) and the zipcode, taken from the company panel.
companyControls = igChanges.loc[
    :,
    [
        'gvkey', 'year', 'qtr',
        'famafrench', 'ageTercile', 'sizeTercile', 'profitTercile',
        'zipcode',
    ],
]
companyControls.head()

Unnamed: 0,gvkey,year,qtr,famafrench,ageTercile,sizeTercile,profitTercile,zipcode
0,8515,2003,1,40.0,0,1.0,1.0,70508
1,8515,2003,2,40.0,0,1.0,2.0,70508
2,8515,2003,3,40.0,0,1.0,2.0,70508
3,8515,2003,4,40.0,0,1.0,2.0,70508
4,8515,2004,1,40.0,0,1.0,2.0,70508


In [12]:
# Print the dtypes of both frames; the next cell merges them, so the columns
# they share (gvkey, year, qtr) should have matching dtypes.
print(stocks.dtypes, companyControls.dtypes)

date     datetime64[ns]
gvkey             int64
RET             float64
qtr               int64
year              int64
dtype: object gvkey              int64
year               int64
qtr                int64
famafrench       float64
ageTercile         int64
sizeTercile      float64
profitTercile    float64
zipcode            int64
dtype: object


In [13]:
# Inner-join daily returns to the company controls for the same company-quarter.
# The join keys are spelled out instead of being inferred from whatever columns
# the two frames happen to share, so adding a same-named column to either frame
# later cannot silently change the join. The key order matches the order pandas
# would infer (left-frame column order), keeping the result identical.
# NOTE(review): if companyControls holds several rows per (gvkey, qtr, year),
# each daily return is repeated once per matching row — confirm that is intended.
stocksWithControls = stocks.merge(
    companyControls,
    how="inner",
    on=["gvkey", "qtr", "year"],
)
print(stocksWithControls.shape,stocks.shape,companyControls.shape)
stocksWithControls.head()

(4966717, 10) (7565684, 5) (241355, 8)


Unnamed: 0,date,gvkey,RET,qtr,year,famafrench,ageTercile,sizeTercile,profitTercile,zipcode
0,2009-01-02,1690,0.063269,1,2009,36.0,0,2.0,2.0,95014
1,2009-01-05,1690,0.042204,1,2009,36.0,0,2.0,2.0,95014
2,2009-01-06,1690,-0.016494,1,2009,36.0,0,2.0,2.0,95014
3,2009-01-07,1690,-0.021608,1,2009,36.0,0,2.0,2.0,95014
4,2009-01-08,1690,0.018569,1,2009,36.0,0,2.0,2.0,95014


In [14]:
# Drop the three large inputs now that the merged frame exists, then force a
# garbage-collection pass so the kernel can reclaim their memory.
del stocks, companyControls, igChanges
gc.collect()

492

In [None]:
# Annual weather measures: discard the saved index column, keep only rows with
# a non-missing temp_annualLast5, parse the compact YYYYMMDD dates, and rename
# ZIP to zipcode.
annualWeather = (
    pd.read_csv("../../data/companyData/stockWeather_annual.csv")
    .drop(columns=["Unnamed: 0"])
    .loc[lambda d: d.temp_annualLast5.notna()]
    .reset_index(drop=True)
    .assign(date=lambda d: pd.to_datetime(d["date"], format="%Y%m%d"))
    .rename(columns={"ZIP": "zipcode"})
)
print(annualWeather.dtypes)
annualWeather.head()

In [None]:
# Zip-by-quarter weather quantiles: discard the saved index column, keep only
# rows with a non-missing temp_zipQuarterLast5, parse the ISO-formatted dates,
# and rename ZIP to zipcode.
allWeather = (
    pd.read_csv("../../data/companyData/stockWeather_zipQuarterQuants.csv")
    .drop(columns=["Unnamed: 0"])
    .loc[lambda d: d.temp_zipQuarterLast5.notna()]
    .reset_index(drop=True)
    .assign(date=lambda d: pd.to_datetime(d["date"], format="%Y-%m-%d"))
    .rename(columns={"ZIP": "zipcode"})
)
print(allWeather.dtypes)
allWeather.head()

In [None]:
# Attach the zip-quarter weather first, then the annual weather; both are inner
# joins on whatever columns the frames have in common.
# NOTE(review): the join keys are implicit here — confirm the shared columns are
# only the intended ones (presumably date and zipcode).
stocksWithControlsWeather = pd.merge(
    pd.merge(stocksWithControls, allWeather),
    annualWeather,
)
print(stocksWithControlsWeather.shape, allWeather.shape)

stocksWithControlsWeather.head()

In [None]:
# Persist the merged panel. The row index is written too (pandas default), so
# the file gains a leading unnamed column when it is read back.
stocksWithControlsWeather.to_csv(
    path_or_buf="../../data/companyData/stocksWithControlsWeather.csv",
)

In [None]:
# Count rows in the merged panel with a missing return.
# The vectorized Series.sum() avoids iterating every row in Python;
# int() keeps the displayed result a plain Python integer, as before.
int(stocksWithControlsWeather.RET.isna().sum())