In [38]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
from scipy.stats.mstats import winsorize

### **Panel A: Emission variables**

In [39]:
# load data as pandas data frames
dataset_raw = pd.read_csv('dataset_raw.csv')

# store emission variables
emission_variables =  pd.DataFrame()

In [40]:
dataset_sorted = dataset_raw.sort_values(['gvkey', 'fyear'])

mask = (dataset_sorted['Scope_1'] > 0) & (dataset_sorted['Scope_2'] > 0)
dataset_sorted['Scope_1'] = dataset_sorted['Scope_1'][mask]
dataset_sorted['Scope_2'] = dataset_sorted['Scope_2'][mask]

#### Log(total Emission)

In [41]:
emission_variables['total_emission_scope1'] = np.log(dataset_sorted['Scope_1'])
emission_variables['total_emission_scope2'] = np.log(dataset_sorted['Scope_2'])

In [42]:
# store 'gvkey' and 'fyear' from the sorted dataset in the emission_variables dictionary
emission_variables['gvkey'] = dataset_sorted['gvkey']
emission_variables['fyear'] = dataset_sorted['fyear']

#### Emission Growth Rate

In [43]:
emission_variables['emission_growth_scope1'] = dataset_sorted.groupby('gvkey')['Scope_1'].pct_change()
emission_variables['emission_growth_scope2'] = dataset_sorted.groupby('gvkey')['Scope_2'].pct_change()

  emission_variables['emission_growth_scope1'] = dataset_sorted.groupby('gvkey')['Scope_1'].pct_change()
  emission_variables['emission_growth_scope2'] = dataset_sorted.groupby('gvkey')['Scope_2'].pct_change()


In [44]:
growth_rates1 = emission_variables['emission_growth_scope1']
growth_rates2 = emission_variables['emission_growth_scope2']

# inf, nan
growth_rates1.replace([np.inf, -np.inf], np.nan, inplace=True)
growth_rates1.dropna(inplace=True)
growth_rates2.replace([np.inf, -np.inf], np.nan, inplace=True)
growth_rates2.dropna(inplace=True)

# winsorize
winsorized_growth_rates1 = winsorize(growth_rates1, limits=[0.025, 0.025])
winsorized_growth_rates2 = winsorize(growth_rates2, limits=[0.025, 0.025])

In [45]:
# replace the emission growth by winsorized data
emission_variables['emission_growth_scope1'] = pd.DataFrame(winsorized_growth_rates1)
emission_variables['emission_growth_scope2'] = pd.DataFrame(winsorized_growth_rates2)

### Carbon Intensity

In [49]:
emission_variables['carbon_intensity_scope1'] = (dataset_sorted['Scope_1'] / (dataset_sorted['sale']*100)) 
emission_variables['carbon_intensity_scope2'] = (dataset_sorted['Scope_2'] / (dataset_sorted['sale']*100)) 

# inf, nan #
emission_variables['carbon_intensity_scope1'].replace([np.inf, -np.inf], np.nan, inplace=True)
emission_variables['carbon_intensity_scope1'].dropna(inplace=True)
emission_variables['carbon_intensity_scope2'].replace([np.inf, -np.inf], np.nan, inplace=True)
emission_variables['carbon_intensity_scope2'].dropna(inplace=True)

# winsorize
emission_variables['carbon_intensity_scope1'] = winsorize(emission_variables['carbon_intensity_scope1'], limits=[0.025, 0.025])
emission_variables['carbon_intensity_scope2'] = winsorize(emission_variables['carbon_intensity_scope2'], limits=[0.025, 0.025])

### Check emission data

In [47]:
emission_variables.describe()

  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(


Unnamed: 0,total_emission_scope1,total_emission_scope2,gvkey,fyear,emission_growth_scope1,emission_growth_scope2,carbon_intensity_scope1,carbon_intensity_scope2
count,15987.0,15987.0,15999.0,15999.0,13365.0,13365.0,15999.0,15999.0
mean,9.880214,10.095786,55747.186137,2016.570911,0.056679,0.068529,1.163097,0.303205
std,2.978628,2.464885,64393.751756,2.857792,0.295652,0.292209,3.586708,0.370308
min,-1.645065,-0.079043,1004.0,2010.0,-0.587399,-0.510633,0.003999,0.008185
25%,7.884177,8.6271,9815.0,2015.0,-0.070276,-0.074187,0.035725,0.073925
50%,9.816265,10.300611,24430.0,2017.0,0.023288,0.024767,0.130434,0.168343
75%,11.627291,11.748831,66731.0,2019.0,0.133633,0.155335,0.294071,0.403065
max,18.804073,16.920606,328795.0,2020.0,1.124937,1.138111,19.735829,1.780249


In [50]:
emission_variables.to_csv('./emission_variables.csv', index=True)