In this notebook I investigate the effect of inequality reduction on GHG emissions

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from kde import *
from perc import *
%matplotlib inline

from wb_api_wraper import *

In [2]:
# Directory holding the survey CSV extracts (trailing slash included).
datadir = "csvdata/"

In [3]:
# Country-name lookup tables, each loaded as a Series keyed by its index column.
# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0;
# squeezing the single remaining column explicitly yields the same Series.
any_name_to_wb_name = pd.read_csv("any_name_to_wb_name.csv", index_col="any").squeeze("columns")
iso_to_name = pd.read_csv("iso3_to_wb_name.csv", usecols=["iso3", "country"], index_col="iso3").squeeze("columns")
names_to_iso = pd.read_csv("names_to_iso.csv", usecols=["country", "iso3"], index_col="country").squeeze("columns")

### Data from the World Bank API

In [4]:
# Population and CO2-emission series fetched through the star-imported `get_wb_mrv` helper
# (presumably from wb_api_wraper, returning the most recent value per country — TODO confirm).
# The second argument names the column that is then extracted as a Series.
wb_pop = get_wb_mrv("SP.POP.TOTL","wb_pop").wb_pop
wb_emis = get_wb_mrv("EN.ATM.CO2E.FF.KT","wb_emis").wb_emis #CO2 emissions from fossil fuels, total;

### Data from Maros

In [10]:
#personal data
# Most person weights are 1 but some are 0.6.
persdata = (
    pd.read_csv(datadir + "PERSDATA.csv")  # reuse the data directory configured at the top
    .replace(iso_to_name)
    .rename(columns=str.lower)  # lower-case headers without assigning a lazy `map` object to .columns
    .set_index("schema")
    .drop("arm04")  # drops Armenia 2004 since we have Armenia 2012
    .reset_index()
)
persdata.schema.unique()  # countries should appear only once

array(['alb05', 'arm12', 'gha06', 'ind05', 'irq06', 'lbr07', 'mda12',
       'mwi11', 'ner11', 'rwa11', 'sen11', 'sle11', 'vnm12'], dtype=object)

In [11]:
#household data
hhdata = (
    pd.read_csv(datadir + "HHLDDATA.csv")  # reuse the data directory configured at the top
    .replace(iso_to_name)
    .rename(columns=str.lower)  # lower-case headers without assigning a lazy `map` object to .columns
    .set_index("schema")
    .drop("arm04")  # drops Armenia 2004 since we have Armenia 2012
    .reset_index()
)
hhdata.schema.unique()  # countries should appear only once

array(['alb05', 'arm12', 'bgd05', 'gha06', 'ind05', 'irq06', 'lbr07',
       'mda12', 'mwi11', 'ner11', 'rwa11', 'sen11', 'sle11', 'vnm12'], dtype=object)

In [12]:
#emissions per household and per capita
emis_hh = (
    pd.read_csv(datadir + "emissionsbyhousehold.csv")  # reuse the data directory configured at the top
    .replace(iso_to_name)
    .rename(columns={"value": "emis_hh"})
)
emis_hh.schema.unique()  # countries should appear only once (Armenia is only once in emissionsbyhousehold)

array(['alb05', 'arm12', 'gha06', 'ind05', 'irq06', 'lbr07', 'mda12',
       'mwi11', 'ner11', 'rwa11', 'sen11', 'sle11', 'vnm12'], dtype=object)

In [13]:
# Household `weight` column as a Series indexed by (country, hid).
weight_hh = hhdata.set_index(['country',"hid"])["weight"]

In [14]:
#number of persons per household (count of person ids within each (country, hid))
nb_people_in_hh = persdata.groupby(["country", "hid"])["pid"].count()
nb_people_in_hh.name = "nb_people_in_hh"
#total number of people in the economy: household weight times household size, summed per country.
#`Series.sum(level=...)` was removed in pandas 2.0, so aggregate with an explicit groupby on the index level.
nb_people_tot = (weight_hh * nb_people_in_hh).groupby(level="country").sum()
#ratio of survey-based population to World Bank population
(nb_people_tot / wb_pop).dropna()

country
Albania         0.412566
Armenia         0.759794
Ghana           0.872517
India           0.796689
Iraq            0.900637
Liberia         0.630027
Malawi          0.861687
Moldova         0.942429
Niger           0.933535
Rwanda          0.190569
Senegal         0.964585
Sierra Leone    0.971050
Vietnam         0.995079
dtype: float64

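The previous cell confirms how population should be computed from the household data: sum, per country, the household weight times the number of people in the household.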

In [15]:
#checks sanity with the index
# Counts repeated (country, hid) pairs in each table; (0, 0) means the pair is unique in both.
emis_hh[['country',"hid"]].duplicated().sum()  , hhdata[['country',"hid"]].duplicated().sum()

(0, 0)

In [17]:
# Column-wise concat: the three Series are aligned on their (country, hid) index,
# giving one row per household with size, weight and emissions.
df= pd.concat([nb_people_in_hh,weight_hh,emis_hh.set_index(["country","hid"])["emis_hh"]],axis=1)

In [18]:
df.columns

Index(['nb_people_in_hh', 'weight', 'emis_hh'], dtype='object')

In [31]:
# Survey total (household size x weight x emissions, summed per country) over World Bank emissions.
# `sum(level=...)` was removed in pandas 2.0; group on the index level instead.
(df[['nb_people_in_hh', 'weight', 'emis_hh']].prod(axis=1).groupby(level="country").sum()/wb_emis).dropna()

country
Ghana            7668427.679740
Liberia         10581813.800422
Malawi           9546913.785954
Niger           42850772.803739
Rwanda           2473575.143272
Senegal         19994975.983993
Sierra Leone     7157177.872343
dtype: float64

In [32]:
# Survey total (weight x emissions, summed per country) over World Bank emissions.
# `sum(level=...)` was removed in pandas 2.0; group on the index level instead.
(df[[ 'weight', 'emis_hh']].prod(axis=1).groupby(level="country").sum()/wb_emis).dropna()

country
Ghana           1693810.553586
Liberia         1788823.758953
Malawi          1877455.169198
Niger           5696680.335304
Rwanda          2473575.143272
Senegal         1941522.382588
Sierra Leone    1168308.696252
dtype: float64

In [25]:
# Survey total (household size x emissions, no weight, summed per country) over World Bank emissions.
# `sum(level=...)` was removed in pandas 2.0; group on the index level instead.
(df[['nb_people_in_hh',  'emis_hh']].prod(axis=1).groupby(level="country").sum()/wb_emis).dropna()

country
Ghana            11569.724725
Liberia          74623.075227
Malawi           39678.283600
Niger           171889.152776
Rwanda           16620.793017
Senegal          68731.240231
Sierra Leone     50327.673803
dtype: float64

In [44]:
# NOTE(review): the saved output lists a 'prod' column that no visible cell creates —
# the frame was apparently modified by a cell that is no longer in the notebook.
df.columns

Index(['nb_people_in_hh', 'weight', 'emis_hh', 'prod'], dtype='object')

In [51]:
# Emissions divided by (weight x household size), summed per country, over World Bank emissions.
# `sum(level=...)` was removed in pandas 2.0; group on the index level instead.
((df.emis_hh/(df.weight*df.nb_people_in_hh)).groupby(level="country").sum()/wb_emis).dropna()

country
Ghana             1.584868
Liberia          40.918310
Malawi           15.435595
Niger            31.996557
Rwanda          119.544156
Senegal          11.144409
Sierra Leone     15.401058
dtype: float64

In [None]:
# Survey total (household size x weight x emissions, summed per country) over World Bank emissions.
# `sum(level=...)` was removed in pandas 2.0; group on the index level instead.
(df[['nb_people_in_hh', 'weight', 'emis_hh']].prod(axis=1).groupby(level="country").sum()/wb_emis).dropna()

### Data from GTAP and computation in SQL

In [35]:
# Emissions per country from the headerless two-column GTAP CSV, as a Series indexed by country.
# `squeeze=True` was removed from read_csv in pandas 2.0; squeeze the single column explicitly.
gtap_emis = pd.read_csv("tot_emis_gtap.csv", header=None, names=["country", "emis"], index_col="country").squeeze("columns")

In [36]:
gtap_emis;

In [38]:
# Ratio of GTAP to World Bank emissions; dropna keeps only countries with a value in both series.
(gtap_emis / wb_emis).dropna()

country
Benin               1.110148
Botswana            0.678491
Burkina Faso        1.268314
Cote d'Ivoire       1.109351
Egypt, Arab Rep.    0.512281
Ethiopia            1.524031
Ghana               1.599943
Guinea              1.541588
Kenya               0.837620
Madagascar          1.847151
Malawi              1.886472
Mauritius           0.878304
Morocco             0.648597
Mozambique          2.624023
Namibia             0.743559
Nigeria             1.002656
Rwanda              1.891985
Senegal             1.120419
South Africa        0.194588
Tanzania            1.059062
Togo                1.326361
Tunisia             0.621414
Uganda              1.259882
Zambia              4.171393
Zimbabwe            0.879348
dtype: float64

In [52]:
# Removed the stray "{" inside the variable name, which made this cell a syntax error.
# NOTE(review): `tot_emis_maybe` is not defined in any visible cell — confirm which
# series this comparison was meant to use.
(gtap_emis / tot_emis_maybe).dropna()

Unnamed: 0_level_0,Albania,Argentina,Armenia,Australia,Austria,Azerbaijan,Bahrain,Bangladesh,Belarus,Belgium,...,XSE,XSM,XSU,XTW,XWF,XWS,Zambia,Zimbabwe,emis_hh,year
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [29]:
# `.ix` was removed in pandas 1.0; `.loc` does the same label-based lookup on the series' own index.
wb_emis.loc[wb_emis.index]

country
Albania              NaN
Armenia              NaN
Bangladesh           NaN
Ghana           7444.010
India                NaN
Iraq                 NaN
Liberia          524.381
Malawi          1059.763
Moldova              NaN
Niger           1158.772
Rwanda           726.066
Senegal         4576.416
Sierra Leone    1415.462
Vietnam              NaN
Name: wb_emis, dtype: float64

In [16]:
# Align World Bank emissions on the survey countries. `.ix` was removed in pandas 1.0;
# `reindex` keeps its behaviour of yielding NaN for countries missing from the World Bank series.
# NOTE(review): `tot_emis` is not defined in any visible cell.
wb_emis = wb_emis.reindex(tot_emis.index)

In [27]:
tot_emis

country
Albania         1.040662e+09
Armenia         1.282214e+11
Bangladesh               NaN
Ghana           1.260871e+10
India           1.906669e+11
Iraq            3.202297e+10
Liberia         9.380252e+08
Malawi          1.989655e+09
Moldova         3.982440e+10
Niger           6.601154e+09
Rwanda          1.795978e+09
Senegal         8.885214e+09
Sierra Leone    1.653691e+09
Vietnam         5.715552e+10
dtype: float64

### Data from IEA

In [39]:

# IEA table: skip the second file line, treat ".." as missing, drop all-empty columns and
# incomplete rows, harmonise country names, and index by country.
iea_emis = (
    pd.read_csv("carbon_emissions_from_iea.csv", skiprows=[1], na_values=[".."])
    .dropna(axis=1, how="all")
    .dropna()
    .replace(any_name_to_wb_name)
    .set_index("country")
)
# `.ix` was removed in pandas 1.0; `reindex` reproduces its NaN-for-missing-label behaviour.
iea_emis = iea_emis.reindex(wb_emis.index).astype("float")

In [19]:
wb_emis

country
Albania              NaN
Armenia              NaN
Bangladesh           NaN
Ghana           7444.010
India                NaN
Iraq                 NaN
Liberia          524.381
Malawi          1059.763
Moldova              NaN
Niger           1158.772
Rwanda           726.066
Senegal         4576.416
Sierra Leone    1415.462
Vietnam              NaN
Name: wb_emis, dtype: float64

In [20]:
# Last column of the IEA table.
# NOTE(review): the saved NameError suggests this cell was last run before `iea_emis` was defined.
iea_emis.iloc[:,-1]

NameError: name 'iea_emis' is not defined

### Comparing various sources

In [21]:
# NOTE(review): `tot_pop` is not defined in any visible cell; it looks like the survey
# population computed earlier as `nb_people_tot` — confirm before re-running.
(tot_pop/wb_pop).dropna()

country
Albania         0.412566
Armenia         3.624206
Ghana           0.872517
India           0.796689
Iraq            0.900637
Liberia         0.630027
Malawi          0.861687
Moldova         0.942429
Niger           0.933535
Rwanda          0.190569
Senegal         0.964585
Sierra Leone    0.971050
Vietnam         0.995079
dtype: float64

In [22]:
# Align the World Bank series on the survey countries. `.ix` was removed in pandas 1.0;
# `reindex` reproduces its NaN-for-missing-label behaviour.
# NOTE(review): `wb_emis_pc` is never defined in the visible cells (hence the saved NameError).
wb_emis_pc = wb_emis_pc.reindex(tot_emis.index)
wb_emis = wb_emis.reindex(tot_emis.index) #kilotons, for some reason

NameError: name 'wb_emis_pc' is not defined

In [23]:
tot_emis/wb_emis

country
Albania                    NaN
Armenia                    NaN
Bangladesh                 NaN
Ghana           1693805.518934
India                      NaN
Iraq                       NaN
Liberia         1788823.758953
Malawi          1877452.705637
Moldova                    NaN
Niger           5696680.335304
Rwanda          2473574.510851
Senegal         1941522.382588
Sierra Leone    1168304.614147
Vietnam                    NaN
dtype: float64

In [26]:
# Survey emissions per capita relative to the World Bank per-capita figure.
# NOTE(review): `wb_emis_pc` and `tot_pop` are not defined in any visible cell (see the saved NameError).
(tot_emis/tot_pop)/(wb_emis_pc)

NameError: name 'wb_emis_pc' is not defined

In [25]:
#expenditures per capita
# The original passed `inplace="True"` (a string) to set_index; pandas validates that
# argument as a bool and recent versions raise ValueError, so build the frame in one chain.
hh_exp = (
    pd.read_csv(datadir + "PerCapitaExpenditures.csv")  # reuse the data directory configured at the top
    .replace(iso_to_name)
    .rename(columns={"value": "hh_exp_pc"})
    .set_index(["country", "hid"])
)

In [None]:
#builds dataframe indexed by country and hid with all the data
# NOTE(review): `hh_emi`, `hh_emi_pc`, `nb_people` and `hop` are not defined in any visible
# cell; they must already exist in the kernel for this cell to run.
df = pd.DataFrame(hh_exp["hh_exp_pc"])
df["hh_emi"] = hh_emi["hh_emis"]
df["hh_emi_pc"] = hh_emi_pc
df["nb_people"] = nb_people
df["w"] = hop["weight"]
# emissions per unit of expenditure
df["hh_emis_perdol"] = df["hh_emi_pc"] / df["hh_exp_pc"]

#Vietnam
# `.ix` was removed in pandas 1.0; `.loc` performs the same label-based selection.
# NOTE(review): country labels go through `.replace(iso_to_name)` when loaded, so the key
# may need to be the full country name rather than "VNM" — confirm.
vn = df.loc["VNM", :]

In [None]:
def wavg(val_col_name, wt_col_name):
    """Build an aggregator returning the mean of one column weighted by another.

    The returned callable takes a group (for instance a DataFrame handed over
    by ``groupby(...).apply``) and returns
    ``sum(values * weights) / sum(weights)`` for the two named columns.
    """
    def inner(group):
        values = group[val_col_name]
        weights = group[wt_col_name]
        weighted_total = (values * weights).sum()
        return weighted_total / weights.sum()
    return inner

In [None]:
# Turn the index levels back into ordinary columns, then list the distinct countries.
flat_df = df.reset_index()
country_list = flat_df["country"].unique()

In [None]:
# Expenditure brackets in USD per day: a few fine thresholds, then 10, 20, 40, ... (doubling).
fine_thresholds = [0, 1, 1.25, 2, 2.5, 4]
doubling_thresholds = [10 * 2.0 ** power for power in range(15)]
income_bins = np.array(fine_thresholds + doubling_thresholds)
# Expenditure is divided by 365 before being assigned to a bracket.
daily_exp_pc = flat_df["hh_exp_pc"] / 365
income_cut = pd.cut(daily_exp_pc, income_bins)

income_cut.unique()


In [None]:
# Preview only the first rows instead of rendering the whole household table.
flat_df.head()

In [None]:
# Per (country, expenditure bracket): weighted-average per-capita emissions and head count.
stats = pd.DataFrame()
grouped =flat_df.groupby(["country",income_cut])
# average of hh_emi_pc weighted by the household weight `w`
stats["avg_emis_pc"]=grouped.apply(wavg('hh_emi_pc', 'w'))
# NOTE(review): this sums `nb_people` without applying the household weight `w`, whereas the
# population check earlier in the notebook uses weight x household size — confirm which is intended.
stats["nb_people0"] =grouped["nb_people"].sum()
#stats.sort_index(inplace=True)
#stats["nb_people1"]=grouped["nb_people"]
stats

In [None]:
#stats.ix[c]
#country_stats

In [None]:
#for c in country_list:
#def lift_folks(country_stats):
def lift_folks(country_stats, bins=None):
    """Build counterfactual head counts in which the lowest brackets are lifted up.

    For each bracket position ``i`` in 1..9 a column ``'min' + str(bins[i])`` is
    added. It holds the ``nb_people0`` distribution in which everyone from
    brackets ``0..i`` has been moved into bracket ``i``: rows below ``i`` are set
    to 0, row ``i`` receives the cumulated head count, rows above are unchanged.

    Rows are addressed by position, so ``country_stats`` must be ordered from the
    lowest to the highest bracket and contain at least 10 rows (otherwise
    ``.iloc`` raises ``IndexError``).

    Parameters
    ----------
    country_stats : DataFrame
        One row per bracket, with an ``nb_people0`` column.
    bins : sequence, optional
        Bracket edges used to name the scenario columns. Defaults to the
        notebook-level ``income_bins``.

    Returns
    -------
    DataFrame
        The ``nb_people0`` column followed by one column per scenario. The input
        frame is left untouched.
    """
    if bins is None:
        bins = income_bins
    baseline = country_stats["nb_people0"]
    # Explicit copy so the scenario columns never write back into the caller's frame.
    out = country_stats.loc[:, ["nb_people0"]].copy()
    for i in range(1, 10):
        nam = 'min' + str(bins[i])
        out[nam] = baseline
        col = out.columns.get_loc(nam)
        # The original used `.ix` (removed in pandas 1.0) with integer keys, which
        # presumably fell back to positions on the bracket index; `.iloc` states
        # that positional intent explicitly.
        out.iloc[i, col] = baseline.iloc[0:i + 1].sum()
        out.iloc[0:i, col] = 0
    return out
 #   stats.ix[c,[out.columns.values]]=out

In [None]:
#computes new emissions at once
# For each country: average emissions per person under every head-count column returned by
# lift_folks (bracket averages dotted with the scenario frame, divided by the total head count).
# `.ix` was removed in pandas 1.0; `.loc` selects the country's rows by label.
emis_sce_country = pd.DataFrame()
for c in country_list:
    country_stats = stats.loc[c]
    emis_sce_country[c] = country_stats["avg_emis_pc"].dot(lift_folks(country_stats)) / country_stats["nb_people0"].sum()

emis_sce_country

In [None]:
#emis_sce_country['tot']=emis_sce_country.sum(axis=1)  

In [None]:
# Percentage change of every scenario relative to the baseline row ("nb_people0").
# `.ix` was removed in pandas 1.0; `.loc` does the same label lookup.
(emis_sce_country/emis_sce_country.loc["nb_people0",:]-1)*100

In [None]:
#paf = stats.reset_index().groupby("country").apply(lift_folks)
#paf.reset_index().groupby("country").apply(wavg("avg_emis_pc","nb_people1"))
#paf

In [None]:
#pd.concat([flat_df,pd.get_dummies(income_cut)],axis=1).set_index(["country","hid"])
# Bracket every household by expenditure / 365 and build one indicator column per bracket.
income_cut = pd.cut(df["hh_exp_pc"]/365, income_bins)

income_dummies= pd.get_dummies(income_cut)
# NOTE(review): `df` is rebuilt from itself, so re-running this cell appends the indicator columns again.
df= pd.concat([df,income_dummies],axis=1)
df

In [None]:
# `stats` only carries the head count under "nb_people0" (no "nb_people" column is ever
# created), and `.ix` was removed in pandas 1.0, so copy the baseline with plain column access.
stats["nb_people1"] = stats["nb_people0"]
stats["nb_people1"]

In [None]:
# The original line had an unterminated string and passed a bare label to pd.MultiIndex.
# Selecting on the first index level returns every bracket row for that country.
# NOTE(review): country labels go through `.replace(iso_to_name)` when loaded, so the key
# may need to be the full country name rather than "ALB" — confirm.
stats.loc["ALB"]

In [None]:
# `wp` is star-imported from a project-local module (presumably `perc`); the arguments look like
# values, weights and the quantiles 0.0-0.9 — TODO confirm against its definition.
wp(vn["hh_exp_pc"],vn["nb_people"],np.arange(0,1,0.1))

In [None]:
# Scatter of emissions per unit of expenditure against expenditure, on a log x-axis.
font = {'family' : 'serif',
        'weight' : 'normal',
        'size'   : 15}
plt.rc('font', **font)
plt.figure(figsize=(9,7))
plt.plot(df["hh_exp_pc"], df["hh_emi_pc"]/df["hh_exp_pc"], color="blue", marker='.',
         markeredgewidth=1, linestyle='None', markersize=12, clip_on=False)
# Raw strings: "\$" is not a valid escape sequence in a normal string literal and triggers
# a warning on recent Python versions; the label text itself is unchanged.
plt.xlabel(r"$\$/yr$")
plt.ylabel(r"$gCO_2/\$$")
plt.xscale("log")

In [None]:
# The dangling "stats." (attribute access with no name) was a syntax error; preview the table instead.
stats.head()

In [None]:
def weighted_rolling_mean(x,y,w,kernel):
    """Placeholder for a kernel-smoothed, weighted mean of ``y`` along ``x``.

    The original definition had no body, which made the cell a syntax error.
    The intended computation is not specified anywhere in the notebook, so the
    function fails loudly instead of returning a made-up result.

    Raises
    ------
    NotImplementedError
        Always, until the smoothing is actually implemented.
    """
    raise NotImplementedError("weighted_rolling_mean has not been implemented yet")

In [None]:
# Toy example: non-parametric kernel regression of a noisy sine curve.
from statsmodels.nonparametric.kernel_regression import KernelReg
import numpy as np
import matplotlib.pyplot as plt
# Seeded generator so the noisy sample (and therefore the fitted curve) is reproducible.
rng = np.random.RandomState(0)
x = np.linspace(0,2*np.pi,100)
y = np.sin(x) + rng.random_sample(100) * 0.2
# x is a continuous regressor, so its var_type is 'c'; 'o' declares an ordered discrete variable.
kr = KernelReg(y,x,'c')
plt.plot(x, y, '+')
# fit() returns the estimated conditional mean and the marginal effects (not a standard deviation).
y_pred, y_mfx = kr.fit(x)
plt.plot(x, y_pred)

In [None]:
def plot_distribution(y,w,thecolor="red"):
    """Draw the weighted distribution of ``y`` with the pyplot state machine.

    A 100-bin, weighted, density-normalised histogram is drawn with fully
    transparent bars (alpha 0). On top of it, ``gaussian_kde`` (called with
    ``weights=w``) is evaluated on 300 evenly spaced points between the lowest
    and highest bin centre and plotted as a line in ``thecolor``.
    """
    n_bins, n_points = 100, 300
    heights, edges = np.histogram(y, bins=n_bins, weights=w, density=True)
    bar_width = (edges[1] - edges[0]) * 0.5
    midpoints = 0.5 * (edges[1:] + edges[:-1])
    plt.bar(midpoints, heights, align='center', width=bar_width, alpha=0.0, color=thecolor)
    kde = gaussian_kde(y, weights=w)
    # Kept from the original: the covariance is recomputed explicitly before evaluating.
    kde._compute_covariance()
    grid = np.linspace(midpoints.min(), midpoints.max(), n_points)
    plt.plot(grid, kde(grid), linewidth=2, color=thecolor)


In [None]:
# Distribution of log per-capita emissions for the `vn` subset, weighted by `w`.
plot_distribution(np.log(vn.hh_emi_pc),vn.w)

