In this notebook I investigate the effect of inequality reduction on greenhouse gas (GHG) emissions.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from kde import *
from perc import *
%matplotlib inline


from wb_api_wraper import *

In [45]:
def names_to_wb_name(df_in,any_name_to_iso3,iso3_to_unique_name,common_name="iso3",end_name="country"):
    """Match arbitrary country names from a dataset to World Bank country names through iso3 codes.

    Parameters
    ----------
    df_in : DataFrame (or anything ``pd.DataFrame`` accepts)
        Data indexed by country names as spelled in the source dataset.
        It is never modified; the function works on a copy.
    any_name_to_iso3 : object indexable by ``common_name``
        ``any_name_to_iso3[common_name]`` must be index-aligned with ``df_in``
        (country spelling -> iso3 code).
    iso3_to_unique_name : Series-like
        Index-aligned on iso3 codes, giving the unique country name.
    common_name : str
        Name of the temporary iso3 column.
    end_name : str
        Name of the column receiving the unique country name.

    Returns
    -------
    DataFrame
        Copy of ``df_in`` with an ``end_name`` column, indexed by the column
        that carried the original index. Rows containing any missing value
        (including unrecognized countries) are dropped. If ``end_name`` equals
        the original index name, the index therefore holds the unique names.

    Warns
    -----
    UserWarning
        When some countries have no iso3 code or no unique name.
    """
    import warnings  # local import: the notebook never imports it explicitly

    # Copy so that adding helper columns cannot leak into the caller's frame.
    df = pd.DataFrame(df_in).copy()

    index_name = df.index.name
    # reset_index() stores an unnamed index in a column called "index".
    index_col = "index" if index_name is None else index_name

    df[common_name] = any_name_to_iso3[common_name]

    # Warn if some countries are not recognized.
    unrecognized = df[common_name].isnull()
    if unrecognized.any():
        # map(str, ...) keeps the join from failing on non-string labels.
        warnings.warn("Unrecognized countries in 1st argument 'df':"+", ".join(map(str, df.index[unrecognized])))
        warnings.warn("Correct 'df' or add lines to 2nd argument 'any_name_to_iso3'")

    # NOTE(review): this drops every row holding *any* NaN, not only the
    # unrecognized countries -- kept as in the original.
    df = df.dropna()

    # Re-index on iso3 so the unique names align on the codes.
    df = df.reset_index().set_index(common_name)
    df[end_name] = iso3_to_unique_name
    missing = df[end_name].isnull()
    if missing.any():
        warnings.warn("this countries appear to be missing from 3rd argument 'iso3_to_unique_name':"+", ".join(map(str, df.index[missing])))

    # Return the frame with its original indexing (and original index name).
    out = df.dropna().set_index(index_col)
    out.index.name = index_name
    return out

In [2]:
# Folder holding the household survey CSV files
datadir = "csvdata/"


In [67]:
# Country-name lookup tables.
# read_csv(squeeze=True) no longer exists in recent pandas, so the single
# remaining column is selected explicitly to obtain a Series.
# iso3 code -> World Bank country name
iso_to_name = pd.read_csv("iso3_to_wb_name.csv", usecols=["iso3", "country"], index_col="iso3")["country"]
# any country spelling -> iso3 code
names_to_iso = pd.read_csv("names_to_iso.csv", usecols=["country", "iso3"], index_col="country")["iso3"]
# any country spelling -> World Bank country name
any_name_to_wb_name = names_to_iso.replace(iso_to_name)

### Data from Maros

In [68]:
# Number of persons per household: one row per person, counted by (country, hid).
# Original author's note on this load: "here most of weights are 1 but some
# weights are 0.6" (no weight column is read in this cell).
persdata = (
    pd.read_csv("csvdata/PERSDATA.csv", usecols=["COUNTRY", "HID", "PID"])
    .replace(iso_to_name)
    .rename(columns=str.lower)
)
nb_people_in_hh = persdata.groupby(["country", "hid"])["pid"].count()


In [5]:
# Total number of people in the economy: household survey weight times
# household size, indexed by (country, hid).
hhdata = pd.read_csv("csvdata/HHLDDATA.csv").replace(iso_to_name)
hhdata.columns = [col.lower() for col in hhdata.columns]
# The original passed inplace="True" (a string), which pandas rejects as a
# non-boolean; rebinding the name avoids the in-place call altogether.
hhdata = hhdata.set_index(["country", "hid"])
nb_people_tot = hhdata["weight"] * nb_people_in_hh


In [8]:
# Emissions per household (the per-capita figures are derived in the next cell)
hh_emi = (
    pd.read_csv("csvdata/emissionsbyhousehold.csv")
    .replace(iso_to_name)
)

In [9]:
# Name the emissions column, index it by (country, hid), then divide by
# household size to get per-capita emissions.
hh_emi = hh_emi.rename(columns={"value": "hh_emis"})
hh_emis = hh_emi.set_index(["country", "hid"])["hh_emis"]
hh_emi_pc = hh_emis.div(nb_people_in_hh)

In [10]:
# Country totals: population (weighted household sizes) and emissions
# (household emissions scaled by the survey weight).
people_by_household = nb_people_tot.reset_index()
tot_pop = people_by_household.groupby("country")[0].sum()
weighted_hh_emis = hh_emis * hhdata.weight
tot_emis = weighted_hh_emis.groupby(level=0).sum()
tot_emis

country
Albania         1.040662e+09
Armenia                  NaN
Bangladesh               NaN
Ghana           1.260871e+10
India           1.906669e+11
Iraq            3.202297e+10
Liberia         9.380252e+08
Malawi          1.989655e+09
Moldova         3.982440e+10
Niger           6.601154e+09
Rwanda          1.795978e+09
Senegal         8.885214e+09
Sierra Leone    1.653691e+09
Vietnam         5.715552e+10
dtype: float64

### Data from World Bank API

In [13]:
# Fetch two World Bank indicators through the helper get_wb_mrv, which is not
# defined in this notebook (presumably star-imported from wb_api_wraper, and
# "mrv" presumably means "most recent value" -- TODO confirm).
# The second argument looks like the name given to the returned series.
wb_pop = get_wb_mrv("SP.POP.TOTL","wb_pop")
wb_emis = get_wb_mrv("EN.ATM.CO2E.FF.KT","wb_emis") #CO2 emissions from fossil fuels, total (a later cell notes the unit is kilotons)

In [16]:
# Keep only the surveyed countries. `.ix` has been removed from pandas;
# reindex() reproduces its behaviour here: countries absent from the World
# Bank series come back as NaN instead of raising.
wb_emis = wb_emis.reindex(tot_emis.index)

In [18]:
# Survey-based total emissions relative to the World Bank figure
tot_emis.div(wb_emis)

country
Albania                    NaN
Armenia                    NaN
Bangladesh                 NaN
Ghana           1693805.518934
India                      NaN
Iraq                       NaN
Liberia         1788823.758953
Malawi          1877452.705637
Moldova                    NaN
Niger           5696680.335304
Rwanda          2473574.510851
Senegal         1941522.382588
Sierra Leone    1168304.614147
Vietnam                    NaN
dtype: float64

### Data from IEA

In [117]:
# Country-name lookup tables (same files as the earlier mapping cell),
# reloaded here so this cell does not depend on it.
# read_csv(squeeze=True) no longer exists in recent pandas, so the single
# remaining column is selected explicitly to obtain a Series.
iso_to_name = pd.read_csv("iso3_to_wb_name.csv", usecols=["iso3", "country"], index_col="iso3")["country"]
names_to_iso = pd.read_csv("names_to_iso.csv", usecols=["country", "iso3"], index_col="country")["iso3"]
any_name_to_wb_name = names_to_iso.replace(iso_to_name)

# IEA carbon emissions: skip the second file line, read ".." as missing,
# drop empty columns and incomplete rows, then harmonize the country names.
iea_emis = (
    pd.read_csv("carbon_emissions_from_iea.csv", skiprows=[1], na_values=[".."])
    .dropna(axis=1, how="all")
    .dropna()
    .replace(any_name_to_wb_name)
    .set_index("country")
)
# `.ix` has been removed from pandas; reindex() keeps the same countries as
# wb_emis and yields NaN for those missing from the IEA table.
iea_emis = iea_emis.reindex(wb_emis.index).astype("float")

In [115]:
# Display the World Bank emissions series restricted to the surveyed countries
wb_emis

country
Albania              NaN
Armenia              NaN
Bangladesh           NaN
Ghana           7444.010
India                NaN
Iraq                 NaN
Liberia          524.381
Malawi          1059.763
Moldova              NaN
Niger           1158.772
Rwanda           726.066
Senegal         4576.416
Sierra Leone    1415.462
Vietnam              NaN
Name: wb_emis, dtype: float64

In [114]:
# Last column of the IEA table (the recorded output names it 2012)
iea_emis.iloc[:,-1]

country
Albania          0.00
Armenia          1.42
Bangladesh      16.81
Ghana            0.82
India           30.52
Iraq            10.96
Liberia           NaN
Malawi            NaN
Moldova          3.39
Niger             NaN
Rwanda            NaN
Senegal          0.05
Sierra Leone      NaN
Vietnam         17.15
Name: 2012, dtype: float64

### Comparing various sources

In [None]:
# Survey-based population relative to the World Bank population
tot_pop.div(wb_pop).dropna()

In [None]:
# Restrict the World Bank series to the surveyed countries.
# `.ix` has been removed from pandas; reindex() gives NaN for missing labels.
wb_emis = wb_emis.reindex(tot_emis.index)  # kilotons, for some reason

# NOTE(review): the original line read `wb_emis_pc = wb_emis_pc.ix[...]`, but
# wb_emis_pc is never defined in this notebook, so the cell could not run on a
# fresh kernel. It is derived here from the two World Bank series already
# loaded (so it is in the units of wb_emis per person, and assumes wb_pop is
# indexed by the same country names) -- confirm this matches the intent.
wb_emis_pc = (wb_emis / wb_pop).reindex(tot_emis.index)

In [None]:
# Survey-based total emissions relative to the World Bank figure
tot_emis.div(wb_emis)

In [None]:
# Survey-based per-capita emissions relative to the World Bank per-capita figure
tot_emis.div(tot_pop).div(wb_emis_pc)

In [None]:
# Expenditures per capita, indexed by (country, hid).
# The original passed inplace="True" (a string), which pandas rejects as a
# non-boolean; the chain below returns a new frame instead.
hh_exp = (
    pd.read_csv("csvdata/PerCapitaExpenditures.csv")
    .replace(iso_to_name)
    .rename(columns={"value": "hh_exp_pc"})
    .set_index(["country", "hid"])
)

In [None]:
# Builds a dataframe indexed by (country, hid) with all the household data.
df = hh_exp[["hh_exp_pc"]].copy()
# Use the (country, hid)-indexed series hh_emis: the flat table hh_emi has a
# default integer index and would not align with df.
df["hh_emi"] = hh_emis
df["hh_emi_pc"] = hh_emi_pc
# NOTE(review): the original referenced an undefined name `nb_people`.
# nb_people_tot (survey weight x household size) is used because the column is
# later summed as a population count and used as a weight -- confirm that
# nb_people_in_hh was not intended instead.
df["nb_people"] = nb_people_tot
# NOTE(review): the original referenced an undefined name `hop`; hhdata is the
# only table in this notebook from which a "weight" column is read.
df["w"] = hhdata["weight"]
df["hh_emis_perdol"] = df["hh_emi_pc"] / df["hh_exp_pc"]

# Vietnam. Country labels were mapped to World Bank names when loading, so the
# label is "Vietnam" (as in the tot_emis output), not the iso3 code "VNM".
# `.ix` has been removed from pandas; .loc is the label-based equivalent.
vn = df.loc["Vietnam"]

In [None]:
def wavg(val_col_name, wt_col_name):
    """Build an aggregator returning the mean of one column weighted by another.

    The returned function takes a group (anything indexable by the two column
    names, e.g. a DataFrame) and returns
    sum(values * weights) / sum(weights).
    """
    def inner(group):
        weights = group[wt_col_name]
        weighted_total = group[val_col_name].mul(weights).sum()
        return weighted_total / weights.sum()
    return inner

In [None]:
# Flat copy of the household table (index levels become columns) and the
# list of countries it contains.
flat_df = df.reset_index()
country_list = flat_df["country"].unique()

In [None]:
# Expenditure categories in USD per day: a few fine-grained edges up to 4,
# then edges doubling from 10 upwards (10, 20, 40, ...).
fine_edges = [0, 1, 1.25, 2, 2.5, 4]
doubling_edges = 10 * 2.0 ** np.arange(15)
income_bins = np.concatenate((fine_edges, doubling_edges))

# hh_exp_pc is divided by 365 before being binned
income_cut = pd.cut(flat_df["hh_exp_pc"] / 365, income_bins)

income_cut.unique()


In [None]:
# Preview only: the table has one row per household, so show the first rows
# instead of dumping the whole frame.
flat_df.head()

In [None]:
# Per (country, income bin): weighted average per-capita emissions and
# number of people.
grouped = flat_df.groupby(["country", income_cut])
weighted_mean_emis = wavg("hh_emi_pc", "w")

stats = pd.DataFrame()
stats["avg_emis_pc"] = grouped.apply(weighted_mean_emis)
stats["nb_people0"] = grouped["nb_people"].sum()
stats

In [None]:
#stats.ix[c]
#country_stats

In [None]:
#for c in country_list:
#def lift_folks(country_stats):
def lift_folks(country_stats, bins=None, stop=10):
    """Build population scenarios where everyone below a bin is moved up into it.

    For each bin position ``i`` from 1 up to (but excluding) ``stop``, a column
    named ``'min' + str(bins[i])`` is added in which rows before position ``i``
    hold 0 people and row ``i`` holds the sum of rows ``0..i`` of
    ``nb_people0``; later rows are unchanged.

    Rows are addressed by position, so this assumes ``country_stats`` has one
    row per income bin, in the same order as ``bins`` -- TODO confirm for
    countries where some bins are empty.

    Parameters
    ----------
    country_stats : DataFrame
        Must contain a ``nb_people0`` column. It is not modified.
    bins : sequence, optional
        Bin edges used to name the scenario columns. Defaults to the
        notebook-level ``income_bins``.
    stop : int, optional
        Exclusive upper bound on the bin positions considered (default 10, the
        value hard-coded in the original). It is capped by the number of rows
        and of bin edges so that no position falls out of range.

    Returns
    -------
    DataFrame
        ``nb_people0`` plus one column per scenario, indexed like the input.
    """
    if bins is None:
        bins = income_bins  # notebook-level bin edges

    people = country_stats["nb_people0"]
    # .copy() so that adding columns cannot write back into country_stats
    # (`.ix` has been removed from pandas; .loc/.iloc replace it here).
    out = country_stats.loc[:, ["nb_people0"]].copy()

    last = min(stop, len(people), len(bins))
    for i in range(1, last):
        lifted = people.copy()
        lifted.iloc[i] = people.iloc[:i + 1].sum()  # rows 0..i gathered in row i
        lifted.iloc[:i] = 0                         # nobody left below row i
        out['min' + str(bins[i])] = lifted
    return out
 #   stats.ix[c,[out.columns.values]]=out

In [None]:
# Average per-capita emissions of each country under every scenario:
# bin-level average emissions dotted with the scenario populations, divided by
# the country's total population.
# `.ix` has been removed from pandas; .loc is the label-based equivalent.
emis_sce_country = pd.DataFrame()
for c in country_list:
    country_stats = stats.loc[c]
    scenario_people = lift_folks(country_stats)
    total_people = country_stats["nb_people0"].sum()
    emis_sce_country[c] = country_stats["avg_emis_pc"].dot(scenario_people) / total_people

emis_sce_country

In [None]:
#emis_sce_country['tot']=emis_sce_country.sum(axis=1)  

In [None]:
# Percentage change of each scenario relative to the baseline row "nb_people0".
# `.ix` has been removed from pandas; .loc is the label-based equivalent.
(emis_sce_country / emis_sce_country.loc["nb_people0"] - 1) * 100

In [None]:
#paf = stats.reset_index().groupby("country").apply(lift_folks)
#paf.reset_index().groupby("country").apply(wavg("avg_emis_pc","nb_people1"))
#paf

In [None]:
# One indicator column per income bin, appended to the household-level frame.
# NOTE(review): this rebinds income_cut (now indexed like df rather than
# flat_df) and extends df in place of the original, so re-running the cell
# appends the indicator columns again.
income_cut = pd.cut(df["hh_exp_pc"] / 365, income_bins)

income_dummies = pd.get_dummies(income_cut)
df = pd.concat([df, income_dummies], axis=1)
# Preview only: the frame has one row per household.
df.head()

In [None]:
# Copy the baseline population into a working column.
# The original read a column "nb_people", but stats only holds "nb_people0";
# `.ix` has also been removed from pandas.
stats["nb_people1"] = stats["nb_people0"]
stats["nb_people1"]

In [None]:
# Rows of one country. The original line was a syntax error (unterminated
# string inside pd.MultiIndex(...)). Country labels are World Bank names here,
# hence "Albania" rather than the iso3 code "ALB".
stats.loc["Albania"]

In [None]:
# wp is not defined in this notebook (presumably star-imported from perc, and
# presumably a weighted-percentile helper -- TODO confirm). It is called with
# Vietnam's per-capita expenditures, people counts, and the grid 0, 0.1, ..., 0.9.
wp(vn["hh_exp_pc"],vn["nb_people"],np.arange(0,1,0.1))

In [None]:
# Scatter of emissions per dollar spent against per-capita expenditure
# (log-scaled x axis), one point per household.
font = {'family' : 'serif',
        'weight' : 'normal',
        'size'   : 15}
plt.rc('font', **font)
plt.figure(figsize=(9,7))
plt.plot(df["hh_exp_pc"], df["hh_emi_pc"]/df["hh_exp_pc"],
         color="blue", marker='.', markeredgewidth=1, linestyle='None',
         markersize=12, clip_on=False)
# Raw strings: "\$" is an invalid escape sequence in a normal string literal.
# The label text itself is unchanged.
plt.xlabel(r"$\$/yr$")
plt.ylabel(r"$gCO_2/\$$")
plt.xscale("log");

In [None]:
# NOTE(review): the original line was the incomplete expression `stats.`
# (a syntax error); a preview of the table is shown instead -- replace with
# the intended attribute if it was something else.
stats.head()

In [None]:
def weighted_rolling_mean(x,y,w,kernel):
    """Placeholder for a weighted rolling mean of y over x (not implemented).

    The original definition had no body, which is a syntax error. The
    signature is kept and the function raises until it is written.

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError("weighted_rolling_mean is not implemented yet")

In [None]:
# Toy example: non-parametric kernel regression on a noisy sine wave.
# numpy (np) and pyplot (plt) are already imported at the top of the notebook.
from statsmodels.nonparametric.kernel_regression import KernelReg

rng = np.random.RandomState(0)  # fixed seed so the figure is reproducible
x = np.linspace(0, 2*np.pi, 100)
y = np.sin(x) + rng.random_sample(100) * 0.2

# x is continuous, so the regressor type is 'c'; the original passed 'o'
# (ordered discrete).
kr = KernelReg(y, x, 'c')
# fit() returns the conditional mean and the marginal effects (not a standard
# deviation, as the original variable name y_std suggested).
y_pred, y_mfx = kr.fit(x)

plt.plot(x, y, '+')
plt.plot(x, y_pred);

In [None]:
def plot_distribution(y,w,thecolor="red"):
    """Plot a weighted kernel density estimate of y on the current figure.

    A weighted, normalized 100-bin histogram is also drawn, but with
    alpha=0.0, i.e. fully transparent bars. The density curve is evaluated on
    300 points spanning the histogram bin centers.

    y : values to describe; w : their weights; thecolor : matplotlib color.
    """
    # weighted, normalized histogram
    heights, edges = np.histogram(y, bins=100, weights=w, density=True)
    bar_width = (edges[1] - edges[0]) * 0.5
    midpoints = (edges[1:] + edges[:-1]) / 2
    plt.bar(midpoints, heights, align='center', width=bar_width, alpha=0.0, color=thecolor)

    # weighted kernel density estimate, evaluated between the outer bin centers
    kde_estimate = gaussian_kde(y, weights=w)
    kde_estimate._compute_covariance()
    grid = np.linspace(midpoints.min(), midpoints.max(), 300)
    plt.plot(grid, kde_estimate(grid), linewidth=2, color=thecolor)


In [None]:
# Distribution of log per-capita emissions in Vietnam, weighted by the
# household weight column
plot_distribution(np.log(vn["hh_emi_pc"]), vn["w"])

