# ETL of Emission and Population Data

In [2]:
#Boto3 is the AWS Software Development Kit for Python
%pip install boto3

Note: you may need to restart the kernel to use updated packages.


In [3]:
#Import dependencies
import pandas as pd 
import os 
import csv 
import numpy as np 
from sqlalchemy import create_engine, inspect
import boto3
from io import StringIO

# Emission Data Cleaning

In [7]:
# 1. Create a DataFrame for the Emission data
emissions_data_df=pd.read_csv('Emissions_Data.csv',low_memory=False)
emissions_data_df.head()

Unnamed: 0,Area Code,Area,Item Code,Item,Element Code,Element,Source Code,Source,Unit,Y1961,...,Y2016N,Y2017,Y2017F,Y2017N,Y2018,Y2018F,Y2018N,Y2019,Y2019F,Y2019N
0,2,Afghanistan,5058,Enteric Fermentation,7225,Emissions (CH4),3050,FAO TIER 1,kilotonnes,240.6831,...,,371.2863,Fc,,378.887,Fc,,389.6563,Fc,
1,2,Afghanistan,5058,Enteric Fermentation,7225,Emissions (CH4),3051,UNFCCC,kilotonnes,,...,,,,,,,,,,
2,2,Afghanistan,5058,Enteric Fermentation,724413,Emissions (CO2eq) from CH4 (AR5),3050,FAO TIER 1,kilotonnes,6739.1279,...,,10396.0163,Fc,,10608.8357,Fc,,10910.3754,Fc,
3,2,Afghanistan,5058,Enteric Fermentation,724413,Emissions (CO2eq) from CH4 (AR5),3051,UNFCCC,kilotonnes,,...,,,,,,,,,,
4,2,Afghanistan,5058,Enteric Fermentation,723113,Emissions (CO2eq) (AR5),3050,FAO TIER 1,kilotonnes,6739.1279,...,,10396.0163,Fc,,10608.8357,Fc,,10910.3754,Fc,


In [5]:
# 2. Check the datatypes of your columns. 
emissions_data_df.dtypes

Area Code         int64
Area             object
Item Code         int64
Item             object
Element Code      int64
                 ...   
Y2018F           object
Y2018N           object
Y2019           float64
Y2019F           object
Y2019N           object
Length: 186, dtype: object

In [8]:
#display the columns
emissions_data_df.columns

Index(['Area Code', 'Area', 'Item Code', 'Item', 'Element Code', 'Element',
       'Source Code', 'Source', 'Unit', 'Y1961',
       ...
       'Y2016N', 'Y2017', 'Y2017F', 'Y2017N', 'Y2018', 'Y2018F', 'Y2018N',
       'Y2019', 'Y2019F', 'Y2019N'],
      dtype='object', length=186)

In [9]:
# Row count
emissions_data_df.count()

Area Code       35214
Area            35214
Item Code       35214
Item            35214
Element Code    35214
                ...  
Y2018F          30501
Y2018N            214
Y2019           31349
Y2019F          30475
Y2019N            210
Length: 186, dtype: int64

In [10]:
# Finding Mean,Std,Min,Max etc.
emissions_data_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Area Code,35214.0,836.225450,1802.568667,1.000,75.000000,150.00000,223.000000,5.873000e+03
Item Code,35214.0,9323.758193,15686.892075,1707.000,5059.000000,5066.00000,6993.000000,6.992100e+04
Element Code,35214.0,365351.316976,358303.088068,7225.000,7230.000000,7273.00000,724313.000000,7.244130e+05
Source Code,35214.0,3050.142727,0.349800,3050.000,3050.000000,3050.00000,3050.000000,3.051000e+03
Y1961,9695.0,9248.883169,76293.922762,0.000,0.405600,11.29130,345.274500,3.261482e+06
...,...,...,...,...,...,...,...,...
Y2015,31649.0,18766.775147,205781.535921,-3966552.558,0.038000,10.21410,667.864100,1.111916e+07
Y2016,31578.0,18789.611363,191236.770842,-2720180.973,0.035700,10.68550,669.740275,1.029408e+07
Y2017,31414.0,18796.475622,191542.218907,-2636985.222,0.037925,10.31540,658.084100,1.034031e+07
Y2018,31390.0,19024.886269,193904.479619,-2636985.222,0.033550,9.92465,653.896400,1.048629e+07


In [11]:
emissions_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35214 entries, 0 to 35213
Columns: 186 entries, Area Code to Y2019N
dtypes: float64(88), int64(4), object(94)
memory usage: 50.0+ MB


In [12]:
emissions_data_df.columns

Index(['Area Code', 'Area', 'Item Code', 'Item', 'Element Code', 'Element',
       'Source Code', 'Source', 'Unit', 'Y1961',
       ...
       'Y2016N', 'Y2017', 'Y2017F', 'Y2017N', 'Y2018', 'Y2018F', 'Y2018N',
       'Y2019', 'Y2019F', 'Y2019N'],
      dtype='object', length=186)

In [13]:
#checking for null values
emissions_data_df.isnull()

Unnamed: 0,Area Code,Area,Item Code,Item,Element Code,Element,Source Code,Source,Unit,Y1961,...,Y2016N,Y2017,Y2017F,Y2017N,Y2018,Y2018F,Y2018N,Y2019,Y2019F,Y2019N
0,False,False,False,False,False,False,False,False,False,False,...,True,False,False,True,False,False,True,False,False,True
1,False,False,False,False,False,False,False,False,False,True,...,True,True,True,True,True,True,True,True,True,True
2,False,False,False,False,False,False,False,False,False,False,...,True,False,False,True,False,False,True,False,False,True
3,False,False,False,False,False,False,False,False,False,True,...,True,True,True,True,True,True,True,True,True,True
4,False,False,False,False,False,False,False,False,False,False,...,True,False,False,True,False,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35209,False,False,False,False,False,False,False,False,False,True,...,True,False,False,True,False,False,True,False,False,True
35210,False,False,False,False,False,False,False,False,False,True,...,True,False,False,True,False,False,True,False,False,True
35211,False,False,False,False,False,False,False,False,False,True,...,True,False,False,True,False,False,True,False,False,True
35212,False,False,False,False,False,False,False,False,False,True,...,True,False,False,True,False,False,True,False,False,True


In [14]:
#Sum of null values
emissions_data_df.isnull().sum()

Area Code           0
Area                0
Item Code           0
Item                0
Element Code        0
                ...  
Y2018F           4713
Y2018N          35000
Y2019            3865
Y2019F           4739
Y2019N          35004
Length: 186, dtype: int64

In [15]:
#Check for unique values
emissions_data_df.nunique().tolist()

[281,
 281,
 25,
 25,
 8,
 8,
 2,
 2,
 1,
 6777,
 1,
 0,
 6809,
 1,
 0,
 6811,
 1,
 0,
 6851,
 1,
 0,
 6825,
 1,
 0,
 6855,
 1,
 0,
 6875,
 1,
 0,
 6901,
 1,
 0,
 6887,
 1,
 0,
 6931,
 1,
 0,
 6949,
 1,
 0,
 6963,
 1,
 0,
 6971,
 1,
 0,
 6987,
 1,
 0,
 6959,
 1,
 0,
 6983,
 1,
 0,
 6987,
 1,
 0,
 6994,
 1,
 0,
 7032,
 1,
 0,
 7031,
 1,
 0,
 7053,
 1,
 0,
 7052,
 1,
 0,
 7049,
 1,
 0,
 7053,
 1,
 0,
 7080,
 1,
 0,
 7078,
 1,
 0,
 7072,
 1,
 0,
 7069,
 1,
 0,
 7095,
 1,
 0,
 15728,
 2,
 2,
 15524,
 2,
 2,
 16989,
 2,
 2,
 17212,
 2,
 2,
 18013,
 2,
 2,
 17292,
 2,
 2,
 17053,
 2,
 2,
 17148,
 2,
 2,
 17180,
 2,
 2,
 17147,
 2,
 2,
 18053,
 2,
 2,
 16588,
 2,
 2,
 16739,
 2,
 2,
 16907,
 2,
 2,
 16824,
 2,
 2,
 17061,
 2,
 2,
 16949,
 2,
 2,
 16994,
 2,
 2,
 16796,
 2,
 2,
 16867,
 2,
 2,
 17223,
 2,
 2,
 16796,
 2,
 2,
 17059,
 2,
 2,
 16834,
 2,
 2,
 16850,
 2,
 2,
 16859,
 2,
 2,
 16655,
 2,
 2,
 16623,
 2,
 2,
 16491,
 1,
 1,
 16603,
 1,
 1]

In [16]:
#Listing unique countries
countries=emissions_data_df['Area'].unique()
countries

array(['Afghanistan', 'Albania', 'Algeria', 'American Samoa', 'Andorra',
       'Angola', 'Anguilla', 'Antigua and Barbuda', 'Argentina',
       'Armenia', 'Aruba', 'Australia', 'Austria', 'Azerbaijan',
       'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus',
       'Belgium', 'Belgium-Luxembourg', 'Belize', 'Benin', 'Bermuda',
       'Bhutan', 'Bolivia (Plurinational State of)',
       'Bosnia and Herzegovina', 'Botswana', 'Brazil',
       'British Virgin Islands', 'Brunei Darussalam', 'Bulgaria',
       'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cambodia', 'Cameroon',
       'Canada', 'Cayman Islands', 'Central African Republic', 'Chad',
       'Channel Islands', 'Chile', 'China', 'China, Hong Kong SAR',
       'China, Macao SAR', 'China, mainland', 'China, Taiwan Province of',
       'Colombia', 'Comoros', 'Congo', 'Cook Islands', 'Costa Rica',
       "C�te d'Ivoire", 'Croatia', 'Cuba', 'Cyprus', 'Czechia',
       'Czechoslovakia', "Democratic People's Republic of Korea",
  

In [17]:
#listing unique items
items=emissions_data_df['Item'].unique()
items

array(['Enteric Fermentation', 'Manure Management', 'Rice Cultivation',
       'Synthetic Fertilizers', 'Manure applied to Soils',
       'Manure left on Pasture', 'Crop Residues',
       'Burning - Crop residues', 'Net Forest conversion', 'Forestland',
       'Savanna fires', 'Fires in humid tropical forests', 'Forest fires',
       'Fires in organic soils', 'On-farm energy use', 'IPCC Agriculture',
       'Agricultural Soils', 'LULUCF', 'AFOLU',
       'Emissions on agricultural land', 'Farm-gate emissions',
       'Land Use change', 'Drained organic soils',
       'Drained organic soils (CO2)', 'Drained organic soils (N2O)'],
      dtype=object)

In [18]:
#listing unique elements
elements=emissions_data_df['Element'].unique()
items

array(['Enteric Fermentation', 'Manure Management', 'Rice Cultivation',
       'Synthetic Fertilizers', 'Manure applied to Soils',
       'Manure left on Pasture', 'Crop Residues',
       'Burning - Crop residues', 'Net Forest conversion', 'Forestland',
       'Savanna fires', 'Fires in humid tropical forests', 'Forest fires',
       'Fires in organic soils', 'On-farm energy use', 'IPCC Agriculture',
       'Agricultural Soils', 'LULUCF', 'AFOLU',
       'Emissions on agricultural land', 'Farm-gate emissions',
       'Land Use change', 'Drained organic soils',
       'Drained organic soils (CO2)', 'Drained organic soils (N2O)'],
      dtype=object)

In [19]:
#Store and Display unique source and source code
source_code=emissions_data_df['Source Code'].unique()
source=emissions_data_df['Source'].unique()
print(source_code)
print(source)

[3050 3051]
['FAO TIER 1' 'UNFCCC']


In [20]:
#Checking for the row values for Source Code = 3051
emissions_data_df.loc[(emissions_data_df['Source Code'])==3051]

Unnamed: 0,Area Code,Area,Item Code,Item,Element Code,Element,Source Code,Source,Unit,Y1961,...,Y2016N,Y2017,Y2017F,Y2017N,Y2018,Y2018F,Y2018N,Y2019,Y2019F,Y2019N
1,2,Afghanistan,5058,Enteric Fermentation,7225,Emissions (CH4),3051,UNFCCC,kilotonnes,,...,,,,,,,,,,
3,2,Afghanistan,5058,Enteric Fermentation,724413,Emissions (CO2eq) from CH4 (AR5),3051,UNFCCC,kilotonnes,,...,,,,,,,,,,
5,2,Afghanistan,5058,Enteric Fermentation,723113,Emissions (CO2eq) (AR5),3051,UNFCCC,kilotonnes,,...,,,,,,,,,,
7,2,Afghanistan,5059,Manure Management,7225,Emissions (CH4),3051,UNFCCC,kilotonnes,,...,,,,,,,,,,
10,2,Afghanistan,5059,Manure Management,724413,Emissions (CO2eq) from CH4 (AR5),3051,UNFCCC,kilotonnes,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35051,5849,Non-Annex I countries,1709,Agricultural Soils,7236,Indirect emissions (N2O),3051,UNFCCC,kilotonnes,,...,,4.79,Fc,,4.89,Fc,,4.89,Fc,
35053,5849,Non-Annex I countries,1709,Agricultural Soils,7230,Emissions (N2O),3051,UNFCCC,kilotonnes,,...,,91.28,Fc,,58.44,Fc,,42.20,Fc,
35054,5849,Non-Annex I countries,1709,Agricultural Soils,724413,Emissions (CO2eq) from CH4 (AR5),3051,UNFCCC,kilotonnes,,...,,,,,,,,,,
35056,5849,Non-Annex I countries,1709,Agricultural Soils,724313,Emissions (CO2eq) from N2O (AR5),3051,UNFCCC,kilotonnes,,...,,24189.20,Fc,,15486.60,Fc,,11183.00,Fc,


In [21]:
# Deleting the rows with Source Code=3051 as it has more null values
emissions_data_df.drop(emissions_data_df.index[emissions_data_df['Source Code'] == 3051], inplace=True)
emissions_data_df.head(50)

Unnamed: 0,Area Code,Area,Item Code,Item,Element Code,Element,Source Code,Source,Unit,Y1961,...,Y2016N,Y2017,Y2017F,Y2017N,Y2018,Y2018F,Y2018N,Y2019,Y2019F,Y2019N
0,2,Afghanistan,5058,Enteric Fermentation,7225,Emissions (CH4),3050,FAO TIER 1,kilotonnes,240.6831,...,,371.2863,Fc,,378.887,Fc,,389.6563,Fc,
2,2,Afghanistan,5058,Enteric Fermentation,724413,Emissions (CO2eq) from CH4 (AR5),3050,FAO TIER 1,kilotonnes,6739.1279,...,,10396.0163,Fc,,10608.8357,Fc,,10910.3754,Fc,
4,2,Afghanistan,5058,Enteric Fermentation,723113,Emissions (CO2eq) (AR5),3050,FAO TIER 1,kilotonnes,6739.1279,...,,10396.0163,Fc,,10608.8357,Fc,,10910.3754,Fc,
6,2,Afghanistan,5059,Manure Management,7225,Emissions (CH4),3050,FAO TIER 1,kilotonnes,11.6228,...,,23.7001,Fc,,24.2499,Fc,,26.1252,Fc,
8,2,Afghanistan,5059,Manure Management,7230,Emissions (N2O),3050,FAO TIER 1,kilotonnes,0.3992,...,,0.4013,Fc,,0.4074,Fc,,0.3654,Fc,
9,2,Afghanistan,5059,Manure Management,724413,Emissions (CO2eq) from CH4 (AR5),3050,FAO TIER 1,kilotonnes,325.4372,...,,663.6019,Fc,,678.9958,Fc,,731.5053,Fc,
11,2,Afghanistan,5059,Manure Management,724313,Emissions (CO2eq) from N2O (AR5),3050,FAO TIER 1,kilotonnes,105.7889,...,,106.3426,Fc,,107.969,Fc,,96.822,Fc,
12,2,Afghanistan,5059,Manure Management,723113,Emissions (CO2eq) (AR5),3050,FAO TIER 1,kilotonnes,431.2261,...,,769.9445,Fc,,786.9648,Fc,,828.3273,Fc,
14,2,Afghanistan,5060,Rice Cultivation,7225,Emissions (CH4),3050,FAO TIER 1,kilotonnes,29.4,...,,15.3233,Fc,,16.4555,Fc,,17.8542,Fc,
15,2,Afghanistan,5060,Rice Cultivation,724413,Emissions (CO2eq) from CH4 (AR5),3050,FAO TIER 1,kilotonnes,823.2,...,,429.0518,Fc,,460.7529,Fc,,499.9176,Fc,


In [22]:
x=emissions_data_df.groupby(['Source Code']).count()['Y1979']
x

Source Code
3050    9852
Name: Y1979, dtype: int64

In [23]:
#Checking the data for Source Code=3050
emissions_data_df.loc[emissions_data_df['Source Code'] == 3050]

Unnamed: 0,Area Code,Area,Item Code,Item,Element Code,Element,Source Code,Source,Unit,Y1961,...,Y2016N,Y2017,Y2017F,Y2017N,Y2018,Y2018F,Y2018N,Y2019,Y2019F,Y2019N
0,2,Afghanistan,5058,Enteric Fermentation,7225,Emissions (CH4),3050,FAO TIER 1,kilotonnes,240.6831,...,,371.2863,Fc,,378.8870,Fc,,389.6563,Fc,
2,2,Afghanistan,5058,Enteric Fermentation,724413,Emissions (CO2eq) from CH4 (AR5),3050,FAO TIER 1,kilotonnes,6739.1279,...,,10396.0163,Fc,,10608.8357,Fc,,10910.3754,Fc,
4,2,Afghanistan,5058,Enteric Fermentation,723113,Emissions (CO2eq) (AR5),3050,FAO TIER 1,kilotonnes,6739.1279,...,,10396.0163,Fc,,10608.8357,Fc,,10910.3754,Fc,
6,2,Afghanistan,5059,Manure Management,7225,Emissions (CH4),3050,FAO TIER 1,kilotonnes,11.6228,...,,23.7001,Fc,,24.2499,Fc,,26.1252,Fc,
8,2,Afghanistan,5059,Manure Management,7230,Emissions (N2O),3050,FAO TIER 1,kilotonnes,0.3992,...,,0.4013,Fc,,0.4074,Fc,,0.3654,Fc,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35209,5873,OECD,6516,Land Use change,7230,Emissions (N2O),3050,FAO TIER 1,kilotonnes,,...,,0.7522,Fc,,0.9253,Fc,,1.0574,Fc,
35210,5873,OECD,6516,Land Use change,7273,Emissions (CO2),3050,FAO TIER 1,kilotonnes,,...,,209078.1221,Fc,,209078.1221,Fc,,209078.1221,Fc,
35211,5873,OECD,6516,Land Use change,724413,Emissions (CO2eq) from CH4 (AR5),3050,FAO TIER 1,kilotonnes,,...,,230.6636,Fc,,283.7454,Fc,,324.2807,Fc,
35212,5873,OECD,6516,Land Use change,724313,Emissions (CO2eq) from N2O (AR5),3050,FAO TIER 1,kilotonnes,,...,,199.3235,Fc,,245.1931,Fc,,280.2209,Fc,


In [24]:
#drop Source and Source Code as there is only two types & its not relevant for our analysis
emissions_data_df=emissions_data_df.drop(['Source'],axis=1)
df1=emissions_data_df.drop(['Source Code'],axis=1)


In [25]:
#Check the data frame
df1.head()

Unnamed: 0,Area Code,Area,Item Code,Item,Element Code,Element,Unit,Y1961,Y1961F,Y1961N,...,Y2016N,Y2017,Y2017F,Y2017N,Y2018,Y2018F,Y2018N,Y2019,Y2019F,Y2019N
0,2,Afghanistan,5058,Enteric Fermentation,7225,Emissions (CH4),kilotonnes,240.6831,Fc,,...,,371.2863,Fc,,378.887,Fc,,389.6563,Fc,
2,2,Afghanistan,5058,Enteric Fermentation,724413,Emissions (CO2eq) from CH4 (AR5),kilotonnes,6739.1279,Fc,,...,,10396.0163,Fc,,10608.8357,Fc,,10910.3754,Fc,
4,2,Afghanistan,5058,Enteric Fermentation,723113,Emissions (CO2eq) (AR5),kilotonnes,6739.1279,Fc,,...,,10396.0163,Fc,,10608.8357,Fc,,10910.3754,Fc,
6,2,Afghanistan,5059,Manure Management,7225,Emissions (CH4),kilotonnes,11.6228,Fc,,...,,23.7001,Fc,,24.2499,Fc,,26.1252,Fc,
8,2,Afghanistan,5059,Manure Management,7230,Emissions (N2O),kilotonnes,0.3992,Fc,,...,,0.4013,Fc,,0.4074,Fc,,0.3654,Fc,


In [26]:
#Display the unit value
unit=df1['Unit'].unique()
unit

array(['kilotonnes'], dtype=object)

In [27]:
#Drop the column 'Unit'
df2=df1.drop(['Unit'],axis=1)


In [28]:
df2.head()

Unnamed: 0,Area Code,Area,Item Code,Item,Element Code,Element,Y1961,Y1961F,Y1961N,Y1962,...,Y2016N,Y2017,Y2017F,Y2017N,Y2018,Y2018F,Y2018N,Y2019,Y2019F,Y2019N
0,2,Afghanistan,5058,Enteric Fermentation,7225,Emissions (CH4),240.6831,Fc,,245.3106,...,,371.2863,Fc,,378.887,Fc,,389.6563,Fc,
2,2,Afghanistan,5058,Enteric Fermentation,724413,Emissions (CO2eq) from CH4 (AR5),6739.1279,Fc,,6868.6971,...,,10396.0163,Fc,,10608.8357,Fc,,10910.3754,Fc,
4,2,Afghanistan,5058,Enteric Fermentation,723113,Emissions (CO2eq) (AR5),6739.1279,Fc,,6868.6971,...,,10396.0163,Fc,,10608.8357,Fc,,10910.3754,Fc,
6,2,Afghanistan,5059,Manure Management,7225,Emissions (CH4),11.6228,Fc,,11.9632,...,,23.7001,Fc,,24.2499,Fc,,26.1252,Fc,
8,2,Afghanistan,5059,Manure Management,7230,Emissions (N2O),0.3992,Fc,,0.4039,...,,0.4013,Fc,,0.4074,Fc,,0.3654,Fc,


In [29]:
#Checking the data for the column Y1961F
ar=df2['Y1961F'].unique()
ar

array(['Fc', nan], dtype=object)

In [30]:
#checking for values in the column 'Y1961N'
df2['Y1961N']

0       NaN
2       NaN
4       NaN
6       NaN
8       NaN
         ..
35209   NaN
35210   NaN
35211   NaN
35212   NaN
35213   NaN
Name: Y1961N, Length: 30188, dtype: float64

In [31]:
#checking for values in the column 'Y1989'
df2['Y1989']

0         179.5620
2        5027.7360
4        5027.7360
6           8.6167
8           0.3045
           ...    
35209          NaN
35210          NaN
35211          NaN
35212          NaN
35213          NaN
Name: Y1989, Length: 30188, dtype: float64

In [33]:
#checking for values in the column 'Y1990'
df2['Y1990']

0           178.4682
2          4997.1108
4          4997.1108
6             8.5165
8             0.3046
            ...     
35209         1.4119
35210    306473.8942
35211       768.5391
35212       374.1567
35213    307616.5900
Name: Y1990, Length: 30188, dtype: float64

In [34]:
#Assigning df2 to a new data frame
df3=df2
df3.head()

Unnamed: 0,Area Code,Area,Item Code,Item,Element Code,Element,Y1961,Y1961F,Y1961N,Y1962,...,Y2016N,Y2017,Y2017F,Y2017N,Y2018,Y2018F,Y2018N,Y2019,Y2019F,Y2019N
0,2,Afghanistan,5058,Enteric Fermentation,7225,Emissions (CH4),240.6831,Fc,,245.3106,...,,371.2863,Fc,,378.887,Fc,,389.6563,Fc,
2,2,Afghanistan,5058,Enteric Fermentation,724413,Emissions (CO2eq) from CH4 (AR5),6739.1279,Fc,,6868.6971,...,,10396.0163,Fc,,10608.8357,Fc,,10910.3754,Fc,
4,2,Afghanistan,5058,Enteric Fermentation,723113,Emissions (CO2eq) (AR5),6739.1279,Fc,,6868.6971,...,,10396.0163,Fc,,10608.8357,Fc,,10910.3754,Fc,
6,2,Afghanistan,5059,Manure Management,7225,Emissions (CH4),11.6228,Fc,,11.9632,...,,23.7001,Fc,,24.2499,Fc,,26.1252,Fc,
8,2,Afghanistan,5059,Manure Management,7230,Emissions (N2O),0.3992,Fc,,0.4039,...,,0.4013,Fc,,0.4074,Fc,,0.3654,Fc,


In [35]:
#Since there is not much data from 1961 to 1989(Outliers),We can drop those columns
for i in range (1961,2020):
     
    year_flag1 = "Y" + str(i) + "F"
    year_flag2 = "Y" + str(i) + "N"
    df3=df3.drop([year_flag1],axis=1)
    df3=df3.drop([year_flag2],axis=1)
for i in range (1961,1990):
    year_flag3 = "Y" +str(i) 
    df3=df3.drop([year_flag3],axis=1)
     
    

In [36]:
#display the df
df3.head()

Unnamed: 0,Area Code,Area,Item Code,Item,Element Code,Element,Y1990,Y1991,Y1992,Y1993,...,Y2010,Y2011,Y2012,Y2013,Y2014,Y2015,Y2016,Y2017,Y2018,Y2019
0,2,Afghanistan,5058,Enteric Fermentation,7225,Emissions (CH4),178.4682,187.55,189.76,190.83,...,401.068,402.513,396.921,393.093,398.287,383.3023,380.8767,371.2863,378.887,389.6563
2,2,Afghanistan,5058,Enteric Fermentation,724413,Emissions (CO2eq) from CH4 (AR5),4997.1108,5251.4,5313.28,5343.24,...,11229.904,11270.364,11113.788,11006.604,11152.036,10732.4631,10664.5483,10396.0163,10608.8357,10910.3754
4,2,Afghanistan,5058,Enteric Fermentation,723113,Emissions (CO2eq) (AR5),4997.1108,5251.4,5313.28,5343.24,...,11229.904,11270.364,11113.788,11006.604,11152.036,10732.4631,10664.5483,10396.0163,10608.8357,10910.3754
6,2,Afghanistan,5059,Manure Management,7225,Emissions (CH4),8.5165,9.3445,9.6669,9.8348,...,26.5669,26.1599,26.0913,26.0859,26.3682,24.8209,24.6903,23.7001,24.2499,26.1252
8,2,Afghanistan,5059,Manure Management,7230,Emissions (N2O),0.3046,0.3044,0.296,0.2903,...,0.398,0.4177,0.4013,0.3868,0.3932,0.3945,0.3931,0.4013,0.4074,0.3654


In [37]:
#Finding the index for the particular column
index1=df3.columns.get_loc("Y1990")
print(index1)

index2=df3.columns.get_loc("Y2019")
print(index2)


6
35


In [38]:
#Printing the sum of null values
df3.iloc[:,6:36]. isnull().sum()


Y1990    4028
Y1991    3854
Y1992    1674
Y1993    1466
Y1994    1418
Y1995    1389
Y1996    1382
Y1997    1390
Y1998    1385
Y1999    1385
Y2000    1273
Y2001    1271
Y2002    1228
Y2003    1213
Y2004    1208
Y2005    1213
Y2006    1074
Y2007    1079
Y2008    1079
Y2009    1069
Y2010    1074
Y2011    1076
Y2012     962
Y2013     957
Y2014     957
Y2015     962
Y2016     962
Y2017     957
Y2018     957
Y2019     957
dtype: int64

In [39]:
#Trying to find null values for a particular country
df3.loc[(df3["Area Code"] == 2)]
df3

Unnamed: 0,Area Code,Area,Item Code,Item,Element Code,Element,Y1990,Y1991,Y1992,Y1993,...,Y2010,Y2011,Y2012,Y2013,Y2014,Y2015,Y2016,Y2017,Y2018,Y2019
0,2,Afghanistan,5058,Enteric Fermentation,7225,Emissions (CH4),178.4682,187.5500,189.7600,190.8300,...,401.0680,402.5130,396.9210,393.0930,398.2870,383.3023,380.8767,371.2863,378.8870,389.6563
2,2,Afghanistan,5058,Enteric Fermentation,724413,Emissions (CO2eq) from CH4 (AR5),4997.1108,5251.4000,5313.2800,5343.2400,...,11229.9040,11270.3640,11113.7880,11006.6040,11152.0360,10732.4631,10664.5483,10396.0163,10608.8357,10910.3754
4,2,Afghanistan,5058,Enteric Fermentation,723113,Emissions (CO2eq) (AR5),4997.1108,5251.4000,5313.2800,5343.2400,...,11229.9040,11270.3640,11113.7880,11006.6040,11152.0360,10732.4631,10664.5483,10396.0163,10608.8357,10910.3754
6,2,Afghanistan,5059,Manure Management,7225,Emissions (CH4),8.5165,9.3445,9.6669,9.8348,...,26.5669,26.1599,26.0913,26.0859,26.3682,24.8209,24.6903,23.7001,24.2499,26.1252
8,2,Afghanistan,5059,Manure Management,7230,Emissions (N2O),0.3046,0.3044,0.2960,0.2903,...,0.3980,0.4177,0.4013,0.3868,0.3932,0.3945,0.3931,0.4013,0.4074,0.3654
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35209,5873,OECD,6516,Land Use change,7230,Emissions (N2O),1.4119,1.4119,1.4119,1.4119,...,0.3298,1.8699,0.5001,1.4242,0.4180,1.2957,0.8834,0.7522,0.9253,1.0574
35210,5873,OECD,6516,Land Use change,7273,Emissions (CO2),306473.8942,306473.8942,306473.8942,307152.8342,...,260435.8387,159202.3883,159202.3883,159202.3883,159202.3883,159202.3883,209078.1221,209078.1221,209078.1221,209078.1221
35211,5873,OECD,6516,Land Use change,724413,Emissions (CO2eq) from CH4 (AR5),768.5391,768.5391,768.5391,768.5391,...,101.1322,573.4284,153.3618,436.7699,128.1795,397.3500,270.9106,230.6636,283.7454,324.2807
35212,5873,OECD,6516,Land Use change,724313,Emissions (CO2eq) from N2O (AR5),374.1567,374.1567,374.1567,374.1567,...,87.3914,495.5169,132.5246,377.4261,110.7638,343.3623,234.1021,199.3235,245.1931,280.2209


In [40]:
# Replacing null values with zeros
df3=df3.fillna(0)
df3

Unnamed: 0,Area Code,Area,Item Code,Item,Element Code,Element,Y1990,Y1991,Y1992,Y1993,...,Y2010,Y2011,Y2012,Y2013,Y2014,Y2015,Y2016,Y2017,Y2018,Y2019
0,2,Afghanistan,5058,Enteric Fermentation,7225,Emissions (CH4),178.4682,187.5500,189.7600,190.8300,...,401.0680,402.5130,396.9210,393.0930,398.2870,383.3023,380.8767,371.2863,378.8870,389.6563
2,2,Afghanistan,5058,Enteric Fermentation,724413,Emissions (CO2eq) from CH4 (AR5),4997.1108,5251.4000,5313.2800,5343.2400,...,11229.9040,11270.3640,11113.7880,11006.6040,11152.0360,10732.4631,10664.5483,10396.0163,10608.8357,10910.3754
4,2,Afghanistan,5058,Enteric Fermentation,723113,Emissions (CO2eq) (AR5),4997.1108,5251.4000,5313.2800,5343.2400,...,11229.9040,11270.3640,11113.7880,11006.6040,11152.0360,10732.4631,10664.5483,10396.0163,10608.8357,10910.3754
6,2,Afghanistan,5059,Manure Management,7225,Emissions (CH4),8.5165,9.3445,9.6669,9.8348,...,26.5669,26.1599,26.0913,26.0859,26.3682,24.8209,24.6903,23.7001,24.2499,26.1252
8,2,Afghanistan,5059,Manure Management,7230,Emissions (N2O),0.3046,0.3044,0.2960,0.2903,...,0.3980,0.4177,0.4013,0.3868,0.3932,0.3945,0.3931,0.4013,0.4074,0.3654
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35209,5873,OECD,6516,Land Use change,7230,Emissions (N2O),1.4119,1.4119,1.4119,1.4119,...,0.3298,1.8699,0.5001,1.4242,0.4180,1.2957,0.8834,0.7522,0.9253,1.0574
35210,5873,OECD,6516,Land Use change,7273,Emissions (CO2),306473.8942,306473.8942,306473.8942,307152.8342,...,260435.8387,159202.3883,159202.3883,159202.3883,159202.3883,159202.3883,209078.1221,209078.1221,209078.1221,209078.1221
35211,5873,OECD,6516,Land Use change,724413,Emissions (CO2eq) from CH4 (AR5),768.5391,768.5391,768.5391,768.5391,...,101.1322,573.4284,153.3618,436.7699,128.1795,397.3500,270.9106,230.6636,283.7454,324.2807
35212,5873,OECD,6516,Land Use change,724313,Emissions (CO2eq) from N2O (AR5),374.1567,374.1567,374.1567,374.1567,...,87.3914,495.5169,132.5246,377.4261,110.7638,343.3623,234.1021,199.3235,245.1931,280.2209


In [41]:
#Changing the column names to standard form
df3=df3.rename({'Area Code': 'Area_Code', 'Item Code': 'Item_Code','Element Code' : 'Element_Code'}, axis=1)
df3.head()

Unnamed: 0,Area_Code,Area,Item_Code,Item,Element_Code,Element,Y1990,Y1991,Y1992,Y1993,...,Y2010,Y2011,Y2012,Y2013,Y2014,Y2015,Y2016,Y2017,Y2018,Y2019
0,2,Afghanistan,5058,Enteric Fermentation,7225,Emissions (CH4),178.4682,187.55,189.76,190.83,...,401.068,402.513,396.921,393.093,398.287,383.3023,380.8767,371.2863,378.887,389.6563
2,2,Afghanistan,5058,Enteric Fermentation,724413,Emissions (CO2eq) from CH4 (AR5),4997.1108,5251.4,5313.28,5343.24,...,11229.904,11270.364,11113.788,11006.604,11152.036,10732.4631,10664.5483,10396.0163,10608.8357,10910.3754
4,2,Afghanistan,5058,Enteric Fermentation,723113,Emissions (CO2eq) (AR5),4997.1108,5251.4,5313.28,5343.24,...,11229.904,11270.364,11113.788,11006.604,11152.036,10732.4631,10664.5483,10396.0163,10608.8357,10910.3754
6,2,Afghanistan,5059,Manure Management,7225,Emissions (CH4),8.5165,9.3445,9.6669,9.8348,...,26.5669,26.1599,26.0913,26.0859,26.3682,24.8209,24.6903,23.7001,24.2499,26.1252
8,2,Afghanistan,5059,Manure Management,7230,Emissions (N2O),0.3046,0.3044,0.296,0.2903,...,0.398,0.4177,0.4013,0.3868,0.3932,0.3945,0.3931,0.4013,0.4074,0.3654


In [15]:
#Saving the clean data to a csv file
df3.to_csv("Emission_Clean_Data.csv",header=True,index=False,mode="w")

In [42]:
#Normalizing the data(Changing the year colums to rows)
emissions_data_new_df = pd.melt(df3, id_vars=["Area_Code", "Area", "Item_Code", "Item", "Element_Code", "Element"],
                  var_name="Year", value_name="Emission")
emissions_data_new_df.head()

Unnamed: 0,Area_Code,Area,Item_Code,Item,Element_Code,Element,Year,Emission
0,2,Afghanistan,5058,Enteric Fermentation,7225,Emissions (CH4),Y1990,178.4682
1,2,Afghanistan,5058,Enteric Fermentation,724413,Emissions (CO2eq) from CH4 (AR5),Y1990,4997.1108
2,2,Afghanistan,5058,Enteric Fermentation,723113,Emissions (CO2eq) (AR5),Y1990,4997.1108
3,2,Afghanistan,5059,Manure Management,7225,Emissions (CH4),Y1990,8.5165
4,2,Afghanistan,5059,Manure Management,7230,Emissions (N2O),Y1990,0.3046


In [43]:
#Removing the letter "Y" from the year data
emissions_data_new_df['Year']=emissions_data_new_df['Year'].str[1:]
emissions_data_new_df.head()

Unnamed: 0,Area_Code,Area,Item_Code,Item,Element_Code,Element,Year,Emission
0,2,Afghanistan,5058,Enteric Fermentation,7225,Emissions (CH4),1990,178.4682
1,2,Afghanistan,5058,Enteric Fermentation,724413,Emissions (CO2eq) from CH4 (AR5),1990,4997.1108
2,2,Afghanistan,5058,Enteric Fermentation,723113,Emissions (CO2eq) (AR5),1990,4997.1108
3,2,Afghanistan,5059,Manure Management,7225,Emissions (CH4),1990,8.5165
4,2,Afghanistan,5059,Manure Management,7230,Emissions (N2O),1990,0.3046


In [44]:
#Checking the count of normalized data
emissions_data_new_df.count()

Area_Code       905640
Area            905640
Item_Code       905640
Item            905640
Element_Code    905640
Element         905640
Year            905640
Emission        905640
dtype: int64

In [45]:
#Checking the datatypes
emissions_data_new_df.dtypes

Area_Code         int64
Area             object
Item_Code         int64
Item             object
Element_Code      int64
Element          object
Year             object
Emission        float64
dtype: object

In [46]:
# changing the Year data type as int
emissions_data_new_df['Year'] = emissions_data_new_df['Year'].astype(int)


In [47]:
emissions_data_new_df.dtypes

Area_Code         int64
Area             object
Item_Code         int64
Item             object
Element_Code      int64
Element          object
Year              int32
Emission        float64
dtype: object

In [38]:
#Saving the data frame to a csv file
emissions_data_new_df.to_csv("Emission_Normalized_Data.csv",header=True,index=False,mode="w")

In [None]:
#Saving the clean Emission data to S3 bucket
filename = 'Emission_Normalized_Data.csv' #csv name we want to have in bucket
bucketname = 'emission-bucket' #S3 bucket name

csv_buffer = StringIO()
emissions_data_new_df.to_csv(csv_buffer)

## specifying the access key and secret key.
client = boto3.client('s3', 
        region_name = 'us-east-2', #need to change here
        aws_access_key_id = 'access key',
        aws_secret_access_key = 'secret key' )

response = client.put_object(
    ACL='private',
    Body=csv_buffer.getvalue(),
    Bucket=bucketname,
    Key=filename
)

# Population Data Cleaning

In [48]:
# 1. Create a DataFrame for the Population data
population_data_df=pd.read_csv('Population_Data.csv',low_memory=False)

In [49]:
#Display df
population_data_df.head()

Unnamed: 0,Area Code,Area,Item Code,Item,Element Code,Element,Year Code,Year,Unit,Value,Flag,Note
0,2,Afghanistan,3010,Population - Est. & Proj.,511,Total Population - Both sexes,1950,1950,1000 persons,7752.118,X,
1,2,Afghanistan,3010,Population - Est. & Proj.,511,Total Population - Both sexes,1951,1951,1000 persons,7840.156,X,
2,2,Afghanistan,3010,Population - Est. & Proj.,511,Total Population - Both sexes,1952,1952,1000 persons,7935.997,X,
3,2,Afghanistan,3010,Population - Est. & Proj.,511,Total Population - Both sexes,1953,1953,1000 persons,8039.694,X,
4,2,Afghanistan,3010,Population - Est. & Proj.,511,Total Population - Both sexes,1954,1954,1000 persons,8151.317,X,


In [50]:
#Finding standard mathematical functions.
population_data_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Area Code,160411.0,852.35737,1809.356064,1.0,79.0,150.0,224.0,5817.0
Item Code,160411.0,3010.0,0.0,3010.0,3010.0,3010.0,3010.0,3010.0
Element Code,160411.0,526.12996,20.781723,511.0,512.0,513.0,551.0,561.0
Year Code,160411.0,2018.628398,40.911927,1950.0,1985.0,2017.0,2048.0,2100.0
Year,160411.0,2018.628398,40.911927,1950.0,1985.0,2017.0,2048.0,2100.0
Value,160411.0,92479.64615,436584.229823,0.0,462.5945,4397.008,24867.4155,10874900.0
Note,0.0,,,,,,,


In [51]:
#Informations about the columns
population_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160411 entries, 0 to 160410
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Area Code     160411 non-null  int64  
 1   Area          160411 non-null  object 
 2   Item Code     160411 non-null  int64  
 3   Item          160411 non-null  object 
 4   Element Code  160411 non-null  int64  
 5   Element       160411 non-null  object 
 6   Year Code     160411 non-null  int64  
 7   Year          160411 non-null  int64  
 8   Unit          160411 non-null  object 
 9   Value         160411 non-null  float64
 10  Flag          160411 non-null  object 
 11  Note          0 non-null       float64
dtypes: float64(2), int64(5), object(5)
memory usage: 14.7+ MB


In [52]:
#Displaying column names
population_data_df.columns

Index(['Area Code', 'Area', 'Item Code', 'Item', 'Element Code', 'Element',
       'Year Code', 'Year', 'Unit', 'Value', 'Flag', 'Note'],
      dtype='object')

In [53]:
#Dropping unwanted columns

population_data_df=population_data_df.drop(['Item Code','Item','Year Code','Unit','Flag','Note'],axis=1)
population_data_df.head()

Unnamed: 0,Area Code,Area,Element Code,Element,Year,Value
0,2,Afghanistan,511,Total Population - Both sexes,1950,7752.118
1,2,Afghanistan,511,Total Population - Both sexes,1951,7840.156
2,2,Afghanistan,511,Total Population - Both sexes,1952,7935.997
3,2,Afghanistan,511,Total Population - Both sexes,1953,8039.694
4,2,Afghanistan,511,Total Population - Both sexes,1954,8151.317


In [54]:
#Select data for Total population-Both sexes
new=population_data_df.loc[population_data_df['Element Code'] == 511]

new.head()

Unnamed: 0,Area Code,Area,Element Code,Element,Year,Value
0,2,Afghanistan,511,Total Population - Both sexes,1950,7752.118
1,2,Afghanistan,511,Total Population - Both sexes,1951,7840.156
2,2,Afghanistan,511,Total Population - Both sexes,1952,7935.997
3,2,Afghanistan,511,Total Population - Both sexes,1953,8039.694
4,2,Afghanistan,511,Total Population - Both sexes,1954,8151.317


In [55]:
# select the data from year 1990-2019
clean_df=new[(new["Year"] > 1989) &  (new["Year"]  < 2020)]
clean_df.head()

Unnamed: 0,Area Code,Area,Element Code,Element,Year,Value
40,2,Afghanistan,511,Total Population - Both sexes,1990,12412.308
41,2,Afghanistan,511,Total Population - Both sexes,1991,13299.017
42,2,Afghanistan,511,Total Population - Both sexes,1992,14485.546
43,2,Afghanistan,511,Total Population - Both sexes,1993,15816.603
44,2,Afghanistan,511,Total Population - Both sexes,1994,17075.727


In [56]:
#Checking data for the Area Code = 2
clean_df.loc[clean_df["Area Code"]==2]

Unnamed: 0,Area Code,Area,Element Code,Element,Year,Value
40,2,Afghanistan,511,Total Population - Both sexes,1990,12412.308
41,2,Afghanistan,511,Total Population - Both sexes,1991,13299.017
42,2,Afghanistan,511,Total Population - Both sexes,1992,14485.546
43,2,Afghanistan,511,Total Population - Both sexes,1993,15816.603
44,2,Afghanistan,511,Total Population - Both sexes,1994,17075.727
45,2,Afghanistan,511,Total Population - Both sexes,1995,18110.657
46,2,Afghanistan,511,Total Population - Both sexes,1996,18853.437
47,2,Afghanistan,511,Total Population - Both sexes,1997,19357.126
48,2,Afghanistan,511,Total Population - Both sexes,1998,19737.765
49,2,Afghanistan,511,Total Population - Both sexes,1999,20170.844


In [57]:
#Drop Element Code and Element as its not needed for our analysis
clean_population_df=clean_df.drop(['Element Code','Element'],axis=1)

clean_population_df.head()

Unnamed: 0,Area Code,Area,Year,Value
40,2,Afghanistan,1990,12412.308
41,2,Afghanistan,1991,13299.017
42,2,Afghanistan,1992,14485.546
43,2,Afghanistan,1993,15816.603
44,2,Afghanistan,1994,17075.727


In [58]:
#Standardising the column names
clean_population_df=clean_population_df.rename({'Area Code': 'Area_Code', 'Value':'Population'}, axis=1)
clean_population_df.head()

Unnamed: 0,Area_Code,Area,Year,Population
40,2,Afghanistan,1990,12412.308
41,2,Afghanistan,1991,13299.017
42,2,Afghanistan,1992,14485.546
43,2,Afghanistan,1993,15816.603
44,2,Afghanistan,1994,17075.727


In [59]:
#Resetting index
clean_population_df.reset_index(inplace=True, drop=True)
clean_population_df

Unnamed: 0,Area_Code,Area,Year,Population
0,2,Afghanistan,1990,12412.308
1,2,Afghanistan,1991,13299.017
2,2,Afghanistan,1992,14485.546
3,2,Afghanistan,1993,15816.603
4,2,Afghanistan,1994,17075.727
...,...,...,...,...
7925,5817,Net Food Importing Developing Countries,2015,1495081.183
7926,5817,Net Food Importing Developing Countries,2016,1527235.522
7927,5817,Net Food Importing Developing Countries,2017,1559721.436
7928,5817,Net Food Importing Developing Countries,2018,1592589.456


In [60]:
#clean population df is saved as csv
clean_population_df.to_csv("Population_Clean_Data.csv",header=True,index=False,mode="w")
clean_population_df.reset_index(drop=True, inplace=True)

clean_population_df.head()

Unnamed: 0,Area_Code,Area,Year,Population
0,2,Afghanistan,1990,12412.308
1,2,Afghanistan,1991,13299.017
2,2,Afghanistan,1992,14485.546
3,2,Afghanistan,1993,15816.603
4,2,Afghanistan,1994,17075.727


In [None]:
filename = 'Population_Clean_Data.csv' #Any name we want to have in bucket
bucketname = 'emission-bucket' #S3 bucket name

csv_buffer = StringIO()
clean_population_df.to_csv(csv_buffer)

## in AWS to go click ur name and then "My Security Credentials". Then "Access Keys"
client = boto3.client('s3', 
        region_name = 'us-east-2', #need to change here
        aws_access_key_id = 'access key',
        aws_secret_access_key = 'secret key' )

response = client.put_object(
    ACL='private',
    Body=csv_buffer.getvalue(),
    Bucket=bucketname,
    Key=filename
)