# ETL of Emission and Population data


In [None]:

# Install boto3, which this notebook imports and uses to upload the cleaned CSVs to S3.
%pip install boto3

In [2]:
#Import dependencies
import pandas as pd 
import os 
import csv 
import numpy as np 
from sqlalchemy import create_engine, inspect
import boto3
from io import StringIO


In [None]:
# Load the raw emissions CSV into a DataFrame.
# low_memory=False makes pandas parse the file in a single pass instead of in chunks,
# which avoids mixed-dtype inference within a column.
emissions_data_df=pd.read_csv('Emissions_Data.csv',low_memory=False)


#  Emission Data Cleaning

In [None]:
# Preview the first five rows of the raw emissions data.
emissions_data_df.head()

In [None]:
# Show the dtype pandas inferred for each column of the emissions DataFrame.
emissions_data_df.dtypes

In [None]:
# Display the column labels of the emissions DataFrame.
emissions_data_df.columns

In [None]:
# Number of non-null values in each column.
# count() skips missing values, so this is a per-column figure rather than a single row count.
emissions_data_df.count()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns,
# transposed so that each source column becomes one row.
emissions_data_df.describe().T

In [None]:
# Print a concise summary of the DataFrame: index range, columns, dtypes and memory usage.
emissions_data_df.info()

In [None]:
# Display the column labels again (repeats the earlier `.columns` cell).
emissions_data_df.columns

In [None]:
# Boolean frame with the same shape as the data: True wherever a value is missing.
emissions_data_df.isnull()

In [None]:
# Number of missing values in each column.
emissions_data_df.isnull().sum()

In [None]:
# Number of distinct non-null values per column, returned as a plain list in column order.
emissions_data_df.nunique().tolist()

In [None]:
# Collect the distinct values of the 'Area' column and display them.
# NOTE(review): 'Area' may also contain regional aggregates, not only countries — confirm.
countries=emissions_data_df['Area'].unique()
countries

In [None]:
# Collect the distinct values of the 'Item' column and display them.
items=emissions_data_df['Item'].unique()
items

In [None]:
# Collect the distinct values of the 'Element' column and display them.
# (The cell previously displayed `items`, so the elements were never shown.)
elements=emissions_data_df['Element'].unique()
elements

In [None]:
# Store the distinct 'Source Code' and 'Source' values, then print both arrays.
source_code=emissions_data_df['Source Code'].unique()
source=emissions_data_df['Source'].unique()
print(source_code)
print(source)

In [None]:
# Inspect the rows whose Source Code is 3051.
emissions_data_df[emissions_data_df['Source Code'] == 3051]

In [None]:
# Remove every row whose Source Code is 3051 (the notebook treats these rows as too sparse).
# The DataFrame is modified in place; the first 50 remaining rows are then displayed.
rows_to_remove = emissions_data_df.index[emissions_data_df['Source Code'] == 3051]
emissions_data_df.drop(index=rows_to_remove, inplace=True)
emissions_data_df.head(50)

In [None]:
# For each Source Code, count the rows that have a non-null value in 'Y1979'.
x = emissions_data_df.groupby(['Source Code'])['Y1979'].count()
x

In [None]:
# Inspect the rows whose Source Code is 3050.
emissions_data_df[emissions_data_df['Source Code'] == 3050]

In [None]:
# Remove the 'Source' and 'Source Code' columns, which are not needed for the analysis.
# emissions_data_df is rebound without 'Source'; df1 is that frame without 'Source Code' as well.
emissions_data_df = emissions_data_df.drop(columns=['Source'])
df1 = emissions_data_df.drop(columns=['Source Code'])


In [None]:
# Preview the first five rows of df1 to confirm the source columns are gone.
df1.head()

In [None]:
# Collect the distinct values of the 'Unit' column and display them.
unit=df1['Unit'].unique()
unit

In [None]:
# Build df2 as df1 without the 'Unit' column.
df2 = df1.drop(columns=['Unit'])


In [None]:
# Preview the first five rows of df2.
df2.head()

In [None]:
# Collect the distinct values found in the 'Y1961F' column and display them.
ar=df2['Y1961F'].unique()
ar

In [None]:
# Display the 'Y1961N' column to see what values it holds.
df2['Y1961N']

In [None]:
# Display the 'Y1989' column to see what values it holds.
df2['Y1989']

In [None]:
# Display the 'Y1990' column to see what values it holds.
df2['Y1990']

In [None]:
# Bind the name df3 to the same DataFrame object as df2.
# No copy is made here: until df3 is reassigned, in-place changes through either name affect both.
df3=df2
df3.head()

In [None]:
# Reduce df3 to the plain yearly value columns for 1990-2019:
#   * drop the "Y<year>F" and "Y<year>N" companion columns for every year 1961-2019
#     (presumably per-year flag/note columns — confirm against the dataset description);
#   * drop the "Y<year>" value columns for 1961-1989, which the notebook treats as too sparse.
# All labels are collected first and removed in a single drop() call, so the frame is
# copied once instead of once per column.
all_years = range(1961, 2020)
flag_columns = ["Y" + str(year) + "F" for year in all_years]
note_columns = ["Y" + str(year) + "N" for year in all_years]
early_year_columns = ["Y" + str(year) for year in range(1961, 1990)]
df3 = df3.drop(flag_columns + note_columns + early_year_columns, axis=1)
     
    

In [None]:
# Preview the first five rows of df3.
df3.head()

In [None]:
# Look up and print the integer positions of the 'Y1990' and 'Y2019' columns in df3.
# The positions are only printed; index1/index2 are not read anywhere else in the visible notebook.
index1=df3.columns.get_loc("Y1990")
print(index1)

index2=df3.columns.get_loc("Y2019")
print(index2)


In [None]:
# Count the missing values in each yearly column from 'Y1990' through 'Y2019'.
# The columns are selected by label (a .loc label slice includes both endpoints) rather than
# by the hard-coded positions 6:36, so the result stays correct if the leading columns change.
df3.loc[:, "Y1990":"Y2019"].isnull().sum()


In [None]:
# Show the rows for Area Code 2 so their missing values can be inspected.
# The filtered frame has to be the last expression in the cell to be rendered; the cell
# previously ended with a bare `df3`, which discarded the filter and showed the whole table.
df3.loc[df3["Area Code"] == 2]

In [None]:
# Replace every missing value in df3 (all columns) with 0, then display the result.
# NOTE(review): after this step a missing measurement is indistinguishable from a true zero.
df3=df3.fillna(0)
df3

In [None]:
# Replace the spaces in the three code-column labels with underscores, then preview the result.
standard_names = {
    'Area Code': 'Area_Code',
    'Item Code': 'Item_Code',
    'Element Code': 'Element_Code',
}
df3 = df3.rename(columns=standard_names)
df3.head()

In [None]:
# Write the cleaned wide-format table to Emission_Clean_Data.csv with a header row and
# without the index column; mode="w" overwrites any existing file of that name.
df3.to_csv("Emission_Clean_Data.csv",header=True,index=False,mode="w")

In [None]:
# Reshape from wide to long: the six identifier columns are kept on every row, and each
# remaining column becomes one row holding its label in "Year" and its value in "Emission".
id_columns = ["Area_Code", "Area", "Item_Code", "Item", "Element_Code", "Element"]
emissions_data_new_df = pd.melt(
    df3,
    id_vars=id_columns,
    var_name="Year",
    value_name="Emission",
)
emissions_data_new_df.head()

In [None]:
# Drop the first character of every Year label (e.g. 'Y1990' -> '1990').
# The values are still strings after this step.
emissions_data_new_df['Year']=emissions_data_new_df['Year'].str[1:]
emissions_data_new_df.head()

In [None]:
# Number of non-null values in each column of the long-format data.
emissions_data_new_df.count()

In [None]:
# Show the dtype of each column of the long-format data.
emissions_data_new_df.dtypes

In [None]:
# Convert the Year column from strings to integers.
emissions_data_new_df['Year'] = emissions_data_new_df['Year'].astype(int)


In [None]:
# Re-check the dtypes to confirm Year is now an integer column.
emissions_data_new_df.dtypes

In [None]:
# Write the long-format table to Emission_Normalized_Data.csv with a header row and
# without the index column; mode="w" overwrites any existing file of that name.
emissions_data_new_df.to_csv("Emission_Normalized_Data.csv",header=True,index=False,mode="w")

In [None]:
# Upload the long-format emission data to S3 as a CSV object.
filename = 'Emission_Normalized_Data.csv'  # object key to use inside the bucket
bucketname = 'emission-bucket'  # target S3 bucket

# Serialize the DataFrame to an in-memory text buffer.
# index=False keeps the uploaded object free of the extra unnamed index column, matching
# the local Emission_Normalized_Data.csv, which is also written with index=False.
csv_buffer = StringIO()
emissions_data_new_df.to_csv(csv_buffer, index=False)

# No access key or secret key is placed in the notebook. boto3 resolves credentials from
# its default chain (AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables,
# the shared ~/.aws/credentials file, or an attached IAM role).
client = boto3.client(
    's3',
    region_name='us-east-2',  # change to the region the bucket lives in
)

# Store the CSV text under `filename` with a private ACL.
response = client.put_object(
    ACL='private',
    Body=csv_buffer.getvalue(),
    Bucket=bucketname,
    Key=filename
)

# Population Data Cleaning

In [3]:
# Load the raw population CSV into a DataFrame.
# low_memory=False makes pandas parse the file in a single pass instead of in chunks.
population_data_df=pd.read_csv('Population_Data.csv',low_memory=False)

In [4]:
# Preview the first five rows of the raw population data.
population_data_df.head()

Unnamed: 0,Area Code,Area,Item Code,Item,Element Code,Element,Year Code,Year,Unit,Value,Flag,Note
0,2,Afghanistan,3010,Population - Est. & Proj.,511,Total Population - Both sexes,1950,1950,1000 persons,7752.118,X,
1,2,Afghanistan,3010,Population - Est. & Proj.,511,Total Population - Both sexes,1951,1951,1000 persons,7840.156,X,
2,2,Afghanistan,3010,Population - Est. & Proj.,511,Total Population - Both sexes,1952,1952,1000 persons,7935.997,X,
3,2,Afghanistan,3010,Population - Est. & Proj.,511,Total Population - Both sexes,1953,1953,1000 persons,8039.694,X,
4,2,Afghanistan,3010,Population - Est. & Proj.,511,Total Population - Both sexes,1954,1954,1000 persons,8151.317,X,


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns,
# transposed so that each source column becomes one row.
population_data_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Area Code,160411.0,852.35737,1809.356064,1.0,79.0,150.0,224.0,5817.0
Item Code,160411.0,3010.0,0.0,3010.0,3010.0,3010.0,3010.0,3010.0
Element Code,160411.0,526.12996,20.781723,511.0,512.0,513.0,551.0,561.0
Year Code,160411.0,2018.628398,40.911927,1950.0,1985.0,2017.0,2048.0,2100.0
Year,160411.0,2018.628398,40.911927,1950.0,1985.0,2017.0,2048.0,2100.0
Value,160411.0,92479.64615,436584.229823,0.0,462.5945,4397.008,24867.4155,10874900.0
Note,0.0,,,,,,,


In [6]:
# Print column names, non-null counts, dtypes and memory usage for the population data.
population_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160411 entries, 0 to 160410
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Area Code     160411 non-null  int64  
 1   Area          160411 non-null  object 
 2   Item Code     160411 non-null  int64  
 3   Item          160411 non-null  object 
 4   Element Code  160411 non-null  int64  
 5   Element       160411 non-null  object 
 6   Year Code     160411 non-null  int64  
 7   Year          160411 non-null  int64  
 8   Unit          160411 non-null  object 
 9   Value         160411 non-null  float64
 10  Flag          160411 non-null  object 
 11  Note          0 non-null       float64
dtypes: float64(2), int64(5), object(5)
memory usage: 14.7+ MB


In [8]:
# Display the column labels of the population DataFrame.
population_data_df.columns

Index(['Area Code', 'Area', 'Item Code', 'Item', 'Element Code', 'Element',
       'Year Code', 'Year', 'Unit', 'Value', 'Flag', 'Note'],
      dtype='object')

In [9]:
# Remove the columns that are not used in the analysis, then preview the result.
unused_columns = ['Item Code', 'Item', 'Year Code', 'Unit', 'Flag', 'Note']

population_data_df = population_data_df.drop(columns=unused_columns)
population_data_df.head()

Unnamed: 0,Area Code,Area,Element Code,Element,Year,Value
0,2,Afghanistan,511,Total Population - Both sexes,1950,7752.118
1,2,Afghanistan,511,Total Population - Both sexes,1951,7840.156
2,2,Afghanistan,511,Total Population - Both sexes,1952,7935.997
3,2,Afghanistan,511,Total Population - Both sexes,1953,8039.694
4,2,Afghanistan,511,Total Population - Both sexes,1954,8151.317


In [10]:
# Keep only the rows with Element Code 511 ("Total Population - Both sexes").
is_total_population = population_data_df['Element Code'] == 511
new = population_data_df.loc[is_total_population]

new.head()

Unnamed: 0,Area Code,Area,Element Code,Element,Year,Value
0,2,Afghanistan,511,Total Population - Both sexes,1950,7752.118
1,2,Afghanistan,511,Total Population - Both sexes,1951,7840.156
2,2,Afghanistan,511,Total Population - Both sexes,1952,7935.997
3,2,Afghanistan,511,Total Population - Both sexes,1953,8039.694
4,2,Afghanistan,511,Total Population - Both sexes,1954,8151.317


In [11]:
# Restrict the data to the years 1990 through 2019, both inclusive.
in_study_period = (new["Year"] >= 1990) & (new["Year"] <= 2019)
clean_df = new[in_study_period]
clean_df.head()

Unnamed: 0,Area Code,Area,Element Code,Element,Year,Value
40,2,Afghanistan,511,Total Population - Both sexes,1990,12412.308
41,2,Afghanistan,511,Total Population - Both sexes,1991,13299.017
42,2,Afghanistan,511,Total Population - Both sexes,1992,14485.546
43,2,Afghanistan,511,Total Population - Both sexes,1993,15816.603
44,2,Afghanistan,511,Total Population - Both sexes,1994,17075.727


In [12]:
# Spot-check the filtered rows for a single area (Area Code 2).
clean_df.loc[clean_df["Area Code"]==2]

Unnamed: 0,Area Code,Area,Element Code,Element,Year,Value
40,2,Afghanistan,511,Total Population - Both sexes,1990,12412.308
41,2,Afghanistan,511,Total Population - Both sexes,1991,13299.017
42,2,Afghanistan,511,Total Population - Both sexes,1992,14485.546
43,2,Afghanistan,511,Total Population - Both sexes,1993,15816.603
44,2,Afghanistan,511,Total Population - Both sexes,1994,17075.727
45,2,Afghanistan,511,Total Population - Both sexes,1995,18110.657
46,2,Afghanistan,511,Total Population - Both sexes,1996,18853.437
47,2,Afghanistan,511,Total Population - Both sexes,1997,19357.126
48,2,Afghanistan,511,Total Population - Both sexes,1998,19737.765
49,2,Afghanistan,511,Total Population - Both sexes,1999,20170.844


In [13]:
# Remove 'Element Code' and 'Element': after the filter on Element Code 511 they are
# constant and not needed for the analysis.
clean_population_df = clean_df.drop(columns=['Element Code', 'Element'])

clean_population_df.head()

Unnamed: 0,Area Code,Area,Year,Value
40,2,Afghanistan,1990,12412.308
41,2,Afghanistan,1991,13299.017
42,2,Afghanistan,1992,14485.546
43,2,Afghanistan,1993,15816.603
44,2,Afghanistan,1994,17075.727


In [14]:
# Rename 'Area Code' to 'Area_Code' and 'Value' to 'Population', then preview the result.
standard_names = {
    'Area Code': 'Area_Code',
    'Value': 'Population',
}
clean_population_df = clean_population_df.rename(columns=standard_names)
clean_population_df.head()

Unnamed: 0,Area_Code,Area,Year,Population
40,2,Afghanistan,1990,12412.308
41,2,Afghanistan,1991,13299.017
42,2,Afghanistan,1992,14485.546
43,2,Afghanistan,1993,15816.603
44,2,Afghanistan,1994,17075.727


In [15]:
# Renumber the rows 0..n-1 in place, discarding the index labels left over from filtering
# (drop=True prevents the old index from being kept as a column), then display the frame.
clean_population_df.reset_index(inplace=True, drop=True)
clean_population_df

Unnamed: 0,Area_Code,Area,Year,Population
0,2,Afghanistan,1990,12412.308
1,2,Afghanistan,1991,13299.017
2,2,Afghanistan,1992,14485.546
3,2,Afghanistan,1993,15816.603
4,2,Afghanistan,1994,17075.727
...,...,...,...,...
7925,5817,Net Food Importing Developing Countries,2015,1495081.183
7926,5817,Net Food Importing Developing Countries,2016,1527235.522
7927,5817,Net Food Importing Developing Countries,2017,1559721.436
7928,5817,Net Food Importing Developing Countries,2018,1592589.456


In [16]:
# Write the cleaned population table to Population_Clean_Data.csv with a header row and
# without the index column; mode="w" overwrites any existing file of that name.
clean_population_df.to_csv(
    "Population_Clean_Data.csv",
    index=False,
    header=True,
    mode="w",
)
# Renumber the rows 0..n-1 in place, discarding the previous index.
clean_population_df.reset_index(inplace=True, drop=True)

clean_population_df.head()

Unnamed: 0,Area_Code,Area,Year,Population
0,2,Afghanistan,1990,12412.308
1,2,Afghanistan,1991,13299.017
2,2,Afghanistan,1992,14485.546
3,2,Afghanistan,1993,15816.603
4,2,Afghanistan,1994,17075.727


In [17]:
# Upload the cleaned population data to S3 as a CSV object.
filename = 'Population_Clean_Data.csv'  # object key to use inside the bucket
bucketname = 'emission-bucket'  # target S3 bucket

# Serialize the DataFrame to an in-memory text buffer.
# index=False keeps the uploaded object free of the extra unnamed index column, matching
# the local Population_Clean_Data.csv, which is also written with index=False.
csv_buffer = StringIO()
clean_population_df.to_csv(csv_buffer, index=False)

# No access key or secret key is placed in the notebook. boto3 resolves credentials from
# its default chain (AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables,
# the shared ~/.aws/credentials file, or an attached IAM role).
client = boto3.client(
    's3',
    region_name='us-east-2',  # change to the region the bucket lives in
)

# Store the CSV text under `filename` with a private ACL.
response = client.put_object(
    ACL='private',
    Body=csv_buffer.getvalue(),
    Bucket=bucketname,
    Key=filename
)