# This program merges two files, (1) Emission and (2) Population, and then creates two DataFrames: one for area codes below 5000 and one for the rest. Finally, the two DataFrames are pushed to AWS and loaded into Postgres tables.

In [1]:
#Import Libraries
import pandas as pd 
import os 
import csv 
from sqlalchemy import create_engine
from config import db_password
import psycopg2
import boto3
from io import StringIO

In [2]:
# Load the normalized emission records from the S3-hosted CSV into a DataFrame.
emission_csv_url = 'https://emission-bucket.s3.us-east-2.amazonaws.com/Emission_Normalized_Data.csv'
Normalized_df = pd.read_csv(emission_csv_url)
# Preview the first five rows.
Normalized_df.head()

Unnamed: 0,Area_Code,Area,Item_Code,Item,Element_Code,Element,Year,Emission
0,2,Afghanistan,5058,Enteric Fermentation,7225,Emissions (CH4),1990,178.4682
1,2,Afghanistan,5058,Enteric Fermentation,724413,Emissions (CO2eq) from CH4 (AR5),1990,4997.1108
2,2,Afghanistan,5058,Enteric Fermentation,723113,Emissions (CO2eq) (AR5),1990,4997.1108
3,2,Afghanistan,5059,Manure Management,7225,Emissions (CH4),1990,8.5165
4,2,Afghanistan,5059,Manure Management,7230,Emissions (N2O),1990,0.3046


In [3]:
# reset_index() returns a new DataFrame rather than modifying the caller, so
# the result must be assigned back for the call to have any effect.
# drop=True discards the old index instead of inserting it as an extra
# "index" column, which keeps the column set unchanged for the later merge.
Normalized_df = Normalized_df.reset_index(drop=True)
# Preview the first five rows.
Normalized_df.head()

Unnamed: 0,Area_Code,Area,Item_Code,Item,Element_Code,Element,Year,Emission
0,2,Afghanistan,5058,Enteric Fermentation,7225,Emissions (CH4),1990,178.4682
1,2,Afghanistan,5058,Enteric Fermentation,724413,Emissions (CO2eq) from CH4 (AR5),1990,4997.1108
2,2,Afghanistan,5058,Enteric Fermentation,723113,Emissions (CO2eq) (AR5),1990,4997.1108
3,2,Afghanistan,5059,Manure Management,7225,Emissions (CH4),1990,8.5165
4,2,Afghanistan,5059,Manure Management,7230,Emissions (N2O),1990,0.3046


In [4]:
# Load the cleaned population records from the S3-hosted CSV into a DataFrame.
population_csv_url = 'https://emission-bucket.s3.us-east-2.amazonaws.com/Population_Clean_Data.csv'
Population_df = pd.read_csv(population_csv_url)
# Preview the first five rows.
Population_df.head()

Unnamed: 0,Area_Code,Area,Year,Population
0,2,Afghanistan,1990,12412.308
1,2,Afghanistan,1991,13299.017
2,2,Afghanistan,1992,14485.546
3,2,Afghanistan,1993,15816.603
4,2,Afghanistan,1994,17075.727


In [5]:
# Inner-join emissions with population on the (Area_Code, Year) pair, keeping
# the row order of the emission frame (sort=False).
# Both inputs carry an "Area" column that is not a join key, so pandas
# suffixes them as Area_x (emission side) and Area_y (population side).
merge_keys = ['Area_Code', 'Year']
Merged_df = Normalized_df.merge(Population_df, how='inner', on=merge_keys, sort=False)
Merged_df.head()

Unnamed: 0,Area_Code,Area_x,Item_Code,Item,Element_Code,Element,Year,Emission,Area_y,Population
0,2,Afghanistan,5058,Enteric Fermentation,7225,Emissions (CH4),1990,178.4682,Afghanistan,12412.308
1,2,Afghanistan,5058,Enteric Fermentation,724413,Emissions (CO2eq) from CH4 (AR5),1990,4997.1108,Afghanistan,12412.308
2,2,Afghanistan,5058,Enteric Fermentation,723113,Emissions (CO2eq) (AR5),1990,4997.1108,Afghanistan,12412.308
3,2,Afghanistan,5059,Manure Management,7225,Emissions (CH4),1990,8.5165,Afghanistan,12412.308
4,2,Afghanistan,5059,Manure Management,7230,Emissions (N2O),1990,0.3046,Afghanistan,12412.308


In [6]:
# Total number of missing cells across the whole merged frame.
Merged_df.isna().to_numpy().sum()

0

In [7]:
# Area_y is the population-side copy of the area name produced by the merge;
# remove it so only one area-name column remains.
Merged_df = Merged_df.drop(columns=['Area_y'])
Merged_df.head()

Unnamed: 0,Area_Code,Area_x,Item_Code,Item,Element_Code,Element,Year,Emission,Population
0,2,Afghanistan,5058,Enteric Fermentation,7225,Emissions (CH4),1990,178.4682,12412.308
1,2,Afghanistan,5058,Enteric Fermentation,724413,Emissions (CO2eq) from CH4 (AR5),1990,4997.1108,12412.308
2,2,Afghanistan,5058,Enteric Fermentation,723113,Emissions (CO2eq) (AR5),1990,4997.1108,12412.308
3,2,Afghanistan,5059,Manure Management,7225,Emissions (CH4),1990,8.5165,12412.308
4,2,Afghanistan,5059,Manure Management,7230,Emissions (N2O),1990,0.3046,12412.308


In [8]:
# Strip the merge suffix: the emission-side Area_x column becomes plain "Area".
Merged_df = Merged_df.rename(columns={'Area_x': 'Area'})
Merged_df.head()

Unnamed: 0,Area_Code,Area,Item_Code,Item,Element_Code,Element,Year,Emission,Population
0,2,Afghanistan,5058,Enteric Fermentation,7225,Emissions (CH4),1990,178.4682,12412.308
1,2,Afghanistan,5058,Enteric Fermentation,724413,Emissions (CO2eq) from CH4 (AR5),1990,4997.1108,12412.308
2,2,Afghanistan,5058,Enteric Fermentation,723113,Emissions (CO2eq) (AR5),1990,4997.1108,12412.308
3,2,Afghanistan,5059,Manure Management,7225,Emissions (CH4),1990,8.5165,12412.308
4,2,Afghanistan,5059,Manure Management,7230,Emissions (N2O),1990,0.3046,12412.308


# Rearrange columns

In [9]:
# Final column layout: identifiers first, then Year, with Population placed
# ahead of Emission.
column_order = [
    "Area_Code", "Area",
    "Item_Code", "Item",
    "Element_Code", "Element",
    "Year", "Population", "Emission",
]
Merged_df = Merged_df[column_order]
Merged_df.head()

Unnamed: 0,Area_Code,Area,Item_Code,Item,Element_Code,Element,Year,Population,Emission
0,2,Afghanistan,5058,Enteric Fermentation,7225,Emissions (CH4),1990,12412.308,178.4682
1,2,Afghanistan,5058,Enteric Fermentation,724413,Emissions (CO2eq) from CH4 (AR5),1990,12412.308,4997.1108
2,2,Afghanistan,5058,Enteric Fermentation,723113,Emissions (CO2eq) (AR5),1990,12412.308,4997.1108
3,2,Afghanistan,5059,Manure Management,7225,Emissions (CH4),1990,12412.308,8.5165
4,2,Afghanistan,5059,Manure Management,7230,Emissions (N2O),1990,12412.308,0.3046


In [10]:
# Keep only the rows whose Area_Code is strictly below 5000.
is_below_5000 = Merged_df['Area_Code'] < 5000
Merged_L5000_df = Merged_df.loc[is_below_5000]
Merged_L5000_df.head()

Unnamed: 0,Area_Code,Area,Item_Code,Item,Element_Code,Element,Year,Population,Emission
0,2,Afghanistan,5058,Enteric Fermentation,7225,Emissions (CH4),1990,12412.308,178.4682
1,2,Afghanistan,5058,Enteric Fermentation,724413,Emissions (CO2eq) from CH4 (AR5),1990,12412.308,4997.1108
2,2,Afghanistan,5058,Enteric Fermentation,723113,Emissions (CO2eq) (AR5),1990,12412.308,4997.1108
3,2,Afghanistan,5059,Manure Management,7225,Emissions (CH4),1990,12412.308,8.5165
4,2,Afghanistan,5059,Manure Management,7230,Emissions (N2O),1990,12412.308,0.3046


In [12]:
# Promote Area_Code from a regular column to the row index of the
# "below 5000" frame.
Merged_L5000_df = Merged_L5000_df.set_index('Area_Code')
Merged_L5000_df.head()

Unnamed: 0_level_0,Area,Item_Code,Item,Element_Code,Element,Year,Population,Emission
Area_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2,Afghanistan,5058,Enteric Fermentation,7225,Emissions (CH4),1990,12412.308,178.4682
2,Afghanistan,5058,Enteric Fermentation,724413,Emissions (CO2eq) from CH4 (AR5),1990,12412.308,4997.1108
2,Afghanistan,5058,Enteric Fermentation,723113,Emissions (CO2eq) (AR5),1990,12412.308,4997.1108
2,Afghanistan,5059,Manure Management,7225,Emissions (CH4),1990,12412.308,8.5165
2,Afghanistan,5059,Manure Management,7230,Emissions (N2O),1990,12412.308,0.3046


In [13]:
# Keep the remaining rows: Area_Code of 5000 and above.
# NOTE(review): the sample output shows code 5000 is "World", so these look
# like aggregate areas rather than individual countries — confirm.
is_5000_or_more = Merged_df['Area_Code'] >= 5000
Merged_M5000_df = Merged_df.loc[is_5000_or_more]
Merged_M5000_df.head()

Unnamed: 0,Area_Code,Area,Item_Code,Item,Element_Code,Element,Year,Population,Emission
21795,5000,World,5058,Enteric Fermentation,7225,Emissions (CH4),1990,5327231.061,89059.69
21796,5000,World,5058,Enteric Fermentation,724413,Emissions (CO2eq) from CH4 (AR5),1990,5327231.061,2493671.0
21797,5000,World,5058,Enteric Fermentation,723113,Emissions (CO2eq) (AR5),1990,5327231.061,2493671.0
21798,5000,World,5059,Manure Management,7225,Emissions (CH4),1990,5327231.061,9261.077
21799,5000,World,5059,Manure Management,7230,Emissions (N2O),1990,5327231.061,455.0201


In [14]:
# Promote Area_Code from a regular column to the row index of the
# "5000 and above" frame.
Merged_M5000_df = Merged_M5000_df.set_index('Area_Code')
Merged_M5000_df.head()

Unnamed: 0_level_0,Area,Item_Code,Item,Element_Code,Element,Year,Population,Emission
Area_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
5000,World,5058,Enteric Fermentation,7225,Emissions (CH4),1990,5327231.061,89059.69
5000,World,5058,Enteric Fermentation,724413,Emissions (CO2eq) from CH4 (AR5),1990,5327231.061,2493671.0
5000,World,5058,Enteric Fermentation,723113,Emissions (CO2eq) (AR5),1990,5327231.061,2493671.0
5000,World,5059,Manure Management,7225,Emissions (CH4),1990,5327231.061,9261.077
5000,World,5059,Manure Management,7230,Emissions (N2O),1990,5327231.061,455.0201


# Push Dataframe to AWS

In [None]:
# IPython/Jupyter magic that installs boto3 into the running kernel's
# environment; it is not valid syntax in a plain Python script.
# NOTE(review): boto3 is already imported in the first cell, so on a fresh
# environment this install would have to run before that import.
%pip install boto3

In [None]:
# Upload the "Area_Code < 5000" frame to S3 as a CSV object.
filename = 'Merged_L5000.csv'  # Object key to create inside the bucket
bucketname = 'dataanalyticsproject'  # Destination S3 bucket name

# Serialize the frame to CSV text in memory; nothing is written to local disk.
csv_buffer = StringIO()
Merged_L5000_df.to_csv(csv_buffer)

# Credentials are read from the standard AWS environment variables instead of
# being pasted into the notebook. To obtain them, open the AWS console, click
# your name, then "My Security Credentials" -> "Access Keys", and export the
# key pair as AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY before starting the
# kernel. When the variables are unset, None is passed and boto3 falls back
# to its default credential lookup.
client = boto3.client(
    's3',
    region_name='us-east-2',  # AWS region codes are lower-case ('us-East-2' is not a valid region)
    aws_access_key_id=os.environ.get('AWS_ACCESS_KEY_ID'),
    aws_secret_access_key=os.environ.get('AWS_SECRET_ACCESS_KEY'),
)

# Store the CSV text under the chosen key with a private object ACL.
response = client.put_object(
    ACL='private',
    Body=csv_buffer.getvalue(),
    Bucket=bucketname,
    Key=filename,
)

In [None]:
# Read the Merged_L5000 CSV back from the S3 bucket into a new DataFrame.
# NOTE(review): the upload cell stores this object with ACL='private', so this
# unauthenticated HTTPS read presumably relies on a bucket policy that allows
# public reads — confirm.
merged_l5000_url = 'https://dataanalyticsproject.s3.us-east-2.amazonaws.com/Merged_L5000.csv'
Merged_L5000s_df = pd.read_csv(merged_l5000_url)
Merged_L5000s_df.head()

In [None]:
# Upload the "Area_Code >= 5000" frame to S3 as a CSV object.
filename = 'Merged_M5000.csv'  # Object key to create inside the bucket
bucketname = 'dataanalyticsproject'  # Destination S3 bucket name

# Serialize the frame to CSV text in memory; nothing is written to local disk.
csv_buffer = StringIO()
Merged_M5000_df.to_csv(csv_buffer)

# Credentials are read from the standard AWS environment variables instead of
# being pasted into the notebook. To obtain them, open the AWS console, click
# your name, then "My Security Credentials" -> "Access Keys", and export the
# key pair as AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY before starting the
# kernel. When the variables are unset, None is passed and boto3 falls back
# to its default credential lookup.
client = boto3.client(
    's3',
    region_name='us-east-2',  # AWS region codes are lower-case ('us-East-2' is not a valid region)
    aws_access_key_id=os.environ.get('AWS_ACCESS_KEY_ID'),
    aws_secret_access_key=os.environ.get('AWS_SECRET_ACCESS_KEY'),
)

# Store the CSV text under the chosen key with a private object ACL.
response = client.put_object(
    ACL='private',
    Body=csv_buffer.getvalue(),
    Bucket=bucketname,
    Key=filename,
)

# Create Database Engine

In [16]:
# Build the SQLAlchemy engine for the local "DataAnalyticsProject" database.
# The password is percent-encoded before being placed in the URL: characters
# such as "@", ":" or "/" would otherwise be parsed as URL delimiters and
# corrupt the connection string. SQLAlchemy decodes the value again when it
# parses the URL, so the password sent to the server is unchanged.
from urllib.parse import quote

db_string = f"postgresql://postgres:{quote(db_password, safe='')}@127.0.0.1:5432/DataAnalyticsProject"
engine = create_engine(db_string)

In [17]:
# Write the "Area_Code < 5000" frame to the Merged_L5000 table through the
# engine; if_exists='replace' drops any existing table of that name first.
Merged_L5000_df.to_sql('Merged_L5000', engine, if_exists='replace')

In [18]:
# Write the "Area_Code >= 5000" frame to the Merged_M5000 table through the
# engine; if_exists='replace' drops any existing table of that name first.
Merged_M5000_df.to_sql('Merged_M5000', engine, if_exists='replace')