In [1]:
%%file total_energy.py

import pandas as pd

def import_data():
    csv_file = "Resources/organised_Gen.csv"
    energy = pd.read_csv(csv_file)
    return energy

Writing total_energy.py


In [2]:
from total_energy import import_data

x = import_data()
x

Unnamed: 0.1,Unnamed: 0,YEAR,MONTH,STATE,TYPE OF PRODUCER,ENERGY SOURCE,GENERATION (Megawatthours)
0,0,2001,1,AK,Total Electric Power Industry,Coal,46903.0
1,1,2001,1,AK,Total Electric Power Industry,Petroleum,71085.0
2,2,2001,1,AK,Total Electric Power Industry,Natural Gas,367521.0
3,3,2001,1,AK,Total Electric Power Industry,Hydroelectric Conventional,104549.0
4,4,2001,1,AK,Total Electric Power Industry,Wind,87.0
...,...,...,...,...,...,...,...
496769,10581,2022,5,WY,"Electric Generators, Electric Utilities",Coal,2071403.0
496770,10582,2022,5,WY,"Electric Generators, Electric Utilities",Hydroelectric Conventional,96790.0
496771,10583,2022,5,WY,"Electric Generators, Electric Utilities",Natural Gas,91570.0
496772,10584,2022,5,WY,"Electric Generators, Electric Utilities",Petroleum,1812.0


In [3]:
%%file tran_total_energy.py
from total_energy import import_data

import pandas as pd

def transform_data():
    
    tf_energy_data = import_data().drop(columns=['Unnamed: 0'])
    
    tf_energy_data = tf_energy_data.rename(columns={
        "YEAR": "year", 
        "MONTH": "month",
        "STATE": "state",
        "TYPE OF PRODUCER": "producer",
        "ENERGY SOURCE": "source",
        "GENERATION (Megawatthours)": "generated"})


    #tf_energy_data['producer'] = tf_energy_data['producer'].apply(lambda x: x.replace(',','/'))
    tf_energy_data = tf_energy_data[tf_energy_data.state != 'US-TOTAL']
    tf_energy_data = tf_energy_data[tf_energy_data.producer != 'Total Electric Power Industry']
    tf_energy_data = tf_energy_data[tf_energy_data.source != 'Total']
    tf_energy_data = tf_energy_data[tf_energy_data.year != 2022]
    
    tf_energy_data = tf_energy_data.drop(columns=['month'])
    tf_energy_data = tf_energy_data.drop(columns=['producer'])
    tf_energy_data = tf_energy_data.drop(columns=['state'])
    
    return tf_energy_data.reset_index(drop = True)

Writing tran_total_energy.py


In [4]:
from tran_total_energy import transform_data
import sys

import psycopg2

from psycopg2 import OperationalError, errorcodes, errors
import psycopg2.extras as extras
import pandas as pd
from io import StringIO
import numpy as np

import plotly.express as px
from timeit import default_timer as timer

from tran_total_energy import transform_data
start = timer()
x = transform_data()
end = timer()
k = end - start 
print("Query execution time: {}s".format(k))
x

Query execution time: 0.5842061419999993s


Unnamed: 0,year,source,generated
0,2001,Coal,18410.0
1,2001,Petroleum,64883.0
2,2001,Natural Gas,305277.0
3,2001,Hydroelectric Conventional,104549.0
4,2001,Wind,87.0
...,...,...,...
282589,2021,Coal,2408632.0
282590,2021,Hydroelectric Conventional,109773.0
282591,2021,Natural Gas,44421.0
282592,2021,Petroleum,7181.0


In [5]:
%%file query_total_energy.py 

from tran_total_energy import transform_data
import sys
import pandas as pd

import psycopg2

from psycopg2 import OperationalError, errorcodes, errors
import psycopg2.extras as extras
from io import StringIO
import numpy as np

import plotly.express as px
from timeit import default_timer as timer

edf = transform_data()

params_dic = {
    "host"      : "localhost",
    "user"      : "postgres",
    "password"  : "postgres",
    "port"      : "5432"
}

def connect(params_dic):
    """ Connect to the PostgreSQL database server """
    conn = None
    try:
        # connect to the PostgreSQL server
        print('Connecting to the PostgreSQL database...')
        conn = psycopg2.connect(**params_dic)
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        sys.exit(1) 
    print("Connection successful")
    return conn
conn = connect(params_dic)

#install psycopg2-binary for MacOs if you don't have it
    #!pip install psycopg2-binary

# Define a function that handles and parses psycopg2 exceptions
def show_psycopg2_exception(err):
    # get details about the exception
    err_type, err_obj, traceback = sys.exc_info()    
    # get the line number when exception occured
    line_n = traceback.tb_lineno    
    # print the connect() error
    print ("\npsycopg2 ERROR:", err, "on line number:", line_n)
    print ("psycopg2 traceback:", traceback, "-- type:", err_type) 
    # psycopg2 extensions.Diagnostics object attribute
    print ("\nextensions.Diagnostics:", err.diag)    
    # print the pgcode and pgerror exceptions
    print ("pgerror:", err.pgerror)
    print ("pgcode:", err.pgcode, "\n")

def create_table(cursor):
    try:
        # 
        cursor.execute("DROP TABLE IF EXISTS energy_over_time;")
        sql = '''CREATE TABLE energy_over_time(
        year INT NOT NULL, 
        source VARCHAR NOT NULL, 
        generated FLOAT NOT NULL
        )'''
        # Creating a table
        cursor.execute(sql);
        print("energy table is created successfully...............")  
    except OperationalError as err:
        # pass exception to function
        show_psycopg2_exception(err)
        # set the connection to 'None' in case of error
        conn = None

# Define function using copy_from() with StringIO to insert the dataframe
def copy_from_dataFile_StringIO(conn, datafrm, table):
    
    #save dataframe to an in memory buffer
    buffer = StringIO()
    datafrm.to_csv(buffer, header=False, index = False)
    buffer.seek(0)
    
    cursor = conn.cursor()
    try:
        cursor.copy_from(buffer, table, sep=",")
        print("Data inserted using copy_from_datafile_StringIO() successfully....")
    except (Exception, psycopg2.DatabaseError) as err:
        # pass exception to function
        show_psycopg2_exception(err)
        cursor.close()

conn = connect(params_dic)

# We set autocommit=True so every command we execute will produce results immediately.
conn.autocommit = True
cursor = conn.cursor()
create_table(cursor)

copy_from_dataFile_StringIO(conn, edf, 'energy_over_time')


def query_data():
    start = timer()

    conn.autocommit = True
    cursor = conn.cursor()
  
    sql = '''SELECT year, source, SUM(generated)
            FROM energy_over_time 
            Group By year, source
            Order By year
                ;''' 
  
    cursor.execute(sql)
    results = cursor.fetchall()
    df = pd.DataFrame (results, columns = ['Year', 'Energy Source','Total Generated'])
    conn.commit()
    end = timer()
    k = end - start 
    #print("Query execution time: {}s".format(k))
    #fig.show()
    return df

Writing query_total_energy.py


In [6]:
from tran_total_energy import transform_data
from query_total_energy import query_data

start = timer()
x = transform_data()
y = query_data()
end = timer()
k = end - start 
print("Query execution time: {}s".format(k))

Connecting to the PostgreSQL database...
Connection successful
Connecting to the PostgreSQL database...
Connection successful
energy table is created successfully...............
Data inserted using copy_from_datafile_StringIO() successfully....
Query execution time: 0.648082438000003s


In [7]:
query_data()

Unnamed: 0,Year,Energy Source,Total Generated
0,2001,Geothermal,13740501.0
1,2001,Wood and Wood Derived Fuels,35199905.0
2,2001,Other Biomass,14548144.0
3,2001,Hydroelectric Conventional,216961052.0
4,2001,Solar Thermal and Photovoltaic,542755.0
...,...,...,...
268,2021,Nuclear,778151500.0
269,2021,Other,12139767.0
270,2021,Wood and Wood Derived Fuels,37170460.0
271,2021,Other Gases,11283103.0


In [8]:
%%file test_total_energy.py

from total_energy import import_data
from tran_total_energy import transform_data
from query_total_energy import query_data

def test_data_columns_count():
    df = import_data()
    assert len(df.columns) == 7

def test_data_row_count():
    df = import_data()
    assert len(df.index) == 496774
    
def test_transform_data_columns_count():
    df = transform_data()
    assert len(df.columns) == 3

def test_transform_data_row_count():
    df = transform_data()
    assert len(df.index) == 282594
    
def test_query_data_column_count():
    df = query_data()
    assert len(df.columns) == 3

def test_query_data_row_count():
    df = query_data()
    assert len(df.index) == 273

Writing test_total_energy.py


In [9]:
# Run the test_import_data.py file with pytest. 
!python -m pytest test_total_energy.py

[1mTest session starts (platform: darwin, Python 3.9.7, pytest 6.2.4, pytest-sugar 0.9.5)[0m
rootdir: /Users/LisaHong/Desktop/github-test-project
plugins: anyio-2.2.0, sugar-0.9.5
[1mcollecting ... [0m
 [36m[0mtest_total_energy.py[0m [32m✓[0m[32m✓[0m[32m✓[0m[32m✓[0m[32m✓[0m[32m✓[0m                                     [32m100% [0m[40m[32m█[0m[40m[32m█[0m[40m[32m█[0m[40m[32m█[0m[40m[32m█[0m[40m[32m█[0m[40m[32m█[0m[40m[32m█[0m[40m[32m█[0m[40m[32m█[0m

Results (4.98s):
[32m       6 passed[0m


In [10]:
state_grouped=x.groupby(['year', 'source']).sum().reset_index()
state_grouped

Unnamed: 0,year,source,generated
0,2001,Coal,1.903956e+09
1,2001,Geothermal,1.374050e+07
2,2001,Hydroelectric Conventional,2.169611e+08
3,2001,Natural Gas,6.391291e+08
4,2001,Nuclear,7.688263e+08
...,...,...,...
268,2021,Petroleum,1.878178e+07
269,2021,Pumped Storage,-5.111684e+06
270,2021,Solar Thermal and Photovoltaic,1.146782e+08
271,2021,Wind,3.797673e+08


In [23]:
fig = px.line(state_grouped, x="year", y="generated",color='source', title="Energy Source from 2001-2021", labels={'year':'Year', 'generated':'Total Energy Generated (Megawatthours)', 'source':'Energy Source'},\
             category_orders= {'source':['Coal','Natural Gas','Nuclear','Hydroelectric Conventional','Wind','Petroleum','Solar Thermal and Photovoltaic','Geothermal','Wood and Wood Derived Fuels','Pumped Storage','Other Gases','Other Biomass','Other']},\
             color_discrete_map={'Other Gases': 'purple','Other Biomass':'gray' ,'Other':'#1f77b4'}) 
fig.show()