In [1]:
%%file total_energy.py

import pandas as pd

def import_data(csv_file="../Resources/organised_Gen.csv"):
    """Read the raw generation CSV into a DataFrame.

    Parameters
    ----------
    csv_file : str or path-like, optional
        Location of the CSV file. Defaults to the project's
        ``../Resources/organised_Gen.csv`` so existing zero-argument callers
        keep working; pass another path to load a different extract
        (for example a small fixture in tests).

    Returns
    -------
    pandas.DataFrame
        The file contents exactly as parsed by ``pandas.read_csv``.
    """
    return pd.read_csv(csv_file)

Overwriting total_energy.py


In [2]:
from total_energy import import_data

# Load the raw CSV and list the distinct values of the 'TYPE OF PRODUCER'
# column (the array is this cell's displayed result).
x = import_data()
x['TYPE OF PRODUCER'].unique()

array(['Total Electric Power Industry',
       'Electric Generators, Electric Utilities',
       'Combined Heat and Power, Electric Power',
       'Combined Heat and Power, Commercial Power',
       'Combined Heat and Power, Industrial Power',
       'Electric Generators, Independent Power Producers'], dtype=object)

In [3]:
%%file transformed_total_energy.py

from total_energy import import_data

import pandas as pd

def transform_data(raw=None):
    """Return the cleaned generation data with short column names.

    Parameters
    ----------
    raw : pandas.DataFrame, optional
        A frame shaped like the output of ``import_data()``. When omitted the
        CSV is loaded via ``import_data()``, which is what existing
        zero-argument callers get. Passing a frame avoids re-reading the file.
        The argument is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``year, month, state, producer, source, generated`` with a
        fresh 0..n-1 index. Aggregate rows are removed: state ``US-TOTAL``,
        producer ``Total Electric Power Industry`` and source ``Total``.
    """
    if raw is None:
        raw = import_data()

    # drop() and rename() each return a new frame, so the column assignment
    # below never touches the caller's object.
    tf_energy_data = raw.drop(columns=['Unnamed: 0']).rename(columns={
        "YEAR": "year",
        "MONTH": "month",
        "STATE": "state",
        "TYPE OF PRODUCER": "producer",
        "ENERGY SOURCE": "source",
        "GENERATION (Megawatthours)": "generated"})

    # Commas in producer names are swapped for '/' (literal replacement, not a
    # regex). The vectorised string method also tolerates missing values,
    # unlike a per-row ``x.replace`` call.
    tf_energy_data['producer'] = tf_energy_data['producer'].str.replace(',', '/', regex=False)

    # One combined mask instead of three successive filtered copies.
    is_detail_row = (
        (tf_energy_data['state'] != 'US-TOTAL')
        & (tf_energy_data['producer'] != 'Total Electric Power Industry')
        & (tf_energy_data['source'] != 'Total')
    )

    return tf_energy_data[is_detail_row].reset_index(drop=True)

Overwriting transformed_total_energy.py


In [4]:
from transformed_total_energy import transform_data
import sys

import psycopg2

from psycopg2 import OperationalError, errorcodes, errors
import psycopg2.extras as extras
import pandas as pd
from io import StringIO
import numpy as np

import plotly.express as px
from timeit import default_timer as timer

# Time a single transform_data() call (which itself loads the CSV through
# import_data()) and display the cleaned frame as the cell result.
start = timer()
x = transform_data()
end = timer()
k = end - start 
print("Transform execution time: {}s".format(k))
x

Transform execution time: 0.8191364799999992s


Unnamed: 0,year,month,state,producer,source,generated
0,2001,1,AK,Electric Generators/ Electric Utilities,Coal,18410.0
1,2001,1,AK,Electric Generators/ Electric Utilities,Petroleum,64883.0
2,2001,1,AK,Electric Generators/ Electric Utilities,Natural Gas,305277.0
3,2001,1,AK,Electric Generators/ Electric Utilities,Hydroelectric Conventional,104549.0
4,2001,1,AK,Electric Generators/ Electric Utilities,Wind,87.0
...,...,...,...,...,...,...
288886,2022,5,WY,Electric Generators/ Electric Utilities,Coal,2071403.0
288887,2022,5,WY,Electric Generators/ Electric Utilities,Hydroelectric Conventional,96790.0
288888,2022,5,WY,Electric Generators/ Electric Utilities,Natural Gas,91570.0
288889,2022,5,WY,Electric Generators/ Electric Utilities,Petroleum,1812.0


In [5]:
%%file query_total_energy.py 

import os
import sys
from io import StringIO
from timeit import default_timer as timer

import numpy as np
import pandas as pd
import plotly.express as px
import psycopg2
import psycopg2.extras as extras
from psycopg2 import OperationalError, errorcodes, errors

from transformed_total_energy import transform_data

# Cleaned frame that is bulk-loaded into the `energy` table further down.
edf = transform_data()

# Connection settings passed to psycopg2.connect(**params_dic).
# Each value can be overridden through the conventional PG* environment
# variable so credentials do not have to live in the source; the fallbacks are
# the previous hard-coded local-development values.
params_dic = {
    "host"      : os.environ.get("PGHOST", "localhost"),
    "user"      : os.environ.get("PGUSER", "postgres"),
    "password"  : os.environ.get("PGPASSWORD", "postgres"),
    "port"      : os.environ.get("PGPORT", "5432")
}

def connect(params_dic):
    """Connect to the PostgreSQL database server.

    Parameters
    ----------
    params_dic : dict
        Keyword arguments forwarded to ``psycopg2.connect``.

    Returns
    -------
    The open psycopg2 connection.

    On any connection failure the error is printed and the process exits via
    ``sys.exit(1)`` (i.e. ``SystemExit`` is raised).
    """
    print('Connecting to the PostgreSQL database...')
    try:
        # connect to the PostgreSQL server
        conn = psycopg2.connect(**params_dic)
    except Exception as error:
        # psycopg2.DatabaseError is an Exception subclass, so this single
        # clause covers everything the previous tuple did.
        print(error)
        sys.exit(1)
    print("Connection successful")
    return conn

# No connection is opened here: the module connects once, further down, right
# before the table is created. A second connect() at this point would open a
# connection that is immediately replaced and never closed.

#install psycopg2-binary for MacOs if you don't have it
    #!pip install psycopg2-binary

# Define a function that handles and parses psycopg2 exceptions
def show_psycopg2_exception(err):
    """Print diagnostic details for an exception raised during database work.

    Parameters
    ----------
    err : BaseException
        The exception to report. psycopg2 errors expose ``diag``, ``pgerror``
        and ``pgcode``; for any other exception type those fields are printed
        as ``None`` instead of raising ``AttributeError`` and hiding the
        original problem.

    The line number is taken from the exception currently being handled; when
    called outside an ``except`` block it falls back to ``err.__traceback__``
    and prints ``None`` if no traceback is available.
    """
    # get details about the exception currently being handled (if any)
    err_type, _err_obj, tb = sys.exc_info()
    if tb is None:
        err_type = type(err)
        tb = getattr(err, "__traceback__", None)
    # get the line number when exception occurred
    line_n = tb.tb_lineno if tb is not None else None
    # print the error and where it happened
    print ("\npsycopg2 ERROR:", err, "on line number:", line_n)
    print ("psycopg2 traceback:", tb, "-- type:", err_type)
    # psycopg2 extensions.Diagnostics object attribute
    print ("\nextensions.Diagnostics:", getattr(err, "diag", None))
    # print the pgcode and pgerror exceptions
    print ("pgerror:", getattr(err, "pgerror", None))
    print ("pgcode:", getattr(err, "pgcode", None), "\n")

def create_table(cursor):
    """Drop and recreate the ``energy`` table using ``cursor``.

    Parameters
    ----------
    cursor
        An open database cursor; two statements are executed on it
        (``DROP TABLE IF EXISTS`` followed by ``CREATE TABLE``).

    A ``psycopg2.OperationalError`` is reported through
    ``show_psycopg2_exception`` and swallowed; any other exception propagates
    to the caller. Returns ``None``.
    """
    try:
        # Start from a clean slate so the module can be re-run.
        cursor.execute("DROP TABLE IF EXISTS energy;")
        sql = '''CREATE TABLE energy(
        year INT NOT NULL,
        month INT NOT NULL,
        state VARCHAR(8) NOT NULL,
        producer VARCHAR(50) NOT NULL,
        source VARCHAR(50) NOT NULL,
        energy FLOAT NOT NULL
        )'''
        # Creating a table
        cursor.execute(sql)
        print("energy table is created successfully...............")
    except OperationalError as err:
        # pass exception to function
        show_psycopg2_exception(err)
        # The former `conn = None` here only bound a local name and never
        # affected the caller's connection, so it has been removed.

# Define function using copy_from() with StringIO to insert the dataframe
def copy_from_dataFile_StringIO(conn, datafrm, table):
    """Bulk-load ``datafrm`` into ``table`` with ``cursor.copy_from``.

    Parameters
    ----------
    conn
        Open database connection; a cursor is created from it and always
        closed before returning.
    datafrm : pandas.DataFrame
        Rows to load. They are serialised as comma-separated text without
        header or index, so column order must match the table's columns.
    table : str
        Name of the destination table.

    Any exception raised during the copy is reported through
    ``show_psycopg2_exception`` and swallowed. Returns ``None``.
    """
    # save dataframe to an in memory buffer
    buffer = StringIO()
    datafrm.to_csv(buffer, header=False, index=False)
    buffer.seek(0)
    # NOTE(review): the buffer is comma-delimited, so a field that itself
    # contains a comma would presumably not load correctly; confirm inputs
    # are comma-free before calling.

    cursor = conn.cursor()
    try:
        cursor.copy_from(buffer, table, sep=",")
        print("Data inserted using copy_from_datafile_StringIO() successfully....")
    except (Exception, psycopg2.DatabaseError) as err:
        # pass exception to function
        show_psycopg2_exception(err)
    finally:
        # Release the cursor on success as well as on failure.
        cursor.close()

# --- Module-level load -------------------------------------------------------
# These statements run when the module is imported: they connect to PostgreSQL,
# recreate the `energy` table and bulk-load the cleaned frame `edf` into it.
# `conn` stays bound at module level and is the connection query_data() uses.
conn = connect(params_dic)

# We set autocommit=True so every command we execute will produce results immediately.
conn.autocommit = True
cursor = conn.cursor()
create_table(cursor)

# Column order of `edf` must match the table definition in create_table(),
# because the rows are copied without a header.
copy_from_dataFile_StringIO(conn, edf, 'energy')


def query_data():
    """Return yearly renewable generation per producer from the ``energy`` table.

    Uses the module-level connection ``conn``. Rows are restricted to years
    2001 through 2021 and to the listed renewable sources, with aggregate rows
    (``US-TOTAL`` state, ``Total Electric Power Industry`` producer, ``Total``
    source) excluded. Results are summed per (year, producer) and ordered by
    year, then producer, both descending.

    Returns
    -------
    pandas.DataFrame
        Columns ``Year``, ``Producer`` and ``Total Generated(MWh)``.
    """
    conn.autocommit = True

    sql = '''SELECT year, producer, SUM(energy) AS total_generated
                FROM energy
                WHERE year >= 2001 AND year < 2022
                    AND source IN ('Wind', 'Solar Thermal and Photovoltaic',
                                   'Hydroelectric Conventional', 'Geothermal',
                                   'Wood and Wood Derived Fuels', 'Other Biomass',
                                   'Pumped Storage')
                    AND state != 'US-TOTAL'
                    AND producer != 'Total Electric Power Industry'
                    AND source != 'Total'
                GROUP BY year, producer
                ORDER BY year DESC, producer DESC;'''

    cursor = conn.cursor()
    try:
        cursor.execute(sql)
        results = cursor.fetchall()
    finally:
        # Always release the cursor; the connection itself stays open for reuse.
        cursor.close()

    # No commit is needed: this is a read-only query on an autocommit connection.
    return pd.DataFrame(results, columns=['Year', 'Producer', 'Total Generated(MWh)'])

Overwriting query_total_energy.py


In [6]:
from transformed_total_energy import transform_data
from query_total_energy import query_data

# Time the three pipeline stages back to back.
# NOTE: `timer` and `import_data` are not imported in this cell; they are
# expected to be in the kernel namespace from earlier cells.
# NOTE: query_data() takes no arguments and reads from the database, so `a`
# and `x` are not used by it -- presumably they are computed here only so that
# their cost is included in the measurement.
# The table load triggered by importing query_total_energy happens before
# `start` and is therefore not part of the reported time.
start = timer()
a = import_data()
x = transform_data()
y = query_data()
end = timer()
k = end - start 
print("Import, Transform and Query execution time: {}s".format(k))
y

Connecting to the PostgreSQL database...
Connection successful
Connecting to the PostgreSQL database...
Connection successful
energy table is created successfully...............
Data inserted using copy_from_datafile_StringIO() successfully....
Import, Transform and Query execution time: 1.7564214600000003s


Unnamed: 0,Year,Producer,Total Generated(MWh)
0,2021,Electric Generators/ Independent Power Producers,463220603.0
1,2021,Electric Generators/ Electric Utilities,323768754.0
2,2021,Combined Heat and Power/ Industrial Power,27537256.0
3,2021,Combined Heat and Power/ Electric Power,3054152.0
4,2021,Combined Heat and Power/ Commercial Power,3694951.0
...,...,...,...
100,2001,Electric Generators/ Independent Power Producers,52026743.0
101,2001,Electric Generators/ Electric Utilities,191765253.0
102,2001,Combined Heat and Power/ Industrial Power,30630146.0
103,2001,Combined Heat and Power/ Electric Power,3392669.0


In [7]:
# Mean of the per-(year, producer) totals in the query result `y`.
y['Total Generated(MWh)'].mean()

99824051.75533333

In [13]:
from query_total_energy import query_data

# Line chart of yearly renewable generation, one line per producer type.
df = query_data()
fig = px.line(df, x="Year", y="Total Generated(MWh)", color = 'Producer',
              title="Total Renewable Energy Production Over time", markers=True)

# Horizontal reference line at the mean of the plotted values. It is computed
# from `df` rather than pasted in as a literal, so it stays correct whenever the
# underlying data or query changes.
avg_generated = df["Total Generated(MWh)"].mean()
fig.add_shape(
    type = "line", line_color = "#d62728", line_width = 3, opacity = 1, line_dash = "dot",
    x0=0, x1=1, xref = "paper", y0 = avg_generated, y1 = avg_generated, yref = "y")

fig.show()

In [11]:
%%file test_total_energy.py

from total_energy import import_data
from transformed_total_energy import transform_data
from query_total_energy import query_data

def test_data_columns_count():
    """The raw CSV is expected to load with exactly seven columns."""
    raw = import_data()
    n_columns = raw.shape[1]
    assert n_columns == 7

def test_data_row_count():
    """The raw CSV is expected to load with exactly 496774 rows."""
    raw = import_data()
    n_rows = raw.shape[0]
    assert n_rows == 496774
    
def test_transform_data_columns_count():
    """The cleaned frame has exactly the six renamed columns, in order.

    This replaces the former ``!= 7`` check, which would also have passed for
    a frame with any wrong number of columns.
    """
    df = transform_data()
    assert list(df.columns) == ["year", "month", "state", "producer", "source", "generated"]

def test_transform_data_row_count():
    """Cleaning must remove the aggregate rows but not everything.

    This replaces the former ``!= 496774`` check, which would also have passed
    for an empty frame or for one that gained rows.
    """
    raw_rows = len(import_data().index)
    df = transform_data()
    assert 0 < len(df.index) < raw_rows
    # None of the aggregate categories may survive the transform.
    assert not (df["state"] == "US-TOTAL").any()
    assert not (df["producer"] == "Total Electric Power Industry").any()
    assert not (df["source"] == "Total").any()
    
def test_query_data_column_count():
    """The query result is expected to have exactly three columns."""
    result = query_data()
    n_columns = result.shape[1]
    assert n_columns == 3

def test_query_data_row_count():
    """The query result is expected to have exactly 105 rows (one per year/producer group)."""
    result = query_data()
    n_rows = result.shape[0]
    assert n_rows == 105

Overwriting test_total_energy.py


In [12]:
# Run the test_total_energy.py file (written by the previous %%file cell) with pytest.
!python -m pytest test_total_energy.py

platform darwin -- Python 3.9.7, pytest-6.2.4, py-1.10.0, pluggy-0.13.1
rootdir: /Users/slimykid/Group4_NFLX_MIDTERM/Adeh
plugins: anyio-2.2.0
collected 6 items                                                              [0m[1m

test_total_energy.py [32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m                                              [100%][0m

