In [1]:
%%file total_states_energy.py

import pandas as pd

def import_data(csv_file="../Resources/organised_Gen.csv"):
    """Read the electricity-generation CSV into a DataFrame.

    Args:
        csv_file: Path (or any object accepted by ``pandas.read_csv``) of the
            CSV to load. Defaults to the project's ``organised_Gen.csv`` so
            existing zero-argument callers behave exactly as before.

    Returns:
        pandas.DataFrame holding the file contents, unmodified.
    """
    return pd.read_csv(csv_file)

Overwriting total_states_energy.py


In [2]:
from total_states_energy import import_data

# Load the raw CSV; leaving `x` as the last expression renders the frame as the cell output.
x = import_data()
x

Unnamed: 0.1,Unnamed: 0,YEAR,MONTH,STATE,TYPE OF PRODUCER,ENERGY SOURCE,GENERATION (Megawatthours)
0,0,2001,1,AK,Total Electric Power Industry,Coal,46903.0
1,1,2001,1,AK,Total Electric Power Industry,Petroleum,71085.0
2,2,2001,1,AK,Total Electric Power Industry,Natural Gas,367521.0
3,3,2001,1,AK,Total Electric Power Industry,Hydroelectric Conventional,104549.0
4,4,2001,1,AK,Total Electric Power Industry,Wind,87.0
...,...,...,...,...,...,...,...
496769,10581,2022,5,WY,"Electric Generators, Electric Utilities",Coal,2071403.0
496770,10582,2022,5,WY,"Electric Generators, Electric Utilities",Hydroelectric Conventional,96790.0
496771,10583,2022,5,WY,"Electric Generators, Electric Utilities",Natural Gas,91570.0
496772,10584,2022,5,WY,"Electric Generators, Electric Utilities",Petroleum,1812.0


In [3]:
%%file tran_total_states_energy.py

from total_states_energy import import_data

import pandas as pd

def transform_data(energy=None):
    """Reduce the raw generation data to per-state monthly totals.

    Keeps only rows where the producer is 'Total Electric Power Industry' and
    the energy source is 'Total', and discards the 'US-TOTAL' pseudo-state.

    Args:
        energy: Optional raw DataFrame with the original upper-case column
            names. When omitted, the data is loaded with ``import_data()``.
            The frame passed in is not modified.

    Returns:
        pandas.DataFrame with columns ``year``, ``month``, ``state`` and
        ``generated`` and a fresh 0..n-1 index.
    """
    raw = import_data() if energy is None else energy

    # 'Unnamed: 0' is a leftover index column; tolerate files that lack it.
    renamed = raw.drop(columns=['Unnamed: 0'], errors='ignore').rename(columns={
        "YEAR": "year",
        "MONTH": "month",
        "STATE": "state",
        "TYPE OF PRODUCER": "producer",
        "ENERGY SOURCE": "source",
        "GENERATION (Megawatthours)": "generated"})

    # The producer value matched below contains no comma, so rewriting commas
    # in that column first cannot change which rows match; a single vectorised
    # mask also treats a missing producer as "no match" instead of raising.
    keep = (
        (renamed['state'] != 'US-TOTAL')
        & (renamed['producer'] == 'Total Electric Power Industry')
        & (renamed['source'] == 'Total')
    )

    totals = renamed.loc[keep].drop(columns=['producer', 'source'])
    return totals.reset_index(drop=True)

Overwriting tran_total_states_energy.py


In [4]:
from tran_total_states_energy import transform_data
import sys

import psycopg2

from psycopg2 import OperationalError, errorcodes, errors
import psycopg2.extras as extras
import pandas as pd
from io import StringIO
import numpy as np

import plotly.express as px
from timeit import default_timer as timer

# Time the load-and-transform step; transform_data() reads the CSV itself.
start = timer()
x = transform_data()
end = timer()
k = end - start
# This measures the transform, not a database query, so label it accordingly.
print("Transform execution time: {}s".format(k))
x

Query execution time: 0.7194037840000003s


Unnamed: 0,year,month,state,generated
0,2001,1,AK,590145.0
1,2001,1,AL,11326642.0
2,2001,1,AR,3903323.0
3,2001,1,AZ,7573925.0
4,2001,1,CA,16618887.0
...,...,...,...,...
13102,2022,5,VT,157506.0
13103,2022,5,WA,9550776.0
13104,2022,5,WI,4741646.0
13105,2022,5,WV,4276516.0


In [5]:
%%file query_total_states_energy.py 

import os
import sys
from io import StringIO
from timeit import default_timer as timer

import numpy as np
import pandas as pd
import psycopg2
import psycopg2.extras as extras
from psycopg2 import OperationalError, errorcodes, errors

from tran_total_states_energy import transform_data

# Runs at import time: loads and filters the CSV. `edf` is later passed to
# copy_from_dataFile_StringIO to fill the `energy` table.
edf = transform_data()

# Connection settings passed as keyword arguments to psycopg2.connect().
# Each value can be overridden through the standard PostgreSQL environment
# variables so credentials need not be edited into the source; the fallbacks
# are the original local-development values.
params_dic = {
    "host"      : os.environ.get("PGHOST", "localhost"),
    "user"      : os.environ.get("PGUSER", "postgres"),
    "password"  : os.environ.get("PGPASSWORD", "postgres"),
    "port"      : os.environ.get("PGPORT", "5432")
}

def connect(params_dic):
    """Open a PostgreSQL connection from a dict of connection keywords.

    The dict is unpacked into ``psycopg2.connect``. Any failure is printed
    and the process exits with status 1; on success the connection is returned.
    """
    try:
        # announce the attempt, then connect to the PostgreSQL server
        print('Connecting to the PostgreSQL database...')
        connection = psycopg2.connect(**params_dic)
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        sys.exit(1)
    else:
        print("Connection successful")
        return connection
# Open a connection at import time and bind it to the module-level name `conn`.
conn = connect(params_dic)

# If psycopg2 is not installed (e.g. on macOS), install the binary wheel first:
#   pip install psycopg2-binary

# Define a function that handles and parses psycopg2 exceptions
def show_psycopg2_exception(err):
    """Print diagnostic details for an exception raised during database work.

    Works for psycopg2 errors (which carry ``diag``, ``pgerror`` and
    ``pgcode``) and for any other exception, for which those fields are
    printed as ``None``. It may also be called outside an ``except`` block,
    in which case the traceback stored on the exception itself is used.

    Args:
        err: The exception instance to describe.
    """
    # Prefer the exception currently being handled; fall back to the object.
    err_type, _, tb = sys.exc_info()
    if err_type is None:
        err_type = type(err)
    if tb is None:
        tb = getattr(err, "__traceback__", None)
    # line number where the exception occurred, if a traceback is available
    line_n = tb.tb_lineno if tb is not None else None

    print ("\npsycopg2 ERROR:", err, "on line number:", line_n)
    print ("psycopg2 traceback:", tb, "-- type:", err_type)
    # These attributes exist only on psycopg2 errors, so read them defensively.
    print ("\nextensions.Diagnostics:", getattr(err, "diag", None))
    print ("pgerror:", getattr(err, "pgerror", None))
    print ("pgcode:", getattr(err, "pgcode", None), "\n")

def create_table(cursor):
    """(Re)create the ``energy`` table using the given cursor.

    Any existing ``energy`` table is dropped first, so this call discards
    previously loaded rows.

    Args:
        cursor: An open database cursor exposing ``execute``.

    Returns:
        True when both statements ran, False when an ``OperationalError``
        was raised (its details are printed). Other exceptions propagate.
    """
    sql = '''CREATE TABLE energy(
        year INT NOT NULL, 
        month INT NOT NULL, 
        state VARCHAR NOT NULL, 
        generated FLOAT NOT NULL
        )'''
    try:
        # Start from a clean slate so the load below never duplicates rows.
        cursor.execute("DROP TABLE IF EXISTS energy;")
        cursor.execute(sql)
        print("energy table is created successfully...............")
    except OperationalError as err:
        # Report the failure. The previous `conn = None` here only bound a
        # local name and signalled nothing, so return a status instead.
        show_psycopg2_exception(err)
        return False
    return True

# Define function using copy_from() with StringIO to insert the dataframe
def copy_from_dataFile_StringIO(conn, datafrm, table):
    """Bulk-load a DataFrame into a table via ``cursor.copy_from``.

    The frame is serialised to comma-separated text (no header, no index) in
    an in-memory buffer and streamed to the database in a single call.

    Args:
        conn: Open database connection exposing ``cursor()``.
        datafrm: pandas.DataFrame whose column order matches the table.
        table: Name of the destination table.

    Errors raised by the copy are reported with ``show_psycopg2_exception``
    and not re-raised. The cursor is always closed before returning.
    """
    # save dataframe to an in-memory buffer and rewind it for reading
    buffer = StringIO()
    datafrm.to_csv(buffer, header=False, index=False)
    buffer.seek(0)

    cursor = conn.cursor()
    try:
        cursor.copy_from(buffer, table, sep=",")
        print("Data inserted using copy_from_datafile_StringIO() successfully....")
    except Exception as err:
        # psycopg2.DatabaseError is an Exception subclass, so this covers it.
        show_psycopg2_exception(err)
    finally:
        # Close on success as well as failure so the cursor is not leaked.
        cursor.close()

# Reuse the connection already opened right after connect() was defined;
# connecting again here would abandon that first connection without closing it.

# We set autocommit=True so every command we execute will produce results immediately.
conn.autocommit = True
cursor = conn.cursor()
create_table(cursor)

# Fill the freshly created table with the transformed frame.
copy_from_dataFile_StringIO(conn, edf, 'energy')


def query_data(connection=None):
    """Return total generation per state and year as a DataFrame.

    Sums ``generated`` over the ``energy`` table grouped by year and state,
    skipping rows whose year is 2022, ordered by year ascending and then by
    the summed value descending.

    Args:
        connection: Database connection to query. Defaults to the
            module-level ``conn`` so zero-argument callers are unaffected.

    Returns:
        pandas.DataFrame with columns ``Year``, ``State`` and
        ``Total Generated(MWh)``.
    """
    db = conn if connection is None else connection
    db.autocommit = True
    cursor = db.cursor()

    sql = '''SELECT year, state, (SUM(generated)) as sums
            FROM energy 
            WHERE YEAR != 2022
            Group By YEAR, state
            Order By YEAR ASC, sums DESC
                ;''' 

    try:
        cursor.execute(sql)
        results = cursor.fetchall()
    finally:
        # Release the cursor even if the query fails.
        cursor.close()

    df = pd.DataFrame(results, columns=['Year', 'State', 'Total Generated(MWh)'])
    db.commit()
    return df

Overwriting query_total_states_energy.py


In [6]:
from tran_total_states_energy import transform_data
from query_total_states_energy import query_data

# `timer` is timeit.default_timer, imported in an earlier cell; run that cell first.
start = timer()
# Only the SELECT is timed here: the table is created and loaded when
# query_total_states_energy is imported.
y = query_data()
end = timer()
k = end - start 
print("Query execution time: {}s".format(k))

Connecting to the PostgreSQL database...
Connection successful
Connecting to the PostgreSQL database...
Connection successful
energy table is created successfully...............
Data inserted using copy_from_datafile_StringIO() successfully....
Query execution time: 0.008607667000000596s


In [7]:
# Yearly totals per state, as produced by the aggregate in query_data().
df = query_data()

# Bar chart of generation per year, with one colour per state.
fig = px.bar(
    df,
    x="Year",
    y="Total Generated(MWh)",
    color="State",
    title="TOTAL Energy Generated Each Year",
)
#fig.show()

In [8]:
# Smallest and largest yearly total, used to pick the colour range of the map below.
print(*df['Total Generated(MWh)'].agg(['min', 'max']))

35499.0 483561772.0


In [12]:
from query_total_states_energy import query_data
import plotly.express as px

df = query_data()

# The code below is for animating month to month instead of year to year.
#df['Time'] = df[['Year', 'Month']].apply(lambda row: '-'.join(row.values.astype(str)), axis=1)

# Derive the colour range from the data rather than pasting numbers from an
# earlier cell's output, so the scale stays correct when the data changes.
# A single fixed range keeps the colours comparable across animation frames.
range_color = (
    df['Total Generated(MWh)'].min(),
    df['Total Generated(MWh)'].max(),
)

fig = px.choropleth(df, 
          locations = 'State',
          color = 'Total Generated(MWh)', 
          animation_frame = "Year", #chooses years as the animation tick mark
          color_continuous_scale = 'jet_r', #color of the scale
          locationmode = 'USA-states',
          scope = "usa",
          range_color = range_color,
          title = 'Energy Generated by State Over Time',
          height=600
         )
fig.show()

In [10]:
%%file test_total_states_energy.py

from total_states_energy import import_data
from tran_total_states_energy import transform_data
from query_total_states_energy import query_data

def test_data_columns_count():
    """The raw CSV is expected to load with exactly seven columns."""
    n_columns = import_data().shape[1]
    assert n_columns == 7

def test_data_row_count():
    """The raw CSV is expected to load with 496774 rows."""
    n_rows = import_data().shape[0]
    assert n_rows == 496774
    
def test_transform_data_columns_count():
    """The transformed frame is expected to have exactly four columns."""
    n_columns = transform_data().shape[1]
    assert n_columns == 4

def test_transform_data_row_count():
    """The transformed frame is expected to have 13107 rows."""
    n_rows = transform_data().shape[0]
    assert n_rows == 13107
    
def test_query_data_column_count():
    """query_data() must return exactly its three named columns.

    The previous ``!= 4`` check passed for any column count other than four,
    so it could not catch a wrong result shape.
    """
    df = query_data()
    assert list(df.columns) == ['Year', 'State', 'Total Generated(MWh)']

def test_query_data_row_count():
    """The query aggregates monthly rows into one row per (year, state).

    The previous ``!= 13107`` check passed for almost any result, including
    an empty one. These assertions check what the SQL itself guarantees.
    """
    df = query_data()
    # Aggregation must yield some rows, but fewer than the 13107 monthly rows.
    assert 0 < len(df.index) < 13107
    # GROUP BY year, state means each pair appears once.
    assert not df.duplicated(subset=['Year', 'State']).any()
    # The WHERE clause filters out 2022.
    assert (df['Year'] != 2022).all()

Overwriting test_total_states_energy.py


In [11]:
# Run the test_total_states_energy.py file with pytest.
!python -m pytest test_total_states_energy.py

platform darwin -- Python 3.9.7, pytest-6.2.4, py-1.10.0, pluggy-0.13.1
rootdir: /Users/slimykid/Group4_NFLX_MIDTERM/Adeh
plugins: anyio-2.2.0
collected 6 items                                                              [0m[1m

test_total_states_energy.py [32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m                                       [100%][0m

