In [1]:
# Activate Spark in our Colab notebook.
import os
# Pick a Spark 3.x release listed at http://www.apache.org/dist/spark/ and enter it as the spark version.
# For example: 'spark-3.2.2'
spark_version = 'spark-3.2.2'
# spark_version = 'spark-3.<enter version>'
# Export the version so the shell commands below can expand $SPARK_VERSION.
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
# NOTE(review): wget runs with -q, so a failed download is not reported here and
# would only surface when tar cannot find the archive — confirm the chosen
# version is still published at this URL.
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.2.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.2.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
# SPARK_HOME uses the same directory name as the extracted archive.
# NOTE(review): assumes the archive was unpacked in /content — confirm the working directory.
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3.2"

Hit:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
Get:2 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:6 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:7 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Get:8 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:10 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Get:11 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [83.3 kB]
Hit:12 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Hit:13 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease
Fetched 261 kB in 3s (91.9 kB/s)
Reading package l

In [2]:
# Install pytest (the test runner used later in this notebook) and pytest-sugar
# to make our output look nice.
!pip install -q pytest pytest-sugar

In [2]:
# Create and navigate to the tdd directory.
from pathlib import Path
if Path.cwd().name != 'tests':
    %mkdir tests
    %cd tests
# Show the current working directory.  
%pwd

mkdir: cannot create directory ‘tests’: File exists
/content/tests


'/content/tests'

In [3]:
# Write an __init__.py file whose only content is the `pass` statement below.
# This file will be stored in our pwd (/content/tests)
%%file __init__.py
pass

Overwriting __init__.py


In [4]:
# findspark.init() runs before the pyspark imports below; keep that order.
# NOTE(review): presumably it makes the Spark install from SPARK_HOME importable —
# confirm against the findspark documentation.
import findspark
findspark.init()

# Import other dependencies. 
from pyspark import SparkFiles
from pyspark.sql import SparkSession

import time
# Build the Spark session for this notebook, or reuse one that already exists.
spark = SparkSession.builder.appName("sparkEnergyData").getOrCreate()


In [5]:
# Load organised_Gen.csv, using its first row as the column names, and preview it.
# NOTE(review): no schema is supplied, so the columns are presumably all read as
# strings — confirm with energy.printSchema().
energy = spark.read.csv("organised_Gen.csv", header=True)
energy.show()

+---+----+-----+-----+--------------------+--------------------+--------------------------+
|_c0|YEAR|MONTH|STATE|    TYPE OF PRODUCER|       ENERGY SOURCE|GENERATION (Megawatthours)|
+---+----+-----+-----+--------------------+--------------------+--------------------------+
|  0|2001|    1|   AK|Total Electric Po...|                Coal|                   46903.0|
|  1|2001|    1|   AK|Total Electric Po...|           Petroleum|                   71085.0|
|  2|2001|    1|   AK|Total Electric Po...|         Natural Gas|                  367521.0|
|  3|2001|    1|   AK|Total Electric Po...|Hydroelectric Con...|                  104549.0|
|  4|2001|    1|   AK|Total Electric Po...|                Wind|                      87.0|
|  5|2001|    1|   AK|Total Electric Po...|               Total|                  590145.0|
|  6|2001|    1|   AK|Electric Generato...|                Coal|                   18410.0|
|  7|2001|    1|   AK|Electric Generato...|           Petroleum|                

In [40]:
from pyspark.sql.types import IntegerType
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf



In [41]:
# Build the Spark session, or reuse the one that is already running.
spark = SparkSession \
    .builder \
    .appName("energy file read in") \
    .getOrCreate()

# Load the raw CSV (first row holds the column names) and rename the columns
# that contain spaces/parentheses so they are easy to reference in Spark SQL.
energy = spark.read.csv('organised_Gen.csv', header = True)
energy = energy.withColumnRenamed('TYPE OF PRODUCER', 'Producer_Type')
energy = energy.withColumnRenamed('ENERGY SOURCE', 'Energy_Source')
energy = energy.withColumnRenamed('GENERATION (Megawatthours)', 'Generated_Megawatthours')


def _month_to_season(month):
    """Map a month number to its season name.

    Args:
        month: The month as an int or a numeric string such as '1' or '01'.

    Returns:
        'Winter' (12, 1, 2), 'Spring' (3-5), 'Summer' (6-8) or 'Autumn' (9-11).
        None when the month is missing, not a number, or outside 1-12, so bad
        input shows up as a null Season instead of being counted as 'Autumn'.
    """
    try:
        month_number = int(month)
    except (TypeError, ValueError):
        return None
    if month_number in (12, 1, 2):
        return 'Winter'
    if month_number in (3, 4, 5):
        return 'Spring'
    if month_number in (6, 7, 8):
        return 'Summer'
    if month_number in (9, 10, 11):
        return 'Autumn'
    return None


# Wrap the helper as a Spark UDF and derive the Season column from MONTH.
season_udf = udf(_month_to_season, StringType())
energy = energy.withColumn("Season", season_udf(energy.MONTH))



In [42]:
# Preview the first 10 rows of the renamed DataFrame, including the Season column.
energy.show(10)

+---+----+-----+-----+--------------------+--------------------+-----------------------+------+
|_c0|YEAR|MONTH|STATE|       Producer_Type|       Energy_Source|Generated_Megawatthours|Season|
+---+----+-----+-----+--------------------+--------------------+-----------------------+------+
|  0|2001|    1|   AK|Total Electric Po...|                Coal|                46903.0|Winter|
|  1|2001|    1|   AK|Total Electric Po...|           Petroleum|                71085.0|Winter|
|  2|2001|    1|   AK|Total Electric Po...|         Natural Gas|               367521.0|Winter|
|  3|2001|    1|   AK|Total Electric Po...|Hydroelectric Con...|               104549.0|Winter|
|  4|2001|    1|   AK|Total Electric Po...|                Wind|                   87.0|Winter|
|  5|2001|    1|   AK|Total Electric Po...|               Total|               590145.0|Winter|
|  6|2001|    1|   AK|Electric Generato...|                Coal|                18410.0|Winter|
|  7|2001|    1|   AK|Electric Generato.

In [43]:
# Persist the transformed DataFrame as parquet, replacing any previous copy.
energy.write.parquet('parquet_energy', mode='overwrite')
# Read the parquet data back and expose it to Spark SQL as the view p_energy_data.
new_energy = spark.read.parquet('parquet_energy')
new_energy.createOrReplaceTempView('p_energy_data')

# Record the current wall-clock time.
start_time = time.time()




In [45]:
# Select every column of the p_energy_data view into a DataFrame and preview it.
new_df = spark.sql("""SELECT * FROM p_energy_data """)

new_df.show()


+---+----+-----+-----+--------------------+--------------------+-----------------------+------+
|_c0|YEAR|MONTH|STATE|       Producer_Type|       Energy_Source|Generated_Megawatthours|Season|
+---+----+-----+-----+--------------------+--------------------+-----------------------+------+
|  0|2001|    1|   AK|Total Electric Po...|                Coal|                46903.0|Winter|
|  1|2001|    1|   AK|Total Electric Po...|           Petroleum|                71085.0|Winter|
|  2|2001|    1|   AK|Total Electric Po...|         Natural Gas|               367521.0|Winter|
|  3|2001|    1|   AK|Total Electric Po...|Hydroelectric Con...|               104549.0|Winter|
|  4|2001|    1|   AK|Total Electric Po...|                Wind|                   87.0|Winter|
|  5|2001|    1|   AK|Total Electric Po...|               Total|               590145.0|Winter|
|  6|2001|    1|   AK|Electric Generato...|                Coal|                18410.0|Winter|
|  7|2001|    1|   AK|Electric Generato.

In [55]:
# Start the timer in this cell so the elapsed time printed below covers only
# this query and its display, not other cells or idle time between executions.
start_time = time.time()

# Sum generation per energy source and season. The WHERE clause drops the
# 'Total Electric Power Industry' producer rows, the 'Total' energy source rows
# and the 'US-TOTAL' state rows (presumably roll-ups that would double count —
# confirm against the dataset description).
# NOTE(review): Generated_Megawatthours was read from CSV without a schema, so
# sum() presumably relies on Spark casting the strings to numbers — confirm.
p_sql = spark.sql("""SELECT Energy_Source, Season, sum(Generated_Megawatthours)
            FROM p_energy_data 
            WHERE Producer_Type != 'Total Electric Power Industry' AND Energy_Source != 'Total' AND State != 'US-TOTAL'
            GROUP BY Energy_Source, Season
            """)

# Display up to 52 rows of the aggregated result.
p_sql.show(52)

print("--- %s seconds ---" % (time.time() - start_time))

+--------------------+------+----------------------------+
|       Energy_Source|Season|sum(Generated_Megawatthours)|
+--------------------+------+----------------------------+
|          Geothermal|Autumn|               7.958760456E7|
|Hydroelectric Con...|Winter|        1.4710756455000005E9|
|             Nuclear|Autumn|               3.980135663E9|
|                Wind|Autumn|         7.274587293499999E8|
|          Geothermal|Summer|         8.100923721999997E7|
|             Nuclear|Spring|               4.078164619E9|
|Wood and Wood Der...|Spring|         2.022933708599999E8|
|                Wind|Winter|         8.286922304200002E8|
|Solar Thermal and...|Autumn|              1.1993666909E8|
|Solar Thermal and...|Summer|              1.5974313533E8|
|          Geothermal|Spring|         8.393710702000001E7|
|         Other Gases|Spring|         6.800739398000002E7|
|Hydroelectric Con...|Summer|        1.5290877372800012E9|
|         Natural Gas|Autumn|         5.555902655870007E

In [53]:
# Create a test_total_energy.py file and write the test functions to it. 
# This file will be stored in our pwd (/content/tests).
# NOTE(review): the import in this file requires a total_energy.py module that is
# importable from this directory; the recorded pytest run fails with
# ModuleNotFoundError, so that module still has to be written.
%%file test_total_energy.py

# From the total_energy.py file import the functions under test. 
from total_energy import (import_data, transform_data, query_data) # NOTE(review): query_data is imported but no test below uses it

# Write the tests. 
# import_data() is expected to return a DataFrame with exactly 52 rows.
def test_row_count_before_transform():
  df = import_data()
  assert df.count() == 52

# import_data() is expected to return a DataFrame with exactly 3 columns.
def test_column_count_before_transform():
  df = import_data()
  assert len(df.columns) == 3

# NOTE(review): only checks that the row count differs from 52, not a specific value.
def test_row_count_after_transform():
    df = transform_data()
    assert df.count() != 52

# NOTE(review): only checks that the column count differs from 3, not a specific value.
def test_column_count_after_transform():
    df = transform_data()
    assert len(df.columns) != 3



Overwriting test_total_energy.py


In [54]:
# Run the test_total_energy.py file with pytest. 
!python -m pytest test_total_energy.py

[1mTest session starts (platform: linux, Python 3.7.14, pytest 3.6.4, pytest-sugar 0.9.5)[0m
rootdir: /content/tests, inifile:
plugins: typeguard-2.7.1, sugar-0.9.5

―――――――――――――――――――― ERROR collecting test_total_energy.py ―――――――――――――――――――――
[31mImportError while importing test module '/content/tests/test_total_energy.py'.
Hint: make sure your test modules/packages have valid Python names.
Traceback:
test_total_energy.py:3: in <module>
    from total_energy import (import_data, transform_data, query_data) #transform_data_full, distinct_zip_codes
E   ModuleNotFoundError: No module named 'total_energy'[0m

!!!!!!!!!!!!!!!!!!! Interrupted: 1 errors during collection !!!!!!!!!!!!!!!!!!!!

Results (0.08s):
