In [10]:
from pyspark.sql import SparkSession
from utils import run_spark_sql,filter_df_by_threshold,count_nulls_by_country

# Spark session & context
spark = SparkSession.builder \
    .appName("world-energy-stats") \
    .master("spark://spark-master:7077")\
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .enableHiveSupport() \
    .getOrCreate()

#Local Development
# spark = SparkSession.builder.appName("world-energy-stats").master("local").getOrCreate()

In [2]:
df = (spark.read
  .format("csv")
  .option("header", "true")
  .option("inferSchema", "true")
   .load("hdfs://namenode:9000/energy-data/owid-energy-data.csv"))

#Local Development
# df = spark.read.csv("owid-energy-data.csv", header=True, inferSchema=True)

In [3]:
#DROPPING REGIONS (FOR NOW)
df = df.filter(df['iso_code'].isNotNull())

In [4]:
df = df[df['year'] >=1990]

# Drop 2022 as well.
# df = df[df['year'] >=1990]

#40 years of data
grouped_df = df.groupBy("year").count().orderBy("year")
grouped_df.show(40)

+----+-----+
|year|count|
+----+-----+
|1990|  208|
|1991|  208|
|1992|  214|
|1993|  214|
|1994|  215|
|1995|  215|
|1996|  215|
|1997|  216|
|1998|  216|
|1999|  216|
|2000|  216|
|2001|  216|
|2002|  216|
|2003|  217|
|2004|  217|
|2005|  218|
|2006|  218|
|2007|  218|
|2008|  218|
|2009|  218|
|2010|  218|
|2011|  218|
|2012|  219|
|2013|  219|
|2014|  219|
|2015|  219|
|2016|  219|
|2017|  219|
|2018|  219|
|2019|  219|
|2020|  219|
|2021|  219|
|2022|  108|
+----+-----+



In [5]:
# Dropping irrelevant columns
cols_to_drop = [col for col in df.columns if '_per_gdp' if '_per_capita' in col or '_change_pct' in col or '_change_twh' in col]
df = df.drop(*cols_to_drop)
# per_capita_electricity

# Show the updated DataFrame
df.head(n=1)

[Row(country='Afghanistan', year=1990, iso_code='AFG', population=10694804, gdp=13065984000.0, biofuel_consumption=None, biofuel_electricity=None, biofuel_share_elec=None, biofuel_share_energy=None, carbon_intensity_elec=None, coal_consumption=None, coal_electricity=None, coal_production=0.61, coal_share_elec=None, coal_share_energy=None, electricity_demand=None, electricity_generation=None, electricity_share_energy=None, energy_per_gdp=2.43, fossil_electricity=None, fossil_fuel_consumption=None, fossil_share_elec=None, fossil_share_energy=None, gas_consumption=None, gas_electricity=None, gas_production=31.941, gas_share_elec=None, gas_share_energy=None, greenhouse_gas_emissions=None, hydro_consumption=None, hydro_electricity=None, hydro_share_elec=None, hydro_share_energy=None, low_carbon_consumption=None, low_carbon_electricity=None, low_carbon_share_elec=None, low_carbon_share_energy=None, net_elec_imports=None, net_elec_imports_share_demand=None, nuclear_consumption=None, nuclear_e

In [6]:
from pyspark.sql import Window
from pyspark.sql.functions import last, first

temp_column = [column for column in df.columns if 'year' not in column]
temp_column = [column for column in temp_column if 'country' not in column]
temp_column

import pyspark.sql.functions as F

# Define the windows for forward fill and backward fill
ffill_window = "(partition by country order by year rows between unbounded preceding and current row)"
# bfill_window = "(partition by country order by year rows between current row and unbounded following)"

for col in temp_column:
    df = (df.withColumn(col, F.expr(f"case when isnan({col}) then null else {col} end"))
    .withColumn(col, F.expr(f"coalesce({col}, last({col}, true) over {ffill_window})")))
    # .withColumn(col, F.expr(f"coalesce({col}, first({col}, true) over {bfill_window})")))

In [7]:
### LEVEL 1 CATEGORIZATION FOR BACKFILLING AND LOGICAL SEPARATION

# Primary Key Columns
primary_keys = ['country', 'year', 'iso_code']

# 1. General Information
df_general = df[primary_keys + ['population', 'gdp', 'electricity_demand', 'electricity_generation', 'primary_energy_consumption']]

# 2. Biofuel
df_biofuel = df[primary_keys + ['biofuel_consumption', 'biofuel_electricity', 'biofuel_share_elec', 'biofuel_share_energy']]

# 3. Coal
df_coal = df[primary_keys + ['coal_consumption', 'coal_electricity', 'coal_production', 'coal_share_elec', 'coal_share_energy']]

# 4. Gas
df_gas = df[primary_keys + ['gas_consumption', 'gas_electricity', 'gas_production', 'gas_share_elec', 'gas_share_energy']]

# 5. Oil
df_oil = df[primary_keys + ['oil_consumption', 'oil_electricity', 'oil_production', 'oil_share_elec', 'oil_share_energy']]

# 6. Fossil Fuels (Aggregate)
df_fossil = df[primary_keys + ['fossil_electricity', 'fossil_fuel_consumption', 'fossil_share_elec', 'fossil_share_energy', 'carbon_intensity_elec']]

# 7. Greenhouse Gas
df_greenhouse_gas = df[primary_keys + ['greenhouse_gas_emissions']]

# 8. Hydro
df_hydro = df[primary_keys + ['hydro_consumption', 'hydro_electricity', 'hydro_share_elec', 'hydro_share_energy']]

# 9. Nuclear
df_nuclear = df[primary_keys + ['nuclear_consumption', 'nuclear_electricity', 'nuclear_share_elec', 'nuclear_share_energy']]

# 10. Renewables (Aggregate)
df_renewables = df[primary_keys + ['renewables_consumption', 'renewables_electricity', 'renewables_share_elec', 'renewables_share_energy']]

# 11. Solar
df_solar = df[primary_keys + ['solar_consumption', 'solar_electricity', 'solar_share_elec', 'solar_share_energy']]

# 12. Wind
df_wind = df[primary_keys + ['wind_consumption', 'wind_electricity', 'wind_share_elec', 'wind_share_energy']]

# 13. Other Renewables
df_other_renewables = df[primary_keys + ['other_renewable_consumption', 'other_renewable_electricity', 'other_renewable_exc_biofuel_electricity', 'other_renewables_share_elec', 'other_renewables_share_elec_exc_biofuel', 'other_renewables_share_energy']]

# 14. Low Carbon
df_low_carbon = df[primary_keys + ['low_carbon_consumption', 'low_carbon_electricity', 'low_carbon_share_elec', 'low_carbon_share_energy']]

# 15. Electricity Imports
df_electricity_imports = df[primary_keys + ['net_elec_imports', 'net_elec_imports_share_demand']]


In [8]:
# Usage example:
null_counts_fossil = count_nulls_by_country(df_fossil)

# # Show the results
null_counts_fossil.show(n=5)

NameError: name 'count_nulls_by_country' is not defined

In [11]:
# Calling the filter function on each dataframe
filtered_df_general = filter_df_by_threshold(df_general, 0)
filtered_df_biofuel = filter_df_by_threshold(df_biofuel, 0)
filtered_df_coal = filter_df_by_threshold(df_coal, 0)
filtered_df_gas = filter_df_by_threshold(df_gas, 0)
filtered_df_oil = filter_df_by_threshold(df_oil, 0)
filtered_df_fossil = filter_df_by_threshold(df_fossil, 0)
filtered_df_greenhouse_gas = filter_df_by_threshold(df_greenhouse_gas, 0)
filtered_df_hydro = filter_df_by_threshold(df_hydro, 0)
filtered_df_nuclear = filter_df_by_threshold(df_nuclear, 0)
filtered_df_renewables = filter_df_by_threshold(df_renewables, 0)
filtered_df_solar = filter_df_by_threshold(df_solar, 0)
filtered_df_wind = filter_df_by_threshold(df_wind, 0)
filtered_df_other_renewables = filter_df_by_threshold(df_other_renewables, 0)
filtered_df_low_carbon = filter_df_by_threshold(df_low_carbon, 0)
filtered_df_electricity_imports = filter_df_by_threshold(df_electricity_imports, 0)

{'Original number of rows': 7043, 'Number of rows after filtering': 7043, 'Number of rows dropped': 0, 'Dropped countries': []}
{'Original number of rows': 7043, 'Number of rows after filtering': 6754, 'Number of rows dropped': 289, 'Dropped countries': ['Chile', 'Antarctica', 'Gibraltar', 'Bermuda', 'Northern Mariana Islands', 'Saint Helena', 'Tuvalu', 'Netherlands Antilles', 'Micronesia (country)']}
{'Original number of rows': 7043, 'Number of rows after filtering': 6947, 'Number of rows dropped': 96, 'Dropped countries': ['Antarctica', 'Tuvalu', 'Micronesia (country)']}
{'Original number of rows': 7043, 'Number of rows after filtering': 6947, 'Number of rows dropped': 96, 'Dropped countries': ['Tuvalu', 'Micronesia (country)', 'Antarctica']}
{'Original number of rows': 7043, 'Number of rows after filtering': 6947, 'Number of rows dropped': 96, 'Dropped countries': ['Tuvalu', 'Micronesia (country)', 'Antarctica']}
{'Original number of rows': 7043, 'Number of rows after filtering': 68

In [None]:
filtered_df_ren = filter_df_by_threshold(df_renewables, 0)

# Assuming df is your DataFrame
null_counts_ren = count_nulls_by_country(filtered_df_ren)

# Show the results
null_counts_ren.show(n=10)

In [12]:
# Define the folder path for saving the CSV files
folder_path = './clean/'

# Define file paths for each dataframe within the "clean" folder
filtered_df_general.toPandas().to_csv(folder_path + 'general.csv', index=False)
filtered_df_biofuel.toPandas().to_csv(folder_path + 'biofuel.csv', index=False)
filtered_df_coal.toPandas().to_csv(folder_path + 'coal.csv', index=False)
filtered_df_gas.toPandas().to_csv(folder_path + 'gas.csv', index=False)
filtered_df_oil.toPandas().to_csv(folder_path + 'oil.csv', index=False)
filtered_df_fossil.toPandas().to_csv(folder_path + 'fossil.csv', index=False)
filtered_df_greenhouse_gas.toPandas().to_csv(folder_path + 'greenhouse_gas.csv', index=False)
filtered_df_hydro.toPandas().to_csv(folder_path + 'hydro.csv', index=False)
filtered_df_nuclear.toPandas().to_csv(folder_path + 'nuclear.csv', index=False)
filtered_df_renewables.toPandas().to_csv(folder_path + 'renewables.csv', index=False)
filtered_df_solar.toPandas().to_csv(folder_path + 'solar.csv', index=False)
filtered_df_wind.toPandas().to_csv(folder_path + 'wind.csv', index=False)
filtered_df_other_renewables.toPandas().to_csv(folder_path + 'other_renewables.csv', index=False)
filtered_df_low_carbon.toPandas().to_csv(folder_path + 'low_carbon.csv', index=False)
filtered_df_electricity_imports.toPandas().to_csv(folder_path + 'electricity_imports.csv', index=False)

PermissionError: [Errno 13] Permission denied: './clean/general.csv'

In [13]:
# save to hive tables.
filtered_df_general.write.mode("overwrite").saveAsTable("wes.general")
filtered_df_biofuel.write.mode("overwrite").saveAsTable("wes.biofuel")
filtered_df_coal.write.mode("overwrite").saveAsTable("wes.coal")
filtered_df_gas.write.mode("overwrite").saveAsTable("wes.gas")
filtered_df_oil.write.mode("overwrite").saveAsTable("wes.oil")
filtered_df_fossil.write.mode("overwrite").saveAsTable("wes.fossil")
filtered_df_greenhouse_gas.write.mode("overwrite").saveAsTable("wes.greenhouse_gas")
filtered_df_hydro.write.mode("overwrite").saveAsTable("wes.hydro")
filtered_df_nuclear.write.mode("overwrite").saveAsTable("wes.nuclear")
filtered_df_renewables.write.mode("overwrite").saveAsTable("wes.renewables")
filtered_df_solar.write.mode("overwrite").saveAsTable("wes.solar")
filtered_df_wind.write.mode("overwrite").saveAsTable("wes.wind")
filtered_df_other_renewables.write.mode("overwrite").saveAsTable("wes.other_renewables")
filtered_df_low_carbon.write.mode("overwrite").saveAsTable("wes.low_carbon")
filtered_df_electricity_imports.write.mode("overwrite").saveAsTable("wes.electricity_imports")


In [14]:
run_spark_sql("combined_energy_data.sql")

++
||
++
++



In [18]:
run_spark_sql("energy_breakdown_top10.sql","insight-3.csv")

+----+---------------------+--------------------+------------------------+----------------------+------------------------+--------------------+----------------------+---------------------+
|year|sum(coal_consumption)|sum(gas_consumption)|sum(biofuel_consumption)|sum(hydro_consumption)|sum(nuclear_consumption)|sum(oil_consumption)|sum(solar_consumption)|sum(wind_consumption)|
+----+---------------------+--------------------+------------------------+----------------------+------------------------+--------------------+----------------------+---------------------+
|1990|            18022.183|           11651.397|       90.53500000000001|              3758.196|                3423.739|           21585.775|                 1.111|                8.641|
|2003|            24202.481|  14338.896999999999|                 158.912|     4482.544999999999|                4531.455|  25861.610999999997|    5.7669999999999995|               103.34|
|2007|   32248.589999999997|  16171.619999999997|      