In [1]:
from pyspark.sql import SparkSession

# Spark session & context.
# Local development uses the "local" master; to target the cluster instead,
# replace it with "spark://spark-master:7077".
spark = (
    SparkSession.builder
    .appName("world-energy-stats")
    .master("local")
    .getOrCreate()
)
sc = spark.sparkContext

# Smoke test: push the integers 0..100 through an RDD and add them up.
rdd = sc.parallelize(range(101))
rdd.sum()  # expected result: 5050

5050

In [2]:
# Load the energy CSV from HDFS: the first line provides the column names and
# Spark infers each column's type from the data.
csv_reader = spark.read.format("csv").options(header="true", inferSchema="true")
df = csv_reader.load("hdfs://namenode:9000/energy-data/owid-energy-data.csv")

In [3]:
# Display the first Row of the freshly loaded DataFrame.
df.first()

Row(country='ASEAN (Ember)', year=2000, iso_code=None, population=None, gdp=None, biofuel_cons_change_pct=None, biofuel_cons_change_twh=None, biofuel_cons_per_capita=None, biofuel_consumption=None, biofuel_elec_per_capita=None, biofuel_electricity=5.6, biofuel_share_elec=1.519, biofuel_share_energy=None, carbon_intensity_elec=500.231, coal_cons_change_pct=None, coal_cons_change_twh=None, coal_cons_per_capita=None, coal_consumption=None, coal_elec_per_capita=None, coal_electricity=71.03, coal_prod_change_pct=None, coal_prod_change_twh=None, coal_prod_per_capita=None, coal_production=None, coal_share_elec=19.268, coal_share_energy=None, electricity_demand=368.65, electricity_generation=368.65, electricity_share_energy=None, energy_cons_change_pct=None, energy_cons_change_twh=None, energy_per_capita=None, energy_per_gdp=None, fossil_cons_change_pct=None, fossil_cons_change_twh=None, fossil_elec_per_capita=None, fossil_electricity=295.75, fossil_energy_per_capita=None, fossil_fuel_consumpt

In [4]:
# Keep only observations from 1990 onward.
df = df.filter(df['year'] >= 1990)

# Row count per year, oldest first. Ask for up to 40 rows so the listing is
# not truncated at show()'s default of 20.
rows_per_year = df.groupBy("year").count()
grouped_df = rows_per_year.orderBy("year")
grouped_df.show(40)


+----+-----+
|year|count|
+----+-----+
|1990|  281|
|1991|  279|
|1992|  285|
|1993|  284|
|1994|  285|
|1995|  285|
|1996|  285|
|1997|  286|
|1998|  286|
|1999|  286|
|2000|  298|
|2001|  298|
|2002|  298|
|2003|  299|
|2004|  299|
|2005|  300|
|2006|  299|
|2007|  299|
|2008|  299|
|2009|  299|
|2010|  299|
|2011|  299|
|2012|  300|
|2013|  300|
|2014|  300|
|2015|  300|
|2016|  300|
|2017|  286|
|2018|  286|
|2019|  286|
|2020|  285|
|2021|  284|
|2022|  146|
+----+-----+



In [6]:
# Drop columns deemed irrelevant here: any column whose name contains one of
# these markers (per-capita and change pct/TWh variants).
irrelevant_markers = ('_per_capita', '_change_pct', '_change_twh')
cols_to_drop = [
    name
    for name in df.columns
    if any(marker in name for marker in irrelevant_markers)
]
df = df.drop(*cols_to_drop)

# Preview one row of the slimmed-down DataFrame
df.show(1)

+-------------+----+--------+----------+----+-------------------+-------------------+------------------+--------------------+---------------------+----------------+----------------+---------------+---------------+-----------------+------------------+----------------------+------------------------+--------------+------------------+-----------------------+-----------------+-------------------+---------------+---------------+--------------+--------------+----------------+------------------------+-----------------+-----------------+----------------+------------------+----------------------+----------------------+---------------------+-----------------------+----------------+-----------------------------+-------------------+-------------------+------------------+--------------------+---------------+---------------+--------------+--------------+----------------+---------------------------+---------------------------+---------------------------------------+---------------------------+--------

In [8]:
# Summary statistics for the remaining columns.
summary_stats = df.describe()
summary_stats.show()

+-------+-------------+------------------+--------+-------------------+--------------------+-------------------+-------------------+------------------+--------------------+---------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+----------------------+------------------------+------------------+------------------+-----------------------+------------------+-------------------+------------------+-----------------+------------------+-----------------+-----------------+------------------------+------------------+------------------+------------------+------------------+----------------------+----------------------+---------------------+-----------------------+-------------------+-----------------------------+-------------------+-------------------+------------------+--------------------+------------------+------------------+------------------+------------------+------------------+---------------------------+-------

In [11]:
from pyspark.sql.functions import col, count, when

# Missing-value tally per country. `when` produces a non-null marker only for
# null cells and `count` skips nulls, so each aggregate equals the number of
# nulls in that column for the country.
null_count_exprs = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in df.columns
    if name != 'country'
]
null_counts = df.groupBy("country").agg(*null_count_exprs)

# Display the per-country null counts
null_counts.show()


+-------------------+----+--------+----------+---+-------------------+-------------------+------------------+--------------------+---------------------+----------------+----------------+---------------+---------------+-----------------+------------------+----------------------+------------------------+--------------+------------------+-----------------------+-----------------+-------------------+---------------+---------------+--------------+--------------+----------------+------------------------+-----------------+-----------------+----------------+------------------+----------------------+----------------------+---------------------+-----------------------+----------------+-----------------------------+-------------------+-------------------+------------------+--------------------+---------------+---------------+--------------+--------------+----------------+---------------------------+---------------------------+---------------------------------------+---------------------------+---

['country',
 'year',
 'iso_code',
 'population',
 'gdp',
 'biofuel_consumption',
 'biofuel_electricity',
 'biofuel_share_elec',
 'biofuel_share_energy',
 'carbon_intensity_elec',
 'coal_consumption',
 'coal_electricity',
 'coal_production',
 'coal_share_elec',
 'coal_share_energy',
 'electricity_demand',
 'electricity_generation',
 'electricity_share_energy',
 'energy_per_gdp',
 'fossil_electricity',
 'fossil_fuel_consumption',
 'fossil_share_elec',
 'fossil_share_energy',
 'gas_consumption',
 'gas_electricity',
 'gas_production',
 'gas_share_elec',
 'gas_share_energy',
 'greenhouse_gas_emissions',
 'hydro_consumption',
 'hydro_electricity',
 'hydro_share_elec',
 'hydro_share_energy',
 'low_carbon_consumption',
 'low_carbon_electricity',
 'low_carbon_share_elec',
 'low_carbon_share_energy',
 'net_elec_imports',
 'net_elec_imports_share_demand',
 'nuclear_consumption',
 'nuclear_electricity',
 'nuclear_share_elec',
 'nuclear_share_energy',
 'oil_consumption',
 'oil_electricity',
 'oil_p

In [12]:
### LEVEL 1 CATEGORIZATION FOR BACKFILLING AND LOGICAL SEPARATION
# Each frame below is a projection of `df` onto the key columns plus the
# measures belonging to one category.

# Primary Key Columns
primary_keys = ['country', 'year', 'iso_code']

# 1. General Information
df_general = df.select(
    *primary_keys,
    'population',
    'gdp',
    'electricity_demand',
    'electricity_generation',
    'energy_per_gdp',
    'primary_energy_consumption',
    'per_capita_electricity',
)

# 2. Biofuel
df_biofuel = df.select(
    *primary_keys,
    'biofuel_consumption',
    'biofuel_electricity',
    'biofuel_share_elec',
    'biofuel_share_energy',
)

# 3. Coal
df_coal = df.select(
    *primary_keys,
    'coal_consumption',
    'coal_electricity',
    'coal_production',
    'coal_share_elec',
    'coal_share_energy',
)

# 4. Gas
df_gas = df.select(
    *primary_keys,
    'gas_consumption',
    'gas_electricity',
    'gas_production',
    'gas_share_elec',
    'gas_share_energy',
)

# 5. Oil
df_oil = df.select(
    *primary_keys,
    'oil_consumption',
    'oil_electricity',
    'oil_production',
    'oil_share_elec',
    'oil_share_energy',
)

# 6. Fossil Fuels (Aggregate)
df_fossil = df.select(
    *primary_keys,
    'fossil_electricity',
    'fossil_fuel_consumption',
    'fossil_share_elec',
    'fossil_share_energy',
    'carbon_intensity_elec',
)

# 7. Greenhouse Gas
df_greenhouse_gas = df.select(*primary_keys, 'greenhouse_gas_emissions')

# 8. Hydro
df_hydro = df.select(
    *primary_keys,
    'hydro_consumption',
    'hydro_electricity',
    'hydro_share_elec',
    'hydro_share_energy',
)

# 9. Nuclear
df_nuclear = df.select(
    *primary_keys,
    'nuclear_consumption',
    'nuclear_electricity',
    'nuclear_share_elec',
    'nuclear_share_energy',
)

# 10. Renewables (Aggregate)
df_renewables = df.select(
    *primary_keys,
    'renewables_consumption',
    'renewables_electricity',
    'renewables_share_elec',
    'renewables_share_energy',
)

# 11. Solar
df_solar = df.select(
    *primary_keys,
    'solar_consumption',
    'solar_electricity',
    'solar_share_elec',
    'solar_share_energy',
)

# 12. Wind
df_wind = df.select(
    *primary_keys,
    'wind_consumption',
    'wind_electricity',
    'wind_share_elec',
    'wind_share_energy',
)

# 13. Other Renewables
df_other_renewables = df.select(
    *primary_keys,
    'other_renewable_consumption',
    'other_renewable_electricity',
    'other_renewable_exc_biofuel_electricity',
    'other_renewables_share_elec',
    'other_renewables_share_elec_exc_biofuel',
    'other_renewables_share_energy',
)

# 14. Low Carbon
df_low_carbon = df.select(
    *primary_keys,
    'low_carbon_consumption',
    'low_carbon_electricity',
    'low_carbon_share_elec',
    'low_carbon_share_energy',
)

# 15. Electricity Imports
df_electricity_imports = df.select(
    *primary_keys,
    'net_elec_imports',
    'net_elec_imports_share_demand',
)


In [18]:
# Missing-value tally per country for the fossil-fuel subset: `count` only
# counts the non-null markers that `when` emits for null cells.
fossil_null_exprs = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in df_fossil.columns
    if name != 'country'
]
null_counts = df_fossil.groupBy("country").agg(*fossil_null_exprs)

# Print up to 300 rows of per-country null counts
null_counts.show(n=300)

# Draft (not executed): exclude countries with too many missing 'gdp' values.
# NOTE(review): df_fossil has no 'gdp' column, so this would need null counts
# computed on a frame that includes it before it can run.
#
# threshold = 30  # maximum allowable null values, for example
# countries_to_exclude = null_counts.filter(null_counts['gdp'] > threshold).select('country').rdd.flatMap(lambda x: x).collect()
# df_filtered = df.filter(~df['country'].isin(countries_to_exclude))

+--------------------+----+--------+------------------+-----------------------+-----------------+-------------------+---------------------+
|             country|year|iso_code|fossil_electricity|fossil_fuel_consumption|fossil_share_elec|fossil_share_energy|carbon_intensity_elec|
+--------------------+----+--------+------------------+-----------------------+-----------------+-------------------+---------------------+
|                Chad|   0|       0|                11|                     33|               11|                 33|                   11|
|   Wake Island (EIA)|   0|      32|                32|                     32|               32|                 32|                   32|
|            Paraguay|   0|       0|                10|                     32|               10|                 32|                   10|
|              Russia|   0|       0|                 0|                      0|                0|                  0|                   10|
|               Maca

In [21]:
from pyspark.sql import functions as F

# Key columns identify a row and are excluded from the null check.
primary_keys = ['country', 'year', 'iso_code']

# Every remaining (measure) column of the fossil subset is checked.
columns_to_check = [name for name in df_fossil.columns if name not in primary_keys]

# A row is discarded only when *all* measure columns are null, so the cut-off
# is the number of measure columns.
threshold = len(columns_to_check)

# Per-row null tally: each isNull() flag is cast to 1/0 and the flags are added.
null_count = sum(F.col(name).isNull().cast("int") for name in columns_to_check)

# Keep rows that have at least one non-null measure.
df_fossil_filtered = df_fossil.where(null_count < threshold)

# Report how many rows the filter removed.
original_row_count = df_fossil.count()
filtered_row_count = df_fossil_filtered.count()
rows_dropped = original_row_count - filtered_row_count

print(f"Original number of rows: {original_row_count}")
print(f"Number of rows after filtering: {filtered_row_count}")
print(f"Number of rows dropped: {rows_dropped}")


Original number of rows: 9501
Number of rows after filtering: 6629
Number of rows dropped: 2872


In [23]:
# Re-run the per-country missing-value tally on the filtered fossil subset:
# `count` only counts the non-null markers that `when` emits for null cells.
filtered_null_exprs = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in df_fossil_filtered.columns
    if name != 'country'
]
null_counts = df_fossil_filtered.groupBy("country").agg(*filtered_null_exprs)

# Print up to 300 rows of per-country null counts
null_counts.show(n=300)

+--------------------+----+--------+------------------+-----------------------+-----------------+-------------------+---------------------+
|             country|year|iso_code|fossil_electricity|fossil_fuel_consumption|fossil_share_elec|fossil_share_energy|carbon_intensity_elec|
+--------------------+----+--------+------------------+-----------------------+-----------------+-------------------+---------------------+
|                Chad|   0|       0|                 0|                     22|                0|                 22|                    0|
|            Paraguay|   0|       0|                 0|                     22|                0|                 22|                    0|
|              Russia|   0|       0|                 0|                      0|                0|                  0|                   10|
|               Macao|   0|       0|                 0|                     22|                0|                 22|                    0|
|               Worl