In [1]:
import os

from pyspark.sql import SparkSession

# Spark master URL. Local mode stays the default for development; to run on
# the cluster, set the environment variable before starting the kernel
# instead of editing this cell, e.g.
#   ENERGY_STATS_SPARK_MASTER=spark://spark-master:7077
SPARK_MASTER = os.environ.get("ENERGY_STATS_SPARK_MASTER", "local")

# Spark session & context
spark = (
    SparkSession.builder
    .appName("world-energy-stats")
    .master(SPARK_MASTER)
    .getOrCreate()
)

sc = spark.sparkContext

# Smoke test: the integers 0 through 100 must add up to 5050. Asserting the
# value makes a broken session fail here rather than in a later cell.
rdd = sc.parallelize(range(100 + 1))
total = rdd.sum()
assert total == 5050, "Spark smoke test failed: expected 5050, got %r" % (total,)
total

5050

In [2]:
# Load the OWID energy CSV from HDFS: the first line supplies the column
# names and Spark infers each column's type from the data.
df = spark.read.csv(
    "hdfs://namenode:9000/energy-data/owid-energy-data.csv",
    header=True,
    inferSchema=True,
)

In [3]:
# Inspect the first row of the freshly loaded DataFrame
first_row = df.head()
first_row

Row(country='ASEAN (Ember)', year=2000, iso_code=None, population=None, gdp=None, biofuel_cons_change_pct=None, biofuel_cons_change_twh=None, biofuel_cons_per_capita=None, biofuel_consumption=None, biofuel_elec_per_capita=None, biofuel_electricity=5.6, biofuel_share_elec=1.519, biofuel_share_energy=None, carbon_intensity_elec=500.231, coal_cons_change_pct=None, coal_cons_change_twh=None, coal_cons_per_capita=None, coal_consumption=None, coal_elec_per_capita=None, coal_electricity=71.03, coal_prod_change_pct=None, coal_prod_change_twh=None, coal_prod_per_capita=None, coal_production=None, coal_share_elec=19.268, coal_share_energy=None, electricity_demand=368.65, electricity_generation=368.65, electricity_share_energy=None, energy_cons_change_pct=None, energy_cons_change_twh=None, energy_per_capita=None, energy_per_gdp=None, fossil_cons_change_pct=None, fossil_cons_change_twh=None, fossil_elec_per_capita=None, fossil_electricity=295.75, fossil_energy_per_capita=None, fossil_fuel_consumpt

In [4]:
# Drop regions (for now): only rows that carry an ISO code are kept, which
# removes aggregate entries such as 'ASEAN (Ember)'.
df = df.filter("iso_code IS NOT NULL")

In [5]:
# Restrict the data to 1990 onward
df = df.filter(df['year'] >= 1990)

# TODO: 2022 has far fewer rows than the other years and should be dropped
# as well, e.g.
# df = df.filter(df['year'] < 2022)

# Number of rows per year in chronological order; 40 rows is enough to
# print every remaining year.
rows_per_year = df.groupBy("year").count()
grouped_df = rows_per_year.orderBy("year")
grouped_df.show(40)

+----+-----+
|year|count|
+----+-----+
|1990|  208|
|1991|  208|
|1992|  214|
|1993|  214|
|1994|  215|
|1995|  215|
|1996|  215|
|1997|  216|
|1998|  216|
|1999|  216|
|2000|  216|
|2001|  216|
|2002|  216|
|2003|  217|
|2004|  217|
|2005|  218|
|2006|  218|
|2007|  218|
|2008|  218|
|2009|  218|
|2010|  218|
|2011|  218|
|2012|  219|
|2013|  219|
|2014|  219|
|2015|  219|
|2016|  219|
|2017|  219|
|2018|  219|
|2019|  219|
|2020|  219|
|2021|  219|
|2022|  108|
+----+-----+



In [6]:
# Per-capita and change (pct / TWh) columns are not needed: drop every column
# whose name contains one of these markers.
derived_markers = ('_per_capita', '_change_pct', '_change_twh')
cols_to_drop = [
    name
    for name in df.columns
    if any(marker in name for marker in derived_markers)
]
df = df.drop(*cols_to_drop)

# First row of the trimmed DataFrame
df.head(n=1)

[Row(country='Afghanistan', year=1990, iso_code='AFG', population=10694804, gdp=13065984000.0, biofuel_consumption=None, biofuel_electricity=None, biofuel_share_elec=None, biofuel_share_energy=None, carbon_intensity_elec=None, coal_consumption=None, coal_electricity=None, coal_production=0.61, coal_share_elec=None, coal_share_energy=None, electricity_demand=None, electricity_generation=None, electricity_share_energy=None, energy_per_gdp=2.43, fossil_electricity=None, fossil_fuel_consumption=None, fossil_share_elec=None, fossil_share_energy=None, gas_consumption=None, gas_electricity=None, gas_production=31.941, gas_share_elec=None, gas_share_energy=None, greenhouse_gas_emissions=None, hydro_consumption=None, hydro_electricity=None, hydro_share_elec=None, hydro_share_energy=None, low_carbon_consumption=None, low_carbon_electricity=None, low_carbon_share_elec=None, low_carbon_share_energy=None, net_elec_imports=None, net_elec_imports_share_demand=None, nuclear_consumption=None, nuclear_e

In [7]:
# df.describe().show()

In [8]:
### LEVEL 1 CATEGORIZATION FOR BACKFILLING AND LOGICAL SEPARATION
# Every category frame below consists of the key columns followed by the
# columns that belong to that category.

# Key columns shared by all category frames
primary_keys = ['country', 'year', 'iso_code']

# 1. General Information
df_general = df.select(
    *primary_keys,
    'population',
    'gdp',
    'electricity_demand',
    'electricity_generation',
    'energy_per_gdp',
    'primary_energy_consumption',
    'per_capita_electricity',
)

# 2. Biofuel
df_biofuel = df.select(
    *primary_keys,
    'biofuel_consumption',
    'biofuel_electricity',
    'biofuel_share_elec',
    'biofuel_share_energy',
)

# 3. Coal
df_coal = df.select(
    *primary_keys,
    'coal_consumption',
    'coal_electricity',
    'coal_production',
    'coal_share_elec',
    'coal_share_energy',
)

# 4. Gas
df_gas = df.select(
    *primary_keys,
    'gas_consumption',
    'gas_electricity',
    'gas_production',
    'gas_share_elec',
    'gas_share_energy',
)

# 5. Oil
df_oil = df.select(
    *primary_keys,
    'oil_consumption',
    'oil_electricity',
    'oil_production',
    'oil_share_elec',
    'oil_share_energy',
)

# 6. Fossil Fuels (Aggregate)
df_fossil = df.select(
    *primary_keys,
    'fossil_electricity',
    'fossil_fuel_consumption',
    'fossil_share_elec',
    'fossil_share_energy',
    'carbon_intensity_elec',
)

# 7. Greenhouse Gas
df_greenhouse_gas = df.select(
    *primary_keys,
    'greenhouse_gas_emissions',
)

# 8. Hydro
df_hydro = df.select(
    *primary_keys,
    'hydro_consumption',
    'hydro_electricity',
    'hydro_share_elec',
    'hydro_share_energy',
)

# 9. Nuclear
df_nuclear = df.select(
    *primary_keys,
    'nuclear_consumption',
    'nuclear_electricity',
    'nuclear_share_elec',
    'nuclear_share_energy',
)

# 10. Renewables (Aggregate)
df_renewables = df.select(
    *primary_keys,
    'renewables_consumption',
    'renewables_electricity',
    'renewables_share_elec',
    'renewables_share_energy',
)

# 11. Solar
df_solar = df.select(
    *primary_keys,
    'solar_consumption',
    'solar_electricity',
    'solar_share_elec',
    'solar_share_energy',
)

# 12. Wind
df_wind = df.select(
    *primary_keys,
    'wind_consumption',
    'wind_electricity',
    'wind_share_elec',
    'wind_share_energy',
)

# 13. Other Renewables
df_other_renewables = df.select(
    *primary_keys,
    'other_renewable_consumption',
    'other_renewable_electricity',
    'other_renewable_exc_biofuel_electricity',
    'other_renewables_share_elec',
    'other_renewables_share_elec_exc_biofuel',
    'other_renewables_share_energy',
)

# 14. Low Carbon
df_low_carbon = df.select(
    *primary_keys,
    'low_carbon_consumption',
    'low_carbon_electricity',
    'low_carbon_share_elec',
    'low_carbon_share_energy',
)

# 15. Electricity Imports
df_electricity_imports = df.select(
    *primary_keys,
    'net_elec_imports',
    'net_elec_imports_share_demand',
)


In [9]:
from pyspark.sql import functions as F

def filter_df_by_threshold(df, threshold, primary_keys=('country', 'year', 'iso_code')):
    """
    Keep only the countries that have enough non-null data in the value columns.

    For each country, the non-null cells of every non-key column are counted
    across all of that country's rows and added up. A country is kept only
    when that total is strictly greater than ``threshold``, so ``threshold=0``
    removes exactly the countries that have no data at all. A summary of the
    filtering (row counts and the dropped countries) is printed.

    Parameters:
    - df: The input dataframe. It must contain a 'country' column, which is
      used for grouping and for the final join.
    - threshold: The total non-null count a country has to exceed to be kept.
    - primary_keys: Names of the key columns that are left out of the
      non-null count. Defaults to ('country', 'year', 'iso_code').

    Returns:
    - filtered_df: The rows of ``df`` that belong to the kept countries.
      The statistics are printed only; they are not returned.
    """

    # Every column that is not a key column contributes to a country's total
    columns_to_check = [name for name in df.columns if name not in primary_keys]

    # Per country: number of non-null values in each checked column
    agg_exprs = [
        F.count(F.when(F.col(c).isNotNull(), 1)).alias(c + '_non_null_count')
        for c in columns_to_check
    ]
    country_counts = df.groupBy('country').agg(*agg_exprs)

    # Add the per-column counts up into a single total per country
    total_non_null_counts = sum(F.col(c + '_non_null_count') for c in columns_to_check)
    country_counts = country_counts.withColumn('total_non_null_counts', total_non_null_counts)

    # Keep the countries whose total is strictly above the threshold
    countries_to_keep_df = country_counts.filter(F.col('total_non_null_counts') > threshold).select('country')

    # Work out which countries were dropped, for the printed report
    all_countries = df.select('country').distinct()
    dropped_countries_df = all_countries.subtract(countries_to_keep_df)
    dropped_countries = [row['country'] for row in dropped_countries_df.collect()]

    # Inner join back onto the input so only rows of kept countries remain
    filtered_df = df.join(countries_to_keep_df, on='country', how='inner')

    original_row_count = df.count()
    filtered_row_count = filtered_df.count()
    rows_dropped = original_row_count - filtered_row_count

    stats = {
        'Original number of rows': original_row_count,
        'Number of rows after filtering': filtered_row_count,
        'Number of rows dropped': rows_dropped,
        'Dropped countries': dropped_countries
    }

    print(stats)

    return filtered_df

# # Usage example:
# filtered_df_fossil = filter_df_by_threshold(df_fossil, 5)

def count_nulls_by_country(df):
    """
    Build a per-country summary of missing values.

    Every column other than 'country' is reduced to the number of rows in
    which it is null, computed separately for each country.

    Parameters:
    - df: The input dataframe; it must contain a 'country' column.

    Returns:
    - null_counts_df: A dataframe with one row per country in which each
      remaining column holds its null count under the original column name.
    """
    null_counters = []
    for name in df.columns:
        if name == 'country':
            continue
        # count() ignores nulls, so tag each null cell with a non-null marker
        # (the column name as a literal) and count the markers.
        null_marker = F.when(F.col(name).isNull(), name)
        null_counters.append(F.count(null_marker).alias(name))

    null_counts_df = df.groupBy("country").agg(*null_counters)
    return null_counts_df

# # Usage example:
# null_counts_fossil = count_nulls_by_country(df_fossil)

# # Show the results
# null_counts_fossil.show(n=300)

from pyspark.sql import functions as F

def filter_rows_by_null_threshold(df, primary_keys=('country', 'year', 'iso_code')):
    """
    Drop the rows in which every non-key column is null.

    The number of null cells is computed per row over all non-key columns; a
    row is kept only when that number is smaller than the number of non-key
    columns, i.e. when at least one of them holds a value. A summary of the
    filtering (row counts) is printed.

    Parameters:
    - df: The input dataframe.
    - primary_keys: Names of the key columns that are ignored when looking
      for nulls. Defaults to ('country', 'year', 'iso_code').

    Returns:
    - filtered_df: The rows of ``df`` with at least one non-null non-key
      value. The statistics are printed only; they are not returned.
    """

    # Every column that is not a key column is inspected for nulls
    columns_to_check = [name for name in df.columns if name not in primary_keys]

    # A row is dropped once all checked columns are null, so the cut-off is
    # the number of checked columns
    threshold = len(columns_to_check)

    # Per-row number of null cells among the checked columns
    null_count = sum(F.when(F.col(c).isNull(), 1).otherwise(0) for c in columns_to_check)

    # Keep the rows that have fewer nulls than checked columns
    filtered_df = df.filter(null_count < threshold)

    original_row_count = df.count()
    filtered_row_count = filtered_df.count()
    rows_dropped = original_row_count - filtered_row_count

    stats = {
        'Original number of rows': original_row_count,
        'Number of rows after filtering': filtered_row_count,
        'Number of rows dropped': rows_dropped
    }

    print(stats)

    return filtered_df

# # Usage example (the function returns only the filtered dataframe; the stats are printed):
# filtered_df_fossil = filter_rows_by_null_threshold(df_fossil)


In [10]:
# Per-country null counts for the fossil-fuel columns
null_counts_fossil = count_nulls_by_country(df_fossil)

# Print up to 300 countries
null_counts_fossil.show(300)

+--------------------+----+--------+------------------+-----------------------+-----------------+-------------------+---------------------+
|             country|year|iso_code|fossil_electricity|fossil_fuel_consumption|fossil_share_elec|fossil_share_energy|carbon_intensity_elec|
+--------------------+----+--------+------------------+-----------------------+-----------------+-------------------+---------------------+
|                Chad|   0|       0|                11|                     33|               11|                 33|                   11|
|            Paraguay|   0|       0|                10|                     32|               10|                 32|                   10|
|              Russia|   0|       0|                 0|                      0|                0|                  0|                   10|
|               Macao|   0|       0|                10|                     32|               10|                 32|                   10|
|               Yeme

In [11]:
# Keep only the countries that have at least one non-null fossil-fuel value
filtered_df_fossil = filter_df_by_threshold(df_fossil, 0)

# Spot-check the rows of one retained country
is_zimbabwe = filtered_df_fossil['country'] == 'Zimbabwe'
filtered_df_fossil[is_zimbabwe].show(40)

{'Original number of rows': 7043, 'Number of rows after filtering': 6883, 'Number of rows dropped': 160, 'Dropped countries': ['Northern Mariana Islands', 'Tuvalu', 'Netherlands Antilles', 'Antarctica', 'Micronesia (country)']}
+--------+----+--------+------------------+-----------------------+-----------------+-------------------+---------------------+
| country|year|iso_code|fossil_electricity|fossil_fuel_consumption|fossil_share_elec|fossil_share_energy|carbon_intensity_elec|
+--------+----+--------+------------------+-----------------------+-----------------+-------------------+---------------------+
|Zimbabwe|1990|     ZWE|              null|                   null|             null|               null|                 null|
|Zimbabwe|1991|     ZWE|              null|                   null|             null|               null|                 null|
|Zimbabwe|1992|     ZWE|              null|                   null|             null|               null|                 null|
|Zim

In [12]:
# Keep the countries with more than 5 non-null renewables values in total
filtered_df_ren = filter_df_by_threshold(df_renewables, 5)

# Spot-check the rows of one retained country
is_chad = filtered_df_ren['country'] == 'Chad'
filtered_df_ren[is_chad].show(40)

# Per-country null counts for the countries that remain
null_counts_ren = count_nulls_by_country(filtered_df_ren)

# Print up to 300 countries
null_counts_ren.show(300)

{'Original number of rows': 7043, 'Number of rows after filtering': 6883, 'Number of rows dropped': 160, 'Dropped countries': ['Northern Mariana Islands', 'Tuvalu', 'Netherlands Antilles', 'Antarctica', 'Micronesia (country)']}
+-------+----+--------+----------------------+----------------------+---------------------+-----------------------+
|country|year|iso_code|renewables_consumption|renewables_electricity|renewables_share_elec|renewables_share_energy|
+-------+----+--------+----------------------+----------------------+---------------------+-----------------------+
|   Chad|1990|     TCD|                  null|                  null|                 null|                   null|
|   Chad|1991|     TCD|                  null|                  null|                 null|                   null|
|   Chad|1992|     TCD|                  null|                  null|                 null|                   null|
|   Chad|1993|     TCD|                  null|                  null|       