In [3]:
import os

from pyspark.sql import SparkSession

# Spark session for this notebook: connects to the cluster master, points Hive
# at the remote metastore, and reuses an already-running session if one exists.
spark = (
    SparkSession.builder
    .appName("world-energy-stats")
    .master("spark://spark-master:7077")
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .enableHiveSupport()
    .getOrCreate()
)

# Local development alternative (single local master, no Hive configuration):
# spark = SparkSession.builder.appName("world-energy-stats").master("local").getOrCreate()

In [4]:
# Load the energy CSV from HDFS. The first line is treated as the header row
# and column types are inferred from the data.
df = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load("hdfs://namenode:9000/energy-data/owid-energy-data.csv")
)

# Local development alternative (reads the file from the working directory):
# df = spark.read.csv("owid-energy-data.csv", header=True, inferSchema=True)

In [6]:
# Keep only the rows whose `country` column equals 'World'.
df = df.filter(df['country'] == 'World')

In [7]:
# Keep rows from 1990 onward (lower bound only).
df = df.where(df['year'] >= 1990)

# TODO(review): an earlier note here said to drop 2022 as well, but no upper
# bound on `year` is applied anywhere in this cell -- confirm whether 2022
# should be excluded before the data is exported.

# Sanity check: number of rows per year, in chronological order.
# show(n=40) prints at most 40 rows.
grouped_df = (
    df.groupBy("year")
    .count()
    .sort("year")
)
grouped_df.show(n=40)


+----+-----+
|year|count|
+----+-----+
|1990|    1|
|1991|    1|
|1992|    1|
|1993|    1|
|1994|    1|
|1995|    1|
|1996|    1|
|1997|    1|
|1998|    1|
|1999|    1|
|2000|    1|
|2001|    1|
|2002|    1|
|2003|    1|
|2004|    1|
|2005|    1|
|2006|    1|
|2007|    1|
|2008|    1|
|2009|    1|
|2010|    1|
|2011|    1|
|2012|    1|
|2013|    1|
|2014|    1|
|2015|    1|
|2016|    1|
|2017|    1|
|2018|    1|
|2019|    1|
|2020|    1|
|2021|    1|
|2022|    1|
+----+-----+



In [8]:
# Drop every column whose name contains one of the listed substrings.
cols_to_drop = [
    name
    for name in df.columns
    if any(tag in name for tag in ('_per_capita', '_change_pct', '_change_twh'))
]
df = df.drop(*cols_to_drop)

# Display the first row of the trimmed DataFrame (returned as a one-element list).
df.head(1)

[Row(country='World', year=1990, iso_code=None, population=5316175872, gdp=43018245636096.0, biofuel_consumption=106.643, biofuel_electricity=None, biofuel_share_elec=None, biofuel_share_energy=0.112, carbon_intensity_elec=None, coal_consumption=25906.625, coal_electricity=4460.242, coal_production=26344.955, coal_share_elec=37.29, coal_share_energy=27.098, electricity_demand=None, electricity_generation=11960.959, electricity_share_energy=12.511, energy_per_gdp=2.222, fossil_electricity=7614.629, fossil_fuel_consumption=83064.32, fossil_share_elec=63.662, fossil_share_energy=86.883, gas_consumption=19481.125, gas_electricity=1789.703, gas_production=19697.166, gas_share_elec=14.963, gas_share_energy=20.377, greenhouse_gas_emissions=None, hydro_consumption=6383.708, hydro_electricity=2158.854, hydro_share_elec=18.049, hydro_share_energy=6.677, low_carbon_consumption=12540.152, low_carbon_electricity=4280.688, low_carbon_share_elec=35.789, low_carbon_share_energy=13.117, net_elec_import

In [11]:
# Export the prepared frame twice: as a local CSV (via pandas) and as a Hive table.
folder_path = './clean/'

# pandas' to_csv does not create missing directories and raises OSError when the
# target folder is absent, so make sure it exists first. exist_ok=True keeps
# re-running this cell harmless.
os.makedirs(folder_path, exist_ok=True)
df.toPandas().to_csv(folder_path + 'world.csv', index=False)

# Save to the Hive table, replacing any previous contents of wes.world.
df.write.mode("overwrite").saveAsTable("wes.world")