In [4]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.types import StructType, StructField, StringType, LongType, DoubleType, DateType
from pyspark.sql.functions import *

#set up spark (session)
sparkConf = SparkConf()
sparkConf.setMaster("spark://spark-master:7077")
sparkConf.setAppName("BatchPipeline")
sparkConf.set("spark.driver.memory", "2g")
sparkConf.set("spark.executor.cores", "1")
sparkConf.set("spark.driver.cores", "1")

spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

#set up hadoop fs configuration
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")

#retrieve data from bucket
#google storage file path
#make sure this is a separate bucket that only has the correct files in there
gsc_file_path = 'gs://jadsdenb/' # bucket name with whodata.csv file

dataSchema = StructType(
    [StructField("ParentLocation", StringType(), True),
    StructField("Location", StringType(), True),
    StructField("Period", LongType(), True),
    StructField("isLatestYear", StringType(), True),
    StructField("Dim1", StringType(), True),
    StructField("FactValueNumeric", LongType(), True)
    ])

consumed = spark.read.format("csv").schema(dataSchema).option("header", "true") \
    .load(gsc_file_path+'whodata.csv')

consumed.printSchema()

root
 |-- ParentLocation: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Period: long (nullable = true)
 |-- isLatestYear: string (nullable = true)
 |-- Dim1: string (nullable = true)
 |-- FactValueNumeric: long (nullable = true)



In [8]:
consumed = consumed.withColumn("Period", to_date(col("Period"), 'yyyy'))
consumed.printSchema()
consumed.show(10)

root
 |-- ParentLocation: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Period: date (nullable = true)
 |-- isLatestYear: string (nullable = true)
 |-- Dim1: string (nullable = true)
 |-- FactValueNumeric: long (nullable = true)

+--------------------+-----------+----------+------------+--------------------+----------------+
|      ParentLocation|   Location|    Period|isLatestYear|                Dim1|FactValueNumeric|
+--------------------+-----------+----------+------------+--------------------+----------------+
|Eastern Mediterra...|Afghanistan|2019-01-01|        True|                Beer|            null|
|Eastern Mediterra...|Afghanistan|2019-01-01|        True|Other alcoholic b...|            null|
|Eastern Mediterra...|Afghanistan|2019-01-01|        True|                Wine|            null|
|              Africa|    Algeria|2019-01-01|        True|Other alcoholic b...|            null|
|              Europe|    Andorra|2019-01-01|        True|Other alc

In [31]:
avg_total_consumption = consumed.select('Location', 'Period', 'Dim1', 'FactValueNumeric') \
    .where(col('Dim1') == 'All types')\
    .sort('Period') \

avg_total_consumption.show()

+--------------------+----------+---------+----------------+
|            Location|    Period|     Dim1|FactValueNumeric|
+--------------------+----------+---------+----------------+
|United States of ...|1960-01-01|All types|            null|
|             Austria|1960-01-01|All types|            null|
|         Afghanistan|1961-01-01|All types|            null|
|              Guinea|1961-01-01|All types|            null|
|Micronesia (Feder...|1961-01-01|All types|            null|
|                Oman|1961-01-01|All types|            null|
|          Bangladesh|1961-01-01|All types|            null|
|              Kuwait|1961-01-01|All types|            null|
|               Nepal|1961-01-01|All types|            null|
|          Mauritania|1961-01-01|All types|            null|
|               Qatar|1961-01-01|All types|            null|
|        Saudi Arabia|1961-01-01|All types|            null|
|            Pakistan|1961-01-01|All types|            null|
|           Indonesia|19

In [None]:
#write data to bucket
matches.write.mode("overwrite").format("csv").save("gs://de_jads_batch_data/whodata.csv") # bucket name !

In [None]:
#use the cloud storage bucket for temporary BigQuery export data used by the connector
bucket = "niels_bq_bucket" # bucket name !
spark.conf.set('temporaryGcsBucket', bucket)

#save the goal data to bigQuery -> do not forget to change project ID
consumed.write.format('bigquery') \
    .option('table', 'de2022-366418.assignment2dataset.consumption') \
    .mode("append") \
    .save()

In [None]:
# stop the spark context
spark.stop()