In [9]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, LongType, DoubleType, DateType
from pyspark.sql.functions import *
# Imported after the wildcard so `F` always refers to the functions module itself.
from pyspark.sql import functions as F

# --- Spark session -----------------------------------------------------------
# Build the configuration in one fluent chain: master URL, application name,
# then the driver/executor resource settings.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("BatchPipeline")
    .setAll([
        ("spark.driver.memory", "2g"),
        ("spark.executor.cores", "1"),
        ("spark.driver.cores", "1"),
    ])
)
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# --- Hadoop file system ------------------------------------------------------
# Map the gs:// scheme to the Google Cloud Storage connector classes.
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set(
    "fs.gs.impl",
    "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem",
)
conf.set(
    "fs.AbstractFileSystem.gs.impl",
    "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS",
)

# --- Input data --------------------------------------------------------------
# Bucket that holds whodata.csv. Use a dedicated bucket that contains only the
# expected files.
gsc_file_path = 'gs://jadsdenb/'

# Explicit schema for the CSV; every column is declared nullable.
dataSchema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("ParentLocation", StringType()),
        ("Location", StringType()),
        ("Period", LongType()),
        ("isLatestYear", StringType()),
        ("Dim1", StringType()),
        ("FactValueNumeric", DoubleType()),
    )
])

# Read the file with the schema above, treating the first line as a header.
consumed_import = (
    spark.read
    .format("csv")
    .schema(dataSchema)
    .options(header="true")
    .load(f"{gsc_file_path}whodata.csv")
)

consumed_import.printSchema()

root
 |-- ParentLocation: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Period: long (nullable = true)
 |-- isLatestYear: string (nullable = true)
 |-- Dim1: string (nullable = true)
 |-- FactValueNumeric: double (nullable = true)



In [10]:
# Keep only the rows whose Dim1 is 'All types', rename the columns that are
# carried forward, and order the result by year.
#
# The filter is applied before the projection so it does not rely on Spark
# resolving a column (Dim1) that the select has already dropped. The chain is
# parenthesised instead of using backslash continuations, so a trailing
# backslash can no longer splice the next statement into this expression.
consumed = (
    consumed_import
    .where(F.col('Dim1') == 'All types')
    .select(
        F.col('Location').alias('Country'),
        F.col('Period').alias('Year'),
        F.col('FactValueNumeric').alias('Litres_alcohol'),
    )
    .sort('Year')
)

consumed.show()

+--------------------+----+--------------+
|             Country|Year|Litres_alcohol|
+--------------------+----+--------------+
|United States of ...|1960|          7.83|
|             Austria|1960|          8.91|
|         Afghanistan|1961|           0.0|
|              Guinea|1961|          0.24|
|Micronesia (Feder...|1961|           0.0|
|                Oman|1961|           0.0|
|          Bangladesh|1961|           0.0|
|              Kuwait|1961|          0.02|
|               Nepal|1961|           0.0|
|          Mauritania|1961|          0.11|
|               Qatar|1961|           0.0|
|        Saudi Arabia|1961|           0.0|
|            Pakistan|1961|          0.01|
|           Indonesia|1961|          0.03|
|              Malawi|1961|          0.05|
|             Somalia|1961|          0.06|
|             Comoros|1961|          0.12|
|               Niger|1961|          0.13|
|Iran (Islamic Rep...|1961|          0.14|
|              Jordan|1961|          0.14|
+----------

In [11]:
# Sum Litres_alcohol over all countries for each year.
#
# F.sum is referenced explicitly: a bare `sum` only works in this notebook
# because the wildcard import from pyspark.sql.functions shadows Python's
# builtin sum. The aggregation reads only Year and Litres_alcohol, so no
# separate projection is needed before the groupBy.
yearly_ww_consumed = (
    consumed
    .groupBy('Year')
    .agg(F.sum('Litres_alcohol').alias('Total yearly consumption'))
)

yearly_ww_consumed.show()

+----+------------------------+
|Year|Total yearly consumption|
+----+------------------------+
|1960|      16.740000000000002|
|1961|                   531.0|
|1962|       557.3200000000002|
|1963|       611.7099999999997|
|1964|                  624.44|
|1965|       633.9300000000001|
|1966|       660.0799999999997|
|1967|       664.3100000000004|
|1968|       675.6399999999999|
|1969|       687.0300000000001|
|1970|       719.9400000000002|
|1971|                  732.55|
|1972|       746.9100000000001|
|1973|       783.8900000000002|
|1974|                  786.51|
|1975|       796.0600000000003|
|1976|       795.9800000000002|
|1977|       802.9100000000003|
|1978|       792.9999999999995|
|1979|       796.5499999999996|
+----+------------------------+
only showing top 20 rows



In [12]:
# Write the filtered data back to the bucket as CSV, replacing any previous
# export. The target is derived from gsc_file_path so the bucket name is
# defined in exactly one place and input and output cannot drift apart.
consumed.write.mode("overwrite").format("csv").save(gsc_file_path + 'consumed')

In [13]:
# Cloud Storage bucket the BigQuery connector uses for temporary export data.
bucket = "niels_bq_bucket" # bucket name !
spark.conf.set('temporaryGcsBucket', bucket)

# Save the yearly totals to BigQuery -> do not forget to change project ID
yearly_ww_consumed.write.format('bigquery') \
    .option('table', 'de2022-366418.assignment2dataset.yearly_consumption') \
    .mode("overwrite") \
    .save()

# Save the per-country, per-year data to BigQuery -> do not forget to change
# project ID.
# Written with "overwrite" rather than "append": this notebook reloads the
# whole source file on every run, so appending would duplicate every row each
# time the notebook is re-executed and leave this table out of step with the
# (overwritten) yearly totals.
consumed.write.format('bigquery') \
    .option('table', 'de2022-366418.assignment2dataset.consumption') \
    .mode("overwrite") \
    .save()

In [14]:
# Stop the Spark session (and its underlying Spark context) now that all
# outputs have been written; `spark` cannot be used after this call.
spark.stop()