In [83]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.types import StructType, StructField, StringType, LongType, DoubleType, DateType
from pyspark.sql.functions import *

# Spark configuration: standalone master URL, application name, and the
# driver/executor sizing used by this notebook.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("BatchPipeline")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# getOrCreate() returns the already-active session if there is one; otherwise
# it starts a new session from the configuration above.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Hadoop filesystem configuration: register the Google Cloud Storage connector
# classes for the "gs" scheme.
conf = spark.sparkContext._jsc.hadoopConfiguration()
for _key, _impl in (
    ("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"),
    ("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"),
):
    conf.set(_key, _impl)

# Google Cloud Storage location the input data is read from.
# The bucket name must be adjusted per environment.
gsc_file_path = "gs://data_a1/"

# Explicit schema for the match CSV files, listed as (column name, Spark type)
# in column order. Every column is declared nullable.
_match_columns = [
    ("home_team", StringType),
    ("away_team", StringType),
    ("home_score", DoubleType),
    ("home_xg", DoubleType),
    ("home_penalty", LongType),
    ("away_score", DoubleType),
    ("away_xg", DoubleType),
    ("away_penalty", LongType),
    ("home_manager", StringType),
    ("home_captain", StringType),
    ("away_manager", StringType),
    ("away_captain", StringType),
    ("home_goals", StringType),
    ("away_goals", StringType),
    ("Attendance", LongType),
    ("Venue", StringType),
    ("Officials", StringType),
    ("Date", StringType),   # read as text here; converted to a date in a later cell
    ("Score", StringType),
    ("Referee", StringType),
    ("Notes", StringType),
    ("Round", StringType),
    ("Host", StringType),
    ("Year", LongType),     # read as a long here; converted to a date in a later cell
]

dataSchema = StructType(
    [StructField(name, dtype(), True) for name, dtype in _match_columns]
)

# Load every CSV file directly under the bucket root into one DataFrame
# (the original note describes these as the per-decade files), using the
# explicit schema and treating the first line of each file as a header.
_csv_glob = gsc_file_path + '*.csv'
matches = (
    spark.read
    .format("csv")
    .schema(dataSchema)
    .option("header", "true")
    .load(_csv_glob)
)

matches.printSchema()



root
 |-- home_team: string (nullable = true)
 |-- away_team: string (nullable = true)
 |-- home_score: double (nullable = true)
 |-- home_xg: double (nullable = true)
 |-- home_penalty: long (nullable = true)
 |-- away_score: double (nullable = true)
 |-- away_xg: double (nullable = true)
 |-- away_penalty: long (nullable = true)
 |-- home_manager: string (nullable = true)
 |-- home_captain: string (nullable = true)
 |-- away_manager: string (nullable = true)
 |-- away_captain: string (nullable = true)
 |-- home_goals: string (nullable = true)
 |-- away_goals: string (nullable = true)
 |-- Attendance: long (nullable = true)
 |-- Venue: string (nullable = true)
 |-- Officials: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Score: string (nullable = true)
 |-- Referee: string (nullable = true)
 |-- Notes: string (nullable = true)
 |-- Round: string (nullable = true)
 |-- Host: string (nullable = true)
 |-- Year: long (nullable = true)



In [84]:
# Convert the "Date" and "Year" columns to DateType.
#
# - "Date" is a string column parsed with the 'yyyy-MM-dd' pattern.
# - "Year" is declared as a long in the schema, so it is cast to string
#   explicitly before being parsed with the 'yyyy' pattern instead of relying
#   on an implicit numeric-to-string coercion. The result is the first day of
#   that year (e.g. 1930 -> 1930-01-01).
#
# NOTE: this cell rebinds `matches` to the converted frame, so it is not
# idempotent. Re-run the load cell before running this cell again.
matches = (
    matches
    .withColumn("Date", to_date(col("Date"), 'yyyy-MM-dd'))
    .withColumn("Year", to_date(col("Year").cast("string"), 'yyyy'))
)

matches.printSchema()
matches.show(3)

root
 |-- home_team: string (nullable = true)
 |-- away_team: string (nullable = true)
 |-- home_score: double (nullable = true)
 |-- home_xg: double (nullable = true)
 |-- home_penalty: long (nullable = true)
 |-- away_score: double (nullable = true)
 |-- away_xg: double (nullable = true)
 |-- away_penalty: long (nullable = true)
 |-- home_manager: string (nullable = true)
 |-- home_captain: string (nullable = true)
 |-- away_manager: string (nullable = true)
 |-- away_captain: string (nullable = true)
 |-- home_goals: string (nullable = true)
 |-- away_goals: string (nullable = true)
 |-- Attendance: long (nullable = true)
 |-- Venue: string (nullable = true)
 |-- Officials: string (nullable = true)
 |-- Date: date (nullable = true)
 |-- Score: string (nullable = true)
 |-- Referee: string (nullable = true)
 |-- Notes: string (nullable = true)
 |-- Round: string (nullable = true)
 |-- Host: string (nullable = true)
 |-- Year: date (nullable = true)

+---------+---------+----------+--

In [85]:
# Write the converted DataFrame back to the bucket in CSV format, replacing
# any output already present at the target path.
# NOTE(review): the target sits in the same bucket the load cell reads with the
# '*.csv' glob, and its name also ends in '.csv', so a later run of the load
# cell may pick this output up as input. Confirm whether that is intended.
_matches_writer = matches.write.format("csv").mode("overwrite")
_matches_writer.save("gs://data_a1/matches.csv")  # bucket name !


In [None]:
# Optional BigQuery export. Everything in this cell is commented out, so it
# currently does nothing; uncomment the lines below to enable it.
#
# Use the Cloud Storage bucket for temporary BigQuery export data used by the connector.
# bucket = "data_a1" # bucket name !
# spark.conf.set('temporaryGcsBucket', bucket)

# # Save the data to BigQuery (append mode) -> do not forget to change the project ID.
# matches.write.format('bigquery') \
#     .option('table', 'de2022-rrd.a2dataset.matches') \
#     .mode("append") \
#     .save()

Calculate the total number and average number of goals per year

In [86]:
from pyspark.sql import Row, Window

# Count the number of rows for each (home_team, home_score) combination.
# The second grouping column was an unterminated string literal, which made
# the cell a syntax error; 'home_score' matches the column shown in the
# recorded output of this cell.
scoring_team = matches.groupby('home_team', 'home_score').count()

scoring_team.show(5)

+---------+----------+-----+
|home_team|home_score|count|
+---------+----------+-----+
|      492|    1562.0|    1|
|      395|    6993.0|    1|
|      904|    2383.0|    1|
|     1236|    4787.0|    1|
|      562|    3441.0|    1|
+---------+----------+-----+
only showing top 5 rows



In [113]:
from pyspark.sql.functions import *



# Average number of goals per match for each (Year, Host) pair.
#
# total_goals is the sum of the home and away scores of a match. Groups that
# contain a null in Year, Host or the average are dropped. Sorting is the last
# step so the displayed order is by Year. The chain is parenthesised rather
# than using backslash continuations, which previously left a dangling
# continuation after the final call.
matches_goals = (
    matches
    .withColumn("total_goals", col("home_score") + col("away_score"))
    .groupBy('Year', 'Host')
    .agg(avg('total_goals').alias('average_goals_per_match'))
    .na.drop(how='any')
    .sort('Year')
)

matches_goals.show(30)

+----------+--------------------+-----------------------+
|      Year|                Host|average_goals_per_match|
+----------+--------------------+-----------------------+
|1930-01-01|             Uruguay|      3.888888888888889|
|1934-01-01|               Italy|      4.117647058823529|
|1938-01-01|              France|      4.666666666666667|
|1950-01-01|              Brazil|                    4.0|
|1954-01-01|         Switzerland|      5.384615384615385|
|1958-01-01|              Sweden|                    3.6|
|1962-01-01|               Chile|                2.78125|
|1966-01-01|             England|                2.78125|
|1970-01-01|              Mexico|                2.96875|
|1974-01-01|             Germany|     2.5526315789473686|
|1978-01-01|           Argentina|     2.6842105263157894|
|1982-01-01|               Spain|     2.8076923076923075|
|1986-01-01|              Mexico|     2.5384615384615383|
|1990-01-01|               Italy|     2.2115384615384617|
|1994-01-01|  

In [None]:
# Stop the Spark session when finished. The call is commented out, so the
# session created at the top of the notebook is left running; uncomment to stop it.
# spark.stop()