In [1]:
%sql
-- Make soccer_db the active database for the statements that follow in this session.
USE soccer_db

In [2]:
%sql
-- Peek at a handful of raw rows to see which columns game_events provides.
SELECT ge.*
FROM game_events AS ge
LIMIT 5

id_odsp,id_event,sort_order,time,event_type,event_type_str,event_type2,event_type2_str,side,side_str,event_team,opponent,player,player2,player_in,player_out,shot_place,shot_place_str,shot_outcome,shot_outcome_str,is_goal,location,location_str,bodypart,bodypart_str,assist_method,assist_method_str,situation,situation_str,time_bin,country_code
UFot0hit/,UFot0hit1,1,2,1,Attempt,12,Key Pass,2,Away,Hamburg SV,Borussia Dortmund,mladen petric,gokhan tore,,,6,High and wide,2.0,Off target,0,9.0,Left side of the box,2.0,Left foot,1,Pass,1,Open play,0.0,DEU
UFot0hit/,UFot0hit2,2,4,2,Corner,99,,1,Home,Borussia Dortmund,Hamburg SV,dennis diekmeier,dennis diekmeier,,,99,,,,0,,,,,0,,99,,0.0,DEU
UFot0hit/,UFot0hit3,3,4,2,Corner,99,,1,Home,Borussia Dortmund,Hamburg SV,heiko westermann,heiko westermann,,,99,,,,0,,,,,0,,99,,0.0,DEU
UFot0hit/,UFot0hit4,4,7,3,Foul,99,,1,Home,Borussia Dortmund,Hamburg SV,sven bender,,,,99,,,,0,,,,,0,,99,,0.0,DEU
UFot0hit/,UFot0hit5,5,7,8,Free kick won,99,,2,Away,Hamburg SV,Borussia Dortmund,gokhan tore,,,,99,,,,0,2.0,Defensive half,,,0,,99,,0.0,DEU


In [3]:
# Build the dataset used for model training and prediction: the categorical
# predictor columns plus the is_goal column that serves as the label.
gameEventsQuery = """
  select
    ge.event_type_str,
    ge.event_team,
    ge.shot_place_str,
    ge.location_str,
    ge.assist_method_str,
    ge.situation_str,
    ge.country_code,
    ge.is_goal
  from game_events as ge
"""
gameEventsDF = spark.sql(gameEventsQuery)

In [4]:
# imports for Spark ML pipeline
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [5]:
# Names of the categorical (string) columns that are fed to the model as features.
categFeatures = [
    "event_type_str",
    "event_team",
    "shot_place_str",
    "location_str",
    "assist_method_str",
    "situation_str",
    "country_code",
]

In [6]:
# One StringIndexer per categorical column: maps the string values of "<col>"
# to numeric label indices written to "<col>_idx".
# handleInvalid="keep" is requested so that invalid values are kept (bucketed)
# rather than raising an error during transform.
stringIndexers = [
    StringIndexer(inputCol=colName, outputCol=colName + "_idx", handleInvalid="keep")
    for colName in categFeatures
]

In [7]:
# One OneHotEncoder per categorical column: turns the label index in "<col>_idx"
# into a binary vector written to "<col>_vec".
encoders = [
    OneHotEncoder(inputCol=colName + "_idx", outputCol=colName + "_vec")
    for colName in categFeatures
]

In [8]:
# Concatenate every one-hot encoded "<col>_vec" column into the single
# "features" vector column that the classifier reads.
encodedCols = [colName + "_vec" for colName in categFeatures]
featureAssembler = VectorAssembler(inputCols=encodedCols, outputCol="features")

In [9]:
# Spark ML pipeline: string indexing -> one-hot encoding -> feature assembly
# -> gradient-boosted tree classifier predicting is_goal.
gbtClassifier = GBTClassifier(
    labelCol="is_goal",
    featuresCol="features",
    maxDepth=5,
    maxIter=20,
)

# Stage order matters: each stage consumes columns produced by the previous ones.
pipelineStages = []
pipelineStages.extend(stringIndexers)
pipelineStages.extend(encoders)
pipelineStages.append(featureAssembler)
pipelineStages.append(gbtClassifier)

pipeline = Pipeline(stages=pipelineStages)

In [10]:
# Split the data into training (75%) and test (25%) sets, then fit the pipeline
# on the training portion only.
# The explicit seed pins the random split so that re-running the notebook on
# unchanged input yields a repeatable split, keeping the fitted model and the
# evaluation metric comparable between runs.
(trainingData, testData) = gameEventsDF.randomSplit([0.75, 0.25], seed=42)
model = pipeline.fit(trainingData)

In [11]:
# Score the held-out test rows with the fitted pipeline and preview a few
# predictions next to the true label and the assembled feature vector.
predictions = model.transform(testData)
previewCols = ["prediction", "is_goal", "features"]
display(predictions.select(*previewCols).limit(5))

prediction,is_goal,features
0.0,0,"List(0, 201, List(2, 95, 163, 182, 186, 195, 198), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
0.0,0,"List(0, 201, List(2, 95, 154, 169, 188, 192, 198), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
0.0,0,"List(0, 201, List(2, 95, 154, 169, 188, 192, 198), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
0.0,0,"List(0, 201, List(2, 95, 154, 169, 189, 192, 198), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
0.0,0,"List(0, 201, List(2, 95, 154, 169, 186, 192, 198), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"


In [12]:
# Evaluate the model using the areaUnderROC metric.
# A ROC curve is traced by sweeping a threshold over a continuous score, so the
# evaluator must read the classifier's "rawPrediction" column. Feeding it the
# hard 0.0/1.0 "prediction" column collapses the curve to a single operating
# point and misreports the AUC.
# NOTE(review): assumes a Spark version in which GBTClassifier emits a
# "rawPrediction" column — confirm against the cluster runtime.
evaluator = BinaryClassificationEvaluator(
    labelCol="is_goal",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)
evaluator.evaluate(predictions)