# DA 231o Data Engineering at Scale Project



# Expected Goals Model & Player Analysis

In [1]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 37 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 64.7 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845512 sha256=6c7f1c7a56ba26467b6e3c6cd35cc7778277caeeadc953567504d0638b95ef2e
  Stored in directory: /root/.cache/pip/wheels/42/59/f5/79a5bf931714dcd201b26025347785f087370a10a3329a899c
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.1


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import pyspark
pyspark.__version__

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from pyspark.sql import SparkSession

# Load input Dataset with Goals

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
data = pd.read_csv('drive/MyDrive/refined_goal_dataset.csv')

In [5]:
print(len(data))
print(data.is_goal.sum())
print(len(data.columns)-1)

229135
24441
29


In [6]:
spark = SparkSession \
    .builder \
    .appName('xG Project Solution') \
    .getOrCreate()
  
spark

In [7]:
data.columns

Index(['Unnamed: 0', 'fast_break', 'loc_centre_box', 'loc_diff_angle_lr',
       'diff_angle_left', 'diff_angle_right', 'left_side_box',
       'left_side_6ybox', 'right_side_box', 'right_side_6ybox', 'close_range',
       'penalty', 'outside_box', 'long_range', 'more_35y', 'more_40y',
       'not_recorded', 'right_foot', 'left_foot', 'header', 'no_assist',
       'assist_pass', 'assist_cross', 'assist_header', 'assist_through_ball',
       'open_play', 'set_piece', 'corner', 'free_kick', 'is_goal'],
      dtype='object')

In [8]:
dataset = spark.createDataFrame(data=data)
dataset.show()

+----------+----------+--------------+-----------------+---------------+----------------+-------------+---------------+--------------+----------------+-----------+-------+-----------+----------+--------+--------+------------+----------+---------+------+---------+-----------+------------+-------------+-------------------+---------+---------+------+---------+-------+
|Unnamed: 0|fast_break|loc_centre_box|loc_diff_angle_lr|diff_angle_left|diff_angle_right|left_side_box|left_side_6ybox|right_side_box|right_side_6ybox|close_range|penalty|outside_box|long_range|more_35y|more_40y|not_recorded|right_foot|left_foot|header|no_assist|assist_pass|assist_cross|assist_header|assist_through_ball|open_play|set_piece|corner|free_kick|is_goal|
+----------+----------+--------------+-----------------+---------------+----------------+-------------+---------------+--------------+----------------+-----------+-------+-----------+----------+--------+--------+------------+----------+---------+------+---------+-

# Extended Goals with PySpark Machine Learning

In [9]:
required_features = list(data.columns[:-1]) # remove 'is_goal' because it is the label

assembler = VectorAssembler(inputCols=required_features, outputCol='features')
transformed_data = assembler.transform(dataset)

transformed_data.show()

+----------+----------+--------------+-----------------+---------------+----------------+-------------+---------------+--------------+----------------+-----------+-------+-----------+----------+--------+--------+------------+----------+---------+------+---------+-----------+------------+-------------+-------------------+---------+---------+------+---------+-------+--------------------+
|Unnamed: 0|fast_break|loc_centre_box|loc_diff_angle_lr|diff_angle_left|diff_angle_right|left_side_box|left_side_6ybox|right_side_box|right_side_6ybox|close_range|penalty|outside_box|long_range|more_35y|more_40y|not_recorded|right_foot|left_foot|header|no_assist|assist_pass|assist_cross|assist_header|assist_through_ball|open_play|set_piece|corner|free_kick|is_goal|            features|
+----------+----------+--------------+-----------------+---------------+----------------+-------------+---------------+--------------+----------------+-----------+-------+-----------+----------+--------+--------+----------

In [10]:
(training_data, test_data) = transformed_data.randomSplit([0.8,0.2])

print("Total Data", transformed_data.count())
print("Total Training Data", training_data.count())
print("Total Test Data", test_data.count())

print("Training Dataset")
training_data.show(5)

Total Data 229135
Total Training Data 183309
Total Test Data 45826
Training Dataset
+----------+----------+--------------+-----------------+---------------+----------------+-------------+---------------+--------------+----------------+-----------+-------+-----------+----------+--------+--------+------------+----------+---------+------+---------+-----------+------------+-------------+-------------------+---------+---------+------+---------+-------+--------------------+
|Unnamed: 0|fast_break|loc_centre_box|loc_diff_angle_lr|diff_angle_left|diff_angle_right|left_side_box|left_side_6ybox|right_side_box|right_side_6ybox|close_range|penalty|outside_box|long_range|more_35y|more_40y|not_recorded|right_foot|left_foot|header|no_assist|assist_pass|assist_cross|assist_header|assist_through_ball|open_play|set_piece|corner|free_kick|is_goal|            features|
+----------+----------+--------------+-----------------+---------------+----------------+-------------+---------------+--------------+----

# Gradient-Boosted Trees (GBTs) Classifier

In [11]:
gbt = GBTClassifier(labelCol='is_goal', 
                            featuresCol='features',
                            maxDepth=5)

In [12]:
# Another classifier
# from pyspark.ml.classification import RandomForestClassifier
# gbt = RandomForestClassifier(labelCol='is_goal', 
#                             featuresCol='features',
#                             maxDepth=5)

In [13]:
# Fit the model on training data
model = gbt.fit(training_data)

# Predictions with Test Data

In [14]:
# Predictions with test data
predictions = model.transform(test_data)
predictions.show(5)

+----------+----------+--------------+-----------------+---------------+----------------+-------------+---------------+--------------+----------------+-----------+-------+-----------+----------+--------+--------+------------+----------+---------+------+---------+-----------+------------+-------------+-------------------+---------+---------+------+---------+-------+--------------------+--------------------+--------------------+----------+
|Unnamed: 0|fast_break|loc_centre_box|loc_diff_angle_lr|diff_angle_left|diff_angle_right|left_side_box|left_side_6ybox|right_side_box|right_side_6ybox|close_range|penalty|outside_box|long_range|more_35y|more_40y|not_recorded|right_foot|left_foot|header|no_assist|assist_pass|assist_cross|assist_header|assist_through_ball|open_play|set_piece|corner|free_kick|is_goal|            features|       rawPrediction|         probability|prediction|
+----------+----------+--------------+-----------------+---------------+----------------+-------------+-------------

# Evaluate our model

In [15]:
# Evaluate our model
evaluator = MulticlassClassificationEvaluator(
    labelCol='is_goal', 
    predictionCol='prediction', 
    metricName='accuracy')

# Calculate Test Accuracy

In [16]:
accuracy = evaluator.evaluate(predictions)
print('Test Accuracy = ', accuracy)

Test Accuracy =  0.9116876882119321
