In [73]:
from pyspark.ml.classification import LogisticRegression
import findspark
import pyspark

from pyspark.sql import SparkSession
from pyspark.sql.functions import count

In [74]:
spark = (SparkSession
         .builder
         .appName("Logistic Regression")
         .getOrCreate())

In [75]:
df = (spark.read
            .option("HEADER", True)
            .option("inferSchema", True)
            .csv("./data/pl_matches_modified.csv")
           )

df.show(5)

# df['home_captain'] = df['home_captain'].astype(str)

+----------+-----------------+-------+--------+------------+--------+--------+-----------+------------------+------------------+------------+------------+---------------+---------------+-----------------------+----------------+--------------------+-----------------+-----------------+------------------+--------------------+-------+--------+------------+--------+--------+-----------+------------------+------------------+------------+------------+---------------+---------------+-----------------------+----------------+--------------------+-----------------+-----------------+------------------+----------+----------+
|Match_Date|        Home_Team|Home_Sh|Home_SoT|Home_Touches|Home_Tkl|Home_Int|Home_Blocks|  Home_xG_Expected|Home_npxG_Expected|Home_SCA_SCA|Home_GCA_SCA|Home_Cmp_Passes|Home_Att_Passes|Home_Cmp_percent_Passes|Home_PrgP_Passes|Home_Carries_Carries|Home_PrgC_Carries|Home_Att_Take_Ons|Home_Succ_Take_Ons|           Away_Team|Away_Sh|Away_SoT|Away_Touches|Away_Tkl|Away_Int|Away_Blo

# Onehot Encoder

https://stackoverflow.com/questions/56585434/pyspark-pipeline-error-when-using-indexer-and-encoder

In [76]:
from pyspark.ml.feature import OneHotEncoder,StringIndexer,VectorAssembler

In [77]:
# unusedCols = ['home_fbrefMatchId ','away_fbrefMatchId']
# outputCols = ['home_score','away_score']
# inputCols = [column for column in df.columns if column not in outputCols and column not in unusedCols]
# encodeCols = ['home_captain','away_captain','home_manager','away_manager','home_team','away_team','away_is_home_team']

unusedCols = ['Match_Date']
outputCols = ['Home_Score','Away_Score']
inputCols = [column for column in df.columns if column not in outputCols and column not in unusedCols]
encodeCols = ['Away_Team','Home_Team']




In [78]:
indexer = StringIndexer(inputCols=encodeCols,outputCols = [encodeCol+"_Index" for encodeCol in encodeCols])
indexerModel = indexer.fit(df)
indexer_df = indexerModel.transform(df)
indexer_df.show(5)

+----------+-----------------+-------+--------+------------+--------+--------+-----------+------------------+------------------+------------+------------+---------------+---------------+-----------------------+----------------+--------------------+-----------------+-----------------+------------------+--------------------+-------+--------+------------+--------+--------+-----------+------------------+------------------+------------+------------+---------------+---------------+-----------------------+----------------+--------------------+-----------------+-----------------+------------------+----------+----------+---------------+---------------+
|Match_Date|        Home_Team|Home_Sh|Home_SoT|Home_Touches|Home_Tkl|Home_Int|Home_Blocks|  Home_xG_Expected|Home_npxG_Expected|Home_SCA_SCA|Home_GCA_SCA|Home_Cmp_Passes|Home_Att_Passes|Home_Cmp_percent_Passes|Home_PrgP_Passes|Home_Carries_Carries|Home_PrgC_Carries|Home_Att_Take_Ons|Home_Succ_Take_Ons|           Away_Team|Away_Sh|Away_SoT|Away_To

In [79]:
encodeer = OneHotEncoder(inputCols=[encodeCol+"_Index" for encodeCol in encodeCols],outputCols=[encodeCol+"_Onehot" for encodeCol in encodeCols])
encodeer_df = encodeer.fit(indexer_df).transform(indexer_df)
encodeer_df.show(5)

+----------+-----------------+-------+--------+------------+--------+--------+-----------+------------------+------------------+------------+------------+---------------+---------------+-----------------------+----------------+--------------------+-----------------+-----------------+------------------+--------------------+-------+--------+------------+--------+--------+-----------+------------------+------------------+------------+------------+---------------+---------------+-----------------------+----------------+--------------------+-----------------+-----------------+------------------+----------+----------+---------------+---------------+----------------+----------------+
|Match_Date|        Home_Team|Home_Sh|Home_SoT|Home_Touches|Home_Tkl|Home_Int|Home_Blocks|  Home_xG_Expected|Home_npxG_Expected|Home_SCA_SCA|Home_GCA_SCA|Home_Cmp_Passes|Home_Att_Passes|Home_Cmp_percent_Passes|Home_PrgP_Passes|Home_Carries_Carries|Home_PrgC_Carries|Home_Att_Take_Ons|Home_Succ_Take_Ons|           

In [80]:

indexedCols = [encodeCol+"_Index" for encodeCol in encodeCols]
assembler_inputCols = encodeer_df.columns
assembler_inputCols = [assembler_inputCol for assembler_inputCol in assembler_inputCols \
                       if assembler_inputCol not in encodeCols and assembler_inputCol not in indexedCols and assembler_inputCol not in unusedCols]
assembler_df = encodeer_df.select(assembler_inputCols)
assembler = VectorAssembler(inputCols=assembler_inputCols,outputCol="features")

In [81]:
assembler_inputCols

['Home_Sh',
 'Home_SoT',
 'Home_Touches',
 'Home_Tkl',
 'Home_Int',
 'Home_Blocks',
 'Home_xG_Expected',
 'Home_npxG_Expected',
 'Home_SCA_SCA',
 'Home_GCA_SCA',
 'Home_Cmp_Passes',
 'Home_Att_Passes',
 'Home_Cmp_percent_Passes',
 'Home_PrgP_Passes',
 'Home_Carries_Carries',
 'Home_PrgC_Carries',
 'Home_Att_Take_Ons',
 'Home_Succ_Take_Ons',
 'Away_Sh',
 'Away_SoT',
 'Away_Touches',
 'Away_Tkl',
 'Away_Int',
 'Away_Blocks',
 'Away_xG_Expected',
 'Away_npxG_Expected',
 'Away_SCA_SCA',
 'Away_GCA_SCA',
 'Away_Cmp_Passes',
 'Away_Att_Passes',
 'Away_Cmp_percent_Passes',
 'Away_PrgP_Passes',
 'Away_Carries_Carries',
 'Away_PrgC_Carries',
 'Away_Att_Take_Ons',
 'Away_Succ_Take_Ons',
 'Home_Score',
 'Away_Score',
 'Away_Team_Onehot',
 'Home_Team_Onehot']

In [82]:
output_vector = assembler.transform(assembler_df)
output_vector.show(5)

+-------+--------+------------+--------+--------+-----------+------------------+------------------+------------+------------+---------------+---------------+-----------------------+----------------+--------------------+-----------------+-----------------+------------------+-------+--------+------------+--------+--------+-----------+------------------+------------------+------------+------------+---------------+---------------+-----------------------+----------------+--------------------+-----------------+-----------------+------------------+----------+----------+----------------+----------------+--------------------+
|Home_Sh|Home_SoT|Home_Touches|Home_Tkl|Home_Int|Home_Blocks|  Home_xG_Expected|Home_npxG_Expected|Home_SCA_SCA|Home_GCA_SCA|Home_Cmp_Passes|Home_Att_Passes|Home_Cmp_percent_Passes|Home_PrgP_Passes|Home_Carries_Carries|Home_PrgC_Carries|Home_Att_Take_Ons|Home_Succ_Take_Ons|Away_Sh|Away_SoT|Away_Touches|Away_Tkl|Away_Int|Away_Blocks|  Away_xG_Expected|Away_npxG_Expected|Awa

In [83]:
output_vector.select("features","Home_Score","Away_Score").show()

+--------------------+----------+----------+
|            features|Home_Score|Away_Score|
+--------------------+----------+----------+
|(90,[0,1,2,3,4,5,...|         1|         1|
|(90,[0,1,2,3,4,5,...|         1|         0|
|(90,[0,1,2,3,4,5,...|         1|         1|
|(90,[0,1,2,3,4,5,...|         2|         1|
|(90,[0,1,2,3,4,5,...|         5|         0|
|(90,[0,1,2,3,4,5,...|         4|         4|
|(90,[0,1,2,3,4,5,...|         1|         0|
|(90,[0,1,2,3,4,5,...|         2|         1|
|(90,[0,1,2,3,4,5,...|         1|         4|
|(90,[0,1,2,3,4,5,...|         2|         1|
|(90,[0,1,2,3,4,5,...|         4|         1|
|(90,[0,1,2,3,4,5,...|         1|         1|
|(90,[0,1,2,3,4,5,...|         0|         0|
|(90,[0,1,2,3,4,5,...|         1|         0|
|(90,[0,1,2,3,4,5,...|         3|         1|
|(90,[0,1,2,3,4,5,...|         3|         1|
|(90,[0,1,2,3,4,5,...|         1|         1|
|(90,[0,1,2,3,4,5,...|         1|         0|
|(90,[0,1,2,3,4,5,...|         0|         1|
|(90,[0,1,

# Linear Regression

## Home Score

In [84]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression

In [85]:
model_df = output_vector.select("features","Home_Score")
model_df.show(5)

+--------------------+----------+
|            features|Home_Score|
+--------------------+----------+
|(90,[0,1,2,3,4,5,...|         1|
|(90,[0,1,2,3,4,5,...|         1|
|(90,[0,1,2,3,4,5,...|         1|
|(90,[0,1,2,3,4,5,...|         2|
|(90,[0,1,2,3,4,5,...|         5|
+--------------------+----------+
only showing top 5 rows



In [86]:
training_df, test_df = model_df.randomSplit([0.7, 0.3])

In [87]:
print(training_df.count())
print(test_df.count())

1338
562


In [88]:
linearRegression = LinearRegression(featuresCol = "features",labelCol="Home_Score",predictionCol="Home_Score_Prediction").fit(training_df)

In [89]:
predictions = linearRegression.transform(test_df)

In [90]:
predictions.show()

+--------------------+----------+---------------------+
|            features|Home_Score|Home_Score_Prediction|
+--------------------+----------+---------------------+
|(90,[0,1,2,3,4,5,...|         2|                  2.0|
|(90,[0,1,2,3,4,5,...|         5|    5.000000000000001|
|(90,[0,1,2,3,4,5,...|         4|                  4.0|
|(90,[0,1,2,3,4,5,...|         3|                  3.0|
|(90,[0,1,2,3,4,5,...|         3|   3.0000000000000004|
|(90,[0,1,2,3,4,5,...|         2|   1.9999999999999998|
|(90,[0,1,2,3,4,5,...|         1|   0.9999999999999999|
|(90,[0,1,2,3,4,5,...|         1|   0.9999999999999999|
|(90,[0,1,2,3,4,5,...|         1|                  1.0|
|(90,[0,1,2,3,4,5,...|         2|   1.9999999999999998|
|(90,[0,1,2,3,4,5,...|         1|   0.9999999999999997|
|(90,[0,1,2,3,4,5,...|         1|   0.9999999999999997|
|(90,[0,1,2,3,4,5,...|         1|   0.9999999999999999|
|(90,[0,1,2,3,4,5,...|         4|    4.000000000000001|
|(90,[0,1,2,3,4,5,...|         1|   0.9999999999

In [91]:
evaluator_rmse = RegressionEvaluator(labelCol="Home_Score", predictionCol="Home_Score_Prediction", metricName="rmse")
rmse = evaluator_rmse.evaluate(predictions)
print(rmse)

4.0635181002775696e-16


In [92]:
evaluator_r2 = RegressionEvaluator(labelCol="Home_Score", predictionCol="Home_Score_Prediction", metricName="r2")
r2 = evaluator_r2.evaluate(predictions)
print(r2)

1.0


## Away Score

In [93]:
model_df = output_vector.select("features","Away_Score")
model_df.show(5)
training_df, test_df = model_df.randomSplit([0.7, 0.3])

+--------------------+----------+
|            features|Away_Score|
+--------------------+----------+
|(90,[0,1,2,3,4,5,...|         1|
|(90,[0,1,2,3,4,5,...|         0|
|(90,[0,1,2,3,4,5,...|         1|
|(90,[0,1,2,3,4,5,...|         1|
|(90,[0,1,2,3,4,5,...|         0|
+--------------------+----------+
only showing top 5 rows



In [94]:
linearRegression = LinearRegression(featuresCol = "features",labelCol="Away_Score",predictionCol="Away_Score_Prediction").fit(training_df)
predictions = linearRegression.transform(test_df)

In [95]:
predictions.show(5)

+--------------------+----------+---------------------+
|            features|Away_Score|Away_Score_Prediction|
+--------------------+----------+---------------------+
|(90,[0,1,2,3,4,5,...|         4|    4.000000000000003|
|(90,[0,1,2,3,4,5,...|         3|    3.000000000000002|
|(90,[0,1,2,3,4,5,...|         1|   1.0000000000000016|
|(90,[0,1,2,3,4,5,...|         1|   1.0000000000000013|
|(90,[0,1,2,3,4,5,...|         3|    3.000000000000002|
+--------------------+----------+---------------------+
only showing top 5 rows



In [96]:
evaluator_rmse = RegressionEvaluator(labelCol="Away_Score", predictionCol="Away_Score_Prediction", metricName="rmse")
rmse = evaluator_rmse.evaluate(predictions)
print(rmse)

1.4517209664285783e-15


In [97]:
evaluator_r2 = RegressionEvaluator(labelCol="Away_Score", predictionCol="Away_Score_Prediction", metricName="r2")
r2 = evaluator_r2.evaluate(predictions)
print(r2)

1.0


# Decision tree regression

## Home Score:

In [98]:
from pyspark.ml.regression import DecisionTreeRegressor

In [99]:
model_df = output_vector.select("features","Home_Score")
model_df.show(5)
training_df, test_df = model_df.randomSplit([0.7, 0.3])

+--------------------+----------+
|            features|Home_Score|
+--------------------+----------+
|(90,[0,1,2,3,4,5,...|         1|
|(90,[0,1,2,3,4,5,...|         1|
|(90,[0,1,2,3,4,5,...|         1|
|(90,[0,1,2,3,4,5,...|         2|
|(90,[0,1,2,3,4,5,...|         5|
+--------------------+----------+
only showing top 5 rows



In [100]:
dt = DecisionTreeRegressor(featuresCol = "features",labelCol="Home_Score",predictionCol="Home_Score_Prediction")
dt_model = dt.fit(training_df)

In [101]:
predictions = dt_model.transform(test_df)

In [102]:
predictions.show(5)

+--------------------+----------+---------------------+
|            features|Home_Score|Home_Score_Prediction|
+--------------------+----------+---------------------+
|(90,[0,1,2,3,4,5,...|         1|                  1.0|
|(90,[0,1,2,3,4,5,...|         3|                  3.0|
|(90,[0,1,2,3,4,5,...|         1|                  1.0|
|(90,[0,1,2,3,4,5,...|         1|                  1.0|
|(90,[0,1,2,3,4,5,...|         1|                  1.0|
+--------------------+----------+---------------------+
only showing top 5 rows



In [103]:
evaluator_rmse = RegressionEvaluator(labelCol="Home_Score", predictionCol="Home_Score_Prediction", metricName="rmse")
rmse = evaluator_rmse.evaluate(predictions)
print(rmse)

0.09308872791678188


In [104]:
evaluator_r2 = RegressionEvaluator(labelCol="Home_Score", predictionCol="Home_Score_Prediction", metricName="r2")
r2 = evaluator_r2.evaluate(predictions)
print(r2)

0.9953000169427465


## Away Score

In [105]:
model_df = output_vector.select("features","Away_Score")
model_df.show(5)
training_df, test_df = model_df.randomSplit([0.7, 0.3])

+--------------------+----------+
|            features|Away_Score|
+--------------------+----------+
|(90,[0,1,2,3,4,5,...|         1|
|(90,[0,1,2,3,4,5,...|         0|
|(90,[0,1,2,3,4,5,...|         1|
|(90,[0,1,2,3,4,5,...|         1|
|(90,[0,1,2,3,4,5,...|         0|
+--------------------+----------+
only showing top 5 rows



In [106]:
dt = DecisionTreeRegressor(featuresCol = "features",labelCol="Away_Score",predictionCol="Away_Score_Prediction")
dt_model = dt.fit(training_df)

In [107]:
predictions = dt_model.transform(test_df)
predictions.show(5)

+--------------------+----------+---------------------+
|            features|Away_Score|Away_Score_Prediction|
+--------------------+----------+---------------------+
|(90,[0,1,2,3,4,5,...|         4|                  4.0|
|(90,[0,1,2,3,4,5,...|         2|                  2.0|
|(90,[0,1,2,3,4,5,...|         4|                  4.0|
|(90,[0,1,2,3,4,5,...|         1|                  1.0|
|(90,[0,1,2,3,4,5,...|         1|                  1.0|
+--------------------+----------+---------------------+
only showing top 5 rows



In [109]:
evaluator_rmse = RegressionEvaluator(labelCol="Away_Score", predictionCol="Away_Score_Prediction", metricName="rmse")
rmse = evaluator_rmse.evaluate(predictions)
print(rmse)

0.11592077187019914


In [110]:
evaluator_r2 = RegressionEvaluator(labelCol="Away_Score", predictionCol="Away_Score_Prediction", metricName="r2")
r2 = evaluator_r2.evaluate(predictions)
print(r2)

0.9913609495274915
