In [13]:
from pyspark.ml.classification import LogisticRegression
import findspark
import pyspark

from pyspark.sql import SparkSession
from pyspark.sql.functions import count

In [14]:
spark = (SparkSession
         .builder
         .appName("Logistic Regression")
         .getOrCreate())

In [15]:
df = (spark.read
            .option("HEADER", True)
            .option("inferSchema", True)
            .csv("./data/pl_matches_modified.csv")
           )

df.show(5)

# df['home_captain'] = df['home_captain'].astype(str)

+----------+-----------------+-------+--------+------------+--------+--------+-----------+------------------+------------------+------------+------------+---------------+---------------+-----------------------+----------------+--------------------+-----------------+-----------------+------------------+--------------------+-------+--------+------------+--------+--------+-----------+------------------+------------------+------------+------------+---------------+---------------+-----------------------+----------------+--------------------+-----------------+-----------------+------------------+----------+----------+
|Match_Date|        Home_Team|Home_Sh|Home_SoT|Home_Touches|Home_Tkl|Home_Int|Home_Blocks|  Home_xG_Expected|Home_npxG_Expected|Home_SCA_SCA|Home_GCA_SCA|Home_Cmp_Passes|Home_Att_Passes|Home_Cmp_percent_Passes|Home_PrgP_Passes|Home_Carries_Carries|Home_PrgC_Carries|Home_Att_Take_Ons|Home_Succ_Take_Ons|           Away_Team|Away_Sh|Away_SoT|Away_Touches|Away_Tkl|Away_Int|Away_Blo

# Onehot Encoder

https://stackoverflow.com/questions/56585434/pyspark-pipeline-error-when-using-indexer-and-encoder

In [16]:
from pyspark.ml.feature import OneHotEncoder,StringIndexer,VectorAssembler

In [17]:
# unusedCols = ['home_fbrefMatchId ','away_fbrefMatchId']
# outputCols = ['home_score','away_score']
# inputCols = [column for column in df.columns if column not in outputCols and column not in unusedCols]
# encodeCols = ['home_captain','away_captain','home_manager','away_manager','home_team','away_team','away_is_home_team']

unusedCols = ['Match_Date']
outputCols = ['Home_Score','Away_Score']
inputCols = [column for column in df.columns if column not in outputCols and column not in unusedCols]
encodeCols = ['Away_Team','Home_Team']




In [18]:
indexer = StringIndexer(inputCols=encodeCols,outputCols = [encodeCol+"_Index" for encodeCol in encodeCols])
indexerModel = indexer.fit(df)
indexer_df = indexerModel.transform(df)
indexer_df.show(5)

+----------+-----------------+-------+--------+------------+--------+--------+-----------+------------------+------------------+------------+------------+---------------+---------------+-----------------------+----------------+--------------------+-----------------+-----------------+------------------+--------------------+-------+--------+------------+--------+--------+-----------+------------------+------------------+------------+------------+---------------+---------------+-----------------------+----------------+--------------------+-----------------+-----------------+------------------+----------+----------+---------------+---------------+
|Match_Date|        Home_Team|Home_Sh|Home_SoT|Home_Touches|Home_Tkl|Home_Int|Home_Blocks|  Home_xG_Expected|Home_npxG_Expected|Home_SCA_SCA|Home_GCA_SCA|Home_Cmp_Passes|Home_Att_Passes|Home_Cmp_percent_Passes|Home_PrgP_Passes|Home_Carries_Carries|Home_PrgC_Carries|Home_Att_Take_Ons|Home_Succ_Take_Ons|           Away_Team|Away_Sh|Away_SoT|Away_To

In [19]:
encodeer = OneHotEncoder(inputCols=[encodeCol+"_Index" for encodeCol in encodeCols],outputCols=[encodeCol+"_Onehot" for encodeCol in encodeCols])
encodeer_df = encodeer.fit(indexer_df).transform(indexer_df)
encodeer_df.show(5)

+----------+-----------------+-------+--------+------------+--------+--------+-----------+------------------+------------------+------------+------------+---------------+---------------+-----------------------+----------------+--------------------+-----------------+-----------------+------------------+--------------------+-------+--------+------------+--------+--------+-----------+------------------+------------------+------------+------------+---------------+---------------+-----------------------+----------------+--------------------+-----------------+-----------------+------------------+----------+----------+---------------+---------------+----------------+----------------+
|Match_Date|        Home_Team|Home_Sh|Home_SoT|Home_Touches|Home_Tkl|Home_Int|Home_Blocks|  Home_xG_Expected|Home_npxG_Expected|Home_SCA_SCA|Home_GCA_SCA|Home_Cmp_Passes|Home_Att_Passes|Home_Cmp_percent_Passes|Home_PrgP_Passes|Home_Carries_Carries|Home_PrgC_Carries|Home_Att_Take_Ons|Home_Succ_Take_Ons|           

In [52]:

indexedCols = [encodeCol+"_Index" for encodeCol in encodeCols]
encoderCols = encodeer_df.columns
assembler_cols = [encoderCol for encoderCol in encoderCols \
                       if encoderCol not in encodeCols 
                       and encoderCol not in indexedCols 
                       and encoderCol not in unusedCols]
assembler_inputCols = [assembler_col for assembler_col in assembler_cols if assembler_col not in outputCols]
assembler_df = encodeer_df.select(assembler_cols)
assembler = VectorAssembler(inputCols=assembler_inputCols,outputCol="features")

In [53]:
output_vector = assembler.transform(assembler_df)
output_vector.show(5)

+-------+--------+------------+--------+--------+-----------+------------------+------------------+------------+------------+---------------+---------------+-----------------------+----------------+--------------------+-----------------+-----------------+------------------+-------+--------+------------+--------+--------+-----------+------------------+------------------+------------+------------+---------------+---------------+-----------------------+----------------+--------------------+-----------------+-----------------+------------------+----------+----------+----------------+----------------+--------------------+
|Home_Sh|Home_SoT|Home_Touches|Home_Tkl|Home_Int|Home_Blocks|  Home_xG_Expected|Home_npxG_Expected|Home_SCA_SCA|Home_GCA_SCA|Home_Cmp_Passes|Home_Att_Passes|Home_Cmp_percent_Passes|Home_PrgP_Passes|Home_Carries_Carries|Home_PrgC_Carries|Home_Att_Take_Ons|Home_Succ_Take_Ons|Away_Sh|Away_SoT|Away_Touches|Away_Tkl|Away_Int|Away_Blocks|  Away_xG_Expected|Away_npxG_Expected|Awa

In [54]:
output_vector.select("features","Home_Score","Away_Score").show()

+--------------------+----------+----------+
|            features|Home_Score|Away_Score|
+--------------------+----------+----------+
|(88,[0,1,2,3,4,5,...|         1|         1|
|(88,[0,1,2,3,4,5,...|         1|         0|
|(88,[0,1,2,3,4,5,...|         1|         1|
|(88,[0,1,2,3,4,5,...|         2|         1|
|(88,[0,1,2,3,4,5,...|         5|         0|
|(88,[0,1,2,3,4,5,...|         4|         4|
|(88,[0,1,2,3,4,5,...|         1|         0|
|(88,[0,1,2,3,4,5,...|         2|         1|
|(88,[0,1,2,3,4,5,...|         1|         4|
|(88,[0,1,2,3,4,5,...|         2|         1|
|(88,[0,1,2,3,4,5,...|         4|         1|
|(88,[0,1,2,3,4,5,...|         1|         1|
|(88,[0,1,2,3,4,5,...|         0|         0|
|(88,[0,1,2,3,4,5,...|         1|         0|
|(88,[0,1,2,3,4,5,...|         3|         1|
|(88,[0,1,2,3,4,5,...|         3|         1|
|(88,[0,1,2,3,4,5,...|         1|         1|
|(88,[0,1,2,3,4,5,...|         1|         0|
|(88,[0,1,2,3,4,5,...|         0|         1|
|(88,[0,1,

# Linear Regression

## Home Score

In [55]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression

In [56]:
model_df = output_vector.select("features","Home_Score")
model_df.show(5)

+--------------------+----------+
|            features|Home_Score|
+--------------------+----------+
|(88,[0,1,2,3,4,5,...|         1|
|(88,[0,1,2,3,4,5,...|         1|
|(88,[0,1,2,3,4,5,...|         1|
|(88,[0,1,2,3,4,5,...|         2|
|(88,[0,1,2,3,4,5,...|         5|
+--------------------+----------+
only showing top 5 rows



In [57]:
training_df, test_df = model_df.randomSplit([0.7, 0.3])

In [58]:
print(training_df.count())
print(test_df.count())

1333
567


In [59]:
linearRegression = LinearRegression(featuresCol = "features",labelCol="Home_Score",predictionCol="Home_Score_Prediction").fit(training_df)

In [60]:
predictions = linearRegression.transform(test_df)

In [61]:
predictions.show()

+--------------------+----------+---------------------+
|            features|Home_Score|Home_Score_Prediction|
+--------------------+----------+---------------------+
|(88,[0,1,2,3,4,5,...|         1|   0.9314245868493488|
|(88,[0,1,2,3,4,5,...|         3|    1.280050186804215|
|(88,[0,1,2,3,4,5,...|         0|   1.1391710478257968|
|(88,[0,1,2,3,4,5,...|         3|   1.6712357364189778|
|(88,[0,1,2,3,4,5,...|         0|   0.8319296944103378|
|(88,[0,1,2,3,4,5,...|         5|   2.5637726038595217|
|(88,[0,1,2,3,4,5,...|         0|   1.9747339814817053|
|(88,[0,1,2,3,4,5,...|         0|   1.1575952305478285|
|(88,[0,1,2,3,4,5,...|         0|   1.2830526235867483|
|(88,[0,1,2,3,4,5,...|         3|   0.9886974688753157|
|(88,[0,1,2,3,4,5,...|         0|   1.9187601050957064|
|(88,[0,1,2,3,4,5,...|         3|   1.3387466309028195|
|(88,[0,1,2,3,4,5,...|         1|   1.5287109888508195|
|(88,[0,1,2,3,4,5,...|         0|   0.8726569554034186|
|(88,[0,1,2,3,4,5,...|         1|    1.291246535

In [62]:
evaluator_rmse = RegressionEvaluator(labelCol="Home_Score", predictionCol="Home_Score_Prediction", metricName="rmse")
rmse = evaluator_rmse.evaluate(predictions)
print(rmse)

1.2804290627198955


In [63]:
evaluator_r2 = RegressionEvaluator(labelCol="Home_Score", predictionCol="Home_Score_Prediction", metricName="r2")
r2 = evaluator_r2.evaluate(predictions)
print(r2)

0.10761381601200759


## Away Score

In [35]:
model_df = output_vector.select("features","Away_Score")
model_df.show(5)
training_df, test_df = model_df.randomSplit([0.7, 0.3])

+--------------------+----------+
|            features|Away_Score|
+--------------------+----------+
|(88,[0,1,2,3,4,5,...|         1|
|(88,[0,1,2,3,4,5,...|         0|
|(88,[0,1,2,3,4,5,...|         1|
|(88,[0,1,2,3,4,5,...|         1|
|(88,[0,1,2,3,4,5,...|         0|
+--------------------+----------+
only showing top 5 rows



In [36]:
linearRegression = LinearRegression(featuresCol = "features",labelCol="Away_Score",predictionCol="Away_Score_Prediction").fit(training_df)
predictions = linearRegression.transform(test_df)

In [37]:
predictions.show(5)

+--------------------+----------+---------------------+
|            features|Away_Score|Away_Score_Prediction|
+--------------------+----------+---------------------+
|(88,[0,1,2,3,4,5,...|         1|   1.4283298630807402|
|(88,[0,1,2,3,4,5,...|         0|   1.5881559034310821|
|(88,[0,1,2,3,4,5,...|         2|   1.3943305780648938|
|(88,[0,1,2,3,4,5,...|         2|   1.3946211681927596|
|(88,[0,1,2,3,4,5,...|         0|   1.4005098491808234|
+--------------------+----------+---------------------+
only showing top 5 rows



In [38]:
evaluator_rmse = RegressionEvaluator(labelCol="Away_Score", predictionCol="Away_Score_Prediction", metricName="rmse")
rmse = evaluator_rmse.evaluate(predictions)
print(rmse)

1.2526407540438125


In [39]:
evaluator_r2 = RegressionEvaluator(labelCol="Away_Score", predictionCol="Away_Score_Prediction", metricName="r2")
r2 = evaluator_r2.evaluate(predictions)
print(r2)

0.08198555186562761


# Decision tree regression

## Home Score:

In [40]:
from pyspark.ml.regression import DecisionTreeRegressor

In [41]:
model_df = output_vector.select("features","Home_Score")
model_df.show(5)
training_df, test_df = model_df.randomSplit([0.7, 0.3])

+--------------------+----------+
|            features|Home_Score|
+--------------------+----------+
|(88,[0,1,2,3,4,5,...|         1|
|(88,[0,1,2,3,4,5,...|         1|
|(88,[0,1,2,3,4,5,...|         1|
|(88,[0,1,2,3,4,5,...|         2|
|(88,[0,1,2,3,4,5,...|         5|
+--------------------+----------+
only showing top 5 rows



In [42]:
dt = DecisionTreeRegressor(featuresCol = "features",labelCol="Home_Score",predictionCol="Home_Score_Prediction")
dt_model = dt.fit(training_df)

In [43]:
predictions = dt_model.transform(test_df)

In [44]:
predictions.show(5)

+--------------------+----------+---------------------+
|            features|Home_Score|Home_Score_Prediction|
+--------------------+----------+---------------------+
|(88,[0,1,2,3,4,5,...|         0|   1.0503597122302157|
|(88,[0,1,2,3,4,5,...|         0|   1.6993243243243243|
|(88,[0,1,2,3,4,5,...|         0|   1.6993243243243243|
|(88,[0,1,2,3,4,5,...|         3|   1.3397435897435896|
|(88,[0,1,2,3,4,5,...|         1|   1.3397435897435896|
+--------------------+----------+---------------------+
only showing top 5 rows



In [45]:
evaluator_rmse = RegressionEvaluator(labelCol="Home_Score", predictionCol="Home_Score_Prediction", metricName="rmse")
rmse = evaluator_rmse.evaluate(predictions)
print(rmse)

1.4623142317990825


In [46]:
evaluator_r2 = RegressionEvaluator(labelCol="Home_Score", predictionCol="Home_Score_Prediction", metricName="r2")
r2 = evaluator_r2.evaluate(predictions)
print(r2)

-0.17303838602609933


## Away Score

In [47]:
model_df = output_vector.select("features","Away_Score")
model_df.show(5)
training_df, test_df = model_df.randomSplit([0.7, 0.3])

+--------------------+----------+
|            features|Away_Score|
+--------------------+----------+
|(88,[0,1,2,3,4,5,...|         1|
|(88,[0,1,2,3,4,5,...|         0|
|(88,[0,1,2,3,4,5,...|         1|
|(88,[0,1,2,3,4,5,...|         1|
|(88,[0,1,2,3,4,5,...|         0|
+--------------------+----------+
only showing top 5 rows



In [48]:
dt = DecisionTreeRegressor(featuresCol = "features",labelCol="Away_Score",predictionCol="Away_Score_Prediction")
dt_model = dt.fit(training_df)

In [49]:
predictions = dt_model.transform(test_df)
predictions.show(5)

+--------------------+----------+---------------------+
|            features|Away_Score|Away_Score_Prediction|
+--------------------+----------+---------------------+
|(88,[0,1,2,3,4,5,...|         4|    1.243353783231084|
|(88,[0,1,2,3,4,5,...|         1|    1.243353783231084|
|(88,[0,1,2,3,4,5,...|         0|   1.0339285714285715|
|(88,[0,1,2,3,4,5,...|         1|   1.0339285714285715|
|(88,[0,1,2,3,4,5,...|         2|   1.0339285714285715|
+--------------------+----------+---------------------+
only showing top 5 rows



In [50]:
evaluator_rmse = RegressionEvaluator(labelCol="Away_Score", predictionCol="Away_Score_Prediction", metricName="rmse")
rmse = evaluator_rmse.evaluate(predictions)
print(rmse)

1.333916409241543


In [51]:
evaluator_r2 = RegressionEvaluator(labelCol="Away_Score", predictionCol="Away_Score_Prediction", metricName="r2")
r2 = evaluator_r2.evaluate(predictions)
print(r2)

-0.2042152238253554
