In [None]:
from pyspark.ml.classification import LogisticRegression
import findspark
import pyspark

from pyspark.sql import SparkSession
from pyspark.sql.functions import count

In [None]:
# Obtain the Spark session used by every cell below (getOrCreate reuses a running one).
# NOTE(review): the app name says "Logistic Regression", but the cells in view fit
# LinearRegression and DecisionTreeRegressor models - confirm the intended name.
spark = SparkSession.builder.appName("Logistic Regression").getOrCreate()

In [None]:
# Read the match CSV with the header and schema-inference options enabled.
df = (
    spark.read
    .option("HEADER", True)
    .option("inferSchema", True)
    .csv("./data/pl_matches_modified.csv")
)

# Preview the first five rows.
df.show(5)

# df['home_captain'] = df['home_captain'].astype(str)

# One-Hot Encoder

Reference for combining an indexer and an encoder in PySpark: https://stackoverflow.com/questions/56585434/pyspark-pipeline-error-when-using-indexer-and-encoder

In [None]:
from pyspark.ml.feature import OneHotEncoder,StringIndexer,VectorAssembler

In [None]:
# unusedCols = ['home_fbrefMatchId ','away_fbrefMatchId']
# outputCols = ['home_score','away_score']
# inputCols = [column for column in df.columns if column not in outputCols and column not in unusedCols]
# encodeCols = ['home_captain','away_captain','home_manager','away_manager','home_team','away_team','away_is_home_team']

# Column roles used by the cells below.
unusedCols = ["Match_Date"]                 # ignored entirely
outputCols = ["Home_Score", "Away_Score"]   # score columns to predict
# Every DataFrame column that is neither an output nor ignored.
inputCols = [
    name for name in df.columns
    if not (name in outputCols or name in unusedCols)
]
encodeCols = ["Away_Team", "Home_Team"]     # columns to index and one-hot encode




In [None]:
# Index each column in encodeCols into a "<column>_Index" column.
indexer = StringIndexer(
    inputCols=encodeCols,
    outputCols=[f"{name}_Index" for name in encodeCols],
)
indexerModel = indexer.fit(df)
indexer_df = indexerModel.transform(df)
indexer_df.show(5)

In [None]:
# One-hot encode every "<column>_Index" column into a "<column>_Onehot" column.
encodeer = OneHotEncoder(
    inputCols=[f"{name}_Index" for name in encodeCols],
    outputCols=[f"{name}_Onehot" for name in encodeCols],
)
encodeer_df = encodeer.fit(indexer_df).transform(indexer_df)
encodeer_df.show(5)

In [None]:

# Choose the columns that are packed into the "features" vector.
# Excluded: the raw categorical columns, their intermediate "<column>_Index"
# columns, the ignored columns, and the label columns in outputCols.
# The labels must stay out of the feature vector: otherwise each regressor is
# trained on the very value it is asked to predict (target leakage).
indexedCols = [encodeCol+"_Index" for encodeCol in encodeCols]
excludedCols = set(encodeCols) | set(indexedCols) | set(unusedCols) | set(outputCols)
assembler_inputCols = [column for column in encodeer_df.columns if column not in excludedCols]
# Carry the label columns next to the feature inputs so later cells can still select them.
assembler_df = encodeer_df.select(assembler_inputCols + outputCols)
assembler = VectorAssembler(inputCols=assembler_inputCols,outputCol="features")

In [None]:
# Display the list of columns passed to the VectorAssembler as inputCols.
assembler_inputCols

In [None]:
# Run the assembler to append the "features" vector column, then preview five rows.
output_vector = assembler.transform(dataset=assembler_df)
output_vector.show(n=5)

In [None]:
# Show the assembled feature vector side by side with both score columns.
output_vector.select("features","Home_Score","Away_Score").show()

# Linear Regression

## Home Score

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression

In [None]:
# Keep only the feature vector and the Home_Score label for this model.
model_df = output_vector.select("features", "Home_Score")
model_df.show(n=5)

In [None]:
# 70/30 train/test split. The fixed seed keeps the split, and therefore the
# metrics computed from it, reproducible across kernel restarts.
training_df, test_df = model_df.randomSplit([0.7, 0.3], seed=42)

In [None]:
# Row counts of the training and test splits, one per line.
print(training_df.count(), test_df.count(), sep="\n")

In [None]:
# Fit a LinearRegression estimator on the training split; the name
# `linearRegression` ends up bound to the fitted model, not the estimator.
linearRegression = LinearRegression(
    featuresCol="features",
    labelCol="Home_Score",
    predictionCol="Home_Score_Prediction",
).fit(training_df)

In [None]:
# Apply the fitted linear model to the held-out test split.
predictions = linearRegression.transform(test_df)

In [None]:
# Preview the test rows together with the model's prediction column.
predictions.show()

In [None]:
# RMSE of the Home_Score predictions on the test split.
evaluator_rmse = RegressionEvaluator(
    labelCol="Home_Score",
    predictionCol="Home_Score_Prediction",
    metricName="rmse",
)
rmse = evaluator_rmse.evaluate(predictions)
print(rmse)

In [None]:
# R^2 of the Home_Score predictions on the test split.
evaluator_r2 = RegressionEvaluator(
    labelCol="Home_Score",
    predictionCol="Home_Score_Prediction",
    metricName="r2",
)
r2 = evaluator_r2.evaluate(predictions)
print(r2)

## Away Score

In [None]:
# Keep only the feature vector and the Away_Score label for this model.
model_df = output_vector.select("features","Away_Score")
model_df.show(5)
# 70/30 train/test split. The fixed seed keeps the split, and therefore the
# metrics computed from it, reproducible across kernel restarts.
training_df, test_df = model_df.randomSplit([0.7, 0.3], seed=42)

In [None]:
# Fit a LinearRegression estimator for Away_Score on the training split,
# then score the held-out test split with the fitted model.
linearRegression = LinearRegression(
    featuresCol="features",
    labelCol="Away_Score",
    predictionCol="Away_Score_Prediction",
).fit(training_df)
predictions = linearRegression.transform(test_df)

In [None]:
# Preview five test rows together with the model's prediction column.
predictions.show(5)

In [None]:
# RMSE of the Away_Score predictions on the test split.
evaluator_rmse = RegressionEvaluator(
    labelCol="Away_Score",
    predictionCol="Away_Score_Prediction",
    metricName="rmse",
)
rmse = evaluator_rmse.evaluate(predictions)
print(rmse)

In [None]:
# R^2 of the Away_Score predictions on the test split.
evaluator_r2 = RegressionEvaluator(
    labelCol="Away_Score",
    predictionCol="Away_Score_Prediction",
    metricName="r2",
)
r2 = evaluator_r2.evaluate(predictions)
print(r2)

# Decision tree regression

## Home Score

In [None]:
from pyspark.ml.regression import DecisionTreeRegressor

In [None]:
# Keep only the feature vector and the Home_Score label for this model.
model_df = output_vector.select("features","Home_Score")
model_df.show(5)
# 70/30 train/test split. The fixed seed keeps the split, and therefore the
# metrics computed from it, reproducible across kernel restarts.
training_df, test_df = model_df.randomSplit([0.7, 0.3], seed=42)

In [None]:
# Configure a decision-tree regressor for Home_Score and fit it on the training split.
dt = DecisionTreeRegressor(
    featuresCol="features",
    labelCol="Home_Score",
    predictionCol="Home_Score_Prediction",
)
dt_model = dt.fit(training_df)

In [None]:
# Apply the fitted decision tree to the held-out test split.
predictions = dt_model.transform(test_df)

In [None]:
# Preview five test rows together with the model's prediction column.
predictions.show(5)

In [None]:
# RMSE of the decision tree's Home_Score predictions on the test split.
evaluator_rmse = RegressionEvaluator(
    labelCol="Home_Score",
    predictionCol="Home_Score_Prediction",
    metricName="rmse",
)
rmse = evaluator_rmse.evaluate(predictions)
print(rmse)

In [None]:
# R^2 of the decision tree's Home_Score predictions on the test split.
evaluator_r2 = RegressionEvaluator(
    labelCol="Home_Score",
    predictionCol="Home_Score_Prediction",
    metricName="r2",
)
r2 = evaluator_r2.evaluate(predictions)
print(r2)

## Away Score

In [None]:
# Keep only the feature vector and the Away_Score label for this model.
model_df = output_vector.select("features","Away_Score")
model_df.show(5)
# 70/30 train/test split. The fixed seed keeps the split, and therefore the
# metrics computed from it, reproducible across kernel restarts.
training_df, test_df = model_df.randomSplit([0.7, 0.3], seed=42)

In [None]:
# Configure a decision-tree regressor for Away_Score and fit it on the training split.
dt = DecisionTreeRegressor(
    featuresCol="features",
    labelCol="Away_Score",
    predictionCol="Away_Score_Prediction",
)
dt_model = dt.fit(training_df)

In [None]:
# Apply the fitted decision tree to the held-out test split and preview five rows.
predictions = dt_model.transform(test_df)
predictions.show(5)

In [None]:
# RMSE of the decision tree's Away_Score predictions on the test split.
evaluator_rmse = RegressionEvaluator(
    labelCol="Away_Score",
    predictionCol="Away_Score_Prediction",
    metricName="rmse",
)
rmse = evaluator_rmse.evaluate(predictions)
print(rmse)

In [None]:
# R^2 of the decision tree's Away_Score predictions on the test split.
evaluator_r2 = RegressionEvaluator(
    labelCol="Away_Score",
    predictionCol="Away_Score_Prediction",
    metricName="r2",
)
r2 = evaluator_r2.evaluate(predictions)
print(r2)