In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import isnan, when, count, col
#from pyspark.ml.regression import LabeledPoint
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import NaiveBayesModel
from pyspark.ml.linalg import Vector, SparseVector, DenseVector, Matrices
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from random import shuffle
import matplotlib.pyplot as plt

In [2]:
# Obtain the notebook's SparkSession (application name "BetsExploring") and
# keep a handle to its SparkContext.
spark = (
    SparkSession.builder
    .appName("BetsExploring")
    .getOrCreate()
)
sc = spark.sparkContext
# Bare expression so the notebook renders the SparkContext as the cell output.
sc

In [3]:
# Bets-house codes; they are used as column-name prefixes
# (calcBetsHousesCols builds names such as 'B365' + 'H').
betsHouses = 'B365 BS BW GB IW LB SB SJ VC WH'.split()
# Quota suffixes, matching the 'H' / 'D' / 'A' values found in the FTR column
# (presumably home win / draw / away win -- confirm against the data source).
quotaTypes = list('HDA')

In [4]:
# Selecting fields included in dataframe to use as features

def calcBetsHousesCols(df, betsHouses, quotaTypes=('H', 'D', 'A')):
    """Return the quota columns of every bets house fully present in ``df``.

    A bets house is selected only when ALL of its quota columns (house prefix
    + quota suffix, e.g. ``'B365H'``, ``'B365D'``, ``'B365A'``) exist in
    ``df.columns``. Checking only the first suffix is not enough: columns can
    be dropped individually upstream, and returning the name of a column that
    no longer exists makes the later feature assembly fail.

    Args:
        df: Object exposing a ``columns`` sequence of column names.
        betsHouses: Iterable of bets-house prefixes to look for.
        quotaTypes: Quota suffixes that make up one house's column group.

    Returns:
        List of column names, grouped by house in ``betsHouses`` order and,
        within a house, in ``quotaTypes`` order.
    """
    available = set(df.columns)
    betsHousesCols = []
    betsHousesOk = []
    for betsHouse in betsHouses:
        houseCols = [betsHouse + quota for quota in quotaTypes]
        # Keep the house only if none of its quota columns is missing.
        if all(name in available for name in houseCols):
            betsHousesOk.append(betsHouse)
            betsHousesCols.extend(houseCols)

    print ("Potential bets houses:", betsHouses)
    print ("Bets houses selected :" , betsHousesOk)
    print ("Bets houses columns  :", betsHousesCols)

    return betsHousesCols

In [5]:
# Cleaning nulls in dataframe, combining deletion of columns and rows
# Columns whose percentage of not null rows is below 'percDropping' will be deleted
# Rows with nulls after deleting columns will be deleted

def cleanNulls(df, percDropping):
    """Remove nulls by dropping sparse columns first and incomplete rows after.

    For every column, the percentage of rows that survive ``dropna()`` on that
    column alone is computed against the total row count. Columns whose
    percentage is below ``percDropping`` are dropped; then every row that
    still contains a null is dropped.

    Args:
        df: DataFrame to clean (needs ``count``, ``dropna``, ``select``,
            ``drop`` and ``columns``).
        percDropping: Threshold percentage (0-100); columns below it are
            dropped.

    Returns:
        The cleaned DataFrame. An empty input is returned unchanged, because
        no percentage can be computed for it.
    """
    print ("Cleaning nulls:")
    totalrows = df.count()
    notnullrows = df.dropna().count()
    print ("Total rows:", totalrows)
    print ("Not null rows: ", notnullrows)
    print ("Percentage for dropping column: ", percDropping, "%")

    if totalrows == 0:
        # Avoid dividing by zero below; there is nothing to clean anyway.
        print ("\nEmpty dataframe: nothing to clean")
        return df

    print ("\nNot null rows before cleaning:")
    dropCols = []
    dfok = df
    # 'colName' (not 'col') so the imported pyspark 'col' function is not shadowed.
    for colName in df.columns:
        # Each count() is a full job: compute it once and reuse the value.
        okRows = df.select(colName).dropna().count()
        percOk = round((okRows / totalrows) * 100, 2)
        print (colName, "\t", okRows, "\t(", percOk, "%)")

        # Dropping columns with a lot of nulls
        if percOk < percDropping:
            dropCols.append(colName)
            dfok = dfok.drop(colName)

    # Dropping rows that still contain nulls
    dfok = dfok.dropna()
    finalrows = dfok.count()
    percOk = round((finalrows / totalrows) * 100 , 2)
    print ("\nDropped columns:", dropCols)
    print ("Not null rows after cleaning: " , finalrows, "\t(", percOk, "%)")
    print ("Final columns:", dfok.columns)

    return dfok

In [6]:
# Converting text result to numeric result

def resultToNumeric(df):
    """Add a numeric ``Label`` column derived from the text result in ``FTR``.

    Mapping: ``'H'`` -> 0.0, ``'D'`` -> 1.0, ``'A'`` -> 2.0 (cast to double).
    There is no fallback branch, so rows with any other ``FTR`` value are not
    given one of these codes. The first 3 rows of the result are shown.

    Args:
        df: DataFrame containing an ``FTR`` column.

    Returns:
        A new DataFrame with the extra ``Label`` column.
    """
    result = df['FTR']
    encoded = (
        when(result == 'H', 0)
        .when(result == 'D', 1)
        .when(result == 'A', 2)
    )
    dfok = df.withColumn("Label", encoded.cast("double"))
    dfok.show(3)

    return dfok

In [7]:
# Create Dataframe with label and features 

def createLabelAndFeatures(df, betsHousesCols):
    """Build the two-column (``features``, ``label``) DataFrame used by the model.

    The columns named in ``betsHousesCols`` are assembled into a single
    ``features`` vector column and the ``Label`` column of ``df`` is exposed
    as ``label``. The first 5 rows (untruncated) and the schema are printed.

    Args:
        df: DataFrame holding the ``betsHousesCols`` columns and ``Label``.
        betsHousesCols: Names of the columns to assemble into ``features``.

    Returns:
        DataFrame with exactly the columns ``features`` and ``label``.
    """
    assembler = VectorAssembler(inputCols=betsHousesCols, outputCol='features')

    vec_df = (
        assembler.transform(df)
        .withColumn("label", df["Label"])
        .select(['features', 'label'])
    )
    vec_df.show(5, False)
    vec_df.printSchema()

    return vec_df

In [8]:
# Defining, Training and Evaluating Naive Bayes Model

def calcNaiveBayesModel(df, betsHousesCols):
    """Train a multinomial Naive Bayes model and report its training accuracy.

    Steps: encode ``FTR`` as a numeric label, assemble the feature vector,
    fit the model, predict on the same training data and print the accuracy
    of those predictions. Progress and intermediate results are printed.

    Args:
        df: Training DataFrame with ``FTR`` and the ``betsHousesCols`` columns.
        betsHousesCols: Names of the columns used as features.

    Returns:
        The fitted Naive Bayes model.
    """
    # Text result -> numeric label
    print ("\n  - Converting result to numeric")
    labelled = resultToNumeric(df)

    # (features, label) frame the estimator is fitted on
    print ("\n  - Creating vector with label and features for training")
    vectrain = createLabelAndFeatures(labelled, betsHousesCols)

    # Fit the estimator and show the learned parameters
    print ("\n  - Training model")
    model = NaiveBayes(smoothing=1.0, modelType="multinomial").fit(vectrain)
    print("Pi   :", model.pi)
    print("Theta:", model.theta)

    # Predict on the training data itself to evaluate the fit
    print ("\n  - Calculating auto-prediction to avaluate model")
    predict_train = model.transform(vectrain)
    predict_train.show(5, False)

    # Accuracy of the predictions above
    print ("\n  - Evaluating model")
    evaluator = MulticlassClassificationEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName="accuracy",
    )
    accuracy = evaluator.evaluate(predict_train)
    print ("Accuracy: ", round(accuracy * 100, 2), "%")

    return model

In [12]:
# Calculating predictions with test dataset
def calcPrediction (model, df, betsHousesCols):
    """Apply a fitted model to ``df`` and print the accuracy of its predictions.

    ``df`` is labelled and vectorised the same way as the training data, the
    model is applied to it, and the row count, the first 5 predictions and
    the accuracy are printed.

    Args:
        model: Fitted model exposing ``transform``.
        df: DataFrame with ``FTR`` and the ``betsHousesCols`` columns.
        betsHousesCols: Names of the columns used as features.

    Returns:
        DataFrame produced by ``model.transform`` (features, label and the
        prediction columns added by the model).
    """
    # Text result -> numeric label
    print ("\n  - Converting result to numeric")
    labelled = resultToNumeric(df)

    # (features, label) frame for the dataset being predicted; the progress
    # message below is the same text used on the training path.
    print ("  - Creating vector with label and features for training")
    vectest = createLabelAndFeatures(labelled, betsHousesCols)

    # Run the model over the vectorised rows
    print ("  - Calculating prediction")
    predict = model.transform(vectest)
    print(predict.count())
    predict.show(5, False)

    # Accuracy of the predictions on this dataset
    print ("  - Evaluating model")
    evaluator = MulticlassClassificationEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName="accuracy",
    )
    accuracy = evaluator.evaluate(predict)
    print ("Accuracy: ", round(accuracy * 100, 2), "%")

    return predict

In [13]:
# Calculating Naive Bayes Model and Prediction

def calcModelAndPrediction(df, percDropping=80, splitWeights=(0.8, 0.2), seed=1):
    """Run the whole pipeline: clean, split, train and evaluate on the test split.

    The defaults reproduce the previously hard-coded settings (80 % column
    threshold, 0.8 / 0.2 train/test split, seed 1), so existing calls that
    pass only ``df`` behave as before.

    Args:
        df: Raw DataFrame with ``FTR`` and the bets-house quota columns.
        percDropping: Threshold percentage passed to ``cleanNulls``.
        splitWeights: Pair of weights (train, test) passed to ``randomSplit``.
        seed: Seed passed to ``randomSplit``.

    Returns:
        None. Results are reported through printed output only.
    """
    # Cleaning nulls in columns and rows
    print ("\n* Cleaning nulls in columns and rows")
    dfok = cleanNulls(df, percDropping)

    # Feature columns are chosen AFTER cleaning, so dropped columns are excluded
    print ("\n  - Calculating Bets Houses Cols")
    betsHousesCols =  calcBetsHousesCols(dfok, betsHouses)

    # Splitting training and testing datasets
    print ("\n* Spliting training and testing datasets")
    dftrain, dftest = dfok.randomSplit(list(splitWeights), seed=seed)
    print("  Train dataset:", dftrain.count(), "rows")
    print("  Test dataset: ", dftest.count(), "rows")

    # Defining, training and evaluating the Naive Bayes model
    print ("\n* Defining, Training and Avaluating Naive Bayes Model")
    model = calcNaiveBayesModel(dftrain, betsHousesCols)

    # Calculating predictions with the test dataset; the returned frame is
    # not needed here because calcPrediction already prints its evaluation.
    print ("\n* Calculating predictions with test dataset")
    calcPrediction(model, dftest, betsHousesCols)


In [14]:
# Dataset processed in this cell: the "recent" competitions file.
url = "../Data/Processed/main_competitions_recent.csv"

# Read the CSV (first line is the header, column types inferred) and print
# a quick overview: row count, column names, first row and schema.
print ("* Opening dataset file")
df = spark.read.csv(url, header=True, inferSchema=True)
print ("Rows   :", df.count())
print ("Columns:", df.columns)
print ("First row:")
print(df.take(1))
df.printSchema()

# Clean, split, train and evaluate on this dataset.
calcModelAndPrediction(df)

* Opening dataset file
Rows   : 11374
Columns: ['Country', 'Competition', 'Season', 'Div', 'Date', 'HomeTeam', 'AwayTeam', 'FTR', 'B365H', 'B365D', 'B365A', 'BWH', 'BWD', 'BWA', 'IWH', 'IWD', 'IWA', 'LBH', 'LBD', 'LBA', 'VCH', 'VCD', 'VCA', 'WHH', 'WHD', 'WHA']
First row:
[Row(Country='Belgium', Competition='JupilerLeague', Season='2017-2018', Div='B1', Date=datetime.datetime(2017, 7, 28, 0, 0), HomeTeam='Antwerp', AwayTeam='Anderlecht', FTR='D', B365H=5.75, B365D=3.8, B365A=1.6, BWH=5.0, BWD=4.0, BWA=1.67, IWH=4.7, IWD=3.6, IWA=1.7, LBH=5.2, LBD=3.7, LBA=1.61, VCH=5.5, VCD=3.9, VCA=1.62, WHH=4.75, WHD=3.75, WHA=1.67)]
root
 |-- Country: string (nullable = true)
 |-- Competition: string (nullable = true)
 |-- Season: string (nullable = true)
 |-- Div: string (nullable = true)
 |-- Date: timestamp (nullable = true)
 |-- HomeTeam: string (nullable = true)
 |-- AwayTeam: string (nullable = true)
 |-- FTR: string (nullable = true)
 |-- B365H: double (nullable = true)
 |-- B365D: double (nu

+--------------------------------------------------------------------+-----+
|features                                                            |label|
+--------------------------------------------------------------------+-----+
|[2.6,3.5,2.54,2.55,3.5,2.7,2.85,3.4,2.3,2.55,3.5,2.63,2.5,3.4,2.6]  |0.0  |
|[1.53,3.79,6.5,1.57,4.1,6.0,1.55,4.0,5.5,1.53,4.1,6.0,1.53,3.8,6.0] |1.0  |
|[3.3,3.25,2.2,3.2,3.25,2.35,2.9,3.3,2.3,3.13,3.25,2.3,3.2,3.2,2.2]  |2.0  |
|[1.75,3.75,4.33,1.83,3.7,4.25,1.85,3.5,3.9,1.87,3.6,4.0,1.8,3.5,4.2]|0.0  |
|[1.4,4.5,8.0,1.42,4.6,7.75,1.4,4.4,7.3,1.4,4.8,7.5,1.4,4.5,7.0]     |0.0  |
+--------------------------------------------------------------------+-----+
only showing top 5 rows

root
 |-- features: vector (nullable = true)
 |-- label: double (nullable = true)

  - Calculating prediction
2331
+--------------------------------------------------------------------+-----+-------------------------------------------------------------+-----------------------------

In [15]:
# Dataset processed in this cell: the full competitions file.
# Note: this rebinds the notebook-level names `url` and `df`.
url = "../Data/Processed/main_competitions.csv"

# Read the CSV (first line is the header, column types inferred) and print
# a quick overview: row count, column names, first row and schema.
print ("* Opening dataset file")
df = spark.read.csv(url, header=True, inferSchema=True)
print ("Rows   :", df.count())
print ("Columns:", df.columns)
print ("First row:")
print(df.take(1))
df.printSchema()

# Clean, split, train and evaluate on this dataset.
calcModelAndPrediction(df)

* Opening dataset file
Rows   : 106583
Columns: ['Country', 'Competition', 'Season', 'Div', 'Date', 'HomeTeam', 'AwayTeam', 'FTR', 'B365H', 'B365D', 'B365A', 'BSH', 'BSD', 'BSA', 'BWH', 'BWD', 'BWA', 'GBH', 'GBD', 'GBA', 'IWH', 'IWD', 'IWA', 'LBH', 'LBD', 'LBA', 'SBH', 'SBD', 'SBA', 'SJH', 'SJD', 'SJA', 'VCH', 'VCD', 'VCA', 'WHH', 'WHD', 'WHA']
First row:
[Row(Country='Belgium', Competition='JupilerLeague', Season='2003-2004', Div='B1', Date=datetime.datetime(2003, 8, 8, 0, 0), HomeTeam='Club Brugge', AwayTeam='Genk', FTR='H', B365H=1.4, B365D=3.75, B365A=7.0, BSH=None, BSD=None, BSA=None, BWH=None, BWD=None, BWA=None, GBH=1.4, GBD=3.8, GBA=6.85, IWH=1.45, IWD=3.8, IWA=5.4, LBH=None, LBD=None, LBA=None, SBH=1.44, SBD=3.75, SBA=6.5, SJH=None, SJD=None, SJA=None, VCH=None, VCD=None, VCA=None, WHH=None, WHD=None, WHA=None)]
root
 |-- Country: string (nullable = true)
 |-- Competition: string (nullable = true)
 |-- Season: string (nullable = true)
 |-- Div: string (nullable = true)
 |-- Da

Accuracy:  47.4 %

* Calculating predictions with test dataset

  - Converting result to numeric
+-------+-------------+---------+---+-------------------+----------+-----------+---+-----+-----+-----+---+---+----+----+---+---+----+----+---+----+----+---+----+----+----+-----+
|Country|  Competition|   Season|Div|               Date|  HomeTeam|   AwayTeam|FTR|B365H|B365D|B365A|BWH|BWD| BWA| IWH|IWD|IWA| LBH| LBD|LBA| VCH| VCD|VCA| WHH| WHD| WHA|Label|
+-------+-------------+---------+---+-------------------+----------+-----------+---+-----+-----+-----+---+---+----+----+---+---+----+----+---+----+----+---+----+----+----+-----+
|Belgium|JupilerLeague|2005-2006| B1|2005-01-10 00:00:00|   Waregem|  Roeselare|  H| 1.66|  3.4|  4.5|1.8|3.2|4.05|1.75|3.2|4.0|1.72| 3.4|4.0| 1.7| 3.6|4.0| 1.6| 3.5| 4.6|  0.0|
|Belgium|JupilerLeague|2005-2006| B1|2005-03-12 00:00:00|    Lierse|FC Brussels|  D|  2.5|  3.2|  2.5|2.5|3.2| 2.5| 2.6|3.0|2.4|2.38| 3.2|2.6|2.35|3.25|2.5|2.45|3.25|2.45|  1.0|
|Belgium|Jupi