In [1]:
import numpy as np
import math
import pandas
from pymongo import MongoClient
from datetime import date
from enum import Enum
from random import shuffle

In [2]:
# used as classifiers
class Result(Enum):
    HOME_WIN = 1
    DRAW = 2
    AWAY_WIN = 3
    
# used to calculate the form
class ResultScore(Enum):
    WIN = 3
    DRAW = 1
    LOSE = 0
    
number_of_last_games_form = 7

In [3]:
# connect to remote database
mongoClient = MongoClient("mongodb://app:123456@ds131511.mlab.com:31511/bettime")
db = mongoClient.bettime
# get premier league for now
matches = [m for m
           in db.Matches.find_one({ "leagueId": 39 })['season']['eventType'][0]['matches']
           if m['dataCertified']]
matches_count = len(matches)

In [4]:
############################################## points form feature ######################################################

#function: return a Result enum which represents the result of a givven game
def get_result(home_team, away_team):
    result = Result.DRAW
    if (home_team['isWinner']):
        result = Result.HOME_WIN
    elif (away_team['isWinner']):
        result = Result.AWAY_WIN
    return result;

#function:returnds the amount of points the team earned For the given result
def get_result_score(is_home, result):
    if (result == Result.DRAW):
        return ResultScore.DRAW
    else:
        if ((is_home and result == Result.HOME_WIN) or (not is_home and result == Result.AWAY_WIN)):
            return ResultScore.WIN
        else:
            return ResultScore.LOSE

# function: get the last 'n' matches of a given team before the was played before the time of the given match
def get_last_n_games_points(team, match, matches, n):
    # get home matches before game
    home_matches = [(get_result_score(True, get_result(m['teams'][0], m['teams'][1])), m)
                    for m
                    in matches
                    if team['teamId'] == m['teams'][0]['teamId']
                    and match['week'] > m['week']]
    # get away matches before game
    away_matches = [(get_result_score(False, get_result(m['teams'][0], m['teams'][1])), m)
                    for m
                    in matches
                    if team['teamId'] == m['teams'][1]['teamId']
                    and match['week'] > m['week']]
    # merge home and away and sort by week
    sorted_team_matches = sorted(home_matches + away_matches, key=lambda m: m[1]['week'], reverse=True)
    # get first matches relevant to to current form
    return sorted_team_matches[:n]

# function: calculating a score which represents the form for the given match of the given team 
def get_last_games_points_form(team, match, matches):
    # Get the last 'n' matches of the team
    last_games_for_form = get_last_n_games_points(team, match, matches, number_of_last_games_form)
    form_team_matches = [m[0].value for m in last_games_for_form]
     # Returning the team's earned point percenetage for representing its form
    return np.sum(form_team_matches) / (3 * len(form_team_matches)) if form_team_matches else 0

In [5]:
############################################## shots form feature ######################################################

#function:returnds the amount of points the team earned For the given result
def get_shots(team):
            return team['shots']



# function: get the last 'n' matches of a given team before the was played before the time of the given match
def get_last_n_games_shots_for(team, match, matches, n):
    # get home matches before game
    home_matches = [(get_shots(m['teams'][0]), m)
                    for m
                    in matches
                    if team['teamId'] == m['teams'][0]['teamId']
                    and match['week'] > m['week']]
    # get away matches before game
    away_matches = [(get_shots(m['teams'][1]), m)
                    for m
                    in matches
                    if team['teamId'] == m['teams'][1]['teamId']
                    and match['week'] > m['week']]
    # merge home and away and sort by week
    sorted_team_matches = sorted(home_matches + away_matches, key=lambda m: m[1]['week'], reverse=True)
    # get first matches relevant to to current form
    return sorted_team_matches[:n]

# function: calculating a score which represents the form for the given match of the given team 
def get_last_games_shots_for_form(team, match, matches):
    # Get the last 'n' matches of the team
    last_games_for_form = get_last_n_games_shots_for(team, match, matches, number_of_last_games_form)
    form_team_matches = [m[0] for m in last_games_for_form]
     # Returning the team's earned point percenetage for representing its form
    return np.sum(form_team_matches)

In [6]:
############################################## shots form feature ######################################################

#function:returnds the amount of points the team earned For the given result
def get_shots(team):
            return team['shots']



# function: get the last 'n' matches of a given team before the was played before the time of the given match
def get_last_n_games_shots_against(team, match, matches, n):
    # get home matches before game
    home_matches = [(get_shots(m['teams'][1]), m)
                    for m
                    in matches
                    if team['teamId'] == m['teams'][0]['teamId']
                    and match['week'] > m['week']]
    # get away matches before game
    away_matches = [(get_shots(m['teams'][0]), m)
                    for m
                    in matches
                    if team['teamId'] == m['teams'][1]['teamId']
                    and match['week'] > m['week']]
    # merge home and away and sort by week
    sorted_team_matches = sorted(home_matches + away_matches, key=lambda m: m[1]['week'], reverse=True)
    # get first matches relevant to to current form
    return sorted_team_matches[:n]

# function: calculating a score which represents the form for the given match of the given team 
def get_last_games_shots_against_form(team, match, matches):
    # Get the last 'n' matches of the team
    last_games_for_form = get_last_n_games_shots_against(team, match, matches, number_of_last_games_form)
    form_team_matches = [m[0] for m in last_games_for_form]
     # Returning the team's earned point percenetage for representing its form
    return np.sum(form_team_matches)

In [7]:
############################################## GF form feature #########################################################

#function:returnds the amount of points the team earned For the given result
def get_score(team):
            return team['score']



# function: get the last 'n' matches of a given team before the was played before the time of the given match
def get_last_n_games_GF(team, match, matches, n):
    # get home matches before game
    home_matches = [(get_score(m['teams'][0]), m)
                    for m
                    in matches
                    if team['teamId'] == m['teams'][0]['teamId']
                    and match['week'] > m['week']]
    # get away matches before game
    away_matches = [(get_score(m['teams'][1]), m)
                    for m
                    in matches
                    if team['teamId'] == m['teams'][1]['teamId']
                    and match['week'] > m['week']]
    # merge home and away and sort by week
    sorted_team_matches = sorted(home_matches + away_matches, key=lambda m: m[1]['week'], reverse=True)
    # get first matches relevant to to current form
    return sorted_team_matches[:n]

# function: calculating a score which represents the form for the given match of the given team 
def get_last_games_GF_form(team, match, matches):
    # Get the last 'n' matches of the team
    last_games_for_form = get_last_n_games_GF(team, match, matches, number_of_last_games_form)
    form_team_matches = [m[0] for m in last_games_for_form]
     # Returning the team's earned point percenetage for representing its form
    return np.sum(form_team_matches)

In [8]:
############################################## GA form feature #########################################################

#function:returnds the amount of points the team earned For the given result
def get_score(team):
            return team['score']



# function: get the last 'n' matches of a given team before the was played before the time of the given match
def get_last_n_games_GA(team, match, matches, n):
    # get home matches before game
    home_matches = [(get_score(m['teams'][1]), m)
                    for m
                    in matches
                    if team['teamId'] == m['teams'][0]['teamId']
                    and match['week'] > m['week']]
    # get away matches before game
    away_matches = [(get_score(m['teams'][0]), m)
                    for m
                    in matches
                    if team['teamId'] == m['teams'][1]['teamId']
                    and match['week'] > m['week']]
    # merge home and away and sort by week
    sorted_team_matches = sorted(home_matches + away_matches, key=lambda m: m[1]['week'], reverse=True)
    # get first matches relevant to to current form
    return sorted_team_matches[:n]

# function: calculating a score which represents the form for the given match of the given team 
def get_last_games_GA_form(team, match, matches):
    # Get the last 'n' matches of the team
    last_games_for_form = get_last_n_games_GA(team, match, matches, number_of_last_games_form)
    form_team_matches = [m[0] for m in last_games_for_form]
     # Returning the team's earned point percenetage for representing its form
    return np.sum(form_team_matches)

In [9]:
############################################## standing feature ########################################################

# function: 
def get_standing_score(team, match):
    number_of_games_played = team['record']['wins'] + team['record']['ties'] + team['record']['losses'];
    is_home = team['teamId'] == match['teams'][0]['teamId']
    match_result = get_result_score(is_home, get_result(match['teams'][0], match['teams'][1]))
    points = team['record']['points'] - match_result.value
    possible_points = (number_of_games_played - 1) * 3
    # check if points are above zero to avoid division by zero
    return (points / possible_points) if possible_points > 0 else 0

In [10]:
############################################## last meeting feature ####################################################

# function: 
def get_home_team_last_meeting(match, matches):
    matches = [get_result_score(False, get_result(m['teams'][0], m['teams'][1]))
               for m
               in matches
               if match['teams'][0]['teamId'] == m['teams'][1]['teamId']
               and match['teams'][1]['teamId'] == m['teams'][0]['teamId']
               and match['week'] > m['week']]
    
    return matches

In [11]:
xyz = []


# or each match in the matches vector:
for match in (m for m in matches if m['week'] > number_of_last_games_form):
    home_team = match['teams'][0]
    away_team = match['teams'][1]
    
    #home team feature
    home_team_feature = 3
    
    #away team feature
    away_team_feature = -3
    
    # points form feature
    home_team_points_form = get_last_games_points_form(home_team, match, matches)
    away_team_points_form = get_last_games_points_form(away_team, match, matches)
    form_points_feature = (home_team_points_form - away_team_points_form)
    
    # shots-for form feature
    home_team_shots_for_form = get_last_games_shots_for_form(home_team, match, matches)
    away_team_shots_for_form = get_last_games_shots_for_form(away_team, match, matches)
    form_shots_for_feature = (home_team_shots_for_form - away_team_shots_for_form)
    
    # shots-fagainst form feature
    home_team_shots_against_form = get_last_games_shots_against_form(home_team, match, matches)
    away_team_shots_against_form = get_last_games_shots_against_form(away_team, match, matches)
    form_shots_against_feature = (away_team_shots_against_form - home_team_shots_against_form)
    
    # GF form feature
    home_team_GF_form = get_last_games_GF_form(home_team, match, matches)
    away_team_GF_form = get_last_games_GF_form(away_team, match, matches)
    form_GF_feature = (home_team_GF_form - away_team_GF_form)
    
    # GA form feature
    home_team_GA_form = get_last_games_GA_form(home_team, match, matches)
    away_team_GA_form = get_last_games_GA_form(away_team, match, matches)
    form_GA_feature = (away_team_GA_form - home_team_GA_form)
    
    # standing feature
    home_team_standing_score = get_standing_score(home_team, match)
    away_team_standing_score = get_standing_score(away_team, match)
    standing_feature = (home_team_standing_score - away_team_standing_score)
    
    # last meeting feature
    home_team_last_meeting = get_home_team_last_meeting(match, matches)
    home_team_last_meeting_score = away_team_last_meeting_score = ResultScore.DRAW.value
    if (home_team_last_meeting and home_team_last_meeting[0] != ResultScore.DRAW.value):
        home_team_last_meeting_score = home_team_last_meeting[0].value
        away_team_last_meeting_score = abs(ResultScore.WIN.value - home_team_last_meeting_score)
    home_team_last_meeting_score = home_team_last_meeting_score / 3
    away_team_last_meeting_score = away_team_last_meeting_score / 3
    last_meeting_feature = (home_team_last_meeting_score - away_team_last_meeting_score)
        
    # Add the analyzed features from the current game to the features vector
    features = [form_points_feature, form_shots_for_feature, form_shots_against_feature, form_GF_feature, form_GA_feature,
         standing_feature, last_meeting_feature]
    
    # Add the result (= the classification) of the current game to the classifications vector
    classification = get_result(home_team, away_team).value
    
    # Add the label of the current game
    label = 'Week {}: {}-{} {}-{}'.format(match['week'], home_team['displayName'], away_team['displayName'],
        home_team['score'], away_team['score'])
    
    xyz.append((features, classification, label))
    
shuffle(xyz)
x, y, z = zip(*xyz)

In [12]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score

# The list of mechine learning classifier algrithems that we use to classify the result  of a given match
algorithms = [
    ('naive bayes', GaussianNB()),
    ('neural network', MLPClassifier()),
    ('svm', svm.SVC(decision_function_shape='ovo', kernel='rbf', probability=False)),
    ('extra trees', ExtraTreesClassifier()),
    ('linear model', SGDClassifier(max_iter=100, tol=None)),
    ('random forest', RandomForestClassifier()),
    ('descision tree', DecisionTreeClassifier())
]

# The amount of data used for training the machine
train_count = math.ceil(matches_count * 0.75)

# The train vectors and classifications arrays
train_vectors = x[:train_count]
train_classifications = y[:train_count]

# The test vectors and classifications arrays
test_vectors = x[train_count + 1:]
test_classifications = y[train_count + 1:]

# Setsthe data frame first row 
test_labels = z[train_count + 1:]
pred_mat = [test_classifications]
columns = test_labels
rows = ['Reality']

# Loop over each mechine learing algorithem in the list
for algorithm in algorithms:
     # Getting the current classifier algorithem
    clf = algorithm[1]
    # Training the current classifier with the training data
    clf.fit(train_vectors, train_classifications)
    # Classifying the result for each train vector
    dec = clf.predict(test_vectors) 
    # Filling a new row representing the current algprithm results
    pred_mat.append(dec)
    rows.append(algorithm[0] + " %.2f" % (accuracy_score(dec, test_classifications) * 100) + "%")
    df = pandas.DataFrame(np.array(pred_mat), columns=columns, index=rows)

In [13]:
df

Unnamed: 0,Week 35: Stoke City-West Ham United 0-0,Week 20: Stoke City-Watford 2-0,Week 22: Manchester City-Tottenham Hotspur 2-2,Week 8: Bournemouth-Hull City 6-1,Week 11: Manchester City-Middlesbrough 1-1,Week 21: Burnley-Southampton 1-0,Week 34: Middlesbrough-Sunderland 1-0,Week 16: West Bromwich Albion-Swansea City 3-1,Week 17: Manchester City-Arsenal 2-1,Week 12: West Bromwich Albion-Burnley 4-0,...,Week 13: Chelsea-Tottenham Hotspur 2-1,Week 32: Tottenham Hotspur-Watford 4-0,Week 14: Stoke City-Burnley 2-0,Week 34: Liverpool-Crystal Palace 1-2,Week 33: Watford-Swansea City 1-0,Week 12: Middlesbrough-Chelsea 0-1,Week 29: West Bromwich Albion-Arsenal 3-1,Week 36: Leicester City-Watford 3-0,Week 24: Hull City-Liverpool 2-0,Week 31: Chelsea-Manchester City 2-1
Reality,2,1,2,1,2,1,1,1,1,1,...,1,1,1,3,1,3,1,1,1,1
naive bayes 70.83%,3,1,3,1,1,1,1,1,1,1,...,1,1,1,1,1,3,3,1,3,1
neural network 58.33%,3,3,1,1,2,3,1,1,1,1,...,1,1,1,1,1,3,3,2,3,2
svm 70.83%,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
extra trees 62.50%,1,2,3,1,1,3,1,1,1,1,...,2,1,1,1,1,3,1,1,3,1
linear model 66.67%,3,1,3,1,1,3,1,1,1,1,...,1,1,1,1,1,3,3,1,3,1
random forest 58.33%,3,1,3,1,1,1,1,1,1,1,...,2,1,1,1,1,3,1,3,3,2
descision tree 50.00%,1,2,3,1,1,2,1,1,1,1,...,3,1,1,1,2,3,1,3,3,1
