In [1]:
import numpy as np
import math
import pandas
from pymongo import MongoClient
from datetime import date
from enum import Enum

In [2]:
# used as classifiers
class Result(Enum):
    HOME_WIN = 1
    DRAW = 2
    AWAY_WIN = 3
    
# used to calculate the form
class ResultScore(Enum):
    WIN = 3
    DRAW = 1
    LOSE = 0
    
number_of_last_games_form = 5

In [3]:
# connect to remote database
mongoClient = MongoClient("mongodb://app:123456@ds131511.mlab.com:31511/bettime")
db = mongoClient.bettime
# get premier league for now
matches = [m for m
           in db.Matches.find_one({ "leagueId": 39 })['season']['eventType'][0]['matches']
           if m['dataCertified']]
matches_count = len(matches)

In [4]:
def get_result(home_team, away_team):
    result = Result.DRAW
    if (home_team['isWinner']):
        result = Result.HOME_WIN
    elif (away_team['isWinner']):
        result = Result.AWAY_WIN
    return result;

def get_result_score(is_home, result):
    if (result == Result.DRAW):
        return ResultScore.DRAW
    else:
        if ((is_home and result == Result.HOME_WIN) or (not is_home and result == Result.AWAY_WIN)):
            return ResultScore.WIN
        else:
            return ResultScore.LOSE

def get_last_n_games(team, match, matches, n):
    # get home matches before game
    home_matches = [(get_result_score(True, get_result(m['teams'][0], m['teams'][1])), m)
                    for m
                    in matches
                    if team['teamId'] == m['teams'][0]['teamId']
                    and match['week'] > m['week']]
    # get away matches before game
    away_matches = [(get_result_score(False, get_result(m['teams'][0], m['teams'][1])), m)
                    for m
                    in matches
                    if team['teamId'] == m['teams'][1]['teamId']
                    and match['week'] > m['week']]
    team_matches = home_matches + away_matches
    # merge home and away and sort by week
    sorted_team_matches = sorted(home_matches + away_matches, key=lambda m: m[1]['week'], reverse=True)
    # get first matches relevant to to current form
    return sorted_team_matches[:n]
        
def get_last_games_form(team, match, matches):
    last_games_for_form = get_last_n_games(team, match, matches, number_of_last_games_form)
    form_team_matches = [m[0].value for m in last_games_for_form]
    return np.sum(form_team_matches) / (3 * len(form_team_matches)) if form_team_matches else 0

def get_standing_score(team, match):
    number_of_games_played = team['record']['wins'] + team['record']['ties'] + team['record']['losses'];
    is_home = team['teamId'] == match['teams'][0]['teamId']
    match_result = get_result_score(is_home, get_result(match['teams'][0], match['teams'][1]))
    points = team['record']['points'] - match_result.value
    possible_points = (number_of_games_played - 1) * 3
    # check if points are above zero to avoid division by zero
    return (points / possible_points) if possible_points > 0 else 0

def get_home_team_last_meeting(match, matches):
    matches = [get_result_score(False, get_result(m['teams'][0], m['teams'][1]))
               for m
               in matches
               if match['teams'][0]['teamId'] == m['teams'][1]['teamId']
               and match['teams'][1]['teamId'] == m['teams'][0]['teamId']
               and match['week'] > m['week']]
    
    return matches

In [5]:
x = [];
y = [];
z = [];

for match in (m for m in matches if m['week'] > 5):
    home_team = match['teams'][0]
    away_team = match['teams'][1]
    
    # form
    home_team_form = get_last_games_form(home_team, match, matches)
    away_team_form = get_last_games_form(away_team, match, matches)
    
    # standing
    home_team_standing_score = get_standing_score(home_team, match)
    away_team_standing_score = get_standing_score(away_team, match)
    
    # last meeting
    home_team_last_meeting = get_home_team_last_meeting(match, matches)
    home_team_last_meeting_score = away_team_last_meeting_score = ResultScore.DRAW.value
    if (home_team_last_meeting and home_team_last_meeting[0] != ResultScore.DRAW.value):
        home_team_last_meeting_score = home_team_last_meeting[0].value
        away_team_last_meeting_score = abs(ResultScore.WIN.value - home_team_last_meeting_score)
    home_team_last_meeting_score = home_team_last_meeting_score / 3
    away_team_last_meeting_score = away_team_last_meeting_score / 3
    
    z.append('Week ' + str(match['week']) + ': ' + home_team['displayName'] + '-' + away_team['displayName'] +
            ' ' + str(home_team['score']) + '-' + str(away_team['score']))
    x.append([home_team_form,home_team_standing_score,home_team_last_meeting_score,1,
              away_team_form,away_team_standing_score,away_team_last_meeting_score,0])
    y.append(get_result(home_team, away_team).value)

In [6]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score

algorithms = [
    ('SVM', svm.SVC()),
    ('Random Forest', RandomForestClassifier()),
    ('Extra Trees', ExtraTreesClassifier()),
    ('Descision Tree', DecisionTreeClassifier()),
    ('Neural Network', MLPClassifier(max_iter=500)),
    ('Naive Bayes', GaussianNB()),
    ('Linear Model', SGDClassifier(max_iter=500, tol=None))
]

train_count = math.ceil(matches_count * 0.75)
train_vectors = x[:train_count]
train_classifications = y[:train_count]
test_vectors = x[train_count + 1:]
test_classifications = y[train_count + 1:]
test_labels = z[train_count + 1:]

pred_mat = [test_classifications]
columns = test_labels
rows = ['Reality']

for algorithm in algorithms:
    clf = algorithm[1]
    clf.fit(train_vectors, train_classifications)
    dec = clf.predict(test_vectors)
    pred_mat.append(dec)
    rows.append(algorithm[0] + " %.2f" % (accuracy_score(dec, test_classifications) * 100) + "%")
                
df = pandas.DataFrame(np.array(pred_mat), columns=columns, index=rows)

In [7]:
df

Unnamed: 0,Week 35: Sunderland-Bournemouth 0-1,Week 35: Southampton-Hull City 0-0,Week 35: Crystal Palace-Burnley 0-2,Week 35: Manchester United-Swansea City 1-1,Week 35: Everton-Chelsea 0-3,Week 35: Middlesbrough-Manchester City 2-2,Week 35: Tottenham Hotspur-Arsenal 2-0,Week 35: Watford-Liverpool 0-1,Week 36: West Ham United-Tottenham Hotspur 1-0,Week 36: Manchester City-Crystal Palace 5-0,...,Week 38: Arsenal-Everton 3-1,Week 38: Chelsea-Sunderland 5-1,Week 38: Manchester United-Crystal Palace 2-0,Week 38: Burnley-West Ham United 1-2,Week 38: Hull City-Tottenham Hotspur 1-7,Week 38: Swansea City-West Bromwich Albion 2-1,Week 38: Southampton-Stoke City 0-1,Week 38: Watford-Manchester City 0-5,Week 38: Leicester City-Bournemouth 1-1,Week 38: Liverpool-Middlesbrough 3-0
Reality,3,2,3,2,3,2,1,3,1,1,...,1,1,1,3,3,1,3,3,2,1
SVM 43.18%,1,1,1,1,1,1,1,1,3,1,...,1,1,1,1,1,1,1,1,1,1
Random Forest 56.82%,2,1,1,2,2,1,2,3,3,1,...,1,3,1,1,3,1,1,3,1,1
Extra Trees 52.27%,1,1,1,1,1,3,1,3,3,1,...,2,1,1,1,3,1,1,3,1,2
Descision Tree 52.27%,2,2,1,1,1,3,1,3,3,1,...,2,3,1,1,3,1,1,3,2,1
Neural Network 56.82%,1,1,1,1,1,3,1,3,3,1,...,1,1,1,1,3,1,1,3,1,1
Naive Bayes 59.09%,1,1,1,1,3,3,1,3,3,1,...,1,1,1,1,3,1,1,3,3,1
Linear Model 59.09%,3,1,1,1,3,3,3,3,3,1,...,1,1,1,1,3,3,1,3,3,1
