In [27]:
import sqlite3
import pandas as pd
import datetime

connection = sqlite3.connect('../database.sqlite')
query_train = \
                'select match_api_id, home_team_api_id, away_team_api_id, home_team_goal, away_team_goal, \
                homeWinOddsAvg, drawOddsAvg, awayWinOddsAvg, date  \
                from Match limit 10000;'
train = pd.read_sql(query_train, connection, index_col = 'match_api_id', parse_dates=['date']);

query_test = 'select match_api_id, home_team_api_id, away_team_api_id, home_team_goal, away_team_goal, \
              homeWinOddsAvg, drawOddsAvg, awayWinOddsAvg, date \
              from Match where id between 100 and 150;'
test = pd.read_sql(query_test, connection, index_col = 'match_api_id', parse_dates=['date']);
train.head(10)


Unnamed: 0_level_0,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,homeWinOddsAvg,drawOddsAvg,awayWinOddsAvg,date
match_api_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
492473,9987,9993,1,1,1.77,3.32,4.16,2008-08-17
492474,10000,9994,0,0,1.9,3.25,3.62,2008-08-16
492475,9984,8635,0,3,2.46,3.23,2.54,2008-08-16
492476,9991,9998,5,0,1.44,3.81,6.53,2008-08-17
492477,7947,9985,1,3,4.49,3.42,1.67,2008-08-16
492478,8203,8342,1,1,4.61,3.4,1.69,2008-09-24
492479,9999,8571,2,2,1.96,3.27,3.4,2008-08-16
492480,4049,9996,1,2,2.71,3.27,2.32,2008-08-16
492481,10001,9986,1,0,2.22,3.23,2.84,2008-08-16
492564,8342,8571,4,1,1.28,4.67,9.11,2008-11-01


In [28]:
import pandas as pd
def teamAttributes(team_api_id, matchDate):
    """Helper method that takes team_api_id and match date as argument and
    returns the attributes of that team closest to that date"""
    query = 'select date, buildUpPlaySpeed, buildUpPlayPassing, \
            chanceCreationPassing, chanceCreationCrossing, chanceCreationShooting, \
            defencePressure, defenceAggression, defenceTeamWidth \
            from Team_Attributes where team_api_id = %s order by date;' % team_api_id
    attributes = pd.read_sql(query, connection, parse_dates=['date'])
    if (not attributes.empty):
        closestAttributes = None
        minDifference = None
        for index, row in attributes.iterrows():
            date = row['date']
            difference = abs(matchDate - date)
            if(minDifference is None or difference < minDifference):
                minDifference = difference
                closestAttributes = row
        return closestAttributes.drop('date')
    else:
        return None
    
# print('Example output from teamAttributes:\n')
# teamAttributes(9993, pd.to_datetime('2008-08-16'))

In [29]:
def differenceVector (row):
    """It takes in as an argument, a row from Match table, and returns the difference
    of important attributes with reference to Home Team"""
    date = row['date']
    homeGoal = row['home_team_goal']
    homeAttributes = teamAttributes(row['home_team_api_id'], date)
    
    awayGoal = row['away_team_goal']
    awayAttributes = teamAttributes(row['away_team_api_id'], date)
    
    if(homeAttributes is not None and awayAttributes is not None):
        differenceVector = homeAttributes - awayAttributes
        
        if(homeGoal > awayGoal):
            result = 2
        elif(awayGoal > homeGoal):
            result = 0
        else:
            result = 1
        
        results = pd.Series({
                'home_team_api_id':row['home_team_api_id'],
                'away_team_api_id':row['away_team_api_id'],
                'homeWinOddsAvg': row['homeWinOddsAvg'],
                'drawOddsAvg':row['drawOddsAvg'],
                'awayWinOddsAvg':row['awayWinOddsAvg'],
                'outcome': result
            })
        return results.append([differenceVector])
    else:
        return
    
testSet = test.apply(differenceVector, axis = 1).dropna(axis=0, how='all')

results = train.apply(differenceVector, axis = 1).dropna(axis=0, how='all')
connection.close()
results.head(10)

Unnamed: 0_level_0,awayWinOddsAvg,away_team_api_id,drawOddsAvg,homeWinOddsAvg,home_team_api_id,outcome,buildUpPlaySpeed,buildUpPlayPassing,chanceCreationPassing,chanceCreationCrossing,chanceCreationShooting,defencePressure,defenceAggression,defenceTeamWidth
match_api_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
492473,4.16,9993.0,3.32,1.77,9987.0,1.0,10.0,10.0,5.0,-5.0,10.0,0.0,-5.0,0.0
492474,3.62,9994.0,3.25,1.9,10000.0,1.0,5.0,0.0,10.0,0.0,0.0,5.0,-5.0,0.0
492475,2.54,8635.0,3.23,2.46,9984.0,0.0,-5.0,0.0,0.0,-5.0,-5.0,-5.0,10.0,0.0
492476,6.53,9998.0,3.81,1.44,9991.0,2.0,20.0,15.0,5.0,2.0,14.0,-1.0,5.0,-7.0
492478,1.69,8342.0,3.4,4.61,8203.0,1.0,30.0,20.0,5.0,-10.0,5.0,0.0,0.0,-10.0
492479,3.4,8571.0,3.27,1.96,9999.0,1.0,15.0,0.0,0.0,-15.0,-10.0,5.0,10.0,5.0
492481,2.84,9986.0,3.23,2.22,10001.0,2.0,5.0,10.0,-5.0,2.0,-5.0,0.0,0.0,0.0
492564,9.11,8571.0,4.67,1.28,8342.0,2.0,-15.0,-20.0,-5.0,0.0,-5.0,0.0,10.0,5.0
492565,9.28,9986.0,4.75,1.27,9985.0,0.0,13.0,-10.0,10.0,12.0,5.0,0.0,0.0,-5.0
492566,2.43,9991.0,3.21,2.57,10000.0,0.0,-5.0,-5.0,0.0,-10.0,-10.0,25.0,10.0,30.0


In [26]:
from sklearn.tree import DecisionTreeClassifier

features = [
    'homeWinOddsAvg',
    'drawOddsAvg',
    'awayWinOddsAvg',
    'buildUpPlaySpeed',
    'buildUpPlayPassing',
    'chanceCreationPassing',
    'chanceCreationCrossing',
    'chanceCreationShooting',
    'defencePressure',
    'defenceAggression',
    'defenceTeamWidth'
]

x = results[features]
y = results['outcome']

dt = DecisionTreeClassifier()
dt = dt.fit(x,y)

test = testSet[features]
dt.predict(test)

array([ 0.,  2.,  1.,  2.,  2.,  2.,  1.,  0.,  2.,  2.,  0.,  2.,  0.,
        2.,  2.,  2.,  1.,  0.,  2.,  2.,  2.,  1.,  2.,  2.,  0.,  2.,
        0.,  1.,  0.,  2.,  0.,  2.,  0.,  0.,  0.,  1.,  2.,  0.,  2.,
        0.,  2.,  2.,  2.,  1.,  0.,  2.,  2.,  2.,  1.,  1.])