In [1]:
from sklearn import svm 
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sqlite3
import pandas as pd
%matplotlib inline

In [2]:
# Create a SQL connection to our SQLite database
# NOTE: sqlite3.connect() creates an empty file when the path does not exist,
# in which case the queries below fail with "no such table".
con = sqlite3.connect("soccerdata.sqlite")
try:
    match = pd.read_sql_query("SELECT * FROM Match", con)
    team = pd.read_sql_query("SELECT * FROM Team", con)
finally:
    # Be sure to close the connection, even when one of the queries raises.
    con.close()

In [3]:
#match data
# NOTE(review): 1729 is presumably the English Premier League id - confirm against the League table
LEAGUE_ID = 1729

# Filter by league and keep the columns at positions 3-10 (season ... away goals).
# .copy() gives an independent frame, so the column assignments below do not
# raise SettingWithCopyWarning on a slice of the raw table.
match = match.loc[match['league_id'] == LEAGUE_ID].iloc[:, 3:11].copy()

# 1-based row labels; later cells address rows with .loc[1..n]
match.index = range(1, len(match) + 1)

# Placeholder value 0, stored as object dtype so that the "Win"/"Tie"/"Lose"
# strings written later fit into the column without a dtype change.
match["Outcome"] = pd.Series(0, index=match.index, dtype=object)

#rename columns
match.columns = ["Season", "Stage", "Date", "Match_ID", "Home_Team_ID", "Away_Team_ID", "Home_Goals", "Away_Goals", "Outcome"]

In [4]:
#team data
# Use the explicit `columns=` keyword: passing the axis positionally
# (team.drop(name, 1)) is no longer accepted by pandas 2.x.
team = team.drop(columns="team_fifa_api_id")

#show the remaining team columns
print(team)

        id  team_api_id           team_long_name team_short_name
0        1         9987                 KRC Genk             GEN
1        2         9993             Beerschot AC             BAC
2        3        10000         SV Zulte-Waregem             ZUL
3        4         9994         Sporting Lokeren             LOK
4        5         9984        KSV Cercle Brugge             CEB
5        6         8635           RSC Anderlecht             AND
6        7         9991                 KAA Gent             GEN
7        8         9998                RAEC Mons             MON
8        9         7947            FCV Dender EH             DEN
9       10         9985        Standard de Liège             STL
10      11         8203              KV Mechelen             MEC
11      12         8342           Club Brugge KV             CLB
12      13         9999            KSV Roeselare             ROS
13      14         8571              KV Kortrijk             KOR
14      15         4049  

In [5]:
#create a class variable for "Win", "Lose", "Tie" - WRT home team
# Vectorised labelling: one boolean mask per result instead of a Python loop
# over row labels, so it no longer depends on the index being exactly 1..n.
goal_diff = match["Home_Goals"] - match["Away_Goals"]

# The column starts out holding the placeholder 0; make sure it can store
# strings before the labels are written.
match["Outcome"] = match["Outcome"].astype(object)

# Rows where a goal count is missing match none of the masks and keep their
# previous value, exactly as in the row-by-row version.
match.loc[goal_diff > 0, "Outcome"] = "Win"
match.loc[goal_diff == 0, "Outcome"] = "Tie"
match.loc[goal_diff < 0, "Outcome"] = "Lose"

In [6]:
# Preview the first rows to check the renamed columns and the Outcome labels
match.head()

Unnamed: 0,Season,Stage,Date,Match_ID,Home_Team_ID,Away_Team_ID,Home_Goals,Away_Goals,Outcome
1,2008/2009,1,2008-08-17 00:00:00,489042,10260,10261,1,1,Tie
2,2008/2009,1,2008-08-16 00:00:00,489043,9825,8659,1,0,Win
3,2008/2009,1,2008-08-16 00:00:00,489044,8472,8650,0,1,Lose
4,2008/2009,1,2008-08-16 00:00:00,489045,8654,8528,2,1,Win
5,2008/2009,1,2008-08-17 00:00:00,489046,10252,8456,4,2,Win


In [9]:
def getLastXMatchesResults_Home(date, team, x = 10):
    """Win rate of `team` over its most recent home matches before `date`.

    Reads the module-level `match` DataFrame (columns `Home_Team_ID`, `Date`
    and `Outcome`, where `Outcome` is expressed from the home side's view).

    Parameters
    ----------
    date : value comparable with the `Date` column
        Only matches strictly earlier than this are considered.
    team : team id compared against `Home_Team_ID`.
    x : int, default 10
        Maximum number of most recent home matches to look at.

    Returns
    -------
    float
        Number of wins among those matches divided by `x`. The denominator is
        `x` even when fewer than `x` earlier matches exist, so a team with a
        short history gets a proportionally lower value. 0.0 when `x <= 0`.
    """
    if x <= 0:
        # Nothing to look at; also avoids dividing by zero.
        return 0.0
    #Filter team matches from matches
    teamMatches = match[(match['Home_Team_ID'] == team)]
    #Filter x last matches from team matches
    xMatches = teamMatches[teamMatches.Date < date].sort_values(by = 'Date', ascending = False).iloc[0:x,:]
    #Calculate win percentage: sum the boolean mask (len() would count every row)
    won = float((xMatches['Outcome'] == "Win").sum()) / x
    return won

def getLastXMatchesResults_Away(date, team, x = 10):
    """Win rate of `team` over its most recent away matches before `date`.

    Reads the module-level `match` DataFrame (columns `Away_Team_ID`, `Date`
    and `Outcome`). `Outcome` is recorded from the home side's view, so a
    match the away team won is labelled "Lose".

    Parameters
    ----------
    date : value comparable with the `Date` column
        Only matches strictly earlier than this are considered.
    team : team id compared against `Away_Team_ID`.
    x : int, default 10
        Maximum number of most recent away matches to look at.

    Returns
    -------
    float
        Number of away wins among those matches divided by `x`. The
        denominator is `x` even when fewer than `x` earlier matches exist.
        0.0 when `x <= 0`.
    """
    if x <= 0:
        # Nothing to look at; also avoids dividing by zero.
        return 0.0
    #Filter team matches from matches
    teamMatches = match[(match['Away_Team_ID'] == team)]
    #Filter x last matches from team matches
    xMatches = teamMatches[teamMatches.Date < date].sort_values(by = 'Date', ascending = False).iloc[0:x,:]
    #Calculate win percentage: the away side wins when the home side "Lose"s;
    #sum the boolean mask (len() would count every row)
    won = float((xMatches['Outcome'] == "Lose").sum()) / x
    return won
    
def getLastXMatchesResultsAgainstEachOther(date, home_team, away_team, x = 10):
    """Head-to-head home win rates of two teams before `date`.

    Reads the module-level `match` DataFrame (columns `Home_Team_ID`,
    `Away_Team_ID`, `Date` and `Outcome`, where `Outcome` is expressed from
    the home side's view).

    Parameters
    ----------
    date : value comparable with the `Date` column
        Only matches strictly earlier than this are considered.
    home_team, away_team : team ids of the two opponents.
    x : int, default 10
        Maximum number of most recent matches considered per venue.

    Returns
    -------
    numpy.ndarray of shape (2,)
        `[homeWin, awayWin]` where
        - homeWin: wins of `home_team` in its last `x` matches at home
          against `away_team`, divided by `x`;
        - awayWin: wins of `away_team` in its last `x` matches at home
          against `home_team`, divided by `x`.
        The denominator is `x` even when fewer earlier meetings exist.
        Both values are 0.0 when `x <= 0`.
    """
    if x <= 0:
        # Nothing to look at; also avoids dividing by zero.
        return np.array([0.0, 0.0])

    #Find matches of both teams
    homeMatches = match[(match['Home_Team_ID'] == home_team) & (match['Away_Team_ID'] == away_team)]
    awayMatches = match[(match['Home_Team_ID'] == away_team) & (match['Away_Team_ID'] == home_team)]

    # No try/except is needed: .iloc[0:x] simply returns fewer rows when the
    # history is short, and a bare except would only hide genuine errors.

    #Get last x matches - Home
    xMatches = homeMatches[homeMatches.Date < date].sort_values(by = 'Date', ascending = False).iloc[0:x,:]
    # sum the boolean mask (len() would count every row, not just the wins)
    homeWin = float((xMatches['Outcome'] == "Win").sum()) / x

    #Get last x matches - Away
    xMatches = awayMatches[awayMatches.Date < date].sort_values(by = 'Date', ascending = False).iloc[0:x,:]
    # away_team is the home side in these rows, so "Win" is a win for away_team
    awayWin = float((xMatches['Outcome'] == "Win").sum()) / x

    #Return data
    return np.array([homeWin, awayWin])


In [10]:
#create columns for last X matches
# The features are computed BEFORE the train/validation split: slicing first
# would leave `train` and `validation` without these columns, and the model
# cells below would fail with a KeyError.
feature_columns = [
    "HomeTeamWins_Home",   # home team's recent record in its home matches
    "HomeTeamWins_Away",   # home team's recent record in its away matches
    "AwayTeamWins_Home",   # away team's recent record in its home matches
    "AwayTeamWins_Away",   # away team's recent record in its away matches
    "HomeVsAway_Home",     # head-to-head value, first element of the helper's result
    "HomeVsAway_Away",     # head-to-head value, second element of the helper's result
]

# Collect one feature row per match and attach them in a single step; this
# avoids per-cell .loc writes and does not depend on the index being 1..n.
# The helpers only read Date / team ids / Outcome from `match`, so computing
# everything before assigning gives the same values as updating row by row.
feature_rows = []
for date, home_id, away_id in zip(match["Date"], match["Home_Team_ID"], match["Away_Team_ID"]):
    head_to_head = getLastXMatchesResultsAgainstEachOther(date, home_id, away_id)
    feature_rows.append([
        getLastXMatchesResults_Home(date, home_id),
        getLastXMatchesResults_Away(date, home_id),
        getLastXMatchesResults_Home(date, away_id),
        getLastXMatchesResults_Away(date, away_id),
        head_to_head[0],
        head_to_head[1],
    ])

features = pd.DataFrame(feature_rows, columns=feature_columns, index=match.index)
for column in feature_columns:
    match[column] = features[column]

#create training set and validation set
#season 2008-09 to 2014-15 will be the training data
#we will predict the outcome of the matches in 2015-16
# .copy() makes both sets independent frames, so adding columns to them later
# (e.g. predictions) does not touch `match` or raise SettingWithCopyWarning.
is_validation_season = match["Season"] == "2015/2016"
train = match.loc[~is_validation_season].copy()
validation = match.loc[is_validation_season].copy()

In [None]:
# Train a support vector classifier (degree-2 polynomial kernel, C=10) on the
# two team ids and the six features derived from previous matches; the target
# is the home-side Outcome label.
clf = svm.SVC(C=10, degree=2, kernel='poly')
clf.fit(
    train[[
        "Home_Team_ID",
        "Away_Team_ID",
        "HomeTeamWins_Home",
        "HomeTeamWins_Away",
        "AwayTeamWins_Home",
        "AwayTeamWins_Away",
        "HomeVsAway_Home",
        "HomeVsAway_Away",
    ]],
    train['Outcome'],
)

In [None]:
#predict
# assign() builds a new frame with the extra column instead of writing into
# `validation` in place; `validation` may be a slice of `match`, and an
# in-place column write on a slice raises SettingWithCopyWarning.
validation = validation.assign(
    prediction=clf.predict(
        validation[[
            "Home_Team_ID",
            "Away_Team_ID",
            "HomeTeamWins_Home",
            "HomeTeamWins_Away",
            "AwayTeamWins_Home",
            "AwayTeamWins_Away",
            "HomeVsAway_Home",
            "HomeVsAway_Away",
        ]]
    )
)