In [7]:
import numpy as np
import pandas as pd
import sklearn.datasets as datasets
from sklearn.model_selection import train_test_split, TimeSeriesSplit, KFold
from sklearn import linear_model
from sklearn.metrics import f1_score, accuracy_score
from joblib import dump, load
from functools import reduce
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier

In [8]:
WINDOW_SIZE = 5
VAL_SIZE = 0.2

In [9]:
#add standing
data_2016_2017 = pd.read_csv('https://query.data.world/s/r5gdp7ga24tokmbwgk2fi7mekndsze')
data_2017_2018 = pd.read_csv('https://query.data.world/s/r3n4zit4rb4tqs4lnabptudresgnyr').drop(columns = ['Div'])
data_2018_2019 = pd.read_csv('https://query.data.world/s/2wp65h24cyeljfcp4dnvsml46dwsla').drop(columns = ['Div'])

data_more = pd.concat([pd.read_csv(f'../data/{path}', sep = ',') for path in ['2016-2017.txt', '2017-2018.txt', '2018-2019.txt']])
data_more['Date'] = pd.to_datetime(data_more['Date'])
data_more = data_more.sort_values('Date')

data = pd.concat([data_2016_2017, data_2017_2018, data_2018_2019])
data['Date'] = pd.to_datetime(data['Date'])
data = data.sort_values('Date')
data = data.merge(data_more[[
    'Date', 'HomeTeam', 'AwayTeam', 'B365H', 'B365D', 'B365A']])
data['Bet_pred'] = data[['B365A', 'B365D', 'B365H']].apply(lambda x: np.argmin(x.values), axis = 1)
data.index = range(len(data))

In [10]:
data

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A,Bet_pred
0,2016-01-10,Hull,Chelsea,0,2,A,0,0,D,A Taylor,...,5,7,2,2,0,0,7.50,4.50,1.50,0
1,2016-01-10,Swansea,Liverpool,1,2,A,1,0,H,M Oliver,...,3,10,2,2,0,0,8.00,4.75,1.45,0
2,2016-01-10,Watford,Bournemouth,2,2,D,0,1,A,M Dean,...,4,5,3,4,0,0,2.38,3.30,3.30,2
3,2016-01-10,West Ham,Middlesbrough,1,1,D,0,0,D,N Swarbrick,...,4,5,2,3,0,0,2.25,3.40,3.50,2
4,2016-01-10,Sunderland,West Brom,1,1,D,0,1,A,S Attwell,...,6,5,1,3,0,0,2.50,3.25,3.20,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1135,2019-12-05,Liverpool,Wolves,2,0,H,1,0,H,M Atkinson,...,4,1,0,2,0,0,1.30,6.00,11.00,2
1136,2019-12-05,Man United,Cardiff,0,2,A,0,1,A,J Moss,...,11,2,3,3,0,0,1.28,6.50,11.00,2
1137,2019-12-05,Southampton,Huddersfield,1,1,D,1,0,H,L Probert,...,4,3,0,1,0,0,1.44,4.75,8.50,2
1138,2019-12-05,Tottenham,Everton,2,2,D,1,0,H,A Marriner,...,7,4,0,2,0,0,2.20,3.50,3.50,2


fthg - Full Time Home goals

ftag - Full Time Away goals

ftr - Full Time result (Away, Home, Draw)

hthg - Halftime Home goals

htag - Halftime Away goals

htr - Halftime result (Away, Home, Draw)

referee - The jabroni in charge

hs - Home Shots

as - Away Shots

hst - Home Shots on target

ast - Away Shots on target

hf - Home Fouls

af - Away Fouls

hc - Home Corners

ac - Away Corners

hy - Home Yellow cards

ay - Away Yellow cards

hr - Home Red cards

ar - Away Red cards

In [1005]:
def prepare_dataset(df, window_size = WINDOW_SIZE):
    def get_stats(row, home = True):
        team_loc = 'HomeTeam' if home else 'AwayTeam'
        
        res_arr = np.empty((len(columns), window_size), dtype = 'object')
        res_arr[:] = np.nan
        
        res = (
            df[
                (df['Date'] < row['Date'])
                 & (df[team_loc] == row[team_loc])
            ]
            .iloc[-window_size:]
            [columns]
            .values
            .transpose()
        )
        
        if res.shape[1]:
            res_arr[:, :res.shape[1]] = res
        return res_arr.ravel()
    
    columns = [
            'FTR', 'FTHG', 'FTAG', 'HS', 'AS', 'HF',
            'AF', 'HC', 'AC', 'HY', 'AY', 'HR', 'AR'
        ]
    new_columns = list(reduce(lambda x, y: x+y, 
                         [[f'{col}_{i}' for i in range(window_size, 0, -1)] for col in columns]))
    new_columns_home = list(map(lambda x: x+'_home', new_columns))
    new_columns_away = list(map(lambda x: x+'_away', new_columns))
            
    df[new_columns_home] = np.vstack(df.apply(get_stats, axis = 1, home = True).values)
    df[new_columns_away] = np.vstack(df.apply(get_stats, axis = 1, home = False).values)
    
    df = df.drop(columns = [
        'Date', 'FTHG', 'FTAG', 'HS', 'AS', 'HF',
        'AF', 'HC', 'AC', 'HY', 'AY', 'HR', 'AR',
        'HTHG', 'HTAG', 'HTR', 'HST', 'AST'
    ])
    df = df.dropna()
    return df

In [1006]:
enc_dict = defaultdict(LabelEncoder)

In [1007]:
df = prepare_dataset(data)

df['HomeTeam'] = enc_dict['team'].fit_transform(df['HomeTeam'])
df['AwayTeam'] = enc_dict['team'].transform(df['AwayTeam'])
df['Referee'] = enc_dict['Referee'].fit_transform(df['Referee'])
df['FTR'] = enc_dict['FTR'].fit_transform(df['FTR'])
df[[f'FTR_{i}_home' for i in range(WINDOW_SIZE, 0, -1)]] = (
    df[[f'FTR_{i}_home' for i in range(WINDOW_SIZE, 0, -1)]]
    .apply(lambda x: enc_dict['FTR'].transform(x))
)
df[[f'FTR_{i}_away' for i in range(WINDOW_SIZE, 0, -1)]] = (
    df[[f'FTR_{i}_away' for i in range(WINDOW_SIZE, 0, -1)]]
    .apply(lambda x: enc_dict['FTR'].transform(x))
)

In [988]:
# train = df.iloc[:-int(df.shape[0]*VAL_SIZE)]
# val = df.iloc[-int(df.shape[0]*VAL_SIZE):]

# X_train, X_val = train.drop(columns = ['FTR']), val.drop(columns = ['FTR'])
# y_train, y_val = train['FTR'], val['FTR']

In [1050]:
skf = KFold(5)
df.index = range(len(df))
X = df.drop(columns = ['FTR'])
y = df['FTR']
scores = []
bet_scores = []

for train_indx, val_indx in skf.split(X, y):
    X_train, y_train = X.iloc[train_indx], y.iloc[train_indx]
    X_val, y_val = X.iloc[val_indx], y.iloc[val_indx]
    
    forest = RandomForestClassifier(n_estimators=300, max_depth=4, random_state=42)
    forest.fit(X_train, y_train)
    scores.append(accuracy_score(forest.predict(X_val), y_val))
    bet_scores.append(accuracy_score(X_val['Bet_pred'], y_val))

In [1051]:
np.array(scores).mean(), np.array(bet_scores).mean()

(0.5856410256410257, 0.5846153846153845)

In [1000]:
print('forest', accuracy_score(forest.predict(X_val), y_val))
print('lgbm', accuracy_score(gbm.predict(X_val.values), y_val))
print('random', accuracy_score(np.random.randint(0, 3, size= (val.shape[0])), y_val))
print('most frequent', accuracy_score(np.ones(val.shape[0])*2, y_val))
print('bet prediction', accuracy_score(X_val['Bet_pred'], y_val))
#5794

forest 0.5948717948717949
lgbm 0.5128205128205128
random 0.2717948717948718
most frequent 0.5128205128205128
bet prediction 0.5846153846153846


In [1053]:
from joblib import dump, load
dump(forest, 'forest.joblib')

['forest.joblib']

In [159]:
#will be using soccer data and predict outcome
#collect data and store it in sql database

In [161]:
# 2016-2017 https://query.data.world/s/r5gdp7ga24tokmbwgk2fi7mekndsze
# 2017-2018 https://query.data.world/s/r3n4zit4rb4tqs4lnabptudresgnyr
# 2018-2019 https://query.data.world/s/2wp65h24cyeljfcp4dnvsml46dwsla
#info about columns could be found in https://data.world/datasets/soccer

In [None]:
#https://www.premierleague.com/results?team=FIRST&co=1&se=363&cl=-1
#https://www.flashscore.ru/match/Ysr0GpAj/#match-summary/match-summary
#https://www.football-data.co.uk/englandm.php

In [1055]:
list(df.columns)

['HomeTeam',
 'AwayTeam',
 'FTR',
 'Referee',
 'B365H',
 'B365D',
 'B365A',
 'Bet_pred',
 'FTR_5_home',
 'FTR_4_home',
 'FTR_3_home',
 'FTR_2_home',
 'FTR_1_home',
 'FTHG_5_home',
 'FTHG_4_home',
 'FTHG_3_home',
 'FTHG_2_home',
 'FTHG_1_home',
 'FTAG_5_home',
 'FTAG_4_home',
 'FTAG_3_home',
 'FTAG_2_home',
 'FTAG_1_home',
 'HS_5_home',
 'HS_4_home',
 'HS_3_home',
 'HS_2_home',
 'HS_1_home',
 'AS_5_home',
 'AS_4_home',
 'AS_3_home',
 'AS_2_home',
 'AS_1_home',
 'HF_5_home',
 'HF_4_home',
 'HF_3_home',
 'HF_2_home',
 'HF_1_home',
 'AF_5_home',
 'AF_4_home',
 'AF_3_home',
 'AF_2_home',
 'AF_1_home',
 'HC_5_home',
 'HC_4_home',
 'HC_3_home',
 'HC_2_home',
 'HC_1_home',
 'AC_5_home',
 'AC_4_home',
 'AC_3_home',
 'AC_2_home',
 'AC_1_home',
 'HY_5_home',
 'HY_4_home',
 'HY_3_home',
 'HY_2_home',
 'HY_1_home',
 'AY_5_home',
 'AY_4_home',
 'AY_3_home',
 'AY_2_home',
 'AY_1_home',
 'HR_5_home',
 'HR_4_home',
 'HR_3_home',
 'HR_2_home',
 'HR_1_home',
 'AR_5_home',
 'AR_4_home',
 'AR_3_home',
 'AR