In [1]:
import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
import plotly as ply
import os
import re
import requests
import json
from IPython.core.display import HTML

# Load Shot Data

In [2]:
shot_data = pd.read_csv('data/fantasy-league/shot_data.csv', index_col=0)
shot_data['Timestamp'] = pd.to_datetime(shot_data['Timestamp'], format='%Y-%m-%d') 

In [3]:
shot_data = shot_data.loc[shot_data['Timestamp'] < pd.to_datetime('2021-01-04', format='%Y-%m-%d')]

In [4]:
shot_data.head()

Unnamed: 0,Timestamp,Score,Player Advantage,Minute,Player,Squad,Against,Outcome,Distance,Body Part,Notes,SCA 1 Player,SCA 1 Event,SCA 2 Player,SCA 2 Event,Threat
0,2016-08-13 12:30:00,0.0,0.0,147,Riyad Mahrez,Leicester City,Hull City,Goal,,,Penalty Kick,Goal,,,,720.0
1,2016-08-13 17:30:00,1.0,0.0,4,Sergio Agüero,Manchester City,Sunderland,Goal,,,Penalty Kick,—,,,,720.0
2,2016-08-15 20:00:00,1.0,0.0,147,Eden Hazard,Chelsea,West Ham United,Goal,,,Penalty Kick,Yellow Card,,,,627.0
3,2016-08-19 20:00:00,2.0,0.0,152,Zlatan Ibrahimović,Manchester United,Southampton,Goal,,,Penalty Kick,Goal,,,,627.0
4,2016-08-20 12:30:00,1.0,0.0,27,Sergio Agüero,Manchester City,Stoke City,Goal,,,Penalty Kick,Yellow Card,,,,


In [5]:
teams = {'Arsenal':'Arsenal',
 'Aston Villa':'Aston Villa',
 'Bournemouth':'Bournemouth',
 'Brighton':'Brighton',
 'Brighton & Hove Albion':'Brighton',
 'Burnley':'Burnley',
 'Cardiff City':'Cardiff',
 'Chelsea':'Chelsea',
 'Crystal Palace':'Crystal Palace',
 'Everton':'Everton',
 'Fulham':'Fulham',
 'Huddersfield':'Huddersfield',
 'Huddersfield Town':'Huddersfield',
 'Hull City':'Hull',
 'Leeds United':'Leeds',
 'Leicester City':'Leicester',
 'Liverpool':'Liverpool',
 'Manchester City':'Man City',
 'Manchester United':'Man United',
 'Manchester Utd':'Man United',
 'Middlesbrough':'Middlesbrough',
 'Newcastle United':'Newcastle',
 'Newcastle Utd':'Newcastle',
 'Norwich City':'Norwich',
 'Sheffield United':'Sheffield United',
 'Sheffield Utd':'Sheffield United',
 'Southampton':'Southampton',
 'Stoke City':'Stoke',
 'Sunderland':'Sunderland',
 'Swansea City':'Swansea',
 'Tottenham':'Tottenham',
 'Tottenham Hotspur':'Tottenham',
 'Watford':'Watford',
 'West Brom':'West Brom',
 'West Bromwich Albion':'West Brom',
 'West Ham':'West Ham',
 'West Ham United':'West Ham',
 'Wolverhampton Wanderers':'Wolves',
 'Wolves':'Wolves'}

In [6]:
shot_data['Squad'] = [teams[s] for s in shot_data['Squad']]

# Load Non-shot Data

In [7]:
non_shot_data = pd.read_csv('data/non-shot-xG/non_shot_data.csv', index_col=0)
non_shot_data['Date'] = pd.to_datetime(non_shot_data['Date'], format='%d/%m/%Y')

In [8]:
non_shot_data.head()

Unnamed: 0_level_0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,HF,AF,HC,AC,HY,AY,HR,AR
GameID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,2008-08-16,Arsenal,West Brom,1,0,H,1,0,H,H Webb,11,8,7,5,0,0,0,0
2,2008-08-16,Bolton,Stoke,3,1,H,3,0,H,C Foy,13,12,4,3,1,2,0,0
3,2008-08-16,Everton,Blackburn,2,3,A,1,1,D,A Marriner,11,9,3,5,2,2,0,0
4,2008-08-16,Hull,Fulham,2,1,H,1,1,D,P Walton,10,9,5,6,3,0,0,0
5,2008-08-16,Middlesbrough,Tottenham,2,1,H,0,0,D,M Atkinson,11,12,7,9,1,2,0,0


# Match Data

In [9]:
non_shot_data['Date'] = [d.date() for d in non_shot_data['Date']]

In [10]:
arr = np.asarray([d for d in non_shot_data['Date']]) == pd.to_datetime('2018-01-12').date()

In [11]:
non_shot_data_groups = non_shot_data.groupby('Date').groups

In [12]:
shot_data['MatchID'] = np.NaN

## Add MatchID

In [13]:
errors = []
try:
    for i, s in shot_data.iterrows():
        date = s['Timestamp'].date()
        if date in non_shot_data_groups:
            indices = non_shot_data_groups[date]
            group = non_shot_data.iloc[indices-1]
            home = group['HomeTeam'] == s['Squad']
            if sum(home) > 0:
                shot_data.loc[i, 'MatchID'] = group.loc[home].index[0]
                continue
            away = group['AwayTeam'] == s['Squad']
            if sum(away) > 0:
                shot_data.loc[i, 'MatchID'] = group.loc[away].index[0]
                continue

        else:
            errors.append(i)
except:
    print(i)

20974


In [14]:
path = os.path.join(os.getcwd(), "output/shot_data.csv")
shot_data.to_csv(path, index=False)

In [15]:
shot_data.iloc[errors]

Unnamed: 0,Timestamp,Score,Player Advantage,Minute,Player,Squad,Against,Outcome,Distance,Body Part,Notes,SCA 1 Player,SCA 1 Event,SCA 2 Player,SCA 2 Event,Threat,MatchID


## Integrating Non-shot with ELO classifier

In [36]:
FTHG_path = os.path.join(os.getcwd(), "output/non_shot_FTHG_predictions.csv")
FTHG_predictions = pd.read_csv(FTHG_path)

FTAG_path = os.path.join(os.getcwd(), "output/non_shot_FTAG_predictions.csv")
FTAG_predictions = pd.read_csv(FTAG_path)

In [37]:
FTHG_predictions.head()

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTR,HTHG,HTR,Referee,HF,AF,HC,HY,AY,HR,AR,Predicted FTHG,Deviation
0,1,37,1,1.0,1,1.0,11,11,8,7,0,0,0,0,1.76,0.76
1,7,32,3,1.0,3,1.0,5,13,12,4,1,2,0,0,3.29,0.29
2,14,5,2,0.0,1,0.5,2,11,9,3,2,2,0,0,1.63,0.37
3,18,15,2,1.0,1,0.5,31,10,9,5,3,0,0,0,2.37,0.37
4,24,35,2,1.0,0,0.5,18,11,12,7,1,2,0,0,1.7,0.3


In [38]:
FTAG_predictions.head()

Unnamed: 0,HomeTeam,AwayTeam,FTAG,FTR,HTAG,HTR,Referee,HF,AF,AC,HY,AY,HR,AR,Predicted FTAG,Deviation
0,1,37,0,1.0,0,1.0,11,11,8,5,0,0,0,0,0.11,0.11
1,7,32,1,1.0,0,1.0,5,13,12,3,1,2,0,0,0.87,0.13
2,14,5,3,0.0,1,0.5,2,11,9,5,2,2,0,0,2.8,0.2
3,18,15,1,1.0,1,0.5,31,10,9,6,3,0,0,0,1.16,0.16
4,24,35,1,1.0,0,0.5,18,11,12,9,1,2,0,0,0.71,0.29


In [47]:
non_shot_predictions = FTHG_predictions.copy()
non_shot_predictions['Predicted FTAG'] = FTAG_predictions['Predicted FTAG']

In [48]:
non_shot_predictions

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTR,HTHG,HTR,Referee,HF,AF,HC,HY,AY,HR,AR,Predicted FTHG,Deviation,Predicted FTAG
0,1,37,1,1.0,1,1.0,11,11,8,7,0,0,0,0,1.76,0.76,0.11
1,7,32,3,1.0,3,1.0,5,13,12,4,1,2,0,0,3.29,0.29,0.87
2,14,5,2,0.0,1,0.5,2,11,9,3,2,2,0,0,1.63,0.37,2.80
3,18,15,2,1.0,1,0.5,31,10,9,5,3,0,0,0,2.37,0.37,1.16
4,24,35,2,1.0,0,0.5,18,11,12,7,1,2,0,0,1.70,0.30,0.71
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4719,9,40,3,0.5,1,0.0,1,13,8,5,2,1,0,0,2.71,0.29,3.21
4720,37,1,0,0.0,0,0.0,18,7,4,3,1,2,0,0,0.15,0.15,3.76
4721,25,20,1,0.0,0,0.5,33,10,11,3,0,2,0,0,0.70,0.30,1.56
4722,12,22,1,0.0,0,0.0,3,11,10,5,3,1,0,0,0.85,0.15,3.18


In [49]:
columns = ['HomeTeam', 'AwayTeam', 'FTR', 'Predicted FTHG', 'Predicted FTAG']
non_shot_predictions = non_shot_predictions[columns]

In [50]:
non_shot_predictions

Unnamed: 0,HomeTeam,AwayTeam,FTR,Predicted FTHG,Predicted FTAG
0,1,37,1.0,1.76,0.11
1,7,32,1.0,3.29,0.87
2,14,5,0.0,1.63,2.80
3,18,15,1.0,2.37,1.16
4,24,35,1.0,1.70,0.71
...,...,...,...,...,...
4719,9,40,0.5,2.71,3.21
4720,37,1,0.0,0.15,3.76
4721,25,20,0.0,0.70,1.56
4722,12,22,0.0,0.85,3.18


In [51]:
path = os.path.join(os.getcwd(), "output/non_shot_predictions.csv")
non_shot_predictions.to_csv(path, index=False)