In [1]:
import os
from datetime import date
import pandas as pd
import numpy as np
from unidecode import unidecode
from collections import Counter
from random import choices

import sys
sys.path.append('/Users/andrew.peters/documents/fpl/')
from odds_model.utils import betting_odds_scraper as sc
from odds_model.utils import name_conversions as nc

In [2]:
# Run the odds scraper. Per its log output it skips matches that were already
# scraped and reports per-match errors instead of raising.
# NOTE(review): the implementation lives in odds_model.utils.betting_odds_scraper — confirm side effects there.
sc.orchestrator()

Retrieveed 19 total matches.
14 matches have already been scraped.5 matches remaining.
Scraping attempt 1
Match 0 of 5: /football/crystal-palace-leicester
['Leicester', 'Crystal Palace']: Sat 15 Oct
While scraping /football/crystal-palace-leicester, encountered error: 'Correct Score'
Match 1 of 5: /football/fulham-bournemouth
['Fulham', 'Bournemouth']: Sat 15 Oct
While scraping /football/fulham-bournemouth, encountered error: 'Correct Score'
Match 2 of 5: /football/wolves-nottingham-forest
['Wolves', 'Nottm Forest']: Sat 15 Oct
While scraping /football/wolves-nottingham-forest, encountered error: 'Correct Score'
Match 3 of 5: /football/everton-tottenham
['Tottenham', 'Everton']: Sat 15 Oct
While scraping /football/everton-tottenham, encountered error: 'Correct Score'
Match 4 of 5: /football/man-city-arsenal
['Arsenal', 'Man City']: Wed 19 Oct
While scraping /football/man-city-arsenal, encountered error: 'Correct Score'
14 matches have already been scraped.5 matches remaining.
Scraping 

In [4]:
# NOTE(review): presumably assembles the scraped odds into the pickled frames
# loaded later in this notebook — confirm in odds_model.utils.betting_odds_scraper.
sc.final_odds_df_builder()

In [7]:
# Last day of the gameweek being simulated (YYYY-MM-DD).
# Scorelines with a match_date after this day are dropped before simulating.
last_date_of_gw = '2022-10-16'

In [8]:
# Load the FPL player data and the goal/assist odds from local pickle files.
# NOTE: only unpickle files produced by this project; unpickling untrusted
# data can execute arbitrary code.
fpl_data = pd.read_pickle('../data/current_fpl_costs.pkl')
ga_odds = pd.read_pickle('../data/goals_assists_odds.pkl')

In [9]:
# Strip accents from player names on both sides so the merge keys are more consistent.
fpl_data['name'] = fpl_data['name'].map(unidecode)

# On the odds side, also map names through the conversion dictionary so they
# line up with the FPL spelling, then expose the column under the shared key 'name'.
ga_odds['player'] = ga_odds['player'].map(unidecode).replace(nc.name_conversion_dict)
ga_odds = ga_odds.rename(columns={'player': 'name'})

In [10]:
# Name mismatches: players present in the odds data whose name has no exact
# counterpart in the FPL data (order follows first appearance in the odds data).
fpl_names = set(fpl_data.name.unique())
name_mismatches = [player for player in ga_odds.name.unique() if player not in fpl_names]
name_mismatches

['Luis Sinisterr', 'Thiago']

In [11]:
# Last space-separated token of each unmatched name, used as the search key for candidates.
last_name_mismatches = [mismatch.rsplit(' ', 1)[-1] for mismatch in name_mismatches]

In [12]:
# For every unmatched odds name, show up to five FPL players whose full name
# contains the same last token. Each candidate is printed in the form
# 'odds name': 'fpl name', (formatted like an entry for a name conversion dict).
for player, last_name in zip(name_mismatches, last_name_mismatches):
    print(f'Possible Matches for {player}')
    # regex=False: match the token literally, so characters such as '.' or '('
    # in a name are not interpreted as a regular expression.
    # na=False: rows with a missing name count as "no match" instead of
    # breaking the boolean mask.
    snippet = fpl_data[fpl_data.name.str.contains(last_name, regex=False, na=False)].head()
    for name in snippet.name.unique():
        print(f"'{player}': '{name}',")
    display(snippet)

Possible Matches for Luis Sinisterr
'Luis Sinisterr': 'Luis Sinisterra Lucumi',


Unnamed: 0,name,web_name,team,pos,now_cost,minutes
351,Luis Sinisterra Lucumi,Sinisterra,Leeds,3,65,225


Possible Matches for Thiago
'Thiago': 'Thiago Emiliano da Silva',
'Thiago': 'Thiago Alcantara do Nascimento',


Unnamed: 0,name,web_name,team,pos,now_cost,minutes
159,Thiago Emiliano da Silva,Thiago Silva,Chelsea,2,54,630
359,Thiago Alcantara do Nascimento,Thiago,Liverpool,3,52,228


In [13]:
# Inner-join the FPL player data with the odds on player name, so only players
# that have goal/assist odds remain.
odds_fields = ['name', 'match_date', 'proba_assist', 'proba_goal']
df = pd.merge(fpl_data, ga_odds[odds_fields], how='inner', on='name')

In [14]:
# Turn each player's goal/assist probability into a share of their team's
# total, so the shares within a team sum to 1.
# The aggregation is passed as the string 'sum': handing pandas the Python
# builtin `sum` is deprecated in recent pandas versions and emits a FutureWarning.
# NOTE(review): totals are grouped by team only; this assumes one fixture per
# team in the odds data — confirm if double gameweeks are possible.
proba_columns = ['proba_assist', 'proba_goal']
team_totals = df.groupby('team')[proba_columns].transform('sum')
df[['assist_share', 'goal_share']] = df[proba_columns] / team_totals

In [15]:
# Load the per-team scoreline probabilities (local pickle; only unpickle trusted files)
# and print a column/dtype summary as a quick sanity check.
scorelines = pd.read_pickle('../data/scoreline_probabilities.pkl')
scorelines.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1008 entries, 0 to 55
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   team        1008 non-null   object        
 1   gf          1008 non-null   int64         
 2   ga          1008 non-null   int64         
 3   match_date  1008 non-null   datetime64[ns]
 4   odds        1008 non-null   float64       
 5   at_home     1008 non-null   int64         
 6   proba       1008 non-null   float64       
dtypes: datetime64[ns](1), float64(2), int64(3), object(1)
memory usage: 63.0+ KB


In [16]:
# Team name is the key that links player odds to scorelines, so list any team
# that appears in only one of the two frames (first line: players without
# scorelines, second line: scorelines without players).
player_teams = df.team.unique()
scoreline_teams = scorelines.team.unique()
print([team for team in player_teams if team not in set(scoreline_teams)])
print([team for team in scoreline_teams if team not in set(player_teams)])

[]
['Brighton', 'Crystal Palace', 'Bournemouth', 'Brentford', 'Fulham', 'Leicester', 'Nottm Forest', 'Wolves', 'Everton', 'Tottenham']


In [17]:
# Keep only scorelines for fixtures on or before the last day of the gameweek.
# In the future, the cut-off date could be fetched from the FPL API instead of set by hand.
in_gameweek = scorelines['match_date'] <= last_date_of_gw
scorelines = scorelines[in_gameweek]
print(len(scorelines))

360


In [18]:
def simulate_results(team, num_simulations=10000, scorelines_df=None, players_df=None, seed=None):
    """Simulate a team's match many times and attribute goals and assists to its players.

    Each simulation draws one scoreline for ``team`` (weighted by ``proba``) and
    then, for every goal the team scores, draws one scorer (weighted by
    ``goal_share``) and one assister (weighted by ``assist_share``).

    Parameters
    ----------
    team : str
        Team name; used to filter both frames on their ``team`` column.
    num_simulations : int, default 10000
        Number of simulated matches.
    scorelines_df : pandas.DataFrame, optional
        Frame with columns ``team``, ``gf``, ``ga`` and ``proba``. Defaults to
        the notebook-level ``scorelines`` frame.
    players_df : pandas.DataFrame, optional
        Frame with columns ``team``, ``name``, ``assist_share`` and
        ``goal_share``. Defaults to the notebook-level ``df`` frame.
    seed : int, optional
        Seed for a private random generator, for reproducible runs. When
        omitted, the module-level ``random.choices`` is used as before.

    Returns
    -------
    pandas.DataFrame
        One row per player per simulation with columns ``name``, ``round``
        (simulation index), ``goals``, ``assists`` and ``goals_against``.
        An empty frame with the same columns is returned when the team has no
        scorelines or no players, since there is nothing to simulate.
    """
    from random import Random  # local import: only needed for seeded runs

    # Fall back to the notebook globals so existing calls keep working.
    if scorelines_df is None:
        scorelines_df = scorelines
    if players_df is None:
        players_df = df

    result_columns = ['name', 'round', 'goals', 'assists', 'goals_against']

    # Filter each frame once rather than once per extracted column.
    team_scorelines = scorelines_df[scorelines_df['team'] == team]
    team_players = players_df[players_df['team'] == team]

    possible_scores = [(int(gf), int(ga))
                       for gf, ga in zip(team_scorelines['gf'], team_scorelines['ga'])]
    scoreline_weights = list(team_scorelines['proba'])

    names = list(team_players['name'])
    assist_weights = list(team_players['assist_share'])
    goal_weights = list(team_players['goal_share'])

    # Drawing from an empty population raises IndexError, so return an empty,
    # correctly typed frame when the team has no scorelines or no players.
    if not possible_scores or not names:
        empty = pd.DataFrame(columns=result_columns)
        return empty.astype({col: 'int64' for col in result_columns[1:]})

    draw = choices if seed is None else Random(seed).choices

    sim_goal_list = []
    sim_assist_list = []
    sim_goals_against_list = []
    round_list = []
    players_per_round = len(names)

    for sim in range(num_simulations):
        goals_for, goals_against = draw(possible_scores, scoreline_weights)[0]

        # Tally scorers/assisters once per match instead of rebuilding a
        # Counter for every player. Skip the draw entirely for goalless matches.
        if goals_for:
            scorers = Counter(draw(names, goal_weights, k=goals_for))
            assisters = Counter(draw(names, assist_weights, k=goals_for))
        else:
            scorers = assisters = Counter()

        sim_goal_list.extend(scorers[player] for player in names)
        sim_assist_list.extend(assisters[player] for player in names)
        sim_goals_against_list.extend([goals_against] * players_per_round)
        round_list.extend([sim] * players_per_round)

    # Make DataFrame of results: the player list repeats once per simulation.
    return pd.DataFrame(
        {
            'name': names * num_simulations,
            'round': round_list,
            'goals': sim_goal_list,
            'assists': sim_assist_list,
            'goals_against': sim_goals_against_list,
        },
        columns=result_columns,
    )
    
    

In [19]:
# Simulate every team that has scorelines in this gameweek and stack the results.
# The per-team frames are collected first and concatenated once: calling
# pd.concat inside the loop re-copies all earlier rows on every iteration.
team_results = [simulate_results(team) for team in scorelines.team.unique()]
# pd.concat raises on an empty list, so fall back to an empty frame as before.
all_simulations = pd.concat(team_results) if team_results else pd.DataFrame()

In [20]:
# Snapshot the raw simulations into today's dated folder under ../data/historical.
date_path = date.today().strftime('%Y_%m_%d')
path = f'../data/historical/{date_path}'

# to_pickle does not create missing directories, so make sure today's folder exists first.
os.makedirs(path, exist_ok=True)

# save to historical folder as well
all_simulations.to_pickle(f'{path}/all_simulations.pkl')

In [21]:
# Average every player's simulated assists, goals and goals against over all
# simulation rounds, ordered by expected goals (highest first).
mean_column_names = {
    'assists': 'mean_assists',
    'goals': 'mean_goals',
    'goals_against': 'mean_goals_against',
}
per_player_means = all_simulations.groupby('name')[list(mean_column_names)].mean().reset_index()
player_averages = (
    per_player_means
    .rename(columns=mean_column_names)
    .sort_values('mean_goals', ascending=False)
)

In [22]:
# Attach the simulated per-player averages to each player's row (inner join on name).
df = pd.merge(df, player_averages, how='inner', on='name')

In [23]:
# Preview the first rows to check that the simulated averages were merged in.
df.head()

Unnamed: 0,name,web_name,team,pos,now_cost,minutes,match_date,proba_assist,proba_goal,assist_share,goal_share,mean_assists,mean_goals,mean_goals_against
0,Cedric Alves Soares,Cédric,Arsenal,2,42,0,2022-10-16,0.181818,0.068441,0.056396,0.01953,0.1195,0.0406,1.1529
1,Granit Xhaka,Xhaka,Arsenal,3,51,807,2022-10-16,0.195122,0.184049,0.060523,0.052521,0.1255,0.1078,1.1529
2,Rob Holding,Holding,Arsenal,2,42,3,2022-10-16,0.05,0.076923,0.015509,0.021951,0.031,0.0444,1.1529
3,Thomas Partey,Partey,Arsenal,3,48,509,2022-10-16,0.12766,0.114613,0.039597,0.032706,0.0802,0.0671,1.1529
4,Martin Odegaard,Ødegaard,Arsenal,3,64,656,2022-10-16,0.334728,0.27907,0.103826,0.079636,0.2114,0.1679,1.1529


In [24]:
# Goalkeepers are missing from the dataframe above because the bookmakers offer
# no goal/assist odds for them. Add them back in (pos == 1 selects goalkeepers),
# giving each one their team's match date and simulated mean goals against.
is_goalkeeper = fpl_data['pos'] == 1
team_concessions = df[['team', 'match_date', 'mean_goals_against']].drop_duplicates()
goalkeepers = fpl_data.loc[is_goalkeeper].merge(team_concessions, on='team')
df = pd.concat([df, goalkeepers])

In [25]:
# Display the combined frame; the goalkeeper rows appended at the end have no
# odds or share values, only mean_goals_against.
df

Unnamed: 0,name,web_name,team,pos,now_cost,minutes,match_date,proba_assist,proba_goal,assist_share,goal_share,mean_assists,mean_goals,mean_goals_against
0,Cedric Alves Soares,Cédric,Arsenal,2,42,0,2022-10-16,0.181818,0.068441,0.056396,0.019530,0.1195,0.0406,1.1529
1,Granit Xhaka,Xhaka,Arsenal,3,51,807,2022-10-16,0.195122,0.184049,0.060523,0.052521,0.1255,0.1078,1.1529
2,Rob Holding,Holding,Arsenal,2,42,3,2022-10-16,0.050000,0.076923,0.015509,0.021951,0.0310,0.0444,1.1529
3,Thomas Partey,Partey,Arsenal,3,48,509,2022-10-16,0.127660,0.114613,0.039597,0.032706,0.0802,0.0671,1.1529
4,Martin Odegaard,Ødegaard,Arsenal,3,64,656,2022-10-16,0.334728,0.279070,0.103826,0.079636,0.2114,0.1679,1.1529
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27,Gavin Bazunu,Bazunu,Southampton,1,45,810,2022-10-16,,,,,,,1.5189
28,Willy Caballero,Caballero,Southampton,1,40,0,2022-10-16,,,,,,,1.5189
29,Lukasz Fabianski,Fabianski,West Ham,1,50,748,2022-10-16,,,,,,,1.3889
30,Alphonse Areola,Areola,West Ham,1,44,61,2022-10-16,,,,,,,1.3889


In [26]:
# Write the latest player outcomes to the shared data folder.
df.to_pickle('../data/simulated_player_outcomes.pkl')

date_path = date.today().strftime('%Y_%m_%d')
path = f'../data/historical/{date_path}'

# to_pickle does not create missing directories, so make sure today's folder exists first.
os.makedirs(path, exist_ok=True)

# save to historical folder as well
df.to_pickle(f'{path}/simulated_player_outcomes.pkl')
