In [1]:

import os
import ast
import json
import math
import time
import pickle
import random
import urllib

import numpy as np
import pandas as pd
import requests as req

from copy import copy
from tqdm import tqdm


PLAYER_PATH = 'D://_G_Street/player_model'
DROPBOX_PATH = 'C:\\Users\Blake\G Street Dropbox\Blake Atkinson\shared_soccer_data\data'

def save_dict(di_, filename_):
    with open(filename_, 'wb') as f:
        pickle.dump(di_, f)

def load_dict(filename_):
    with open(filename_, 'rb') as f:
        ret_di = pickle.load(f)
    return ret_di


teams = load_dict(os.path.join(DROPBOX_PATH, 'IDs/teams'))
competitions = load_dict(os.path.join(DROPBOX_PATH, 'IDs/competitions'))


In [2]:
os.listdir(os.path.join(DROPBOX_PATH, 'IDs'))

['comp2pinny',
 'competitions',
 'footy',
 'gg_mkt_fty_col_ids',
 'gg_mkt_sb_col_ids',
 'gg_non_mkt_fty_col_ids',
 'gg_non_mkt_sb_col_ids',
 'id2comp.pkl',
 'managers',
 'odds_api',
 'player_bios.csv',
 'player_map',
 'referees',
 'SBR',
 'seasons',
 'stadiums',
 'teams']

In [3]:

### need separate model
women  = [131,135,1276,37,49,52,182,82,72,53,120]
youth = [1284,1331,1333,113,1336]

separate = women+youth

playoffs = {
    130:60,
    274:46,
    292:7,
    280:8,
    295:13,
    119:9,
    121:10,
    226:75,
    1256:249,
    125:80,
    1269:104,
    218:106,
    1259:109,
    1426:108,
    219:107,
    1249:97,
    231:88

}
intl_club = [16,35,90,101,273,353,66,165,1425,102]
intl = [254,255,256,257,259,43,55,1226,1249,1278,92,1346]

def load_schedules():
    
    normal = pd.read_csv(os.path.join(DROPBOX_PATH, 'schedules/processed_schedule.csv'))
    stf = pd.read_csv(os.path.join(DROPBOX_PATH, 'schedules/stf_schedule.csv'))
    
    normal = normal.loc[~normal['competition_id'].isin(separate)].reset_index(drop=True)
    stf = stf.loc[~stf['competition_id'].isin(separate)].reset_index(drop=True)
    normal['is_playoff'] = np.where(normal['competition_id'].isin(list(playoffs.keys())),1,0)
    stf['is_playoff'] = np.where(stf['competition_id'].isin(list(playoffs.keys())),1,0)
    normal['is_intl'] = np.where(normal['competition_id'].isin(intl),1,0)
    stf['is_intl'] = np.where(stf['competition_id'].isin(intl),1,0)
    normal['is_intl_c'] = np.where(normal['competition_id'].isin(intl_club),1,0)
    stf['is_intl_c'] = np.where(stf['competition_id'].isin(intl_club),1,0)
    normal['competition_id'] = normal['competition_id'].apply(lambda x: playoffs[x] if x in playoffs else x)
    stf['competition_id'] = stf['competition_id'].apply(lambda x: playoffs[x] if x in playoffs else x)
    
    normal['home_team_name'] = normal['home_team_id'].apply(lambda x: teams.get(x)['name'])
    normal['away_team_name'] = normal['away_team_id'].apply(lambda x: teams.get(x)['name'])

    stf['team_name'] = stf['team_id'].apply(lambda x: teams.get(x)['name'])
    stf['opp_team_name'] = stf['opp_team_id'].apply(lambda x: teams.get(x)['name'])
    
    normal['datetime_UTC'] = pd.to_datetime(normal['datetime_UTC'].copy())
    stf['datetime_UTC'] = pd.to_datetime(stf['datetime_UTC'].copy())

    normal['match_date_UTC'] = normal['datetime_UTC'].copy().dt.date
    stf['match_date_UTC'] = stf['datetime_UTC'].copy().dt.date
    
    normal['last_updated'] = pd.to_datetime(normal['last_updated'].copy())
    stf['last_updated'] = pd.to_datetime(stf['last_updated'].copy())
    
    normal = normal.loc[~normal['match_status'].isin(['deleted','collecting','cancelled','postponed'])].reset_index(drop=True)
    stf = stf.loc[~stf['match_status'].isin(['deleted','collecting','cancelled','postponed'])].reset_index(drop=True)
    
    return normal, stf 

schedule, stf_schedule = load_schedules()


def EST_to_UTC(time):
    return time + pd.Timedelta(hours=5)

def statsbomb_to_UTC(time):
    return time - pd.Timedelta(hours=1)

def add_STF_info(cklst):
    
    cklst['STF'] = False
    cklst['stf_path'] = cklst.apply(lambda x: os.path.join(DROPBOX_PATH, f'Statsbomb/STF/{x.competition_id}/{x.season_id}/{x.match_id}-{x.team_id}.csv'), axis=1)
    # see if its a new game that doesn't exist yet
    cklst['STF'] = cklst.apply(lambda x: os.path.exists(x.stf_path), axis=1)
    cklst['as_path'] = cklst.apply(lambda x: os.path.join(DROPBOX_PATH, f'Statsbomb/atomic_sparse/{x.competition_id}/{x.season_id}/{x.match_id}.csv'), axis=1)
    return cklst


def add_game_clock(game):
    
    game['time'] = game['minute'].copy() + (game['second'].copy()/60)
    to_add = game.groupby(['period'])['time'].max().to_dict()
    to_add[0] = 45
    game['previous_period'] = game['period'].copy() - 1
    game['to_add'] = game['previous_period'].map(to_add)
    game['time'] = game['time'].copy() + game['to_add'].copy() - 45
    
    return game.drop(columns=['previous_period','to_add'])


def add_subs(game, lineup_df):
    
    subs = game.loc[game['type_id']==19].copy().reset_index(drop=True)
    subs = subs[['team_id','player_id','time','substitution_replacement_id','substitution_replacement_name','outcome_id','outcome_name']].copy()
    
    to_append = []
    for index,row in subs.iterrows():
        player_index = lineup_df[lineup_df['player_id']==row['player_id']].index
        if len(player_index)>0:
            player_index = player_index[0]
        else:
            continue
        lineup_df.at[player_index, 'end_time'] = row['time']
        lineup_df.at[player_index, 'sub_type'] = str(row['outcome_name']) + '_off'
        to_append.append(pd.DataFrame([[np.nan, row['substitution_replacement_id'], row['substitution_replacement_name'], 
        np.nan, np.nan, row['team_id'],  row['time'], game['time'].max(), str(row['outcome_name'])+'_on']], columns=list(lineup_df)))
        
    to_append = pd.concat(to_append, axis=0).reset_index(drop=True)
    lineup_df = pd.concat([lineup_df,to_append], axis=0).reset_index(drop=True)

    return lineup_df

def add_red_cards(game, lineup_df):
    # and 10 man injury, second yellows
    
    if 'player_off_permanent' in list(game):
        injury_loss = (game.player_off_permanent==True)
    else:
        injury_loss = (game.type_id==99999) # just creating falses
    if 'bad_behaviour_card_id' in list(game):
        bad_behave_red = (game.bad_behaviour_card_id==5)
        bad_behave_sy = (game.bad_behaviour_card_id==6)
    else:
        bad_behave_red = (game.type_id==99999) # just creating falses
        bad_behave_sy = (game.type_id==99999)
        
    if 'foul_committed_card_id' in list(game):
        foul_red = (game.foul_committed_card_id==5)
        foul_sy = (game.foul_committed_card_id==6)
    else:
        foul_red = (game.type_id==99999) # just creating falses
        foul_sy = (game.type_id==99999)
    
    types=['Injury_off','Red','Second_yellow','Red','Second_yellow']
    if len(game.loc[injury_loss|bad_behave_red|bad_behave_sy|foul_red|foul_sy]) > 0:
        for mask_index, mask in enumerate([injury_loss, bad_behave_red, bad_behave_sy, foul_red, foul_sy]):
            if len(game.loc[mask]) > 0:
#                 print(mask[mask==True])
                
                info = game.loc[mask].reset_index(drop=True)
                for index, row in info.iterrows():
                    try:
                        # can't figure out this error
                        # usually has a player id that's not in the lineup
                        player_index = lineup_df[lineup_df['player_id']==row['player_id']].index[0]
                    except:
                        continue
                    time = row['time']
                    type_ = types[mask_index]
            
                    lineup_df.at[player_index, 'end_time'] = time
                    lineup_df.at[player_index, 'sub_type'] = type_
    
    
    return lineup_df

def get_starting_lineups(game):
    
    starting_lineups = game.loc[game['type_id']==35].copy().reset_index(drop=True)
    lineup_dict = starting_lineups[['team_id','tactics_lineup']].set_index('team_id').to_dict()
    
    lineup_df = []
    for team in lineup_dict['tactics_lineup'].keys():
        team_lineup = pd.json_normalize(ast.literal_eval(lineup_dict['tactics_lineup'][team]))
        team_lineup['team_id'] = team
        team_lineup.columns = [col.replace('.','_') for col in list(team_lineup)]
        team_lineup['start_time'] = 0
        team_lineup['end_time'] = game['time'].max()
        team_lineup['sub_type'] = 'None'
        lineup_df.append(team_lineup)
        
    lineup_df = pd.concat(lineup_df, axis=0)

    return lineup_df.reset_index(drop=True)


def extract_playing_time(game, comp_id, season_id):
    
    match_id = game['match_id'].mode()[0]
    
    game = add_game_clock(game)
    starting_lineups = get_starting_lineups(game)
    
    lineup_df = add_subs(game, starting_lineups)
    lineup_df['match_id'] = match_id
    lineup_df['competition_id'] = comp_id
    lineup_df['season_id'] = season_id
    
    # move to front of columns
    cols = list(lineup_df)
    cols.insert(0, cols.pop(cols.index('team_id')))
    cols.insert(0, cols.pop(cols.index('season_id')))
    cols.insert(0, cols.pop(cols.index('competition_id')))
    cols.insert(0, cols.pop(cols.index('match_id')))
    
    lineup_df = lineup_df[cols].copy()
    lineup_df = add_red_cards(game, lineup_df)
    
    lineup_df['playing_time'] = lineup_df['end_time'].copy()-lineup_df['start_time'].copy()
    
    positions = game.copy().groupby(['player_id']).agg({
        'position_id':pd.Series.mode,
        'position_name':pd.Series.mode
    }).reset_index().rename(columns={
        'position_id':'backup_position_id',
        'position_name':'backup_position_name'
    })

    lineup_df = lineup_df.merge(positions, how='left', on=['player_id'])
    lineup_df['position_id'] = lineup_df['position_id'].fillna(lineup_df['backup_position_id'].copy())
    
    lineup_df = lineup_df.dropna(subset=['backup_position_id']) # means they didn't record an action, probably barely on the field
    lineup_df['pos_type'] = lineup_df['position_id'].apply(lambda x: type(x))
    lineup_df['position_id'] = lineup_df['position_id'].apply(lambda x: list(x)[0] if isinstance(x, np.ndarray) else x) # rarely but sometimes is a list, # no real way of determining which list element is better
    lineup_df['position_id'] = lineup_df['position_id'].astype(int)
    lineup_df['position_name'] = lineup_df['position_name'].fillna(lineup_df['backup_position_name'].copy())
    lineup_df = lineup_df.drop(columns=['backup_position_id','backup_position_name','pos_type'])
    
    team_1_id, team_2_id = list(lineup_df.team_id.unique())[0], list(lineup_df.team_id.unique())[1]
    formation = game.loc[(game['team_id']==team_1_id)&(game['type_id']==35)]['tactics_formation'].values[0]
    opp_formation = game.loc[(game['team_id']==team_2_id)&(game['type_id']==35)]['tactics_formation'].values[0]
    formation_dict = {
        team_1_id:formation,
        team_2_id:opp_formation
    }
    lineup_df['team_formation'] = lineup_df['team_id'].map(formation_dict)


    return lineup_df

## use game vecs to get list of games
def create_lineup_checklist(stf_schedule_):
    
    """ on external HD """
    gvecs = stf_schedule_[['datetime_UTC','match_id','team_id','opp_team_id','is_home','competition_id','season_id','match_date_UTC']].copy().merge(pd.read_csv(os.path.join(DROPBOX_PATH,'Statsbomb/game_vecs/game_vecs.csv'),usecols=['match_id','team_id','obv_diff']), how='left', on=['match_id','team_id'])
    gvecs['datetime_UTC'] = pd.to_datetime(gvecs['datetime_UTC'])
    gvecs = gvecs.dropna(subset=['obv_diff']) ## testing if we have good data
    gvecs = gvecs.drop(columns=['obv_diff'])
    gvecs['lineup_path'] = gvecs.apply(lambda x: os.path.join(DROPBOX_PATH, f'Statsbomb/raw/lineups/{x.competition_id}/{x.season_id}/{x.match_id}.json'), axis=1)
    gvecs['as_path'] = gvecs.apply(lambda x: os.path.join(DROPBOX_PATH, f'Statsbomb/atomic_sparse/{x.competition_id}/{x.season_id}/{x.match_id}.csv'), axis=1)
    gvecs['STF_path'] = gvecs.apply(lambda x: os.path.join(DROPBOX_PATH, f'Statsbomb/STF/{x.competition_id}/{x.season_id}/{x.match_id}-{x.team_id}.csv'), axis=1)
    
    gvecs['minutes_league_folder'] = gvecs.apply(lambda x: os.path.join(PLAYER_PATH,f'playing_time/{x.competition_id}'), axis=1)
    gvecs['minutes_season_folder'] = gvecs.apply(lambda x: os.path.join(PLAYER_PATH,f'playing_time/{x.competition_id}/{x.season_id}'), axis=1)
    gvecs['minutes_path'] = gvecs['minutes_season_folder'].copy() + '/' + gvecs['match_id'].copy().astype(str) + '.csv'
    gvecs['path_exists'] = gvecs['minutes_path'].apply(lambda x: os.path.exists(x))
    return gvecs

def create_league_season_folders(gl):
    
    folders_needed = list(gl['minutes_league_folder'].unique())
    for folder in folders_needed:
        if not os.path.exists(folder):
            os.mkdir(folder)
            
    folders_needed = list(gl['minutes_season_folder'].unique())
    for folder in folders_needed:
        if not os.path.exists(folder):
            os.mkdir(folder)
    
    return


def update_player_minutes():
    
    schedule_, stf_schedule_ = load_schedules()
    game_list = create_lineup_checklist(stf_schedule_)
    create_league_season_folders(game_list.copy())
    
    ## oops, don't need stf 
    game_list = game_list.drop_duplicates(subset=['match_id']).reset_index(drop=True)
    
    print("Updating player minutes....")

    for index, row in tqdm(game_list.iterrows(), total=len(game_list)):
        if row['path_exists'] == False:
            try:
                sparse = pd.read_csv(row['as_path'])
            except:
                print(f"Error on {row['match_id']}")
                continue
            comp_id = row['competition_id']
            season_id = row['season_id']
            minutes_path = row['minutes_path']
            lineup = extract_playing_time(sparse, comp_id, season_id)
            lineup.to_csv(minutes_path, index=False)
    
    
    return

update_player_minutes()





  normal = pd.read_csv(os.path.join(DROPBOX_PATH, 'schedules/processed_schedule.csv'))
  stf = pd.read_csv(os.path.join(DROPBOX_PATH, 'schedules/stf_schedule.csv'))
  normal = pd.read_csv(os.path.join(DROPBOX_PATH, 'schedules/processed_schedule.csv'))
  stf = pd.read_csv(os.path.join(DROPBOX_PATH, 'schedules/stf_schedule.csv'))


Updating player minutes....


 81%|██████████████████████████████████████████████████████████▊              | 50864/63074 [00:00<00:00, 65197.00it/s]

Error on 3822006
Error on 3822007
Error on 3822008
Error on 3822009


100%|█████████████████████████████████████████████████████████████████████████| 63074/63074 [00:00<00:00, 65996.62it/s]

Error on 3868691
Error on 3868379





In [4]:

schedule, stf_schedule = load_schedules()



  normal = pd.read_csv(os.path.join(DROPBOX_PATH, 'schedules/processed_schedule.csv'))
  stf = pd.read_csv(os.path.join(DROPBOX_PATH, 'schedules/stf_schedule.csv'))


In [5]:

def load_lineups():
    
    return




In [6]:

def assemble_tskill_input(update=True):
    

    
    
    return




In [None]:

player_map = {}
stats = ['score_diff','xG_diff','xxG_diff','obv_diff']
gvecs = pd.read_csv(os.path.join(DROPBOX_PATH, 'Statsbomb/game_vecs/game_vecs.csv'), usecols=['match_id','team_id','score_diff','xG_diff','xxG_diff','obv_diff'])
games = create_lineup_checklist(stf_schedule)
games = games.drop_duplicates(subset=['match_id'])
games = games.merge(gvecs[['match_id','team_id']+stats], how='left',  on=['match_id','team_id'])
games = games.sort_values(by='datetime_UTC')
tskill_input = []
results = {stat:[] for stat in stats}
datetimes = []
for index, row in tqdm(games.iterrows(), total=len(games)):
    if row['is_home']==1:
        home_team_id = row['team_id']
        away_team_id = row['opp_team_id']
    else:
        home_team_id = row['opp_team_id']
        away_team_id = row['team_id']
    try:
        minutes = pd.read_csv(row['minutes_path'])
    except:
        print(f"minutes not found for {row['minutes_path']}")
        continue
        
    pmap_update = minutes.set_index('player_id')['player_name'].to_dict()
    player_map.update(pmap_update)
        
    ## these track team specific home field
    home_boost_id = str(home_team_id)+'_hfa' 
    ## could also do away id if there are a significant amount of neutral site games

    ## trying 60 to see if garbage players stop showing up
    minutes_threshold = 60 ## I think low is better, there is some implicit skill in being subbed on

    contributors = minutes.copy().loc[minutes['playing_time']>minutes_threshold]
    home_contributors = contributors.loc[contributors['team_id']==home_team_id].copy().reset_index(drop=True)
    away_contributors = contributors.loc[contributors['team_id']==away_team_id].copy().reset_index(drop=True)
     
    ## I think a couple of games where they are cut short
    if len(home_contributors) < 1:
        continue
    if len(away_contributors) < 1:
        continue
        
    tskill_format = [list(home_contributors.player_id.values)+[home_boost_id], list(away_contributors.player_id.values)]
    tskill_input.append(tskill_format)
    datetimes.append(row['datetime_UTC'])
    for stat in stats:
        if row[stat] == 0:
            results[stat].append([0,0])
        elif row[stat] > 0:
            results[stat].append([1,0])
        else:
            results[stat].append([0,1])
            
save_dict(player_map, os.path.join(DROPBOX_PATH, 'IDs/player_map'))


 64%|████████████████████████████████████████████████▏                          | 40526/63074 [02:32<00:51, 437.20it/s]

minutes not found for D://_G_Street/player_model\playing_time/104/91/3822006.csv
minutes not found for D://_G_Street/player_model\playing_time/104/91/3822007.csv
minutes not found for D://_G_Street/player_model\playing_time/104/91/3822008.csv
minutes not found for D://_G_Street/player_model\playing_time/104/91/3822009.csv


 65%|████████████████████████████████████████████████▍                          | 40759/63074 [02:33<01:03, 351.28it/s]

In [None]:

p_draw = len(games.loc[games['score_diff']==0])/len(games)
p_draw = len(games.loc[games['obv_diff']==0])/len(games)

In [None]:

import gc
gc.collect()


In [None]:

import trueskillthroughtime as ttt

## defaults: sigma = 6.0; beta = 1.0; gamma = 0.03;
h = ttt.History(composition = tskill_input, results = results['obv_diff'], times = [d.timestamp()/(60*60*24) for d in datetimes], sigma=3, p_draw=p_draw)


In [None]:

current_rating = []
for pid, ratings in h.learning_curves().items():
    current_rating.append([pid, ratings[-1][-1].mu, ratings[-1][-1].sigma])


In [None]:

current_rating = pd.DataFrame(current_rating, columns=['player_id', 'mu','sigma'])
current_ratings = current_rating.loc[~current_rating['player_id'].astype(str).str.contains('_hfa')]


In [None]:

current_ratings['player_name'] = current_ratings['player_id'].map(player_map)


In [None]:

current_ratings.sort_values(by='mu', ascending=False).head(60)


In [None]:

games

NameError: name 'game_list' is not defined