# Data Update (Run For Each GW)

This notebook updates the training data weekly for the gameweek points prediction model. It takes the new Premier League (PL) player stats for the most recently played gameweek and appends them as new rows to the existing training dataset.

In [117]:
#Import relevant libraries and packages
import pandas as pd
import numpy as np
import os
import sys
from pathlib import Path
import json
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

#paths (relative to the notebook's working directory)
path = Path('Data')
path_21_22 = path/'2021-22'
path_22_23 = path/'2022-23'

#read in data sets
# The training file is written back by this notebook with index=False, so it must
# not be read with index_col=0: that would turn the 'player' column into the index
# and break the later column selection on the next weekly run.
training_data = pd.read_csv(path/'training_data_updated.csv',
                            dtype={'season': str,
                                   'comp': str,
                                   'squad': str})
# A copy of the file that was saved with its index carries it as an
# 'Unnamed: 0' column; discard it so both file layouts load identically.
training_data = training_data.drop(columns='Unnamed: 0', errors='ignore')
season_gws = pd.read_csv(path/'remaining_season.csv', index_col=0)
player_stats_2122 = pd.read_csv(path_21_22/'gws/merged_gw.csv')
player_stats_2223 = pd.read_csv(path_22_23/'gws/merged_gw.csv')

The original training data didn't have data for last season's gameweek 38 (last gameweek). The following code cell creates a dataframe with the 21-22 season gw 38 data, which we will add later to the original training data set.

**Note:** Run this cell only ONCE. It is commented out below and must be uncommented for that one-time run.

In [118]:
# #Last season's gameweek 38 data (RUN JUST ONCE) 
# player_stats_2122 = player_stats_2122[player_stats_2122['GW'] == 38]
# player_stats_2122['season'] = '2122'
# relevant_columns = ['name', 'position', 'GW', 'team', 'opponent_team', 'was_home', 'season', 'minutes','total_points', 'assists', 'bonus', 'bps', 'clean_sheets',
#        'creativity', 'goals_conceded', 'goals_scored', 'ict_index',
#        'influence', 'penalties_saved', 'red_cards', 'saves', 'threat',
#        'yellow_cards', 'team_a_score', 'team_h_score']
# player_stats_2122 = player_stats_2122[relevant_columns]
# player_stats_2122 = player_stats_2122.rename(columns={'name': 'player', 'GW':'gw'})

# #Change position from string to int
# def position_assignment(data):
#     if data['position'] == 'GK':
#         return 1
#     if data['position'] == 'DEF':
#         return 2
#     if data['position'] == 'MID':
#         return 3
#     if data['position'] == 'FWD':
#         return 4
    
# player_stats_2122['position'] = player_stats_2122.apply(position_assignment, axis = 1)

# #Change opponent_team from int to string
# def team_assignment(data):
#     if data['opponent_team'] == 1:
#         return 'Arsenal'
#     if data['opponent_team'] == 2:
#         return 'Aston Villa'
#     if data['opponent_team'] == 3:
#         return 'Brentford'
#     if data['opponent_team'] == 4:
#         return 'Brighton'
#     if data['opponent_team'] == 5:
#         return 'Burnley'
#     if data['opponent_team'] == 6:
#         return 'Chelsea'
#     if data['opponent_team'] == 7:
#         return 'Crystal Palace'
#     if data['opponent_team'] == 8:
#         return 'Everton'
#     if data['opponent_team'] == 9:
#         return 'Leicester'
#     if data['opponent_team'] == 10:
#         return 'Leeds'
#     if data['opponent_team'] == 11:
#         return 'Liverpool'
#     if data['opponent_team'] == 12:
#         return 'Manchester City'
#     if data['opponent_team'] == 13:
#         return 'Manchester Utd'
#     if data['opponent_team'] == 14:
#         return 'Newcastle'
#     if data['opponent_team'] == 15:
#         return 'Norwich'
#     if data['opponent_team'] == 16:
#         return 'Southampton'
#     if data['opponent_team'] == 17:
#         return 'Spurs'
#     if data['opponent_team'] == 18:
#         return 'Watford'
#     if data['opponent_team'] == 19:
#         return 'West Ham'
#     if data['opponent_team'] == 20:
#         return 'Wolves'
    
# player_stats_2122['opponent_team'] = player_stats_2122.apply(team_assignment, axis = 1)
# player_stats_2122

Unnamed: 0,player,position,gw,team,opponent_team,was_home,season,minutes,total_points,assists,...,goals_scored,ict_index,influence,penalties_saved,red_cards,saves,threat,yellow_cards,team_a_score,team_h_score
24710,Eric Bailly,2,38,Man Utd,Crystal Palace,False,2122,0,0,0,...,0,0.0,0.0,0,0,0,0.0,0,0,1
24711,Keinan Davis,4,38,Aston Villa,Manchester City,False,2122,0,0,0,...,0,0.0,0.0,0,0,0,0.0,0,2,3
24712,Ayotomiwa Dele-Bashiru,3,38,Watford,Chelsea,False,2122,0,0,0,...,0,0.0,0.0,0,0,0,0.0,0,1,2
24713,James Ward-Prowse,3,38,Southampton,Leicester,False,2122,90,7,0,...,1,0.0,0.0,0,0,0,0.0,0,1,4
24714,Bruno Miguel Borges Fernandes,3,38,Man Utd,Crystal Palace,False,2122,90,1,0,...,0,0.0,0.0,0,0,0,0.0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25442,Wilfred Ndidi,3,38,Leicester,Southampton,True,2122,0,0,0,...,0,0.0,0.0,0,0,0,0.0,0,1,4
25443,Matt Ritchie,2,38,Newcastle,Burnley,False,2122,1,1,0,...,0,0.0,0.0,0,0,0,0.0,0,2,1
25444,Nathan Redmond,3,38,Southampton,Leicester,False,2122,82,2,0,...,0,0.0,0.0,0,0,0,0.0,0,1,4
25445,Mathew Ryan,1,38,Brighton,West Ham,True,2122,0,0,0,...,0,0.0,0.0,0,0,0,0.0,0,1,3


We now update the training data with data from the most recently played gameweek.

# CHANGE GAMEWEEK HERE:

In [119]:
#REMINDER: set this to the most recently played gameweek before every weekly run
gameweek = 1

#Columns taken from the 2022-23 merged gameweek file
relevant_columns = ['name', 'minutes','total_points', 'assists', 'bonus', 'bps', 'clean_sheets',
       'creativity', 'goals_conceded', 'goals_scored', 'ict_index',
       'influence', 'penalties_saved', 'red_cards', 'saves', 'threat',
       'yellow_cards', 'team_a_score', 'team_h_score']

#Rows of the chosen gameweek only, with 'name' renamed to 'player'
is_current_gw = player_stats_2223['GW'] == gameweek
player_stats = (
    player_stats_2223
    .loc[is_current_gw, relevant_columns]
    .rename(columns={'name': 'player'})
)
player_stats

Unnamed: 0,player,minutes,total_points,assists,bonus,bps,clean_sheets,creativity,goals_conceded,goals_scored,ict_index,influence,penalties_saved,red_cards,saves,threat,yellow_cards,team_a_score,team_h_score
0,Nathan Redmond,1,1,0,0,3,0,0.0,0,0,0.0,0.0,0,0,0,0.0,0,1,4
1,Junior Stanislas,1,1,0,0,3,0,0.0,0,0,0.0,0.0,0,0,0,0.0,0,0,2
2,Armando Broja,15,1,0,0,3,0,0.3,0,0,2.5,5.2,0,0,0,19.0,0,1,0
3,Fabian Schär,90,15,0,3,43,1,14.6,0,1,10.6,66.0,0,0,0,25.0,0,0,2
4,Jonny Evans,90,1,0,0,15,0,1.3,2,0,1.5,14.0,0,0,0,0.0,0,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
568,Oliver Skipp,0,0,0,0,0,0,0.0,0,0,0.0,0.0,0,0,0,0.0,0,1,4
569,Ryan Sessegnon,65,7,0,0,19,0,3.0,1,1,9.4,38.4,0,0,0,53.0,1,1,4
570,Ashley Young,0,0,0,0,0,0,0.0,0,0,0.0,0.0,0,0,0,0.0,0,0,2
571,Jeremy Sarmiento Morante,0,0,0,0,0,0,0.0,0,0,0.0,0.0,0,0,0,0.0,0,2,1


In [120]:
#Per-player fixture info (position, team, opponent, home/away, season) for the chosen gameweek
info_columns = ['player', 'position', 'gw', 'team', 'opponent_team', 'was_home', 'season']
season_gw = season_gws.loc[season_gws['gw'] == gameweek, info_columns]
season_gw

Unnamed: 0,player,position,gw,team,opponent_team,was_home,season
0,Vicente Guaita,1,1,Crystal Palace,Arsenal,True,2223
1,James Tomkins,2,1,Crystal Palace,Arsenal,True,2223
2,James McArthur,3,1,Crystal Palace,Arsenal,True,2223
3,Christian Benteke,4,1,Crystal Palace,Arsenal,True,2223
4,Joel Ward,2,1,Crystal Palace,Arsenal,True,2223
...,...,...,...,...,...,...,...
10452,Josh Wilson-Esbrand,2,1,Manchester City,West Ham United,False,2223
10453,Liam Delap,4,1,Manchester City,West Ham United,False,2223
10454,Stefan Ortega Moreno,1,1,Manchester City,West Ham United,False,2223
10455,Kalvin Phillips,3,1,Manchester City,West Ham United,False,2223


In [121]:
#Function to find, for each key in the left df, its fuzzy matches in the right df
def fuzzy_merge(df_1, df_2, key1, key2, threshold=90, limit=2):
    """
    Annotate the left table with the fuzzy matches found in the right table's key column.

    No join is performed here: the result is the left table plus a 'matches'
    column that a later merge can use as its key.

    :param df_1: the left table to join (not modified; a copy is returned)
    :param df_2: the right table to join
    :param key1: key column of the left table
    :param key2: key column of the right table
    :param threshold: minimum score a candidate needs to be kept as a match
    :param limit: the maximum number of candidates considered per key, sorted high to low
    :return: copy of df_1 with an added 'matches' column holding the kept
             candidates joined by ', ' (an empty string when none reach the threshold)
    """
    choices = df_2[key2].tolist()

    def _matches_for(value):
        # process.extract yields (candidate, score) pairs, best first
        candidates = process.extract(value, choices, limit=limit)
        return ', '.join(c[0] for c in candidates if c[1] >= threshold)

    # Work on a copy so the caller's frame (often a filtered slice of a larger
    # frame) is not mutated and no SettingWithCopyWarning is triggered.
    result = df_1.copy()
    result['matches'] = result[key1].apply(_matches_for)

    return result

# limit=1 keeps only the single best candidate per player. With more than one
# candidate above the threshold, 'matches' would hold several names joined by
# ', ', which can never equal a player name in the exact-key merge that follows,
# so that player would be silently dropped.
season_gw = fuzzy_merge(season_gw, player_stats, 'player', 'player', threshold=91, limit=1)
season_gw

Unnamed: 0,player,position,gw,team,opponent_team,was_home,season,matches
0,Vicente Guaita,1,1,Crystal Palace,Arsenal,True,2223,Vicente Guaita
1,James Tomkins,2,1,Crystal Palace,Arsenal,True,2223,James Tomkins
2,James McArthur,3,1,Crystal Palace,Arsenal,True,2223,James McArthur
3,Christian Benteke,4,1,Crystal Palace,Arsenal,True,2223,Christian Benteke
4,Joel Ward,2,1,Crystal Palace,Arsenal,True,2223,Joel Ward
...,...,...,...,...,...,...,...,...
10452,Josh Wilson-Esbrand,2,1,Manchester City,West Ham United,False,2223,Josh Wilson-Esbrand
10453,Liam Delap,4,1,Manchester City,West Ham United,False,2223,Liam Delap
10454,Stefan Ortega Moreno,1,1,Manchester City,West Ham United,False,2223,Stefan Ortega Moreno
10455,Kalvin Phillips,3,1,Manchester City,West Ham United,False,2223,Kalvin Phillips


In [122]:
#Join each player's gameweek info to their gameweek stats through the fuzzy-matched name
merged_columns = ['player', 'position', 'gw', 'team', 'opponent_team', 'was_home',
       'season', 'minutes', 'total_points', 'assists', 'bonus', 'bps',
       'clean_sheets', 'creativity', 'goals_conceded', 'goals_scored',
       'ict_index', 'influence', 'penalties_saved', 'red_cards', 'saves',
       'threat', 'yellow_cards', 'team_a_score', 'team_h_score']
season_gw_stats = (
    season_gw
    .merge(player_stats, left_on='matches', right_on='player')
    # keep the player name as spelled in the stats table (player_y)
    .drop(columns=['player_x', 'matches'])
    .rename(columns={'player_y': 'player'})
    .loc[:, merged_columns]
)

# # #Add last season's gw 38 data (RUN JUST ONCE)
# gw1_gw38_stats = pd.concat([player_stats_2122, season_gw_stats])
# gw1_gw38_stats

season_gw_stats

Unnamed: 0,player,position,gw,team,opponent_team,was_home,season,minutes,total_points,assists,...,goals_scored,ict_index,influence,penalties_saved,red_cards,saves,threat,yellow_cards,team_a_score,team_h_score
24710,Eric Bailly,2,38,Man Utd,Crystal Palace,False,2122,0,0,0,...,0,0.0,0.0,0,0,0,0.0,0,0,1
24711,Keinan Davis,4,38,Aston Villa,Manchester City,False,2122,0,0,0,...,0,0.0,0.0,0,0,0,0.0,0,2,3
24712,Ayotomiwa Dele-Bashiru,3,38,Watford,Chelsea,False,2122,0,0,0,...,0,0.0,0.0,0,0,0,0.0,0,1,2
24713,James Ward-Prowse,3,38,Southampton,Leicester,False,2122,90,7,0,...,1,0.0,0.0,0,0,0,0.0,0,1,4
24714,Bruno Miguel Borges Fernandes,3,38,Man Utd,Crystal Palace,False,2122,90,1,0,...,0,0.0,0.0,0,0,0,0.0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
529,Josh Wilson-Esbrand,2,1,Manchester City,West Ham United,False,2223,0,0,0,...,0,0.0,0.0,0,0,0,0.0,0,2,0
530,Liam Delap,4,1,Manchester City,West Ham United,False,2223,0,0,0,...,0,0.0,0.0,0,0,0,0.0,0,2,0
531,Stefan Ortega Moreno,1,1,Manchester City,West Ham United,False,2223,0,0,0,...,0,0.0,0.0,0,0,0,0.0,0,2,0
532,Kalvin Phillips,3,1,Manchester City,West Ham United,False,2223,1,1,0,...,0,0.1,0.6,0,0,0,0.0,0,2,0


In [123]:
#We update the training data by concatenating the existing training data with the new
#gameweek rows and write the result back to 'training_data_updated.csv'
keep = ['player', 'position', 'gw', 'team', 'opponent_team', 'was_home',
       'season', 'minutes', 'total_points', 'assists', 'bonus', 'bps',
       'clean_sheets', 'creativity', 'goals_conceded', 'goals_scored',
       'ict_index', 'influence', 'penalties_saved', 'red_cards', 'saves',
       'threat', 'yellow_cards', 'team_a_score', 'team_h_score']

# 'season' is read as a string for the training data but without an explicit dtype
# for the gameweek info, so the new rows are cast to match. Without this, a row
# appended now never compares equal to the same row read back from the csv and
# drop_duplicates cannot remove it when a gameweek is processed twice.
new_rows = season_gw_stats[keep].astype({'season': str})

training_data = (
    pd.concat([training_data[keep], new_rows])
    .drop_duplicates()
    .reset_index(drop=True)
)
training_data.to_csv(path/'training_data_updated.csv', index=False)

In [124]:
# Display the updated training data (pandas truncates the rendered frame)
training_data

Unnamed: 0,player,position,gw,team,opponent_team,was_home,season,minutes,total_points,assists,...,goals_scored,ict_index,influence,penalties_saved,red_cards,saves,threat,yellow_cards,team_a_score,team_h_score
0,Aaron Cresswell,2,1,West Ham United,Chelsea,False,1617,0,0,0,...,0,0.0,0.0,0,0,0,0.0,0,1,2
1,Aaron Lennon,3,1,Everton,Tottenham Hotspur,True,1617,15,1,0,...,0,0.9,8.2,0,0,0,0.0,0,1,1
2,Aaron Ramsey,3,1,Arsenal,Liverpool,True,1617,60,2,0,...,0,3.0,2.2,0,0,0,23.0,0,4,3
3,Abdoulaye Doucouré,3,1,Watford,Southampton,False,1617,0,0,0,...,0,0.0,0.0,0,0,0,0.0,0,1,1
4,Abdul Rahman Baba,2,1,Chelsea,West Ham United,True,1617,0,0,0,...,0,0.0,0.0,0,0,0,0.0,0,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140777,Josh Wilson-Esbrand,2,1,Manchester City,West Ham United,False,2223,0,0,0,...,0,0.0,0.0,0,0,0,0.0,0,2,0
140778,Liam Delap,4,1,Manchester City,West Ham United,False,2223,0,0,0,...,0,0.0,0.0,0,0,0,0.0,0,2,0
140779,Stefan Ortega Moreno,1,1,Manchester City,West Ham United,False,2223,0,0,0,...,0,0.0,0.0,0,0,0,0.0,0,2,0
140780,Kalvin Phillips,3,1,Manchester City,West Ham United,False,2223,1,1,0,...,0,0.1,0.6,0,0,0,0.0,0,2,0
