# Importing Packages and Data

In [103]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import copy
import tensorflow as tf
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

In [104]:
# Importing the main data
prem_league_data = pd.read_csv('data/cleaned_merged_seasons.csv')

  prem_league_data = pd.read_csv('data/cleaned_merged_seasons.csv')


# Merging in 2018-19 and 2019-20 seasons

In [105]:
merged_2018_19 = pd.read_csv('data/2018-19/gws/merged_gw.csv', encoding='iso-8859-1')
merged_2019_20 = pd.read_csv('data/2019-20/gws/merged_gw.csv', encoding='iso-8859-1')

In [106]:
df_19 = merged_2018_19[['name', 'assists', 'bonus', 'bps', 'clean_sheets',
                        'creativity', 'element', 'fixture', 'goals_conceded', 'goals_scored', 'ict_index',
                        'influence', 'kickoff_time', 'minutes', 'opponent_team', 'own_goals', 'penalties_missed',
                        'penalties_saved', 'red_cards', 'round', 'saves', 'selected', 'team_a_score', 'team_h_score',
                        'threat', 'total_points', 'transfers_balance', 'transfers_in', 'transfers_out', 'value', 'was_home',
                        'yellow_cards', 'GW']].copy()
df_19['season_x'] = '2018-19'
df_19['team_x'] = np.nan
df_19['position'] = np.nan

df_19['name'] = df_19['name'].apply(lambda x: x[:x.rfind('_')].replace('_', ' '))

In [107]:
df_20 = merged_2019_20[['name', 'assists', 'bonus', 'bps', 'clean_sheets',
                        'creativity', 'element', 'fixture', 'goals_conceded', 'goals_scored', 'ict_index',
                        'influence', 'kickoff_time', 'minutes', 'opponent_team', 'own_goals', 'penalties_missed',
                        'penalties_saved', 'red_cards', 'round', 'saves', 'selected', 'team_a_score', 'team_h_score',
                        'threat', 'total_points', 'transfers_balance', 'transfers_in', 'transfers_out', 'value', 'was_home',
                        'yellow_cards', 'GW']].copy()
df_20['season_x'] = '2019-20'
df_20['team_x'] = np.nan
df_20['position'] = np.nan
df_20['name'] = df_20['name'].apply(lambda x: x[:x.rfind('_')].replace('_', ' '))

In [108]:
overall_df = pd.concat([prem_league_data, df_19, df_20])

# Fixing special characters in names in the 2019-20 season

In [109]:
# Mapping special characters
spec_char_dict = {
  'Ã¡':'á',
  'Ã\x81':'Á',
  'Ã©':'é',
  'Ã\xad':'í',
  'Ã³':'ó',
  'Ãº':'ú',
  'Ã¤':'ä',
  'Ã«':'ë',
  'Ã¯':'ï',
  'Ã¶':'ö',
  'Ã\x96':'Ö',
  'Ã¼':'ü',
  'Ã£':'ã',
  'Ã\x9f':'ß',
  'Ã§':'ç',
  'Ä\x87':'ć',
  'Ã\x87':'Ç',
  'Ã±':'ñ',
  'Ã¸':'ø',
  'Ã\x98':'Ø',
  'Å¡':'š'
}

In [110]:
# Pulling a list of all the names with special characters to fix
# spec_char_names = df['name'][(df['season_x'] == '2019-20') & (df['team_x'].isnull())].unique()
# The line above only works if you run it after getting the missing team info. I want to run the fix before joining on team info that way the join will work and I don't have to repeat it
# Because of this, I'm hard coding the fix

spec_char_names = ['Abdoulaye DoucourÃ©', 'Adama TraorÃ©',
       'AdriÃ¡n San Miguel del Castillo', 'Alexis SÃ¡nchez',
       'AndrÃ© Filipe Tavares Gomes', 'Antonio RÃ¼diger', 'Ayoze PÃ©rez',
       'Bernard AnÃ\xadcio Caldeira Duarte', 'Carlos SÃ¡nchez',
       'Cheikhou KouyatÃ©', 'CÃ©dric Soares', 'CÃ©sar Azpilicueta',
       'Daniel Ceballos FernÃ¡ndez', 'Davinson SÃ¡nchez', 'Davy PrÃ¶pper',
       'Djibril SidibÃ©', 'Emiliano BuendÃ\xada', 'Emiliano MartÃ\xadnez',
       'Fabian SchÃ¤r', 'FabiÃ¡n Balbuena', 'Federico FernÃ¡ndez',
       'Francisco FemenÃ\xada Far', 'FrÃ©dÃ©ric Guilbert', 'GaÃ«tan Bong',
       'Georges-KÃ©vin Nkoudou', 'HÃ©ctor BellerÃ\xadn', 'HÃ©lder Costa',
       'Ilkay GÃ¼ndogan', 'IsmaÃ¯la Sarr', 'Javier HernÃ¡ndez BalcÃ¡zar',
       'JesÃºs Vallejo LÃ¡zaro', 'Joelinton CÃ¡ssio ApolinÃ¡rio de Lira',
       'Jonas LÃ¶ssl', 'Jose Luis Mato SanmartÃ\xadn',
       'JosÃ© Diogo Dalot Teixeira', 'JosÃ© Heriberto Izquierdo Mena',
       'JosÃ© Ignacio Peleteiro Romallo',
       'JosÃ© Ã\x81ngel EsmorÃ\xads Tasende', 'JosÃ© Holebas',
       'JoÃ£o Filipe Iria Santos Moutinho', 'JoÃ£o Pedro Cavaco Cancelo',
       'JÃ¼rgen Locadia', 'Leroy SanÃ©', 'MartÃ\xadn Montoya',
       'Mesut Ã\x96zil', 'Miguel AlmirÃ³n', 'Muhamed BeÅ¡iÄ\x87',
       "N'Golo KantÃ©", 'Nathan AkÃ©', 'Nicolas PÃ©pÃ©',
       'NicolÃ¡s Otamendi', 'Onel HernÃ¡ndez', 'Pascal GroÃ\x9f',
       'Pedro RodrÃ\xadguez Ledesma', 'Pierre-Emile HÃ¸jbjerg',
       'RaÃºl JimÃ©nez', 'Romain SaÃ¯ss',
       'Rui Pedro dos Santos PatrÃ\xadcio', 'RÃºben Diogo da Silva Neves',
       'RÃºben GonÃ§alo Silva Nascimento Vinagre', 'Sadio ManÃ©',
       'Sebastian PrÃ¶dl', 'Sergio AgÃ¼ero', 'SÃ©bastien Haller',
       'Victor LindelÃ¶f', 'VÃ\xadctor Camarasa', 'Ã\x87aglar SÃ¶yÃ¼ncÃ¼',
       'Ã\x98rjan Nyland', 'JoÃ£o Pedro Junqueira de Jesus',
       'GonÃ§alo Bento Soares Cardoso', 'Bruno AndrÃ© Cavaco Jordao',
       'JosÃ© Reina', 'Pablo MarÃ\xad', 'Borja GonzÃ¡lez TomÃ¡s',
       'JoÃ£o Manuel Neves VirgÃ\xadnia', 'Adalberto PeÃ±aranda']

In [111]:
# Fixing special characters
fixed_names = []

for name in spec_char_names:
  for key, value in spec_char_dict.items():
      name = name.replace(key, value)
  fixed_names.append(name)

In [112]:
# Making a dictionary of corrected names
corrected_name_dict = {spec_char_names[i]: fixed_names[i] for i in range(len(spec_char_names))}

In [113]:
# Replacing all special character names in overall_df with the corrected names
overall_df['name'] = overall_df['name'].replace(corrected_name_dict)

# Get missing team data for 2016-17, 2017-18, 2018-19, and 2019-20 seasons

In [114]:
# Get list of teams by season
teams = pd.read_csv('data/master_team_list.csv')
teams_2016 = teams[teams.season=='2016-17']
teams_2017 = teams[teams.season=='2017-18']
teams_2018 = teams[teams.season=='2018-19']
teams_2019 = teams[teams.season=='2019-20']

# Get 2016 players and team data
players_2016 = pd.read_csv('data/2016-17/players_raw.csv')
players_2016['name'] = players_2016.first_name + ' ' + players_2016.second_name
players_2016 = players_2016[['name', 'team']]
players_2016_merged = players_2016.merge(teams_2016, how='left', on='team')

df = overall_df.merge(players_2016_merged, how='left', left_on=['season_x', 'name'], right_on=['season', 'name'])
df['team_x'] = np.where(~df['team_name'].isnull(),df['team_name'],df['team_x'])
df.drop(columns=['team', 'season', 'team_name'], inplace=True)

# Get 2017 players and team data
players_2017 = pd.read_csv('data/2017-18/players_raw.csv')
players_2017['name'] = players_2017.first_name + ' ' + players_2017.second_name
players_2017 = players_2017[['name', 'team']]
players_2017_merged = players_2017.merge(teams_2017, how='left', on='team')

df = df.merge(players_2017_merged, how='left', left_on=['season_x', 'name'], right_on=['season', 'name'])
df['team_x'] = np.where(~df['team_name'].isnull(),df['team_name'], df['team_x'])
df.drop(columns=['team', 'season', 'team_name'], inplace=True)

# Get 2018 players and team data
players_2018 = pd.read_csv('data/2018-19/players_raw.csv')
players_2018['name'] = players_2018.first_name + ' ' + players_2018.second_name
players_2018 = players_2018[['name', 'team']]
players_2018_merged = players_2018.merge(teams_2018, how='left', on='team')

df = df.merge(players_2018_merged, how='left', left_on=['season_x', 'name'], right_on=['season', 'name'])
df['team_x'] = np.where(~df['team_name'].isnull(),df['team_name'], df['team_x'])
df.drop(columns=['team', 'season', 'team_name'], inplace=True)

# Get 2019 players and team data
players_2019 = pd.read_csv('data/2019-20/players_raw.csv')
players_2019['name'] = players_2019.first_name + ' ' + players_2019.second_name
players_2019 = players_2019[['name', 'team']]
players_2019_merged = players_2019.merge(teams_2019, how='left', on='team')

df = df.merge(players_2019_merged, how='left', left_on=['season_x', 'name'], right_on=['season', 'name'])
df['team_x'] = np.where(~df['team_name'].isnull(),df['team_name'], df['team_x'])
df.drop(columns=['team', 'season', 'team_name'], inplace=True)

# Fixing missing team info for David de Gea and Caglar Söyüncü manually
mask = ((df.season_x == '2017-18') | (df.season_x == '2018-19')) & ((df.name=="David de Gea") | (df.name=="David De Gea"))
df.loc[mask, 'team_x'] = 'Man Utd'

mask = (df.season_x == '2018-19') & (df.name=="Caglar Söyüncü")
df.loc[mask, 'team_x'] = 'Leicester'

# Fill Missing Position Data

In [115]:
fill_pos = pd.read_csv('data/mpos.csv')
fill_pos.columns = ['dirty_name', 'clean_name', 'scrape_position']

In [116]:
# clean names again
df['name'] = df['name'].replace(corrected_name_dict)
import pandas as pd
import unicodedata

def normalize_string(text):
    # Strip leading/trailing whitespace
    text = text.strip()
    # Convert to lowercase
    text = text.lower()
    # Replace multiple spaces with a single space
    text = ' '.join(text.split())
    # Normalize unicode characters to the closest ASCII representation
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
    return text

# Normalizing names, easier than removing special chars, suggest we use the normalized name col
df['name_normalized'] = df['name'].apply(normalize_string)
fill_pos['name_normalized'] = fill_pos['clean_name'].apply(normalize_string)

In [117]:
# Create a dictionary to use for filling in missing names
fill_pos[['name_normalized', 'scrape_position']].to_dict()
pos_dict = pd.Series(fill_pos['scrape_position'].values, index=fill_pos['name_normalized']).to_dict()
# Katherine note - why are these here?
pos_dict.update({
    'greg cunninghamm': 'DEF',
    'muhamed besic': 'MID',
    'zeze steven sessegnon': 'DEF',
  }
)

In [118]:
# Filling nulls with a "missing position" tag
df['position'] = df['position'].fillna('no_pos')

# Filling missing positions using the dictionary created above
def fill_missing_position(row):
  if row['position'] == 'no_pos':
    row['position'] = pos_dict.get(row['name_normalized'], "no_position")
  return row

df = df.apply(fill_missing_position, axis=1)

In [119]:
# Validating that all players have position data - remove at the end
df[df.position == 'no_position']

Unnamed: 0,season_x,name,position,team_x,assists,bonus,bps,clean_sheets,creativity,element,...,threat,total_points,transfers_balance,transfers_in,transfers_out,value,was_home,yellow_cards,GW,name_normalized


# Cleaning GK Data

In [120]:
# Cleaning up GK positions
df.loc[ df['position'] == 'GKP', 'position'] = 'GK'
df[df.position == 'GKP']

Unnamed: 0,season_x,name,position,team_x,assists,bonus,bps,clean_sheets,creativity,element,...,threat,total_points,transfers_balance,transfers_in,transfers_out,value,was_home,yellow_cards,GW,name_normalized


In [121]:
df.columns

Index(['season_x', 'name', 'position', 'team_x', 'assists', 'bonus', 'bps',
       'clean_sheets', 'creativity', 'element', 'fixture', 'goals_conceded',
       'goals_scored', 'ict_index', 'influence', 'kickoff_time', 'minutes',
       'opponent_team', 'opp_team_name', 'own_goals', 'penalties_missed',
       'penalties_saved', 'red_cards', 'round', 'saves', 'selected',
       'team_a_score', 'team_h_score', 'threat', 'total_points',
       'transfers_balance', 'transfers_in', 'transfers_out', 'value',
       'was_home', 'yellow_cards', 'GW', 'name_normalized'],
      dtype='object')

# Renaming messy variables

In [122]:
df.rename(columns={"season_x": "season", "name": "player_name", "team_x": "player_team_name", "opponent_team": "opp_team_id", 'position_x':'position'}, inplace=True)

# Adding missing opponent team names

In [123]:
# Merging in team info - joins opponent team name (team_name) based on opp_team_id, which is populated in the seasons missing opponent team name
df = df.merge(teams, how='left', left_on=['season','opp_team_id'], right_on=['season','team'])

# Dropping unnecessary columns
df.drop(columns=['team','opp_team_name'], inplace=True)

# Renaming opponent team name column
df.rename(columns={"team_name": "opp_team_name"}, inplace=True)

# Adding opponent difficulty column

In [124]:
# Importing difficulty data
difficulty_data = pd.read_csv('data/team_difficulty_ind.csv')

In [125]:
# Joining difficulty data to ours
df = df.merge(difficulty_data, how='left', left_on='opp_team_name', right_on='team_name').drop(columns = ['team_name'])

# Renaming the opponent difficulty column
df.rename(columns={"FDI": "opp_diff_ind"}, inplace=True)

# Dropping columns we don't plan to use

In [126]:
df.drop(columns=['element', 'fixture', 'transfers_balance','transfers_in' ,'transfers_out'], inplace=True)

# Transforming the "away and home team score" columns into a more usable form (player team and opponent team score)

In [127]:
# Need two new columns: player_team_score and opp_team_score. Populated from team_a_score and team_h_score based on was_home flag
df['player_team_score'] = np.where(df['was_home'] == True, df['team_h_score'], df['team_a_score'])
df['opp_team_score'] = np.where(df['was_home'] == True, df['team_a_score'], df['team_h_score'])

# Drop team_a_score and team_h_score once we have these two columns
df.drop(columns=['team_a_score','team_h_score'], inplace=True)

# Clean Data from Covid Affected Season

In [128]:
# Adjust GW 2019-20 for COVID affected Gameweeks (GW 30+) to convert to 38 GW season

# Update the GW values for the 2019-20 season
df.loc[(df['season'] == '2019-20') & (df['GW'].isin(range(39, 48))), 'GW'] = df['GW'] - 9

# Dropping Old Seasons and Creating Sequence Column

In [129]:
# Katherine note - I think we can delete this section

In [130]:
df = df.query("season != '2017-18'")
df = df.query("season != '2016-17'")

In [131]:
# Validating that old seasons are gone - delete at the end
df.season.value_counts()

season
2022-23    26505
2021-22    25447
2020-21    24365
2019-20    22560
2018-19    21866
Name: count, dtype: int64

In [132]:
# Need to add comments to this section

season_padding = {
    '2018-19': 0,
    '2019-20': 38,
    '2020-21': 76,
    '2021-22': 114,
    '2022-23': 152,
    '2023-24': 190,
}

def sequencing(row):
  season = row['season']
  gw = row['GW']
  padding = season_padding.get(season, 0)
  row['sequence'] = padding + gw
  return row

df['sequence'] = 0
df = df.apply(sequencing, axis=1)
df.head(10)

Unnamed: 0,season,player_name,position,player_team_name,assists,bonus,bps,clean_sheets,creativity,goals_conceded,...,value,was_home,yellow_cards,GW,name_normalized,opp_team_name,opp_diff_ind,player_team_score,opp_team_score,sequence
19852,2020-21,Aaron Connolly,FWD,Brighton,0,0,-3,0,0.3,2,...,55,True,0,1,aaron connolly,Chelsea,3,1.0,3.0,77
19853,2020-21,Aaron Cresswell,DEF,West Ham,0,0,11,0,11.2,2,...,50,True,0,1,aaron cresswell,Newcastle,2,0.0,2.0,77
19854,2020-21,Aaron Mooy,MID,Brighton,0,0,0,0,0.0,0,...,50,True,0,1,aaron mooy,Chelsea,3,1.0,3.0,77
19855,2020-21,Aaron Ramsdale,GK,Sheffield Utd,0,0,12,0,0.0,2,...,50,True,0,1,aaron ramsdale,Wolves,2,0.0,2.0,77
19856,2020-21,Abdoulaye Doucouré,MID,Everton,0,0,20,1,44.6,0,...,55,False,0,1,abdoulaye doucoure,Spurs,3,1.0,0.0,77
19857,2020-21,Aboubakar Kamara,MID,Fulham,0,0,-2,0,1.8,3,...,50,True,0,1,aboubakar kamara,Arsenal,4,0.0,3.0,77
19858,2020-21,Adama Traoré,MID,Wolves,0,0,6,1,1.8,0,...,65,False,0,1,adama traore,Sheffield Utd,1,2.0,0.0,77
19859,2020-21,Adam Forshaw,MID,Leeds,0,0,0,0,0.0,0,...,50,False,0,1,adam forshaw,Liverpool,4,3.0,4.0,77
19860,2020-21,Adam Lallana,MID,Brighton,0,0,6,0,27.2,1,...,65,True,0,1,adam lallana,Chelsea,3,1.0,3.0,77
19861,2020-21,Adam Webster,DEF,Brighton,0,0,14,0,11.8,3,...,45,True,0,1,adam webster,Chelsea,3,1.0,3.0,77


In [133]:
# Random testing of season/gw sequencing - remove at the end
df.loc[(df.season == '2022-23') & (df.GW == 2)].head(3)

Unnamed: 0,season,player_name,position,player_team_name,assists,bonus,bps,clean_sheets,creativity,goals_conceded,...,value,was_home,yellow_cards,GW,name_normalized,opp_team_name,opp_diff_ind,player_team_score,opp_team_score,sequence
70237,2022-23,Nathan Redmond,MID,Southampton,0,0,0,0,0.0,0,...,55,True,0,2,nathan redmond,Leeds,1,2.0,2.0,154
70238,2022-23,Junior Stanislas,MID,Bournemouth,0,0,2,0,0.3,1,...,50,False,0,2,junior stanislas,Man City,4,0.0,4.0,154
70239,2022-23,Armando Broja,FWD,Chelsea,0,0,3,0,0.0,1,...,55,True,0,2,armando broja,Spurs,3,2.0,2.0,154


In [134]:
df.loc[(df.season == '2018-19') & (df.GW == 30)].tail(3) # Remove at the end

Unnamed: 0,season,player_name,position,player_team_name,assists,bonus,bps,clean_sheets,creativity,goals_conceded,...,value,was_home,yellow_cards,GW,name_normalized,opp_team_name,opp_diff_ind,player_team_score,opp_team_score,sequence
113049,2018-19,Zeze Steven Sessegnon,DEF,Fulham,0,0,0,0,0.0,0,...,40,False,0,30,zeze steven sessegnon,Leicester,2,1.0,3.0,30
113050,2018-19,Álvaro Morata,FWD,Chelsea,0,0,0,0,0.0,0,...,84,True,0,30,alvaro morata,Wolves,2,1.0,1.0,30
113051,2018-19,Çaglar Söyüncü,DEF,Leicester,0,0,0,0,0.0,0,...,49,True,0,30,caglar soyuncu,Fulham,1,3.0,1.0,30


# Making lagged average columns

In [135]:
# Function to make a DF with lagged features; can input features to lag and number of weeks to lag them
def create_lagged_features(df, lag_columns, lag_weeks=3):
    # Function to calculate lagged features for a given player and season
    def calculate_lags(player_df):
        player_df = player_df.sort_values('GW')
        for col in lag_columns:
            player_df[f'{col}_lag_{lag_weeks}'] = player_df[col].rolling(window=lag_weeks, min_periods=1).mean().shift()
        return player_df

    lagged_df = df.groupby(['season', 'player_name']).apply(calculate_lags).reset_index(drop=True)

    return lagged_df

In [136]:
# Columns that we want to lag
lag_columns = ['assists', 'bonus', 'bps', 'clean_sheets', 'creativity',
               'goals_conceded','goals_scored', 'ict_index', 'influence',
               'minutes', 'own_goals', 'penalties_missed', 'penalties_saved',
               'red_cards', 'saves', 'selected','player_team_score', 'opp_team_score',
               'threat', 'total_points', 'value', 'yellow_cards']

# Excluding was_home and opp_diff_ind as these are categorical

In [137]:
# Creating one week lag - will take ~1 minute
lagged_df = create_lagged_features(df, lag_columns, lag_weeks = 1)
lagged_df = lagged_df.fillna(0)

# Replacing current DF with the lagged DF since it includes all of our columns and more
df = lagged_df

In [138]:
# Creating three week lag - will take ~1 minute
lagged_df_three_wk = create_lagged_features(df, lag_columns, lag_weeks = 3)
lagged_df_three_wk = lagged_df_three_wk.fillna(0)

# Replacing current DF with the lagged DF since it includes all of our columns and more
df = lagged_df_three_wk

In [139]:
# Creating five week lag - will take ~1 minute
lagged_df_five_wk = create_lagged_features(df, lag_columns, lag_weeks = 5)
lagged_df_five_wk = lagged_df_five_wk.fillna(0)

# Replacing current DF with the lagged DF since it includes all of our columns and more
df = lagged_df_five_wk

# Sequence data by season + game week

In [140]:
df.head()

Unnamed: 0,season,player_name,position,player_team_name,assists,bonus,bps,clean_sheets,creativity,goals_conceded,...,penalties_saved_lag_5,red_cards_lag_5,saves_lag_5,selected_lag_5,player_team_score_lag_5,opp_team_score_lag_5,threat_lag_5,total_points_lag_5,value_lag_5,yellow_cards_lag_5
0,2018-19,Aaron Cresswell,DEF,West Ham,0,0,0,0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2018-19,Aaron Cresswell,DEF,West Ham,0,0,0,0,0.0,0,...,0.0,0.0,0.0,103396.0,0.0,4.0,0.0,0.0,55.0,0.0
2,2018-19,Aaron Cresswell,DEF,West Ham,0,0,0,0,0.0,0,...,0.0,0.0,0.0,97364.5,0.5,3.0,0.0,0.0,55.0,0.0
3,2018-19,Aaron Cresswell,DEF,West Ham,0,0,10,0,27.6,1,...,0.0,0.0,0.0,88011.666667,0.666667,3.0,0.0,0.0,54.666667,0.0
4,2018-19,Aaron Cresswell,DEF,West Ham,0,0,0,0,0.0,0,...,0.0,0.0,0.0,80430.0,0.5,2.5,0.0,0.25,54.25,0.25


In [141]:
df = df.sort_values(['season','player_name','GW'])

# Create two test teams for analysis of models

In [179]:
df['player_team_name'].unique()

array(['West Ham', 'Burnley', 'Huddersfield', 'Arsenal', 'Crystal Palace',
       'Watford', 'Fulham', 'Liverpool', 'Bournemouth', 'Wolves',
       'Chelsea', 'Everton', 'Leicester', 'Southampton', 'Cardiff',
       'Man Utd', 'Spurs', 'Brighton', 'Newcastle', 'Man City', 'Norwich',
       'Aston Villa', 'Sheffield Utd', 'Leeds', 'West Brom', 'Brentford',
       "Nott'm Forest"], dtype=object)

In [180]:
df_man_city = df[(df['season'] == '2022-23') & (df['player_team_name'] == 'Man City')]
df_brighton = df[(df['season'] == '2022-23') & (df['player_team_name'] == 'Brighton')]

# Splitting into minutes DF and overall DF

In [181]:
# Katherine note - commenting this out for now in case we want to do this in the modeling notebooks, but it's easy to add back in

# minutes = df[df['minutes'] > 0]
# After this, minutes is a DF with only players who played and df is a DF with all players (regardless of minutes)

# Dropping columns we won't use

In [182]:
# We don't have to do this; could do it in modeling notebooks. But I (Katherine) think this'll be cleaner

# Dropping columns that aren't useful
df.drop(columns=['player_name', 'kickoff_time', 'opp_team_id', 'round', 'name_normalized', 'sequence'], inplace=True) # Sequence is coming from Hisham's code - if we drop that section, we need to remove the column from this line

In [183]:
# Dropping all unlagged continuous columns except the target
lag_columns.remove('total_points')

df.drop(columns=lag_columns, inplace=True)

# Update two test use cases 

In [184]:
df_brighton.drop(columns=['kickoff_time', 'opp_team_id', 'round', 'name_normalized', 'sequence'], inplace=True)
df_man_city.drop(columns=['kickoff_time', 'opp_team_id', 'round', 'name_normalized', 'sequence'], inplace=True)
df_brighton.drop(columns=lag_columns, inplace=True)
df_man_city.drop(columns=lag_columns, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brighton.drop(columns=['kickoff_time', 'opp_team_id', 'round', 'name_normalized', 'sequence'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_man_city.drop(columns=['kickoff_time', 'opp_team_id', 'round', 'name_normalized', 'sequence'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brighton.drop(columns=lag_columns, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentat

# Split data into train/validation/test

In [185]:
# Train
X_train_init = df[df['season'].isin(['2018-19','2019-20','2020-21'])]
Y_train = np.array(X_train_init.pop('total_points'))

# Validation
X_val_init = df[df['season'].isin(['2021-22'])]
Y_val = np.array(X_val_init.pop('total_points'))

# Test
X_test_init = df[df['season'].isin(['2022-23'])]
Y_test = np.array(X_test_init.pop('total_points'))

In [186]:
# QCing train/test/val shapes - delete at the end
print(X_train_init.shape)
print(Y_train.shape)
print(X_val_init.shape)
print(Y_val.shape)
print(X_test_init.shape)
print(Y_test.shape)

(68791, 73)
(68791,)
(25447, 73)
(25447,)
(26505, 73)
(26505,)


# Standardize lagged features

In [187]:
# Pulling out only the features to be standardized
categorical_vars = ['season','player_team_name','opp_team_name','opp_diff_ind','position','GW','was_home']

X_train_contin = X_train_init.loc[:, ~X_train_init.columns.isin(categorical_vars)]
X_val_contin = X_val_init.loc[:, ~X_val_init.columns.isin(categorical_vars)]
X_test_contin = X_test_init.loc[:, ~X_test_init.columns.isin(categorical_vars)]

In [188]:
X_test_contin.columns

Index(['assists_lag_1', 'bonus_lag_1', 'bps_lag_1', 'clean_sheets_lag_1',
       'creativity_lag_1', 'goals_conceded_lag_1', 'goals_scored_lag_1',
       'ict_index_lag_1', 'influence_lag_1', 'minutes_lag_1',
       'own_goals_lag_1', 'penalties_missed_lag_1', 'penalties_saved_lag_1',
       'red_cards_lag_1', 'saves_lag_1', 'selected_lag_1',
       'player_team_score_lag_1', 'opp_team_score_lag_1', 'threat_lag_1',
       'total_points_lag_1', 'value_lag_1', 'yellow_cards_lag_1',
       'assists_lag_3', 'bonus_lag_3', 'bps_lag_3', 'clean_sheets_lag_3',
       'creativity_lag_3', 'goals_conceded_lag_3', 'goals_scored_lag_3',
       'ict_index_lag_3', 'influence_lag_3', 'minutes_lag_3',
       'own_goals_lag_3', 'penalties_missed_lag_3', 'penalties_saved_lag_3',
       'red_cards_lag_3', 'saves_lag_3', 'selected_lag_3',
       'player_team_score_lag_3', 'opp_team_score_lag_3', 'threat_lag_3',
       'total_points_lag_3', 'value_lag_3', 'yellow_cards_lag_3',
       'assists_lag_5', 'bonus

In [192]:
categorical_vars_test = ['player_name','season','player_team_name','opp_team_name','opp_diff_ind','position','GW','was_home','total_points']
df_man_city_contin = df_man_city.loc[:, ~df_man_city.columns.isin(categorical_vars_test)]
df_brighton_contin = df_brighton.loc[:, ~df_brighton.columns.isin(categorical_vars_test)]

In [193]:
# Standardizing the continuous variables
scaler = StandardScaler()

# Fit and transform the train features
train_features = scaler.fit_transform(X_train_contin)

# Only transform the validation and test features
val_features = scaler.transform(X_val_contin)
test_features = scaler.transform(X_test_contin)

In [194]:
# Standardize test team features
df_man_city_feat = scaler.transform(df_man_city_contin)
df_brighton_feat = scaler.transform(df_brighton_contin)

In [195]:
print(X_val_contin.shape)
print(val_features.shape)

(25447, 66)
(25447, 66)


In [196]:
# Turn the standardized arrays back into DFs and add the column names back in
X_train_temp = pd.DataFrame(train_features, columns=X_train_contin.columns)
X_val_temp = pd.DataFrame(val_features, columns=X_val_contin.columns)
X_test_temp = pd.DataFrame(test_features, columns=X_test_contin.columns)

In [197]:
# Turn the standardized arrays back into DFs and add the column names back in (for two test teams)
df_man_city_temp = pd.DataFrame(df_man_city_feat, columns=df_man_city_contin.columns)
df_brighton_temp = pd.DataFrame(df_brighton_feat, columns=df_brighton_contin.columns)

In [198]:
# Join the categorical variables back
X_train = pd.concat([X_train_init[categorical_vars],X_train_temp], axis=1)
X_val = pd.concat([X_val_init[categorical_vars].reset_index(drop=True),X_val_temp], axis=1)
X_test = pd.concat([X_test_init[categorical_vars].reset_index(drop=True),X_test_temp], axis=1)

In [199]:
# Join the categorical variables back (for two test teams)
df_man_city = pd.concat([df_man_city[categorical_vars_test].reset_index(drop=True),df_man_city_temp], axis=1)
df_brighton = pd.concat([df_brighton[categorical_vars_test].reset_index(drop=True),df_brighton_temp], axis=1)

In [200]:
# QCing train/test/val shapes post-standardization - delete at the end
print(X_train_init.shape)
print(X_train.shape)
print(Y_train.shape)
print(X_val_init.shape)
print(X_val.shape)
print(Y_val.shape)
print(X_test_init.shape)
print(X_test.shape)
print(Y_test.shape)

(68791, 73)
(68791, 73)
(68791,)
(25447, 73)
(25447, 73)
(25447,)
(26505, 73)
(26505, 73)
(26505,)


(1154, 74)
(1304, 74)


# Exporting CSVs

In [54]:
# Turning labels into DFs so we can export them
Y_train = pd.DataFrame(Y_train, columns=['total_points'])
Y_val = pd.DataFrame(Y_val, columns=['total_points'])
Y_test = pd.DataFrame(Y_test, columns=['total_points'])

In [55]:
X_train.to_csv('data/X_train.csv', encoding='utf-8')
Y_train.to_csv('data/Y_train.csv', encoding='utf-8')
X_val.to_csv('data/X_val.csv', encoding='utf-8')
Y_val.to_csv('data/Y_val.csv', encoding='utf-8')
X_test.to_csv('data/X_test.csv', encoding='utf-8')
Y_test.to_csv('data/Y_test.csv', encoding='utf-8')

In [202]:
# Two test teams
df_man_city.to_csv('data/df_man_city.csv', encoding='utf-8')
df_brighton.to_csv('data/df_brighton.csv', encoding='utf-8')