Adding in player level data to the dataset

In [1]:
# Data handling and manipulation
import numpy as np
import pandas as pd

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px


# Statistical tools
from scipy.stats import linregress, uniform, randint
import scipy.stats as st

# Scikit-learn libraries
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score, f1_score, make_scorer

# XGBoost
import xgboost as xgb
import json

In [2]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None
# To restore the default: pd.reset_option('display.max_columns')

## Player data

I want to get player data for each match
- Were they a starter in the previous game
- What age are they (players)
- What height are they (players)
- What weight are they (players)
- How did they play in the previous game(s) (player_rating)

I will then add this player data to the matches table, aggregated over each team's 11 starters

In [3]:
# Load the per-player, per-game table (read below via its player_id and player_name columns).
player_games=pd.read_csv('data/player_games.csv')

In [4]:
# Lookup from player_id to player_name, kept for later use.
player_mapping = (
    player_games
    .set_index('player_id')
    .loc[:, 'player_name']
    .to_dict()
)

In [5]:
# Lookup from a team's wyId to its name.
teams = pd.read_csv('data/teams.csv')
team_mapping = teams.set_index('wyId').loc[:, 'name'].to_dict()

#### I need the date of each game so that I can sort by date

In [6]:
# One match file per league, each kept in its own frame.
matchesEngland = pd.read_csv('data/matches_England.csv')
matchesSpain = pd.read_csv('data/matches_Spain.csv')
matchesItaly = pd.read_csv('data/matches_Italy.csv')
matchesGermany = pd.read_csv('data/matches_Germany.csv')
matchesFrance = pd.read_csv('data/matches_France.csv')

# Stack the leagues row-wise. The per-file index labels are kept as-is here,
# so labels can repeat across leagues until the index is reset later on.
matches = pd.concat(
    [matchesEngland, matchesSpain, matchesItaly, matchesGermany, matchesFrance],
    axis=0,
)

In [7]:
matches[['wyId','dateutc']]

Unnamed: 0,wyId,dateutc
0,2500089,2018-05-13 14:00:00
1,2500090,2018-05-13 14:00:00
2,2500091,2018-05-13 14:00:00
3,2500092,2018-05-13 14:00:00
4,2500093,2018-05-13 14:00:00
...,...,...
375,2500692,2017-08-05 18:00:00
376,2500694,2017-08-05 18:00:00
377,2500695,2017-08-05 18:00:00
378,2500693,2017-08-05 15:15:00


In [8]:
#player_games=player_games.merge(matches[['wyId','dateutc']], how='left', left_on='game_id', right_on='wyId')

In [9]:
# removing null rows because they allude to games that are outside of this scope (eg world cup)
#player_games=player_games[~player_games['wyId'].isna()]

In [10]:
#sorting by date
#player_games=player_games.sort_values(by=['dateutc','wyId','player_id'])

finding out if the player started the previous game

In [11]:
#player_games_starter=player_games[['dateutc','game_id','player_id','is_starter']]

In [12]:

#player_games_starter = player_games_starter.copy()

#player_games_starter['previous_is_starter'] = player_games_starter.sort_values(by=['player_id', 'dateutc']) \
#    .groupby('player_id')['is_starter'] \
#    .shift(1, fill_value=False)

will join this onto matches later

In [13]:
#player_games_starter

### Player Rating

In [14]:
# Load the per-match player ratings (columns used below: playerankScore, matchId, playerId).
playerank= pd.read_csv('data/playerank.csv')

joining the date onto the table

In [15]:
# Attach each match's kick-off timestamp; ratings whose matchId is not in `matches`
# get NaN in the joined wyId/dateutc columns (left join).
match_dates = matches[['wyId', 'dateutc']]
playerank = pd.merge(
    playerank,
    match_dates,
    how='left',
    left_on='matchId',
    right_on='wyId',
)

In [16]:
# Keep only ratings that found a match in the left join above (wyId is non-null).
playerank = playerank[playerank['wyId'].notna()]

In [17]:
# Narrow the frame to the four columns used from here on.
rating_columns = ['playerankScore', 'matchId', 'playerId', 'dateutc']
playerank = playerank.loc[:, rating_columns]

In [18]:
# Order rows by dateutc, then matchId. The per-player running average computed below
# uses shift(1) within each player, so it depends on this row order.
playerank=playerank.sort_values(by=['dateutc','matchId'])

Get the running average of each player's rating up to, but not including, the current match

In [19]:
def calculate_running_avg(x):
    """Return, for each position, the mean of all values strictly before it.

    Parameters
    ----------
    x : pandas.Series
        Values in the order they should be accumulated.

    Returns
    -------
    pandas.Series
        Same index as ``x``. The first element is NaN because there is no
        earlier value to average.
    """
    # Shift by one so the current row never contributes to its own average
    previous = x.shift(1)
    # Number of earlier (non-missing) values seen so far
    seen = previous.expanding().count()
    return previous.cumsum() / seen

# Per-player running average of playerankScore over that player's earlier rows only.
# transform() returns a result indexed exactly like playerank, so the assignment lines
# up row-for-row. The previous groupby.apply(...).reset_index(level=0, drop=True) only
# aligned when apply prepended the group key to the index, which depends on the
# pandas version in use.
playerank['running_avg'] = playerank.groupby('playerId')['playerankScore'].transform(
    calculate_running_avg
)

# Display the DataFrame with the running average column
playerank[['playerId', 'dateutc', 'playerankScore', 'running_avg']]


Unnamed: 0,playerId,dateutc,playerankScore,running_avg
2488,134512,2017-08-04 18:45:00,0.0078,
2870,135913,2017-08-04 18:45:00,-0.0009,
3292,144884,2017-08-04 18:45:00,0.0155,
10338,207143,2017-08-04 18:45:00,0.0168,
11988,21097,2017-08-04 18:45:00,0.0176,
...,...,...,...,...
34949,403449,2018-05-20 18:45:00,0.0033,-0.004490
35863,417231,2018-05-20 18:45:00,-0.0046,-0.001840
43328,8306,2018-05-20 18:45:00,-0.0039,-0.006321
43657,8327,2018-05-20 18:45:00,0.0400,0.024231


In [20]:
playerank

Unnamed: 0,playerankScore,matchId,playerId,dateutc,running_avg
2488,0.0078,2500691,134512,2017-08-04 18:45:00,
2870,-0.0009,2500691,135913,2017-08-04 18:45:00,
3292,0.0155,2500691,144884,2017-08-04 18:45:00,
10338,0.0168,2500691,207143,2017-08-04 18:45:00,
11988,0.0176,2500691,21097,2017-08-04 18:45:00,
...,...,...,...,...,...
34949,0.0033,2576336,403449,2018-05-20 18:45:00,-0.004490
35863,-0.0046,2576336,417231,2018-05-20 18:45:00,-0.001840
43328,-0.0039,2576336,8306,2018-05-20 18:45:00,-0.006321
43657,0.0400,2576336,8327,2018-05-20 18:45:00,0.024231


### Player bio stats

In [21]:
# Load the player table (columns used below: wyId, birthDate, height, weight).
players = pd.read_csv('data/players.csv')

In [22]:
# Parse birth dates so calendar arithmetic can be done on them
players['birthDate'] = pd.to_datetime(players['birthDate'])

# Ages are measured as of this fixed reference date (2018-01-01)
reference_date = pd.to_datetime('2018-01-01')

# Age in completed calendar years. Dividing the day difference by 365 ignores
# leap days and overstates the age by one year for birthdays that fall shortly
# after the reference day, so compare month/day explicitly instead.
birth = players['birthDate']
birthday_reached = (
    (birth.dt.month < reference_date.month)
    | ((birth.dt.month == reference_date.month) & (birth.dt.day <= reference_date.day))
)
# Missing birth dates stay NaN because birth.dt.year is NaN for them
players['age'] = reference_date.year - birth.dt.year - (~birthday_reached).astype(int)

In [23]:
# Player id plus the three attributes that get averaged per team later on.
player_bio_stats = players.loc[:, ['wyId', 'age', 'height', 'weight']]

I will join this onto the matches later

In [24]:
player_bio_stats

Unnamed: 0,wyId,age,height,weight
0,32777,28,187,78
1,393228,18,182,73
2,393230,19,176,72
3,32793,27,187,82
4,393247,18,192,84
...,...,...,...,...
3598,120839,28,175,72
3599,114736,26,183,76
3600,114908,23,179,78
3601,285583,21,182,70


### Matches

In [25]:
# Order matches by dateutc then wyId and renumber the rows 0..n-1.
matches = matches.sort_values(by=['dateutc', 'wyId'], ignore_index=True)

In [26]:
# Remove, in a single pass, every column that is dropped at this stage.
columns_to_drop = [
    'status', 'duration',
    'team1.scoreET', 'team1.scoreP', 'team1.hasFormation',
    'team2.scoreET', 'team2.scoreP', 'team2.hasFormation',
    'roundId', 'seasonId',
    'teamsData', 'venue', 'date', 'referees',
    'team1.coachId', 'team1.formation', 'team1.formation.bench',
    'team2.coachId', 'team2.formation', 'team2.formation.bench',
]
matches = matches.drop(columns=columns_to_drop)


In [27]:
# Re-apply the (dateutc, wyId) ordering; unlike the earlier sort, the index is not reset here.
matches=matches.sort_values(by=['dateutc','wyId'])

In [28]:
# Team id -> team name lookup used to label the matches.
# NOTE(review): this reads the same file and rebuilds the same mapping as the
# earlier "team mapping" cell; one of the two could be removed.
teams = pd.read_csv('data/teams.csv')
team_mapping = teams.set_index('wyId').loc[:, 'name'].to_dict()

In [29]:
# Replace numeric team ids with team names; ids absent from team_mapping become NaN.
for id_column in ('team1.teamId', 'team2.teamId', 'winner'):
    matches[id_column] = matches[id_column].map(team_mapping)

# The two team id columns now hold names, so rename them accordingly.
matches = matches.rename(
    columns={'team1.teamId': 'team1.teamName', 'team2.teamId': 'team2.teamName'}
)

In [30]:
matches

Unnamed: 0,gameweek,dateutc,winner,wyId,label,competitionId,team1.side,team1.teamName,team1.score,team1.scoreHT,team1.formation.lineup,team1.formation.substitutions,team2.side,team2.teamName,team2.score,team2.scoreHT,team2.formation.lineup,team2.formation.substitutions
0,1,2017-08-04 18:45:00,Monaco,2500691,"Monaco - Toulouse, 3 - 2",412,away,Toulouse,2,1,"[{'playerId': 288423, 'ownGoals': '0', 'redCar...","[{'playerIn': 43196, 'playerOut': 288423, 'min...",home,Monaco,3,1,"[{'playerId': 135913, 'ownGoals': '0', 'redCar...","[{'playerIn': 86239, 'playerOut': 135913, 'min..."
1,1,2017-08-05 15:15:00,PSG,2500693,"PSG - Amiens SC, 2 - 0",412,home,PSG,2,1,"[{'playerId': 20394, 'ownGoals': '0', 'redCard...","[{'playerIn': 231138, 'playerOut': 20394, 'min...",away,Amiens SC,0,0,"[{'playerId': 294695, 'ownGoals': '0', 'redCar...","[{'playerIn': 25938, 'playerOut': 294695, 'min..."
2,1,2017-08-05 18:00:00,Olympique Lyonnais,2500688,"Olympique Lyonnais - Strasbourg, 4 - 0",412,home,Olympique Lyonnais,4,1,"[{'playerId': 5167, 'ownGoals': '0', 'redCards...","[{'playerIn': 404226, 'playerOut': 5167, 'minu...",away,Strasbourg,0,0,"[{'playerId': 26327, 'ownGoals': '0', 'redCard...","[{'playerIn': 412701, 'playerOut': 26327, 'min..."
3,1,2017-08-05 18:00:00,Guingamp,2500690,"Metz - Guingamp, 1 - 3",412,home,Metz,1,1,"[{'playerId': 330395, 'ownGoals': '0', 'redCar...","[{'playerIn': 498357, 'playerOut': 330395, 'mi...",away,Guingamp,3,1,"[{'playerId': 25474, 'ownGoals': '0', 'redCard...","[{'playerIn': 340920, 'playerOut': 25474, 'min..."
4,1,2017-08-05 18:00:00,Montpellier,2500692,"Montpellier - Caen, 1 - 0",412,away,Caen,0,0,"[{'playerId': 25715, 'ownGoals': '0', 'redCard...","[{'playerIn': 131321, 'playerOut': 25715, 'min...",home,Montpellier,1,0,"[{'playerId': 344744, 'ownGoals': '0', 'redCar...","[{'playerIn': 143189, 'playerOut': 344744, 'mi..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1821,38,2018-05-20 16:00:00,SPAL,2576337,"SPAL - Sampdoria, 3 - 1",524,away,Sampdoria,1,0,"[{'playerId': 20689, 'ownGoals': '0', 'redCard...","[{'playerIn': 20479, 'playerOut': 20689, 'minu...",home,SPAL,3,1,"[{'playerId': 226200, 'ownGoals': '0', 'redCar...","[{'playerIn': 134420, 'playerOut': 226200, 'mi..."
1822,38,2018-05-20 16:30:00,,2565925,"Atl\u00e9tico Madrid - Eibar, 2 - 2",795,home,Atl\u00e9tico Madrid,2,1,"[{'playerId': 3443, 'ownGoals': '0', 'redCards...","[{'playerIn': 4338, 'playerOut': 3443, 'minute...",away,Eibar,2,1,"[{'playerId': 4180, 'ownGoals': '0', 'redCards...","[{'playerIn': 4805, 'playerOut': 4180, 'minute..."
1823,38,2018-05-20 18:45:00,Barcelona,2565922,"Barcelona - Real Sociedad, 1 - 0",795,home,Barcelona,1,0,"[{'playerId': 286390, 'ownGoals': '0', 'redCar...","[{'playerIn': 8323, 'playerOut': 286390, 'minu...",away,Real Sociedad,0,0,"[{'playerId': 261127, 'ownGoals': '2', 'redCar...","[{'playerIn': 3282, 'playerOut': 261127, 'minu..."
1824,38,2018-05-20 18:45:00,Internazionale,2576335,"Lazio - Internazionale, 2 - 3",524,home,Lazio,2,2,"[{'playerId': 21384, 'ownGoals': '2', 'redCard...","[{'playerIn': 37745, 'playerOut': 21384, 'minu...",away,Internazionale,3,1,"[{'playerId': 20556, 'ownGoals': '0', 'redCard...","[{'playerIn': 20626, 'playerOut': 20556, 'minu..."


#### Restructuring the matches table so that it is easier to identify the home and away teams

In [31]:
# Collect the home side of every match, whichever of the team1/team2 slots it occupies.
home_columns = ['home.teamName', 'home.formation.lineup']

team1_is_home = matches['team1.side'] == 'home'
home_teams1 = matches.loc[team1_is_home, ['team1.teamName', 'team1.formation.lineup']].copy()
home_teams1.columns = home_columns

team2_is_home = matches['team2.side'] == 'home'
home_teams2 = matches.loc[team2_is_home, ['team2.teamName', 'team2.formation.lineup']].copy()
home_teams2.columns = home_columns

# Stack both halves and put the rows back in index order.
home_teams = pd.concat([home_teams1, home_teams2], axis=0).sort_index()



In [32]:
# Collect the away side of every match, whichever of the team1/team2 slots it occupies.
away_columns = ['away.teamName', 'away.formation.lineup']

team1_is_away = matches['team1.side'] == 'away'
away_teams1 = matches.loc[team1_is_away, ['team1.teamName', 'team1.formation.lineup']].copy()
away_teams1.columns = away_columns

team2_is_away = matches['team2.side'] == 'away'
away_teams2 = matches.loc[team2_is_away, ['team2.teamName', 'team2.formation.lineup']].copy()
away_teams2.columns = away_columns

# Stack both halves and put the rows back in index order.
away_teams = pd.concat([away_teams1, away_teams2], axis=0).sort_index()

In [33]:
# Match-level columns that are kept alongside the home/away columns built above.
match_cols = matches[['competitionId','gameweek','wyId', 'dateutc', 'winner', 'label']]

In [34]:
len(away_teams)

1826

In [35]:
# Column-wise concat aligns rows by index label; match_cols, home_teams and away_teams
# were all sliced from `matches`, so they carry its index labels.
matches = pd.concat([match_cols , home_teams, away_teams], axis = 1)

In [36]:
# competitionId is dropped here and does not appear in the tables built below.
matches = matches.drop(columns=['competitionId'])


### Restructuring so I can get some running totals

In [37]:
# Home half of each match: date, match id, team name and starting lineup.
homeMatches = matches.loc[:, ['dateutc', 'wyId', 'home.teamName', 'home.formation.lineup']]

In [38]:
# Strip the 'home.' prefix so home and away rows can share one column layout.
homeMatches = homeMatches.rename(
    columns={f'home.{name}': name for name in ('teamName', 'formation.lineup')}
)

In [39]:
# Away half of each match: date, match id, team name and starting lineup.
awayMatches = matches.loc[:, ['dateutc', 'wyId', 'away.teamName', 'away.formation.lineup']]

In [40]:
# Strip the 'away.' prefix so home and away rows can share one column layout.
awayMatches = awayMatches.rename(
    columns={f'away.{name}': name for name in ('teamName', 'formation.lineup')}
)

In [41]:
# One row per (match, team): stack the home and away halves, then order by date and match id.
disjointedMatches = (
    pd.concat([homeMatches, awayMatches], axis=0)
    .sort_values(by=['dateutc', 'wyId'])
)

Parsing the JSON-like starting-lineup string to extract the player IDs

In [42]:
def extract_player_ids(lineup_str):
    """Return the ``playerId`` of every entry in a serialized lineup.

    Parameters
    ----------
    lineup_str : str
        Text form of a list of dicts, each containing a ``'playerId'`` key.
        Both Python-literal text (single quotes, ``None``) and JSON text
        (double quotes, ``null``) are accepted.

    Returns
    -------
    list
        The ``playerId`` values in lineup order.

    Raises
    ------
    json.JSONDecodeError
        If the text is neither a valid Python literal nor parseable as JSON
        after swapping single quotes for double quotes.
    """
    import ast  # local import: only this function needs it

    try:
        # The lineups are Python reprs, so parse them as Python literals. This keeps
        # apostrophes inside values and None intact, which the quote swap would break.
        lineup_data = ast.literal_eval(lineup_str)
    except (ValueError, SyntaxError):
        # Fall back to the original behaviour for JSON-flavoured text (e.g. bare null)
        lineup_data = json.loads(lineup_str.replace("'", '"'))
    return [player['playerId'] for player in lineup_data]

In [43]:
# Parse every lineup into its list of starter ids, then build the wide
# player1..playerN frame with a single constructor call. This avoids creating
# one Series per row, which is what .apply(pd.Series) does.
starter_ids = disjointedMatches['formation.lineup'].apply(extract_player_ids)
player_columns = pd.DataFrame(starter_ids.tolist(), index=starter_ids.index)
player_columns.columns = [f'player{i+1}' for i in range(player_columns.shape[1])]

# Append the new player columns to the match/team rows (same index on both sides)
disjointedMatches = pd.concat([disjointedMatches, player_columns], axis=1)



In [44]:
# The raw lineup text has been expanded into player columns, so drop it.
disjointedMatches = disjointedMatches.drop(columns=['formation.lineup'])

### Joining previous-game starter stats

In [45]:
disjointedMatches.columns

Index(['dateutc', 'wyId', 'teamName', 'player1', 'player2', 'player3',
       'player4', 'player5', 'player6', 'player7', 'player8', 'player9',
       'player10', 'player11'],
      dtype='object')

In [46]:
# Step 1: for every starter slot, record who filled that slot in the same
# team's previous row (NaN on a team's first row).
starter_slots = [f'player{i}' for i in range(1, 12)]
for slot in starter_slots:
    disjointedMatches[slot + '_prev'] = disjointedMatches.groupby('teamName')[slot].shift(1)

In [47]:
# Step 2: Create a function to count the intersection of players
def count_matching_players(row, n_players=11):
    """Count how many current starters also appear among the previous starters.

    Parameters
    ----------
    row : mapping
        Anything indexable by column name (e.g. a DataFrame row) holding
        ``player1..playerN`` and ``player1_prev..playerN_prev``.
    n_players : int, default 11
        Number of starter slots to read from ``row``.

    Returns
    -------
    int
        Size of the overlap between the two lineups. Missing values (NaN/None)
        are ignored, so an empty slot never counts as a shared player.
    """
    def _lineup(suffix):
        # Gather the ids of one lineup, skipping missing slots
        values = (row[f'player{i}{suffix}'] for i in range(1, n_players + 1))
        return {value for value in values if pd.notna(value)}

    return len(_lineup('') & _lineup('_prev'))


In [48]:
# Step 3: evaluate the overlap row by row and store it as a new column
disjointedMatches['num_started_prev'] = disjointedMatches.apply(
    count_matching_players,
    axis='columns',
)


### Joining player bio stats

In [49]:
player_bio_stats

Unnamed: 0,wyId,age,height,weight
0,32777,28,187,78
1,393228,18,182,73
2,393230,19,176,72
3,32793,27,187,82
4,393247,18,192,84
...,...,...,...,...
3598,120839,28,175,72
3599,114736,26,183,76
3600,114908,23,179,78
3601,285583,21,182,70


In [50]:

starter_slots = [f'player{i}' for i in range(1, 12)]
bio_columns = ['age', 'height', 'weight']

# Long form: one row per (match, team, starter slot)
melted_disjointed = disjointedMatches.melt(
    id_vars=['dateutc', 'wyId', 'teamName', 'num_started_prev'],
    value_vars=starter_slots,
    var_name='player_num',
    value_name='player_id',
)

# Look up each starter's attributes. Both sides have a 'wyId' column, so the
# match id becomes 'wyId_x' and the player id 'wyId_y' after this merge.
melted_disjointed = melted_disjointed.merge(
    player_bio_stats[['wyId'] + bio_columns],
    how='left',
    left_on='player_id',
    right_on='wyId',
)

# Mean age, height and weight of the starters for every (match, team)
averages = melted_disjointed.groupby(['wyId_x', 'teamName'])[bio_columns].mean().reset_index()

# Bring the averages back onto the wide table; this also carries over a
# 'wyId_x' key column from `averages`.
disjointedMatches = disjointedMatches.merge(
    averages,
    how='left',
    left_on=['wyId', 'teamName'],
    right_on=['wyId_x', 'teamName'],
)

# disjointedMatches now contains the average age, height and weight of each row's starters

In [51]:
# The previous-lineup helper columns and the duplicated merge key are no longer needed.
prev_columns = [f'player{i}_prev' for i in range(1, 12)]
disjointedMatches = disjointedMatches.drop(columns=prev_columns + ['wyId_x'])

### joining player ratings

In [52]:
playerank

Unnamed: 0,playerankScore,matchId,playerId,dateutc,running_avg
2488,0.0078,2500691,134512,2017-08-04 18:45:00,
2870,-0.0009,2500691,135913,2017-08-04 18:45:00,
3292,0.0155,2500691,144884,2017-08-04 18:45:00,
10338,0.0168,2500691,207143,2017-08-04 18:45:00,
11988,0.0176,2500691,21097,2017-08-04 18:45:00,
...,...,...,...,...,...
34949,0.0033,2576336,403449,2018-05-20 18:45:00,-0.004490
35863,-0.0046,2576336,417231,2018-05-20 18:45:00,-0.001840
43328,-0.0039,2576336,8306,2018-05-20 18:45:00,-0.006321
43657,0.0400,2576336,8327,2018-05-20 18:45:00,0.024231


In [53]:
# Long form again: one row per (match, team, starter slot), keeping the
# aggregates computed so far as identifier columns.
starter_slots = [f'player{i}' for i in range(1, 12)]
melted_disjointed = disjointedMatches.melt(
    id_vars=['dateutc', 'wyId', 'teamName', 'num_started_prev', 'age', 'height', 'weight'],
    value_vars=starter_slots,
    var_name='player_num',
    value_name='player_id',
)

In [54]:
# Attach each starter's running_avg from playerank, matching on (match id, player id).
# Starters with no playerank row for that match get NaN (left join).
melted_disjointed = melted_disjointed.merge(
    playerank[['playerId', 'matchId', 'running_avg']],
    how='left',
    left_on=['wyId', 'player_id'],
    right_on=['matchId', 'playerId'],
)

In [55]:
# Mean running_avg (player rating) of the starters for every (match, team)
averages = melted_disjointed.groupby(['wyId', 'teamName'], as_index=False)[['running_avg']].mean()

In [56]:
# Bring the per-team rating average back onto the wide table
disjointedMatches = disjointedMatches.merge(averages, how='left', on=['wyId', 'teamName'])


In [57]:
# Individual starter ids have been aggregated, so remove the eleven slot columns.
disjointedMatches = disjointedMatches.drop(
    columns=[f'player{i}' for i in range(1, 12)]
)
                

In [58]:
# Give the averaged running_avg a team-level name.
disjointedMatches = disjointedMatches.rename(columns={'running_avg': 'teams_player_rating'})

In [59]:
# Alias only (no copy): player_level_stats refers to the same object as disjointedMatches.
player_level_stats=disjointedMatches

In [60]:
# Write the per-(match, team) features to disk without the index column.
player_level_stats.to_csv('data/player_level_stats.csv', index=False)

In [61]:
player_level_stats

Unnamed: 0,dateutc,wyId,teamName,num_started_prev,age,height,weight,teams_player_rating
0,2017-08-04 18:45:00,2500691,Monaco,0,25.636364,180.727273,72.545455,
1,2017-08-04 18:45:00,2500691,Toulouse,0,22.909091,183.636364,77.000000,
2,2017-08-05 15:15:00,2500693,PSG,0,28.000000,182.363636,75.454545,
3,2017-08-05 15:15:00,2500693,Amiens SC,0,27.090909,181.727273,77.363636,
4,2017-08-05 18:00:00,2500688,Olympique Lyonnais,0,25.636364,179.454545,76.454545,
...,...,...,...,...,...,...,...,...
3647,2018-05-20 18:45:00,2565922,Real Sociedad,5,26.454545,182.545455,77.636364,0.010298
3648,2018-05-20 18:45:00,2576335,Lazio,8,25.363636,184.363636,76.636364,0.010953
3649,2018-05-20 18:45:00,2576335,Internazionale,10,27.000000,183.454545,76.454545,0.011950
3650,2018-05-20 18:45:00,2576336,Sassuolo,9,26.636364,182.818182,77.000000,0.003664
