Adding in player level data to the dataset

In [1]:
# Data handling and manipulation
import numpy as np
import pandas as pd

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px


# Statistical tools
from scipy.stats import linregress, uniform, randint
import scipy.stats as st

# Scikit-learn libraries
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score, f1_score, make_scorer

# XGBoost
import xgboost as xgb
import json

In [2]:
pd.set_option('display.max_columns', None)
#pd.reset_option('display.max_columns')

In [3]:

matches = pd.read_csv('data/matches_England.csv')

In [4]:
matches.drop(['status','duration','team1.scoreET','team1.scoreP','team1.hasFormation','team2.scoreET','team2.scoreP','team2.hasFormation'],
             axis=1, inplace=True)

matches.drop(['roundId','seasonId'],
             axis=1, inplace=True)


matches.drop(['teamsData', 'venue', 
                'date', 'referees', 'team1.coachId', 
                'team1.formation', 'team1.formation.bench', 'team2.coachId',
                'team2.formation', 'team2.formation.bench'],
            axis=1, inplace=True)


In [5]:
matches=matches.sort_values(by=['dateutc','wyId'])

In [6]:
#mapping names

teams=pd.read_csv('data/teams.csv')
team_mapping=teams.set_index('wyId')['name'].to_dict()

In [7]:
matches['team1.teamId']=matches['team1.teamId'].map(team_mapping)
matches['team2.teamId']=matches['team2.teamId'].map(team_mapping)
matches['winner']=matches['winner'].map(team_mapping)

matches.rename(columns={'team1.teamId':'team1.teamName','team2.teamId':'team2.teamName'}, inplace=True)

In [8]:
matches.columns

Index(['gameweek', 'dateutc', 'winner', 'wyId', 'label', 'competitionId',
       'team1.side', 'team1.teamName', 'team1.score', 'team1.scoreHT',
       'team1.formation.lineup', 'team1.formation.substitutions', 'team2.side',
       'team2.teamName', 'team2.score', 'team2.scoreHT',
       'team2.formation.lineup', 'team2.formation.substitutions'],
      dtype='object')

#### Restructuring matches table so that it is more easier to identify the home and away teams

In [9]:
home_teams1 = matches[matches['team1.side'] == 'home'].copy()
home_teams1 = home_teams1[['team1.teamName', 'team1.score', 'team1.scoreHT', 'team1.formation.lineup']]
home_teams1.columns=[ 'home.teamName', 'home.score', 'home.scoreHT', 'home.formation.lineup']

home_teams2 = matches[matches['team2.side'] == 'home'].copy()
home_teams2 = home_teams2[['team2.teamName', 'team2.score', 'team2.scoreHT', 'team1.formation.lineup']]
home_teams2.columns = ['home.teamName', 'home.score', 'home.scoreHT', 'home.formation.lineup']

home_teams=pd.concat([home_teams1, home_teams2], axis=0).sort_index()



In [10]:
away_teams1 = matches[matches['team1.side'] == 'away'].copy()
away_teams1 = away_teams1[['team1.teamName', 'team1.score', 'team1.scoreHT', 'team1.formation.lineup']]
away_teams1.columns=['away.teamName', 'away.score', 'away.scoreHT', 'away.formation.lineup']

away_teams2 = matches[matches['team2.side'] == 'away'].copy()
away_teams2 = away_teams2[['team2.teamName', 'team2.score', 'team2.scoreHT', 'team1.formation.lineup']]
away_teams2.columns = ['away.teamName', 'away.score', 'away.scoreHT', 'away.formation.lineup']

away_teams=pd.concat([away_teams1, away_teams2], axis=0).sort_index()

In [11]:
match_cols = matches[['competitionId','gameweek','wyId', 'dateutc', 'winner', 'label']]

In [12]:
matches = pd.concat([match_cols , home_teams, away_teams], axis = 1)

In [13]:
matches.drop(['competitionId', 'home.scoreHT', 'away.scoreHT',],
             axis=1, inplace=True)


In [14]:
matches.drop(['home.teamName', 'away.teamName', 'home.score', 'away.score'],
             axis=1, inplace=True)


In [15]:
matches.head()

Unnamed: 0,gameweek,wyId,dateutc,winner,label,home.formation.lineup,away.formation.lineup
379,1,2499719,2017-08-11 18:45:00,Arsenal,"Arsenal - Leicester City, 4 - 3","[{'playerId': 370224, 'ownGoals': '0', 'redCar...","[{'playerId': 370224, 'ownGoals': '0', 'redCar..."
378,1,2499727,2017-08-12 11:30:00,,"Watford - Liverpool, 3 - 3","[{'playerId': 15808, 'ownGoals': '0', 'redCard...","[{'playerId': 15808, 'ownGoals': '0', 'redCard..."
373,1,2499721,2017-08-12 14:00:00,Burnley,"Chelsea - Burnley, 2 - 3","[{'playerId': 93, 'ownGoals': '0', 'redCards':...","[{'playerId': 93, 'ownGoals': '0', 'redCards':..."
374,1,2499722,2017-08-12 14:00:00,Huddersfield Town,"Crystal Palace - Huddersfield Town, 0 - 3","[{'playerId': 127537, 'ownGoals': '0', 'redCar...","[{'playerId': 127537, 'ownGoals': '0', 'redCar..."
375,1,2499723,2017-08-12 14:00:00,Everton,"Everton - Stoke City, 1 - 0","[{'playerId': 20450, 'ownGoals': '0', 'redCard...","[{'playerId': 20450, 'ownGoals': '0', 'redCard..."


In [16]:
def extract_player_ids(lineup_str):
    # Convert JSON-like string to Python object
    lineup_data = json.loads(lineup_str.replace("'", '"').replace("null", "null"))
    # Extract player IDs
    return [player['playerId'] for player in lineup_data]

In [17]:
# Apply the function to the column and expand into separate columns
home_player_columns = matches['home.formation.lineup'].apply(extract_player_ids).apply(pd.Series)
home_player_columns.columns = [f'home.player{i+1}' for i in range(home_player_columns.shape[1])]

# Concatenate the new player columns with the original DataFrame
matches = pd.concat([matches, home_player_columns], axis=1)



In [18]:
# Apply the function to the column and expand into separate columns
away_player_columns = matches['home.formation.lineup'].apply(extract_player_ids).apply(pd.Series)
away_player_columns.columns = [f'home.player{i+1}' for i in range(away_player_columns.shape[1])]

# Concatenate the new player columns with the original DataFrame
matches = pd.concat([matches, away_player_columns], axis=1)


In [19]:
matches

Unnamed: 0,gameweek,wyId,dateutc,winner,label,home.formation.lineup,away.formation.lineup,home.player1,home.player2,home.player3,home.player4,home.player5,home.player6,home.player7,home.player8,home.player9,home.player10,home.player11,home.player1.1,home.player2.1,home.player3.1,home.player4.1,home.player5.1,home.player6.1,home.player7.1,home.player8.1,home.player9.1,home.player10.1,home.player11.1
379,1,2499719,2017-08-11 18:45:00,Arsenal,"Arsenal - Leicester City, 4 - 3","[{'playerId': 370224, 'ownGoals': '0', 'redCar...","[{'playerId': 370224, 'ownGoals': '0', 'redCar...",370224,120339,7945,14869,25413,7868,3560,167145,3319,7882,49876,370224,120339,7945,14869,25413,7868,3560,167145,3319,7882,49876
378,1,2499727,2017-08-12 11:30:00,,"Watford - Liverpool, 3 - 3","[{'playerId': 15808, 'ownGoals': '0', 'redCard...","[{'playerId': 15808, 'ownGoals': '0', 'redCard...",15808,120353,25393,346101,14870,116,18550,25747,7964,8514,4908,15808,120353,25393,346101,14870,116,18550,25747,7964,8514,4908
373,1,2499721,2017-08-12 14:00:00,Burnley,"Chelsea - Burnley, 2 - 3","[{'playerId': 93, 'ownGoals': '0', 'redCards':...","[{'playerId': 93, 'ownGoals': '0', 'redCards':...",93,70092,8433,8125,8944,77502,9433,9623,8980,8643,9127,93,70092,8433,8125,8944,77502,9433,9623,8980,8643,9127
374,1,2499722,2017-08-12 14:00:00,Huddersfield Town,"Crystal Palace - Huddersfield Town, 0 - 3","[{'playerId': 127537, 'ownGoals': '0', 'redCar...","[{'playerId': 127537, 'ownGoals': '0', 'redCar...",127537,397168,8425,8221,454,8142,8186,240559,38031,235555,8422,127537,397168,8425,8221,454,8142,8186,240559,38031,235555,8422
375,1,2499723,2017-08-12 14:00:00,Everton,"Everton - Stoke City, 1 - 0","[{'playerId': 20450, 'ownGoals': '0', 'redCard...","[{'playerId': 20450, 'ownGoals': '0', 'redCard...",20450,9532,15054,58978,49872,7932,8731,8094,107,25854,77548,20450,9532,15054,58978,49872,7932,8731,8094,107,25854,77548
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5,38,2500094,2018-05-13 14:00:00,Newcastle United,"Newcastle United - Chelsea, 3 - 0","[{'playerId': 12536, 'ownGoals': '0', 'redCard...","[{'playerId': 12536, 'ownGoals': '0', 'redCard...",12536,9227,8620,7967,252365,104851,3523,8833,293686,230883,7978,12536,9227,8620,7967,252365,104851,3523,8833,293686,230883,7978
6,38,2500095,2018-05-13 14:00:00,Manchester City,"Southampton - Manchester City, 0 - 1","[{'playerId': 8464, 'ownGoals': '0', 'redCards...","[{'playerId': 8464, 'ownGoals': '0', 'redCards...",8464,265673,14808,9380,245364,3662,70083,134502,11066,38021,105339,8464,265673,14808,9380,245364,3662,70083,134502,11066,38021,105339
7,38,2500096,2018-05-13 14:00:00,Stoke City,"Swansea City - Stoke City, 1 - 2","[{'playerId': 8498, 'ownGoals': '0', 'redCards...","[{'playerId': 8498, 'ownGoals': '0', 'redCards...",8498,77557,77552,246866,8976,25571,284,25572,8192,77541,7847,8498,77557,77552,246866,8976,25571,284,25572,8192,77541,7847
8,38,2500097,2018-05-13 14:00:00,Tottenham Hotspur,"Tottenham Hotspur - Leicester City, 5 - 4","[{'playerId': 149019, 'ownGoals': '0', 'redCar...","[{'playerId': 149019, 'ownGoals': '0', 'redCar...",149019,283142,285508,8653,93084,8488,14853,3928,70403,12829,26150,149019,283142,285508,8653,93084,8488,14853,3928,70403,12829,26150
