## __Import__

In [4]:
import pandas as pd
import numpy as np
import json 
import sys
import os
from tqdm import tqdm
from datetime import datetime
from statsbombpy import sb
from collections import Counter


# Run the notebook from inside the notebooks folder
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__name__), '..')))

from scripts.dataloader import Dataloader

leauges = ["bundesliga", "premier_league"]
df = pd.DataFrame()
for l in leauges:
    dataloader = Dataloader(league=l)
    dataloader.load_data()
    df_temp = dataloader.get_dimension(dimension="standard_stats",row_filter=False)
    df = pd.concat([df, df_temp], axis=0)

with open("../../config/position_mapping.json","r") as f:
    position_mapping = json.load(f)

[2025-07-08 20:11:11] Loading data form local file system
[2025-07-08 20:11:25] Loading data form local file system


  self.df = pd.read_csv(file_path,dtype=self.dtypes)


In [1]:
import json 
import sys
import os
from tqdm import tqdm
from datetime import datetime
from statsbombpy import sb
from collections import Counter
# Run the notebook from inside the notebooks folder
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__name__), '..')))

from scripts.dataloader import Dataloader
dataloader = Dataloader(file_path="../../data/new_approach/all_leagues.parquet") # /data/new_approach/all_leagues.parquet
dataloader.load_data()
df = dataloader.get_dimension("standard_stats", row_filter=False)

[2025-07-08 20:47:23] Loading data from local file system


In [2]:
all_players = set()
for player in df.loc[df["player"].notna(), "player"].values:
    all_players.add(player)

for player in df.loc[df["substitution_replacement"].notna(),"substitution_replacement"].values:
    all_players.add(player)
print(f"In total {len(all_players)} players applapeared on the pitch during the season.")

In total 2644 players applapeared on the pitch during the season.


## Main Method

In [9]:
def log_step(message):
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(f"[{timestamp}] {message}")

def get_most_frequent_pos(input_list):
    """
    Return most occupied position and maps it to global position (i.e. GK,DF,MF,FW)
    """
    input_list = extract_positions(input_list, unique=False)
    if not input_list:
        return None
    
    counter = Counter(input_list)
    max_count = max(counter.values())
    most_frequent_pos = list({item for item, count in counter.items() if count == max_count})
    
    # Map positions to roles, then remove duplicates using a set
    mapped_positions = {position_mapping[pos] for pos in most_frequent_pos}
    
    # Join the unique mapped positions into a single string
    return ", ".join(mapped_positions)

def extract_positions(positions,unique=False):
    # Flatten the array of lists into a single list
    flattened_positions = np.concatenate(positions.values).tolist()
    # Check if the entire flattened list is empty
    if all(len(pos_list) == 0 for pos_list in flattened_positions):
        return []
    # Extract positions from the dictionaries

    if unique:
        positions = [pos["position"] for pos in flattened_positions if "position" in pos]
        return list(set(positions))
        
    else:
        return [pos["position"] for pos in flattened_positions if "position" in pos]
    
def extract_global_position(positions):
    positions_list = get_most_frequent_pos(positions)
    if len(positions_list)==1:
        return position_mapping[positions_list[0]]
    elif len(positions_list)==0:
        return positions
    else:
        try:
            output = position_mapping[positions_list[0]]
            for pos in positions_list[1:]:
                output += f",{position_mapping[pos]}"
            return output
        except:
            print(positions_list)

def get_positions_played(df):
    positions_played_df = df.groupby("player_id").agg(
        position=("positions", lambda x: get_most_frequent_pos(x)),
        positions_played=("positions", lambda x: extract_positions(x)),
        unique_positions_played=("positions", lambda x: extract_positions(x,unique=True))
        #global_position=("positions", lambda x: extract_global_position(x)) 
    )
    return positions_played_df

def get_minutes_played(df):
    df_with_flags = df.copy()

    all_players = set()
    match_duration = df_with_flags["minute"].max()

    for player in df_with_flags.loc[df_with_flags["player"].notna(),"player"].values:
        all_players.add(player)

    for player in df_with_flags.loc[df_with_flags["substitution_replacement"].notna(), "substitution_replacement"].values:
        all_players.add(player)

    # flags 
    df_with_flags["is_substituted"] = df_with_flags["substitution_outcome"].notna()
    df_with_flags["match_duration"] = match_duration
    df_with_flags["minutes_played_subbed_in"] = match_duration - df_with_flags["minute"]

    # filter
    df_player_subbed_out = df_with_flags.loc[df_with_flags["is_substituted"]==True,["player", "player_id", "minute"]]
    df_player_subbed_in = df_with_flags.loc[df_with_flags["is_substituted"]==True, ["substitution_replacement", "player_id", "minutes_played_subbed_in"]]

    # rename
    df_player_subbed_in = df_player_subbed_in.rename({"substitution_replacement":"player",
                                                      "minutes_played_subbed_in" :"minutes_played"}, axis=1)
    df_player_subbed_out = df_player_subbed_out.rename({"minute":"minutes_played"}, axis=1)

    # add subbed_in subbed_of mask
    df_player_subbed_in["subbed_in"] = 1
    df_player_subbed_out["subbed_out"] = 1

    df_subbed_player = pd.concat([df_player_subbed_in, df_player_subbed_out],axis=0)

    # add remaining players that were not subbed off
    players_not_subbed_off = [player for player in all_players if player not in df_subbed_player["player"].values]
    # get player_id for players not subbed off
    player_id_map = df_with_flags.set_index("player")["player_id"].to_dict()
    df_player_not_subbed_off = pd.DataFrame({
        "player": players_not_subbed_off,
        "player_id": [player_id_map.get(player, np.nan) for player in players_not_subbed_off],
        "minutes_played" : 90
    })

    df_result = pd.concat([df_subbed_player, df_player_not_subbed_off],axis=0).reset_index(drop=True)
    df_result["match_played"] = 1

    return df_result


def analyze_standard_stats(df):
    match_ids = df["match_id"].unique()
    columns = ["player", "player_id", "match_id", "minute", "substitution_replacement", "substitution_outcome"]
    standard_stats = pd.DataFrame()
    
    log_step("Calculating match_played and minutes_played")
    concated_matches_df = pd.DataFrame()

    for match_id in tqdm(match_ids, desc="Concatenating matches", unit="match"):
        current = get_minutes_played(df.loc[df["match_id"]==match_id,columns])
        concated_matches_df = pd.concat([concated_matches_df, current])
    
    
    
    df_match_minutes_played = concated_matches_df.groupby("player_id").agg(
            match_played=("match_played","sum"),
            minutes_played=("minutes_played","sum"),
            subbed_in=("subbed_in","sum"),
            subbed_out=("subbed_out","sum")
        )

    
    log_step("Retrieve Lineups from API to map team, country, and positions_played")
    
    df_team_country_concated = pd.DataFrame()
    # load and stack matches 
    for match_id in tqdm(match_ids, desc="Retrieving Lineups per game from API", unit="match"):
        lineups = sb.lineups(match_id=match_id) # ["Hertha Berlin"]
        
        for team in lineups.keys():
            player_information = lineups[team].loc[:, ["player_name","player_id","country","positions", "jersey_number"]].copy()
            player_information["team"] = team
            df_team_country_concated = pd.concat([df_team_country_concated, player_information], axis=0)
    
            
    df_team_country = df_team_country_concated.drop_duplicates(subset=['player_id'])
    df_team_country = df_team_country.rename({"player_name" : "player"},axis=1)

    log_step("Process positions")
    positions_played_df = get_positions_played(df_team_country_concated)
    positions_played_df = positions_played_df.reset_index(drop=False).rename({"player_name" : "player"},axis=1)

    log_step("Merge to final dataframe")
    # merge player position to standard stats
    standard_stats = pd.merge(left=df_match_minutes_played, right=positions_played_df, on="player_id", how="right")
    standard_stats = standard_stats.fillna(0) # player with nan did not play
    
    # merge country and team to standard stats
    standard_stats = pd.merge(left=standard_stats, right=df_team_country,on="player_id", how="left")
    
    # reorder columns and keep only relevant
    standard_stats = standard_stats[["player", "player_id","country","team","position","match_played","minutes_played","subbed_in","subbed_out","unique_positions_played","positions_played"]]
    
    return standard_stats, df_team_country_concated
    
import numpy as np
with open("../../config/position_mapping.json","r") as f:
    position_mapping = json.load(f)
result_df, df_team_country_concated = analyze_standard_stats(df.head(10))
result_df

[2025-07-08 21:14:03] Calculating match_played and minutes_played


Concatenating matches:   0%|          | 0/1 [00:00<?, ?match/s]

Concatenating matches: 100%|██████████| 1/1 [00:00<00:00, 129.24match/s]


[2025-07-08 21:14:03] Retrieve Lineups from API to map team, country, and positions_played


Retrieving Lineups per game from API: 100%|██████████| 1/1 [00:00<00:00, 124.98match/s]

[2025-07-08 21:14:03] Process positions
[2025-07-08 21:14:03] Merge to final dataframe





Unnamed: 0,player,player_id,country,team,position,match_played,minutes_played,subbed_in,subbed_out,unique_positions_played,positions_played
0,Leroy Sané,3053.0,Germany,Schalke 04,Midfielder,0.0,0.0,0.0,0.0,[Right Midfield],[Right Midfield]
1,Jean-Eric Maxim Choupo-Moting,3499.0,Cameroon,Schalke 04,Midfielder,0.0,0.0,0.0,0.0,[Left Midfield],[Left Midfield]
2,Joël Andre Job Matip,3502.0,Cameroon,Schalke 04,Defender,0.0,0.0,0.0,0.0,"[Left Center Back, Right Center Back]","[Right Center Back, Left Center Back]"
3,Sead Kolašinac,3510.0,Bosnia and Herzegovina,Schalke 04,Defender,0.0,0.0,0.0,0.0,[Left Back],[Left Back]
4,Pierre-Emile Højbjerg,3570.0,Denmark,Schalke 04,0,0.0,0.0,0.0,0.0,[],[]
5,Younès Belhanda,5242.0,Morocco,Schalke 04,Midfielder,0.0,0.0,0.0,0.0,[Left Center Midfield],[Left Center Midfield]
6,Andrej Kramarić,5460.0,Croatia,Hoffenheim,Forward,1.0,90.0,0.0,0.0,"[Left Center Forward, Center Forward]","[Center Forward, Left Center Forward]"
7,Fabian Lukas Schär,5537.0,Switzerland,Hoffenheim,Defender,1.0,90.0,0.0,0.0,[Right Center Back],[Right Center Back]
8,Sebastian Rudy,6039.0,Germany,Hoffenheim,Midfielder,1.0,90.0,0.0,0.0,"[Left Defensive Midfield, Right Center Midfiel...","[Left Defensive Midfield, Right Center Midfiel..."
9,Johannes Geis,6717.0,Germany,Schalke 04,Midfielder,0.0,0.0,0.0,0.0,[Center Defensive Midfield],[Center Defensive Midfield]


## full_match_equivalents 
"minutes_played / 90" gives idea of how many "full games" a player has contributed to

In [10]:
result_df["full_match_equivalents"] = result_df["minutes_played"] / 90
result_df["full_match_equivalents"]

0     0.0
1     0.0
2     0.0
3     0.0
4     0.0
5     0.0
6     1.0
7     1.0
8     1.0
9     0.0
10    0.0
11    0.0
12    0.0
13    1.0
14    0.0
15    0.0
16    0.0
17    0.0
18    0.0
19    0.0
20    0.0
21    0.0
22    0.0
23    0.0
24    0.0
25    0.0
26    0.0
27    0.0
28    0.0
29    0.0
30    0.0
31    0.0
32    0.0
33    0.0
34    0.0
35    0.0
Name: full_match_equivalents, dtype: float64

## inspect position
clean:
- 0's to "nan" as these players had no playing time
- Defender, Midfielder -> Defender and so

In [11]:
result_df.groupby("position").size()

position
0                        8
Defender                 7
Defender, Midfielder     2
Forward                  2
Forward, Midfielder      3
Goalkeeper               3
Midfielder              11
dtype: int64

In [12]:
result_df["position"] = result_df["position"].replace(0, "nan")
result_df['position'] = result_df['position'].str.replace("Forward, Defender", "Forward", case=False, regex=False)
result_df['position'] = result_df['position'].str.replace("Forward, Midfielder, Defender", "Forward", case=False, regex=False)
result_df['position'] = result_df['position'].str.replace("Midfielder, Defender", "Midfielder", case=False, regex=False)
result_df['position'] = result_df['position'].str.replace("Midfielder, Forward", "Midfielder", case=False, regex=False)
result_df['position'] = result_df['position'].str.replace("Forward, Midfielder", "Forward", case=False, regex=False)
result_df['position'] = result_df['position'].str.replace("Defender, Forward", "Defender", case=False, regex=False)
result_df['position'] = result_df['position'].str.replace("Defender, Midfielder", "Defender", case=False, regex=False)


In [None]:
result_df.groupby("position").size()

position
Defender       9
Forward        5
Goalkeeper     3
Midfielder    11
nan            8
dtype: int64

: 

## __Store data__

In [42]:
result_df.to_csv("../../data/standard_stats.csv",index=False)

Notice:
- Players who did not play contain no positions from statsbomb api

__faster implementation for player information but no country__

In [None]:
import pandas as pd
def get_player_information(df_input):
    import ast
    df_tactics = df_input.loc[df_input["tactics"].notna(), "tactics"]
    all_lineups = []
    for tactics_str in df_tactics.values:
        lineup_dict = ast.literal_eval(tactics_str)["lineup"]
        df_lineup = pd.DataFrame([{
            'player_id': item['player']['id'],
            'player_name': item['player']['name'],
            'position_id': item['position']['id'],
            'position_name': item['position']['name'],
            'jersey_number': item['jersey_number']
        } for item in lineup_dict])
        all_lineups.append(df_lineup)
    df_result = pd.concat(all_lineups, ignore_index=True)
    return df_result

get_player_information(df)


# Validation

## __Minutes played, matches played__

In [None]:
df = dataloader.get_data()
columns = ["player", "match_id","position","minute" ,"substitution_replacement","substitution_outcome"] # "timestamp"
match_a = df.loc[(df["match_id"]==3890324), columns]
match_b = df.loc[(df["match_id"]==3890505), columns]


def get_minutes_played(df):
    df_with_flags = df.copy()

    all_players = set()
    match_duration = df_with_flags["minute"].max()

    for player in df_with_flags.loc[df_with_flags["player"].notna(),"player"].values:
        all_players.add(player)

    for player in df_with_flags.loc[df_with_flags["substitution_replacement"].notna(), "substitution_replacement"].values:
        all_players.add(player)

    # flags 
    df_with_flags["is_substituted"] = df_with_flags["substitution_outcome"].notna()
    df_with_flags["match_duration"] = match_duration
    df_with_flags["minutes_played_subbed_in"] = match_duration - df_with_flags["minute"]

    # filter
    df_player_subbed_out = df_with_flags.loc[df_with_flags["is_substituted"]==True,["player", "minute"]]
    df_player_subbed_in = df_with_flags.loc[df_with_flags["is_substituted"]==True, ["substitution_replacement", "minutes_played_subbed_in"]]
    
    # rename
    df_player_subbed_in = df_player_subbed_in.rename({"substitution_replacement":"player",
                                                      "minutes_played_subbed_in" :"minutes_played"}, axis=1)
    df_player_subbed_out = df_player_subbed_out.rename({"minute":"minutes_played"}, axis=1)

    # add subbed_in subbed_of mask
    df_player_subbed_in["subbed_in"] = 1
    df_player_subbed_out["subbed_out"] = 1

    df_subbed_player = pd.concat([df_player_subbed_in, df_player_subbed_out],axis=0)

    # add remaining players that were not subbed off
    players_not_subbed_off = [player for player in all_players if player not in df_subbed_player["player"].values]
    df_player_not_subbed_off = pd.DataFrame({"player": players_not_subbed_off, "minutes_played" : 90})

    df_result = pd.concat([df_subbed_player, df_player_not_subbed_off],axis=0).reset_index(drop=True)
    df_result["match_played"] = 1

    return df_result 

get_minutes_played(match_a)

Unnamed: 0,player,minutes_played,subbed_in,subbed_out,match_played
0,Jens Hegeler,62,1.0,,1
1,Gotoku Sakai,45,1.0,,1
2,Ivo Iličević,29,1.0,,1
3,Alexander Baumjohann,21,1.0,,1
4,Ivica Olić,16,1.0,,1
5,Valentin Stocker,9,1.0,,1
6,Niklas Stark,28,,1.0,1
7,Matthias Ostrzolek,45,,1.0,1
8,Marcelo Alfonso Díaz Rojas,61,,1.0,1
9,Salomon Armand Magloire Kalou,69,,1.0,1


In [None]:
matche_ids = df["match_id"].unique()
first_match = True
columns = ["player", "match_id", "minute", "substitution_replacement", "substitution_outcome"]
result = pd.DataFrame()

for match_id in matche_ids:
    current = get_minutes_played(df.loc[df["match_id"]==match_id,columns])
    result = pd.concat([result, current])

df_match_minutes_played = result.groupby("player").agg(
    match_played=("match_played","sum"),
    minutes_played=("minutes_played","sum"),
    subbed_in=("subbed_in","sum"),
    subbed_out=("subbed_out","sum")
)
df_match_minutes_played

# df_match_minutes_played = pd.merge(left=pd.DataFrame(result.groupby("player")["match_played"].sum()), right=pd.DataFrame(result.groupby("player")["minutes_played"].sum()), on="player")
# df_match_minutes_played

Unnamed: 0_level_0,match_played,minutes_played,subbed_in,subbed_out
player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Aaron Hunt,23,1742,3.0,8.0
Adam Hloušek,7,425,3.0,0.0
Adelino André Vieira Freitas,26,1908,4.0,8.0
Admir Mehmedi,28,1373,14.0,7.0
Adnan Januzaj,6,176,6.0,0.0
...,...,...,...,...
Özkan Yıldırım,1,18,1.0,0.0
Ørjan Håskjold Nyland,6,540,0.0,0.0
İlkay Gündoğan,25,1997,3.0,7.0
Łukasz Piszczek,20,1533,4.0,1.0


### __Validate minutes per game__

In [None]:
columns = ["player", "team","match_id","minute", "timestamp","substitution_replacement","substitution_outcome"]
match_a = df.loc[(df["match_id"]==3890324), columns]

def get_minutes_played_validate(df):
    df_with_flags = df.copy()
    df_with_flags = df_with_flags[df_with_flags["player"].notna()]
    df_with_flags = df_with_flags.sort_values("substitution_replacement", ascending=True)
    match_duration = df_with_flags["minute"].max()
    df_with_flags = df_with_flags.drop_duplicates(subset=['player'], keep='first')

    # flags 
    df_with_flags["is_substituted"] = df_with_flags["substitution_outcome"].notna()
    df_with_flags["match_duration"] = match_duration

    # assign all players match_duration
    df_with_flags["minutes_played"] = df_with_flags["match_duration"]
    # overwrite mp for player that has been subbed of
    df_with_flags.loc[df_with_flags["is_substituted"]==True, "minutes_played"] = df_with_flags[df_with_flags["is_substituted"]==True]["minute"]
    
    # calculate for subbed in players
    subbed_in_players = df_with_flags[df["substitution_replacement"].notna()]["substitution_replacement"].unique()
    df_with_flags["minutes_played_subbed_in"] = df_with_flags[df_with_flags["is_substituted"]==True]["match_duration"] - df_with_flags[df_with_flags["is_substituted"]==True]["minute"]
    
    return df_with_flags


get_minutes_played_validate(match_a)


KeyError: "['team', 'timestamp'] not in index"

## __Team and Country__
Output includes also players that sat on the bench the whole season

In [None]:
from statsbombpy import sb

df_team_country = pd.DataFrame()

# load and stack matches 
for match_id in matche_ids:
    lineups = sb.lineups(match_id=match_id) # ["Hertha Berlin"]
    
    for team in lineups.keys():
        player_information = lineups[team].loc[:, ["player_name","country","positions"]]
        player_information["team"] = team
        df_team_country = pd.concat([df_team_country, player_information], axis=0)
        
# df_team_country = df_team_country.drop_duplicates(subset=['player_name']) 
df_team_country



Unnamed: 0,player_name,country,positions,team
0,Leroy Sané,Germany,"[{'position_id': 12, 'position': 'Right Midfie...",Schalke 04
1,Jean-Eric Maxim Choupo-Moting,Cameroon,"[{'position_id': 16, 'position': 'Left Midfiel...",Schalke 04
2,Joël Andre Job Matip,Cameroon,"[{'position_id': 3, 'position': 'Right Center ...",Schalke 04
3,Sead Kolašinac,Bosnia and Herzegovina,"[{'position_id': 6, 'position': 'Left Back', '...",Schalke 04
4,Pierre-Emile Højbjerg,Denmark,[],Schalke 04
...,...,...,...,...
13,Ivica Olić,Croatia,"[{'position_id': 13, 'position': 'Right Center...",Hamburger SV
14,Ivo Iličević,Croatia,"[{'position_id': 16, 'position': 'Left Midfiel...",Hamburger SV
15,Emir Spahić,Bosnia and Herzegovina,"[{'position_id': 5, 'position': 'Left Center B...",Hamburger SV
16,Marcelo Alfonso Díaz Rojas,Chile,"[{'position_id': 16, 'position': 'Left Midfiel...",Hamburger SV


## __Positions played__

In [None]:
from collections import Counter

def get_most_frequent_pos(input_list):
    input_list = extract_positions(input_list, unique=False)
    try:
        counter = Counter(input_list)
        max_count = max(counter.values())
        most_frequent_pos = [item for item, count in counter.items() if count == max_count]
    except:
        most_frequent_pos = None
    return most_frequent_pos

def extract_positions(positions,unique=False):
    # Flatten the array of lists into a single list
    flattened_positions = np.concatenate(positions.values).tolist()
    # Check if the entire flattened list is empty
    if all(len(pos_list) == 0 for pos_list in flattened_positions):
        return []
    # Extract positions from the dictionaries

    if unique:
        positions = [pos["position"] for pos in flattened_positions if "position" in pos]
        return list(set(positions))
        
    else:
        return [pos["position"] for pos in flattened_positions if "position" in pos]

def get_positions_played(df):
    positions_played_df = df.groupby("player_name").agg(
        positions_played=("positions", lambda x: extract_positions(x, unique=True)),
        position=("positions", lambda x: get_most_frequent_pos(x)),
        raw=("positions", lambda x: x)
    )
    return positions_played_df

# 3890548 - kimmich played
# test = sb.lineups(match_id=3890548)["Bayern Munich"]
# test = get_positions_played(test)
# test
positions_played_df = get_positions_played(df_team_country)
positions_played_df = positions_played_df.reset_index(drop=False).rename({"player_name" : "player"},axis=1)
positions_played_df

Unnamed: 0,player,positions_played,position,raw
0,Aaron Hunt,"[Right Wing, Left Center Midfield, Right Cente...",[Center Attacking Midfield],"[[{'position_id': 22, 'position': 'Right Cente..."
1,Adam Hloušek,"[Left Wing, Left Center Forward, Left Center B...",[Left Center Back],"[[], [{'position_id': 5, 'position': 'Left Cen..."
2,Adelino André Vieira Freitas,"[Right Wing, Left Back, Right Back, Right Cent...","[Right Back, Right Wing]","[[{'position_id': 12, 'position': 'Right Midfi..."
3,Admir Mehmedi,"[Right Wing, Right Center Forward, Center Atta...",[Left Midfield],"[[{'position_id': 22, 'position': 'Right Cente..."
4,Adnan Januzaj,"[Right Wing, Left Wing, Right Midfield, Center...",[Right Wing],"[[], [], [{'position_id': 17, 'position': 'Rig..."
...,...,...,...,...
534,Özkan Yıldırım,[Right Wing],[Right Wing],"[[{'position_id': 17, 'position': 'Right Wing'..."
535,Ørjan Håskjold Nyland,[Goalkeeper],[Goalkeeper],"[[{'position_id': 1, 'position': 'Goalkeeper',..."
536,İlkay Gündoğan,"[Right Wing, Left Center Midfield, Center Atta...",[Right Center Midfield],"[[{'position_id': 13, 'position': 'Right Cente..."
537,Łukasz Piszczek,"[Left Back, Right Back, Center Forward, Right ...",[Right Back],"[[{'position_id': 2, 'position': 'Right Back',..."


In [None]:
player_stats = pd.merge(left=df_match_minutes_played, right=positions_played_df, on="player", how="right")
player_stats

Unnamed: 0,player,match_played,minutes_played,subbed_in,subbed_out,positions_played,position,raw
0,Aaron Hunt,23.0,1742.0,3.0,8.0,"[Right Wing, Left Center Midfield, Right Cente...",[Center Attacking Midfield],"[[{'position_id': 22, 'position': 'Right Cente..."
1,Adam Hloušek,7.0,425.0,3.0,0.0,"[Left Wing, Left Center Forward, Left Center B...",[Left Center Back],"[[], [{'position_id': 5, 'position': 'Left Cen..."
2,Adelino André Vieira Freitas,26.0,1908.0,4.0,8.0,"[Right Wing, Left Back, Right Back, Right Cent...","[Right Back, Right Wing]","[[{'position_id': 12, 'position': 'Right Midfi..."
3,Admir Mehmedi,28.0,1373.0,14.0,7.0,"[Right Wing, Right Center Forward, Center Atta...",[Left Midfield],"[[{'position_id': 22, 'position': 'Right Cente..."
4,Adnan Januzaj,6.0,176.0,6.0,0.0,"[Right Wing, Left Wing, Right Midfield, Center...",[Right Wing],"[[], [], [{'position_id': 17, 'position': 'Rig..."
...,...,...,...,...,...,...,...,...
534,Özkan Yıldırım,1.0,18.0,1.0,0.0,[Right Wing],[Right Wing],"[[{'position_id': 17, 'position': 'Right Wing'..."
535,Ørjan Håskjold Nyland,6.0,540.0,0.0,0.0,[Goalkeeper],[Goalkeeper],"[[{'position_id': 1, 'position': 'Goalkeeper',..."
536,İlkay Gündoğan,25.0,1997.0,3.0,7.0,"[Right Wing, Left Center Midfield, Center Atta...",[Right Center Midfield],"[[{'position_id': 13, 'position': 'Right Cente..."
537,Łukasz Piszczek,20.0,1533.0,4.0,1.0,"[Left Back, Right Back, Center Forward, Right ...",[Right Back],"[[{'position_id': 2, 'position': 'Right Back',..."


In [None]:
df_team_country

Unnamed: 0_level_0,positions_played,position,raw
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aaron Hunt,"[Right Wing, Left Center Midfield, Right Cente...",[Center Attacking Midfield],"[[{'position_id': 22, 'position': 'Right Cente..."
Adam Hloušek,"[Left Wing, Left Center Forward, Left Center B...",[Left Center Back],"[[], [{'position_id': 5, 'position': 'Left Cen..."
Adelino André Vieira Freitas,"[Right Wing, Left Back, Right Back, Right Cent...","[Right Back, Right Wing]","[[{'position_id': 12, 'position': 'Right Midfi..."
Admir Mehmedi,"[Right Wing, Right Center Forward, Center Atta...",[Left Midfield],"[[{'position_id': 22, 'position': 'Right Cente..."
Adnan Januzaj,"[Right Wing, Left Wing, Right Midfield, Center...",[Right Wing],"[[], [], [{'position_id': 17, 'position': 'Rig..."
...,...,...,...
Özkan Yıldırım,[Right Wing],[Right Wing],"[[{'position_id': 17, 'position': 'Right Wing'..."
Ørjan Håskjold Nyland,[Goalkeeper],[Goalkeeper],"[[{'position_id': 1, 'position': 'Goalkeeper',..."
İlkay Gündoğan,"[Right Wing, Left Center Midfield, Center Atta...",[Right Center Midfield],"[[{'position_id': 13, 'position': 'Right Cente..."
Łukasz Piszczek,"[Left Back, Right Back, Center Forward, Right ...",[Right Back],"[[{'position_id': 2, 'position': 'Right Back',..."


## 2. Merge

In [None]:
player_stats = pd.merge(left=player_stats, right=positions_played_df ,left_on="player_name",right_on="player_name", how="right")
player_stats

Unnamed: 0,match_played,minutes_played,player_name,country,positions,team,positions_played,position
0,23.0,1742.0,Aaron Hunt,Germany,"[{'position_id': 22, 'position': 'Right Center...",Hamburger SV,"[Left Wing, Right Center Forward]","[Right Center Forward, Left Wing]"
1,7.0,425.0,Adam Hloušek,Czech Republic,[],VfB Stuttgart,[],
2,26.0,1908.0,Adelino André Vieira Freitas,Portugal,"[{'position_id': 12, 'position': 'Right Midfie...",Wolfsburg,"[Right Midfield, Right Back]",[Right Midfield]
3,28.0,1373.0,Admir Mehmedi,Switzerland,"[{'position_id': 22, 'position': 'Right Center...",Bayer Leverkusen,[Right Center Forward],[Right Center Forward]
4,6.0,176.0,Adnan Januzaj,Belgium,[],Borussia Dortmund,[],
...,...,...,...,...,...,...,...,...
534,1.0,18.0,Özkan Yıldırım,Germany,"[{'position_id': 17, 'position': 'Right Wing',...",Werder Bremen,[Right Wing],[Right Wing]
535,6.0,540.0,Ørjan Håskjold Nyland,Norway,"[{'position_id': 1, 'position': 'Goalkeeper', ...",Ingolstadt,[Goalkeeper],[Goalkeeper]
536,25.0,1997.0,İlkay Gündoğan,Germany,"[{'position_id': 13, 'position': 'Right Center...",Borussia Dortmund,[Right Center Midfield],[Right Center Midfield]
537,20.0,1533.0,Łukasz Piszczek,Poland,"[{'position_id': 2, 'position': 'Right Back', ...",Borussia Dortmund,[Right Back],[Right Back]


In [None]:
player_stats = player_stats[["player_name", "country","team","positions_played","match_played","minutes_played"]]
player_stats

Unnamed: 0,player_name,country,team,positions_played,match_played,minutes_played
0,Aaron Hunt,Germany,Hamburger SV,"[Left Wing, Right Center Forward]",23.0,1742.0
1,Adam Hloušek,Czech Republic,VfB Stuttgart,[],7.0,425.0
2,Adelino André Vieira Freitas,Portugal,Wolfsburg,"[Right Midfield, Right Back]",26.0,1908.0
3,Admir Mehmedi,Switzerland,Bayer Leverkusen,[Right Center Forward],28.0,1373.0
4,Adnan Januzaj,Belgium,Borussia Dortmund,[],6.0,176.0
...,...,...,...,...,...,...
534,Özkan Yıldırım,Germany,Werder Bremen,[Right Wing],1.0,18.0
535,Ørjan Håskjold Nyland,Norway,Ingolstadt,[Goalkeeper],6.0,540.0
536,İlkay Gündoğan,Germany,Borussia Dortmund,[Right Center Midfield],25.0,1997.0
537,Łukasz Piszczek,Poland,Borussia Dortmund,[Right Back],20.0,1533.0


## __Store data__

In [None]:
player_stats.to_csv("../data/standard_stats.csv",index=False)