In [24]:
# libraries
import pandas as pd
import numpy as np
import time
import re

***Collecting every player from every club in the top 5 leagues produces duplicates, because players who transfer during the winter window are listed under more than one club. This cell drops those duplicates so the DataFrame can be processed further.***

In [13]:
# Source file; it is overwritten in place with the de-duplicated rows.
file_path = "../merged_t5_players.csv"

# Rows count as duplicates when 'Player', 'Nation' and 'Pos' all match;
# drop_duplicates keeps the first occurrence by default.
dedup_keys = ['Player', 'Nation', 'Pos']
df = pd.read_csv(file_path).drop_duplicates(subset=dedup_keys)

# Write the result back to the same CSV file.
df.to_csv(file_path, index=False)
print(f"Updated CSV file: {file_path}")

Updated CSV file: ../merged_t5_players.csv


In [27]:
# Base player list.
origin_path = "../update_t5_players.csv"

# Per-category stat tables.
passing_path = "../t5_leagues_players_passing.csv"
gca_path = "../t5_leagues_players_gca.csv"
shooting_path = "../t5_leagues_players_shooting.csv"
defense_path = "../t5_leagues_players_defense.csv"
possession_path = "../t5_leagues_players_possession.csv"
misc_path = "../t5_leagues_players_misc.csv"

**Insertion of a useful stat: tackles won divided by tackles attempted (`TklW / Tkl`)**

In [17]:
# Add a new column holding TklW divided by Tkl and write the table back to
# the same file. The column is appended at the end (or overwritten if it
# already exists). The label 'TlkW/Tkl' is kept exactly as spelled because
# the later rename dictionary looks it up under this key.
df = pd.read_csv(defense_path)
df = df.assign(**{'TlkW/Tkl': df['TklW'].div(df['Tkl'])})
df.to_csv(defense_path, index=False)

***Merging all the different stat tables from FBRef into one large dataset. The tables are joined on the combination of Player, Nation, Age and Club, which serves as the key and is meant to guarantee non-redundant data.***

In [31]:
import pandas as pd

# The CSV paths are defined in an earlier cell.

# Load the base player list.
origin_players_df = pd.read_csv(origin_path)

# Load the per-category stat tables.
passing_df = pd.read_csv(passing_path)
gca_df = pd.read_csv(gca_path)
shooting_df = pd.read_csv(shooting_path)
defense_df = pd.read_csv(defense_path)
possession_df = pd.read_csv(possession_path)
misc_df = pd.read_csv(misc_path)

# Exclude players with fewer than 250 minutes.
min_minutes = 250
played_enough = origin_players_df['Min'].ge(min_minutes)
origin_players_df = origin_players_df.loc[played_enough]

# Helper that cleans a value of the position column
def clean_pos(pos):
    """Return the first entry of a comma-separated position string.

    Args:
        pos: A position value such as ``'DF,MF'``. Non-string values
            (for example NaN from an empty CSV field) are accepted.

    Returns:
        The text before the first comma for strings; any non-string value
        is returned unchanged instead of raising ``AttributeError``.
    """
    # Missing values read from CSV are floats (NaN) and have no .split().
    if not isinstance(pos, str):
        return pos
    return pos.split(',')[0]

# Apply the position clean-up. `origin_players_df` is the result of a boolean
# filter, so assigning a column on it directly can raise
# SettingWithCopyWarning; `assign` builds a new frame instead.
origin_players_df = origin_players_df.assign(
    Pos=origin_players_df['Pos'].apply(clean_pos)
)

# Helper that prefixes (and, if needed, numbers) colliding column names
def add_prefix_and_number(df, prefix, exclude_cols, existing_cols=None):
    """Rename the columns of ``df`` so they do not clash with an existing table.

    Columns listed in ``exclude_cols`` keep their name. Any other column is
    renamed as follows:

    * if its name was already emitted for an earlier column of ``df``, it
      becomes ``f"{prefix}_{col}_{n}"`` with ``n`` starting at 2;
    * otherwise, if its name is in ``existing_cols``, it becomes
      ``f"{prefix}_{col}"``;
    * otherwise it is left unchanged.

    If the chosen name is itself already taken within ``df``, the number is
    increased until the name is unique, so the result never contains the same
    generated name twice.

    Args:
        df: DataFrame whose columns are renamed. It is modified in place
            (``df.columns`` is reassigned).
        prefix: Text placed in front of colliding column names.
        exclude_cols: Column names that must never be renamed.
        existing_cols: Column names already present in the target table.
            When omitted, the columns of the notebook-level ``combined_df``
            are used, which matches the previous implicit behaviour.

    Returns:
        The same ``df`` object with its columns renamed.
    """
    if existing_cols is None:
        # Legacy fallback: read the collision set from the notebook global.
        existing_cols = combined_df.columns
    taken = set(existing_cols)
    protected = set(exclude_cols)

    col_count = {}
    new_cols = []
    emitted = set()
    for col in df.columns:
        if col in protected:
            new_col = col
        else:
            if col in emitted:
                col_count[col] = col_count.get(col, 1) + 1
                new_col = f"{prefix}_{col}_{col_count[col]}"
            elif col in taken:
                new_col = f"{prefix}_{col}"
            else:
                new_col = col
            # Guard against producing a name that was already handed out,
            # e.g. a repeated label that also collides with the target table.
            while new_col in emitted:
                col_count[col] = col_count.get(col, 1) + 1
                new_col = f"{prefix}_{col}_{col_count[col]}"
        new_cols.append(new_col)
        emitted.add(new_col)
    df.columns = new_cols
    return df

# Start the combined table from selected columns of the base player list.
base_columns = ['Player', 'Nation', 'Age', 'Club', '90s', 'Min','Pos']
combined_df = origin_players_df[base_columns]

# Columns every table is joined on; they are never renamed.
merge_keys = ['Player', 'Nation', 'Age', 'Club']

# Stat tables to merge, each paired with the prefix used for colliding columns.
dataframes = [
    (passing_df, 'Pass'),
    (gca_df, 'GCA'),
    (shooting_df, 'Shoot'),
    (defense_df, 'Def'),
    (possession_df, 'Poss'),
    (misc_df, 'Misc')
]

# Left-join each table onto the combined table, renaming collisions first.
for stats_df, prefix in dataframes:
    renamed_df = add_prefix_and_number(stats_df, prefix, merge_keys)
    combined_df = combined_df.merge(renamed_df, on=merge_keys, how='left')

# Keep only the first column for any repeated column label.
first_occurrence = ~combined_df.columns.duplicated()
combined_df = combined_df.loc[:, first_occurrence]

# Drop every column whose label contains 'Matches'.
is_matches_col = combined_df.columns.str.contains('Matches')
combined_df = combined_df.loc[:, ~is_matches_col]

# Save the combined data to a new CSV file.
combined_df.to_csv('entire_players_list.csv', index=False)

print("Die CSV-Dateien wurden erfolgreich kombiniert und gespeichert.")

Die CSV-Dateien wurden erfolgreich kombiniert und gespeichert.


## Deletion of unnecessary / redundant columns

In [35]:
import pandas as pd

# Path of the combined player table.
file_path = 'entire_players_list.csv'

# Load the CSV file.
df = pd.read_csv(file_path)

# Columns starting with one of these prefixes are dropped.
dropped_prefixes = ('GCA_', 'Shoot_', 'Pass_', 'Misc_', 'Def_')
# Columns ending with one of these suffixes are dropped, unless they start
# with 'Poss' (so 'Poss_90s' / 'Poss_Pos' survive this step).
redundant_suffixes = ('_90s', '_Pos')
# The original '90s' and 'Pos' columns are always kept.
always_kept = ('90s', 'Pos')

filtered_columns = []
for col in df.columns:
    if col in always_kept:
        filtered_columns.append(col)
        continue
    if col.startswith(dropped_prefixes):
        continue
    if col.endswith(redundant_suffixes) and not col.startswith('Poss'):
        continue
    filtered_columns.append(col)

# Build the DataFrame from the surviving columns.
df_filtered = df[filtered_columns]

# Save the cleaned table (overwrites the input file).
df_filtered.to_csv('entire_players_list.csv', index=False)

print("Die CSV-Datei wurde erfolgreich bereinigt und gespeichert.")

Die CSV-Datei wurde erfolgreich bereinigt und gespeichert.


***The abbreviated stat names were confusing, so I renamed all of them to descriptive names. I plan to upload the dataset to Kaggle to help others save some time.***

In [36]:
import pandas as pd

# Path of the combined player table (read here and overwritten at the end).
file_path = 'entire_players_list.csv'

# Load the CSV file.
df = pd.read_csv(file_path)

# Mapping from the abbreviated column names to descriptive names.
# Keys that are not present in the DataFrame are ignored by `rename`.
rename_columns = {
    'Player': 'Player',
    'Nation': 'Nation',
    'Age': 'Age',
    'Club': 'Club',
    '90s': '90 minutes played',
    'Min': 'Minutes played',
    'Pos': 'Position',
    'Cmp': 'Passes completed',
    'Att': 'Passes attempted',
    'Cmp%': 'Completed passes %',
    'TotDist': 'Total pass distance',
    'PrgDist': 'Progressive pass distance',
    'Cmp.1': 'Short passes completed',
    'Att.1': 'Short passes attempted',
    'Cmp%.1': 'Short passes completed %',
    'Cmp.2': 'Medium passes completed',
    'Att.2': 'Medium passes attempted',
    'Cmp%.2': 'Medium passes completed %',
    'Cmp.3': 'Long passes completed',
    'Att.3': 'Long passes attempted',
    'Cmp%.3': 'Long passes completed %',
    'Ast': 'Assists',
    'xAG': 'Expected Assisted Goals',
    'xA': 'Expected Assists',
    'A-xAG': 'A minus xAG',
    'KP': 'Key Passes',
    '1/3': 'Passes into final third',
    'PPA': 'Passes into penalty area',
    'CrsPA': 'Crosses into penalty area',
    'PrgP': 'Progressive passes',
    'SCA': 'Shot creating actions',
    'SCA90': 'Shot creating actions per 90',
    'PassLive': 'Shot creating actions by live-ball passes',
    'PassDead': 'Shot creating actions by dead-ball passes',
    'TO': 'Successful Take-Ons that led to a shot',
    'Sh': 'Shots that led to another shot attempt',
    'Fld': 'Fouls drawn that led to a shot attempt',
    'Def': 'Defensive actions that led to a shot attempt',
    'GCA': 'Goal creating actions',
    'GCA90': 'Goal creating actions per 90',
    'PassLive.1': 'Goal creating actions by live-ball passes',
    'PassDead.1': 'Goal creating actions by dead-ball passes',
    'TO.1': 'Successful Take-Ons that led to a goal',
    'Sh.1': 'Shots that led to another goal-scoring shot',
    'Fld.1': 'Fouls drawn that led to a goal',
    'Def.1': 'Defensive actions that led to a goal',
    'Gls': 'Goals',
    'SoT': 'Shots on target',
    'SoT%': 'Shots on target %',
    'Sh/90': 'Shots per 90',
    'SoT/90': 'Shots on target per 90',
    'G/Sh': 'Goals per shot',
    'G/SoT': 'Goals per shot on target',
    'Dist': 'Average shooting distance',
    'FK': 'Shots from free kicks',
    'PK': 'Penalty kicks made',
    'PKatt': 'Penalty kicks attempted',
    'xG': 'Expected Goals',
    'npxG': 'Non-penalty expected goals',
    'npxG/Sh': 'Non-penalty expected goals/Shot',
    'G-xG': 'Goals minus expected Goals',
    'np:G-xG': 'Non-penalty goals minus non-penalty expected Goals',
    'Tkl': 'Tackles',
    'TklW': 'Tackles won',
    'Def 3rd': 'Tackles in defensive 1/3',
    'Mid 3rd': 'Tackles in middle 1/3',
    'Att 3rd': 'Tackles in attacking 1/3',
    'Tkl.1': 'Dribblers tackled',
    'Tkl%': '% of dribblers tackled',
    'Lost': 'Challenges lost',
    'Blocks': 'Balls blocked',
    'Pass': 'Passes blocked',
    'Int': 'Interceptions',
    'Tkl+Int': 'Number of tackles and interceptions',
    'Clr': 'Clearances',
    'Err': 'Errors',
    'TlkW/Tkl': 'Tackles won %',
    'Touches': 'Touches',
    'Def Pen': 'Touches in defensive penalty area',
    'Poss_Def 3rd': 'Touches in defensive 1/3',
    'Poss_Mid 3rd': 'Touches in middle 1/3',
    'Poss_Att 3rd': 'Touches in attacking 1/3',
    'Att Pen': 'Touches in attacking penalty area',
    'Live': 'Live-ball touches',
    'Poss_Att': 'Take ons attempted',
    'Succ': 'Successful take ons',
    'Succ%': 'Successful take ons %',
    'Tkld': 'Times tackled during take on',
    'Tkld%': 'Times tackled during take on %',
    'Carries': 'Times ball carried with feet',
    'Poss_TotDist': 'Total moved ball distance',
    'Poss_PrgDist': 'Progressive moved ball distance',
    'PrgC': 'Progressive Carries',
    'Poss_1/3': 'Carries into final 1/3',
    'CPA': 'Carries into penalty area',
    'Mis': 'Miscontrols',
    'Dis': 'Dispossessed',
    'Rec': 'Passes received',
    'PrgR': 'Progressive Passes Received',
    'CrdY': 'Yellow cards',
    'CrdR': 'Red cards',
    '2CrdY': 'Second yellow card',
    'Fls': 'Fouls committed',
    'Off': 'Offsides',
    'Crs': 'Crosses',
    'PKwon': 'Penalty kicks won',
    'PKcon': 'Penalty kicks conceded',
    'OG': 'Own goals',
    'Recov': 'Recoveries',
    'Won': 'Aerial duels won',
    'Won%': 'Aerial duels won %'
}

# Remove the 'Poss_Pos' and 'Poss_90s' columns. This cell overwrites the file
# it reads, so on a second run those columns are already gone;
# errors='ignore' keeps the re-run from failing with a KeyError.
df = df.drop(columns=['Poss_Pos', 'Poss_90s'], errors='ignore')

# Rename the columns.
df = df.rename(columns=rename_columns)

# Save the cleaned CSV file.
df.to_csv('entire_players_list.csv', index=False)

print("Die CSV-Datei wurde erfolgreich bereinigt und gespeichert.")


Die CSV-Datei wurde erfolgreich bereinigt und gespeichert.


***For the in-depth transfer analysis I took the dataset with all percentiles saved and combined it with the transfers CSV. The result is a CSV containing each player who was bought in the last 6 years by a big 6 PL club for a fee greater than 0. The total score is also included in the CSV; it is based on the player's 18/19 season.***

In [50]:
import pandas as pd

# Read the input CSV files.
# Transfers: all player arrivals with the corresponding fees (this file no longer exists).
transfers_df = pd.read_csv('transfer_data_big6_2003_2023.csv')
# Performance: player performance data.
performance_df = pd.read_csv('../entire_players_list_with_percentiles.csv')

# Keep only arrivals that came with a fee greater than 0.
is_arrival = transfers_df['Transfer'] == 'Arrival'
has_fee = transfers_df['Fee'] > 0
arrivals_df = transfers_df[is_arrival & has_fee]

# Keep players that appear both in the arrivals and in the performance data,
# matched on player name and club.
join_keys = ['Player', 'Club']
merged_df = arrivals_df.merge(performance_df, on=join_keys, how='inner')

# Select the relevant columns: age, transfer fee and total score per player.
output_columns = ['Player', 'Club', 'Age', 'Fee', 'Total Score']
final_df = merged_df[output_columns]

# Save the final data to a new CSV file.
final_df.to_csv('../filtered_arrivals_with_additional_data.csv', index=False)

# Print the final data for inspection.
print(final_df.to_string(index=False))

                   Player              Club  Age    Fee  Total Score
         Jordan Henderson         Liverpool   28  18.00   911.520292
         Daniel Sturridge         Liverpool   28  15.00   754.803668
             Adam Lallana         Liverpool   30  31.00   717.903456
             Dejan Lovren         Liverpool   29  25.30   835.611979
             Divock Origi         Liverpool   23  12.63   848.991478
          Roberto Firmino         Liverpool   26  41.00   822.808678
                Joe Gomez         Liverpool   21   4.90   886.848958
               Sadio Mané         Liverpool   26  41.20   891.597545
      Georginio Wijnaldum         Liverpool   27  27.50   732.207061
          Virgil van Dijk         Liverpool   27  84.65   898.632813
            Mohamed Salah         Liverpool   26  42.00   888.645385
         Andrew Robertson         Liverpool   24   9.00   731.770833
               Naby Keïta         Liverpool   23  60.00   910.274142
                  Fabinho         