In [59]:
import pandas as pd
from scipy import stats

# Load the raw player data. The path is relative, so the CSV is expected one
# directory above the notebook's working directory.
file_path = '../op_players_list.csv'
df = pd.read_csv(file_path)

# Stat columns that are scored for each position group.
# Each list holds column names of the players DataFrame; the order determines
# the order in which the derived columns are created later on.

# Scored for forwards ('FW')
attacker_stats = [
    'Goals',
    'Assists',
    'Expected Goals',
    'Expected Assisted Goals',
    'Progressive Passes Received',
    'Shot creating actions',
    'Crosses into penalty area',
    'Successful take ons',
    'Goals per shot on target',
    'Shots on target %',
    'Aerial duels won %',
    'Carries into final 1/3',
]

# Scored for midfielders ('MF')
midfielder_stats = [
    'Passes completed',
    'Progressive passes',
    'Passes into final third',
    'Shot creating actions',
    'Interceptions',
    'Balls blocked',
    'Expected Assisted Goals',
    'Carries into final 1/3',
    'Goals',
    'Touches',
    'Long passes completed %',
    'Clearances',
]

# Scored for defenders ('DF')
defender_stats = [
    'Tackles won %',
    'Clearances',
    'Interceptions',
    'Progressive moved ball distance',
    'Progressive passes',
    'Aerial duels won %',
    'Balls blocked',
    'Recoveries',
    'Passes attempted',
    'Errors',
    'Fouls committed',
    'Progressive pass distance',
    'Yellow cards',
]

# Stats that are already ratios/percentages and therefore must not be converted
# to per 90 minutes. 'Tackles won %' is used by the defender stats and is a
# percentage like the others, so it belongs here too.
non_per_90_stats = [
    'Shots on target %',
    'Goals per shot on target',
    'Aerial duels won %',
    'Long passes completed %',
    'Tackles won %',
]


# Function to normalize stats to per 90 minutes
def normalize_per_90(df, stats_list, minutes_col='Minutes played'):
    """Return a copy of ``df`` with the given stats rescaled to per-90-minute values.

    Parameters
    ----------
    df : pandas.DataFrame
        Player rows; must contain ``minutes_col`` and every column named in
        ``stats_list``.
    stats_list : iterable of str
        Column names to normalize. Names listed in the module-level
        ``non_per_90_stats`` are left untouched.
    minutes_col : str, default 'Minutes played'
        Column holding the minutes each player was on the pitch.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame; the input is not modified. Rows whose minutes are
        zero, negative or missing get NaN for the normalized stats.
    """
    df_normalized = df.copy()

    # Dividing by 0 minutes would yield inf (or NaN for 0/0); an inf value
    # would later rank as the best in the group. Mask such rows to NaN so
    # they are simply left out of the ranking.
    minutes = df[minutes_col].where(df[minutes_col] > 0)

    for stat in stats_list:
        if stat not in non_per_90_stats:
            df_normalized[stat] = (df[stat] / minutes) * 90
    return df_normalized

# Calculate percentiles for a given DataFrame and list of stats
def calculate_percentiles(df, stats_list,
                          inverse_stats=('Fouls committed', 'Errors', 'Yellow cards')):
    """Add a ``<stat>_percentile`` column (0-100 scale) for every stat in ``stats_list``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to rank against each other; must contain every column named in
        ``stats_list``.
    stats_list : iterable of str
        Columns to convert to percentile ranks.
    inverse_stats : collection of str, optional
        Stats where a lower raw value is better. They are ranked in
        descending order so the lowest value receives the highest percentile.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with one extra ``<stat>_percentile`` column per stat.
        Missing values stay missing (default behaviour of ``Series.rank``).
    """
    percentiles = df.copy()
    for stat in stats_list:
        # Rank inverse stats in descending order instead of computing
        # ``1 - rank``: that way both kinds of stat cover the same
        # (100/n .. 100) range and the best player always reaches 100.
        higher_is_better = stat not in inverse_stats
        percentiles[stat + '_percentile'] = (
            df[stat].rank(pct=True, ascending=higher_is_better) * 100
        )
    return percentiles

# Process each position separately: every position group is normalized and
# ranked only against players of the same position, using its own stat list.
position_stats = {
    'FW': attacker_stats,
    'MF': midfielder_stats,
    'DF': defender_stats
}
# Derived from the mapping so the two can never get out of sync
positions = list(position_stats)

# Collect one frame per position and concatenate once at the end. Growing a
# DataFrame with pd.concat inside the loop copies the data repeatedly, and
# concatenating onto an empty DataFrame is deprecated in recent pandas.
position_frames = []
for position in positions:
    # Filter players by position (exact match on the 'Position' column)
    position_df = df[df['Position'] == position]

    # Normalize stats to per 90 minutes
    normalized_df = normalize_per_90(position_df, position_stats[position])

    # Calculate percentiles for the relevant stats
    percentiles_df = calculate_percentiles(normalized_df, position_stats[position])

    position_frames.append(percentiles_df)

results = pd.concat(position_frames, ignore_index=True)

# Calculate the Total Score as the sum of all percentiles. Percentile columns
# that belong to another position are NaN for a given player and are skipped
# by sum().
# NOTE(review): defender_stats has 13 entries while the attacker and midfielder
# lists have 12, so sums are not on the same scale across positions; consider
# a mean if scores are compared between positions.
percentile_columns = [col for col in results.columns if col.endswith('_percentile')]
results['Total Score'] = results[percentile_columns].sum(axis=1)

# Save the results to a new CSV file
output_file_path = '../op_players_list_with_percentiles.csv'
results.to_csv(output_file_path, index=False)

print(f"Percentiles calculated and saved to {output_file_path}")


Percentiles calculated and saved to ../op_players_list_with_percentiles.csv


In [50]:
# Reload the saved results from the CSV written by the percentile cell
# (output_file_path is defined there).
df_total = pd.read_csv(output_file_path)

In [51]:
# Replace "Total Score" with its rank (descending: the highest score gets the
# smallest rank number). The raw score is preserved in "Raw Total Score" and is
# always used as the ranking input, so re-running this cell does not rank the
# ranks (which would reverse the ordering).
if "Raw Total Score" not in df_total.columns:
    df_total["Raw Total Score"] = df_total["Total Score"]
df_total["Total Score"] = df_total["Raw Total Score"].rank(ascending=False)

In [53]:
# Largest value in the "Total Score" column; after the ranking cell above this
# column holds ranks, so this is the rank of the lowest-scoring player.
df_total["Total Score"].max()

1907.0