# Load imports and dataset

In [211]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

# preprocessing imports
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# dimension reduction imports
from sklearn.decomposition import PCA

# clustering/similarity imports
from sklearn.neighbors import NearestNeighbors

# Show every column when a DataFrame is displayed (no column truncation)
pd.set_option("display.max_columns", None)

# Silence all warnings for the rest of the session
warnings.simplefilter("ignore")

In [212]:
# Read the EA Sports 26 player CSV (path is relative to this notebook).
# low_memory=False makes pandas parse the file in a single pass.
df = pd.read_csv(
    filepath_or_buffer="../raw_data/FC26_20250921.csv",
    low_memory=False,
)

# Preprocessing

#### Drop columns, Scale and Encode features, Impute NaNs

## Dropping columns

In [213]:
# The in-game 'boost' columns are not relevant here. They are selected by
# position: the 27 columns immediately before the last column of `df`.
# NOTE: because the selection is positional, this cell must run exactly once
# on the freshly loaded frame.
drop_cols = df.columns[-28:-1]
df = df.drop(drop_cols, axis="columns")

# The work rate column is empty, so it is removed as well
df = df.drop("work_rate", axis="columns")

In [214]:
# Numerical and categorical columns selected from `df` for the feature table,
# listed group by group (the order of the groups is the column order).
features_columns = (
    # player identifier and listed positions
    ['player_id', 'player_positions']
    # overall / potential ratings
    + ['overall', 'potential']
    # body measurements
    + ['height_cm', 'weight_kg']
    # footedness, skill moves and reputation
    + ['preferred_foot', 'weak_foot', 'skill_moves', 'international_reputation']
    # grouped attribute scores
    + ['pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic']
    # detailed attacking attributes
    + ['attacking_crossing', 'attacking_finishing', 'attacking_heading_accuracy',
       'attacking_short_passing', 'attacking_volleys']
    # detailed skill attributes
    + ['skill_dribbling', 'skill_curve', 'skill_fk_accuracy',
       'skill_long_passing', 'skill_ball_control']
    # detailed movement attributes
    + ['movement_acceleration', 'movement_sprint_speed', 'movement_agility',
       'movement_reactions', 'movement_balance']
    # detailed power attributes
    + ['power_shot_power', 'power_jumping', 'power_stamina',
       'power_strength', 'power_long_shots']
    # detailed mentality attributes
    + ['mentality_aggression', 'mentality_interceptions', 'mentality_positioning',
       'mentality_vision', 'mentality_penalties', 'mentality_composure']
    # detailed defending attributes
    + ['defending_marking_awareness', 'defending_standing_tackle',
       'defending_sliding_tackle']
    # detailed goalkeeping attributes
    + ['goalkeeping_diving', 'goalkeeping_handling', 'goalkeeping_kicking',
       'goalkeeping_positioning', 'goalkeeping_reflexes', 'goalkeeping_speed']
)

In [215]:
# Additional profiling features used for filtering down the line
# info_columns = ['player_id', 'player_url', 'short_name', 'long_name',
#        'value_eur', 'wage_eur', 'age', 'dob', 'league_id', 'league_name',
#        'league_level', 'club_team_id', 'club_name', 'club_position',
#        'club_jersey_number', 'club_loaned_from', 'club_joined_date',
#        'club_contract_valid_until_year', 'nationality_id', 'nationality_name',
#        'nation_team_id', 'nation_position', 'nation_jersey_number',
#        'body_type', 'real_face',
#        'release_clause_eur', 'player_tags', 'player_traits',
#        'player_face_url']

In [216]:
# Create the features dataframe as an explicit, independent copy of the
# selected columns. Later cells add and overwrite columns on this frame;
# without .copy() those assignments go through a slice of `df` and trigger
# pandas' SettingWithCopyWarning.
player_features_df = df[features_columns].copy()
# player_info_df =  df[info_columns]

## Creating Primary Positions column

In [217]:
# 'player_positions' holds comma-separated positions; the entry before the
# first comma is stored in a new column as the player's primary position
position_lists = player_features_df['player_positions'].str.split(',')
player_features_df['primary_position'] = position_lists.str.get(0)

## Scaling and encoding features

In [218]:
# Numerical features, listed group by group
numeric_columns = (
    # ratings, body measurements, footedness / skill-move / reputation scores
    ['overall', 'potential', 'height_cm', 'weight_kg',
     'weak_foot', 'skill_moves', 'international_reputation']
    # grouped attribute scores
    + ['pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic']
    # detailed attacking attributes
    + ['attacking_crossing', 'attacking_finishing', 'attacking_heading_accuracy',
       'attacking_short_passing', 'attacking_volleys']
    # detailed skill attributes
    + ['skill_dribbling', 'skill_curve', 'skill_fk_accuracy',
       'skill_long_passing', 'skill_ball_control']
    # detailed movement attributes
    + ['movement_acceleration', 'movement_sprint_speed', 'movement_agility',
       'movement_reactions', 'movement_balance']
    # detailed power attributes
    + ['power_shot_power', 'power_jumping', 'power_stamina',
       'power_strength', 'power_long_shots']
    # detailed mentality attributes
    + ['mentality_aggression', 'mentality_interceptions', 'mentality_positioning',
       'mentality_vision', 'mentality_penalties', 'mentality_composure']
    # detailed defending attributes
    + ['defending_marking_awareness', 'defending_standing_tackle',
       'defending_sliding_tackle']
    # detailed goalkeeping attributes
    + ['goalkeeping_diving', 'goalkeeping_handling', 'goalkeeping_kicking',
       'goalkeeping_positioning', 'goalkeeping_reflexes', 'goalkeeping_speed']
)

# Categorical features
categorical_columns = [
    'preferred_foot',
    'player_positions',
    'primary_position',
]

In [219]:
# Rescale every numerical feature with a MinMaxScaler (default settings) and
# write the scaled values back into the same columns
mm_scaler = MinMaxScaler()
unscaled_numeric = player_features_df[numeric_columns]
player_features_df[numeric_columns] = mm_scaler.fit_transform(unscaled_numeric)

In [220]:
# One-hot encode the primary position and then the preferred foot. Each column
# gets its own freshly fitted encoder, and the resulting indicator columns are
# added to the features frame under the encoder's generated names.
for source_column in ('primary_position', 'preferred_foot'):
    ohe = OneHotEncoder(sparse_output=False)
    encoded_values = ohe.fit_transform(player_features_df[[source_column]])
    player_features_df[ohe.get_feature_names_out()] = encoded_values

# Drop the original categorical columns ('player_positions' is dropped without
# being encoded; only the two columns above were)
player_features_df = player_features_df.drop(columns=categorical_columns)


## Imputing NaNs

In [221]:
# Replace missing values with 0:
#   - goalkeeping speed, which is missing for all outfield players
#   - the grouped outfield attribute scores, which are missing for goalkeepers
columns_to_fill = ['pace', 'physic', 'defending', 'passing', 'shooting', 'dribbling']
zero_fill_columns = ['goalkeeping_speed', *columns_to_fill]
player_features_df[zero_fill_columns] = player_features_df[zero_fill_columns].fillna(0)

## Assigning final features table

In [222]:
# Final feature table X: a new frame holding every column of the features
# dataframe except the player ID
# X.set_index('player_id', inplace=True) # with player ID as the index
X = player_features_df.drop(columns=['player_id'])

# Principal Component Analysis

### Creating principal components (linear combinations of our features that capture the most variance) to reduce the number of dimensions.

Questions: Which features should we include? How many components should we choose?


Notes: Required 'explained variance' from Principal Components can be changed

#### PCA on all features

In [223]:
# Fit a PCA that keeps every component, using all features of X
X_all = X.copy()
pca = PCA()
X_pca = pca.fit_transform(X_all)

# Running total of the explained-variance ratio, component by component
explained_variance = pca.explained_variance_ratio_.cumsum()

# Smallest number of components whose cumulative share reaches 95%:
# argmax returns the position of the first True, +1 turns it into a count
reaches_target = explained_variance >= 0.95
n_components_optimal = reaches_target.argmax() + 1
print(f"Optimal components for 95% variance: {n_components_optimal}")


Optimal components for 95% variance: 19


In [224]:
# Refit the PCA keeping only the number of components selected above
pca_all = PCA(n_components=n_components_optimal).fit(X_all)

# Project every player onto those components, keeping the original row index
X_all_proj = pd.DataFrame(pca_all.transform(X_all), index=X_all.index)
X_all_proj

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,-1.977765,-0.298012,0.476839,-0.325512,0.992991,-0.621234,0.552504,-0.163471,0.681490,0.321033,-0.093658,-0.106062,0.207012,-0.092149,0.013089,-0.046210,0.237256,-0.018536,0.439985
1,-1.950642,-0.103675,0.586825,-0.651652,1.097780,0.268739,0.098774,-0.498904,-0.150280,0.101931,-0.082822,-0.166954,0.046549,0.038274,0.038024,-0.020338,0.204385,0.257037,0.244062
2,-1.825927,0.002576,0.585798,-0.583836,1.048849,-0.720983,-0.082671,0.486644,-0.169697,-0.005884,-0.041394,-0.242116,-0.287299,-0.165010,0.095670,-0.025171,0.053222,0.054821,0.018037
3,-1.898248,-0.126698,0.579114,-0.453414,0.554392,-0.845003,-0.129556,-0.803335,0.131819,-0.260450,0.143396,-0.066294,-0.234558,-0.033525,0.088264,-0.005136,0.098882,0.264822,0.171141
4,-1.866810,-0.177487,0.525343,-0.808031,0.934142,0.295913,0.141962,-0.436795,-0.110954,0.113097,-0.095954,-0.284292,-0.168493,-0.191752,0.065984,-0.043592,0.066152,0.013554,0.215663
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18400,2.910573,-0.129498,-0.009790,-0.102890,-0.154841,0.196173,-0.122797,0.215456,0.099951,-0.038341,0.054049,0.417082,-0.014077,0.135439,-0.067693,0.031461,0.010992,0.088320,-0.071773
18401,2.870727,-0.158477,-0.025998,-0.144994,-0.232121,0.187785,-0.104837,0.216202,0.090751,-0.022241,0.036479,0.305997,-0.076165,0.034445,-0.050068,0.013597,-0.122268,-0.049072,-0.006059
18402,2.846474,-0.194324,-0.055365,-0.253845,-0.336906,0.199299,-0.080937,0.246086,0.101053,-0.015573,0.026632,0.224721,-0.169553,-0.064684,-0.054059,0.009631,-0.118163,-0.101437,-0.000704
18403,2.950349,-0.171766,-0.075425,-0.250070,-0.360941,0.185112,-0.095806,0.217552,0.081977,-0.016912,0.017579,0.220448,-0.151267,-0.054208,-0.326221,0.062360,-0.062415,0.114988,0.076606


#### PCA on only detailed skill attributes

In [225]:
# Detailed skill attributes only (no position information, footedness, height
# etc., and no aggregated stats). The names are built as '<family>_<attribute>',
# family by family, in column order.
skill_attributes_columns = [
    f'{family}_{attribute}'
    for family, attributes in (
        ('attacking', ('crossing', 'finishing', 'heading_accuracy',
                       'short_passing', 'volleys')),
        ('skill', ('dribbling', 'curve', 'fk_accuracy',
                   'long_passing', 'ball_control')),
        ('movement', ('acceleration', 'sprint_speed', 'agility',
                      'reactions', 'balance')),
        ('power', ('shot_power', 'jumping', 'stamina',
                   'strength', 'long_shots')),
        ('mentality', ('aggression', 'interceptions', 'positioning',
                       'vision', 'penalties', 'composure')),
        ('defending', ('marking_awareness', 'standing_tackle',
                       'sliding_tackle')),
        ('goalkeeping', ('diving', 'handling', 'kicking',
                         'positioning', 'reflexes', 'speed')),
    )
    for attribute in attributes
]

In [226]:
# Fit a PCA that keeps every component, using only the detailed skill attributes
X_attributes = X[skill_attributes_columns].copy()
pca = PCA()
X_pca = pca.fit_transform(X_attributes)

# Running total of the explained-variance ratio, component by component
explained_variance = pca.explained_variance_ratio_.cumsum()

# Smallest number of components whose cumulative share reaches 95%:
# argmax returns the position of the first True, +1 turns it into a count
reaches_target = explained_variance >= 0.95
n_components_optimal = reaches_target.argmax() + 1
print(f"Optimal components for 95% variance: {n_components_optimal}")

Optimal components for 95% variance: 12


In [227]:
# PCA on the detailed attributes only, keeping the number of components
# selected above
pca_attributes = PCA(n_components=n_components_optimal).fit(X_attributes)

# Project every player onto those components, keeping the original row index
X_attributes_proj = pd.DataFrame(
    pca_attributes.transform(X_attributes),
    index=X_attributes.index,
)
X_attributes_proj

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1.709295,0.002810,0.796939,-0.153128,0.059983,-0.107279,0.187202,0.049477,-0.162496,-0.032854,-0.053074,0.065296
1,1.666497,0.153331,0.772054,-0.230626,0.053121,0.059923,-0.072182,0.080221,-0.322362,0.025191,0.005538,0.015813
2,1.631639,0.190027,0.735140,-0.455352,-0.076887,0.047197,0.061918,-0.035287,0.086741,0.012795,-0.081895,-0.061393
3,1.640518,0.095229,0.616773,-0.267366,0.179086,0.143211,-0.029078,-0.059534,-0.163570,-0.034278,-0.192495,-0.071766
4,1.630969,0.059796,0.643551,-0.469841,0.047872,-0.040498,0.179024,-0.040803,-0.065824,0.035438,-0.092683,-0.008853
...,...,...,...,...,...,...,...,...,...,...,...,...
18400,-2.415902,-0.028274,-0.199080,0.070714,-0.413341,0.043232,-0.162192,0.057485,-0.029536,0.060726,0.054663,0.051546
18401,-2.388558,-0.051306,-0.255158,0.037082,-0.362183,0.095318,0.030173,0.039269,0.126831,0.006843,0.007039,0.002183
18402,-2.365826,-0.100808,-0.379345,-0.165054,-0.390031,0.039584,0.091042,0.088822,0.136188,-0.003702,-0.018243,0.018337
18403,-2.492990,-0.078827,-0.383500,-0.111888,-0.362900,0.109115,-0.114662,-0.004385,-0.159936,0.078572,-0.056480,-0.212723


#### PCA on detailed skill attributes and physical features

In [228]:
# Features without position information and without the aggregated attributes,
# listed group by group
attributes_and_physical_features = (
    # physical measurements, weak foot and skill moves
    ['height_cm', 'weight_kg', 'weak_foot', 'skill_moves']
    # detailed attacking attributes
    + ['attacking_crossing', 'attacking_finishing', 'attacking_heading_accuracy',
       'attacking_short_passing', 'attacking_volleys']
    # detailed skill attributes
    + ['skill_dribbling', 'skill_curve', 'skill_fk_accuracy',
       'skill_long_passing', 'skill_ball_control']
    # detailed movement attributes
    + ['movement_acceleration', 'movement_sprint_speed', 'movement_agility',
       'movement_reactions', 'movement_balance']
    # detailed power attributes
    + ['power_shot_power', 'power_jumping', 'power_stamina',
       'power_strength', 'power_long_shots']
    # detailed mentality attributes
    + ['mentality_aggression', 'mentality_interceptions', 'mentality_positioning',
       'mentality_vision', 'mentality_penalties', 'mentality_composure']
    # detailed defending attributes
    + ['defending_marking_awareness', 'defending_standing_tackle',
       'defending_sliding_tackle']
    # detailed goalkeeping attributes
    + ['goalkeeping_diving', 'goalkeeping_handling', 'goalkeeping_kicking',
       'goalkeeping_positioning', 'goalkeeping_reflexes', 'goalkeeping_speed']
    # one-hot encoded preferred foot
    + ['preferred_foot_Left', 'preferred_foot_Right']
)

In [229]:
# Fit a PCA that keeps every component, using the features without position
# information and without the aggregated attributes
X_physical_and_attributes = X[attributes_and_physical_features].copy()
pca = PCA()
X_pca = pca.fit_transform(X_physical_and_attributes)

# Running total of the explained-variance ratio, component by component
explained_variance = pca.explained_variance_ratio_.cumsum()

# Smallest number of components whose cumulative share reaches 95%:
# argmax returns the position of the first True, +1 turns it into a count
reaches_target = explained_variance >= 0.95
n_components_optimal = reaches_target.argmax() + 1
print(f"Optimal components for 95% variance: {n_components_optimal}")


Optimal components for 95% variance: 13


In [230]:
# Refit the PCA keeping only the number of components selected above
pca_physical_and_attributes = PCA(n_components=n_components_optimal).fit(
    X_physical_and_attributes
)

# Project every player onto those components, keeping the original row index
X_physical_and_attributes_proj = pd.DataFrame(
    pca_physical_and_attributes.transform(X_physical_and_attributes),
    index=X_physical_and_attributes.index,
)
X_physical_and_attributes_proj

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,1.685293,-0.623885,0.076874,0.737391,-0.374078,0.066003,0.089693,-0.008184,-0.123521,0.023023,0.020746,-0.251000,0.019756
1,1.608427,-0.582952,0.245805,0.680160,-0.444966,0.059221,0.077895,0.002389,0.104279,-0.246002,-0.087325,-0.184252,0.058253
2,1.580460,-0.561142,0.266546,0.575313,-0.659379,-0.069594,0.096483,-0.102566,0.022781,0.000796,-0.013611,0.108118,-0.015747
3,1.626648,-0.580441,0.145700,0.518093,-0.447053,0.183505,0.093390,0.038814,0.174948,0.060337,-0.096260,-0.164470,-0.011317
4,1.588775,-0.576063,0.127050,0.458388,-0.667785,0.056235,0.093107,-0.155597,-0.100756,-0.036017,-0.065984,-0.047956,0.037325
...,...,...,...,...,...,...,...,...,...,...,...,...,...
18400,-2.489543,0.014172,-0.042012,-0.134103,0.161122,-0.418807,-0.073801,0.170643,0.142087,-0.059261,0.036311,-0.005543,0.070541
18401,-2.445690,0.009953,-0.088718,-0.265503,0.087711,-0.362757,-0.067094,-0.044159,0.070151,0.108554,0.103513,0.042862,0.006182
18402,-2.417332,0.013306,-0.148481,-0.438746,-0.070943,-0.391610,-0.064540,-0.063304,0.009209,0.067748,0.151465,0.041263,-0.009712
18403,-2.545000,0.049727,-0.133475,-0.492181,-0.046529,-0.370708,-0.324599,0.015301,0.106194,-0.038046,-0.081370,-0.049915,0.068100


# Similarity scoring with K-Nearest Neighbours

### Check similar alternatives output after dimensionality reduction.

Questions: How does different feature selection affect similarity suggestions?

Notes: Number of neighbours can be changed

#### Player ID search


In [231]:
# Use this function to find your desired player's unique player ID using their name
# Does not have to be an exact match!
# Sensitive to accents
def get_player_id(name, players=None, regex=False):
    """Look up players whose long or short name contains ``name``.

    The match is case-insensitive and accent-sensitive. By default ``name`` is
    treated as literal text, so characters such as ``.`` or ``(`` are matched
    as written instead of being interpreted as a regular expression.

    Parameters
    ----------
    name : str
        Text to search for in the 'long_name' and 'short_name' columns.
    players : pandas.DataFrame, optional
        Table to search. Defaults to the notebook-level ``df``. It must contain
        the columns 'long_name', 'short_name', 'nationality_name', 'club_name',
        'player_positions' and 'player_id'.
    regex : bool, default False
        Set to True to interpret ``name`` as a regular expression.

    Returns
    -------
    pandas.DataFrame
        The matching rows (original index kept) restricted to the six columns
        listed above. The extra columns help tell apart players who share a name.
    """
    if players is None:
        players = df

    player_names_ids = players[['long_name', 'short_name', 'nationality_name',
                                'club_name', 'player_positions', 'player_id']]

    # na=False: a missing name counts as "no match" so the mask stays boolean
    search_options = {'case': False, 'regex': regex, 'na': False}
    in_long_name = player_names_ids['long_name'].str.contains(name, **search_options)
    in_short_name = player_names_ids['short_name'].str.contains(name, **search_options)

    return player_names_ids[in_long_name | in_short_name]

In [232]:
# Example demonstrating the need for the extra columns: this name matches more
# than one player, and only the additional columns tell them apart
get_player_id(name='Moussa Diarra')

Unnamed: 0,long_name,short_name,nationality_name,club_name,player_positions,player_id
8556,Moussa Diarra,M. Diarra,Mali,Deportivo Alavés,"CB, LB",250725
12454,Moussa Diarra,M. Diarra,Mali,Málaga CF,CB,263677


#### Similarity matching when selecting all features


In [233]:
# Test using all features
from sklearn.neighbors import NearestNeighbors
knn_all = NearestNeighbors(
    n_neighbors=6,        # 1 self + 5 similar players
    metric='cosine'       # best for similarity in high dimensions
)
knn_all.fit(X_all_proj)

# Function only producing similar alternatives based on selected features
def find_similar_players_all(player_id, n_similar=5):
    """Return the players most similar to ``player_id`` in the all-features PCA space.

    Relies on ``X_all_proj``, ``player_features_df`` and ``df`` having their
    rows in the same order, because neighbours are mapped back by row position.

    Parameters
    ----------
    player_id : int
        Value from the 'player_id' column identifying the query player.
    n_similar : int, default 5
        Number of similar players to return.

    Returns
    -------
    pandas.DataFrame
        One row per similar player, nearest first, with selected columns from
        ``df`` plus 'similarity' (1 - cosine distance, rounded to 4 decimals).
        The index is reset.

    Raises
    ------
    IndexError
        If no row of ``player_features_df`` has the given ``player_id``.
    """
    # Step 1: Find the row *position* of the player (positions, not index
    # labels, are what .iloc and the neighbour indices refer to)
    matches = np.flatnonzero(player_features_df['player_id'].to_numpy() == player_id)
    if matches.size == 0:
        raise IndexError(f"player_id {player_id!r} not found in player_features_df")
    player_pos = int(matches[0])
    # Step 2: Get nearest neighbors (one extra, since the player itself is
    # normally among them)
    distances, indices = knn_all.kneighbors(
        X_all_proj.iloc[[player_pos]], n_neighbors=n_similar + 1
    )
    # Step 3: Exclude the queried player wherever it appears; with exact ties
    # it is not guaranteed to be the first neighbour
    is_other_player = indices[0] != player_pos
    similar_indices = indices[0][is_other_player][:n_similar]
    similar_distances = distances[0][is_other_player][:n_similar]
    # Step 4: Convert distance to similarity score
    similarity_scores = 1 - similar_distances
    # Step 5: Return selected columns (copied so a column can be added safely)
    results = df.iloc[similar_indices][[
        'player_id', 'long_name', 'short_name', 'player_positions', 'overall', 'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic', 'value_eur'
    ]].copy()
    # Step 6: Add the similarity score column (rounded for readability)
    results['similarity'] = similarity_scores.round(4)
    return results.reset_index(drop=True)


#### Similarity matching when selecting only detailed skill attributes

In [234]:
# Test using only skill attributes features
from sklearn.neighbors import NearestNeighbors
knn_skill = NearestNeighbors(
    n_neighbors=6,        # 1 self + 5 similar players
    metric='cosine'       # best for similarity in high dimensions
)
knn_skill.fit(X_attributes_proj)

# Function only producing similar alternatives based on selected features
def find_similar_players_skill(player_id, n_similar=5):
    """Return the players most similar to ``player_id`` in the skill-attributes PCA space.

    Relies on ``X_attributes_proj``, ``player_features_df`` and ``df`` having
    their rows in the same order, because neighbours are mapped back by row
    position.

    Parameters
    ----------
    player_id : int
        Value from the 'player_id' column identifying the query player.
    n_similar : int, default 5
        Number of similar players to return.

    Returns
    -------
    pandas.DataFrame
        One row per similar player, nearest first, with selected columns from
        ``df`` plus 'similarity' (1 - cosine distance, rounded to 4 decimals).
        The index is reset.

    Raises
    ------
    IndexError
        If no row of ``player_features_df`` has the given ``player_id``.
    """
    # Step 1: Find the row *position* of the player (positions, not index
    # labels, are what .iloc and the neighbour indices refer to)
    matches = np.flatnonzero(player_features_df['player_id'].to_numpy() == player_id)
    if matches.size == 0:
        raise IndexError(f"player_id {player_id!r} not found in player_features_df")
    player_pos = int(matches[0])
    # Step 2: Get nearest neighbors (one extra, since the player itself is
    # normally among them)
    distances, indices = knn_skill.kneighbors(
        X_attributes_proj.iloc[[player_pos]], n_neighbors=n_similar + 1
    )
    # Step 3: Exclude the queried player wherever it appears; with exact ties
    # it is not guaranteed to be the first neighbour
    is_other_player = indices[0] != player_pos
    similar_indices = indices[0][is_other_player][:n_similar]
    similar_distances = distances[0][is_other_player][:n_similar]
    # Step 4: Convert distance to similarity score
    similarity_scores = 1 - similar_distances
    # Step 5: Return selected columns (copied so a column can be added safely)
    results = df.iloc[similar_indices][[
        'short_name', 'player_positions', 'overall', 'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic', 'value_eur'
    ]].copy()
    # Step 6: Add the similarity score column (rounded for readability)
    results['similarity'] = similarity_scores.round(4)
    return results.reset_index(drop=True)


#### Similarity matching when selecting only detailed skill attributes and some physical features

In [235]:
# Test using only skill and physical attributes features
from sklearn.neighbors import NearestNeighbors
knn_phy_skill = NearestNeighbors(
    n_neighbors=6,        # 1 self + 5 similar players
    metric='cosine'       # best for similarity in high dimensions
)
knn_phy_skill.fit(X_physical_and_attributes_proj)

# Function only producing similar alternatives based on selected features
def find_similar_players_phy_skill(player_id, n_similar=5):
    """Return the players most similar to ``player_id`` in the skill-and-physical PCA space.

    Relies on ``X_physical_and_attributes_proj``, ``player_features_df`` and
    ``df`` having their rows in the same order, because neighbours are mapped
    back by row position.

    Parameters
    ----------
    player_id : int
        Value from the 'player_id' column identifying the query player.
    n_similar : int, default 5
        Number of similar players to return.

    Returns
    -------
    pandas.DataFrame
        One row per similar player, nearest first, with selected columns from
        ``df`` plus 'similarity' (1 - cosine distance, rounded to 4 decimals).
        The index is reset.

    Raises
    ------
    IndexError
        If no row of ``player_features_df`` has the given ``player_id``.
    """
    # Step 1: Find the row *position* of the player (positions, not index
    # labels, are what .iloc and the neighbour indices refer to)
    matches = np.flatnonzero(player_features_df['player_id'].to_numpy() == player_id)
    if matches.size == 0:
        raise IndexError(f"player_id {player_id!r} not found in player_features_df")
    player_pos = int(matches[0])
    # Step 2: Get nearest neighbors (one extra, since the player itself is
    # normally among them)
    distances, indices = knn_phy_skill.kneighbors(
        X_physical_and_attributes_proj.iloc[[player_pos]], n_neighbors=n_similar + 1
    )
    # Step 3: Exclude the queried player wherever it appears; with exact ties
    # it is not guaranteed to be the first neighbour
    is_other_player = indices[0] != player_pos
    similar_indices = indices[0][is_other_player][:n_similar]
    similar_distances = distances[0][is_other_player][:n_similar]
    # Step 4: Convert distance to similarity score
    similarity_scores = 1 - similar_distances
    # Step 5: Return selected columns (copied so a column can be added safely)
    results = df.iloc[similar_indices][[
        'short_name', 'player_positions', 'overall', 'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic', 'value_eur'
    ]].copy()
    # Step 6: Add the similarity score column (rounded for readability)
    results['similarity'] = similarity_scores.round(4)
    return results.reset_index(drop=True)

#### Put it all together!

In [236]:
# Search by (partial) name to find the player ID
get_player_id(name='messi')

Unnamed: 0,long_name,short_name,nationality_name,club_name,player_positions,player_id
240,Lionel Andrés Messi Cuccitini,L. Messi,Argentina,Inter Miami,"RW, ST, CAM, RM",158023
1327,Junior Walter Messias,Junior Messias,Brazil,Genoa,"CM, RM, CAM",240938
11175,Pablo Ruan Messias Cardozo,Pablo Ruan,Brazil,CD Nacional,"ST, RW, RM",79479
12715,Rayane Messi Tanfouri,R. Messi,France,Pau FC,"LM, LW",75421


In [237]:
# Pass the player_id found by the search above
find_similar_players_skill(player_id=158023)

Unnamed: 0,short_name,player_positions,overall,pace,shooting,passing,dribbling,defending,physic,value_eur,similarity
0,C. Nkunku,"CAM, LM, ST, CM",81,77.0,79.0,80.0,82.0,40.0,59.0,27500000,0.9911
1,R. Mahrez,"RM, RW",84,78.0,80.0,81.0,88.0,39.0,63.0,20000000,0.9892
2,S. Benrahma,"LW, LM, CAM",77,75.0,75.0,74.0,80.0,42.0,61.0,10500000,0.986
3,Ivi López,"CAM, ST, LW",73,73.0,72.0,72.0,75.0,40.0,66.0,2500000,0.9857
4,P. Dybala,"CAM, ST",86,80.0,85.0,84.0,87.0,41.0,64.0,56500000,0.9853


## Thoughts

#### Quick note on the brief quality checks completed and the recommended feature selection. 

Short explanation: Select the detailed skill attributes only


Longer explanation: I checked each feature-selection option for the quality of its player suggestions. I looked at outfield players who were perhaps average overall but had a few key attributes that were really good. These were best taken into account when only skill attributes were selected. For goalkeepers, a key differentiator in real life is whether they can pass the ball well. Again, using only skill attributes produces a matching engine that best picks up these key differences.