# Load imports and dataset

In [256]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

# preprocessing imports
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# dimension reduction imports
from sklearn.decomposition import PCA

# clustering/similarity imports
from sklearn.neighbors import NearestNeighbors

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas warnings such as SettingWithCopyWarning.
warnings.simplefilter("ignore")

# Never truncate the column list when a DataFrame is displayed
pd.options.display.max_columns = None

In [257]:
# Read the EA Sports 26 CSV (path is relative to the notebook's working directory)
fc26_csv_path = "../raw_data/FC26_20250921.csv"
df = pd.read_csv(fc26_csv_path, low_memory=False)

# Preprocessing

#### Drop columns, Scale and Encode features, Impute NaNs

## Dropping columns

In [258]:
# The 27 columns sitting just before the last column are in-game 'boost'
# columns that are not relevant here. The selection is positional, so this
# cell is not idempotent: re-running it would drop a further 27 columns.
drop_cols = df.columns[-28:-1]

# The work rate column is empty, so it is removed in the same chain
df = df.drop(columns=drop_cols).drop(columns=['work_rate'])

In [259]:
# Numerical and categorical columns pulled from the raw table, grouped by theme.
# The groups are flattened in order, so the resulting column order is fixed.
features_columns = [
    column
    for group in (
        # identifier and raw positions string
        ['player_id', 'player_positions'],
        # headline ratings
        ['overall', 'potential'],
        # body measurements
        ['height_cm', 'weight_kg'],
        # footedness, flair and reputation
        ['preferred_foot', 'weak_foot', 'skill_moves', 'international_reputation'],
        # aggregated attribute scores
        ['pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic'],
        # detailed attributes
        ['attacking_crossing', 'attacking_finishing', 'attacking_heading_accuracy',
         'attacking_short_passing', 'attacking_volleys'],
        ['skill_dribbling', 'skill_curve', 'skill_fk_accuracy',
         'skill_long_passing', 'skill_ball_control'],
        ['movement_acceleration', 'movement_sprint_speed', 'movement_agility',
         'movement_reactions', 'movement_balance'],
        ['power_shot_power', 'power_jumping', 'power_stamina',
         'power_strength', 'power_long_shots'],
        ['mentality_aggression', 'mentality_interceptions', 'mentality_positioning',
         'mentality_vision', 'mentality_penalties', 'mentality_composure'],
        ['defending_marking_awareness', 'defending_standing_tackle',
         'defending_sliding_tackle'],
        ['goalkeeping_diving', 'goalkeeping_handling', 'goalkeeping_kicking',
         'goalkeeping_positioning', 'goalkeeping_reflexes', 'goalkeeping_speed'],
    )
    for column in group
]

In [260]:
# Additional profiling features used for filtering down the line
# info_columns = ['player_id', 'player_url', 'short_name', 'long_name',
#        'value_eur', 'wage_eur', 'age', 'dob', 'league_id', 'league_name',
#        'league_level', 'club_team_id', 'club_name', 'club_position',
#        'club_jersey_number', 'club_loaned_from', 'club_joined_date',
#        'club_contract_valid_until_year', 'nationality_id', 'nationality_name',
#        'nation_team_id', 'nation_position', 'nation_jersey_number',
#        'body_type', 'real_face',
#        'release_clause_eur', 'player_tags', 'player_traits',
#        'player_face_url']

In [261]:
# Create the features dataframe.
# .copy() makes it an independent frame: the following cells add and overwrite
# columns on it, and doing that on a bare column selection of df triggers
# pandas' SettingWithCopyWarning (currently hidden by the global warnings filter).
player_features_df = df[features_columns].copy()
# player_info_df =  df[info_columns]  # disabled: info_columns is commented out above

## Creating Primary Positions column

In [262]:
# A player's primary position is the first entry of the comma-separated
# positions string; it is stored in a new column.
positions = player_features_df['player_positions']
player_features_df['primary_position'] = positions.str.split(',', n=1).str.get(0)

## Scaling and encoding features

In [263]:
# Numerical features (scaled later), grouped by theme and flattened in order
numeric_columns = [
    column
    for group in (
        # headline ratings and body measurements
        ['overall', 'potential', 'height_cm', 'weight_kg'],
        # star-style ratings
        ['weak_foot', 'skill_moves', 'international_reputation'],
        # aggregated attribute scores
        ['pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic'],
        # detailed attributes
        ['attacking_crossing', 'attacking_finishing', 'attacking_heading_accuracy',
         'attacking_short_passing', 'attacking_volleys'],
        ['skill_dribbling', 'skill_curve', 'skill_fk_accuracy',
         'skill_long_passing', 'skill_ball_control'],
        ['movement_acceleration', 'movement_sprint_speed', 'movement_agility',
         'movement_reactions', 'movement_balance'],
        ['power_shot_power', 'power_jumping', 'power_stamina',
         'power_strength', 'power_long_shots'],
        ['mentality_aggression', 'mentality_interceptions', 'mentality_positioning',
         'mentality_vision', 'mentality_penalties', 'mentality_composure'],
        ['defending_marking_awareness', 'defending_standing_tackle',
         'defending_sliding_tackle'],
        ['goalkeeping_diving', 'goalkeeping_handling', 'goalkeeping_kicking',
         'goalkeeping_positioning', 'goalkeeping_reflexes', 'goalkeeping_speed'],
    )
    for column in group
]

# Categorical features (one-hot encoded or dropped later)
categorical_columns = [
    'preferred_foot',
    'player_positions',
    'primary_position',
]

In [264]:
# Min-max scale the numerical features: learn the per-column range first,
# then overwrite the columns with their scaled values.
mm_scaler = MinMaxScaler()
numeric_block = player_features_df[numeric_columns]
player_features_df[numeric_columns] = mm_scaler.fit(numeric_block).transform(numeric_block)

In [265]:
# One-hot encode the categorical features: primary position first, then
# preferred foot. A fresh encoder is built per column, so after the loop
# `ohe` is the encoder fitted on preferred_foot.
for categorical_column in ('primary_position', 'preferred_foot'):
    ohe = OneHotEncoder(sparse_output=False)
    column_frame = player_features_df[[categorical_column]]
    ohe.fit(column_frame)
    player_features_df[ohe.get_feature_names_out()] = ohe.transform(column_frame)

# The original categorical columns are no longer needed once encoded
player_features_df = player_features_df.drop(columns=categorical_columns)


## Imputing NaNs

In [266]:
# Fill in NaNs with 0:
#   - goalkeeping speed, which is missing for all outfield players
#   - the grouped outfield attribute scores, which are missing for goalkeepers
# NOTE(review): this runs after min-max scaling, so 0 presumably equals the
# smallest observed value of each column - confirm that is the intent.
columns_to_fill = ['pace', 'physic', 'defending', 'passing', 'shooting', 'dribbling']
zero_fill = {column: 0 for column in ['goalkeeping_speed'] + columns_to_fill}
player_features_df = player_features_df.fillna(value=zero_fill)

## Assigning final features table

In [267]:
# X is the final feature matrix: the features frame without the player ID.
# drop() returns a new frame, so player_features_df itself keeps player_id.
X = player_features_df.drop(columns=['player_id'])

# Principal Component Analysis

### Creating principal components (linear combinations of our features that capture the most variance) to reduce the number of dimensions.

Questions: Which features should we include? How many components should we choose?


Notes: Required 'explained variance' from Principal Components can be changed

#### PCA on all features

In [268]:
# PCA over every feature in X (working on a copy)
X_all = X.copy()

# Unrestricted PCA keeps all components, giving the full variance curve
pca = PCA()
X_pca = pca.fit_transform(X_all)

# Running total of explained variance; pick the smallest number of
# components whose cumulative share reaches 95%
explained_variance = pca.explained_variance_ratio_.cumsum()
n_components_optimal = (explained_variance >= 0.95).argmax() + 1
print(f"Optimal components for 95% variance: {n_components_optimal}")


Optimal components for 95% variance: 19


In [269]:
# Fit the reduced PCA on all features.
# n_components_optimal comes from the variance cell directly above; run that
# cell first, since the same name is reassigned by the later variance cells.
# random_state pins the result: with the default svd_solver='auto', some
# scikit-learn releases pick the randomized solver for an integer
# n_components, which would make the projection vary between runs.
pca_all = PCA(n_components=n_components_optimal, random_state=42)
pca_all.fit(X_all)

# Remap players onto the principal components, keeping X_all's row index
X_all_proj = pca_all.transform(X_all)
X_all_proj = pd.DataFrame(X_all_proj, index=X_all.index)

#### PCA on only detailed skill attributes

In [270]:
# Detailed skill attributes only (no positioning, footedness, height etc.,
# and no aggregated stats), grouped by theme and flattened in order
skill_attributes_columns = [
    column
    for group in (
        ['attacking_crossing', 'attacking_finishing', 'attacking_heading_accuracy',
         'attacking_short_passing', 'attacking_volleys'],
        ['skill_dribbling', 'skill_curve', 'skill_fk_accuracy',
         'skill_long_passing', 'skill_ball_control'],
        ['movement_acceleration', 'movement_sprint_speed', 'movement_agility',
         'movement_reactions', 'movement_balance'],
        ['power_shot_power', 'power_jumping', 'power_stamina',
         'power_strength', 'power_long_shots'],
        ['mentality_aggression', 'mentality_interceptions', 'mentality_positioning',
         'mentality_vision', 'mentality_penalties', 'mentality_composure'],
        ['defending_marking_awareness', 'defending_standing_tackle',
         'defending_sliding_tackle'],
        ['goalkeeping_diving', 'goalkeeping_handling', 'goalkeeping_kicking',
         'goalkeeping_positioning', 'goalkeeping_reflexes', 'goalkeeping_speed'],
    )
    for column in group
]

In [271]:
# Restrict the feature matrix to the detailed skill attributes
X_attributes = X[skill_attributes_columns].copy()

# Unrestricted PCA keeps all components, giving the full variance curve
pca = PCA()
X_pca = pca.fit_transform(X_attributes)

# Running total of explained variance; pick the smallest number of
# components whose cumulative share reaches 95%
explained_variance = pca.explained_variance_ratio_.cumsum()
n_components_optimal = (explained_variance >= 0.95).argmax() + 1
print(f"Optimal components for 95% variance: {n_components_optimal}")

Optimal components for 95% variance: 12


In [272]:
# PCA on detailed attributes only.
# n_components_optimal comes from the variance cell directly above; run that
# cell first, since the same name is reassigned by the other variance cells.
# random_state pins the result: with the default svd_solver='auto', some
# scikit-learn releases pick the randomized solver for an integer
# n_components, which would make the projection vary between runs.
pca_attributes = PCA(n_components=n_components_optimal, random_state=42)
pca_attributes.fit(X_attributes)

# Remap players onto the principal components, keeping X_attributes' row index
X_attributes_proj = pca_attributes.transform(X_attributes)
X_attributes_proj = pd.DataFrame(X_attributes_proj, index=X_attributes.index)

#### PCA on detailed skill attributes and physical features

In [273]:
# Features without position information and aggregated attributes:
# physical traits, detailed attributes and the one-hot footedness columns,
# grouped by theme and flattened in order
attributes_and_physical_features = [
    column
    for group in (
        # physical / technique ratings
        ['height_cm', 'weight_kg', 'weak_foot', 'skill_moves'],
        # detailed attributes
        ['attacking_crossing', 'attacking_finishing', 'attacking_heading_accuracy',
         'attacking_short_passing', 'attacking_volleys'],
        ['skill_dribbling', 'skill_curve', 'skill_fk_accuracy',
         'skill_long_passing', 'skill_ball_control'],
        ['movement_acceleration', 'movement_sprint_speed', 'movement_agility',
         'movement_reactions', 'movement_balance'],
        ['power_shot_power', 'power_jumping', 'power_stamina',
         'power_strength', 'power_long_shots'],
        ['mentality_aggression', 'mentality_interceptions', 'mentality_positioning',
         'mentality_vision', 'mentality_penalties', 'mentality_composure'],
        ['defending_marking_awareness', 'defending_standing_tackle',
         'defending_sliding_tackle'],
        ['goalkeeping_diving', 'goalkeeping_handling', 'goalkeeping_kicking',
         'goalkeeping_positioning', 'goalkeeping_reflexes', 'goalkeeping_speed'],
        # one-hot encoded preferred foot
        ['preferred_foot_Left', 'preferred_foot_Right'],
    )
    for column in group
]

In [274]:
# PCA on features without position information and aggregated attributes
X_physical_and_attributes = X[attributes_and_physical_features].copy()

# Unrestricted PCA keeps all components, giving the full variance curve
pca = PCA()
X_pca = pca.fit_transform(X_physical_and_attributes)

# Running total of explained variance; pick the smallest number of
# components whose cumulative share reaches 95%
explained_variance = pca.explained_variance_ratio_.cumsum()
n_components_optimal = (explained_variance >= 0.95).argmax() + 1
print(f"Optimal components for 95% variance: {n_components_optimal}")


Optimal components for 95% variance: 13


In [275]:
# Fit the reduced PCA on the physical + detailed attribute features.
# n_components_optimal comes from the variance cell directly above; run that
# cell first, since the same name is reassigned by the other variance cells.
# random_state pins the result: with the default svd_solver='auto', some
# scikit-learn releases pick the randomized solver for an integer
# n_components, which would make the projection vary between runs.
pca_physical_and_attributes = PCA(n_components=n_components_optimal, random_state=42)
pca_physical_and_attributes.fit(X_physical_and_attributes)

# Remap players onto the principal components, keeping the source row index
X_physical_and_attributes_proj = pca_physical_and_attributes.transform(X_physical_and_attributes)
X_physical_and_attributes_proj = pd.DataFrame(X_physical_and_attributes_proj, index=X_physical_and_attributes.index)

# Similarity scoring with K-Nearest Neighbours

### Check similar alternatives output after dimensionality reduction.

Questions: How does different feature selection affect similarity suggestions?

Notes: Number of neighbours can be changed

#### Player ID search


In [276]:
# Use this function to find your desired player's unique player ID using their name
# Does not have to be an exact match!
# Sensitive to accents
def get_player_id(name, regex=True):
    """Look up players whose long or short name contains ``name``.

    Matching is case-insensitive but accents are not normalised.

    Parameters
    ----------
    name : str
        Text to search for in ``long_name`` and ``short_name``.
    regex : bool, default True
        When True (the original behaviour) ``name`` is interpreted as a
        regular expression. Pass False to match it literally, which is the
        safer choice for names containing characters such as ``.`` or ``(``.

    Returns
    -------
    pandas.DataFrame
        Matching rows of ``df`` with the columns long_name, short_name,
        nationality_name, club_name, player_positions and player_id.
        Empty when nothing matches.
    """
    player_names_ids = df[['long_name', 'short_name', 'nationality_name', 'club_name', 'player_positions', 'player_id']]

    def _name_contains(column):
        # na=False: a missing name counts as "no match" rather than putting
        # NaN into the boolean mask that is combined below.
        return player_names_ids[column].str.contains(name, case=False, regex=regex, na=False)

    name_matches = _name_contains('long_name') | _name_contains('short_name')
    return player_names_ids[name_matches]

In [277]:
# Two different players share this name, which is why the lookup also
# returns nationality, club and positions alongside the ID
get_player_id(name='Moussa Diarra')

Unnamed: 0,long_name,short_name,nationality_name,club_name,player_positions,player_id
8556,Moussa Diarra,M. Diarra,Mali,Deportivo Alavés,"CB, LB",250725
12454,Moussa Diarra,M. Diarra,Mali,Málaga CF,CB,263677


#### Similarity matching when selecting all features


In [278]:
# Neighbour index over the all-features PCA projection.
# NearestNeighbors is already imported in the notebook's import cell, so it
# is not re-imported here.
knn_all = NearestNeighbors(
    n_neighbors=6,    # the queried player itself + 5 similar players
    metric='cosine',  # cosine distance; 1 - distance is reported as similarity
)
knn_all.fit(X_all_proj)

# Function only producing similar alternatives based on selected features
def find_similar_players_all(player_id, n_similar=5):
    """Return the players closest to ``player_id`` in the all-features PCA space.

    Parameters
    ----------
    player_id : int
        Value looked up in ``player_features_df['player_id']``.
    n_similar : int, default 5
        Number of alternative players to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``n_similar`` rows of ``df`` (ID, names, positions, headline
        stats, value), nearest first, plus a ``similarity`` column equal to
        ``1 - distance`` rounded to 4 decimals. The index is reset to 0..k-1.

    Raises
    ------
    IndexError
        If ``player_id`` is not present in ``player_features_df``.
    """
    # Step 1: Find the row *position* of the player. A position (not an index
    # label) is needed because every lookup below is positional (.iloc).
    matches = np.flatnonzero((player_features_df['player_id'] == player_id).to_numpy())
    if matches.size == 0:
        raise IndexError(f"player_id {player_id!r} not found in player_features_df")
    player_pos = matches[0]
    # Step 2: Get nearest neighbours (one extra to leave room for the player itself)
    query = X_all_proj.iloc[[player_pos]].to_numpy()
    distances, indices = knn_all.kneighbors(query, n_neighbors=n_similar + 1)
    # Step 3: Exclude the queried player by position; on distance ties it is
    # not guaranteed to be the first neighbour returned.
    is_other_player = indices[0] != player_pos
    similar_indices = indices[0][is_other_player][:n_similar]
    similar_distances = distances[0][is_other_player][:n_similar]
    # Step 4: Convert distance to similarity score
    similarity_scores = 1 - similar_distances
    # Step 5: Return selected columns. Relies on df sharing its row order with
    # X_all_proj; .copy() so the new column is not written onto a slice of df.
    results = df.iloc[similar_indices][[
        'player_id', 'long_name', 'short_name', 'player_positions', 'overall', 'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic', 'value_eur'
    ]].copy()
    # Step 6: Add the similarity score column (rounded for readability)
    results['similarity'] = similarity_scores.round(4)
    return results.reset_index(drop=True)


#### Similarity matching when selecting only detailed skill attributes

In [279]:
# Neighbour index over the skill-attributes PCA projection.
# NearestNeighbors is already imported in the notebook's import cell, so it
# is not re-imported here.
knn_skill = NearestNeighbors(
    n_neighbors=6,    # the queried player itself + 5 similar players
    metric='cosine',  # cosine distance; 1 - distance is reported as similarity
)
knn_skill.fit(X_attributes_proj)

# Function only producing similar alternatives based on selected features
def find_similar_players_skill(player_id, n_similar=5):
    """Return the players closest to ``player_id`` in the skill-attributes PCA space.

    Parameters
    ----------
    player_id : int
        Value looked up in ``player_features_df['player_id']``.
    n_similar : int, default 5
        Number of alternative players to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``n_similar`` rows of ``df`` (short name, positions, headline
        stats, value), nearest first, plus a ``similarity`` column equal to
        ``1 - distance`` rounded to 4 decimals. The index is reset to 0..k-1.

    Raises
    ------
    IndexError
        If ``player_id`` is not present in ``player_features_df``.
    """
    # Step 1: Find the row *position* of the player. A position (not an index
    # label) is needed because every lookup below is positional (.iloc).
    matches = np.flatnonzero((player_features_df['player_id'] == player_id).to_numpy())
    if matches.size == 0:
        raise IndexError(f"player_id {player_id!r} not found in player_features_df")
    player_pos = matches[0]
    # Step 2: Get nearest neighbours (one extra to leave room for the player itself)
    query = X_attributes_proj.iloc[[player_pos]].to_numpy()
    distances, indices = knn_skill.kneighbors(query, n_neighbors=n_similar + 1)
    # Step 3: Exclude the queried player by position; on distance ties it is
    # not guaranteed to be the first neighbour returned.
    is_other_player = indices[0] != player_pos
    similar_indices = indices[0][is_other_player][:n_similar]
    similar_distances = distances[0][is_other_player][:n_similar]
    # Step 4: Convert distance to similarity score
    similarity_scores = 1 - similar_distances
    # Step 5: Return selected columns. Relies on df sharing its row order with
    # X_attributes_proj; .copy() so the new column is not written onto a slice of df.
    results = df.iloc[similar_indices][[
        'short_name', 'player_positions', 'overall', 'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic', 'value_eur'
    ]].copy()
    # Step 6: Add the similarity score column (rounded for readability)
    results['similarity'] = similarity_scores.round(4)
    return results.reset_index(drop=True)


#### Similarity matching when selecting only detailed skill attributes and some physical features

In [280]:
# Neighbour index over the skill + physical attributes PCA projection.
# NearestNeighbors is already imported in the notebook's import cell, so it
# is not re-imported here.
knn_phy_skill = NearestNeighbors(
    n_neighbors=6,    # the queried player itself + 5 similar players
    metric='cosine',  # cosine distance; 1 - distance is reported as similarity
)
knn_phy_skill.fit(X_physical_and_attributes_proj)

# Function only producing similar alternatives based on selected features
def find_similar_players_phy_skill(player_id, n_similar=5):
    """Return the players closest to ``player_id`` in the skill + physical PCA space.

    Parameters
    ----------
    player_id : int
        Value looked up in ``player_features_df['player_id']``.
    n_similar : int, default 5
        Number of alternative players to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``n_similar`` rows of ``df`` (short name, positions, headline
        stats, value), nearest first, plus a ``similarity`` column equal to
        ``1 - distance`` rounded to 4 decimals. The index is reset to 0..k-1.

    Raises
    ------
    IndexError
        If ``player_id`` is not present in ``player_features_df``.
    """
    # Step 1: Find the row *position* of the player. A position (not an index
    # label) is needed because every lookup below is positional (.iloc).
    matches = np.flatnonzero((player_features_df['player_id'] == player_id).to_numpy())
    if matches.size == 0:
        raise IndexError(f"player_id {player_id!r} not found in player_features_df")
    player_pos = matches[0]
    # Step 2: Get nearest neighbours (one extra to leave room for the player itself)
    query = X_physical_and_attributes_proj.iloc[[player_pos]].to_numpy()
    distances, indices = knn_phy_skill.kneighbors(query, n_neighbors=n_similar + 1)
    # Step 3: Exclude the queried player by position; on distance ties it is
    # not guaranteed to be the first neighbour returned.
    is_other_player = indices[0] != player_pos
    similar_indices = indices[0][is_other_player][:n_similar]
    similar_distances = distances[0][is_other_player][:n_similar]
    # Step 4: Convert distance to similarity score
    similarity_scores = 1 - similar_distances
    # Step 5: Return selected columns. Relies on df sharing its row order with
    # X_physical_and_attributes_proj; .copy() so the new column is not written
    # onto a slice of df.
    results = df.iloc[similar_indices][[
        'short_name', 'player_positions', 'overall', 'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic', 'value_eur'
    ]].copy()
    # Step 6: Add the similarity score column (rounded for readability)
    results['similarity'] = similarity_scores.round(4)
    return results.reset_index(drop=True)

#### Put it all together!

In [287]:
# Look up the player ID by (partial, case-insensitive) name
get_player_id(name='messi')

Unnamed: 0,long_name,short_name,nationality_name,club_name,player_positions,player_id
240,Lionel Andrés Messi Cuccitini,L. Messi,Argentina,Inter Miami,"RW, ST, CAM, RM",158023
1327,Junior Walter Messias,Junior Messias,Brazil,Genoa,"CM, RM, CAM",240938
11175,Pablo Ruan Messias Cardozo,Pablo Ruan,Brazil,CD Nacional,"ST, RW, RM",79479
12715,Rayane Messi Tanfouri,R. Messi,France,Pau FC,"LM, LW",75421


In [288]:
# Feed the player_id found by the search into the skill-attributes matcher
find_similar_players_skill(player_id=158023)

Unnamed: 0,short_name,player_positions,overall,pace,shooting,passing,dribbling,defending,physic,value_eur,similarity
0,C. Nkunku,"CAM, LM, ST, CM",81,77.0,79.0,80.0,82.0,40.0,59.0,27500000,0.9911
1,R. Mahrez,"RM, RW",84,78.0,80.0,81.0,88.0,39.0,63.0,20000000,0.9892
2,S. Benrahma,"LW, LM, CAM",77,75.0,75.0,74.0,80.0,42.0,61.0,10500000,0.986
3,Ivi López,"CAM, ST, LW",73,73.0,72.0,72.0,75.0,40.0,66.0,2500000,0.9857
4,P. Dybala,"CAM, ST",86,80.0,85.0,84.0,87.0,41.0,64.0,56500000,0.9853


## Thoughts

#### Quick note on the brief quality checks completed and the recommended feature selection. 

Short explanation: Select detailed attributes only


Longer explanation: I checked each feature-selection option for the quality of its player suggestions. I looked at outfield players who were average overall but excelled in a few key attributes. These strengths were best taken into account when only skill attributes were selected. For goalkeepers, a key differentiator in real life is whether they can pass the ball well. Again, using only skill attributes produces a matching engine that best picks up these key differences.

# Appendix

#### Check which features contribute the most to the Principal Components

In [283]:
# PCA loadings: one row per feature, one column per principal component
# (components_ is transposed so that features become the rows)
component_weights = pca_attributes.components_.T
loadings = pd.DataFrame(component_weights, index=X_attributes.columns)

# For each of the first three components, rank features by absolute loading
# and keep the ten largest
top_features_pc1, top_features_pc2, top_features_pc3 = (
    loadings[component].abs().sort_values(ascending=False).head(10)
    for component in range(3)
)


In [284]:
# Heading and ranked features on separate lines
print("Top 10 features contributing to PC1:", top_features_pc1, sep="\n")


Top 10 features contributing to PC1:
skill_dribbling            0.219746
mentality_positioning      0.212253
skill_ball_control         0.209270
power_long_shots           0.209129
attacking_crossing         0.201012
skill_curve                0.200872
goalkeeping_reflexes       0.197293
attacking_finishing        0.195224
goalkeeping_diving         0.194586
goalkeeping_positioning    0.188853
Name: 0, dtype: float64


In [285]:
# Heading and ranked features on separate lines
print("Top 10 features contributing to PC2:", top_features_pc2, sep="\n")


Top 10 features contributing to PC2:
defending_sliding_tackle       0.389264
defending_standing_tackle      0.382208
mentality_interceptions        0.374323
defending_marking_awareness    0.364459
attacking_finishing            0.247276
power_long_shots               0.200681
attacking_volleys              0.196468
mentality_aggression           0.190234
power_shot_power               0.177083
mentality_positioning          0.174197
Name: 1, dtype: float64


In [286]:
# Heading and ranked features on separate lines
print("Top 10 features contributing to PC3:", top_features_pc3, sep="\n")

Top 10 features contributing to PC3:
movement_reactions         0.327516
power_jumping              0.293875
power_shot_power           0.287840
power_strength             0.281454
goalkeeping_reflexes       0.269546
goalkeeping_diving         0.264080
goalkeeping_positioning    0.262227
goalkeeping_handling       0.256147
goalkeeping_kicking        0.250196
mentality_composure        0.225248
Name: 2, dtype: float64
