## Import Libraries

In [1]:
#Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
import plotly.express as px
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import manhattan_distances
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import pairwise_distances
from sklearn.decomposition import PCA
import plotly.express as px
from scipy.stats import chi2_contingency
import plotly.graph_objects as go

## Visualization Helper Function

In [56]:
# @title
def show_df_table(df, title="DataFrame Table", max_rows=None, player_id=None, original_df=None):
    """
    Display any pandas DataFrame as an interactive Plotly table.

    Parameters:
    - df: pandas DataFrame to render
    - title: str, title of the table
    - max_rows: int or None, number of leading rows to display (default all)
    - player_id: optional sofifa_id; when given together with original_df,
      that player's short_name is drawn in bold above the table
    - original_df: DataFrame with 'sofifa_id' and 'short_name' columns, used
      to resolve player_id to a name

    Returns:
    - None; the figure is rendered with fig.show()
    """
    if max_rows:
        df = df.head(max_rows)

    fig = go.Figure(data=[go.Table(
        header=dict(
            values=list(df.columns),
            fill_color='lightblue',
            align='center',
            font=dict(size=11, color='black')
        ),
        cells=dict(
            values=[df[col].tolist() for col in df.columns],
            fill_color='white',
            align='center',
            font=dict(size=11)
        )
    )])

    # Resolve the player's name only when it can actually be looked up; a
    # missing lookup frame or an unknown id simply skips the annotation
    # instead of raising while rendering.
    player_name = None
    if player_id is not None and original_df is not None:
        name_matches = original_df.loc[original_df['sofifa_id'] == player_id, 'short_name']
        if not name_matches.empty:
            player_name = name_matches.iloc[0]

    if player_name is not None:
        fig.add_annotation(
            text=f"<b>{player_name}</b>",
            x=0.5, y=1.15,              # position above the table
            xref="paper", yref="paper",
            showarrow=False,
            font=dict(size=16, color="black")
        )

    fig.update_layout(
        title_text=title,
        width=1400,
        height=450,
        margin=dict(t=120)  # increase top margin so name fits
    )
    fig.show()

## Initial preprocessing and data preparation

In [6]:
# @title
# Read data from Google Drive
from google.colab import drive
drive.mount('/content/drive')

file_path = '/content/drive/MyDrive/players_22.csv'

fifa_22 = pd.read_csv(file_path)

# Local alternative when the CSV sits next to the notebook:
#fifa_22 = pd.read_csv('players_22.csv')

# Per-position rating columns, asset URLs and the long name are not used below.
columns_to_remove = ['ls','st','rs','lw','lf','cf','rf','rw','lam','cam','ram','lm','lcm','cm','rcm','rm',
                     'lwb','ldm','cdm','rdm','rwb','lb','lcb','cb','rcb','rb','gk','player_face_url',
                     'club_logo_url','club_flag_url','nation_logo_url','nation_flag_url','player_url',
                     'long_name']
fifa_22.drop(columns_to_remove, axis=1, inplace=True)

# Split on whether 'GK' appears in the listed positions (missing positions count as non-GK).
gk_df = fifa_22.loc[fifa_22['player_positions'].str.contains('GK', case=False, na=False)]
fifa_22_without_gk = fifa_22.loc[~fifa_22['player_positions'].str.contains('GK', case=False, na=False)]


# @title
#fifa_22.columns.to_list()

# Keep only the columns used by the analysis. The explicit .copy() makes this
# an independent frame, so the column assignments that follow ('role', the
# *_bucket columns) never write into a flagged slice of fifa_22.
fifa_22_without_gk = fifa_22_without_gk[['sofifa_id','short_name','player_positions','overall','potential','value_eur',
         'wage_eur','age','height_cm','weight_kg','club_name','league_name','league_level','club_position',
         'nationality_name','preferred_foot','weak_foot','skill_moves','international_reputation','work_rate','body_type','pace','shooting',
         'passing','dribbling','defending','physic','attacking_crossing','attacking_finishing','attacking_heading_accuracy','attacking_short_passing',
         'attacking_volleys','skill_dribbling','skill_curve','skill_fk_accuracy','skill_long_passing','skill_ball_control',
         'movement_acceleration','movement_sprint_speed','movement_agility','movement_reactions','movement_balance',
         'power_shot_power','power_jumping','power_stamina','power_strength','power_long_shots','mentality_aggression',
         'mentality_interceptions','mentality_positioning','mentality_vision','mentality_penalties','mentality_composure',
         'defending_marking_awareness','defending_standing_tackle','defending_sliding_tackle']].copy()

def tag_player(positions_str):
    """
    Turn a comma-separated position string (e.g. "RW, ST, CF") into a role label.

    Positions are stripped and upper-cased before matching. A player whose
    positions all fall in one line is 'Pure_Forward', 'Pure_Mid' or
    'Pure_Defense'. Otherwise the label names the lines that are covered
    ('Forward/Mid', 'Defense/Mid', 'Forward/Defense', 'Forward/Mid/Defense').
    Anything else returns 'Skip/Other', e.g. an unknown code mixed with
    positions from a single line.
    """
    forward_line = {'ST', 'CF', 'RW', 'LW', 'RF', 'LF', 'SS'}
    midfield_line = {'CAM', 'CM', 'CDM', 'LM', 'RM'}
    defensive_line = {'CB', 'LB', 'RB', 'LWB', 'RWB'}

    listed = {token.strip().upper() for token in positions_str.split(',')}

    # Single-line players: every listed position belongs to the same line.
    pure_labels = (
        (forward_line, 'Pure_Forward'),
        (midfield_line, 'Pure_Mid'),
        (defensive_line, 'Pure_Defense'),
    )
    for line, label in pure_labels:
        if listed.issubset(line):
            return label

    # Multi-line players: keyed by (touches forward, touches midfield, touches defense).
    coverage = (
        not listed.isdisjoint(forward_line),
        not listed.isdisjoint(midfield_line),
        not listed.isdisjoint(defensive_line),
    )
    mixed_labels = {
        (True, True, False): 'Forward/Mid',
        (False, True, True): 'Defense/Mid',
        (True, False, True): 'Forward/Defense',
        (True, True, True): 'Forward/Mid/Defense',
    }
    return mixed_labels.get(coverage, 'Skip/Other')
# Label every outfield player with the role derived from their listed positions.
fifa_22_without_gk['role'] = fifa_22_without_gk['player_positions'].map(tag_player)

# @title
# Express market value in millions and wage in thousands of euros, then
# discard the raw euro columns.
fifa_22_without_gk = fifa_22_without_gk.assign(
    value_eur_M=fifa_22_without_gk['value_eur'] / 1e6,
    wage_eur_K=fifa_22_without_gk['wage_eur'] / 1e3,
).drop(columns=['value_eur', 'wage_eur'])

def create_dynamic_buckets(df, column, num_buckets=5):
    """
    Bucket a numeric column into equal-width ranges spanning its min..max.

    Parameters:
    - df: DataFrame holding the column
    - column: name of the numeric column to bucket
    - num_buckets: number of equal-width buckets (default 5)

    Returns:
    - Series of ordered categoricals labelled "low–high". Edges are printed
      with no decimals when that yields distinct labels; otherwise decimals
      are added until the labels are unique. A column with a single distinct
      value gets one bucket instead of failing on duplicate bin edges.
    """
    values = df[column]
    min_val = values.min()
    max_val = values.max()

    # Constant column: np.linspace would produce identical edges, which pd.cut rejects.
    if min_val == max_val:
        only_label = f"{float(min_val):.0f}–{float(max_val):.0f}"
        single_bucket = pd.CategoricalDtype([only_label], ordered=True)
        labelled = pd.Series(only_label, index=values.index, dtype=object, name=values.name)
        return labelled.where(values.notna()).astype(single_bucket)

    bins = np.linspace(min_val, max_val, num_buckets + 1)

    # Whole-number labels are kept whenever they are unambiguous. Narrow ranges
    # (e.g. 0..1) would round several edges to the same text, and pd.cut
    # requires unique labels, so precision is raised only as far as needed.
    for decimals in range(7):
        edge_text = [f"{edge:.{decimals}f}" for edge in bins]
        labels = [f"{low}–{high}" for low, high in zip(edge_text[:-1], edge_text[1:])]
        if len(set(labels)) == len(labels):
            break

    return pd.cut(
        values,
        bins=bins,
        labels=labels,
        right=True,
        include_lowest=True
    )
# Add equal-width bucket labels for rating, age, market value and wage.
fifa_22_without_gk = fifa_22_without_gk.assign(
    overall_bucket=lambda frame: create_dynamic_buckets(frame, 'overall'),
    age_bucket=lambda frame: create_dynamic_buckets(frame, 'age'),
    value_bucket=lambda frame: create_dynamic_buckets(frame, 'value_eur_M'),
    wage_bucket=lambda frame: create_dynamic_buckets(frame, 'wage_eur_K'),
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
# Preview the first rows of the prepared outfield-player frame.
fifa_22_without_gk.head()

Unnamed: 0,sofifa_id,short_name,player_positions,overall,potential,age,height_cm,weight_kg,club_name,league_name,league_level,club_position,nationality_name,preferred_foot,weak_foot,skill_moves,international_reputation,work_rate,body_type,pace,shooting,passing,dribbling,defending,physic,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,skill_dribbling,skill_curve,skill_fk_accuracy,skill_long_passing,skill_ball_control,movement_acceleration,movement_sprint_speed,movement_agility,movement_reactions,movement_balance,power_shot_power,power_jumping,power_stamina,power_strength,power_long_shots,mentality_aggression,mentality_interceptions,mentality_positioning,mentality_vision,mentality_penalties,mentality_composure,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,role,value_eur_M,wage_eur_K,overall_bucket,age_bucket,value_bucket,wage_bucket
0,158023,L. Messi,"RW, ST, CF",93,93,34,170,72,Paris Saint-Germain,French Ligue 1,1.0,RW,Argentina,Left,4,4,5,Medium/Low,Unique,85.0,92.0,91.0,95.0,34.0,65.0,85,95,70,91,88,96,93,94,91,96,91,80,91,94,95,86,68,72,69,94,44,40,93,95,75,96,20,35,24,Pure_Forward,78.0,320.0,84–93,31–39,78–116,280–350
1,188545,R. Lewandowski,ST,92,92,32,185,81,FC Bayern München,German 1. Bundesliga,1.0,ST,Poland,Right,4,4,5,High/Medium,Unique,78.0,92.0,79.0,86.0,44.0,82.0,71,95,90,85,89,85,79,85,70,88,77,79,77,93,82,90,85,76,86,87,81,49,95,81,90,88,35,42,19,Pure_Forward,119.5,270.0,84–93,31–39,116–155,210–280
2,20801,Cristiano Ronaldo,"ST, LW",91,91,36,187,83,Manchester United,English Premier League,1.0,ST,Portugal,Right,4,5,5,High/Low,Unique,87.0,94.0,80.0,88.0,34.0,75.0,87,95,90,80,86,88,81,84,77,88,85,88,86,94,74,94,95,77,77,93,63,29,95,76,88,95,24,32,24,Pure_Forward,45.0,270.0,84–93,31–39,39–78,210–280
3,190871,Neymar Jr,"LW, CAM",91,91,29,175,68,Paris Saint-Germain,French Ligue 1,1.0,LW,Brazil,Right,5,5,5,High/Medium,Unique,91.0,83.0,86.0,94.0,37.0,63.0,85,83,63,86,86,95,88,87,81,95,93,89,96,89,84,80,64,81,53,81,63,37,86,90,93,93,35,32,29,Forward/Mid,129.0,270.0,84–93,24–31,116–155,210–280
4,192985,K. De Bruyne,"CM, CAM",91,91,30,181,70,Manchester City,English Premier League,1.0,RCM,Belgium,Right,5,4,4,High/High,Unique,76.0,86.0,93.0,88.0,64.0,78.0,94,82,55,94,82,88,85,83,93,91,76,76,79,91,78,91,63,89,74,91,76,66,88,94,83,89,68,65,53,Pure_Mid,125.5,350.0,84–93,24–31,116–155,280–350


## Creating the Role Feature Map

In [9]:
# @title
# Columns used per role label. Each list starts with the identifier columns
# ('sofifa_id', 'short_name'); the similarity functions strip those (and, for
# correlation, the categorical columns) before building a feature matrix.
# Every column appears at most once per role so no attribute is double-weighted.
ROLE_FEATURE_MAP = {
    "Pure_Forward": [
        'sofifa_id','short_name','overall','age','height_cm',
        'weight_kg','preferred_foot','weak_foot','skill_moves','work_rate','pace',
        'shooting','passing','dribbling','physic','attacking_crossing',
        'attacking_finishing','attacking_heading_accuracy','attacking_short_passing',
        'attacking_volleys','movement_acceleration','movement_sprint_speed'
    ],

    "Forward/Mid": [
        'sofifa_id','short_name','overall','age','height_cm',
        'weight_kg','preferred_foot','weak_foot','skill_moves','work_rate','pace',
        'shooting','passing','dribbling','attacking_crossing','attacking_finishing',
        'attacking_heading_accuracy','attacking_short_passing','attacking_volleys',
        'skill_dribbling','skill_fk_accuracy','skill_long_passing','skill_ball_control',
        'movement_acceleration','movement_sprint_speed','movement_agility','power_shot_power',
        'power_stamina','power_long_shots','mentality_positioning','mentality_vision',
        'mentality_penalties','mentality_composure'
    ],

    "Forward/Defense": [
        'sofifa_id','short_name','overall','age','height_cm',
        'weight_kg','preferred_foot','skill_moves','work_rate','pace','passing','dribbling',
        'defending','physic','attacking_crossing','attacking_short_passing','skill_dribbling',
        'skill_long_passing','skill_ball_control','movement_acceleration','movement_sprint_speed',
        'movement_agility','movement_reactions','movement_balance','power_stamina','power_strength',
        'mentality_aggression','mentality_interceptions','mentality_positioning','mentality_vision',
        'mentality_composure','defending_marking_awareness','defending_standing_tackle',
        'defending_sliding_tackle'
    ],

    "Pure_Mid": [
        'sofifa_id','short_name','overall','age','height_cm',
        'weight_kg','preferred_foot','skill_moves','work_rate','shooting','passing','dribbling',
        'defending','physic','attacking_crossing','attacking_finishing','attacking_short_passing',
        'attacking_volleys','skill_dribbling','skill_curve','skill_fk_accuracy','skill_long_passing',
        'skill_ball_control','movement_agility','movement_reactions','movement_balance',
        'power_shot_power','power_stamina','power_strength','power_long_shots',
        'mentality_aggression','mentality_interceptions','mentality_positioning',
        'mentality_vision','mentality_penalties','mentality_composure'
    ],

    "Defense/Mid": [
        'sofifa_id','short_name','overall','age','height_cm',
        'weight_kg','preferred_foot','work_rate','pace','passing','dribbling','defending','physic',
        'skill_dribbling','skill_long_passing','skill_ball_control','movement_agility',
        'movement_reactions','movement_balance','power_jumping','power_stamina','power_strength',
        'mentality_aggression','mentality_interceptions','mentality_positioning',
        'mentality_vision','mentality_composure','defending_marking_awareness',
        'defending_standing_tackle','defending_sliding_tackle'
    ],

    "Pure_Defense": [
        'sofifa_id','short_name','overall','age','height_cm','weight_kg',
        'preferred_foot','work_rate','passing','defending','physic','skill_long_passing',
        'movement_agility','movement_reactions','movement_balance','power_jumping','power_stamina',
        'power_strength','mentality_aggression','mentality_interceptions','mentality_positioning',
        'mentality_composure','defending_marking_awareness','defending_standing_tackle',
        'defending_sliding_tackle'
    ],

    "Forward/Mid/Defense": [
        'sofifa_id','short_name','overall','age','height_cm','weight_kg',
        'preferred_foot','work_rate','weak_foot','skill_moves','pace','shooting',
        'passing','dribbling','defending','physic','attacking_crossing','skill_dribbling',
        'movement_acceleration','movement_sprint_speed','power_stamina','power_strength',
        'skill_long_passing','mentality_aggression','mentality_interceptions','mentality_positioning',
        'mentality_vision','mentality_composure','defending_marking_awareness',
        'defending_standing_tackle','defending_sliding_tackle'
    ]
}


## Initial Analysis based on Overall, Age, Role, Value and Wages.

In [10]:
# @title
# Identity, rating, role, league and bucket columns used by the non-ML analyses.
non_ml_df = fifa_22_without_gk.loc[:, [
    'sofifa_id', 'short_name', 'overall', 'role', 'club_position', 'age', 'league_name',
    'overall_bucket', 'age_bucket', 'value_bucket', 'wage_bucket',
]]

def attribute_based_best_players(df, column, id_column):
    """
    Plot how many players fall into each value of `column`, then show a table
    with the five highest-'overall' players for every value.

    Parameters:
    - df: DataFrame with `column`, `id_column`, 'overall' and 'short_name'
    - column: categorical column to group by
    - id_column: column counted per group for the bar chart

    Returns whatever show_df_table returns for the "Top Winners by Category" table.
    """
    # Bar chart: number of rows per category.
    counts_per_category = df.groupby(column)[id_column].count().reset_index()
    px.bar(
        counts_per_category,
        x=column,
        y=id_column,
        title=f'{column} Distribution',
        width=1200,
        height=600,
        text=id_column,
        labels={id_column: 'Count'},
    ).show()

    # One table column per category, holding "name (overall)" for its top five.
    top_by_category = {}
    for category in df[column].unique():
        members = df[df[column] == category]
        leaders = members.sort_values(by='overall', ascending=False).head(5).reset_index(drop=True)
        top_by_category[category] = leaders['short_name'] + " (" + leaders['overall'].astype(str) + ")"

    return show_df_table(pd.DataFrame(top_by_category), "Top Winners by Category")


In [11]:
# Distribution and top-five table per wage bucket.
attribute_based_best_players(df=non_ml_df, column='wage_bucket', id_column='sofifa_id')

## Deviation Based Similarity

In [15]:
# @title
def similar_players_by_deviation(
        df,
        player_id,
        column,
        deviation_pct=None,
        same_role=False,
        same_league=False
    ):
    """
    Find players whose value in a numeric column lies within a relative
    deviation of the target player's value.

    Parameters:
    - df: DataFrame to search; needs 'sofifa_id' and `column`, plus 'role' /
      'league_name' when the corresponding filter is enabled
    - player_id: target player's sofifa_id
    - column: numeric column to compare
    - deviation_pct: allowed relative deviation as a fraction (0.05 = ±5%);
      if None, 0.02 is used for 'age' and 0.05 for any other column
    - same_role: keep only players with the target's role if True
    - same_league: keep only players from the target's league if True

    Returns:
    - A copy of the matching rows (target excluded), sorted by `column` in
      descending order, or None (after printing a message) when player_id is
      not present in df.
    """
    # --- 1. Locate the player ---
    player_row = df[df['sofifa_id'] == player_id]
    if player_row.empty:
        print(f"Player with ID {player_id} not found.")
        return None

    target = player_row.iloc[0]
    player_value = target[column]

    # --- 2. Automatically set deviation ---
    if deviation_pct is None:
        deviation_pct = 0.02 if column == "age" else 0.05

    # --- 3. Compute range ---
    # Order the two bounds explicitly: for a negative target value the
    # "minus deviation" bound is the larger one, and a naive lower/upper pair
    # would describe an empty interval.
    bound_a = player_value * (1 - deviation_pct)
    bound_b = player_value * (1 + deviation_pct)
    lower, upper = min(bound_a, bound_b), max(bound_a, bound_b)

    in_range = (df[column] >= lower) & (df[column] <= upper)
    result = df[in_range & (df["sofifa_id"] != player_id)].copy()

    # --- 4. Optional filters (role / league are only read when requested) ---
    if same_role:
        result = result[result["role"] == target["role"]]
    if same_league:
        result = result[result["league_name"] == target["league_name"]]

    # --- 5. Sort ---
    return result.sort_values(by=column, ascending=False)


In [16]:
# Players whose overall rating lies within ±1% of the target player's (no role/league filter).
deviation_based = similar_players_by_deviation(non_ml_df, 192985, 'overall', 0.01, False, False)
# Title names the method that produced the table (it previously said "Correlation").
show_df_table(deviation_based,"Deviation based overall results",player_id=192985,original_df=fifa_22_without_gk)

## Correlation Based Similarity

In [31]:
# @title
def correlation_similarity(df, player_id, top_n=5):
    """
    Rank players by the Pearson correlation between their attribute profile
    and the target player's, using the feature list of the target's role.

    Parameters:
    - df: DataFrame with 'sofifa_id', 'short_name', 'overall', 'role' and the
      numeric columns listed in ROLE_FEATURE_MAP for the target's role
    - player_id: target player's sofifa_id
    - top_n: number of rows to return in each result

    Returns (result_same_role, result_overall):
    - two DataFrames with columns sofifa_id, short_name, overall, correlation,
      sorted by correlation descending; the first is restricted to players
      sharing the target's role, the second covers every other player.
      The target player is never part of either result.

    Raises:
    - ValueError if player_id is not in df or the player's role has no entry
      in ROLE_FEATURE_MAP.
    """
    # Identifier and categorical columns cannot take part in a correlation.
    excluded_cols = {
        "sofifa_id", "short_name", "role",
        'preferred_foot', 'weak_foot', 'skill_moves', 'work_rate',
    }

    player_rows = df[df["sofifa_id"] == player_id]
    if player_rows.empty:
        raise ValueError(f"Player ID {player_id} not found.")

    role = player_rows.iloc[0]["role"]
    if role not in ROLE_FEATURE_MAP:
        raise ValueError(f"Role '{role}' is not mapped to any feature list in ROLE_FEATURE_MAP.")

    feature_cols = [col for col in ROLE_FEATURE_MAP[role] if col not in excluded_cols]
    player_vector = player_rows.iloc[0][feature_cols].astype(float)

    # Correlate every other player once; the same-role ranking is a subset of
    # these values, so there is no need to compute them a second time.
    others = df[df["sofifa_id"] != player_id]
    correlations = others[feature_cols].astype(float).apply(
        lambda row: row.corr(player_vector), axis=1
    )

    def _top_matches(candidates):
        # Attach the aligned correlations to a fresh frame and keep the best top_n.
        return (
            candidates[["sofifa_id", "short_name", "overall"]]
            .assign(correlation=correlations.loc[candidates.index])
            .sort_values("correlation", ascending=False)
            .head(top_n)
            .reset_index(drop=True)
        )

    result_same_role = _top_matches(others[others["role"] == role])
    result_overall = _top_matches(others)
    return result_same_role, result_overall





In [32]:
# Correlation-based matches for player 158023: same-role ranking first, then all players.
correlation_same_role_based, correlation_overall_based = correlation_similarity(
    df=fifa_22_without_gk,
    player_id=158023,
)
show_df_table(
    correlation_same_role_based,
    title="Correlation for same role results",
    player_id=158023,
    original_df=fifa_22_without_gk,
)
show_df_table(
    correlation_overall_based,
    title="Correlation overall results",
    player_id=158023,
    original_df=fifa_22_without_gk,
)


## Distance based Similarities

In [52]:
# @title
# Descriptive / categorical columns that must not enter a distance computation.
cols_to_drop = [
    'player_positions','value_eur_M','short_name','wage_eur_K','body_type','work_rate',
    'international_reputation','league_level','club_name','league_name','club_position',
    'nationality_name','overall_bucket','age_bucket','value_bucket','wage_bucket'
]

# Numeric attribute matrix indexed by sofifa_id; 'role' is kept for filtering
# and preferred foot is encoded as Left=1 / Right=0.
fifa_22_without_gk_dist = (
    fifa_22_without_gk
    .drop(columns=cols_to_drop)
    .assign(preferred_foot=lambda df: df['preferred_foot'].map({'Left': 1, 'Right': 0}))
    .set_index('sofifa_id')
)

def get_role(df, role):
    """Return the rows of df whose 'role' equals `role`, without the 'role' column."""
    return df.query("role == @role").drop(columns="role")

# The labels must match the strings produced by tag_player, which uses an
# underscore in the pure roles ('Pure_Forward', not 'Pure Forward'); with a
# space the first three subsets would always be empty.
pure_forward          = get_role(fifa_22_without_gk_dist, 'Pure_Forward')
pure_mid              = get_role(fifa_22_without_gk_dist, 'Pure_Mid')
pure_defense          = get_role(fifa_22_without_gk_dist, 'Pure_Defense')
forward_mid           = get_role(fifa_22_without_gk_dist, 'Forward/Mid')
defense_mid           = get_role(fifa_22_without_gk_dist, 'Defense/Mid')
forward_defense       = get_role(fifa_22_without_gk_dist, 'Forward/Defense')
forward_mid_defense   = get_role(fifa_22_without_gk_dist, 'Forward/Mid/Defense')

def compute_distance(metric, X, target_vec):
    """
    Distance from every row of X to target_vec under the chosen metric.

    `metric` is one of 'euclidean', 'manhattan' or 'cosine'; anything else
    raises ValueError. target_vec is reshaped to a single row, so the result
    has one column. For 'cosine' the value is 1 - cosine similarity, so that
    smaller still means more similar.
    """
    if metric not in ("euclidean", "manhattan", "cosine"):
        raise ValueError("Invalid metric (choose: 'euclidean', 'manhattan', 'cosine')")

    query_row = target_vec.reshape(1, -1)

    if metric == "cosine":
        return 1 - cosine_similarity(X, query_row)

    pairwise = euclidean_distances if metric == "euclidean" else manhattan_distances
    return pairwise(X, query_row)

def player_similarity_combined(player_id, metric="euclidean", role_specific=False):
    """
    Rank the players closest to a target player on the attribute matrix.

    Reads the module-level frames `fifa_22_without_gk` (names and overall) and
    `fifa_22_without_gk_dist` (numeric attributes indexed by sofifa_id, plus
    the 'role' column).

    Parameters:
    - player_id: sofifa_id of the target player
    - metric: 'euclidean', 'manhattan' or 'cosine'; selects the detailed view
    - role_specific: if True, only players with the target's role label are
      candidates; if False (default), every player is a candidate

    Returns (table_view, detailed_df):
    - table_view: the 5 nearest players for each of the three metrics, with
      MultiIndex columns (metric, 'short_name' / 'distance')
    - detailed_df: the 5 nearest players for `metric` with columns
      sofifa_id, short_name, overall, distance

    Raises:
    - ValueError for a missing required column, an unknown player_id, or an
      unsupported metric.
    """
    # Required columns in original FIFA DataFrame
    required_cols = ["sofifa_id", "short_name", "overall", "value_eur_M"]
    for col in required_cols:
        if col not in fifa_22_without_gk.columns:
            raise ValueError(f"Column '{col}' not found in fifa_22_without_gk")

    if player_id not in fifa_22_without_gk_dist.index:
        raise ValueError(f"Player ID {player_id} not found.")

    # Candidate pool: filter on the stored role label directly, so the lookup
    # cannot silently miss because of a differently spelled role key.
    candidates = fifa_22_without_gk_dist
    if role_specific:
        target_role = candidates.loc[player_id, "role"]
        candidates = candidates[candidates["role"] == target_role]

    X = candidates.drop(columns="role", errors="ignore")
    target_vec = X.loc[player_id].to_numpy(dtype=float)

    # Display columns, looked up by sofifa_id after the ranking is done.
    display_info = fifa_22_without_gk.set_index("sofifa_id")[["short_name", "overall"]]

    def _closest(metric_name):
        # Five nearest players (target removed) for one metric, nearest first.
        distances = pd.Series(
            compute_distance(metric_name, X, target_vec).flatten(),
            index=X.index,
            name="distance",
        )
        nearest = distances.drop(player_id).nsmallest(5)
        return (
            display_info.loc[nearest.index]
            .assign(distance=nearest)
            .rename_axis("sofifa_id")
            .reset_index()
        )

    # ----- 1) Table view: top 5 players for all distance metrics -----
    per_metric = {m: _closest(m) for m in ("euclidean", "manhattan", "cosine")}
    table_view = pd.concat(
        {m: ranked[["short_name", "distance"]] for m, ranked in per_metric.items()},
        axis=1,
    )

    # ----- 2) Detailed view: top 5 players for selected metric -----
    # Reuse the ranking computed above; an unsupported metric falls through to
    # compute_distance, which raises ValueError.
    ranked_for_metric = per_metric[metric] if metric in per_metric else _closest(metric)
    detailed_df = ranked_for_metric[["sofifa_id", "short_name", "overall", "distance"]]

    return table_view, detailed_df


In [51]:
# @title
# Distance-based matches for player 192985: metric comparison table, then the Euclidean detail.
table_view, detailed_view = player_similarity_combined(player_id=192985, metric="euclidean")
show_df_table(
    table_view,
    title="Distance Metric Comparison Result",
    player_id=192985,
    original_df=fifa_22_without_gk,
)
show_df_table(
    detailed_view,
    title="Detailed View for a Distance Metric",
    player_id=192985,
    original_df=fifa_22_without_gk,
)

## ML Algos Based Similarities

### KMeans Clustering

#### Role Based Clustering

In [21]:
# @title
def similar_players_by_cluster_role_based(
    df,
    player_id,
    n_clusters=10,
    top_n=5,
    random_state=42
):
    """
    Cluster the players that share the target's role with KMeans and return
    the closest members of the target's cluster.

    Parameters:
    - df: DataFrame with 'sofifa_id', 'short_name', 'overall', 'role' and the
      columns listed in ROLE_FEATURE_MAP for the target's role
    - player_id: target player's sofifa_id
    - n_clusters: requested number of clusters (capped at the number of players in the role)
    - top_n: number of rows to return (passed to DataFrame.head)
    - random_state: seed forwarded to KMeans

    Returns:
    - DataFrame with sofifa_id, short_name, overall, distance (Euclidean
      distance in standardized feature space), nearest first, target excluded.

    Raises:
    - ValueError if the player is not found, the role has no entry in
      ROLE_FEATURE_MAP, or a required feature column is missing.
    """
    # 1) locate player
    player_row = df[df["sofifa_id"] == player_id]
    if player_row.empty:
        raise ValueError(f"Player ID {player_id} not found.")

    role = player_row.iloc[0]["role"]

    # 2) feature list for this role
    if role not in ROLE_FEATURE_MAP:
        raise ValueError(f"Role '{role}' is not mapped to any feature list in ROLE_FEATURE_MAP.")
    feature_cols = ROLE_FEATURE_MAP[role]

    # 3) subset to this role; a fresh 0..n-1 index keeps row labels aligned
    #    with the rows of the scaled matrix used below
    role_df = df[df["role"] == role].reset_index(drop=True)

    # 4) ensure required columns exist
    missing = [c for c in feature_cols if c not in role_df.columns]
    if missing:
        raise ValueError(f"Missing required columns for role group '{role}': {missing}")

    # 5) encode categorical columns (preferred_foot, work_rate) as integer codes
    for c in ("preferred_foot", "work_rate"):
        if c in role_df.columns:
            role_df[c] = LabelEncoder().fit_transform(role_df[c].astype(str))

    # 6) build X matrix: drop id/name columns and any repeated feature name,
    #    so no attribute is weighted twice
    non_feature_cols = {"sofifa_id", "short_name", "role", "cluster"}
    X_cols = list(dict.fromkeys(c for c in feature_cols if c not in non_feature_cols))

    # 7) scale
    X_scaled = StandardScaler().fit_transform(role_df[X_cols].values)

    # 8) cluster
    kmeans = KMeans(n_clusters=min(n_clusters, len(role_df)), random_state=random_state)
    role_df["cluster"] = kmeans.fit_predict(X_scaled)

    # 9) members of the target's cluster
    player_idx = role_df.index[role_df["sofifa_id"] == player_id][0]
    player_cluster = role_df.at[player_idx, "cluster"]
    cluster_members = role_df[role_df["cluster"] == player_cluster].copy()

    # 10) distance of each member to the target in scaled space
    player_vec = X_scaled[player_idx].reshape(1, -1)
    cluster_members["distance"] = euclidean_distances(
        X_scaled[cluster_members.index], player_vec
    ).flatten()

    # 11) sort + return
    return (
        cluster_members[cluster_members["sofifa_id"] != player_id]
            .sort_values("distance")
            [["sofifa_id", "short_name", "overall", "distance"]]
            .head(top_n)
            .reset_index(drop=True)
    )

In [22]:
# Role-restricted KMeans matches for player 190871; all cluster members are
# returned (top_n=None) and the table shows the first five.
kmeans_similar_role_based = similar_players_by_cluster_role_based(
    fifa_22_without_gk,
    player_id=190871,
    n_clusters=10,
    top_n=None,
)
show_df_table(
    kmeans_similar_role_based,
    title="Kmeans Similarity Results",
    max_rows=5,
    player_id=190871,
    original_df=fifa_22_without_gk,
)

#### Global Clustering

In [23]:
# Identity columns plus the headline attributes used for role-agnostic clustering.
global_clustering_df = fifa_22_without_gk.loc[:, [
    'sofifa_id', 'short_name', 'role', 'overall', 'age', 'height_cm', 'weight_kg',
    'preferred_foot', 'work_rate', 'pace', 'shooting', 'passing', 'dribbling',
    'defending', 'physic',
]]

# @title
def similar_players_global(
    df,                    # must include sofifa_id, short_name, overall
    player_id,
    n_clusters=10,
    top_n=5,
    random_state=42
):
    """
    Global version (NO ROLE FILTERING):
    - label-encode preferred_foot / work_rate and standardize every feature
      column for ALL players
    - run KMeans
    - find the player's cluster
    - return the top_n closest players inside that cluster

    Every column except sofifa_id, short_name, role and cluster is used as a
    feature. The caller's DataFrame is left untouched; all encoding happens
    on an internal copy.

    Returns: DataFrame with sofifa_id, short_name, overall, distance

    Raises: ValueError if the player is not found, no feature columns remain,
    or the player is alone in its cluster.
    """

    # ---------------------------
    # 1) Find player row
    # ---------------------------
    if df[df["sofifa_id"] == player_id].empty:
        raise ValueError(f"Player ID {player_id} not found.")

    # ---------------------------
    # 2) Prepare feature matrix
    # ---------------------------
    drop_cols = ["sofifa_id", "short_name", "role", "cluster"]
    feature_cols = [c for c in df.columns if c not in drop_cols]

    if not feature_cols:
        raise ValueError("No feature columns available for clustering.")

    # Work on a private, 0..n-1 indexed copy: encoding the categorical columns
    # directly on `df` would overwrite the caller's data.
    players = df.reset_index(drop=True).copy()
    for c in ("preferred_foot", "work_rate"):
        if c in players.columns:
            players[c] = LabelEncoder().fit_transform(players[c].astype(str))

    # ---------------------------
    # 3) Scale and cluster
    # ---------------------------
    X_scaled = StandardScaler().fit_transform(players[feature_cols].values)

    kmeans = KMeans(
        n_clusters=min(n_clusters, len(players)),
        random_state=random_state
    )
    players["cluster"] = kmeans.fit_predict(X_scaled)

    # ---------------------------
    # 4) Find player's index and cluster
    # ---------------------------
    player_idx = players.index[players["sofifa_id"] == player_id][0]
    player_cluster = players.at[player_idx, "cluster"]

    # ---------------------------
    # 5) Get same-cluster players
    # ---------------------------
    cluster_members = players[players["cluster"] == player_cluster].copy()

    if cluster_members.shape[0] <= 1:
        raise ValueError("No other players exist in the same cluster.")

    # distance of each member to the target in scaled space
    player_vec_scaled = X_scaled[player_idx].reshape(1, -1)
    cluster_members["distance"] = euclidean_distances(
        X_scaled[cluster_members.index], player_vec_scaled
    ).flatten()

    # ---------------------------
    # 6) Get top_n closest players except target
    # ---------------------------
    return (
        cluster_members[cluster_members["sofifa_id"] != player_id]
        .sort_values("distance")
        [["sofifa_id", "short_name", "overall", "distance"]]
        .head(top_n)
        .reset_index(drop=True)
    )


In [24]:
# Role-agnostic KMeans matches for player 20801.
cluster_all = similar_players_global(
    global_clustering_df,
    player_id=20801,
    n_clusters=10,
    top_n=5,
)
show_df_table(
    cluster_all,
    title="Kmeans Similarity Results for all",
    max_rows=None,
    player_id=20801,
    original_df=fifa_22_without_gk,
)

### KNN + PCA

In [25]:
# @title
def similar_players_pca(
    df,
    player_id,
    top_n=5,
    n_components=5
):
    """
    Nearest neighbours of a player among same-role players, measured in a
    PCA-reduced, standardized feature space.

    Parameters:
    - df: DataFrame with 'sofifa_id', 'short_name', 'overall', 'role' and the
      columns listed in ROLE_FEATURE_MAP for the target's role
    - player_id: target player's sofifa_id
    - top_n: number of neighbours to return
    - n_components: requested PCA dimensionality (capped by the number of
      features and of players in the role)

    Returns:
    - DataFrame with sofifa_id, short_name, overall, distance, nearest first,
      target excluded.

    Raises:
    - ValueError if the player is not found or the role has no entry in
      ROLE_FEATURE_MAP.
    """
    # 1) locate player
    player_row = df[df["sofifa_id"] == player_id]
    if player_row.empty:
        raise ValueError("Player not found")

    role = player_row.iloc[0]["role"]
    if role not in ROLE_FEATURE_MAP:
        raise ValueError(f"Role '{role}' is not mapped to any feature list in ROLE_FEATURE_MAP.")

    # 2) select features (identifier columns removed, repeated names collapsed)
    feature_cols = list(dict.fromkeys(
        c for c in ROLE_FEATURE_MAP[role]
        if c not in ("sofifa_id", "short_name", "role")
    ))

    # The PCA matrix is addressed by row position, so the role subset gets a
    # fresh 0..n-1 index; with the original labels the target's label would
    # point at some other player's row (or past the end of the matrix).
    role_df = df[df["role"] == role].reset_index(drop=True)

    # 3) encode categorical
    for c in ("preferred_foot", "work_rate"):
        if c in role_df.columns:
            role_df[c] = LabelEncoder().fit_transform(role_df[c].astype(str))

    # 4) build feature matrix
    X_scaled = StandardScaler().fit_transform(role_df[feature_cols].values)

    # 5) fit PCA (cannot keep more components than features or samples)
    pca = PCA(n_components=min(n_components, *X_scaled.shape))
    X_pca = pca.fit_transform(X_scaled)

    # 6) KNN in PCA space; one extra neighbour because the target is its own
    #    nearest point, capped at the number of available players
    knn = NearestNeighbors(n_neighbors=min(top_n + 1, len(role_df)), metric="euclidean")
    knn.fit(X_pca)

    player_pos = int(np.flatnonzero(role_df["sofifa_id"].to_numpy() == player_id)[0])
    distances, indices = knn.kneighbors(X_pca[player_pos].reshape(1, -1))
    distances, indices = distances.flatten(), indices.flatten()

    # Remove the player itself, then trim in case it was not among the
    # returned neighbours (e.g. exact duplicates of its vector).
    keep = indices != player_pos
    indices, distances = indices[keep][:top_n], distances[keep][:top_n]

    # 7) build result
    neighbors = role_df.iloc[indices].copy()
    neighbors["distance"] = distances
    return neighbors[["sofifa_id", "short_name", "overall", "distance"]].reset_index(drop=True)

In [26]:
# PCA + KNN matches for player 192985; the table annotation must name the
# same player the neighbours were computed for.
pca_df = similar_players_pca(fifa_22_without_gk, player_id=192985, top_n=5, n_components=5)
show_df_table(pca_df, title="PCA+KNN Similarity Results for all", max_rows=None,original_df=fifa_22_without_gk,player_id=192985)

## Final Result

In [53]:
# Run every similarity method for the same target player (sofifa_id 192985).
deviation_based = similar_players_by_deviation(
    df=non_ml_df,
    player_id=192985,
    column='overall',
    deviation_pct=0.01,
    same_role=False,
    same_league=False,
)
correlation_same_role_based, correlation_overall_based = correlation_similarity(
    df=fifa_22_without_gk,
    player_id=192985,
)
# table_view is identical for the three calls; only the detailed view depends on the metric.
table_view, detailed_view_euc = player_similarity_combined(player_id=192985, metric="euclidean")
table_view, detailed_view_man = player_similarity_combined(player_id=192985, metric="manhattan")
table_view, detailed_view_cos = player_similarity_combined(player_id=192985, metric="cosine")
kmeans_similar_role_based = similar_players_by_cluster_role_based(
    df=fifa_22_without_gk,
    player_id=192985,
    n_clusters=10,
    top_n=5,
)
cluster_all = similar_players_global(
    df=global_clustering_df,
    player_id=192985,
    n_clusters=10,
    top_n=5,
)
pca_df = similar_players_pca(
    df=fifa_22_without_gk,
    player_id=192985,
    top_n=5,
    n_components=5,
)

In [54]:
# --- Base player data: identity, rating, club position and role, joined onto every metric row ---
df_base = fifa_22_without_gk.loc[
    :, ['sofifa_id', 'short_name', 'overall', 'club_position', 'role']
].copy()

# --- Helper to standardize metric DF shapes ---
def make_metric_df(df, id_col, value_col, metric_name):
    """
    Reduce a result frame to tidy (id, metric_value, metric_name) rows.

    Keeps `id_col` and `value_col`, renames the latter to "metric_value" and
    adds a constant "metric_name" column. The input frame is not modified.
    """
    selected = df[[id_col, value_col]]
    tidy = selected.rename(columns={value_col: "metric_value"})
    tidy["metric_name"] = metric_name
    return tidy


# ---------- Build metric DataFrames (long, tidy format) ----------
# One (result frame, value column, display label) entry per similarity method,
# in the order they should appear in the combined table.
# NOTE(review): the "Deviation" rows carry the categorical 'value_bucket' as
# their metric value while every other method contributes a number; confirm
# this is intended.
metric_sources = [
    (deviation_based, "value_bucket", "Deviation"),                          # 1. Deviation
    (correlation_same_role_based, "correlation", "Same Role Correlation"),   # 2. Same-role correlation
    (correlation_overall_based, "correlation", "Overall Correlation"),       # 3. Overall correlation
    (detailed_view_euc, "distance", "Euclidean Distance"),                   # 4. Euclidean
    (detailed_view_man, "distance", "Manhattan Distance"),                   # 5. Manhattan
    (detailed_view_cos, "distance", "Cosine Distance"),                      # 6. Cosine
    (kmeans_similar_role_based, "distance", "KMeans Similar Role"),          # 7. KMeans same-role
    (cluster_all, "distance", "KMeans Overall"),                             # 8. Global KMeans
    (pca_df, "distance", "PCA+KNN"),                                         # 9. PCA+KNN
]
metric_frames = [
    make_metric_df(source, "sofifa_id", value_col, label)
    for source, value_col, label in metric_sources
]


# ---------- Combine all metric rows into one long DF ----------
all_metrics_long = pd.concat(metric_frames, ignore_index=True)

# ---------- Attach base player info ----------
final_long = all_metrics_long.merge(df_base, on="sofifa_id", how="left")

# ---------- Drop rows whose metric value is missing ----------
final_long = final_long.dropna(subset=["metric_value"], how="all")
show_df_table(
    final_long,
    title="Overall Player Recommendation",
    max_rows=None,
    player_id=192985,
    original_df=fifa_22_without_gk,
)


## Development