In [None]:
import polars as pl
import polars.selectors as cs
import soccerdata as sd
import matplotlib.pyplot as plt
import sklearn as skl
import pandas as ps
import numpy as np
import pyarrow
import seaborn as sb

In [None]:
fbref = sd.FBref(seasons=2025)
print(fbref.__doc__)

In [None]:
player_season_stats = fbref.read_player_season_stats(stat_type="standard")
player_season_stats.head()


In [None]:
df = fbref.read_player_season_stats(stat_type='defense')

# Access only the Per 90 columns
print(df.columns)

In [None]:
print(player_season_stats.columns)

In [None]:
fbref = sd.FBref(seasons = "2024-25")

defense = fbref.read_player_season_stats(stat_type="defense")
possesion = fbref.read_player_season_stats(stat_type="possession")
passing = fbref.read_player_season_stats(stat_type="passing")

In [None]:
s = fbref.read_player_season_stats(stat_type="standard")
s1 = pl.from_pandas(s, include_index=True)
progress = fbref.read_player_season_stats(stat_type="possession")
p1 = pl.from_pandas(progress, include_index=True)
creative = fbref.read_player_season_stats(stat_type="goal_shot_creation")
c1 = pl.from_pandas(creative, include_index=True)
defense = fbref.read_player_season_stats(stat_type="defense")
d1 = pl.from_pandas(defense, include_index=True)

S = s1.drop("season")
S = S.drop("('nation', '')")
S = S.drop("('born', '')")
S = S.drop("('age', '')")

S = S.drop([
    "('Playing Time', 'MP')",     
    "('Playing Time', 'Starts')",
    "('Playing Time', 'Min')"
])

D = d1.drop("season")
D = D.drop("('nation', '')")
D = D.drop("('born', '')")
D = D.drop("('age', '')")
D = D.drop("('90s', '')")

C = c1.drop("season")
C = C.drop("('nation', '')")
C = C.drop("('born', '')")
C = C.drop("('age', '')")
C = C.drop("('90s', '')")

P = p1.drop("season")
P = P.drop("('nation', '')")
P = P.drop("('born', '')")
P = P.drop("('age', '')")
P = P.drop("('90s', '')")



all_player = S.join(P, on=["league", "team", "player"], how="inner", suffix="_progress")
all_player = all_player.join(C, on=["league", "team", "player"], how="inner", suffix="_creative")
all_player = all_player.join(D, on=["league", "team", "player"], how="inner", suffix="_defense")
all_player = all_player.filter(
    (pl.col("('pos', '')").str.contains("MF")) | 
    (pl.col("('pos', '')").str.contains("FW"))
)
all_player = all_player.drop("('pos', '')_progress")
all_player = all_player.drop("('pos', '')_creative")
all_player = all_player.drop("('pos', '')_defense")
all_player = all_player.drop("('pos', '')")     
all_player = all_player.drop(cs.matches(r"^\('Per 90 Minutes'.*"))
all_player = all_player.fill_nan(0.0)


In [None]:
all_player

In [None]:
# ... (existing imports and exclusions)
# 1. Identify Numeric Columns to Normalize
# We exclude identifier columns AND any column that is already a percentage/ratio
exclude_cols = ["born", "season", "league", "team", "player"]

# Get all numeric columns first
numeric_cols_all = all_player.select(cs.numeric().exclude(exclude_cols)).columns

# Filter: Keep only columns that are NOT percentages and NOT the '90s' column itself
cols_to_normalize = [
    c for c in numeric_cols_all 
    if "90s" not in c 
    and "%" not in c 
    and "Pct" not in c
]

# 2. Normalize by '90s'
# Only divide the raw count columns (e.g., 'Goals', 'Passes'), leaving percentages alone
all_player = all_player.with_columns([
    (pl.col(col_name) / pl.col("('Playing Time', '90s')")).alias(col_name)
    for col_name in cols_to_normalize
])

# 3. Clean the Data: Handle NaN, Null, and Infinity
# This logic remains the same to handle cases where '90s' was 0
all_player = all_player.fill_nan(0.0).fill_null(0.0).with_columns([
    pl.when(pl.col(col_name).is_infinite())
      .then(0.0)
      .otherwise(pl.col(col_name))
      .alias(col_name)
    for col_name in numeric_cols_all  # Apply cleanup to ALL numeric columns, including percentages
])

print("Normalization complete. Percentages were preserved.")
print(all_player.head())

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Store identifier columns
identifier_cols = ["league", "team", "player"]
numeric_data = all_player.select(pl.col(pl.Float64, pl.Int64))

X_filtered = numeric_data.fill_null(0).to_numpy()

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_filtered)

# Cluster using the SCALED data
kmeans = KMeans(n_clusters=4, random_state=42)  # or n_clusters=2
kmeans.fit(X_scaled)  # Use X_scaled, not X_sorted! 
labels = kmeans.labels_

# Add labels back
all_players_with_clusters = all_player. with_columns(
    pl.Series(name="cluster", values=labels)
)

new = all_players_with_clusters.select(["player", "team", "cluster"]).filter(pl.col("player") == "Amad Diallo")
num = new.get_column("cluster")
new
new2 = all_players_with_clusters.select(["player", "team", "cluster"]).filter(pl.col("cluster") == num)
new2

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

wcss = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_scaled)
    wcss.append(kmeans.inertia_)

plt.plot(range(1, 11), wcss, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.title('Elbow Method')
plt.show()

In [None]:
from sklearn.metrics import silhouette_score

scores = []
for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(X_scaled)
    scores.append(silhouette_score(X_scaled, labels))

plt.plot(range(2, 11), scores, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.show()

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# 1. PCA Visualization
# PCA reduces the data to 2 dimensions while preserving global variance
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled) 

# Create plotting dataframe
plot_df = pd.DataFrame(X_pca, columns=['PC1', 'PC2'])
plot_df['cluster'] = labels
# Convert to string to ensure discrete coloring
plot_df['cluster'] = plot_df['cluster'].astype(str)

plt.figure(figsize=(10, 6))
sns.scatterplot(data=plot_df, x='PC1', y='PC2', hue='cluster', palette='tab10', s=70)
plt.title('Clusters Visualized (PCA)')
plt.show()

# 2. t-SNE Visualization 
# t-SNE is often better at preserving local clusters in high-dimensional data
print("Running t-SNE (this might take a moment)...")
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X_scaled)

plot_df['TSNE1'] = X_tsne[:, 0]
plot_df['TSNE2'] = X_tsne[:, 1]

plt.figure(figsize=(10, 6))
sns.scatterplot(data=plot_df, x='TSNE1', y='TSNE2', hue='cluster', palette='tab10', s=70)
plt.title('Clusters Visualized (t-SNE)')
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

target_cluster_id = num = new.get_column("cluster")
identifier_cols = ["league", "team", "player"]

sub_group = all_players_with_clusters.filter(pl.col("cluster") == target_cluster_id)
nums = sub_group.select(pl.col(pl.Float64, pl.Int64))
print(f"Sub-clustering {sub_group.height} players from Cluster {target_cluster_id}...")

scaler_sub = StandardScaler()
X_sub_scaled = scaler_sub.fit_transform(nums) 

kmeans_sub = KMeans(n_clusters=2, random_state=42)
sub_labels = kmeans_sub.fit_predict(X_sub_scaled)

sub_group = sub_group.with_columns(
    pl.Series(name="sub_cluster", values=sub_labels)
)

sub_group.columns

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

wcss = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_sub_scaled)
    wcss.append(kmeans.inertia_)

plt.plot(range(1, 11), wcss, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.title('Elbow Method')
plt.show()

In [None]:
from sklearn.metrics import silhouette_score

scores = []
for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(X_sub_scaled)
    scores.append(silhouette_score(X_sub_scaled, labels))

plt.plot(range(2, 11), scores, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.show()

In [23]:
key_stats = [
    "Goals", "Assists", "Shots", "Key Passes", 
    "Crosses", "Dribbles", "Touches (Att Pen)"
]

analysis = sub_group.group_by("sub_cluster").agg([
    pl.col(c).mean() for c in key_stats if c in sub_group.columns
])

print(analysis)

shape: (2, 1)
┌─────────────┐
│ sub_cluster │
│ ---         │
│ i32         │
╞═════════════╡
│ 0           │
│ 1           │
└─────────────┘
