This first part creates the binary encoding from the data. It reads the cluster data along with the aggregated data and goals.

In [1]:
import pandas as pd

# Season stats file with the "_relevant" suffix; read with pandas' default decoding.
data = pd.read_csv(
    "../../data/interim/player_season_stats_23-24_relevant.csv",
)

# Full season stats file, decoded as latin1; its Gls/Ast columns are aggregated below.
goal_data = pd.read_csv(
    '../../data/interim/player_season_stats_23-24.csv',
    sep=',',
    encoding="latin1",
)

Aggregating the goals and the player stats per player and storing them as CSV files

In [2]:
# Total goals and assists per player, summed over all of that player's rows.
# 'player' is the grouping key, so it is left out of the aggregation spec:
# as_index=False already returns the key as a regular column, and aggregating
# the key as well depends on pandas silently skipping the duplicate insert.
aggregated_goals = (
    goal_data
    .groupby('player', as_index=False)
    .agg({
        'Gls': 'sum',    # sum goals
        'Ast': 'sum',    # sum assists
    })
)

# Pin the column order explicitly so the CSV layout does not depend on where
# pandas places the group key.
aggregated_goals = aggregated_goals[['Gls', 'Ast', 'player']]

# Save aggregated goals to a new csv file
aggregated_goals.to_csv('../../data/processed/aggregated_goals.csv', index=False)

# Aggregate the season stats to one row per player.
group_key = 'player'

# Descriptive columns: the first value seen for each player is kept.
categorical_cols = ['league', 'season', 'team', 'player', 'nation', 'pos', 'age']

# Count-like columns: summed over a player's rows.
sum_numeric_cols = [
    'MP Playing Time', 'Min Playing Time', 'KP', 'PrgP', 'PPA', 'CrsPA',
    'Att Long', 'TB Pass Types', 'Sw Pass Types', 'Crs Pass Types',
    'PrgC Carries', 'CPA Carries', 'Succ Take-Ons', 'Dis Carries',
    'Mis Carries', 'PrgR Receiving', 'TklW Tackles', 'Att 3rd Tackles',
    'Mid 3rd Tackles', 'Def 3rd Tackles', 'Int', 'Sh Blocks',
    'Recov Performance', 'Won Aerial Duels', 'Fls Performance',
    'Fld Performance', 'Att Pen Touches', 'Att 3rd Touches',
    'Mid 3rd Touches', 'Def 3rd Touches', 'Def Pen Touches', 'Live Touches',
    'Cmp Total',
]

# Rate / percentage columns: plain (unweighted) mean over a player's rows.
average_cols = [
    '90s Playing Time', 'npxG Per 90 Minutes', 'G-PK Per 90 Minutes',
    'xAG Per 90 Minutes', 'xG+xAG Per 90 Minutes', 'Sh/90 Standard',
    'SoT% Standard', 'Dist Standard', 'G/Sh Standard', 'Cmp% Total',
    'Cmp% Long', 'Succ% Take-Ons', 'Won% Aerial Duels',
]

# The grouping key is excluded from the spec: as_index=False already returns it
# as a column, and aggregating the key too depends on pandas silently skipping
# the duplicate insert.
aggregation_spec = {
    **{col: 'first' for col in categorical_cols if col != group_key},
    **{col: 'sum' for col in sum_numeric_cols},
    **{col: 'mean' for col in average_cols},
}
aggregated_data = data.groupby(group_key, as_index=False).agg(aggregation_spec)

# Pin the column order: categorical columns, then sums, then averages.
aggregated_data = aggregated_data[categorical_cols + sum_numeric_cols + average_cols]

# Save aggregated data to a new csv file
aggregated_data.to_csv('../../data/processed/aggregated_data.csv', index=False)

Read the cluster data and merge everything for binary encoding

In [3]:

# Reload the per-player aggregates written by the previous cell, plus the cluster labels.
# NOTE(review): the two aggregated files are written with DataFrame.to_csv without an
# explicit encoding but are read back here as latin1, so non-ASCII player names may be
# garbled. The merges below match on the 'player' text, so confirm the encoding of all
# three files (including players_w_clusters.csv) before changing any of them.
data = pd.read_csv('../../data/processed/aggregated_data.csv', sep=',', encoding="latin1")
clustered_data = pd.read_csv('../../data/processed/players_w_clusters.csv', sep=',', encoding="latin1")
goal_data = pd.read_csv('../../data/processed/aggregated_goals.csv', sep=',', encoding="latin1")

# Attach the cluster label by player name (left join: unmatched players get NaN).
# NOTE(review): presumably players_w_clusters.csv has one row per player; duplicate
# names there would duplicate rows here - verify.
data = data.merge(clustered_data[['player', 'cluster']], on='player', how='left')

# Move 'cluster' from the end of the frame to column index 2.
cols = list(data.columns)
cols.insert(2, cols.pop(cols.index('cluster')))
data = data[cols]

# Attach goals and assists by player name in a single merge instead of two
# separate merges against the same frame on the same key.
data = data.merge(goal_data[['player', 'Gls', 'Ast']], on='player', how='left')

# Keep the final column order and a working copy for the filtering step.
headers = data.columns.tolist()
filtered_data = data.copy()


Filter the data to keep only the large clusters

In [4]:
# Number of players per cluster label
cluster_sizes = filtered_data.groupby('cluster').size()

# Keep only the rows whose cluster has at least 50 players
clusters_to_keep = cluster_sizes.loc[cluster_sizes.ge(50)].index
filtered_data = filtered_data.loc[filtered_data['cluster'].isin(clusters_to_keep)]

# Regroup the remaining rows by cluster; later cells iterate over this grouping
clustered_grouped_data = filtered_data.groupby('cluster')

# Report how many players each remaining cluster holds
for cluster, group in clustered_grouped_data:
    num_players = group.shape[0]
    print(f"Cluster: {cluster}, Number of players: {num_players}")


Cluster: 0.0, Number of players: 581
Cluster: 4.0, Number of players: 346
Cluster: 5.0, Number of players: 801
Cluster: 8.0, Number of players: 286
Cluster: 9.0, Number of players: 242
Cluster: 16.0, Number of players: 105


Transform into binary encoding

In [6]:
# Identifier / label columns. Every other column is treated as a stat to binarise.
# Selecting the stats by name (rather than by position in the header list) keeps
# this step correct if the column order of the merged frame ever changes; the
# list matches the columns the Apriori cell drops before mining.
id_columns = ['cluster', 'league', 'player', 'season', 'team', 'nation', 'pos', 'age']
stats = [col for col in filtered_data.columns if col not in id_columns]

# 90th-percentile threshold of each stat within each cluster:
# result[cluster][stat] -> threshold
result = {
    cluster: {stat: group[stat].quantile(0.90) for stat in stats}
    for cluster, group in clustered_grouped_data
}

# Binarise: 1 when a player's value reaches the cluster's threshold, else 0
# (missing values compare as False and therefore become 0).
# NOTE: when a cluster's threshold is 0, every player in it gets 1 for that stat,
# because the comparison is ">=".
binary_data = filtered_data.copy()
for cluster, thresholds in result.items():
    in_cluster = binary_data['cluster'] == cluster  # row mask, computed once per cluster
    for stat in stats:
        binary_data.loc[in_cluster, stat] = (
            binary_data.loc[in_cluster, stat] >= thresholds[stat]
        ).astype(int)

# Save the binary data to a new CSV file
binary_data.to_csv('../../data/processed/binary_encoded_player_stats_2.csv', index=False)

Import libraries for the analysis and read the binary-encoded data

In [23]:
from mlxtend.frequent_patterns import apriori, association_rules
import seaborn as sns
import matplotlib.pyplot as plt

# Read back the binary-encoded player stats written by the encoding step
bin_data = pd.read_csv(
    '../../data/processed/binary_encoded_player_stats_2.csv',
    sep=',',
    encoding="latin1",
)

# Accumulator for the per-cluster rule metrics: one independent list per field
cluster_stats = {field: [] for field in ('cluster', 'stat', 'lift', 'confidence')}

Calculate Dynamic Support 

In [24]:
# Split the binary-encoded rows by their cluster label
clustered = bin_data.groupby(by='cluster')

# Function to calculate dynamic min_support
def compute_dynamic_support(cluster_size, floor=0.03, min_count=5):
    """Return the ``min_support`` fraction to use for a cluster of ``cluster_size`` rows.

    The support is ``min_count / cluster_size`` (an itemset has to cover at least
    ``min_count`` rows), but never lower than ``floor`` and never higher than 1.0,
    because a support is a fraction of the rows.

    Parameters
    ----------
    cluster_size : int
        Number of rows in the cluster; must be positive.
    floor : float, default 0.03
        Lower bound for the returned support.
    min_count : int, default 5
        Minimum number of rows an itemset should cover.

    Raises
    ------
    ValueError
        If ``cluster_size`` is not positive.
    """
    if cluster_size <= 0:
        raise ValueError(f"cluster_size must be positive, got {cluster_size}")
    # Cap at 1.0: for clusters smaller than min_count the ratio would exceed 1.
    return min(1.0, max(floor, min_count / cluster_size))

# min_support per cluster, derived from the number of rows in each cluster.
# (The goal count and goal rate that used to be computed here were never used.)
dynamic_supports = {
    cluster: compute_dynamic_support(len(group))
    for cluster, group in clustered
}


APRIORI algorithm

In [None]:
# Start from an empty accumulator so that re-running this cell does not append the
# same rules a second time (duplicate (cluster, stat) pairs cannot be pivoted later).
cluster_stats = {'cluster': [], 'stat': [], 'lift': [], 'confidence': []}

# Identifier / label columns; everything else is an item for Apriori.
non_item_columns = [
    'cluster', 'league', 'player', 'season',
    'team', 'nation', 'pos', 'age'
]

for cluster, group in clustered:

    # If every player has Gls == 1 the consequent carries no information.
    if group['Gls'].mean() == 1:
        print(f"\nSkipping Cluster {cluster} - all players have Gls=1.")
        continue

    print(f"\n------- CLUSTER {cluster} -------")

    cluster_data = group.drop(columns=non_item_columns)
    support_threshold = dynamic_supports[cluster]

    # Apriori (must allow 2-item sets to get "stat -> Gls" rules)
    frequent_itemsets = apriori(
        cluster_data,
        min_support=support_threshold,
        use_colnames=True,
        max_len=2
    )

    # association_rules raises on an empty itemset frame, so stop here instead.
    if frequent_itemsets.empty:
        print("No frequent itemsets found for this cluster.")
        continue

    # Association rules
    rules = association_rules(
        frequent_itemsets,
        metric="confidence",
        min_threshold=0.3
    )

    if rules.empty:
        print("No rules generated for this cluster.")
        continue

    # Remove rules with lift < 1
    rules = rules[rules['lift'] >= 1]

    if rules.empty:
        print("No rules with lift >= 1.")
        continue

    # Keep only rules where consequent == {Gls}
    rules_to_goals = rules[
        rules['consequents'].apply(lambda x: len(x) == 1 and 'Gls' in x)
    ]

    if rules_to_goals.empty:
        print("No rules implying Gls.")
        continue

    # Keep only single-stat antecedents
    rules_to_goals = rules_to_goals[
        rules_to_goals['antecedents'].apply(lambda a: len(a) == 1)
    ]

    if rules_to_goals.empty:
        print("No single-stat rules implying Gls.")
        continue

    # Name of the single antecedent stat of each rule. A dedicated name is used so
    # the notebook-level `stats` list from the encoding step is not overwritten.
    antecedent_stats = [next(iter(a)) for a in rules_to_goals['antecedents']]
    rule_lifts = rules_to_goals['lift'].tolist()

    # Store the metrics for the heatmap
    cluster_stats['cluster'].extend([cluster] * len(antecedent_stats))
    cluster_stats['stat'].extend(antecedent_stats)
    cluster_stats['lift'].extend(rule_lifts)
    cluster_stats['confidence'].extend(rules_to_goals['confidence'].tolist())

    print("\nSingle-stat rules implying goals:")
    print(rules_to_goals[['antecedents','consequents','support','confidence','lift']])

    # Ranking of the stats by lift within this cluster
    lift_df = pd.DataFrame({'stat': antecedent_stats, 'lift': rule_lifts})
    lift_df = lift_df.groupby('stat').sum().sort_values('lift', ascending=False)

    print("\nStats ranked by lift (single-stat influence):")
    print(lift_df.head(10))


------- CLUSTER 0.0 -------
No meaningful goal-predicting single-stat rules found in this cluster.

------- CLUSTER 4.0 -------
No meaningful goal-predicting single-stat rules found in this cluster.

------- CLUSTER 5.0 -------
No meaningful goal-predicting single-stat rules found in this cluster.

------- CLUSTER 8.0 -------
No meaningful goal-predicting single-stat rules found in this cluster.

------- CLUSTER 9.0 -------
No meaningful goal-predicting single-stat rules found in this cluster.

------- CLUSTER 16.0 -------
No meaningful goal-predicting single-stat rules found in this cluster.




Visualization

In [None]:
# Create a DataFrame for cluster-wise stat metrics: one row per (cluster, stat) rule
cluster_stats_df = pd.DataFrame(cluster_stats)

if cluster_stats_df.empty:
    # With no collected rules there is nothing to pivot, and the heatmap call
    # cannot draw an empty table.
    print("No single-stat rules implying Gls were collected; nothing to plot.")
else:
    # Table where each row is a cluster and each column is a stat; cells hold the lift.
    # pivot_table (rather than pivot) tolerates repeated (cluster, stat) pairs;
    # pairs without a rule are shown as 0.
    pivot_table = cluster_stats_df.pivot_table(
        index='cluster', columns='stat', values='lift', aggfunc='max'
    ).fillna(0)

    # Plot heatmap
    fig, ax = plt.subplots(figsize=(12, 12))
    sns.heatmap(
        pivot_table,
        annot=True,
        cmap="YlGnBu",
        square=True,
        cbar_kws={"shrink": 0.8},
        ax=ax,
    )
    ax.set_title("Heatmap of Stat Influence (Lift) by Cluster")
    ax.set_xlabel("Stat")
    ax.set_ylabel("Cluster")
    fig.tight_layout()
    plt.show()