In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the champion statistics CSV into the frame every later cell works from.
stats_csv_path = '../data_processing/champion_stats.csv'
df = pd.read_csv(stats_csv_path)

# Candidate clustering features, organised by theme. `features` is the flat
# list obtained by concatenating the groups in the order written here.
# NOTE(review): 'avg_ctrol_wards_placed' and 'avg_largest_killing_spee' look
# like misspellings. If the CSV header spells them differently they will be
# reported as missing and skipped by the availability check further down -
# confirm against the CSV.
feature_groups = {
    'Core Stats': [
        'avg_kills', 'avg_deaths', 'avg_assists', 'kda',
        'avg_kill_participation', 'avg_takedowns',
        'avg_deaths_by_enemy_champs',
        'winrate',
    ],
    'Damage Stats': [
        'avg_dmg_dealt_to_champions', 'avg_dmg_taken',
        'avg_magic_dmg_to_champs', 'avg_physical_dmg_to_champs', 'avg_true_dmg_to_champs',
        'avg_dmg_self_mitigated', 'damage_per_minute',
        'avg_largest_crit', 'avg_pct_damage_in_team', 'avg_dmg_taken_team_pct',
        'pct_highest_dmg_in_match',
    ],
    'Support/Utility Stats': [
        'avg_time_ccing_champs', 'avg_heals_on_teammate', 'avg_dmg_shielded_on_team',
        'avg_effective_heal_and_shield', 'avg_champ_immobilizations',
        'pct_highest_cc_in_match', 'avg_times_save_ally_from_death',
    ],
    'Economy Stats': [
        'avg_gold_earned_per_game', 'avg_gold_spent', 'avg_cs',
        'avg_neutral_monsters_cs', 'cs_per_minute', 'gold_per_minute',
        'avg_cs_10_mins', 'avg_jg_cs_before_10m',
        'avg_max_cs_over_lane_opp',
    ],
    'Vision Stats': [
        'avg_vision_score', 'avg_wards_placed', 'avg_wards_killed', 'avg_ctrl_wards_bought',
        'avg_ctrol_wards_placed', 'avg_vision_score_per_min', 'pct_highest_ward_kills_in_match',
        'avg_ctrl_ward_time_coverage_in_river_or_enemy_half',
    ],
    'Objective Stats': [
        'pct_of_games_team_took_first_baron', 'pct_of_games_team_took_first_drag',
        'pct_of_games_team_took_first_turret', 'pct_games_team_took_first_herald',
        'avg_indiv_dmg_dealt_to_buildings', 'avg_dmg_dealt_to_objs', 'avg_indiv_turret_plates_taken',
        'avg_epic_monster_steals', 'avg_epic_monster_kills_within_30s_of_spawn',
    ],
    'Jungle Stats': [
        'avg_buffs_stolen', 'avg_initial_buff_count', 'avg_initial_crab_count',
        'avg_crabs_per_game', 'avg_jgler_kills_early_jungle',
        'avg_jgler_early_kills_on_laners',
    ],
    'Survival Stats': [
        'avg_longest_time_alive', 'avg_bounty_lvl', 'avg_time_spent_dead',
        'avg_times_survived_single_digit_hp', 'avg_times_survived_3_immobilizes_in_fight',
        'avg_times_took_large_dmg_survived',
    ],
    'Early Game Stats': [
        'pct_of_games_with_early_lanephase_gold_exp_adv', 'pct_of_games_with_lanephase_gold_exp_adv',
        'avg_max_lvl_lead_lane_opp', 'pct_games_first_blood_kill', 'pct_of_games_indiv_killed_1st_tower',
    ],
    'Multikill Stats': [
        'avg_killing_sprees', 'avg_largest_killing_spee', 'avg_number_of_multikills',
        'avg_multikills_with_one_spell', 'avg_legendary_count',
    ],
}

# Flatten the groups; dict insertion order keeps the themes in the order above.
features = [name for group in feature_groups.values() for name in group]

# Split the requested features into those the loaded frame has and those it lacks.
column_names = df.columns
missing_features = [name for name in features if name not in column_names]
available_features = [name for name in features if name in column_names]

# Make skipped columns visible instead of dropping them quietly.
if len(missing_features) > 0:
    print(f"Warning: The following features are not in the dataset and will be skipped: {missing_features}")

# From here on, `features` only names columns that exist in df.
features = available_features

# Feature matrix: a copy of the selected columns, so df itself is left untouched.
X = df[features].copy()

# Report the number of missing values for every column that has any.
nan_counts = X.isnull().sum()
print("\nColumns with NaN values:")
print(nan_counts.loc[nan_counts.gt(0)])

# Fill missing values with the column mean.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean').fit(X)
X_imputed = imputer.transform(X)

# Standardise the imputed matrix; from this point on X is the scaled array,
# no longer the DataFrame created above.
X = StandardScaler().fit(X_imputed).transform(X_imputed)

# Elbow method: fit k-means for k = 1..19 and record each model's inertia.
K = range(1, 20)
inertias = []
for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X)
    inertias.append(kmeans.inertia_)

# Inertia against k; the "elbow" is where extra clusters stop reducing it much.
elbow_fig, elbow_ax = plt.subplots(figsize=(10, 6))
elbow_ax.plot(K, inertias, 'bx-')
elbow_ax.set_xlabel('k')
elbow_ax.set_ylabel('Inertia')
elbow_ax.set_title('Elbow Method For Optimal k')
plt.show()

# Final k-means model. The cluster count is a manual choice - adjust it after
# reading the elbow plot.
n_clusters = 12
kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(X)

# Store each row's cluster label next to its statistics.
df['Cluster'] = kmeans.labels_

# Fit PCA without a component limit so the full variance spectrum is available.
pca = PCA()
X_pca = pca.fit_transform(X)

# Per-component share of variance and its running total.
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance_ratio = explained_variance_ratio.cumsum()

# Cumulative explained variance against the number of components kept.
component_counts = range(1, len(explained_variance_ratio) + 1)
variance_fig, variance_ax = plt.subplots(figsize=(10, 6))
variance_ax.plot(component_counts, cumulative_variance_ratio, 'bo-')
variance_ax.axhline(y=0.80, color='r', linestyle='--', label='80% Threshold')
variance_ax.set_xlabel('Number of Components')
variance_ax.set_ylabel('Cumulative Explained Variance Ratio')
variance_ax.set_title('Explained Variance Ratio by PCA Components')
variance_ax.legend()
variance_ax.grid(True)
plt.show()

# argmax on a boolean array gives the first True position; +1 turns the
# zero-based position into a component count.
reaches_80_pct = cumulative_variance_ratio >= 0.80
n_components_80 = np.argmax(reaches_80_pct) + 1
print(f"Number of components needed to explain 80% of variance: {n_components_80}")

# 3D scatter of the first three principal components, coloured by k-means
# cluster. Skipped when PCA produced fewer than three components.
n_pca_dims = X_pca.shape[1]
if n_pca_dims >= 3:
    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_subplot(1, 1, 1, projection='3d')

    pc1, pc2, pc3 = X_pca[:, 0], X_pca[:, 1], X_pca[:, 2]
    scatter = ax.scatter(pc1, pc2, pc3, c=df['Cluster'], cmap='viridis')

    ax.set_xlabel('First Principal Component')
    ax.set_ylabel('Second Principal Component')
    ax.set_zlabel('Third Principal Component')
    ax.set_title('Champion-Role Clusters (3D)')

    # Label every point with its champion-role; rows of X_pca are paired with
    # df['champ_role'] by position.
    for (x_pos, y_pos, z_pos), champ_role in zip(X_pca[:, :3], df['champ_role']):
        ax.text(x_pos, y_pos, z_pos, champ_role, fontsize=8)

    plt.colorbar(scatter)
    plt.show()

# Per-cluster mean of every feature, in the features' original units.
cluster_means = df.groupby('Cluster')[features].mean()

# Express each cluster mean as a z-score against the whole dataset. Ranking by
# the raw means would only surface the features with the largest units (gold,
# damage totals) for every cluster, which says nothing about what sets a
# cluster apart. Features with no spread get NaN here and are dropped below.
feature_means = df[features].mean()
feature_stds = df[features].std()
cluster_z_scores = (cluster_means - feature_means) / feature_stds.where(feature_stds > 0)

# Print clusters and their characteristics
for cluster in range(n_clusters):
    print(f"\n=== Cluster {cluster} ===")
    print("\nChampion-Roles in this cluster:")
    cluster_champs = df.loc[df['Cluster'] == cluster, 'champ_role'].tolist()
    print(', '.join(sorted(cluster_champs)))

    print("\nDistinctive features:")
    cluster_features = cluster_z_scores.loc[cluster].dropna()
    # Five features furthest from the dataset average in either direction,
    # shown with their signed z-score.
    most_distinctive = cluster_features.abs().sort_values(ascending=False).head().index
    sorted_features = cluster_features.loc[most_distinctive]
    print(sorted_features)
    print("-" * 50)

In [None]:
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score
import numpy as np
import matplotlib.pyplot as plt

# Candidate algorithms, all run on the standardised matrix X from the earlier cell.
clustering_methods = {
    'K-Means': KMeans(n_clusters=5, random_state=42),
    'DBSCAN': DBSCAN(eps=2.5, min_samples=3),
    'Hierarchical': AgglomerativeClustering(n_clusters=5),
    'Gaussian Mixture': GaussianMixture(n_components=5, random_state=42)
}

# One panel per method, side by side.
comparison_fig, comparison_axes = plt.subplots(1, 4, figsize=(20, 5))
for panel, (name, model) in zip(comparison_axes, clustering_methods.items()):
    # Every estimator in the dict is fitted through the same fit_predict call,
    # so no per-method branch is needed. The fitted models stay in
    # clustering_methods for later cells.
    labels = model.fit_predict(X)

    # Silhouette score for everything except DBSCAN, which might have -1 labels.
    if name != 'DBSCAN':
        score = silhouette_score(X, labels)
        print(f"{name} Silhouette Score: {score:.3f}")

    # First two principal components, coloured by this method's labels.
    panel.scatter(X_pca[:, 0], X_pca[:, 1], c=labels, cmap='viridis')
    panel.set_title(f'{name} Clustering')
    panel.set_xlabel('First Principal Component')
    panel.set_ylabel('Second Principal Component')

plt.tight_layout()
plt.show()

In [None]:
# For Hierarchical Clustering, show dendrogram
from scipy.cluster.hierarchy import dendrogram, linkage

# Ward-linkage hierarchy over the rows of X, one leaf per champion-role.
linkage_matrix = linkage(X, method='ward')

plt.figure(figsize=(15, 10))
dendrogram(linkage_matrix, labels=df['champ_role'].to_numpy())
plt.title('Champion-Role Hierarchy Dendrogram')
plt.xticks(rotation=90)  # vertical leaf labels
plt.show()

# Use the Gaussian Mixture model fitted in the comparison cell to find
# champion-roles whose membership is spread across several components.
if 'Gaussian Mixture' in clustering_methods:
    gmm = clustering_methods['Gaussian Mixture']
    probs = gmm.predict_proba(X)

    # Entropy of each row's membership probabilities: higher means the row is
    # split across more components. The small epsilon keeps log() finite when
    # a probability is exactly zero.
    log_probs = np.log(probs + 1e-10)
    hybrid_scores = -(probs * log_probs).sum(axis=1)

    most_hybrid = pd.DataFrame({
        'Champion-Role': df['champ_role'],
        'Hybrid Score': hybrid_scores
    })
    most_hybrid = most_hybrid.sort_values('Hybrid Score', ascending=False)

    print("\nMost Hybrid Champion-Roles (highest mixture of playstyles):")
    print(most_hybrid.head(10))

In [None]:
# Create a more detailed cluster profile
def analyze_cluster_profiles(df, cluster_means, features, n_clusters):
    """Summarise each cluster by the features on which it deviates most.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Cluster' label column, a 'champ_role' column and every
        column named in ``features``.
    cluster_means : pandas.DataFrame
        Per-cluster feature means, indexed by cluster label.
    features : list of str
        Feature columns used to compute the dataset-wide mean and std.
    n_clusters : int
        Clusters ``0 .. n_clusters - 1`` are profiled.

    Returns
    -------
    dict
        ``{cluster: {'champion_roles': sorted list of names,
        'size': number of members,
        'distinctive_features': {feature: {'z_score', 'mean', 'global_mean'}}}}``
        where ``distinctive_features`` holds at most the 10 features with the
        largest absolute z-score, in descending order of |z|.
    """
    # Dataset-wide reference statistics for the z-scores.
    global_means = df[features].mean()
    global_stds = df[features].std()

    # A feature with zero (or undefined) spread cannot distinguish clusters.
    # Dividing by its std would give NaN/inf, so its z-score is pinned to 0.
    degenerate = ~(global_stds > 0)

    cluster_profiles = {}
    for cluster in range(n_clusters):
        # Members of this cluster
        cluster_champs = df.loc[df['Cluster'] == cluster, 'champ_role'].tolist()

        # How far this cluster's means sit from the dataset means, in std units
        cluster_means_series = cluster_means.loc[cluster]
        z_scores = ((cluster_means_series - global_means) / global_stds).mask(degenerate, 0.0)

        # Largest deviations first, regardless of sign
        sorted_features = z_scores.abs().sort_values(ascending=False)

        cluster_profiles[cluster] = {
            'champion_roles': sorted(cluster_champs),
            'size': len(cluster_champs),
            'distinctive_features': {
                feature: {
                    'z_score': z_scores[feature],
                    'mean': cluster_means_series[feature],
                    'global_mean': global_means[feature]
                }
                for feature in sorted_features.index[:10]  # Top 10 most distinctive features
            }
        }

    return cluster_profiles

# Build the per-cluster profiles from the k-means labels stored in df['Cluster'].
cluster_profiles = analyze_cluster_profiles(
    df=df,
    cluster_means=cluster_means,
    features=features,
    n_clusters=n_clusters,
)

# Function to visualize cluster features
def visualize_cluster_features(cluster_profiles, cluster_to_visualize=None):
    """Print a text summary and draw a z-score bar chart for clusters.

    cluster_profiles: mapping of cluster id to a profile with the keys
        'size', 'champion_roles' and 'distinctive_features'
        (feature -> {'z_score', 'mean', 'global_mean'}).
    cluster_to_visualize: a single cluster id, or None to show every cluster.

    Returns None; output goes to stdout and to matplotlib figures.
    """
    show_all = cluster_to_visualize is None
    clusters_to_show = list(cluster_profiles.keys()) if show_all else [cluster_to_visualize]

    for cluster in clusters_to_show:
        profile = cluster_profiles[cluster]

        # Header: cluster id, size and members
        print(f"\n=== Cluster {cluster} ===")
        print(f"Size: {profile['size']} champion-roles")
        print("\nChampion-Roles in this cluster:")
        print(', '.join(profile['champion_roles']))

        print("\nDistinctive Features:")
        distinctive = profile['distinctive_features']
        features_df = pd.DataFrame.from_dict(distinctive, orient='index')

        # One bar per distinctive feature, height = z-score
        plt.figure(figsize=(12, 6))
        features_df['z_score'].plot(kind='bar')
        plt.title(f'Distinctive Features for Cluster {cluster}')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()

        # The same numbers as text
        print("\nDetailed Statistics:")
        for feature in distinctive:
            stats = distinctive[feature]
            print(f"{feature}:")
            print(f"  Z-score: {stats['z_score']:.2f}")
            print(f"  Cluster mean: {stats['mean']:.2f}")
            print(f"  Global mean: {stats['global_mean']:.2f}")
            print()

# Function to find similar champion-roles
def find_similar_champion_roles(champ_role_name, n=5):
    """Print the n champion-roles closest to ``champ_role_name`` by Euclidean distance.

    Reads two notebook globals: ``df`` (needs a 'champ_role' column) and ``X``
    (2-D feature array). Row i of X is assumed to describe row i of df by
    position. Prints the result and returns None; prints a notice and returns
    None when the name is not present.
    """
    is_target = (df['champ_role'] == champ_role_name).to_numpy()
    if not is_target.any():
        print(f"Champion-Role '{champ_role_name}' not found in the dataset.")
        return

    roles = df['champ_role'].to_numpy()

    # Address X by row position, never by df index label: X is a plain array,
    # so a filtered or re-indexed df would otherwise read the wrong rows (or
    # run past the end). If the name occurs more than once, the first matching
    # row is the reference point.
    target_features = X[np.flatnonzero(is_target)[0]]

    # Distance from the reference row to every row, in one vectorised step
    distances = np.linalg.norm(X - target_features, axis=1)

    # Rank all rows that are not the target; a stable sort keeps df order on ties
    other_positions = np.flatnonzero(~is_target)
    ranking = np.argsort(distances[other_positions], kind='stable')
    nearest_positions = other_positions[ranking][:n]

    print(f"\nMost similar champion-roles to {champ_role_name}:")
    for position in nearest_positions:
        print(f"{roles[position]}: Distance = {distances[position]:.2f}")

# Function to recommend champion-roles based on preferences
def recommend_champion_roles_by_preferences(preferences, cluster_profiles, cluster_descriptions, top_n=5):
    """
    Print champion-role suggestions taken from the clusters that best match a set of weights.

    preferences: dict mapping feature name to weight,
        e.g., {'damage_per_minute': 1, 'avg_vision_score': 0.5}
    cluster_profiles: output from analyze_cluster_profiles
    cluster_descriptions: dict mapping cluster numbers to playstyle descriptions
    top_n: number of recommendations to print

    A cluster's score is the weighted sum of its z-scores over the preferred
    features that appear among its distinctive features; other preferences
    contribute nothing. Only the three best-scoring clusters are considered.
    Returns None.
    """
    # Score every cluster against the weights
    cluster_scores = {}
    for cluster, profile in cluster_profiles.items():
        total = 0
        for feature, weight in preferences.items():
            if feature not in profile['distinctive_features']:
                continue
            total += profile['distinctive_features'][feature]['z_score'] * weight
        cluster_scores[cluster] = total

    # Three best clusters, highest score first
    ranked_clusters = sorted(cluster_scores.items(), key=lambda item: item[1], reverse=True)
    best_clusters = ranked_clusters[:3]

    # Every member of a chosen cluster inherits that cluster's score and playstyle
    recommendations = [
        (champ_role, cluster_descriptions.get(cluster, "Unknown playstyle"), score)
        for cluster, score in best_clusters
        for champ_role in cluster_profiles[cluster]['champion_roles']
    ]

    # Stable sort: members of equally scored clusters keep their relative order
    recommendations.sort(key=lambda entry: entry[2], reverse=True)

    print("\nRecommended champion-roles based on your preferences:")
    for champ_role, playstyle, score in recommendations[:top_n]:
        print(f"{champ_role} - {playstyle} (Score: {score:.2f})")

# Save the cluster profiles and descriptions
import pickle

# Placeholder playstyle descriptions (you would need to manually create these
# based on analysis).
cluster_descriptions = {
    # Example descriptions - these should be updated based on actual analysis
    0: "High damage carries",
    1: "Utility supports",
    2: "Tank initiators",
    # ... add descriptions for all clusters
}

# Persist both artefacts: the cluster profiles first, then the descriptions.
for pickle_path, payload in (
    ('champion_cluster_profiles.pkl', cluster_profiles),
    ('champion_cluster_descriptions.pkl', cluster_descriptions),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(payload, f)


In [None]:
# Add this visualization to better understand what defines each cluster
import matplotlib.pyplot as plt
import seaborn as sns

def visualize_cluster_features(cluster_profiles, cluster_to_visualize=None):
    """Draw one horizontal z-score bar chart per cluster (green above 0, red otherwise).

    NOTE: this definition replaces the visualize_cluster_features defined in
    an earlier cell of this notebook; it only plots and prints nothing.

    cluster_profiles: mapping of cluster id to profile. Two profile layouts are
        accepted:
        * 'distinctive_features': {feature: {'z_score': ...}} - the layout
          produced by analyze_cluster_profiles;
        * 'distinctive_high' / 'distinctive_low': pandas Series of z-scores.
    cluster_to_visualize: a single cluster id, or None to plot every cluster.

    Returns None.
    """
    if cluster_to_visualize is not None:
        clusters_to_show = [cluster_to_visualize]
    else:
        clusters_to_show = list(cluster_profiles.keys())

    for cluster in clusters_to_show:
        profile = cluster_profiles[cluster]

        if 'distinctive_features' in profile:
            # Profiles from analyze_cluster_profiles: pull the z-score out of
            # each feature's stats dict, keeping the stored order.
            all_features = pd.Series(
                {
                    feature: stats['z_score']
                    for feature, stats in profile['distinctive_features'].items()
                },
                dtype=float,
            )
        else:
            # Combine high and low features
            all_features = pd.concat([profile['distinctive_high'], profile['distinctive_low']])

        plt.figure(figsize=(12, 8))
        plt.barh(
            all_features.index,
            all_features.values,
            color=['green' if x > 0 else 'red' for x in all_features.values],
        )
        plt.axvline(x=0, color='black', linestyle='-', alpha=0.3)
        plt.title(f'Cluster {cluster} Distinctive Features (Z-scores)')
        plt.xlabel('Z-score (standard deviations from mean)')
        plt.tight_layout()
        plt.show()

# Plot every cluster; pass a cluster id instead of None to plot just one,
# e.g. visualize_cluster_features(cluster_profiles, 0)
visualize_cluster_features(cluster_profiles, cluster_to_visualize=None)

In [None]:
# Add this to understand relationships between clusters
from scipy.spatial.distance import pdist, squareform

# Pairwise Euclidean distances between the k-means cluster centres, as a
# square matrix.
cluster_centers = kmeans.cluster_centers_
cluster_distances = squareform(pdist(cluster_centers, metric='euclidean'))

# Label rows and columns so the heatmap axes are readable.
cluster_labels = [f'Cluster {i}' for i in range(n_clusters)]
cluster_distance_df = pd.DataFrame(
    cluster_distances,
    index=cluster_labels,
    columns=list(cluster_labels),
)

# Visualize as a heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(cluster_distance_df, annot=True, cmap='viridis')
plt.title('Distance Between Cluster Centers')
plt.tight_layout()
plt.show()

# Find the closest clusters for each cluster
for i in range(n_clusters):
    # Work on a copy of the row. cluster_distances[i] is a view, so writing
    # inf into it would also overwrite the diagonal of cluster_distances
    # itself (and may leak into a DataFrame built on the same buffer).
    distances = cluster_distances[i].copy()
    distances[i] = np.inf  # Exclude self

    # Find the 3 closest clusters. With three or fewer clusters the inf entry
    # for the cluster itself would fall inside the slice, so filter it out.
    closest_indices = np.argsort(distances)[:3]
    closest_indices = closest_indices[closest_indices != i]
    closest_distances = distances[closest_indices]

    print(f"\nCluster {i} is most similar to:")
    for idx, dist in zip(closest_indices, closest_distances):
        print(f"  Cluster {idx} (distance: {dist:.2f})")

In [None]:
# Add this to find similar champions across the dataset
from sklearn.metrics.pairwise import euclidean_distances

# Calculate distances between all champions on the standardised matrix X.
# The imputed-but-unscaled matrix (X_imputed) keeps every feature in its
# original units, so features with large magnitudes would dominate the
# Euclidean distance. X is also what find_similar_champion_roles measures on.
champion_distances = euclidean_distances(X)

# Create a DataFrame with champion names
# NOTE(review): assumes df has a 'championName' column. If the same champion
# appears on several rows (e.g. once per role), the index labels are not
# unique and label lookups return several rows - confirm, or index by
# 'champ_role' instead.
champion_distance_df = pd.DataFrame(
    champion_distances,
    index=df['championName'],
    columns=df['championName']
)

# Function to find similar champions
def find_similar_champions(champion_name, n=5):
    """Print and return the n champions closest to ``champion_name``.

    Reads the notebook global ``champion_distance_df``, a square distance
    table whose index and columns are champion names.

    Returns a pandas Series (name -> distance, ascending), or None after
    printing a notice when the champion is not in the table.
    """
    if champion_name not in champion_distance_df.index:
        print(f"Champion {champion_name} not found!")
        return

    # Get distances to all other champions
    distances = champion_distance_df.loc[champion_name]
    if isinstance(distances, pd.DataFrame):
        # The label occurs more than once, so .loc returned one row per
        # occurrence; use the first occurrence as the reference row.
        distances = distances.iloc[0]

    # Exclude the champion itself by label. Skipping the first sorted entry is
    # not safe: another champion at distance 0 can sort ahead of it.
    others = distances.drop(labels=champion_name)
    similar_champions = others.sort_values().head(n)

    print(f"Champions most similar to {champion_name}:")
    for champ, dist in similar_champions.items():
        print(f"  {champ} (distance: {dist:.2f})")

    return similar_champions

# Example usage: each call prints its own neighbour list. The second call is
# the last expression of the cell, so the notebook also renders the value it
# returns.
find_similar_champions('Ahri')
find_similar_champions('Darius')

In [8]:
# Detailed, human-readable description of a cluster. Write one entry per
# cluster after analysing the clusters; only cluster 0 is filled in so far.
tanky_frontliners = {
    "name": "Tanky Frontliners",
    "description": "Champions who excel at absorbing damage and providing crowd control for their team.",
    "playstyle": "These champions typically build defensive items and focus on initiating fights and protecting allies.",
    "strengths": ["High survivability", "Good crowd control", "Team fight presence"],
    "weaknesses": ["Lower damage output", "Can be kited", "Dependent on team follow-up"],
    "recommended_for": "Players who enjoy being in the middle of fights and protecting their team."
}

# Keyed by cluster id. This rebinds cluster_descriptions, replacing the plain
# string descriptions assigned in an earlier cell.
cluster_descriptions = {
    0: tanky_frontliners,
    # Add more clusters with detailed descriptions
}

# Save these descriptions for your chat agent. The same file name is written
# by an earlier cell, so this replaces that file's contents.
descriptions_pickle_path = 'champion_cluster_descriptions.pkl'
with open(descriptions_pickle_path, 'wb') as f:
    pickle.dump(cluster_descriptions, f)

In [None]:
def recommend_champions_by_preferences(preferences, cluster_profiles, cluster_descriptions, top_n=5):
    """
    Recommend champions based on user preferences.

    Parameters:
    - preferences: dict mapping a preference name ('damage', 'survivability',
      'utility', 'mobility', 'farming') to 'high', 'medium' or 'low',
      e.g. {'damage': 'high', 'survivability': 'medium'}. Unknown preference
      names are ignored.
    - cluster_profiles: the cluster profiles generated earlier. Two layouts
      are accepted per profile:
        * the analyze_cluster_profiles layout, with 'champion_roles' and
          'distinctive_features' ({feature: {'z_score': ...}}); only features
          listed there are scored;
        * the older layout, with 'champions', 'raw_means', 'distinctive_high'
          and 'distinctive_low'; features present in 'raw_means' but in
          neither z-score collection count as 0.
    - cluster_descriptions: dict mapping cluster id to either a dict with a
      'description' entry or a plain description string
    - top_n: number of champions to recommend

    Returns:
    - List of dicts with the keys 'champion', 'cluster', 'cluster_score' and
      'explanation', best cluster first, at most top_n long. Only the three
      best-scoring clusters contribute.
    """
    # Map user preferences to features. The first names in each list are the
    # dataset's feature columns; the trailing raw match-API names are kept so
    # profiles built from those columns still score.
    feature_mapping = {
        'damage': ['avg_dmg_dealt_to_champions', 'damage_per_minute',
                   'totalDamageDealtToChampions'],
        'survivability': ['avg_dmg_taken', 'avg_dmg_self_mitigated', 'avg_longest_time_alive',
                          'totalDamageTaken', 'damageSelfMitigated', 'longestTimeSpentLiving'],
        'utility': ['avg_time_ccing_champs', 'avg_heals_on_teammate', 'avg_dmg_shielded_on_team',
                    'timeCCingOthers', 'totalHealsOnTeammates', 'totalDamageShieldedOnTeammates'],
        'mobility': ['avg_times_survived_single_digit_hp',
                     'challenges_quickSoloKills', 'challenges_survivedSingleDigitHpCount'],
        'farming': ['cs_per_minute', 'gold_per_minute'],
        # Add more mappings
    }

    def _feature_z_score(profile, feature):
        """Return the profile's z-score for feature, or None if it cannot be scored."""
        distinctive = profile.get('distinctive_features')
        if distinctive is not None:
            stats = distinctive.get(feature)
            return None if stats is None else stats['z_score']

        # Older layout: the feature must be known to the profile at all.
        if feature not in profile['raw_means']:
            return None
        for key in ('distinctive_high', 'distinctive_low'):
            scores = profile.get(key)
            # `in` tests index labels for a Series and keys for a dict
            if scores is not None and feature in scores:
                return scores[feature]
        # If not in distinctive features, it's close to average
        return 0

    def _cluster_members(profile):
        """Return the member list under whichever key the profile layout uses."""
        if 'champion_roles' in profile:
            return profile['champion_roles']
        return profile['champions']

    # Score each cluster based on preferences
    cluster_scores = {}
    for cluster_id, profile in cluster_profiles.items():
        score = 0
        for pref, value in preferences.items():
            if pref not in feature_mapping:
                continue

            # Average z-score over the features this profile can score
            feature_scores = []
            for feature in feature_mapping[pref]:
                z_score = _feature_z_score(profile, feature)
                if z_score is not None:
                    feature_scores.append(z_score)

            if feature_scores:
                avg_score = sum(feature_scores) / len(feature_scores)

                # Adjust score based on preference value
                if value == 'high' and avg_score > 0:
                    score += avg_score
                elif value == 'low' and avg_score < 0:
                    score += abs(avg_score)
                elif value == 'medium' and abs(avg_score) < 0.5:
                    score += 1 - abs(avg_score)

        cluster_scores[cluster_id] = score

    # Get top clusters
    top_clusters = sorted(cluster_scores.items(), key=lambda x: x[1], reverse=True)[:3]

    # Get champions from top clusters
    recommendations = []
    for cluster_id, score in top_clusters:
        cluster_champs = _cluster_members(profile=cluster_profiles[cluster_id])

        # Add explanation from cluster description
        explanation = f"Champions from cluster {cluster_id}"
        description = cluster_descriptions.get(cluster_id)
        if isinstance(description, dict):
            explanation = description.get('description', explanation)
        elif isinstance(description, str):
            explanation = description

        # Add some champions from this cluster
        champs_to_add = min(top_n // len(top_clusters) + 1, len(cluster_champs))
        for champ in cluster_champs[:champs_to_add]:
            recommendations.append({
                'champion': champ,
                'cluster': cluster_id,
                'cluster_score': score,
                'explanation': explanation
            })

    # Return top N recommendations
    return sorted(recommendations, key=lambda x: x['cluster_score'], reverse=True)[:top_n]

# Example usage: a player who wants high damage and mobility, average
# survivability and little utility.
preferences = dict(
    damage='high',
    survivability='medium',
    utility='low',
    mobility='high',
)

recommendations = recommend_champions_by_preferences(
    preferences=preferences,
    cluster_profiles=cluster_profiles,
    cluster_descriptions=cluster_descriptions,
)

# One line per recommendation: champion name and the reason it was suggested.
for rec in recommendations:
    print(f"{rec['champion']} - {rec['explanation']}")