In [10]:

import torch
import pickle
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
import matplotlib

from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score

from pmf import PMF
from lr import LogReg

In [4]:
# Seed constant; it is not referenced by any other cell visible in this notebook.
SEED = 42
# Replacement levels; each value is interpolated into model file names
# (model_rl_<level>_...) and into the names of the saved figures.
REPLACEMENT_LEVELS = [25, 50, 100, 250, 500, 1000]
# Numbers of PMF latent factors; each value selects one saved PMF model
# (..._d_<factors>_...) and one subplot in the multi-panel figures.
LATENT_FACTORS = [1, 2, 3, 4]

# Source table used as a module-level global by create_climbers_df(); the helper
# functions filter it on the 'Name' column and read the 'Status' column.
df = pd.read_csv('data/men_data.csv')

# Climber Embeddings

In [2]:
def get_climber_accuracy(df, model, names):
    """Compute the prediction accuracy of ``model`` separately for each climber.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``Name`` column and a ``Status`` column holding the true
        labels (presumably binary 0/1 -- predictions are rounded before scoring).
    model : object
        Any object exposing ``predict(frame)`` that returns one score per row
        of the frame it is given.
    names : sequence of str
        Climber names to evaluate.

    Returns
    -------
    list of float
        One accuracy per climber, in the same order as ``names``.
    """
    accuracies = []
    # Walk the names in the order given: callers pair the result positionally
    # with ``names`` (e.g. as a DataFrame column indexed by those names), so
    # iterating in reverse would attach every accuracy to the wrong climber.
    for name in names:
        df_climber = df.loc[df.Name == name]

        y_true = df_climber['Status'].values
        # Round the model's scores to the nearest integer label before scoring.
        y_pred_binary = np.round(model.predict(df_climber))

        accuracies.append(accuracy_score(y_true, y_pred_binary))
    return accuracies

def get_climber_counts(df, names):
    """Count the rows of ``df`` belonging to each climber.

    Returns a list with, for every entry of ``names`` (in order), the number
    of rows whose ``Name`` column equals that entry.
    """
    return [len(df.loc[df.Name == climber]) for climber in names]

def get_climber_success(df, names):
    """Return the mean of the ``Status`` column for each climber.

    The result is a list aligned with ``names``: entry ``k`` is the mean
    ``Status`` over the rows whose ``Name`` equals ``names[k]``.
    """
    return [df.loc[df.Name == climber, 'Status'].mean() for climber in names]

def create_climbers_df():
    """Build one climber table per (replacement level, latent-factor count).

    For every value in ``REPLACEMENT_LEVELS`` the pickled logistic-regression
    model is loaded, and for every value in ``LATENT_FACTORS`` the matching
    PMF model is loaded. Each table combines, per climber:

    * ``coefs``         -- the climber's LR coefficient,
    * ``pmf_accuracy``  -- PMF accuracy on that climber's rows of the global ``df``,
    * ``size``          -- number of rows for that climber in ``df``,
    * ``success``       -- mean ``Status`` for that climber in ``df``,
    * ``weight_<k>``    -- the PMF embedding weights,
    * ``PC<k>``         -- principal components of those weights.

    Only climbers present in both models (and without missing values) are kept.

    Returns
    -------
    dict[str, pandas.DataFrame]
        Tables keyed by ``'<replacement_level>_<num_factors>'``.
    """
    climbers = {}

    for replacement_level in REPLACEMENT_LEVELS:
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # model files that were produced by this project.
        with open(f"models/lr/model_rl_{replacement_level}_full_data.pkl", 'rb') as f:
            lr_model = pickle.load(f)

        ### Handle Climbers (LR)
        # The LR table depends only on the replacement level, so it is built
        # once here rather than once per latent-factor count.
        lr_climber_names = lr_model.climber_vocab.get_itos()[1:]
        lr_climbers = pd.DataFrame({
            # NOTE(review): the [1:-3] slice presumably skips one leading and
            # three trailing non-climber coefficients -- confirm against LogReg.
            "coefs": lr_model.lr.coef_.flatten().tolist()[1:-3],
        }, index=lr_climber_names)

        for num_factors in LATENT_FACTORS:

            ### Handle Climbers (PMF)
            pmf_model = torch.load(f"models/pmf/model_rl_{replacement_level}_d_{num_factors}_full_data.pth")
            pmf_model.eval()

            # Index 0 of the vocabulary / embedding matrix is skipped on both sides.
            pmf_climber_names = pmf_model.climber_vocab.get_itos()[1:]
            pmf_climbers = pd.DataFrame({
                "weights": pmf_model.climber_embedding.weight.data.numpy().tolist()[1:],
                "pmf_accuracy": get_climber_accuracy(df, pmf_model, pmf_climber_names),
                # get_climber_counts takes (df, names); passing the model as an
                # extra positional argument raised a TypeError.
                "size": get_climber_counts(df, pmf_climber_names),
                "success": get_climber_success(df, pmf_climber_names),
            }, index=pmf_climber_names)

            # Expand the list-valued 'weights' column into one column per factor.
            weights = pmf_climbers['weights'].apply(pd.Series)
            pmf_climbers = pd.concat([pmf_climbers[['pmf_accuracy', 'size', 'success']], weights], axis=1)
            pmf_climbers.columns = ['pmf_accuracy', 'size', 'success'] + [f'weight_{i+1}' for i in range(weights.shape[1])]

            ### Merge Climbers
            # Outer merge followed by dropna keeps only rows complete in both tables.
            lr_pmf_climbers = pd.merge(lr_climbers, pmf_climbers, left_index=True, right_index=True, how='outer')
            lr_pmf_climbers = lr_pmf_climbers.dropna()

            ### Create PCs
            # Only the 'weight_<k>' columns start with 'w'.
            weight_columns = [col for col in lr_pmf_climbers.columns if col.startswith('w')]
            embeddings = lr_pmf_climbers[weight_columns].values
            components = min(embeddings.shape[0], embeddings.shape[1])
            pca = PCA(n_components=components)
            pcs = pca.fit_transform(embeddings)

            for pc in range(components):
                lr_pmf_climbers[f'PC{pc+1}'] = pcs[:, pc]

            climbers[f'{replacement_level}_{num_factors}'] = lr_pmf_climbers

    return climbers

def create_correlation_matrices(df, path, raw=True):
    """Save one row of correlation heatmaps per replacement level.

    Each subplot correlates the embedding columns of one climber table with
    the summary columns ``coefs``, ``pmf_accuracy``, ``size`` and ``success``.

    Parameters
    ----------
    df : dict[str, pandas.DataFrame]
        Tables keyed by ``'<replacement_level>_<num_factors>'`` (the structure
        returned by ``create_climbers_df``).
    path : str
        Sub-directory of ``figs/`` under which the figures are written. The
        target directory must already exist.
    raw : bool, default True
        If True, use the ``weight*`` columns; otherwise use the ``PC*`` columns.

    Returns
    -------
    None
        Figures are written to
        ``figs/<path>/corr_<raw|pc>/CMatrix_<replacement_level>_<raw|pc>``.
    """
    prefix = 'weight' if raw else 'PC'
    kind = 'raw' if raw else 'pc'
    cols = ['coefs', 'pmf_accuracy', 'size', 'success']

    for replacement_level in REPLACEMENT_LEVELS:

        fig, axs = plt.subplots(nrows=1, ncols=len(LATENT_FACTORS), figsize=(28, 8))
        # plt.subplots returns a bare Axes when there is a single column;
        # normalise so the pairing below works for any number of factors.
        axs = np.atleast_1d(axs)

        # Pair axes with factor counts by position instead of indexing with
        # ``num_factors - 1``, which only worked for LATENT_FACTORS == [1..n].
        for ax, num_factors in zip(axs, LATENT_FACTORS):
            df_athletes = df[f'{replacement_level}_{num_factors}']

            rows = [col for col in df_athletes.columns if col.startswith(prefix)]
            # Correlate only the columns that are displayed: pairwise results
            # are unchanged, and unrelated (possibly non-numeric) columns can
            # no longer make ``corr`` fail.
            df_corr = df_athletes[rows + cols].corr().loc[rows, cols]
            sns.heatmap(df_corr, annot=True, cmap='coolwarm', center=0, vmin=-1, vmax=1, ax=ax)

        fig.suptitle(f'Correlation Matrices at RL: {replacement_level}', fontsize=16)
        fig.tight_layout()

        filename = f'figs/{path}/corr_{kind}/CMatrix_{replacement_level}_{kind}'
        fig.savefig(filename)
        # Close explicitly so figures do not accumulate across replacement levels.
        plt.close(fig)

def create_pc_figures(df, variable, path):
    """Save PC1-vs-PC2 scatter plots coloured by a categorical column.

    One figure is produced per replacement level, with one panel per entry of
    ``LATENT_FACTORS[1:]`` (the first entry is skipped -- presumably because a
    single-factor model has no second principal component).

    Parameters
    ----------
    df : dict[str, pandas.DataFrame]
        Tables keyed by ``'<replacement_level>_<num_factors>'`` containing the
        columns ``PC1``, ``PC2`` and ``variable``.
    variable : str
        Column whose distinct values determine the point colours and legend.
    path : str
        Sub-directory of ``figs/``; output goes to
        ``figs/<path>/PCA/<variable>/PCA_<replacement_level>_<variable>``.
    """
    factor_counts = LATENT_FACTORS[1:]

    for replacement_level in REPLACEMENT_LEVELS:

        fig, axs = plt.subplots(nrows=1, ncols=len(factor_counts), figsize=(28, 8))

        for panel, num_factors in enumerate(factor_counts):
            table = df[f'{replacement_level}_{num_factors}']

            # Assign each distinct value the tab10 colour at its position.
            categories = table[variable].unique()
            palette = matplotlib.colormaps['tab10']
            color_of = {}
            for position, category in enumerate(categories):
                color_of[category] = palette(position)

            point_colors = table[variable].map(color_of)

            ax = axs[panel]
            ax.scatter(table['PC1'], table['PC2'], alpha=0.5, c=point_colors)

            # Proxy markers so the legend shows one entry per category.
            legend_handles = [
                plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=palette(position), markersize=10)
                for position in range(len(categories))
            ]
            ax.legend(legend_handles, categories, title=variable, loc="best")

            ax.set_title(f'PCA for {num_factors} Factors')
            ax.set_xlabel('Principal Component 1')
            ax.set_ylabel('Principal Component 2')

        plt.suptitle(f'PCA at RL: {replacement_level}', fontsize=16)
        # Leave the top 5% of the canvas free for the suptitle.
        plt.tight_layout(rect=[0, 0, 1, 0.95])

        filename = f'figs/{path}/PCA/{variable}/PCA_{replacement_level}_{variable}'
        plt.savefig(filename)
        plt.close(fig)

### Problem Embeddings
def create_problems_df():
    """Build one problem table per (replacement level, latent-factor count).

    Each table is indexed by problem id and holds metadata parsed from the id
    (split on ``'_'``), the PMF problem-embedding weights and their principal
    components:

    * ``year``      -- first id field, as float,
    * ``round``     -- second-to-last id field,
    * ``category``  -- last id field without its final character,
    * ``type``      -- last id field,
    * ``weight_<k>``, ``PC<k>``.

    No logistic-regression information is merged in for problems.

    Returns
    -------
    dict[str, pandas.DataFrame]
        Tables keyed by ``'<replacement_level>_<num_factors>'``.
    """
    problems = {}

    for replacement_level in REPLACEMENT_LEVELS:
        for num_factors in LATENT_FACTORS:

            ### Handle Problems (PMF)
            # TODO: this loads the fold-1 checkpoint; change the path once a
            # model trained on the complete data is available.
            model_path = f"models/pmf/model_rl_{replacement_level}_d_{num_factors}_fold_1.pth"
            pmf_model = torch.load(model_path)
            pmf_model.eval()

            # Index 0 of the vocabulary / embedding matrix is skipped on both sides.
            problem_ids = pmf_model.problem_vocab.get_itos()[1:]
            weight_rows = pmf_model.problem_embedding.weight.data.numpy().tolist()[1:]

            id_fields = [problem_id.split('_') for problem_id in problem_ids]
            metadata = pd.DataFrame({
                "year": [float(fields[0]) for fields in id_fields],
                "round": [fields[-2] for fields in id_fields],
                "category": [fields[-1][:-1] for fields in id_fields],
                "type": [fields[-1] for fields in id_fields],
            }, index=problem_ids)

            weights = pd.DataFrame(weight_rows, index=problem_ids)
            weights.columns = [f'weight_{k+1}' for k in range(weights.shape[1])]

            c_probs = pd.concat([metadata, weights], axis=1)

            ### Create PCs
            embeddings = c_probs[list(weights.columns)].values
            components = min(embeddings.shape[0], embeddings.shape[1])
            pcs = PCA(n_components=components).fit_transform(embeddings)

            for pc in range(components):
                c_probs[f'PC{pc+1}'] = pcs[:, pc]

            problems[f'{replacement_level}_{num_factors}'] = c_probs

    return problems

def create_geom_smooth(df, path):
    """Save LOWESS-smoothed plots of a principal component against year.

    One figure is produced per replacement level, with one panel per entry of
    ``LATENT_FACTORS``. Panel ``i`` (0-based) uses the table of the ``i``-th
    factor count and plots its ``PC{i+1}`` column, so that table must contain
    at least ``i + 1`` principal components (true for
    ``LATENT_FACTORS == [1, 2, 3, 4]``).

    Parameters
    ----------
    df : dict[str, pandas.DataFrame]
        Tables keyed by ``'<replacement_level>_<num_factors>'`` with a ``year``
        column and ``PC<k>`` columns.
    path : str
        Sub-directory of ``figs/``; output goes to
        ``figs/<path>/geom/geom_<replacement_level>.png``.
    """
    for replacement_level in REPLACEMENT_LEVELS:

        fig, axs = plt.subplots(nrows=1, ncols=len(LATENT_FACTORS), figsize=(48, 8))

        for i, num_factors in enumerate(LATENT_FACTORS):
            df_climbers = df[f'{replacement_level}_{num_factors}']

            pc_column = f'PC{i+1}'
            sns.regplot(x='year', y=pc_column, data=df_climbers, lowess=True, ax=axs[i])

            # The title previously interpolated the literal 1, labelling every
            # panel "PC1" regardless of which component was actually plotted.
            axs[i].set_title(f'GeomPlot for {pc_column}')
            axs[i].set_xlabel('Year')
            axs[i].set_ylabel('Embedding')

        plt.suptitle(f'GeomSmooth at RL: {replacement_level}', fontsize=16)
        # Leave the top 5% of the canvas free for the suptitle.
        plt.tight_layout(rect=[0, 0, 1, 0.95])

        filename = f'figs/{path}/geom/geom_{replacement_level}.png'
        plt.savefig(filename)
        plt.close(fig)

### Load Athlete DataFrame

In [3]:
# Build the climber tables, keyed by '<replacement_level>_<num_factors>'.
# NOTE(review): the recorded output below is a NameError for REPLACEMENT_LEVELS:
# this cell was executed as In [3], before the configuration cell (In [4]).
# Run the configuration cell first (Restart Kernel -> Run All).
athletes = create_climbers_df()

NameError: name 'REPLACEMENT_LEVELS' is not defined

### Create and Save Correlation-Matrix Figures (Raw Embeddings)

In [None]:
# raw defaults to True, so the 'weight*' columns are correlated; figures are
# written under figs/climbers/corr_raw/.
create_correlation_matrices(athletes, 'climbers')

### Create and Save Correlation-Matrix Figures (PCs)

In [None]:
# raw=False switches to the 'PC*' columns; figures are written under
# figs/climbers/corr_pc/.
create_correlation_matrices(athletes, 'climbers', raw=False)

In [None]:
# NOTE(review): create_pc_figures is defined as (df, variable, path), but this
# call passes only two arguments, so it raises a TypeError as written. The
# column to colour by (`variable`) still has to be chosen -- TODO confirm intent.
create_pc_figures(athletes, 'climbers')

# Problem Embeddings

In [6]:
# Build the problem tables, keyed by '<replacement_level>_<num_factors>'.
problems = create_problems_df()

In [8]:
# Save the year-vs-principal-component LOWESS figures under figs/problems/geom/.
create_geom_smooth(problems, 'problems')

In [11]:
# Save PC1-vs-PC2 scatter plots of the problem embeddings, coloured in turn by
# each of the id-derived columns; output goes to figs/problems/PCA/<variable>/.
create_pc_figures(problems, 'category', 'problems')
create_pc_figures(problems, 'type', 'problems')
create_pc_figures(problems, 'round', 'problems')

## Old implementation (kept for reference only; superseded by the functions defined above and not meant to be executed)

In [None]:
def create_correlation_matrices():
    """Legacy all-in-one version: build climber tables and save correlation heatmaps.

    For each replacement level this loads the fold-0 LR model and, for each
    latent-factor count, the fold-1 PMF model; merges the per-climber LR
    coefficients with PMF accuracy, row counts and embedding weights; draws a
    heatmap of the weight-vs-summary correlations; and appends principal
    components of the weights. One figure per replacement level is saved to
    ``figs/correlations_<replacement_level>``.

    NOTE(review): kept for reference only.
      * Executing this cell would shadow the three-parameter
        ``create_correlation_matrices(df, path, raw=True)`` defined earlier in
        the notebook, and the existing calls that pass arguments would then fail.
      * It calls ``get_athlete_accuracy`` and ``get_athlete_counts``, which are
        not defined in the visible part of this notebook.

    Returns
    -------
    dict[str, pandas.DataFrame]
        Merged climber tables keyed by ``'<replacement_level>_<num_factors>'``.
    """
    athletes = {}

    for replacement_level in REPLACEMENT_LEVELS:
        with open(f"models/lr/model_rl_{replacement_level}_fold_0.pkl", 'rb') as f:            ########## CHANGE PATH FOR COMPLETE MODEL
            lr_model = pickle.load(f)

        fig, axs = plt.subplots(nrows=1, ncols=len(LATENT_FACTORS), figsize=(28, 8))

        for num_factors in LATENT_FACTORS:
            # lr_weights = dict(zip(lr_model.climber_vocab.get_itos(), [lr_model.lr.intercept_[0]] + lr_model.lr.coef_.flatten().tolist())) ### WITH OTHER
            # Vocabulary entry 0 is skipped; the full coefficient list is used as-is here.
            lr_climbers = lr_model.climber_vocab.get_itos()[1:]
            lr_athletes = pd.DataFrame({
                "coefs": lr_model.lr.coef_.flatten().tolist(),
            }, index=lr_climbers)

            pmf_model = torch.load(f"models/pmf/model_rl_{replacement_level}_d_{num_factors}_fold_1.pth")  ########## CHANGE PATH FOR COMPLETE MODEL
            pmf_model.eval()


            # Vocabulary entry 0 and embedding row 0 are both skipped.
            pmf_climbers = pmf_model.climber_vocab.get_itos()[1:]                                         ########## WITHOUT OTHER ATM
            pmf_athletes = pd.DataFrame({
                "weights": pmf_model.climber_embedding.weight.data.numpy().tolist()[1:],
                "pmf_accuracy": get_athlete_accuracy(df, pmf_model, pmf_climbers),
                "size": get_athlete_counts(df, pmf_model, pmf_climbers),
            }, index=pmf_climbers)

            # Expand the list-valued 'weights' column into one column per factor.
            weights = pmf_athletes['weights'].apply(pd.Series)
            pmf_athletes = pd.concat([pmf_athletes[['pmf_accuracy','size']], weights], axis=1)
            pmf_athletes.columns = ['pmf_accuracy', 'size'] + [f'weight_{i+1}' for i in range(weights.shape[1])]

            ### Correlations
            # Outer merge followed by dropna keeps only rows complete in both tables.
            lr_pmf_athletes = pd.merge(lr_athletes, pmf_athletes, left_index=True, right_index=True, how='outer')
            lr_pmf_athletes.dropna(inplace=True)
            rows = [row for row in lr_pmf_athletes.columns if row.startswith('w')]
            cols = ['coefs','pmf_accuracy','size']
            df_corr = lr_pmf_athletes.corr().loc[rows, cols]
            # The axis is selected by factor value, which assumes LATENT_FACTORS is 1..n.
            sns.heatmap(df_corr, annot=True, cmap='coolwarm', center=0, vmin=-1, vmax=1, ax=axs[num_factors-1])

            ### PCA
            embeddings = lr_pmf_athletes[rows].values
            components = min(embeddings.shape[0], embeddings.shape[1])
            pca = PCA(n_components=components)
            pcs = pca.fit_transform(embeddings)

            for pc in range(components):
                lr_pmf_athletes[f'PC{pc+1}'] = pcs[:, pc]

            athletes[f'{replacement_level}_{num_factors}'] = lr_pmf_athletes

        plt.suptitle(f'Correlation Matrices at RL: {replacement_level}', fontsize=16)

        plt.tight_layout()
        plt.savefig(f'figs/correlations_{replacement_level}')
        plt.close(fig)

    return athletes

In [19]:
# Inspect which round codes occur in one problem table
# (the recorded output lists only 'Q' and 'S').
problems['1000_1']['round'].unique()

array(['Q', 'S'], dtype=object)

In [23]:
# Show the rows of the source table whose Level is 'F'.
# NOTE(review): presumably a check on why no 'F' round appears in the problem
# table inspected above -- confirm.
df[df.Level=='F']

Unnamed: 0,Year,Competition,Gender,Level,Name,Country,Problem,Attempts,Max_attempts,Status,Time,Problem_category,Problem_ID
104,2008,Boulder IFSC Climbing Worldcup (B) - Hall (AUT...,M,F,Kilian Fischhuber,AUT,Top1,2,4,1,2,Top,2008_Boulder IFSC Climbing Worldcup (B) - Hall...
105,2008,Boulder IFSC Climbing Worldcup (B) - Hall (AUT...,M,F,Dmitrii Sharafutdinov,RUS,Top1,1,4,1,1,Top,2008_Boulder IFSC Climbing Worldcup (B) - Hall...
106,2008,Boulder IFSC Climbing Worldcup (B) - Hall (AUT...,M,F,Daniel Woods,USA,Top1,2,4,1,2,Top,2008_Boulder IFSC Climbing Worldcup (B) - Hall...
107,2008,Boulder IFSC Climbing Worldcup (B) - Hall (AUT...,M,F,Guillaume Glairon Mondet,FRA,Top1,4,4,1,4,Top,2008_Boulder IFSC Climbing Worldcup (B) - Hall...
108,2008,Boulder IFSC Climbing Worldcup (B) - Hall (AUT...,M,F,Jérôme Meyer,FRA,Top1,2,4,1,2,Top,2008_Boulder IFSC Climbing Worldcup (B) - Hall...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
87708,2022,"Boulder • Lead IFSC - Climbing World Cup (B,L)...",M,F,Dohyun Lee,KOR,Zone4,2,5,1,2,Zone,2022_Boulder • Lead IFSC - Climbing World Cup ...
87709,2022,"Boulder • Lead IFSC - Climbing World Cup (B,L)...",M,F,Yoshiyuki Ogata,JPN,Zone4,3,5,1,3,Zone,2022_Boulder • Lead IFSC - Climbing World Cup ...
87710,2022,"Boulder • Lead IFSC - Climbing World Cup (B,L)...",M,F,Jongwon Chon,KOR,Zone4,1,5,1,1,Zone,2022_Boulder • Lead IFSC - Climbing World Cup ...
87711,2022,"Boulder • Lead IFSC - Climbing World Cup (B,L)...",M,F,Kokoro Fujii,JPN,Zone4,2,5,1,2,Zone,2022_Boulder • Lead IFSC - Climbing World Cup ...
