In [1]:
import torch
import pickle
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score

from pmf import PMF
from lr import LogReg



In [2]:
SEED = 42
REPLACEMENT_LEVELS = [25, 50, 100, 250, 500, 1000]
LATENT_FACTORS = [1, 2, 3, 4]

df = pd.read_csv('data/men_data.csv')

# Climber Embeddings

In [3]:
def get_climber_accuracy(df, model, names):
    accuracies = []
    for name in reversed(names):
        if name != 'other':
            df_climber = df.loc[df.Name == name]
        else:
            mask = df.Name.isin([climber for climber in model.climber_vocab.get_itos() if climber != 'other'])
            df_climber= df.loc[~mask]

        y_true = df_climber['Status'].values
        y_pred = model.predict(df_climber)
        y_pred_binary = np.round(y_pred)
        accuracies.append(accuracy_score(y_true, y_pred_binary))
    return accuracies

def get_climber_counts(df, model, names):
    accuracies = []
    for name in names:
        if name != 'other':
            df_climber = df.loc[df.Name == name]
        else:
            mask = df.Name.isin([climber for climber in model.climber_vocab.get_itos() if climber != 'other'])
            df_climber= df.loc[~mask]
        accuracies.append(df_climber.shape[0])
    return accuracies

def create_climbers_df():
    climbers = {}

    for replacement_level in REPLACEMENT_LEVELS:
        with open(f"models/lr/model_rl_{replacement_level}_full_data.pkl", 'rb') as f:
            lr_model = pickle.load(f)

        for num_factors in LATENT_FACTORS:

            ### Handle Climbers (LR)
            lr_climber_names = lr_model.climber_vocab.get_itos()[1:]
            lr_climbers = pd.DataFrame({
                "coefs": lr_model.lr.coef_.flatten().tolist()[1:-3],
            }, index=lr_climber_names)

            ### Handle Climbers (PMF)
            pmf_model = torch.load(f"models/pmf/model_rl_{replacement_level}_d_{num_factors}_full_data.pth")
            pmf_model.eval()

            pmf_climber_names = pmf_model.climber_vocab.get_itos()[1:]
            pmf_climbers = pd.DataFrame({
                "weights": pmf_model.climber_embedding.weight.data.numpy().tolist()[1:],
                "pmf_accuracy": get_climber_accuracy(df, pmf_model, pmf_climber_names),
                "size": get_climber_counts(df, pmf_model, pmf_climber_names),
            }, index=pmf_climber_names)

            weights = pmf_climbers['weights'].apply(pd.Series)
            pmf_climbers = pd.concat([pmf_climbers[['pmf_accuracy','size']], weights], axis=1)
            pmf_climbers.columns = ['pmf_accuracy', 'size'] + [f'weight_{i+1}' for i in range(weights.shape[1])]

            ### Merge Climbers
            lr_pmf_climbers = pd.merge(lr_climbers, pmf_climbers, left_index=True, right_index=True, how='outer')
            lr_pmf_climbers.dropna(inplace=True)

            ### Create PCs
            embeddings = lr_pmf_climbers[[row for row in lr_pmf_climbers.columns if row.startswith('w')]].values
            components = min(embeddings.shape[0], embeddings.shape[1])
            pca = PCA(n_components=components)
            pcs = pca.fit_transform(embeddings)

            for pc in range(components):
                lr_pmf_climbers[f'PC{pc+1}'] = pcs[:, pc]

            climbers[f'{replacement_level}_{num_factors}'] = lr_pmf_climbers

    return climbers

def create_correlation_matrices(df, path, raw=True):
    for replacement_level in REPLACEMENT_LEVELS:

        fig, axs = plt.subplots(nrows=1, ncols=len(LATENT_FACTORS), figsize=(28, 8))

        for num_factors in LATENT_FACTORS:
            df_athletes = df[f'{replacement_level}_{num_factors}']

            prefix = 'weight' if raw else 'PC'
            rows = [row for row in df_athletes.columns if row.startswith(prefix)]
            cols = ['coefs','pmf_accuracy','size']
            df_corr = df_athletes.corr().loc[rows, cols]
            sns.heatmap(df_corr, annot=True, cmap='coolwarm', center=0, vmin=-1, vmax=1, ax=axs[num_factors-1])

        plt.suptitle(f'Correlation Matrices at RL: {replacement_level}', fontsize=16)
        plt.tight_layout()

        filename = f'figs/{path}/{"corr_raw" if raw else "corr_pc"}/CMatrix_{replacement_level}_{"raw" if raw else "pc"}'
        plt.savefig(filename)
        plt.close(fig)

def create_pc_figures(df, path):
    for replacement_level in REPLACEMENT_LEVELS:

        fig, axs = plt.subplots(nrows=1, ncols=len(LATENT_FACTORS[1:]), figsize=(28, 8))

        for i, num_factors in enumerate(LATENT_FACTORS[1:]):
            df_climbers = df[f'{replacement_level}_{num_factors}']

            axs[i].scatter(df_climbers['PC1'], df_climbers['PC2'], alpha=0.5)

            # for j, name in enumerate(df.index):
            #     if name == 'other':
            #         axs[i].text(df_climbers['PC1'][j], df_climbers['PC2'][j], name, color='red', fontsize=9)
            #     else:
            #         continue # Add labels?

            axs[i].set_title(f'PCA for {num_factors} Factors')
            axs[i].set_xlabel('Principal Component 1')
            axs[i].set_ylabel('Principal Component 2')

        plt.suptitle(f'PCA at RL: {replacement_level}', fontsize=16)
        plt.tight_layout(rect=[0, 0, 1, 0.95])

        filename = f'figs/{path}/PCA/PCA_{replacement_level}'
        plt.savefig(filename)
        plt.close(fig)


### Load Athlete DataFrame

In [4]:
athletes = create_climbers_df()

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


### Load CMatrix Figures (Raw Embeddings)

In [5]:
create_correlation_matrices(athletes, 'climbers')

### Load CMatrix Figures (PCs)

In [6]:
create_correlation_matrices(athletes, 'climbers', raw=False)

In [7]:
create_pc_figures(athletes, 'climbers')

# Problem Embeddings

In [8]:
def create_problems_df():
    problems = {}

    for replacement_level in REPLACEMENT_LEVELS:
        with open(f"models/lr/model_rl_{replacement_level}_fold_0.pkl", 'rb') as f:            ########## CHANGE PATH FOR COMPLETE MODEL
            lr_model = pickle.load(f)

        for num_factors in LATENT_FACTORS:

            ### Handle Climbers (LR)
            # lr_climbers = lr_model.climber_vocab.get_itos()[1:]
            # lr_athletes = pd.DataFrame({
            #     "coefs": lr_model.lr.coef_.flatten().tolist(),
            # }, index=lr_climbers)

            ### Handle Climbers (PMF)
            pmf_model = torch.load(f"models/pmf/model_rl_{replacement_level}_d_{num_factors}_fold_1.pth")  ########## CHANGE PATH FOR COMPLETE MODEL
            pmf_model.eval()

            pmf_problem_ids = pmf_model.problem_vocab.get_itos()[1:]                                 ########## WITHOUT OTHER ATM
            pmf_problems = pd.DataFrame({
                "weights": pmf_model.problem_embedding.weight.data.numpy().tolist()[1:],
                "year": [float(problem.split('_')[0])for problem in pmf_problem_ids]
            }, index=pmf_problem_ids)

            weights = pmf_problems['weights'].apply(pd.Series)
            pmf_problems = pd.concat([pmf_problems[['year']], weights], axis=1)
            pmf_problems.columns = ['year'] + [f'weight_{i+1}' for i in range(weights.shape[1])]

            ### Merge Climbers
            c_probs = pmf_problems
            # c_probs = pd.merge(lr_athletes, pmf_athletes, left_index=True, right_index=True, how='outer')
            # lr_pmf_athletes.dropna(inplace=True)

            # ### Create PCs
            embeddings = c_probs[[row for row in c_probs.columns if row.startswith('w')]].values
            components = min(embeddings.shape[0], embeddings.shape[1])
            pca = PCA(n_components=components)
            pcs = pca.fit_transform(embeddings)

            for pc in range(components):
                c_probs[f'PC{pc+1}'] = pcs[:, pc]

            problems[f'{replacement_level}_{num_factors}'] = c_probs

    return problems

def create_geom_smooth(df, path):
    for replacement_level in REPLACEMENT_LEVELS:

        fig, axs = plt.subplots(nrows=1, ncols=len(LATENT_FACTORS), figsize=(48, 8))

        for i, num_factors in enumerate(LATENT_FACTORS):
            df_climbers = df[f'{replacement_level}_{num_factors}']

            sns.regplot(x='year', y=f'PC{i+1}', data=df_climbers, lowess=True, ax=axs[i])

            axs[i].set_title(f'GeomPlot for PC{1}')
            axs[i].set_xlabel('Year')
            axs[i].set_ylabel('Embedding')

        plt.suptitle(f'GeomSmooth at RL: {replacement_level}', fontsize=16)
        plt.tight_layout(rect=[0, 0, 1, 0.95])

        filename = f'figs/{path}/geom/geom_{replacement_level}.png'
        plt.savefig(filename)
        plt.close(fig)

In [9]:
problems = create_problems_df()

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [10]:
create_geom_smooth(problems, 'problems')

In [11]:
create_pc_figures(problems, 'problems')

In [12]:
problems['25_1']

Unnamed: 0,year,weight_1,PC1
2017_Boulder IFSC Climbing Worldcup (B) - Munich (GER) 2017 MunichWC 18 - 19 Aug_Q_Zone3,2017.0,0.850769,-0.265227
2017_Boulder IFSC Climbing Worldcup (B) - Munich (GER) 2017 MunichWC 18 - 19 Aug_Q_Top4,2017.0,-0.080361,-1.196357
2017_Boulder IFSC Climbing Worldcup (B) - Munich (GER) 2017 MunichWC 18 - 19 Aug_Q_Top3,2017.0,-0.624605,-1.740601
2017_Boulder IFSC Climbing Worldcup (B) - Munich (GER) 2017 MunichWC 18 - 19 Aug_Q_Zone4,2017.0,0.657194,-0.458802
2017_Boulder IFSC Climbing Worldcup (B) - Munich (GER) 2017 MunichWC 18 - 19 Aug_Q_Top2,2017.0,-0.372202,-1.488197
...,...,...,...
2013_Boulder IFSC Climbing Worldcup (B) - Toronto (CAN) 2013 Toronto 1 - 2 Jun_S_Zone1,2013.0,0.412695,-0.703301
"2015_Boulder • Speed IFSC Climbing Worldcup (B,S) - Haiyang (CHN) 2015 Haiyang 26 - 27 Jun_S_Zone3",2015.0,0.494431,-0.621565
2016_Boulder IFSC Climbing Worldcup (B) - Innsbruck (AUT) 2016 TyrolWC 20 - 21 May_S_Top1,2016.0,0.946605,-0.169391
2022_Boulder IFSC - Climbing World Cup (B) - Meiringen (SUI) 2022 Meiringen 8 - 10 Apr_S_Zone2,2022.0,6.208210,5.092215


OLD IMPLEMENTATION FOR REF

In [13]:
def create_correlation_matrices():
    athletes = {}

    for replacement_level in REPLACEMENT_LEVELS:
        with open(f"models/lr/model_rl_{replacement_level}_fold_0.pkl", 'rb') as f:            ########## CHANGE PATH FOR COMPLETE MODEL
            lr_model = pickle.load(f)

        fig, axs = plt.subplots(nrows=1, ncols=len(LATENT_FACTORS), figsize=(28, 8))

        for num_factors in LATENT_FACTORS:
            # lr_weights = dict(zip(lr_model.climber_vocab.get_itos(), [lr_model.lr.intercept_[0]] + lr_model.lr.coef_.flatten().tolist())) ### WITH OTHER
            lr_climbers = lr_model.climber_vocab.get_itos()[1:]
            lr_athletes = pd.DataFrame({
                "coefs": lr_model.lr.coef_.flatten().tolist(),
            }, index=lr_climbers)

            pmf_model = torch.load(f"models/pmf/model_rl_{replacement_level}_d_{num_factors}_fold_1.pth")  ########## CHANGE PATH FOR COMPLETE MODEL
            pmf_model.eval()


            pmf_climbers = pmf_model.climber_vocab.get_itos()[1:]                                         ########## WITHOUT OTHER ATM
            pmf_athletes = pd.DataFrame({
                "weights": pmf_model.climber_embedding.weight.data.numpy().tolist()[1:],
                "pmf_accuracy": get_athlete_accuracy(df, pmf_model, pmf_climbers),
                "size": get_athlete_counts(df, pmf_model, pmf_climbers),
            }, index=pmf_climbers)

            weights = pmf_athletes['weights'].apply(pd.Series)
            pmf_athletes = pd.concat([pmf_athletes[['pmf_accuracy','size']], weights], axis=1)
            pmf_athletes.columns = ['pmf_accuracy', 'size'] + [f'weight_{i+1}' for i in range(weights.shape[1])]

            ### Correlations
            lr_pmf_athletes = pd.merge(lr_athletes, pmf_athletes, left_index=True, right_index=True, how='outer')
            lr_pmf_athletes.dropna(inplace=True)
            rows = [row for row in lr_pmf_athletes.columns if row.startswith('w')]
            cols = ['coefs','pmf_accuracy','size']
            df_corr = lr_pmf_athletes.corr().loc[rows, cols]
            sns.heatmap(df_corr, annot=True, cmap='coolwarm', center=0, vmin=-1, vmax=1, ax=axs[num_factors-1])

            ### PCA
            embeddings = lr_pmf_athletes[rows].values
            components = min(embeddings.shape[0], embeddings.shape[1])
            pca = PCA(n_components=components)
            pcs = pca.fit_transform(embeddings)

            for pc in range(components):
                lr_pmf_athletes[f'PC{pc+1}'] = pcs[:, pc]

            athletes[f'{replacement_level}_{num_factors}'] = lr_pmf_athletes

        plt.suptitle(f'Correlation Matrices at RL: {replacement_level}', fontsize=16)

        plt.tight_layout()
        plt.savefig(f'figs/correlations_{replacement_level}')
        plt.close(fig)

    return athletes