In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split

In [2]:
# Seed shared by every stochastic step in the notebook
SEED = 42

# IFSC boulder results workbook, read straight from the GitHub repository
path_data = "https://github.com/DavidBreuer/ifsc-analysis/raw/main/ifsc_Boulder.xlsx"
df_raw = pd.read_excel(path_data)

In [3]:
# Columns that identify a climber's entry in a round; kept on every melted row
id_columns = ['Year', 'Competition', 'Gender', 'Level', 'Name', 'Country']
# Top1..Top5 followed by Zone1..Zone5
result_columns = [f'{kind}{number}' for kind in ('Top', 'Zone') for number in range(1, 6)]
# Per-route / per-run detail columns, matched by name
route_run_columns = df_raw.filter(regex='Route|Run').columns

df = (
    df_raw
    # Get rid of columns we don't need
    .drop(columns=['Unique', 'Discipline', 'Number', 'Group'])
    .drop(columns=route_run_columns)
    # Capitalize climber names consistently
    .assign(Name=lambda frame: frame['Name'].str.title())
    # "Unpivot" so it's one row per climber-problem.
    # Tops and zones are treated as separate problems even though they are obviously correlated.
    .melt(
        id_vars=id_columns,
        value_vars=result_columns,
        var_name='Problem',
        value_name='Attempts',
    )
    # Rows with no value at all are dropped first, so any NaN left after the
    # next step comes from an infinite attempt count
    .dropna(subset=['Attempts'])
    .replace([-np.inf, np.inf], np.nan)
)

In [4]:
# Keep completed boulders: drop every competition/round/problem/gender group
# in which no row has a recorded attempt count
grouped = df.groupby(['Competition', 'Level', 'Problem', 'Gender'])
df = grouped.filter(lambda boulder: boulder['Attempts'].notnull().any())

# Max attempts passed through. `grouped` still covers the unfiltered rows;
# assign() aligns on the index, so values for dropped rows are simply ignored.
df = df.assign(Max_attempts=grouped['Attempts'].transform('max'))

# Survival model features: Status marks rows with a recorded attempt count,
# Time falls back to the group's max attempts where the count is missing
df = df.assign(
    Status=lambda frame: frame['Attempts'].notna(),
    Time=lambda frame: frame['Attempts'].fillna(frame['Max_attempts']),
)

# Failure set to max attempts
df['Attempts'] = df['Attempts'].fillna(df['Max_attempts'])

int_columns = ['Attempts', 'Max_attempts', 'Status', 'Time']
df[int_columns] = df[int_columns].astype(int)

# Make Problem Category
df['Problem_category'] = np.where(df['Problem'].str.startswith('Top'), 'Top', 'Zone')

# Keep Climbers with lots of data; "Other" as replacement
DATA_THRESHOLDS = [50,100,150,200,250,500,1000]

climber_counts = df['Name'].value_counts()
# Number of rows belonging to the climber on each row (same for every threshold)
rows_per_climber = df['Name'].map(climber_counts)
for threshold in DATA_THRESHOLDS:
    label_column = f'Climber_{threshold}'
    labels = df['Name'].where(rows_per_climber >= threshold, 'Other')
    # "Other" is listed first, followed by the retained names in order of appearance
    retained_names = [name for name in labels.unique() if name != 'Other']
    df[label_column] = pd.Categorical(labels, categories=['Other'] + retained_names)

In [5]:
def examine_qualified_Climbers(threshold, data=None, raw=None):
    """Print how many climbers per gender reach a data threshold, and who they are.

    A climber qualifies when their ``Climber_{threshold}`` label is not
    ``"Other"``, i.e. they have at least ``threshold`` rows in the processed
    frame.

    Parameters
    ----------
    threshold : int
        Threshold for which a ``Climber_{threshold}`` column exists in ``data``.
    data : pandas.DataFrame, optional
        Processed frame with ``Gender``, ``Name`` and ``Climber_{threshold}``
        columns. Defaults to the notebook-level ``df``.
    raw : pandas.DataFrame, optional
        Raw results with ``Gender`` and ``Name`` columns, used for the
        per-gender totals. Defaults to the notebook-level ``df_raw``.

    Returns
    -------
    None
        The report is printed: two count lines (men, women) followed by two
        lines listing the qualifying names (men, women).
    """
    data = df if data is None else data
    raw = df_raw if raw is None else raw
    climber_col = f'Climber_{threshold}'

    # The processed frame stores title-cased names; normalise the raw names the
    # same way so spelling variants of one climber are not counted twice.
    raw_names = raw['Name'].str.title()

    genders = (('M', 'male'), ('W', 'female'))
    qualified_rows = {
        code: data[(data['Gender'] == code) & (data[climber_col] != 'Other')]
        for code, _ in genders
    }

    for code, label in genders:
        qualified_count = qualified_rows[code][climber_col].nunique()
        total_count = raw_names[raw['Gender'] == code].nunique()
        print(f'There are {qualified_count} {label} contestants with at least {threshold} data points out of {total_count}')

    for code, label in genders:
        names = qualified_rows[code]['Name'].unique()
        print(f'The {label} contestants are {", ".join(names)}')

# Print the qualifying-climber report once for every candidate threshold
for threshold in DATA_THRESHOLDS:
    examine_qualified_Climbers(threshold)

There are 399 male contestants with over 50 data points out of 1424
There are 290 female contestans with over 50 data points out of 1007
The male contestans are Dmitrii Sharafutdinov, Gabriele Moroni, Kilian Fischhuber, Matthias Müller, Martin Stranik, Kevin Hemund, Gareth Parry, Daniel Woods, Ludovic Laurence, Rustam Gelmanov, Jorg Verhoeven, Pierre Duroche, Markus Hoppe, Masatoshi Sugita, Thomas Caleyron, Guillaume Glairon Mondet, Christian Core, Mikhail Chernikov, Andre Borowka, Michele Caminati, David Lama, Lukas Ennemoser, Jernej Kruder, Tomasz Oleksy, Gérome Pouvreau, Emanuel Moosburger, Jonas Baumann, Lucas Preti, Thomas Tauporn, Jakob Schubert, Stewart Watson, Stéphan Julien, Akito Matsushima, Wouter Jongeneelen, Tsukuru Hori, Yury Novitskiy, Kazuma Watanabe, Klemen Becan, Christopher Webb-Parsons, David Barrans, Nalle Hukkataival, Mykhaylo Shalagin, Julien Meral, Remo Sommer, Peter Würth, Jesse Van Der Werf, Aric Merz, Baptiste Nomine, Ignasi Tarrazona Gasque, Bruno Macias Mat

In [6]:
# Separate the men's and women's rows; .copy() gives each subset its own data
dfm = df[df.Gender=="M"].copy()
dfw = df[df.Gender=="W"].copy()

# to_csv does not create missing folders, so make sure the target exists
# (otherwise this cell fails on a fresh checkout)
os.makedirs('./data', exist_ok=True)
dfm.to_csv('./data/men_data.csv', index=False)
dfw.to_csv('./data/women_data.csv', index=False)

# Columns whose combined value defines the strata for the train/test split
Stratification = ['Year', 'Competition', 'Level', 'Problem']

def create_split(df, Stratification, seed):
    """Split ``df`` 80/20 into train and test frames, stratified on a combined key.

    The values of the ``Stratification`` columns are converted to strings and
    joined with ``_`` into a single ``Stratification`` column, which is passed
    to ``train_test_split`` as the stratification label.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to split. The caller's frame is left unmodified.
    Stratification : list of str
        Names of the columns that together define a stratum.
    seed : int
        Passed to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, test_df)``, each with a fresh 0..n-1 index and including
        the derived ``Stratification`` column.
    """
    strata = df[Stratification].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
    # assign() returns a new frame, so the key column is not written back into
    # the DataFrame owned by the caller
    keyed = df.assign(Stratification=strata)
    train_df, test_df = train_test_split(keyed, train_size=0.8, test_size=0.2, random_state=seed, stratify=keyed['Stratification'])
    return train_df.reset_index(drop=True), test_df.reset_index(drop=True)

# Build one stratified train/test split per gender, both with the shared seed
train_M, test_M = create_split(dfm, Stratification=Stratification, seed=SEED)
train_W, test_W = create_split(dfw, Stratification=Stratification, seed=SEED)

In [None]:
# to_csv does not create missing folders, so make sure the split folder exists
# before writing (otherwise this cell fails on a fresh checkout)
os.makedirs('./data/split', exist_ok=True)

train_M.to_csv('./data/split/train_M.csv', index=False)
test_M.to_csv('./data/split/test_M.csv', index=False)

train_W.to_csv('./data/split/train_W.csv', index=False)
test_W.to_csv('./data/split/test_W.csv', index=False)