In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split

In [2]:
path_data = "https://github.com/DavidBreuer/ifsc-analysis/raw/main/ifsc_Boulder.xlsx"
df_raw = pd.read_excel(path_data)

SEED = 42
DATA_THRESHOLD = 1000

In [3]:
df = (
    df_raw
    # Get rid of columns we don't need
    .drop(columns=['Unique', 'Discipline', 'Number', 'Group'])
    .drop(columns=df_raw.filter(regex='Route|Run').columns)
    # Capitalize climber names consistently
    .assign(Name=lambda x: x['Name'].str.title())
    # "Unpivot" so it's one row per climber-problem
    # I'm treating tops and zones as separate problems even though there's obviously a correlation
    .melt(id_vars=['Year', 'Competition', 'Gender', 'Level', 'Name', 'Country'],
          value_vars=['Top1', 'Top2', 'Top3', 'Top4', 'Top5', 'Zone1', 'Zone2', 'Zone3', 'Zone4', 'Zone5'],
          var_name='problem', value_name='attempts')
    .dropna(subset=['attempts'])
    .replace([-np.inf, np.inf], np.nan)
)

In [4]:
# Keep completed boulders
grouped = df.groupby(['Competition', 'Level', 'problem', 'Gender'])
# # df = grouped.filter(lambda x: np.isfinite(x['attempts']).any())
df = grouped.filter(lambda x: x['attempts'].notnull().any())

# Max attempts passed through
df = df.assign(max_attempts=lambda x: grouped['attempts'].transform('max'))

# Survival model features
df = df.assign(
    status=lambda x: x['attempts'].notna(),
    time=lambda x: np.where(x['attempts'].isna(), x['max_attempts'], x['attempts'])
)

# Failure set to max attempts
df['attempts'] = np.where(df['attempts'].isna(), df['max_attempts'], df['attempts'])

df[['attempts', 'max_attempts', 'status', 'time']] = df[['attempts', 'max_attempts', 'status', 'time']].astype(int)

# Keep Climbers with lots of data; "Other" as replacement
climber_counts = df['Name'].value_counts()
df['climber'] = np.where(df['Name'].map(climber_counts) >= DATA_THRESHOLD, df['Name'], 'Other')
df['climber'] = pd.Categorical(df['climber'], categories=['Other'] + [x for x in df['climber'].unique() if x != 'Other'])

# Make Problem Category
df['problem_category'] = np.where(df.problem.str.startswith('Top').values, 'Top', 'Zone')

In [5]:
DATA_THRESHOLD# Data Cleaning Analysis
m_count, w_count = df[(df['Gender']=="M") & (df['climber']!="Other")].climber.nunique(), df[(df['Gender']=="W") & (df['climber']!="Other")].climber.nunique()
m_total_count, w_total_count = df_raw[df_raw['Gender']=='M'].Name.nunique(), df_raw[df_raw['Gender']=='W'].Name.nunique()

print(f'There are {m_count} male contestants with over {DATA_THRESHOLD} data points out of {m_total_count}')
print(f'There are {w_count} female contestans with over {DATA_THRESHOLD} data points out of {w_total_count}')

m_contestants = df[(df['Gender']=="M") & (df['climber']!="Other")].Name.unique()
w_contestants = df[(df['Gender']=="W") & (df['climber']!="Other")].Name.unique()

print(f'The male contestans are {", ".join(m_contestants)}')
print(f'The female contestans are {", ".join(w_contestants)}')

There are 10 male contestants with over 1000 data points out of 1424
There are 7 female contestans with over 1000 data points out of 1007
The male contestans are Dmitrii Sharafutdinov, Kilian Fischhuber, Rustam Gelmanov, Guillaume Glairon Mondet, Jernej Kruder, Tsukuru Hori, Sean Mccoll, Aleksei Rubtsov, Jan Hojer, Kokoro Fujii
The female contestans are Akiyo Noguchi, Katharina Saurwein, Anna Stöhr, Melissa Le Neve, Petra Klingler, Shauna Coxsey, Miho Nonaka


In [6]:
directory_path = f'data/threshold_{DATA_THRESHOLD}'
if not os.path.exists(directory_path):
    os.makedirs(directory_path)
    print(f"Directory {directory_path} created.")
else:
    print(f"Directory {directory_path} already exists.")


df[df.Gender=="M"].to_csv(f'./{directory_path}/men_data_{DATA_THRESHOLD}.csv', index=False)
df[df.Gender=="W"].to_csv(f'./{directory_path}/women_data_{DATA_THRESHOLD}.csv', index=False)

def create_split(df, seed):
    df['stratify_group'] = df[['Competition', 'Level', 'problem']].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
    train_df, test_df = train_test_split(df, train_size=0.8, test_size=0.2, random_state=seed, stratify=df['stratify_group'])

    train_df = train_df.drop(columns=['stratify_group'])
    test_df = test_df.drop(columns=['stratify_group'])

    return train_df.reset_index(drop=True), test_df.reset_index(drop=True)

# Create the train/test splits
train_M, test_M = create_split(df[df.Gender=="M"], SEED)
train_W, test_W = create_split(df[df.Gender=="W"], SEED)

# Write train/test M+W to
train_M.to_csv(f'./{directory_path}/train_M_{DATA_THRESHOLD}.csv', index=False)
test_M.to_csv(f'./{directory_path}/test_M_{DATA_THRESHOLD}.csv', index=False)

train_W.to_csv(f'./{directory_path}/train_W_{DATA_THRESHOLD}.csv', index=False)
test_W.to_csv(f'./{directory_path}/test_W_{DATA_THRESHOLD}.csv', index=False)

Directory data/threshold_1000 already exists.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['stratify_group'] = df[['Competition', 'Level', 'problem']].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['stratify_group'] = df[['Competition', 'Level', 'problem']].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
