In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss, brier_score_loss, roc_auc_score, f1_score

In [3]:
# Load Data
train = pd.read_csv('data/split/train_M.csv')
test = pd.read_csv('data/split/test_M.csv')

encoded_features = ['Level', 'problem','climber']
target = 'status'

X_train, X_test = train[encoded_features], test[encoded_features]
y_train, y_test = train[target], test[target]

In [4]:
train

Unnamed: 0,Year,Competition,Gender,Level,Name,Country,problem,attempts,max_attempts,status,time,climber
0,2011,Boulder IFSC Climbing Worldcup (B) - Vienna (A...,M,Q,Mathias Conrad,GER,Zone5,8,8,0,8,Other
1,2022,"Boulder • Speed IFSC - Climbing World Cup (B,S...",M,Q,Guy Mcnamee,CAN,Top4,1,6,1,1,Other
2,2008,Boulder • Speed IFSC Climbing Worldcup (S+B) -...,M,Q,Kilian Fischhuber,AUT,Zone5,2,8,1,2,Kilian Fischhuber
3,2018,"Boulder • Speed IFSC Climbing Worldcup (B,S) -...",M,Q,Tomoaki Takata,JPN,Zone5,2,7,1,2,Other
4,2011,Boulder IFSC Climbing Worldcup (B) - Vail (USA...,M,S,Jonas Baumann,GER,Zone1,1,4,1,1,Other
...,...,...,...,...,...,...,...,...,...,...,...,...
76182,2022,"Boulder • Speed IFSC - Climbing World Cup (B,S...",M,Q,Nimrod Marcus,ISR,Zone4,1,6,1,1,Other
76183,2021,"Lead • Boulder IFSC - Climbing World Cup (B,L)...",M,Q,Simon Potucek,CZE,Zone3,7,7,0,7,Other
76184,2012,Lead • Boulder • Speed IFSC Climbing World Cha...,M,Q,Aleksander Romanowski,POL,Top2,2,10,1,2,Other
76185,2016,Boulder IFSC Climbing Worldcup (B) - Innsbruck...,M,Q,Yoshiyuki Ogata,JPN,Top3,6,6,0,6,Other


In [5]:
# Encode Features
encoder = OneHotEncoder(drop='first', sparse_output=False)
X_train_encoded = encoder.fit_transform(X_train)
X_test_encoded = encoder.transform(X_test)

X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=encoder.get_feature_names_out(encoded_features))
X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=encoder.get_feature_names_out(encoded_features))

In [11]:
# Scale the data - so our intercept isn't over 1
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded_df)
X_test_scaled = scaler.transform(X_test_encoded_df)

# Train Model
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train_scaled, y_train)

# Apply model onto Test
y_pred = logreg.predict(X_test_scaled)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Log loss:", log_loss(y_test, y_pred))
print("Brier score loss:", brier_score_loss(y_test, y_pred))
print(f"F1 Score: {f1_score(y_test, y_pred)}")
print(f"ROC AUC Score: {roc_auc_score(y_test, y_pred)}")
print(f"base rates (# of rows that were correct from training set): \n {y_train.value_counts(normalize=True)}")

Accuracy: 0.6778495301097286
Log loss: 11.611479875866165
Brier score loss: 0.3221504698902714
F1 Score: 0.7342112102572987
ROC AUC Score: 0.6625977281056706
base rates (# of rows that were correct from training set): 
 1    0.555528
0    0.444472
Name: status, dtype: float64


In [7]:
# Retrieve and display regression coefficients
coefficients = logreg.coef_[0]
intercept = logreg.intercept_[0]
feature_names = X_train_encoded_df.columns

coef_df = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients
})
print("Intercept:", intercept)
print("Coefficients:\n", coef_df)

Intercept: 0.27516593701616254
Coefficients:
                              Feature  Coefficient
0                            Level_Q    -0.104553
1                            Level_S    -0.084785
2                       problem_Top2    -0.235944
3                       problem_Top3    -0.266416
4                       problem_Top4    -0.284800
5                       problem_Top5    -0.211073
6                      problem_Zone1     0.416570
7                      problem_Zone2     0.170683
8                      problem_Zone3     0.163461
9                      problem_Zone4     0.149831
10                     problem_Zone5     0.123889
11     climber_Dmitrii Sharafutdinov     0.033021
12  climber_Guillaume Glairon Mondet    -0.002548
13                 climber_Jan Hojer    -0.005397
14             climber_Jernej Kruder    -0.017823
15         climber_Kilian Fischhuber     0.060615
16              climber_Kokoro Fujii     0.018611
17                     climber_Other    -0.393401
18  