In [23]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss, brier_score_loss, roc_auc_score, f1_score
import matplotlib.pyplot as plt
import pickle

In [21]:
def train_lr(df, replacement_level):
    # Split the data first (to avoid info leakage)
    train, test = train_test_split(df, test_size=0.2, random_state=42)

    # Create the threshold column for training data (using Zeke's code)
    train_climber_counts = train['Name'].value_counts()
    train[f'Climber_{replacement_level}'] = np.where(train['Name'].map(train_climber_counts) >= replacement_level, train['Name'], 'Other')
    train[f'Climber_{replacement_level}'] = pd.Categorical(train[f'Climber_{replacement_level}'], categories=['Other'] + [x for x in train[f'Climber_{replacement_level}'].unique() if x != 'Other'])

    # Create the threshold column for test data (using Zeke's code)
    test_climber_counts = train['Name'].value_counts()
    test[f'Climber_{replacement_level}'] = np.where(test['Name'].map(test_climber_counts) >= replacement_level, test['Name'], 'Other')
    test[f'Climber_{replacement_level}'] = pd.Categorical(test[f'Climber_{replacement_level}'], categories=['Other'] + [x for x in train[f'Climber_{replacement_level}'].unique() if x != 'Other'])

    # Features
    encoded_features = ['Level', 'Problem', f'Climber_{replacement_level}']
    target = 'Status'

    X_train, X_test = train[encoded_features], test[encoded_features]
    y_train, y_test = train[target], test[target]

    # Encode features
    encoder = OneHotEncoder(drop='first', sparse_output=False)
    X_train_encoded = encoder.fit_transform(X_train)
    X_test_encoded = encoder.transform(X_test)

    X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=encoder.get_feature_names_out(encoded_features))
    X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=encoder.get_feature_names_out(encoded_features))

    # Convert to tensors
    X_train_tensor = torch.tensor(X_train_encoded_df.values, dtype=torch.float)
    y_train_tensor = torch.tensor(y_train.values, dtype=torch.float)
    X_test_tensor = torch.tensor(X_test_encoded_df.values, dtype=torch.float)
    y_test_tensor = torch.tensor(y_test.values, dtype=torch.float)

    # Logistic Regression --> 1 linear layer, sigmoid activation
    class LogRegModel(nn.Module):
        def __init__(self, df_dim):
            super().__init__()
            self.linear = nn.Linear(df_dim, 1)
            self.sigmoid = nn.Sigmoid()
        
        def forward(self, x):
            out = self.linear(x)
            return self.sigmoid(out)

    # Model init
    input_dim = X_train_tensor.shape[1]
    model = LogRegModel(input_dim)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters())

    # Train
    num_epochs = 100
    batches = 32

    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0
        correct_train = 0
        for i in range(0, len(X_train_tensor), batches):
            batch_X = X_train_tensor[i:i+batches]
            batch_y = y_train_tensor[i:i+batches]
            
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y.unsqueeze(1))
            loss.backward()
            optimizer.step()
            
            epoch_loss += loss.item()
            correct_train += ((outputs > 0.5).float().squeeze() == batch_y).float().sum()
        
        train_loss = epoch_loss / (len(X_train_tensor) / batches)
        train_accuracy = correct_train / len(X_train_tensor)
        model.eval()
        with torch.no_grad():
            test_outputs = model(X_test_tensor)
            test_accuracy = ((test_outputs > 0.5).float().squeeze() == y_test_tensor).float().mean()
        
        if ((epoch + 1) % 10) == 0:
            print(f'Epoch [{epoch}/{num_epochs}], Loss: {train_loss}, Train Accuracy: {train_accuracy}, Test Accuracy: {test_accuracy}')

    # *********** EVALUATION ***********
    model.eval()
    with torch.no_grad():
        y_pred_proba = model(X_test_tensor).numpy().flatten()
        y_pred = (y_pred_proba > 0.5).astype(int)

    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Log loss:", log_loss(y_test, y_pred_proba))
    print("Brier score loss:", brier_score_loss(y_test, y_pred_proba))
    print(f"F1 Score: {f1_score(y_test, y_pred)}")
    print(f"ROC AUC Score: {roc_auc_score(y_test, y_pred_proba)}")
    print(f"predictions:  \n {pd.Series(y_pred).value_counts(normalize=True)}")
    print(f"actual dist in test: \n {pd.Series(y_test).value_counts(normalize=True)}")

    # Print coefficients
    coefficients = model.linear.weight.detach().numpy().flatten()
    features = encoder.get_feature_names_out(encoded_features)
    coeff_df = pd.DataFrame({
        "Feature": features,
        "Coefficient": coefficients
    })
    intercept = model.linear.bias.detach().numpy().flatten()[0]
    print(f"Intercept: {intercept}")
    print("Coefficients:")
    print(coeff_df)

    return model

In [7]:
train = pd.read_csv(f'data/split/train_M.csv')
test = pd.read_csv(f'data/split/test_M.csv')
df = pd.concat([train, test])

In [22]:
model = train_lr(df, 500)


13593                Other
67363                Other
26960        Jernej Kruder
34860                Other
71157    Kilian Fischhuber
               ...        
6265                 Other
54886                Other
633                  Other
860                  Other
15795                Other
Name: Climber_500, Length: 76187, dtype: category
Categories (23, object): ['Other', 'Jernej Kruder', 'Kilian Fischhuber', 'Michael Piccolruaz', ..., 'Klemen Becan', 'David Barrans', 'Rustam Gelmanov', 'Jorg Verhoeven']
Epoch [9/100], Loss: 0.5975650614135707, Train Accuracy: 0.6847493648529053, Test Accuracy: 0.6875623464584351
Epoch [19/100], Loss: 0.5974517307143551, Train Accuracy: 0.6857600212097168, Test Accuracy: 0.6880873441696167
Epoch [29/100], Loss: 0.5973731415767485, Train Accuracy: 0.6861406564712524, Test Accuracy: 0.6884548664093018
Epoch [39/100], Loss: 0.5973161530303485, Train Accuracy: 0.6865344643592834, Test Accuracy: 0.6882448792457581
Epoch [49/100], Loss: 0.597274288934