In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score

import numpy as np

In [2]:
# Threshold value embedded in the dataset directory and file names that are
# loaded further down.
DATA_THRESHOLD = 1_000

In [3]:
# Load Data
# Read the train split first, then the test split, from the directory that
# matches the configured threshold.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        f'data/threshold_{DATA_THRESHOLD}/train_M_{DATA_THRESHOLD}.csv',
        f'data/threshold_{DATA_THRESHOLD}/test_M_{DATA_THRESHOLD}.csv',
    )
)

In [4]:
# Columns whose values are joined into each row's 'Stratification' key.
stratification = [
    'Level',
    'problem',
    'Year',
    'Competition',
]

In [5]:
def _join_row_as_key(row):
    """Return one row's values, converted to strings, joined with underscores."""
    return '_'.join(row.values.astype(str))


# Add the same row-wise key column to both splits.
for _split in (train, test):
    _split['Stratification'] = _split[stratification].apply(_join_row_as_key, axis=1)

In [6]:
# Convert to Category
# The label -> integer-code mapping is fitted on the training split only and
# then reused for the test split. Encoding each split independently assigns
# codes from that split's own set of labels, so the same climber / problem key
# could get different codes in train and test and would then be looked up in
# the wrong embedding row at evaluation time.
climber_dtype = train['climber'].astype('category').dtype
problem_dtype = train['Stratification'].astype('category').dtype

train['Climber'] = train['climber'].astype(climber_dtype).cat.codes
test['Climber'] = test['climber'].astype(climber_dtype).cat.codes

train['Stratification'] = train['Stratification'].astype(problem_dtype).cat.codes
test['Stratification'] = test['Stratification'].astype(problem_dtype).cat.codes

# pandas encodes values that are not among the training categories as -1.
# The model has no embedding row for such entries, so stop here with a clear
# message instead of feeding -1 into an embedding lookup.
for _column in ('Climber', 'Stratification'):
    _n_unseen = int((test[_column] < 0).sum())
    if _n_unseen:
        raise ValueError(
            f"{_n_unseen} test rows have a '{_column}' value that never occurs "
            "in the training split; remove them or add them to the training data."
        )

# Convert to tensor
# Targets are float32 (as required by the BCE loss); indices are int64 (as
# required by nn.Embedding).
train_values   = torch.tensor(train['status'].values, dtype=torch.float32)
train_climbers = torch.tensor(train['Climber'].values, dtype=torch.long)
train_problems = torch.tensor(train['Stratification'].values, dtype=torch.long)

test_values   = torch.tensor(test['status'].values, dtype=torch.float32)
test_climbers = torch.tensor(test['Climber'].values, dtype=torch.long)
test_problems = torch.tensor(test['Stratification'].values, dtype=torch.long)

# train['Stratification'].value_counts().value_counts(normalize=True)

In [7]:
class PMFModel(nn.Module):
    """Embedding dot-product model for (climber, problem) pairs.

    Every climber and every problem owns a learned vector of length
    ``num_factors``. The prediction for a pair is the sigmoid of the dot
    product of the two vectors, i.e. a value in (0, 1).
    """

    def __init__(self, num_climbers, num_problems, num_factors):
        """Create the two embedding tables.

        Args:
            num_climbers: Number of rows in the climber embedding table.
            num_problems: Number of rows in the problem embedding table.
            num_factors: Length of each embedding vector.
        """
        super().__init__()
        self.climber_embedding = nn.Embedding(num_climbers, num_factors)
        self.problem_embedding = nn.Embedding(num_problems, num_factors)

    def forward(self, climber_indices, problem_indices):
        """Score each (climber, problem) index pair.

        Args:
            climber_indices: Integer tensor of climber row indices.
            problem_indices: Integer tensor of problem row indices, with the
                same shape as ``climber_indices``.

        Returns:
            Tensor with the same shape as the index tensors holding
            ``sigmoid(<climber_vector, problem_vector>)`` for every pair.
        """
        climber_vector = self.climber_embedding(climber_indices)
        problem_vector = self.problem_embedding(problem_indices)
        # The factor axis is always the last one produced by nn.Embedding.
        # Reducing over dim=-1 (rather than dim=1) gives the same result for
        # 1-D index batches and stays correct for scalar or N-D index tensors.
        dot_product = (climber_vector * problem_vector).sum(dim=-1)
        return torch.sigmoid(dot_product)

# Seed for torch's random number generator so that the randomly initialised
# embeddings (and therefore the reported loss, accuracy and learned weights)
# are the same on every Restart & Run All.
SEED = 0

num_factors = 1  # length of each climber / problem embedding vector
num_climbers = train['climber'].nunique()
num_problems = train['Stratification'].nunique()

torch.manual_seed(SEED)
model = PMFModel(num_climbers, num_problems, num_factors)

In [8]:
# Binary cross-entropy on the model's sigmoid outputs, optimised with Adam.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

num_epochs = 1000

# Nothing inside the loop leaves training mode, so selecting it once suffices.
model.train()
for epoch in range(num_epochs):
    # One gradient step computed on the whole training set at once.
    optimizer.zero_grad()
    predictions = model(train_climbers, train_problems)
    loss = criterion(predictions, train_values)
    loss.backward()
    optimizer.step()

    # Only every 100th epoch is reported.
    if (epoch + 1) % 100 != 0:
        continue
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')

def predict(model, climber_indices, problem_indices):
    """Run ``model`` on the given index tensors without tracking gradients.

    The model is switched to evaluation mode as a side effect and is left in
    that mode.

    Args:
        model: Module called as ``model(climber_indices, problem_indices)``.
        climber_indices: First argument forwarded to ``model``.
        problem_indices: Second argument forwarded to ``model``.

    Returns:
        Whatever ``model`` returns, computed under ``torch.no_grad()``.
    """
    model.eval()
    with torch.no_grad():
        return model(climber_indices, problem_indices)

# Predicted probabilities for every test row.
test_predictions = predict(model, test_climbers, test_problems)
# Round to the nearest class label with a torch op. Calling np.round on a
# torch.Tensor only works through NumPy's array-wrapping fallback, which
# depends on the installed NumPy/PyTorch versions; torch.round keeps the value
# a tensor (the code below calls .numpy() on it) without that detour.
test_predictions_binary = torch.round(test_predictions)
accuracy = accuracy_score(test_values.numpy(), test_predictions_binary.numpy())
print(f'Test Accuracy: {accuracy}')

Epoch 100/1000, Loss: 0.5050785541534424
Epoch 200/1000, Loss: 0.5046169757843018
Epoch 300/1000, Loss: 0.5045244097709656
Epoch 400/1000, Loss: 0.5044912099838257
Epoch 500/1000, Loss: 0.5044758319854736
Epoch 600/1000, Loss: 0.5044674277305603
Epoch 700/1000, Loss: 0.5044624209403992
Epoch 800/1000, Loss: 0.504459023475647
Epoch 900/1000, Loss: 0.5044569373130798
Epoch 1000/1000, Loss: 0.5044550895690918
Test Accuracy: 0.7254160760224707


In [9]:
# One row per test observation: predicted probability, rounded prediction and
# the actual label, in that column order.
comparison_df = pd.DataFrame(
    dict(
        zip(
            ('Predicted_Probability', 'Predicted', 'Actual'),
            (
                tensor.numpy()
                for tensor in (test_predictions, test_predictions_binary, test_values)
            ),
        )
    )
)

comparison_df

Unnamed: 0,Predicted_Probability,Predicted,Actual
0,0.691271,1.0,1.0
1,0.797388,1.0,0.0
2,0.870519,1.0,1.0
3,0.779410,1.0,0.0
4,0.366035,0.0,1.0
...,...,...,...
19042,0.228604,0.0,0.0
19043,0.475021,0.0,1.0
19044,0.830087,1.0,0.0
19045,0.999983,1.0,1.0


In [10]:
# One (code, name) pair per climber, ordered by code.
climber_lookup = (
    train[['Climber', 'climber']]
    .drop_duplicates()
    .sort_values('Climber')
)
climber_ids = climber_lookup['Climber'].to_numpy()

# Row i of the embedding table belongs to the climber whose code is i.
# squeeze(dim=1) drops the factor axis when num_factors == 1 and leaves the
# tensor 2-D otherwise, in which case the DataFrame construction below fails
# rather than silently reporting a single factor.
climber_weights = model.climber_embedding.weight.detach().squeeze(dim=1).numpy()

# Series.unique() returns values in order of first appearance, not in code
# order, so the weights must be selected by climber code; pairing them
# positionally with unique() output attaches weights to the wrong climbers.
df = pd.DataFrame({
    "ID": climber_ids,
    "Name": climber_lookup['climber'].to_numpy(),
    "Weight": climber_weights[climber_ids],
}).set_index('ID')
df.sort_values(by='Weight', ascending=False)

Unnamed: 0_level_0,Name,Weight
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
10,Tsukuru Hori,-0.939327
8,Rustam Gelmanov,-1.092888
5,Kilian Fischhuber,-1.18175
3,Jan Hojer,-1.212818
6,Kokoro Fujii,-1.54097
7,Other,-1.573935
4,Jernej Kruder,-1.640489
9,Sean Mccoll,-1.668629
1,Dmitrii Sharafutdinov,-1.672276
0,Aleksei Rubtsov,-1.747139
