In [1]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score

import random

In [2]:
def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator, and PyTorch
    (CPU plus all CUDA devices) with the same value, then asks cuDNN for
    deterministic kernels and turns off its benchmark mode.

    Args:
        seed: Integer seed shared by all generators. Defaults to 42.
    """
    # Apply the same seed to each library's global generator in turn.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Benchmark mode is switched off alongside the deterministic flag.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Fix all RNG seeds before any stochastic step runs.
set_seed(42)

# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print("Using device:", device)

Using device: cuda


# data processing

In [None]:
# Load the raw trial table.
# NOTE(review): the original comment here read "same as sklearn" — presumably
# this preprocessing mirrors a scikit-learn version of the analysis; confirm.
df = pd.read_csv("data/allHorizonData_cut.csv")

# Twelve flat columns: (r, c, rt) for each of the four steps 1..4. They are
# reshaped to (num_samples, 4, 3) after scaling, so the order here matters.
seq_features = [
    'r1', 'c1', 'rt1',
    'r2', 'c2', 'rt2',
    'r3', 'c3', 'rt3',
    'r4', 'c4', 'rt4'
]
static_features = ['gameLength', 'uc']

target = 'c5'

X_seq = df[seq_features]
X_static = df[static_features]
# The preview cell further down prints `y`, so it must exist after a fresh
# Restart & Run All. The label tensors are built from `target - 1`, so the
# same shift is applied here to keep the preview consistent with them.
y = df[target] - 1

# Split by subject rather than by row, so that all rows of one subject end up
# on the same side of the train/test boundary.
subject_id = df['subjectID'].unique()

train_subjects, test_subjects = train_test_split(subject_id, test_size=0.2, random_state=42)
df_train = df[df['subjectID'].isin(train_subjects)]
df_test = df[df['subjectID'].isin(test_subjects)]

# normalize
X_seq_train_flat = df_train[seq_features].to_numpy()
X_seq_test_flat = df_test[seq_features].to_numpy()

X_static_train_flat = df_train[static_features].to_numpy()
X_static_test_flat = df_test[static_features].to_numpy()

# One scaler per feature group. Refitting a single scaler on the static
# features would discard the statistics learned from the sequence features,
# making it impossible to transform new sequence data later. Each scaler is
# fitted on the training rows only and then applied to the test rows.
seq_scaler = StandardScaler()
X_seq_train = seq_scaler.fit_transform(X_seq_train_flat).reshape(-1, 4, 3) # (num_samples, seq_len=4, feature_dim=3)
X_seq_test = seq_scaler.transform(X_seq_test_flat).reshape(-1, 4, 3)

static_scaler = StandardScaler()
X_static_train = static_scaler.fit_transform(X_static_train_flat)
X_static_test = static_scaler.transform(X_static_test_flat)

# Backward-compatible alias: before this change the single `scaler` object
# ended this cell fitted on the static features.
scaler = static_scaler


In [4]:
# Quick look at the first rows of the unscaled feature frames and the target.
print("--- Feature Data (X) ---")
print(X_seq.head())
print(X_static.head())
print("\n--- Target Data (y) ---")
# `y` is not assigned anywhere in the notebook as written (its definition is
# commented out), so relying on it only works with leftover kernel state.
# Derive the preview directly from the frame instead; the `- 1` matches the
# shift applied when the label tensors are built.
print((df[target] - 1).head())

--- Feature Data (X) ---
   r1  c1       rt1  r2  c2       rt2  r3  c3       rt3  r4  c4       rt4
0  66   2  1.849054  80   2  1.771619  29   1  0.562676  75   2  0.578808
1  69   2  0.967068  50   2  0.495166  51   1  0.506639  64   2  0.460037
2  31   2  0.862793  43   1  0.490816  26   2  0.924838  36   1  0.951034
3  65   1  6.272626  77   2  1.204784  52   2  0.795462  73   1  0.457327
4  70   2  0.614185  19   1  0.364167  43   2  0.306713  41   1  0.372321
   gameLength  uc
0           5   3
1          10   3
2          10   2
3          10   2
4          10   2

--- Target Data (y) ---
0    1
1    0
2    0
3    0
4    1
Name: c5, dtype: int64


Convert to tensors

In [7]:

# The scaled arrays come out of scikit-learn as float64; cast them to float32
# while wrapping them as tensors.
X_seq_train_tensor, X_seq_test_tensor = (
    torch.tensor(scaled, dtype=torch.float32)
    for scaled in (X_seq_train, X_seq_test)
)

X_static_train_tensor, X_static_test_tensor = (
    torch.tensor(scaled, dtype=torch.float32)
    for scaled in (X_static_train, X_static_test)
)

# Labels: go through NumPy (pandas Series -> ndarray), shift by one, and store
# them as int64 (`torch.long`).
y_train_tensor, y_test_tensor = (
    torch.tensor(split[target].to_numpy() - 1, dtype=torch.long)
    for split in (df_train, df_test)
)

In [8]:
# Sanity check: list the distinct label values present in the training targets.
print(y_train_tensor.unique())

tensor([0, 1])


h1 / h6 masks: split the test set by game length (h1: gameLength == 5, h6: gameLength == 10)

In [9]:

# Split the test set by game length, using the original (unscaled) data frame.
h1_mask = df_test['gameLength'] == 5
h6_mask = df_test['gameLength'] == 10

# Convert the masks positionally via NumPy before handing them to torch.
# `df_test` is a filtered slice of `df`, so its index is not 0..n-1. Passing
# the Series itself to torch.tensor() may make torch walk it like a sequence
# (series[0], series[1], ...), which is label-based on a Series and can raise
# KeyError or silently select the wrong rows. The label tensors are built with
# `.to_numpy()` for the same reason.
h1_mask_np = h1_mask.to_numpy()
h6_mask_np = h6_mask.to_numpy()

h1_mask_bool = torch.tensor(h1_mask_np, dtype=torch.bool)
h6_mask_bool = torch.tensor(h6_mask_np, dtype=torch.bool)

# Index the NumPy arrays with NumPy masks, and the tensor with tensor masks,
# so no implicit torch <-> NumPy conversion happens during indexing.
X_seq_test_h1 = torch.tensor(X_seq_test[h1_mask_np], dtype=torch.float32)
X_seq_test_h6 = torch.tensor(X_seq_test[h6_mask_np], dtype=torch.float32)

X_static_test_h1 = torch.tensor(X_static_test[h1_mask_np], dtype=torch.float32)
X_static_test_h6 = torch.tensor(X_static_test[h6_mask_np], dtype=torch.float32)

y_test_h1 = y_test_tensor[h1_mask_bool]
y_test_h6 = y_test_tensor[h6_mask_bool]