In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import torch
from SeqFM import SeqFM

In [2]:
# Partition number; it is interpolated into the train/test CSV file names that are loaded next.
partition = 100

In [3]:
# Locations of the train/test CSVs for the selected partition.
trainpath, testpath = (
    f'../../data/top30groups/LongLatCombined/train1/train{partition}.csv',
    f'../../data/top30groups/LongLatCombined/test1/test{partition}.csv',
)

# Both files are read with the same ISO-8859-1 encoding.
traindata, testdata = (
    pd.read_csv(csv_path, encoding='ISO-8859-1')
    for csv_path in (trainpath, testpath)
)

In [4]:
def splitting_for_seqfm(train, test, device="cuda"):
    """Turn raw train/test frames into the tensors and frames used for SeqFM.

    Each (longitude, latitude) pair is replaced by a single integer
    ``location_id`` that is factorized over train and test together, so the
    same coordinates get the same id in both splits. ``gname`` is
    label-encoded with an encoder fitted on the training split only.

    Args:
        train: DataFrame containing ``longitude``, ``latitude``, ``gname`` and
            the remaining feature columns (features must be castable to float32).
        test: DataFrame with the same columns as ``train``.
        device: Torch device on which every returned tensor is created.

    Returns:
        A 9-tuple ``(X_train_seq, X_test_seq, y_train, y_test, lengths_train,
        lengths_test, le, train, test)``: the ``X_*`` tensors are float32 with
        shape (N, 1, F), the ``y_*`` tensors hold int64 class indices, the
        ``lengths_*`` tensors are all ones, ``le`` is the fitted LabelEncoder,
        and ``train``/``test`` are the processed frames (``gname`` kept,
        ``location_id`` appended, coordinate columns removed).
    """
    from sklearn.preprocessing import LabelEncoder
    import numpy as np
    import torch

    n_train_rows = len(train)

    # Stack both splits (concat builds a new frame, so the inputs stay
    # untouched) and derive one shared id per distinct coordinate pair.
    combined = pd.concat([train, test], axis=0).reset_index(drop=True)
    combined['longlat'] = list(zip(combined['longitude'], combined['latitude']))
    combined['location_id'] = pd.factorize(combined['longlat'])[0]
    combined = combined.drop(columns=['longitude', 'latitude', 'longlat'])

    # Cut the stacked frame back into its two original parts.
    train_part = combined.iloc[:n_train_rows].reset_index(drop=True)
    test_part = combined.iloc[n_train_rows:].reset_index(drop=True)

    # Class indices: the encoder learns its classes from the training labels.
    encoder = LabelEncoder()
    train_labels = encoder.fit_transform(train_part['gname']).astype(np.int64)
    test_labels = encoder.transform(test_part['gname']).astype(np.int64)

    def _as_single_step_sequence(frame):
        # (N, F) float32 feature matrix -> (N, 1, F) tensor on the target device
        feature_matrix = frame.drop(columns=['gname']).to_numpy(dtype=np.float32)
        return torch.tensor(feature_matrix[:, np.newaxis, :], device=device)

    X_train_seq = _as_single_step_sequence(train_part)
    X_test_seq = _as_single_step_sequence(test_part)

    y_train, y_test = (
        torch.tensor(labels, device=device)
        for labels in (train_labels, test_labels)
    )

    # Every sample is a sequence of exactly one step.
    lengths_train, lengths_test = (
        torch.ones(seq.shape[0], dtype=torch.long, device=device)
        for seq in (X_train_seq, X_test_seq)
    )

    return (
        X_train_seq, X_test_seq,
        y_train, y_test,
        lengths_train, lengths_test,
        encoder, train_part, test_part,
    )



In [5]:
import torch

# Compute device: pinned to CPU; the CUDA auto-detection line is kept disabled.
#device = "cuda" if torch.cuda.is_available() else "cpu"
device = "cpu"
print("Using device:", device)

# 1. Build the SeqFM tensors and the processed frames from the raw splits
(
    X_train, X_test,
    y_train, y_test,
    lengths_train, lengths_test,
    le,
    traindata_processed, testdata_processed,
) = splitting_for_seqfm(traindata, testdata, device)

# 2. Column roles: the dynamic feature could also be 'weaptype1';
#    'location_id' is already integer-encoded.
dynamic_feature, static_u_feature = 'attacktype1', 'location_id'

# 3. Extract features from training & test
def get_feature_tensors(df, dynamic_feature, static_u_feature, device=None):
    """Split a processed frame into the three integer index tensors.

    Args:
        df: DataFrame holding ``gname``, ``dynamic_feature``,
            ``static_u_feature`` and any number of further feature columns.
        dynamic_feature: Name of the column used as the dynamic feature.
        static_u_feature: Name of the column used as the static "U" feature.
        device: Torch device for the returned tensors. When omitted, the
            notebook-level ``device`` variable is used if it exists
            (the previous behaviour), otherwise ``"cpu"``.

    Returns:
        ``(static_U, static_E, dynamic)`` as int64 tensors with shapes
        (N,), (N, number of remaining columns) and (N, 1, 1).

    Raises:
        ValueError: If any feature column contains missing values. Casting
            NaN to an integer index would silently produce a garbage index.
    """
    if device is None:
        # Backward compatible: earlier callers relied on the global `device`.
        device = globals().get("device", "cpu")

    # Everything except the label is turned into integer indices below.
    feature_frame = df.drop(columns=['gname'])
    missing_counts = feature_frame.isna().sum()
    if missing_counts.any():
        offenders = missing_counts[missing_counts > 0].to_dict()
        raise ValueError(
            f"Cannot convert features to integer indices; missing values found: {offenders}"
        )

    static_U = torch.tensor(df[static_u_feature].values, device=device).long()
    static_E = torch.tensor(
        feature_frame.drop(columns=[dynamic_feature, static_u_feature]).values,
        device=device,
    ).long()
    # Two trailing singleton axes give the dynamic feature shape (N, 1, 1).
    dynamic = torch.tensor(df[dynamic_feature].values[:, None, None], device=device).long()
    return static_U, static_E, dynamic

static_U_X, static_E_X, dynamic_X = get_feature_tensors(traindata_processed, dynamic_feature, static_u_feature)
static_U_X_test, static_E_X_test, dynamic_X_test = get_feature_tensors(testdata_processed, dynamic_feature, static_u_feature)

# 4. Compute vocab sizes from raw data (important!)
# Each size is (largest index seen in train or test) + 1, so every index of
# both splits fits. static_E_X packs all remaining columns into one index
# space sized by their joint maximum.
num_static_u = max(static_U_X.max().item(), static_U_X_test.max().item()) + 1
num_static_e = max(static_E_X.max().item(), static_E_X_test.max().item()) + 1
num_dynamic  = max(dynamic_X.max().item(), dynamic_X_test.max().item()) + 1


def _clamp_indices(name, indices, vocab_size):
    """Clamp ``indices`` into [0, vocab_size - 1] and report how many values changed.

    The upper bound cannot trigger (the vocab sizes above come from the data
    maxima), so in practice this maps negative values to 0. That is reported
    rather than done silently, because it changes the data.
    """
    clamped = indices.clamp(0, vocab_size - 1)
    n_changed = int((clamped != indices).sum().item())
    if n_changed:
        print(f"WARNING: {name}: {n_changed} index value(s) outside [0, {vocab_size - 1}] were clamped")
    return clamped


# 5. Clamp indices once to avoid out-of-range errors (clamping is idempotent,
#    a second pass would change nothing)
static_U_X = _clamp_indices("static_U_X", static_U_X, num_static_u)
static_E_X = _clamp_indices("static_E_X", static_E_X, num_static_e)
dynamic_X  = _clamp_indices("dynamic_X", dynamic_X, num_dynamic)

static_U_X_test = _clamp_indices("static_U_X_test", static_U_X_test, num_static_u)
static_E_X_test = _clamp_indices("static_E_X_test", static_E_X_test, num_static_e)
dynamic_X_test  = _clamp_indices("dynamic_X_test", dynamic_X_test, num_dynamic)

# Invariant after clamping: both bounds hold for both splits.
for _name, _indices, _vocab in (
    ("static_U_X", static_U_X, num_static_u),
    ("static_E_X", static_E_X, num_static_e),
    ("dynamic_X", dynamic_X, num_dynamic),
    ("static_U_X_test", static_U_X_test, num_static_u),
    ("static_E_X_test", static_E_X_test, num_static_e),
    ("dynamic_X_test", dynamic_X_test, num_dynamic),
):
    assert 0 <= _indices.min().item() and _indices.max().item() < _vocab, f"{_name} has out-of-bound index!"

print("static_U_X:", static_U_X.shape)
print("static_E_X:", static_E_X.shape)
print("dynamic_X:", dynamic_X.shape)

# Empty (N, 0) integer placeholder passed in the third input slot of the model.
feature_X_dummy = torch.empty((static_U_X.shape[0], 0), device=device).long()

# Hyperparameters
SEED = 42
embedding_dim = 32
learning_rate = 0.001
n_epochs = 100

# Seed torch so weight initialisation and dropout are repeatable across runs.
torch.manual_seed(SEED)

# 6. Initialize model
# NOTE(review): the recorded run of this cell ended with
# "IndexError: index out of range in self" although every index printed below
# is within its vocab size. The cause is therefore presumably inside SeqFM
# (e.g. how it offsets or combines indices, or pos_emb_dim=0) - TODO confirm
# against the SeqFM source, which is not visible from this notebook.
model = SeqFM(
    static_u_m=num_static_u,
    feature_m=num_static_e,
    dynamic_m=num_dynamic,
    emb_dim=embedding_dim,
    dropout=0.2,
    n_layer=2,
    # also true for device strings such as "cuda:0"
    use_cuda=str(device).startswith("cuda"),
    unshared=False,
    pos_emb_dim=0
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = torch.nn.CrossEntropyLoss()

# 7. Print debug info
print("num_static_u:", num_static_u)
print("num_static_e:", num_static_e)
print("num_dynamic:", num_dynamic)
print("Max static_U_X:", static_U_X.max().item())
print("Max static_E_X:", static_E_X.max().item())
print("Max dynamic_X:", dynamic_X.max().item())
print("Min static_U_X:", static_U_X.min().item())
print("Min static_E_X:", static_E_X.min().item())
print("Min dynamic_X:", dynamic_X.min().item())

# 8. Training loop (full-batch: the whole training set is one step per epoch)
for epoch in range(1, n_epochs + 1):
    model.train()
    optimizer.zero_grad()

    preds = model([static_U_X, static_E_X, feature_X_dummy, dynamic_X, lengths_train])
    loss = loss_fn(preds, y_train)

    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch:03d} | Train Loss: {loss.item():.4f}")


Using device: cpu
static_U_X: torch.Size([2100])
static_E_X: torch.Size([2100, 12])
dynamic_X: torch.Size([2100, 1, 1])
num_static_u: 1790
num_static_e: 1855
num_dynamic: 9
Max static_U_X: 1303
Max static_E_X: 1835
Max dynamic_X: 8
Min static_U_X: 0
Min static_E_X: 0
Min dynamic_X: 1


IndexError: index out of range in self