In [59]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
path_to_data = "/home/jli159/blf/trace_cc_with_final_basins.csv"
trace = pd.read_csv(path_to_data)

In [62]:
trace.columns

Index(['x', 'y', 'dt', 'u_id', 'clust_labels', 'botscore', 'basin',
       'x_grid_float', 'y_grid_float', 'x_grid', 'y_grid', 'mu_dx', 'mu_dy',
       'var_dx', 'var_dy', 'speed', 'basin_id', 'dist_to_edge',
       'norm_dist_to_edge', 'dist_to_center', 'basin_avg_speed',
       'basin_convergence', 'basin_size', 'prev_basin', 'basin_change',
       'transition_confidence', 'basin_belief', 'cumulative_basin_belief',
       'time_in_basin', 'delta_x', 'delta_y', 'movement_magnitude',
       'time_diff', 'user_speed', 'user_basin_entropy', 'user_basin_changes',
       'user_total_moves', 'user_change_rate', 'user_avg_speed',
       'user_avg_movement', 'user_stability_index', 'final_basin'],
      dtype='object')

In [52]:
trace = trace[['u_id', 'dt', 'x', 'y', 'mu_dx', 'mu_dy', 'final_basin']]
trace['dt'] = pd.to_datetime(trace['dt'])
trace = trace.sort_values(['u_id', 'dt'])

# Calculate prev_x and prev_y
trace[['prev_x', 'prev_y']] = trace.groupby('u_id')[['x', 'y']].shift(1)

# For the first entry per user, where prev_x/prev_y is NaN, set it equal to x/y
mask_first = trace['prev_x'].isna()
trace.loc[mask_first, 'prev_x'] = trace.loc[mask_first, 'x']
trace.loc[mask_first, 'prev_y'] = trace.loc[mask_first, 'y']

# Now calculate dx/dy: will be zero for first entry per user
trace['dx'] = trace['x'] - trace['prev_x']
trace['dy'] = trace['y'] - trace['prev_y']

# --- Calculate cosine similarity for each movement ---
def cosine_similarity(row):
    a = np.array([row['dx'], row['dy']])
    b = np.array([row['mu_dx'], row['mu_dy']])
    if np.linalg.norm(a) == 0 or np.linalg.norm(b) == 0:
        return np.nan  # Or 0, if you want to treat zero movement as "neutral"
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

trace['cosine_sim'] = trace.apply(cosine_similarity, axis=1)
# Aggregate by user: take mean cosine similarity (or other stats if needed)
trace['cosine_sim'] = trace['cosine_sim'].fillna(0)

# For each user, check if their final_basin changes
def user_basin_change(x):
    return int(x['final_basin'].nunique() > 1)

# Map result to all rows for that user
trace['belief_change'] = trace.groupby('u_id')['final_basin'].transform(lambda x: int(x.nunique() > 1))
# Convert timestamp to datetime
trace['dt'] = pd.to_datetime(trace['dt'])
# Compute the time difference in days with granularity
trace['time_diff'] = trace.groupby('u_id')['dt'].diff().fillna(pd.Timedelta(seconds=0)).dt.total_seconds() / (24 * 3600)

In [21]:
trace.head(10)

Unnamed: 0,u_id,dt,x,y,mu_dx,mu_dy,final_basin,prev_x,prev_y,dx,dy,cosine_sim,belief_change,time_diff
0,12428,2020-09-21 17:07:48+00:00,22.076456,-4.707787,2.675405,6.800396,0,22.076456,-4.707787,0.0,0.0,0.0,0,0.0
1,12428,2020-10-05 13:11:46+00:00,21.852156,-5.451804,4.19684,2.252347,0,22.076456,-4.707787,-0.2243,-0.744017,-0.707082,0,13.836088
2,12428,2020-10-06 01:30:51+00:00,22.517124,-4.323751,1.484263,4.452345,0,21.852156,-5.451804,0.664968,1.128053,0.977849,0,0.513252
3,12428,2020-10-06 01:36:42+00:00,23.640362,-2.489461,-0.555071,-0.044118,0,22.517124,-4.323751,1.123238,1.83429,-0.588151,0,0.004063
4,29283,2020-07-26 15:00:07+00:00,23.296371,-4.816743,2.255973,2.391535,0,23.296371,-4.816743,0.0,0.0,0.0,1,0.0
5,29283,2020-08-08 22:40:35+00:00,26.746895,-3.141624,-4.145869,1.29341,1,23.296371,-4.816743,3.450523,1.675119,-0.728709,1,13.319769
6,29283,2020-08-11 01:25:22+00:00,27.124683,-2.451145,-2.292868,-1.383628,2,26.746895,-3.141624,0.377789,0.69048,-0.864219,1,2.114433
7,29283,2020-08-14 00:53:27+00:00,26.890812,-3.209466,-5.292282,0.10734,1,27.124683,-2.451145,-0.233871,-0.758322,0.275271,1,2.977836
8,29283,2020-08-17 13:36:35+00:00,27.144356,-2.407447,-1.966226,-1.316553,2,26.890812,-3.209466,0.253544,0.802019,-0.780965,1,3.529954
9,29283,2020-08-17 15:53:10+00:00,27.091753,-2.292559,-1.788803,-0.881881,2,27.144356,-2.407447,-0.052603,0.114888,-0.028656,1,0.09485


In [25]:
print(trace.isna().sum())
# Shows the number of NaNs for each column

u_id                0
dt                  0
x                   0
y                   0
mu_dx            1867
mu_dy            1867
final_basin         0
prev_x              0
prev_y              0
dx                  0
dy                  0
cosine_sim          0
belief_change       0
time_diff           0
dtype: int64


In [53]:
trace['mu_dx'] = trace['mu_dx'].fillna(0)
trace['mu_dy'] = trace['mu_dy'].fillna(0)

In [54]:
# Posts
WINDOW  = 20          # number of statements per sample
STRIDE  = 3           # slide by 5 rows ⇒ overlapping windows
FEATS   = ['cosine_sim', 'time_diff']   # ← add more features if desired

seqs, lengths, labels = [], [], []

for uid, sub in trace.groupby('u_id'):
    arr = sub[FEATS].to_numpy(dtype=np.float32)
    lbl = int(sub['belief_change'].iloc[0])
    # cut into [WINDOW]-long chunks; keep a final shorter tail
    for start in range(0, len(arr), STRIDE):
        chunk = arr[start:start+WINDOW]
        if len(chunk)==0:               # shouldn’t happen
            continue
        seqs.append(torch.tensor(chunk))
        lengths.append(len(chunk))
        labels.append(lbl)

print(f"Total samples: {len(seqs):,}  |  Unique users: {trace.u_id.nunique():,}")

Total samples: 60,058  |  Unique users: 13,236


In [40]:
# Or in days
WINDOW = 30      # width of each window  (days)
STRIDE = 5       # slide amount          (days)
FEATS       = ['cosine_sim', 'time_diff']   # same feature list

seqs, lengths, labels = [], [], []

for uid, sub in trace.groupby('u_id'):
    sub = sub.sort_values('dt')                  # already sorted, but be safe
    label = int(sub['belief_change'].iloc[0])    # constant for this user
    
    # iterate over the user’s timeline with a rolling [start, start+WINDOW]
    win_start = sub['dt'].min()
    last_time = sub['dt'].max()
    
    while win_start <= last_time:
        win_end = win_start + pd.Timedelta(days=WINDOW)
        mask    = (sub['dt'] >= win_start) & (sub['dt'] <  win_end)
        chunk_df = sub.loc[mask, FEATS]
        
        if chunk_df.empty:                # no statements in this slice → skip
            win_start += pd.Timedelta(days=STRIDE)
            continue
        
        seqs.append(torch.tensor(chunk_df.to_numpy(dtype=np.float32)))
        lengths.append(len(chunk_df))
        labels.append(label)
        
        win_start += pd.Timedelta(days=STRIDE)

print(f"Day-windows: {len(seqs):,}  |  Unique users: {trace.u_id.nunique():,}")

Day-windows: 126,179  |  Unique users: 13,236


In [55]:
class BeliefWindowDataset(Dataset):
    def __init__(self, sequences, lengths, labels):
        self.sequences = sequences
        self.lengths   = lengths
        self.labels    = labels

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return self.sequences[idx], self.lengths[idx], self.labels[idx]

def collate(batch):
    seqs, lens, lbls = zip(*batch)
    # pad to longest in mini-batch (NOT to WINDOW necessarily)
    padded = pad_sequence(seqs, batch_first=True)          # [B, Lmax, F]
    lens   = torch.tensor(lens)
    lbls   = torch.tensor(lbls, dtype=torch.float32)
    return padded, lens, lbls

In [56]:
# 4.1  collect a set of windows per user
by_user = {}
for (seq, ln, lbl), uid in zip(zip(seqs, lengths, labels), trace.groupby('u_id').groups):
    by_user.setdefault(uid, []).append((seq, ln, lbl))

uids = np.array(list(by_user.keys()))
train_uid, temp_uid = train_test_split(uids, test_size=0.3, random_state=SEED, stratify=[
                                       by_user[uid][0][2] for uid in uids])
val_uid,  test_uid  = train_test_split(temp_uid, test_size=0.5, random_state=SEED, stratify=[
                                       by_user[uid][0][2] for uid in temp_uid])

def rebuild(uid_subset):
    s,l,y = [],[],[]
    for uid in uid_subset:
        for seq, ln, lbl in by_user[uid]:
            s.append(seq); l.append(ln); y.append(lbl)
    return BeliefWindowDataset(s,l,y)

train_ds, val_ds, test_ds = map(rebuild, [train_uid,val_uid,test_uid])
train_dl = DataLoader(train_ds, batch_size=64, shuffle=True,  collate_fn=collate)
val_dl   = DataLoader(val_ds,   batch_size=128, shuffle=False, collate_fn=collate)
test_dl  = DataLoader(test_ds,  batch_size=128, shuffle=False, collate_fn=collate)

In [57]:
class BeliefLSTM(nn.Module):
    def __init__(self, in_dims, hid=64, layers=2, bidir=True, drop=0.3):
        super().__init__()
        self.lstm = nn.LSTM(in_dims, hid, num_layers=layers, 
                            batch_first=True, bidirectional=bidir,
                            dropout=drop if layers>1 else 0.0)
        out_dim = hid * (2 if bidir else 1)
        self.classifier = nn.Sequential(
            nn.LayerNorm(out_dim),
            nn.Linear(out_dim, 32),
            nn.ReLU(),
            nn.Dropout(drop),
            nn.Linear(32, 1)
        )

    def forward(self, x, lens):
        # x: [B, L, F], lens: [B]
        packed = pack_padded_sequence(x, lens.cpu(), batch_first=True, enforce_sorted=False)
        _, (h_n, _) = self.lstm(packed)      # h_n: [layers*dir, B, hid]
        # concat final fw + bw if bidirectional
        if self.lstm.bidirectional:
            h_last = torch.cat((h_n[-2], h_n[-1]), dim=-1)
        else:
            h_last = h_n[-1]
        logits = self.classifier(h_last).squeeze(-1)        # [B]
        return logits
        

model = BeliefLSTM(in_dims=len(FEATS)).to(DEVICE)
opt    = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
loss_fn= nn.BCEWithLogitsLoss()

BEST_AUROC, patience, PATIENCE = 0.0, 0, 5
for epoch in range(1, 51):      # max 50 epochs
    # 6.1  Train
    model.train(); tot_loss = 0
    for xb, lens, yb in train_dl:
        xb, lens, yb = xb.to(DEVICE), lens.to(DEVICE), yb.to(DEVICE)
        opt.zero_grad(set_to_none=True)
        logits = model(xb, lens)
        loss   = loss_fn(logits, yb)
        loss.backward()
        opt.step()
        tot_loss += loss.item()*len(yb)
    train_loss = tot_loss/len(train_ds)

    # 6.2  Validation
    model.eval(); all_logits, all_y = [], []
    with torch.no_grad():
        for xb, lens, yb in val_dl:
            logits = model(xb.to(DEVICE), lens.to(DEVICE))
            all_logits.append(torch.sigmoid(logits).cpu())
            all_y.append(yb)
    probs = torch.cat(all_logits).numpy()
    ys    = torch.cat(all_y).numpy()
    auroc = roc_auc_score(ys, probs)

    print(f"Epoch {epoch:02d} | train loss {train_loss:.4f} | val AUROC {auroc:.4f}")

    # early stopping
    if auroc > BEST_AUROC + 1e-4:       # significant improvement
        BEST_AUROC, patience = auroc, 0
        torch.save(model.state_dict(), "best_lstm.pt")
    else:
        patience += 1
        if patience >= PATIENCE:
            print("→ early-stopping")
            break

model.load_state_dict(torch.load("best_lstm.pt"))
model.eval(); all_logits, all_y = [], []

with torch.no_grad():
    for xb, lens, yb in test_dl:
        logits = model(xb.to(DEVICE), lens.to(DEVICE))
        all_logits.append(torch.sigmoid(logits).cpu())
        all_y.append(yb)

probs = torch.cat(all_logits).numpy()
ys    = torch.cat(all_y).numpy()
yhat  = (probs >= 0.5).astype(int)

print(classification_report(ys, yhat, digits=3))
print("Confusion matrix:\n", confusion_matrix(ys, yhat))
print("Test AUROC:", roc_auc_score(ys, probs).round(4))

Epoch 01 | train loss 0.5922 | val AUROC 0.7086
Epoch 02 | train loss 0.5658 | val AUROC 0.7101
Epoch 03 | train loss 0.5632 | val AUROC 0.7281
Epoch 04 | train loss 0.5580 | val AUROC 0.7308
Epoch 05 | train loss 0.5517 | val AUROC 0.7357
Epoch 06 | train loss 0.5340 | val AUROC 0.7561
Epoch 07 | train loss 0.5247 | val AUROC 0.7638
Epoch 08 | train loss 0.5221 | val AUROC 0.7633
Epoch 09 | train loss 0.5155 | val AUROC 0.7589
Epoch 10 | train loss 0.5170 | val AUROC 0.7561
Epoch 11 | train loss 0.5124 | val AUROC 0.7424
Epoch 12 | train loss 0.5195 | val AUROC 0.7638
→ early-stopping
              precision    recall  f1-score   support

         0.0      0.756     0.336     0.465       589
         1.0      0.773     0.954     0.854      1397

    accuracy                          0.771      1986
   macro avg      0.764     0.645     0.660      1986
weighted avg      0.768     0.771     0.739      1986

Confusion matrix:
 [[ 198  391]
 [  64 1333]]
Test AUROC: 0.7398
