In [4]:
!pip install torch_geometric



In [5]:
# =============================================================================
# Cell 1: Imports and Configuration
# =============================================================================

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import HeteroData
from torch_geometric.nn import GCNConv, RGCNConv
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
from collections import defaultdict
import time
import warnings
warnings.filterwarnings('ignore')

# Configuration
class Config:
    # Data
    SEED = 42
    TEST_STUDENT_RATIO = 0.15      # 15% students for TEST (Split B)
    VAL_STUDENT_RATIO = 0.15       # 15% of TRAIN students for VAL

    # Model Architecture
    FEATURE_DIM = 5                # 5D statistical features (base for all nodes)
    EMBED_DIM = 32
    HIDDEN_DIM = 64
    NUM_GNN_LAYERS = 2
    DROPOUT = 0.2
    NUM_BASES = 2                  # For RGCN basis decomposition

    # Training
    BATCH_SIZE = 512               # Larger batch for efficiency
    LEARNING_RATE = 1e-3
    WEIGHT_DECAY = 0.01
    EPOCHS = 100
    PATIENCE = 10
    GRAD_CLIP = 1.0

    # Mastery (M3a specific)
    MASTERY_INIT = 0.5             # Initial mastery value [0.5 or 'global_mean']
    LAMBDA_EMA = 0.1               # Mastery update learning rate (0.05, 0.1, 0.2)

    # Device
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

config = Config()

# Set seeds for reproducibility
np.random.seed(config.SEED)
torch.manual_seed(config.SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(config.SEED)

print(f"Device: {config.DEVICE}")
print(f"PyTorch version: {torch.__version__}")
print(f"Configuration loaded successfully (M3a - Mastery as Features)")  # ‚Üê MODIFI√â

Device: cuda
PyTorch version: 2.9.0+cu128
Configuration loaded successfully (M3a - Mastery as Features)


In [6]:
# =============================================================================
# Cell 2: Load and Explore Dataset
# =============================================================================

# Load the Algebra 2005-2006 dataset
DATA_PATH = "algebra_2005_2006_train.txt"

# Load with tab separator (standard format for this dataset)
df_raw = pd.read_csv(DATA_PATH, sep='\t', low_memory=False)

print("=" * 60)
print("DATASET OVERVIEW")
print("=" * 60)
print(f"Shape: {df_raw.shape}")
print(f"\nColumns ({len(df_raw.columns)}):")
for i, col in enumerate(df_raw.columns):
    print(f"  {i+1:2d}. {col}")

print("\n" + "=" * 60)
print("FIRST 3 ROWS")
print("=" * 60)
display(df_raw.head(3))

print("\n" + "=" * 60)
print("DATA TYPES")
print("=" * 60)
print(df_raw.dtypes)

print("\n" + "=" * 60)
print("MISSING VALUES")
print("=" * 60)
missing = df_raw.isnull().sum()
missing_pct = (missing / len(df_raw) * 100).round(2)
missing_df = pd.DataFrame({'Missing': missing, 'Percent': missing_pct})
print(missing_df[missing_df['Missing'] > 0])

print("\n" + "=" * 60)
print("KEY STATISTICS")
print("=" * 60)
print(f"Total interactions: {len(df_raw):,}")
print(f"Unique students: {df_raw['Anon Student Id'].nunique():,}")
print(f"Unique problems (questions): {df_raw['Problem Name'].nunique():,}")
print(f"Unique steps: {df_raw[['Problem Name', 'Step Name']].drop_duplicates().shape[0]:,}")
print(f"Unique KC(Default): {df_raw['KC(Default)'].nunique():,}")
print(f"KC(Default) missing: {df_raw['KC(Default)'].isnull().sum():,} ({df_raw['KC(Default)'].isnull().mean()*100:.2f}%)")

print("\n" + "=" * 60)
print("TARGET DISTRIBUTION (Correct First Attempt)")
print("=" * 60)
print(df_raw['Correct First Attempt'].value_counts(normalize=True).round(4))

DATASET OVERVIEW
Shape: (809694, 19)

Columns (19):
   1. Row
   2. Anon Student Id
   3. Problem Hierarchy
   4. Problem Name
   5. Problem View
   6. Step Name
   7. Step Start Time
   8. First Transaction Time
   9. Correct Transaction Time
  10. Step End Time
  11. Step Duration (sec)
  12. Correct Step Duration (sec)
  13. Error Step Duration (sec)
  14. Correct First Attempt
  15. Incorrects
  16. Hints
  17. Corrects
  18. KC(Default)
  19. Opportunity(Default)

FIRST 3 ROWS


Unnamed: 0,Row,Anon Student Id,Problem Hierarchy,Problem Name,Problem View,Step Name,Step Start Time,First Transaction Time,Correct Transaction Time,Step End Time,Step Duration (sec),Correct Step Duration (sec),Error Step Duration (sec),Correct First Attempt,Incorrects,Hints,Corrects,KC(Default),Opportunity(Default)
0,1,0BrbPbwCMz,"Unit ES_04, Section ES_04-1",EG4-FIXED,1,3(x+2) = 15,2005-09-09 12:24:35.0,2005-09-09 12:24:49.0,2005-09-09 12:25:15.0,2005-09-09 12:25:15.0,40.0,,40.0,0,2,3,1,[SkillRule: Eliminate Parens; {CLT nested; CLT...,1
1,2,0BrbPbwCMz,"Unit ES_04, Section ES_04-1",EG4-FIXED,1,x+2 = 5,2005-09-09 12:25:15.0,2005-09-09 12:25:31.0,2005-09-09 12:25:31.0,2005-09-09 12:25:31.0,16.0,16.0,,1,0,0,1,"[SkillRule: Remove constant; {ax+b=c, positive...",1~~1
2,3,0BrbPbwCMz,"Unit ES_04, Section ES_04-1",EG40,1,2-8y = -4,2005-09-09 12:25:36.0,2005-09-09 12:25:43.0,2005-09-09 12:26:12.0,2005-09-09 12:26:12.0,36.0,,36.0,0,2,3,1,"[SkillRule: Remove constant; {ax+b=c, positive...",2



DATA TYPES
Row                              int64
Anon Student Id                 object
Problem Hierarchy               object
Problem Name                    object
Problem View                     int64
Step Name                       object
Step Start Time                 object
First Transaction Time          object
Correct Transaction Time        object
Step End Time                   object
Step Duration (sec)            float64
Correct Step Duration (sec)    float64
Error Step Duration (sec)      float64
Correct First Attempt            int64
Incorrects                       int64
Hints                            int64
Corrects                         int64
KC(Default)                     object
Opportunity(Default)            object
dtype: object

MISSING VALUES
                             Missing  Percent
Step Start Time                  919     0.11
Correct Transaction Time       25851     3.19
Step Duration (sec)              919     0.11
Correct Step Duration (sec)   189

In [7]:
# =============================================================================
# Cell 3 (Corrected): Data Cleaning - Keep Missing KC as UNKNOWN
# =============================================================================

# Start with a copy
df = df_raw.copy()

print("=" * 60)
print("STEP 1: Drop rows with missing critical fields (NOT KC)")
print("=" * 60)

# Critical fields: student, step, target (NOT KC - we'll handle separately)
critical_cols = ['Anon Student Id', 'Problem Name', 'Step Name', 'Correct First Attempt']
before_drop = len(df)
df = df.dropna(subset=critical_cols)
after_drop = len(df)
print(f"Dropped {before_drop - after_drop:,} rows ({(before_drop - after_drop)/before_drop*100:.2f}%)")
print(f"Remaining: {after_drop:,} rows")

print("\n" + "=" * 60)
print("STEP 2: Handle missing KC(Default)")
print("=" * 60)

kc_missing_before = df['KC(Default)'].isnull().sum()
print(f"Missing KC(Default): {kc_missing_before:,} ({kc_missing_before/len(df)*100:.2f}%)")

# Fill missing KC with special token
df['KC(Default)'] = df['KC(Default)'].fillna('UNKNOWN_KC')
print(f"Filled with 'UNKNOWN_KC' token")

print("\n" + "=" * 60)
print("STEP 3: Create canonical identifiers")
print("=" * 60)

# Student ID
df['student_id'] = df['Anon Student Id'].astype(str).str.strip()

# Question ID (Problem Name)
df['question_id'] = df['Problem Name'].astype(str).str.strip()

# Step ID (Problem Name + Step Name)
df['step_id'] = df['Problem Name'].astype(str).str.strip() + "||" + df['Step Name'].astype(str).str.strip()

# KC ID (KC(Default) as composite string)
df['kc_id'] = df['KC(Default)'].astype(str).str.strip()

# Target
df['correct'] = df['Correct First Attempt'].astype(int)

print(f"Unique students: {df['student_id'].nunique():,}")
print(f"Unique questions: {df['question_id'].nunique():,}")
print(f"Unique steps: {df['step_id'].nunique():,}")
print(f"Unique KCs (including UNKNOWN): {df['kc_id'].nunique():,}")

print("\n" + "=" * 60)
print("STEP 4: Parse timestamps and create temporal ordering")
print("=" * 60)

# Parse First Transaction Time (primary timestamp)
df['timestamp'] = pd.to_datetime(df['First Transaction Time'], errors='coerce')

# Fallback to Step Start Time
mask_missing_ts = df['timestamp'].isnull()
df.loc[mask_missing_ts, 'timestamp'] = pd.to_datetime(
    df.loc[mask_missing_ts, 'Step Start Time'], errors='coerce'
)

# Check remaining missing timestamps
ts_missing = df['timestamp'].isnull().sum()
print(f"Rows with missing timestamp after fallback: {ts_missing}")

if ts_missing > 0:
    # Drop only these (should be minimal)
    df = df.dropna(subset=['timestamp'])
    print(f"Dropped {ts_missing} rows with no valid timestamp")

# Sort by student and timestamp
df = df.sort_values(['student_id', 'timestamp']).reset_index(drop=True)

# Create time index within each student
df['time_idx'] = df.groupby('student_id').cumcount()

print(f"Final dataset size: {len(df):,} rows")

print("\n" + "=" * 60)
print("STEP 5: Process behavioral features")
print("=" * 60)

# Fill missing durations with median
duration_col = 'Step Duration (sec)'
median_duration = df[duration_col].median()
df[duration_col] = df[duration_col].fillna(median_duration)

# Log transform duration
df['log_duration'] = np.log1p(df[duration_col].clip(lower=0))

# Clip extreme values
df['Incorrects'] = df['Incorrects'].clip(upper=10)
df['Hints'] = df['Hints'].clip(upper=10)

print(f"Median duration: {median_duration:.2f} sec")
print(f"Log duration range: [{df['log_duration'].min():.2f}, {df['log_duration'].max():.2f}]")

print("\n" + "=" * 60)
print("STEP 6: Final dataset summary")
print("=" * 60)

print(f"Total interactions: {len(df):,}")
print(f"Unique students: {df['student_id'].nunique():,}")
print(f"Unique questions: {df['question_id'].nunique():,}")
print(f"Unique steps: {df['step_id'].nunique():,}")
print(f"Unique KCs: {df['kc_id'].nunique():,}")
print(f"  - Real KCs: {(df['kc_id'] != 'UNKNOWN_KC').sum():,} interactions")
print(f"  - UNKNOWN_KC: {(df['kc_id'] == 'UNKNOWN_KC').sum():,} interactions")

print(f"\nTarget distribution:")
print(df['correct'].value_counts(normalize=True).round(4))

print("\n" + "=" * 60)
print("STEP 7: Verify temporal ordering")
print("=" * 60)

sample_student = df['student_id'].iloc[0]
sample_seq = df[df['student_id'] == sample_student][['time_idx', 'timestamp', 'question_id', 'kc_id', 'correct']].head(5)
print(f"Sample student '{sample_student}' first 5 interactions:")
display(sample_seq)

STEP 1: Drop rows with missing critical fields (NOT KC)
Dropped 0 rows (0.00%)
Remaining: 809,694 rows

STEP 2: Handle missing KC(Default)
Missing KC(Default): 202,669 (25.03%)
Filled with 'UNKNOWN_KC' token

STEP 3: Create canonical identifiers
Unique students: 574
Unique questions: 1,084
Unique steps: 210,710
Unique KCs (including UNKNOWN): 437

STEP 4: Parse timestamps and create temporal ordering
Rows with missing timestamp after fallback: 0
Final dataset size: 809,694 rows

STEP 5: Process behavioral features
Median duration: 11.00 sec
Log duration range: [0.00, 7.90]

STEP 6: Final dataset summary
Total interactions: 809,694
Unique students: 574
Unique questions: 1,084
Unique steps: 210,710
Unique KCs: 437
  - Real KCs: 607,025 interactions
  - UNKNOWN_KC: 202,669 interactions

Target distribution:
correct
1    0.7665
0    0.2335
Name: proportion, dtype: float64

STEP 7: Verify temporal ordering
Sample student '02ZjVTxC34' first 5 interactions:


Unnamed: 0,time_idx,timestamp,question_id,kc_id,correct
0,0,2005-09-06 13:00:23,LDEMO_WKST,UNKNOWN_KC,1
1,1,2005-09-06 13:00:44,LDEMO_WKST,Identifying units,1
2,2,2005-09-06 13:01:12,LDEMO_WKST,UNKNOWN_KC,1
3,3,2005-09-06 13:01:46,LDEMO_WKST,Identifying units,1
4,4,2005-09-06 13:02:27,LDEMO_WKST,"Entering a given~~Convert unit, multiplier",1


In [8]:
# =============================================================================
# Cell 4: Separate TEST Set + 5-Fold Student-Level CV Setup
# =============================================================================

import time
from sklearn.model_selection import KFold, train_test_split
from collections import defaultdict

print("=" * 60)
print("STEP 1: SEPARATE TEST STUDENTS (HELD OUT ENTIRELY)")
print("=" * 60)

all_students = df['student_id'].unique()
n_students = len(all_students)
print(f"Total students: {n_students}")

# Hold out 15% of students as TEST - NEVER touched during CV
non_test_students, test_students = train_test_split(
    all_students,
    test_size=config.TEST_STUDENT_RATIO,
    random_state=config.SEED
)

df_test_final = df[df['student_id'].isin(test_students)].copy()
df_non_test = df[df['student_id'].isin(non_test_students)].copy()

print(f"\nTEST set (held out):")
print(f"  Students: {len(test_students)} ({len(test_students)/n_students*100:.1f}%)")
print(f"  Interactions: {len(df_test_final):,}")

print(f"\nNon-test (enters K-Fold CV):")
print(f"  Students: {len(non_test_students)} ({len(non_test_students)/n_students*100:.1f}%)")
print(f"  Interactions: {len(df_non_test):,}")

print("\n" + "=" * 60)
print("STEP 2: DEFINE 5-FOLD STUDENT-LEVEL SPLITS")
print("=" * 60)

kf = KFold(n_splits=5, shuffle=True, random_state=config.SEED)

fold_assignments = {}
for fold_idx, (train_indices, val_indices) in enumerate(kf.split(non_test_students)):
    train_studs = non_test_students[train_indices]
    val_studs = non_test_students[val_indices]
    fold_assignments[fold_idx] = {
        'train_students': train_studs,
        'val_students': val_studs
    }
    print(f"\nFold {fold_idx+1}:")
    print(f"  TRAIN: {len(train_studs)} students")
    print(f"  VAL:   {len(val_studs)} students")

    # Verify no overlap
    overlap = set(train_studs) & set(val_studs)
    assert len(overlap) == 0, f"LEAK in fold {fold_idx+1}!"

    # Verify no test leakage
    test_leak = set(train_studs) & set(test_students)
    assert len(test_leak) == 0, f"TEST LEAK in fold {fold_idx+1}!"

print("\n‚úì All folds verified: no student overlap, no test leakage")
print(f"‚úì TEST set ({len(test_students)} students) completely isolated")

STEP 1: SEPARATE TEST STUDENTS (HELD OUT ENTIRELY)
Total students: 574

TEST set (held out):
  Students: 87 (15.2%)
  Interactions: 118,261

Non-test (enters K-Fold CV):
  Students: 487 (84.8%)
  Interactions: 691,433

STEP 2: DEFINE 5-FOLD STUDENT-LEVEL SPLITS

Fold 1:
  TRAIN: 389 students
  VAL:   98 students

Fold 2:
  TRAIN: 389 students
  VAL:   98 students

Fold 3:
  TRAIN: 390 students
  VAL:   97 students

Fold 4:
  TRAIN: 390 students
  VAL:   97 students

Fold 5:
  TRAIN: 390 students
  VAL:   97 students

‚úì All folds verified: no student overlap, no test leakage
‚úì TEST set (87 students) completely isolated


In [9]:
# =============================================================================
# Cell 5: Complete Pipeline Functions (Reusable Per Fold) ‚Äî M3a
# =============================================================================

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import HeteroData
from torch_geometric.nn import GCNConv, RGCNConv
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import roc_auc_score, accuracy_score
from collections import defaultdict
import time

NUM_FEATURES = 5

# ============================================================
# FUNCTION 1: Build Entity Mappings (UNCHANGED)
# ============================================================
def build_entity_mappings(df_train):
    train_students = sorted(df_train['student_id'].unique())
    train_questions = sorted(df_train['question_id'].unique())
    train_steps = sorted(df_train['step_id'].unique())
    train_kcs = sorted(df_train['kc_id'].unique())

    stu2idx = {s: i for i, s in enumerate(train_students)}
    q2idx = {q: i for i, q in enumerate(train_questions)}
    t2idx = {t: i for i, t in enumerate(train_steps)}
    c2idx = {c: i for i, c in enumerate(train_kcs)}

    unk_indices = {
        'student': len(train_students),
        'question': len(train_questions),
        'step': len(train_steps),
        'kc': len(train_kcs)
    }

    entity_counts = {
        'num_students': len(train_students) + 1,
        'num_questions': len(train_questions) + 1,
        'num_steps': len(train_steps) + 1,
        'num_kcs': len(train_kcs) + 1,
    }

    mappings = {
        'stu2idx': stu2idx, 'q2idx': q2idx,
        't2idx': t2idx, 'c2idx': c2idx
    }

    return mappings, entity_counts, unk_indices


# ============================================================
# FUNCTION 2: Build Heterogeneous Graph (UNCHANGED)
# ============================================================
def build_graph(df_train, mappings, entity_counts, unk_indices):
    stu2idx = mappings['stu2idx']
    q2idx = mappings['q2idx']
    t2idx = mappings['t2idx']
    c2idx = mappings['c2idx']

    data = HeteroData()

    data['student'].num_nodes = entity_counts['num_students']
    data['question'].num_nodes = entity_counts['num_questions']
    data['step'].num_nodes = entity_counts['num_steps']
    data['kc'].num_nodes = entity_counts['num_kcs']

    qt_pairs = df_train[['question_id', 'step_id']].drop_duplicates()
    q_idx_list = [q2idx[r['question_id']] for _, r in qt_pairs.iterrows()]
    t_idx_list = [t2idx[r['step_id']] for _, r in qt_pairs.iterrows()]

    data['question', 'contains', 'step'].edge_index = torch.tensor([q_idx_list, t_idx_list], dtype=torch.long)
    data['step', 'belongs_to', 'question'].edge_index = torch.tensor([t_idx_list, q_idx_list], dtype=torch.long)

    tc_pairs = df_train[['step_id', 'kc_id']].drop_duplicates()
    t_idx_list2 = [t2idx[r['step_id']] for _, r in tc_pairs.iterrows()]
    c_idx_list = [c2idx[r['kc_id']] for _, r in tc_pairs.iterrows()]

    data['step', 'requires', 'kc'].edge_index = torch.tensor([t_idx_list2, c_idx_list], dtype=torch.long)
    data['kc', 'required_by', 'step'].edge_index = torch.tensor([c_idx_list, t_idx_list2], dtype=torch.long)

    sq_pairs = df_train[['student_id', 'question_id']].drop_duplicates()
    s_idx_list = [stu2idx[r['student_id']] for _, r in sq_pairs.iterrows()]
    q_idx_list2 = [q2idx[r['question_id']] for _, r in sq_pairs.iterrows()]

    data['student', 'attempted', 'question'].edge_index = torch.tensor([s_idx_list, q_idx_list2], dtype=torch.long)
    data['question', 'attempted_by', 'student'].edge_index = torch.tensor([q_idx_list2, s_idx_list], dtype=torch.long)

    total_edges = sum(
        data[et].edge_index.shape[1]
        for et in data.edge_types
    )

    return data, total_edges


# ============================================================
# FUNCTION 3: Compute Node Features (Base 5D)
# ============================================================
def compute_node_features(df_train, mappings, entity_counts, unk_indices):
    """Compute 5D features for all entities (WITHOUT mastery)"""
    def compute_features_for_type(df, entity_col):
        grouped = df.groupby(entity_col).agg({
            'correct': ['count', 'mean'],
            'log_duration': 'mean',
            'Hints': 'mean',
            'Incorrects': 'mean'
        })
        grouped.columns = ['freq', 'correct_rate', 'avg_log_dur', 'avg_hints', 'avg_incorrects']
        grouped = grouped.reset_index()
        grouped['difficulty'] = 1 - grouped['correct_rate']
        grouped['log_freq'] = np.log1p(grouped['freq'])

        features = {}
        feat_cols = ['log_freq', 'difficulty', 'avg_log_dur', 'avg_hints', 'avg_incorrects']
        for _, row in grouped.iterrows():
            features[row[entity_col]] = row[feat_cols].values.astype(np.float32)
        return features

    def to_tensor(features_dict, idx_map, num_with_unk, unk_idx):
        tensor = torch.zeros(num_with_unk, NUM_FEATURES, dtype=torch.float32)
        all_feats = []
        for entity_id, idx in idx_map.items():
            if entity_id in features_dict:
                tensor[idx] = torch.tensor(features_dict[entity_id])
                all_feats.append(features_dict[entity_id])
        if all_feats:
            tensor[unk_idx] = torch.tensor(np.mean(all_feats, axis=0))
        return tensor

    def normalize(tensor):
        mean = tensor.mean(dim=0, keepdim=True)
        std = tensor.std(dim=0, keepdim=True) + 1e-8
        return (tensor - mean) / std

    stu_feats = compute_features_for_type(df_train, 'student_id')
    q_feats = compute_features_for_type(df_train, 'question_id')
    t_feats = compute_features_for_type(df_train, 'step_id')
    c_feats = compute_features_for_type(df_train, 'kc_id')

    feat_tensors = {
        'student': normalize(to_tensor(stu_feats, mappings['stu2idx'], entity_counts['num_students'], unk_indices['student'])),
        'question': normalize(to_tensor(q_feats, mappings['q2idx'], entity_counts['num_questions'], unk_indices['question'])),
        'step': normalize(to_tensor(t_feats, mappings['t2idx'], entity_counts['num_steps'], unk_indices['step'])),
        'kc': normalize(to_tensor(c_feats, mappings['c2idx'], entity_counts['num_kcs'], unk_indices['kc'])),
    }

    return feat_tensors


# ============================================================
# üÜï FUNCTION 3b: Compute Features WITH Mastery for Students
# ============================================================
def compute_node_features_with_mastery(df_train, mappings, entity_counts, unk_indices, mastery_matrix):
    """
    Compute features WITH mastery for students
    Students: [5D_stats + num_kcs mastery values]
    Others: [5D_stats] (unchanged)
    """
    # Get base 5D features
    base_features = compute_node_features(df_train, mappings, entity_counts, unk_indices)

    # For students: concatenate 5D + mastery
    student_features_5d = base_features['student']  # [num_students, 5]

    # Concatenate with mastery matrix
    student_features_with_mastery = torch.cat([
        student_features_5d,
        mastery_matrix  # [num_students, num_kcs]
    ], dim=-1)
    # Shape: [num_students, 5 + num_kcs]

    return {
        'student': student_features_with_mastery,  # [num_students, 5 + num_kcs]
        'question': base_features['question'],      # [num_questions, 5]
        'step': base_features['step'],              # [num_steps, 5]
        'kc': base_features['kc'],                  # [num_kcs, 5]
    }


# ============================================================
# FUNCTION 4: Model Components
# ============================================================

class NodeEncoder(nn.Module):
    """Encode node features to embedding space"""
    def __init__(self, input_dim, embed_dim, dropout=0.1):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, embed_dim),
            nn.LayerNorm(embed_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(embed_dim, embed_dim),
            nn.LayerNorm(embed_dim)
        )

    def forward(self, x):
        return self.encoder(x)


# ============================================================
# üÜï MODEL: M3a - RGCN with Mastery as Features
# ============================================================

class RGCNWithMasteryFeatures(nn.Module):
    """
    M3a: RGCN with mastery as student features
    - Single RGCN (like M2) on full graph (S,Q,T,C)
    - Students have mastery features concatenated
    - 6 relations (S‚ÜîQ, Q‚ÜîT, T‚ÜîC)
    """

    NUM_RELATIONS = 6  # Like M2

    def __init__(self, num_students, num_questions, num_steps, num_kcs,
                 feature_dim_base=5, embed_dim=32, hidden_dim=64,
                 num_gnn_layers=2, dropout=0.2,
                 mastery_init=0.5):
        super().__init__()

        self.num_students = num_students
        self.num_questions = num_questions
        self.num_steps = num_steps
        self.num_kcs = num_kcs
        self.embed_dim = embed_dim

        # Global offsets (like M2)
        self.student_offset = 0
        self.question_offset = num_students
        self.step_offset = num_students + num_questions
        self.kc_offset = num_students + num_questions + num_steps

        # === Node Encoders ===
        # Students: 5 + num_kcs features
        self.student_encoder = NodeEncoder(
            feature_dim_base + num_kcs,  # ‚Üê Mastery as features
            embed_dim,
            dropout
        )
        # Others: 5 features
        self.question_encoder = NodeEncoder(feature_dim_base, embed_dim, dropout)
        self.step_encoder = NodeEncoder(feature_dim_base, embed_dim, dropout)
        self.kc_encoder = NodeEncoder(feature_dim_base, embed_dim, dropout)

        # === SINGLE RGCN (6 relations like M2) ===
        self.gnn_layers = nn.ModuleList([
            RGCNConv(
                embed_dim, embed_dim,
                num_relations=self.NUM_RELATIONS,
                num_bases=2
            )
            for _ in range(num_gnn_layers)
        ])
        self.gnn_norms = nn.ModuleList([
            nn.LayerNorm(embed_dim) for _ in range(num_gnn_layers)
        ])
        self.gnn_dropout = nn.Dropout(dropout)

        # === Prediction Head ===
        pred_input_dim = embed_dim * 3  # [h_s, h_t, h_c]
        self.prediction_head = nn.Sequential(
            nn.Linear(pred_input_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim // 2, 1)
        )

        # === Mastery Matrix ===
        self.mastery_init = mastery_init
        self.register_buffer('mastery_matrix', None)

        # Cache
        self._cached_edge_index = None
        self._cached_edge_type = None

    def init_mastery_matrix(self, device):
        """Initialize mastery matrix"""
        self.mastery_matrix = torch.full(
            (self.num_students, self.num_kcs),
            self.mastery_init,
            dtype=torch.float32,
            device=device
        )
        print(f"  ‚úì Mastery matrix initialized: [{self.num_students}, {self.num_kcs}] with value={self.mastery_init}")

    def _build_edge_index(self, hetero_data, device):
        """Build unified edge_index for all 6 relations (like M2)"""
        if self._cached_edge_index is not None:
            return self._cached_edge_index, self._cached_edge_type

        all_edges = []
        all_types = []

        # Relation 0: Q ‚Üí T
        q_t = hetero_data['question', 'contains', 'step'].edge_index.clone()
        q_t[0] += self.question_offset
        q_t[1] += self.step_offset
        all_edges.append(q_t)
        all_types.append(torch.zeros(q_t.size(1), dtype=torch.long))

        # Relation 1: T ‚Üí Q
        t_q = hetero_data['step', 'belongs_to', 'question'].edge_index.clone()
        t_q[0] += self.step_offset
        t_q[1] += self.question_offset
        all_edges.append(t_q)
        all_types.append(torch.ones(t_q.size(1), dtype=torch.long))

        # Relation 2: T ‚Üí C
        t_c = hetero_data['step', 'requires', 'kc'].edge_index.clone()
        t_c[0] += self.step_offset
        t_c[1] += self.kc_offset
        all_edges.append(t_c)
        all_types.append(torch.full((t_c.size(1),), 2, dtype=torch.long))

        # Relation 3: C ‚Üí T
        c_t = hetero_data['kc', 'required_by', 'step'].edge_index.clone()
        c_t[0] += self.kc_offset
        c_t[1] += self.step_offset
        all_edges.append(c_t)
        all_types.append(torch.full((c_t.size(1),), 3, dtype=torch.long))

        # Relation 4: S ‚Üí Q
        s_q = hetero_data['student', 'attempted', 'question'].edge_index.clone()
        s_q[1] += self.question_offset
        all_edges.append(s_q)
        all_types.append(torch.full((s_q.size(1),), 4, dtype=torch.long))

        # Relation 5: Q ‚Üí S
        q_s = hetero_data['question', 'attempted_by', 'student'].edge_index.clone()
        q_s[0] += self.question_offset
        all_edges.append(q_s)
        all_types.append(torch.full((q_s.size(1),), 5, dtype=torch.long))

        self._cached_edge_index = torch.cat(all_edges, dim=1).to(device)
        self._cached_edge_type = torch.cat(all_types, dim=0).to(device)

        return self._cached_edge_index, self._cached_edge_type

    def forward(self, hetero_data, student_idx, step_idx, kc_idx, device):
        """
        Forward pass: Simple RGCN (like M2)
        Mastery is in student features
        """
        # === 1. Encode Nodes ===
        h_s = self.student_encoder(hetero_data['student'].x.to(device))
        h_q = self.question_encoder(hetero_data['question'].x.to(device))
        h_t = self.step_encoder(hetero_data['step'].x.to(device))
        h_c = self.kc_encoder(hetero_data['kc'].x.to(device))

        # === 2. Unified RGCN ===
        H = torch.cat([h_s, h_q, h_t, h_c], dim=0)
        edge_index, edge_type = self._build_edge_index(hetero_data, device)

        for layer, norm in zip(self.gnn_layers, self.gnn_norms):
            H_new = layer(H, edge_index, edge_type)
            H_new = norm(H_new)
            H_new = F.relu(H_new)
            H_new = self.gnn_dropout(H_new)
            H = H + H_new  # Residual

        # === 3. Extract embeddings ===
        h_s = H[:self.num_students]
        h_t_start = self.question_offset + self.num_questions
        h_t_end = h_t_start + self.num_steps
        h_t = H[h_t_start:h_t_end]
        h_c = H[self.kc_offset:]

        # === 4. Batch select ===
        h_s_batch = h_s[student_idx]
        h_t_batch = h_t[step_idx]
        h_c_batch = h_c[kc_idx]

        # === 5. Prediction ===
        combined = torch.cat([h_s_batch, h_t_batch, h_c_batch], dim=-1)
        logits = self.prediction_head(combined).squeeze(-1)

        return logits

    def update_mastery_online(self, student_idx, kc_idx, y_true, lambda_ema):
        """Update mastery after observation"""
        with torch.no_grad():
            old_mastery = self.mastery_matrix[student_idx, kc_idx]
            new_mastery = (1 - lambda_ema) * old_mastery + lambda_ema * y_true
            self.mastery_matrix[student_idx, kc_idx] = new_mastery


# ============================================================
# FUNCTION 5: Dataset (UNCHANGED)
# ============================================================

class KTDatasetPure(Dataset):
    """Dataset for Knowledge Tracing"""
    def __init__(self, df, stu2idx, t2idx, c2idx,
                 unk_student_idx, unk_step_idx, unk_kc_idx):
        self.df = df.reset_index(drop=True)
        self.stu2idx = stu2idx
        self.t2idx = t2idx
        self.c2idx = c2idx
        self.unk_student_idx = unk_student_idx
        self.unk_step_idx = unk_step_idx
        self.unk_kc_idx = unk_kc_idx

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        student_idx = self.stu2idx.get(row['student_id'], self.unk_student_idx)
        step_idx = self.t2idx.get(row['step_id'], self.unk_step_idx)
        kc_idx = self.c2idx.get(row['kc_id'], self.unk_kc_idx)
        label = torch.tensor(row['correct'], dtype=torch.float32)
        return student_idx, step_idx, kc_idx, label


# ============================================================
# FUNCTION 6: Training & Evaluation
# ============================================================

class EarlyStopping:
    """Early stopping with model state saving"""
    def __init__(self, patience=10, min_delta=0.001):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.best_state = None

    def __call__(self, score, model):
        if self.best_score is None or score > self.best_score + self.min_delta:
            self.best_score = score
            self.best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

    def load_best(self, model):
        if self.best_state:
            model.load_state_dict(self.best_state)


def train_epoch(model, loader, optimizer, criterion, hetero_data, device,
                config, grad_clip=1.0):
    """Train for one epoch with mastery update"""
    model.train()
    total_loss = 0
    all_preds, all_labels = [], []

    for batch in loader:
        student_idx, step_idx, kc_idx, labels = batch
        student_idx = student_idx.to(device)
        step_idx = step_idx.to(device)
        kc_idx = kc_idx.to(device)
        labels = labels.to(device)

        # 1. FORWARD (with current mastery in features - history up to t-1)
        logits = model(hetero_data, student_idx, step_idx, kc_idx, device)
        loss = criterion(logits, labels)

        # 2. BACKWARD (update model weights)
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        optimizer.step()

        # 3. UPDATE MASTERY (AFTER backward, no gradient)
        #    ‚ö†Ô∏è CRITICAL: This order prevents leakage
        if hasattr(model, 'update_mastery_online'):
            model.update_mastery_online(
                student_idx, kc_idx, labels,
                lambda_ema=config.LAMBDA_EMA
            )

        # Metrics
        total_loss += loss.item() * len(labels)
        all_preds.extend(torch.sigmoid(logits).detach().cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

    avg_loss = total_loss / len(loader.dataset)
    auc = roc_auc_score(all_labels, all_preds) if len(set(all_labels)) > 1 else 0.5
    acc = accuracy_score(all_labels, np.array(all_preds) > 0.5)

    return avg_loss, auc, acc


@torch.no_grad()
def evaluate(model, loader, criterion, hetero_data, device):
    """Evaluate without mastery update"""
    model.eval()
    total_loss = 0
    all_preds, all_labels = [], []

    for batch in loader:
        student_idx, step_idx, kc_idx, labels = batch
        student_idx = student_idx.to(device)
        step_idx = step_idx.to(device)
        kc_idx = kc_idx.to(device)
        labels = labels.to(device)

        logits = model(hetero_data, student_idx, step_idx, kc_idx, device)
        loss = criterion(logits, labels)

        total_loss += loss.item() * len(labels)
        all_preds.extend(torch.sigmoid(logits).cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

    avg_loss = total_loss / len(loader.dataset)
    auc = roc_auc_score(all_labels, all_preds) if len(set(all_labels)) > 1 else 0.5
    acc = accuracy_score(all_labels, np.array(all_preds) > 0.5)

    return avg_loss, auc, acc


# ============================================================
# üÜï FUNCTION 7: Full Single-Fold Pipeline (MODIFIED for M3a)
# ============================================================

def run_single_fold(df_train_fold, df_val_fold, config, fold_num=None, verbose=True):
    """Run complete training pipeline for one fold - M3a version"""
    device = config.DEVICE
    prefix = f"[Fold {fold_num}] " if fold_num is not None else ""

    # --- Step 1: Entity mappings ---
    mappings, entity_counts, unk_indices = build_entity_mappings(df_train_fold)
    if verbose:
        print(f"{prefix}Entities: S={entity_counts['num_students']}, "
              f"Q={entity_counts['num_questions']}, "
              f"T={entity_counts['num_steps']}, "
              f"C={entity_counts['num_kcs']}")

    # --- Step 2: Build graph ---
    hetero_data, total_edges = build_graph(df_train_fold, mappings, entity_counts, unk_indices)
    if verbose:
        print(f"{prefix}Graph edges: {total_edges:,}")

    # --- Step 3: Initialize mastery BEFORE computing features ---
    mastery_matrix_cpu = torch.full(
        (entity_counts['num_students'], entity_counts['num_kcs']),
        config.MASTERY_INIT,
        dtype=torch.float32
    )

    # --- Step 4: Node features WITH mastery ---
    feat_tensors = compute_node_features_with_mastery(
        df_train_fold, mappings, entity_counts, unk_indices,
        mastery_matrix_cpu  # ‚Üê NEW
    )
    hetero_data['student'].x = feat_tensors['student']  # [num_students, 5+num_kcs]
    hetero_data['question'].x = feat_tensors['question']
    hetero_data['step'].x = feat_tensors['step']
    hetero_data['kc'].x = feat_tensors['kc']

    # --- Step 5: DataLoaders ---
    train_dataset = KTDatasetPure(
        df_train_fold, mappings['stu2idx'], mappings['t2idx'], mappings['c2idx'],
        unk_indices['student'], unk_indices['step'], unk_indices['kc']
    )
    val_dataset = KTDatasetPure(
        df_val_fold, mappings['stu2idx'], mappings['t2idx'], mappings['c2idx'],
        unk_indices['student'], unk_indices['step'], unk_indices['kc']
    )

    train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True,
                              pin_memory=True, num_workers=0)
    val_loader = DataLoader(val_dataset, batch_size=512, shuffle=False,
                            pin_memory=True, num_workers=0)

    if verbose:
        print(f"{prefix}Train: {len(train_loader.dataset):,} samples, "
              f"Val: {len(val_loader.dataset):,} samples")

    # --- Step 6: Model ---
    model = RGCNWithMasteryFeatures(  # ‚Üê NEW MODEL
        num_students=entity_counts['num_students'],
        num_questions=entity_counts['num_questions'],
        num_steps=entity_counts['num_steps'],
        num_kcs=entity_counts['num_kcs'],
        feature_dim_base=NUM_FEATURES,
        embed_dim=config.EMBED_DIM,
        hidden_dim=config.HIDDEN_DIM,
        num_gnn_layers=config.NUM_GNN_LAYERS,
        dropout=config.DROPOUT,
        mastery_init=config.MASTERY_INIT
    ).to(device)

    model.init_mastery_matrix(device)

    total_params = sum(p.numel() for p in model.parameters())
    if verbose:
        print(f"{prefix}Model params: {total_params:,}")

    # --- Step 7: Training setup ---
    n_correct = df_train_fold['correct'].sum()
    n_incorrect = len(df_train_fold) - n_correct
    pos_weight = torch.tensor([n_incorrect / n_correct], dtype=torch.float32).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=config.LEARNING_RATE,
                                   weight_decay=config.WEIGHT_DECAY)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='max', factor=0.5, patience=5
    )
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    early_stopping = EarlyStopping(patience=config.PATIENCE)

    # --- Step 8: Training loop ---
    history = defaultdict(list)
    best_val_auc = 0

    if verbose:
        print(f"{prefix}Training (pos_weight={pos_weight.item():.4f})...")
        print("-" * 70)

    for epoch in range(config.EPOCHS):
        epoch_start = time.time()

        # üÜï RECALCULATE student features with UPDATED mastery
        if epoch > 0:  # Skip first epoch (already computed)
            mastery_cpu = model.mastery_matrix.cpu()
            feat_tensors = compute_node_features_with_mastery(
                df_train_fold, mappings, entity_counts, unk_indices,
                mastery_cpu  # ‚Üê Mastery from previous epoch
            )
            hetero_data['student'].x = feat_tensors['student']

        train_loss, train_auc, train_acc = train_epoch(
            model, train_loader, optimizer, criterion, hetero_data, device, config, config.GRAD_CLIP
        )
        val_loss, val_auc, val_acc = evaluate(
            model, val_loader, criterion, hetero_data, device
        )

        epoch_time = time.time() - epoch_start

        history['train_loss'].append(train_loss)
        history['train_auc'].append(train_auc)
        history['val_loss'].append(val_loss)
        history['val_auc'].append(val_auc)
        history['val_acc'].append(val_acc)

        marker = " ‚òÖ" if val_auc > best_val_auc else ""
        if val_auc > best_val_auc:
            best_val_auc = val_auc

        if verbose:
            print(f"{prefix}Epoch {epoch+1:3d}/{config.EPOCHS} | "
                  f"Train AUC: {train_auc:.4f} | Val AUC: {val_auc:.4f} | "
                  f"Val Acc: {val_acc:.4f} | Time: {epoch_time:.1f}s{marker}")

        scheduler.step(val_auc)
        early_stopping(val_auc, model)

        if early_stopping.early_stop:
            if verbose:
                print(f"{prefix}Early stopping at epoch {epoch+1}")
            break

    early_stopping.load_best(model)

    final_val_loss, final_val_auc, final_val_acc = evaluate(
        model, val_loader, criterion, hetero_data, device
    )

    stopped_epoch = len(history['train_loss'])

    if verbose:
        print("-" * 70)
        print(f"{prefix}Best Val AUC: {final_val_auc:.4f} | "
              f"Val Acc: {final_val_acc:.4f} | Stopped at epoch: {stopped_epoch}")

    return {
        'val_auc': final_val_auc,
        'val_acc': final_val_acc,
        'val_loss': final_val_loss,
        'train_auc': history['train_auc'][-1],
        'stopped_epoch': stopped_epoch,
        'total_params': total_params,
        'history': dict(history),
        'model_state': early_stopping.best_state,
        'hetero_data': hetero_data,
        'mappings': mappings,
        'entity_counts': entity_counts,
        'unk_indices': unk_indices,
    }


# ============================================================
# Confirmation
# ============================================================
print("=" * 60)
print("‚úì ALL PIPELINE FUNCTIONS DEFINED (M3a VERSION)")
print("=" * 60)
print("  - build_entity_mappings()")
print("  - build_graph()")
print("  - compute_node_features()")
print("  - compute_node_features_with_mastery() [NEW]")
print("  - RGCNWithMasteryFeatures (Mastery as Features)")
print("  - KTDatasetPure")
print("  - train_epoch() / evaluate()")
print("  - run_single_fold() [MODIFIED for M3a]")
print("=" * 60)
print("Ready for 5-Fold Cross-Validation! üöÄ")
print("=" * 60)

‚úì ALL PIPELINE FUNCTIONS DEFINED (M3a VERSION)
  - build_entity_mappings()
  - build_graph()
  - compute_node_features()
  - compute_node_features_with_mastery() [NEW]
  - RGCNWithMasteryFeatures (Mastery as Features)
  - KTDatasetPure
  - train_epoch() / evaluate()
  - run_single_fold() [MODIFIED for M3a]
Ready for 5-Fold Cross-Validation! üöÄ


In [10]:
# =============================================================================
# Cell 6: Run 5-Fold Student-Level Cross-Validation
# =============================================================================

print("=" * 60)
print("5-FOLD STUDENT-LEVEL CROSS-VALIDATION")
print("=" * 60)

print(f"""
Protocol:
  - {len(non_test_students)} non-test students split into 5 folds
  - Each fold: full pipeline rebuild (mappings, graph, features, model)
  - All design decisions FROZEN before CV
  - TEST set ({len(test_students)} students) completely untouched

Frozen hyperparameters:
  EMBED_DIM={config.EMBED_DIM}, HIDDEN_DIM={config.HIDDEN_DIM}
  NUM_GNN_LAYERS={config.NUM_GNN_LAYERS}, DROPOUT={config.DROPOUT}
  LR={config.LEARNING_RATE}, WEIGHT_DECAY={config.WEIGHT_DECAY}
  BATCH_SIZE=512, PATIENCE={config.PATIENCE}
""")

cv_results = []
cv_start = time.time()

for fold_idx in range(5):
    fold_start = time.time()

    print(f"\n{'='*60}")
    print(f"FOLD {fold_idx + 1} / 5")
    print(f"{'='*60}")

    # Get this fold's students
    train_students_fold = fold_assignments[fold_idx]['train_students']
    val_students_fold = fold_assignments[fold_idx]['val_students']

    # Create dataframes for this fold
    df_train_fold = df_non_test[df_non_test['student_id'].isin(train_students_fold)].copy()
    df_val_fold = df_non_test[df_non_test['student_id'].isin(val_students_fold)].copy()

    print(f"Train students: {len(train_students_fold)}, "
          f"Val students: {len(val_students_fold)}")
    print(f"Train interactions: {len(df_train_fold):,}, "
          f"Val interactions: {len(df_val_fold):,}")

    # Run full pipeline for this fold
    result = run_single_fold(
        df_train_fold, df_val_fold, config,
        fold_num=fold_idx + 1, verbose=True
    )

    fold_time = time.time() - fold_start
    result['fold_time'] = fold_time
    cv_results.append(result)

    print(f"\nFold {fold_idx + 1} completed in {fold_time:.1f}s")

total_cv_time = time.time() - cv_start

# ============================================================
# CV Summary
# ============================================================
print("\n" + "=" * 60)
print("5-FOLD CROSS-VALIDATION RESULTS")
print("=" * 60)

val_aucs = [r['val_auc'] for r in cv_results]
val_accs = [r['val_acc'] for r in cv_results]
stopped_epochs = [r['stopped_epoch'] for r in cv_results]

print(f"\n{'Fold':<8} {'Val AUC':<12} {'Val Acc':<12} {'Epochs':<10}")
print("-" * 42)
for i, r in enumerate(cv_results):
    print(f"Fold {i+1:<3} {r['val_auc']:<12.4f} {r['val_acc']:<12.4f} {r['stopped_epoch']:<10}")

print("-" * 42)
print(f"{'Mean':<8} {np.mean(val_aucs):<12.4f} {np.mean(val_accs):<12.4f} {np.mean(stopped_epochs):<10.1f}")
print(f"{'Std':<8} {np.std(val_aucs):<12.4f} {np.std(val_accs):<12.4f} {np.std(stopped_epochs):<10.1f}")
print(f"{'Min':<8} {np.min(val_aucs):<12.4f} {np.min(val_accs):<12.4f} {np.min(stopped_epochs):<10}")
print(f"{'Max':<8} {np.max(val_aucs):<12.4f} {np.max(val_accs):<12.4f} {np.max(stopped_epochs):<10}")

print(f"\n‚ïî‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó")
print(f"‚ïë  CV Val AUC: {np.mean(val_aucs):.4f} ¬± {np.std(val_aucs):.4f}              ‚ïë")
print(f"‚ïë  CV Val Acc: {np.mean(val_accs):.4f} ¬± {np.std(val_accs):.4f}              ‚ïë")
print(f"‚ïö‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïù")

print(f"\nTotal CV time: {total_cv_time:.1f}s ({total_cv_time/60:.1f} min)")

# Store average epochs for final training
avg_epochs_cv = int(np.mean(stopped_epochs))
print(f"\nAverage stopping epoch: {avg_epochs_cv} (will use for final training)")

5-FOLD STUDENT-LEVEL CROSS-VALIDATION

Protocol:
  - 487 non-test students split into 5 folds
  - Each fold: full pipeline rebuild (mappings, graph, features, model)
  - All design decisions FROZEN before CV
  - TEST set (87 students) completely untouched

Frozen hyperparameters:
  EMBED_DIM=32, HIDDEN_DIM=64
  NUM_GNN_LAYERS=2, DROPOUT=0.2
  LR=0.001, WEIGHT_DECAY=0.01
  BATCH_SIZE=512, PATIENCE=10


FOLD 1 / 5
Train students: 389, Val students: 98
Train interactions: 524,961, Val interactions: 166,472
[Fold 1] Entities: S=390, Q=1064, T=146327, C=397
[Fold 1] Graph edges: 662,288
[Fold 1] Train: 524,961 samples, Val: 166,472 samples
  ‚úì Mastery matrix initialized: [390, 397] with value=0.5
[Fold 1] Model params: 33,017
[Fold 1] Training (pos_weight=0.3041)...
----------------------------------------------------------------------
[Fold 1] Epoch   1/100 | Train AUC: 0.9027 | Val AUC: 0.7337 | Val Acc: 0.7771 | Time: 105.1s ‚òÖ
[Fold 1] Epoch   2/100 | Train AUC: 0.9127 | Val AUC: 0.7

In [11]:
# =============================================================================
# Cell 7: Final TEST Set Evaluation
# =============================================================================

print("=" * 60)
print("FINAL TEST EVALUATION")
print("=" * 60)

print(f"""
Protocol:
  - Train on ALL {len(non_test_students)} non-test students (no validation split)
  - Train for {avg_epochs_cv} epochs (average from CV, no early stopping)
  - Evaluate on held-out TEST set ({len(test_students)} students)
  - This number appears in the paper as TEST performance
""")

# --- Train on all non-test data ---
df_train_final = df_non_test.copy()

print("[1/5] Building entity mappings from all non-test data...")
mappings_final, entity_counts_final, unk_indices_final = build_entity_mappings(df_train_final)
print(f"  S={entity_counts_final['num_students']}, "
      f"Q={entity_counts_final['num_questions']}, "
      f"T={entity_counts_final['num_steps']}, "
      f"C={entity_counts_final['num_kcs']}")

print("[2/5] Building graph...")
hetero_data_final, total_edges_final = build_graph(
    df_train_final, mappings_final, entity_counts_final, unk_indices_final
)
print(f"  Edges: {total_edges_final:,}")

# üîß CHANGEMENT 1: Initialize mastery BEFORE computing features
print("[3/5] Initializing mastery matrix...")
mastery_matrix_cpu_final = torch.full(
    (entity_counts_final['num_students'], entity_counts_final['num_kcs']),
    config.MASTERY_INIT,
    dtype=torch.float32
)

# üîß CHANGEMENT 2: Computing node features WITH mastery
print("[4/5] Computing node features with mastery...")
feat_tensors_final = compute_node_features_with_mastery(
    df_train_final, mappings_final, entity_counts_final, unk_indices_final,
    mastery_matrix_cpu_final  # ‚Üê NEW
)
hetero_data_final['student'].x = feat_tensors_final['student']  # [num_students, 5+num_kcs]
hetero_data_final['question'].x = feat_tensors_final['question']
hetero_data_final['step'].x = feat_tensors_final['step']
hetero_data_final['kc'].x = feat_tensors_final['kc']

print("[5/5] Creating dataloaders...")
train_dataset_final = KTDatasetPure(
    df_train_final,
    mappings_final['stu2idx'], mappings_final['t2idx'], mappings_final['c2idx'],
    unk_indices_final['student'], unk_indices_final['step'], unk_indices_final['kc']
)
test_dataset_final = KTDatasetPure(
    df_test_final,
    mappings_final['stu2idx'], mappings_final['t2idx'], mappings_final['c2idx'],
    unk_indices_final['student'], unk_indices_final['step'], unk_indices_final['kc']
)

train_loader_final = DataLoader(train_dataset_final, batch_size=512, shuffle=True,
                                 pin_memory=True, num_workers=0)
test_loader_final = DataLoader(test_dataset_final, batch_size=512, shuffle=False,
                                pin_memory=True, num_workers=0)

print(f"  Train: {len(train_dataset_final):,} samples")
print(f"  Test:  {len(test_dataset_final):,} samples")

print("[6/6] Training final model...")
device = config.DEVICE

# üîß CHANGEMENT 3: GraphKTMinimal ‚Üí RGCNWithMasteryFeatures
model_final = RGCNWithMasteryFeatures(
    num_students=entity_counts_final['num_students'],
    num_questions=entity_counts_final['num_questions'],
    num_steps=entity_counts_final['num_steps'],
    num_kcs=entity_counts_final['num_kcs'],
    feature_dim_base=NUM_FEATURES,
    embed_dim=config.EMBED_DIM,
    hidden_dim=config.HIDDEN_DIM,
    num_gnn_layers=config.NUM_GNN_LAYERS,
    dropout=config.DROPOUT,
    mastery_init=config.MASTERY_INIT
).to(device)

# Initialize mastery matrix
model_final.init_mastery_matrix(device)

# Class weights from full training set
n_correct = df_train_final['correct'].sum()
n_incorrect = len(df_train_final) - n_correct
pos_weight_final = torch.tensor([n_incorrect / n_correct], dtype=torch.float32).to(device)

optimizer_final = torch.optim.AdamW(model_final.parameters(), lr=config.LEARNING_RATE,
                                      weight_decay=config.WEIGHT_DECAY)
scheduler_final = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer_final, mode='min', factor=0.5, patience=5
)
criterion_final = nn.BCEWithLogitsLoss(pos_weight=pos_weight_final)

# Train for fixed number of epochs (from CV average)
print(f"\nTraining for {avg_epochs_cv} epochs (CV average)...")
print("-" * 70)

for epoch in range(avg_epochs_cv):
    epoch_start = time.time()

    # üîß CHANGEMENT 4: RECALCULATE student features with UPDATED mastery
    if epoch > 0:  # Skip first epoch (already computed)
        mastery_cpu = model_final.mastery_matrix.cpu()
        feat_tensors_final = compute_node_features_with_mastery(
            df_train_final, mappings_final, entity_counts_final, unk_indices_final,
            mastery_cpu  # ‚Üê Mastery from previous epoch
        )
        hetero_data_final['student'].x = feat_tensors_final['student']

    train_loss, train_auc, train_acc = train_epoch(
        model_final, train_loader_final, optimizer_final, criterion_final,
        hetero_data_final, device, config, config.GRAD_CLIP
    )

    epoch_time = time.time() - epoch_start

    if (epoch + 1) % 5 == 0 or epoch == 0 or (epoch + 1) == avg_epochs_cv:
        print(f"Epoch {epoch+1:3d}/{avg_epochs_cv} | "
              f"Train Loss: {train_loss:.4f} | Train AUC: {train_auc:.4f} | "
              f"Train Acc: {train_acc:.4f} | Time: {epoch_time:.1f}s")

    scheduler_final.step(train_loss)

print("-" * 70)

# --- Final TEST evaluation ---
print("\nEvaluating on TEST set...")
test_loss, test_auc, test_acc = evaluate(
    model_final, test_loader_final, criterion_final,
    hetero_data_final, device
)

print(f"\n{'='*60}")
print(f"FINAL RESULTS")
print(f"{'='*60}")
print(f"""
‚ïî‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó
‚ïë              GraphKT M3a (Mastery as Features)               ‚ïë
‚ïë                    Algebra 2005-2006 Dataset                 ‚ïë
‚ï†‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ï£
‚ïë                                                              ‚ïë
‚ïë  5-Fold CV Validation:                                       ‚ïë
‚ïë    AUC:      {np.mean(val_aucs):.4f} ¬± {np.std(val_aucs):.4f}                              ‚ïë
‚ïë    Accuracy: {np.mean(val_accs):.4f} ¬± {np.std(val_accs):.4f}                              ‚ïë
‚ïë                                                              ‚ïë
‚ïë  Test Set (held-out, {len(test_students)} students):                          ‚ïë
‚ïë    AUC:      {test_auc:.4f}                                        ‚ïë
‚ïë    Accuracy: {test_acc:.4f}                                        ‚ïë
‚ïë                                                              ‚ïë
‚ïë  Model: {sum(p.numel() for p in model_final.parameters()):,} parameters                              ‚ïë
‚ïë  Training: {avg_epochs_cv} epochs (CV average)                           ‚ïë
‚ïë  Split: Student-level (Split B), no leakage                  ‚ïë
‚ïë                                                              ‚ïë
‚ïö‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïù
""")

FINAL TEST EVALUATION

Protocol:
  - Train on ALL 487 non-test students (no validation split)
  - Train for 15 epochs (average from CV, no early stopping)
  - Evaluate on held-out TEST set (87 students)
  - This number appears in the paper as TEST performance

[1/5] Building entity mappings from all non-test data...
  S=488, Q=1081, T=184307, C=428
[2/5] Building graph...
  Edges: 839,032
[3/5] Initializing mastery matrix...
[4/5] Computing node features with mastery...
[5/5] Creating dataloaders...
  Train: 691,433 samples
  Test:  118,261 samples
[6/6] Training final model...
  ‚úì Mastery matrix initialized: [488, 428] with value=0.5

Training for 15 epochs (CV average)...
----------------------------------------------------------------------
Epoch   1/15 | Train Loss: 0.1794 | Train AUC: 0.9013 | Train Acc: 0.8068 | Time: 112.5s
Epoch   5/15 | Train Loss: 0.1665 | Train AUC: 0.9135 | Train Acc: 0.8149 | Time: 171.7s
Epoch  10/15 | Train Loss: 0.1651 | Train AUC: 0.9149 | Train Acc:

In [12]:
# =============================================================================
# Cell 8: Paper-Ready Results Summary & Per-Fold Analysis
# =============================================================================

print("=" * 60)
print("DETAILED RESULTS FOR PAPER")
print("=" * 60)

# Per-fold table
print("\nTable 1: Per-Fold Cross-Validation Results")
print("-" * 55)
print(f"{'Fold':<6} {'Students':<10} {'Interactions':<14} {'Val AUC':<10} {'Val Acc':<10} {'Epochs':<8}")
print("-" * 55)
for i, r in enumerate(cv_results):
    n_val_stu = len(fold_assignments[i]['val_students'])
    df_val_f = df_non_test[df_non_test['student_id'].isin(fold_assignments[i]['val_students'])]
    n_val_int = len(df_val_f)
    print(f"{i+1:<6} {n_val_stu:<10} {n_val_int:<14,} {r['val_auc']:<10.4f} {r['val_acc']:<10.4f} {r['stopped_epoch']:<8}")

print("-" * 55)
print(f"{'Mean':<6} {'':10} {'':14} {np.mean(val_aucs):<10.4f} {np.mean(val_accs):<10.4f} {np.mean(stopped_epochs):<8.1f}")
print(f"{'¬±Std':<6} {'':10} {'':14} {np.std(val_aucs):<10.4f} {np.std(val_accs):<10.4f} {np.std(stopped_epochs):<8.1f}")

# Summary table for paper
print(f"\n\nTable 2: Model Comparison (for paper)")
print("-" * 65)
print(f"{'Model':<20} {'Val AUC':<16} {'Test AUC':<12} {'Params':<10}")
print("-" * 65)
# üîß CHANGEMENT: "GraphKT M3" ‚Üí "GraphKT M3a"
print(f"{'GraphKT M3a':<20} {np.mean(val_aucs):.4f} ¬± {np.std(val_aucs):.4f}   {test_auc:<12.4f} {sum(p.numel() for p in model_final.parameters()):,}")

# Consistency check
auc_range = np.max(val_aucs) - np.min(val_aucs)
print(f"\n\nConsistency Analysis:")
print(f"  AUC range across folds: {auc_range:.4f}")
if auc_range < 0.03:
    print(f"  ‚úì Highly consistent (range < 0.03)")
elif auc_range < 0.05:
    print(f"  ~ Moderately consistent (range < 0.05)")
else:
    print(f"  ‚ö† High variance across folds - investigate fold differences")

print(f"\n  Coefficient of variation: {np.std(val_aucs)/np.mean(val_aucs)*100:.2f}%")

DETAILED RESULTS FOR PAPER

Table 1: Per-Fold Cross-Validation Results
-------------------------------------------------------
Fold   Students   Interactions   Val AUC    Val Acc    Epochs  
-------------------------------------------------------
1      98         166,472        0.7337     0.7771     11      
2      98         112,205        0.7368     0.7178     13      
3      97         153,605        0.7362     0.7188     24      
4      97         133,779        0.7292     0.7432     17      
5      97         125,372        0.7444     0.7707     12      
-------------------------------------------------------
Mean                             0.7361     0.7455     15.4    
¬±Std                             0.0049     0.0250     4.8     


Table 2: Model Comparison (for paper)
-----------------------------------------------------------------
Model                Val AUC          Test AUC     Params    
-----------------------------------------------------------------
GraphKT M3a   