In [1]:
!pip install torch_geometric



In [2]:
# =============================================================================
# Cell 1: Imports and Configuration
# =============================================================================

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import HeteroData
from torch_geometric.nn import GCNConv, RGCNConv
from torch_geometric.nn import MessagePassing  # ‚Üê Pour MasteryMessagePassing
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

# Configuration
class Config:
    # Data
    SEED = 42
    TEST_STUDENT_RATIO = 0.15      # 15% students for TEST (Split B)
    VAL_STUDENT_RATIO = 0.15       # 15% of TRAIN students for VAL

    # Model Architecture
    FEATURE_DIM = 5                # Base features for Q, T, C (students: 5 + num_kcs)
    EMBED_DIM = 32
    HIDDEN_DIM = 64
    NUM_GNN_LAYERS = 2
    DROPOUT = 0.2
    NUM_BASES = 2                  # For RGCN basis decomposition

    # Training
    BATCH_SIZE = 512               # Optimis√© pour M3 Final
    LEARNING_RATE = 1e-3
    WEIGHT_DECAY = 0.01
    EPOCHS = 100
    PATIENCE = 10
    GRAD_CLIP = 1.0

    # Mastery (M3 Final specific)
    MASTERY_INIT = 0.5             # Valeur init de mastery [0.5 ou 'global_mean']
    LAMBDA_EMA = 0.1               # Learning rate mastery update (0.05, 0.1, 0.2)
    MASTERY_GATING = 'identity'    # Gating function ['identity' ou 'sigmoid']
    GATING_ALPHA = 2.0             # Si sigmoid, alpha parameter

    # Device
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

config = Config()

# Set seeds for reproducibility
np.random.seed(config.SEED)
torch.manual_seed(config.SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(config.SEED)

print(f"Device: {config.DEVICE}")
print(f"PyTorch version: {torch.__version__}")
print(f"Configuration loaded (M3 Final - Features + Weights + Dual Propagation)")

Device: cuda
PyTorch version: 2.9.0+cu128
Configuration loaded (M3 Final - Features + Weights + Dual Propagation)


In [3]:
# =============================================================================
# Cell 2: Load and Explore Dataset
# =============================================================================

# Load the Algebra 2005-2006 dataset
DATA_PATH = "algebra_2005_2006_train.txt"

# Load with tab separator (standard format for this dataset)
df_raw = pd.read_csv(DATA_PATH, sep='\t', low_memory=False)

print("=" * 60)
print("DATASET OVERVIEW")
print("=" * 60)
print(f"Shape: {df_raw.shape}")
print(f"\nColumns ({len(df_raw.columns)}):")
for i, col in enumerate(df_raw.columns):
    print(f"  {i+1:2d}. {col}")

print("\n" + "=" * 60)
print("FIRST 3 ROWS")
print("=" * 60)
display(df_raw.head(3))

print("\n" + "=" * 60)
print("DATA TYPES")
print("=" * 60)
print(df_raw.dtypes)

print("\n" + "=" * 60)
print("MISSING VALUES")
print("=" * 60)
missing = df_raw.isnull().sum()
missing_pct = (missing / len(df_raw) * 100).round(2)
missing_df = pd.DataFrame({'Missing': missing, 'Percent': missing_pct})
print(missing_df[missing_df['Missing'] > 0])

print("\n" + "=" * 60)
print("KEY STATISTICS")
print("=" * 60)
print(f"Total interactions: {len(df_raw):,}")
print(f"Unique students: {df_raw['Anon Student Id'].nunique():,}")
print(f"Unique problems (questions): {df_raw['Problem Name'].nunique():,}")
print(f"Unique steps: {df_raw[['Problem Name', 'Step Name']].drop_duplicates().shape[0]:,}")
print(f"Unique KC(Default): {df_raw['KC(Default)'].nunique():,}")
print(f"KC(Default) missing: {df_raw['KC(Default)'].isnull().sum():,} ({df_raw['KC(Default)'].isnull().mean()*100:.2f}%)")

print("\n" + "=" * 60)
print("TARGET DISTRIBUTION (Correct First Attempt)")
print("=" * 60)
print(df_raw['Correct First Attempt'].value_counts(normalize=True).round(4))

DATASET OVERVIEW
Shape: (809694, 19)

Columns (19):
   1. Row
   2. Anon Student Id
   3. Problem Hierarchy
   4. Problem Name
   5. Problem View
   6. Step Name
   7. Step Start Time
   8. First Transaction Time
   9. Correct Transaction Time
  10. Step End Time
  11. Step Duration (sec)
  12. Correct Step Duration (sec)
  13. Error Step Duration (sec)
  14. Correct First Attempt
  15. Incorrects
  16. Hints
  17. Corrects
  18. KC(Default)
  19. Opportunity(Default)

FIRST 3 ROWS


Unnamed: 0,Row,Anon Student Id,Problem Hierarchy,Problem Name,Problem View,Step Name,Step Start Time,First Transaction Time,Correct Transaction Time,Step End Time,Step Duration (sec),Correct Step Duration (sec),Error Step Duration (sec),Correct First Attempt,Incorrects,Hints,Corrects,KC(Default),Opportunity(Default)
0,1,0BrbPbwCMz,"Unit ES_04, Section ES_04-1",EG4-FIXED,1,3(x+2) = 15,2005-09-09 12:24:35.0,2005-09-09 12:24:49.0,2005-09-09 12:25:15.0,2005-09-09 12:25:15.0,40.0,,40.0,0,2,3,1,[SkillRule: Eliminate Parens; {CLT nested; CLT...,1
1,2,0BrbPbwCMz,"Unit ES_04, Section ES_04-1",EG4-FIXED,1,x+2 = 5,2005-09-09 12:25:15.0,2005-09-09 12:25:31.0,2005-09-09 12:25:31.0,2005-09-09 12:25:31.0,16.0,16.0,,1,0,0,1,"[SkillRule: Remove constant; {ax+b=c, positive...",1~~1
2,3,0BrbPbwCMz,"Unit ES_04, Section ES_04-1",EG40,1,2-8y = -4,2005-09-09 12:25:36.0,2005-09-09 12:25:43.0,2005-09-09 12:26:12.0,2005-09-09 12:26:12.0,36.0,,36.0,0,2,3,1,"[SkillRule: Remove constant; {ax+b=c, positive...",2



DATA TYPES
Row                              int64
Anon Student Id                 object
Problem Hierarchy               object
Problem Name                    object
Problem View                     int64
Step Name                       object
Step Start Time                 object
First Transaction Time          object
Correct Transaction Time        object
Step End Time                   object
Step Duration (sec)            float64
Correct Step Duration (sec)    float64
Error Step Duration (sec)      float64
Correct First Attempt            int64
Incorrects                       int64
Hints                            int64
Corrects                         int64
KC(Default)                     object
Opportunity(Default)            object
dtype: object

MISSING VALUES
                             Missing  Percent
Step Start Time                  919     0.11
Correct Transaction Time       25851     3.19
Step Duration (sec)              919     0.11
Correct Step Duration (sec)   189

In [4]:
# =============================================================================
# Cell 3 (Corrected): Data Cleaning - Keep Missing KC as UNKNOWN
# =============================================================================

# Start with a copy
df = df_raw.copy()

print("=" * 60)
print("STEP 1: Drop rows with missing critical fields (NOT KC)")
print("=" * 60)

# Critical fields: student, step, target (NOT KC - we'll handle separately)
critical_cols = ['Anon Student Id', 'Problem Name', 'Step Name', 'Correct First Attempt']
before_drop = len(df)
df = df.dropna(subset=critical_cols)
after_drop = len(df)
print(f"Dropped {before_drop - after_drop:,} rows ({(before_drop - after_drop)/before_drop*100:.2f}%)")
print(f"Remaining: {after_drop:,} rows")

print("\n" + "=" * 60)
print("STEP 2: Handle missing KC(Default)")
print("=" * 60)

kc_missing_before = df['KC(Default)'].isnull().sum()
print(f"Missing KC(Default): {kc_missing_before:,} ({kc_missing_before/len(df)*100:.2f}%)")

# Fill missing KC with special token
df['KC(Default)'] = df['KC(Default)'].fillna('UNKNOWN_KC')
print(f"Filled with 'UNKNOWN_KC' token")

print("\n" + "=" * 60)
print("STEP 3: Create canonical identifiers")
print("=" * 60)

# Student ID
df['student_id'] = df['Anon Student Id'].astype(str).str.strip()

# Question ID (Problem Name)
df['question_id'] = df['Problem Name'].astype(str).str.strip()

# Step ID (Problem Name + Step Name)
df['step_id'] = df['Problem Name'].astype(str).str.strip() + "||" + df['Step Name'].astype(str).str.strip()

# KC ID (KC(Default) as composite string)
df['kc_id'] = df['KC(Default)'].astype(str).str.strip()

# Target
df['correct'] = df['Correct First Attempt'].astype(int)

print(f"Unique students: {df['student_id'].nunique():,}")
print(f"Unique questions: {df['question_id'].nunique():,}")
print(f"Unique steps: {df['step_id'].nunique():,}")
print(f"Unique KCs (including UNKNOWN): {df['kc_id'].nunique():,}")

print("\n" + "=" * 60)
print("STEP 4: Parse timestamps and create temporal ordering")
print("=" * 60)

# Parse First Transaction Time (primary timestamp)
df['timestamp'] = pd.to_datetime(df['First Transaction Time'], errors='coerce')

# Fallback to Step Start Time
mask_missing_ts = df['timestamp'].isnull()
df.loc[mask_missing_ts, 'timestamp'] = pd.to_datetime(
    df.loc[mask_missing_ts, 'Step Start Time'], errors='coerce'
)

# Check remaining missing timestamps
ts_missing = df['timestamp'].isnull().sum()
print(f"Rows with missing timestamp after fallback: {ts_missing}")

if ts_missing > 0:
    # Drop only these (should be minimal)
    df = df.dropna(subset=['timestamp'])
    print(f"Dropped {ts_missing} rows with no valid timestamp")

# Sort by student and timestamp
df = df.sort_values(['student_id', 'timestamp']).reset_index(drop=True)

# Create time index within each student
df['time_idx'] = df.groupby('student_id').cumcount()

print(f"Final dataset size: {len(df):,} rows")

print("\n" + "=" * 60)
print("STEP 5: Process behavioral features")
print("=" * 60)

# Fill missing durations with median
duration_col = 'Step Duration (sec)'
median_duration = df[duration_col].median()
df[duration_col] = df[duration_col].fillna(median_duration)

# Log transform duration
df['log_duration'] = np.log1p(df[duration_col].clip(lower=0))

# Clip extreme values
df['Incorrects'] = df['Incorrects'].clip(upper=10)
df['Hints'] = df['Hints'].clip(upper=10)

print(f"Median duration: {median_duration:.2f} sec")
print(f"Log duration range: [{df['log_duration'].min():.2f}, {df['log_duration'].max():.2f}]")

print("\n" + "=" * 60)
print("STEP 6: Final dataset summary")
print("=" * 60)

print(f"Total interactions: {len(df):,}")
print(f"Unique students: {df['student_id'].nunique():,}")
print(f"Unique questions: {df['question_id'].nunique():,}")
print(f"Unique steps: {df['step_id'].nunique():,}")
print(f"Unique KCs: {df['kc_id'].nunique():,}")
print(f"  - Real KCs: {(df['kc_id'] != 'UNKNOWN_KC').sum():,} interactions")
print(f"  - UNKNOWN_KC: {(df['kc_id'] == 'UNKNOWN_KC').sum():,} interactions")

print(f"\nTarget distribution:")
print(df['correct'].value_counts(normalize=True).round(4))

print("\n" + "=" * 60)
print("STEP 7: Verify temporal ordering")
print("=" * 60)

sample_student = df['student_id'].iloc[0]
sample_seq = df[df['student_id'] == sample_student][['time_idx', 'timestamp', 'question_id', 'kc_id', 'correct']].head(5)
print(f"Sample student '{sample_student}' first 5 interactions:")
display(sample_seq)

STEP 1: Drop rows with missing critical fields (NOT KC)
Dropped 0 rows (0.00%)
Remaining: 809,694 rows

STEP 2: Handle missing KC(Default)
Missing KC(Default): 202,669 (25.03%)
Filled with 'UNKNOWN_KC' token

STEP 3: Create canonical identifiers
Unique students: 574
Unique questions: 1,084
Unique steps: 210,710
Unique KCs (including UNKNOWN): 437

STEP 4: Parse timestamps and create temporal ordering
Rows with missing timestamp after fallback: 0
Final dataset size: 809,694 rows

STEP 5: Process behavioral features
Median duration: 11.00 sec
Log duration range: [0.00, 7.90]

STEP 6: Final dataset summary
Total interactions: 809,694
Unique students: 574
Unique questions: 1,084
Unique steps: 210,710
Unique KCs: 437
  - Real KCs: 607,025 interactions
  - UNKNOWN_KC: 202,669 interactions

Target distribution:
correct
1    0.7665
0    0.2335
Name: proportion, dtype: float64

STEP 7: Verify temporal ordering
Sample student '02ZjVTxC34' first 5 interactions:


Unnamed: 0,time_idx,timestamp,question_id,kc_id,correct
0,0,2005-09-06 13:00:23,LDEMO_WKST,UNKNOWN_KC,1
1,1,2005-09-06 13:00:44,LDEMO_WKST,Identifying units,1
2,2,2005-09-06 13:01:12,LDEMO_WKST,UNKNOWN_KC,1
3,3,2005-09-06 13:01:46,LDEMO_WKST,Identifying units,1
4,4,2005-09-06 13:02:27,LDEMO_WKST,"Entering a given~~Convert unit, multiplier",1


In [5]:
# =============================================================================
# Cell 4: Separate TEST Set + 5-Fold Student-Level CV Setup
# =============================================================================

import time
from sklearn.model_selection import KFold, train_test_split
from collections import defaultdict

print("=" * 60)
print("STEP 1: SEPARATE TEST STUDENTS (HELD OUT ENTIRELY)")
print("=" * 60)

all_students = df['student_id'].unique()
n_students = len(all_students)
print(f"Total students: {n_students}")

# Hold out 15% of students as TEST - NEVER touched during CV
non_test_students, test_students = train_test_split(
    all_students,
    test_size=config.TEST_STUDENT_RATIO,
    random_state=config.SEED
)

df_test_final = df[df['student_id'].isin(test_students)].copy()
df_non_test = df[df['student_id'].isin(non_test_students)].copy()

print(f"\nTEST set (held out):")
print(f"  Students: {len(test_students)} ({len(test_students)/n_students*100:.1f}%)")
print(f"  Interactions: {len(df_test_final):,}")

print(f"\nNon-test (enters K-Fold CV):")
print(f"  Students: {len(non_test_students)} ({len(non_test_students)/n_students*100:.1f}%)")
print(f"  Interactions: {len(df_non_test):,}")

print("\n" + "=" * 60)
print("STEP 2: DEFINE 5-FOLD STUDENT-LEVEL SPLITS")
print("=" * 60)

kf = KFold(n_splits=5, shuffle=True, random_state=config.SEED)

fold_assignments = {}
for fold_idx, (train_indices, val_indices) in enumerate(kf.split(non_test_students)):
    train_studs = non_test_students[train_indices]
    val_studs = non_test_students[val_indices]
    fold_assignments[fold_idx] = {
        'train_students': train_studs,
        'val_students': val_studs
    }
    print(f"\nFold {fold_idx+1}:")
    print(f"  TRAIN: {len(train_studs)} students")
    print(f"  VAL:   {len(val_studs)} students")

    # Verify no overlap
    overlap = set(train_studs) & set(val_studs)
    assert len(overlap) == 0, f"LEAK in fold {fold_idx+1}!"

    # Verify no test leakage
    test_leak = set(train_studs) & set(test_students)
    assert len(test_leak) == 0, f"TEST LEAK in fold {fold_idx+1}!"

print("\n‚úì All folds verified: no student overlap, no test leakage")
print(f"‚úì TEST set ({len(test_students)} students) completely isolated")

STEP 1: SEPARATE TEST STUDENTS (HELD OUT ENTIRELY)
Total students: 574

TEST set (held out):
  Students: 87 (15.2%)
  Interactions: 118,261

Non-test (enters K-Fold CV):
  Students: 487 (84.8%)
  Interactions: 691,433

STEP 2: DEFINE 5-FOLD STUDENT-LEVEL SPLITS

Fold 1:
  TRAIN: 389 students
  VAL:   98 students

Fold 2:
  TRAIN: 389 students
  VAL:   98 students

Fold 3:
  TRAIN: 390 students
  VAL:   97 students

Fold 4:
  TRAIN: 390 students
  VAL:   97 students

Fold 5:
  TRAIN: 390 students
  VAL:   97 students

‚úì All folds verified: no student overlap, no test leakage
‚úì TEST set (87 students) completely isolated


In [6]:
# =============================================================================
# Cell 5: Complete Pipeline Functions (M3 FINAL - Features + Weights + Dual)
# =============================================================================

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import HeteroData
from torch_geometric.nn import GCNConv, RGCNConv, MessagePassing
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import roc_auc_score, accuracy_score
from collections import defaultdict
import time

NUM_FEATURES = 5

# ============================================================
# FUNCTION 1: Build Entity Mappings (UNCHANGED)
# ============================================================
def build_entity_mappings(df_train):
    train_students = sorted(df_train['student_id'].unique())
    train_questions = sorted(df_train['question_id'].unique())
    train_steps = sorted(df_train['step_id'].unique())
    train_kcs = sorted(df_train['kc_id'].unique())

    stu2idx = {s: i for i, s in enumerate(train_students)}
    q2idx = {q: i for i, q in enumerate(train_questions)}
    t2idx = {t: i for i, t in enumerate(train_steps)}
    c2idx = {c: i for i, c in enumerate(train_kcs)}

    unk_indices = {
        'student': len(train_students),
        'question': len(train_questions),
        'step': len(train_steps),
        'kc': len(train_kcs)
    }

    entity_counts = {
        'num_students': len(train_students) + 1,
        'num_questions': len(train_questions) + 1,
        'num_steps': len(train_steps) + 1,
        'num_kcs': len(train_kcs) + 1,
    }

    mappings = {
        'stu2idx': stu2idx, 'q2idx': q2idx,
        't2idx': t2idx, 'c2idx': c2idx
    }

    return mappings, entity_counts, unk_indices


# ============================================================
# FUNCTION 2: Build Heterogeneous Graph (UNCHANGED)
# ============================================================
def build_graph(df_train, mappings, entity_counts, unk_indices):
    stu2idx = mappings['stu2idx']
    q2idx = mappings['q2idx']
    t2idx = mappings['t2idx']
    c2idx = mappings['c2idx']

    data = HeteroData()

    data['student'].num_nodes = entity_counts['num_students']
    data['question'].num_nodes = entity_counts['num_questions']
    data['step'].num_nodes = entity_counts['num_steps']
    data['kc'].num_nodes = entity_counts['num_kcs']

    qt_pairs = df_train[['question_id', 'step_id']].drop_duplicates()
    q_idx_list = [q2idx[r['question_id']] for _, r in qt_pairs.iterrows()]
    t_idx_list = [t2idx[r['step_id']] for _, r in qt_pairs.iterrows()]

    data['question', 'contains', 'step'].edge_index = torch.tensor([q_idx_list, t_idx_list], dtype=torch.long)
    data['step', 'belongs_to', 'question'].edge_index = torch.tensor([t_idx_list, q_idx_list], dtype=torch.long)

    tc_pairs = df_train[['step_id', 'kc_id']].drop_duplicates()
    t_idx_list2 = [t2idx[r['step_id']] for _, r in tc_pairs.iterrows()]
    c_idx_list = [c2idx[r['kc_id']] for _, r in tc_pairs.iterrows()]

    data['step', 'requires', 'kc'].edge_index = torch.tensor([t_idx_list2, c_idx_list], dtype=torch.long)
    data['kc', 'required_by', 'step'].edge_index = torch.tensor([c_idx_list, t_idx_list2], dtype=torch.long)

    sq_pairs = df_train[['student_id', 'question_id']].drop_duplicates()
    s_idx_list = [stu2idx[r['student_id']] for _, r in sq_pairs.iterrows()]
    q_idx_list2 = [q2idx[r['question_id']] for _, r in sq_pairs.iterrows()]

    data['student', 'attempted', 'question'].edge_index = torch.tensor([s_idx_list, q_idx_list2], dtype=torch.long)
    data['question', 'attempted_by', 'student'].edge_index = torch.tensor([q_idx_list2, s_idx_list], dtype=torch.long)

    total_edges = sum(
        data[et].edge_index.shape[1]
        for et in data.edge_types
    )

    return data, total_edges


# ============================================================
# FUNCTION 3: Compute Node Features WITH MASTERY (M3 FINAL - FIXED)
# ============================================================
def compute_node_features_with_mastery(df_train, mappings, entity_counts, unk_indices, mastery_matrix):
    """
    M3 FINAL: Compute features with mastery for students

    Students: [5D stats + mastery_vector] = (5 + num_kcs)D
    Others (Q, T, C): 5D stats only

    Args:
        mastery_matrix: [num_students, num_kcs] tensor with current mastery values

    Returns:
        feat_tensors: Dict with tensors for each entity type
            - student: [num_students, 5 + num_kcs]
            - question, step, kc: [num_entities, 5]
    """
    # ‚úÖ FIX: Include UNK KC
    num_kcs = entity_counts['num_kcs']

    # ‚úÖ FIX: Move mastery to CPU for feature computation
    mastery_cpu = mastery_matrix.cpu() if mastery_matrix.is_cuda else mastery_matrix

    def compute_features_for_type(df, entity_col):
        grouped = df.groupby(entity_col).agg({
            'correct': ['count', 'mean'],
            'log_duration': 'mean',
            'Hints': 'mean',
            'Incorrects': 'mean'
        })
        grouped.columns = ['freq', 'correct_rate', 'avg_log_dur', 'avg_hints', 'avg_incorrects']
        grouped = grouped.reset_index()
        grouped['difficulty'] = 1 - grouped['correct_rate']
        grouped['log_freq'] = np.log1p(grouped['freq'])

        features = {}
        feat_cols = ['log_freq', 'difficulty', 'avg_log_dur', 'avg_hints', 'avg_incorrects']
        for _, row in grouped.iterrows():
            features[row[entity_col]] = row[feat_cols].values.astype(np.float32)
        return features

    def to_tensor_5d(features_dict, idx_map, num_with_unk, unk_idx):
        """Create 5D tensor (original)"""
        tensor = torch.zeros(num_with_unk, NUM_FEATURES, dtype=torch.float32)
        all_feats = []
        for entity_id, idx in idx_map.items():
            if entity_id in features_dict:
                tensor[idx] = torch.tensor(features_dict[entity_id])
                all_feats.append(features_dict[entity_id])
        if all_feats:
            tensor[unk_idx] = torch.tensor(np.mean(all_feats, axis=0))
        return tensor

    def to_tensor_with_mastery(features_dict, idx_map, num_with_unk, unk_idx, mastery_matrix):
        """Create [5D + mastery] tensor for students"""
        # 5D features
        tensor_5d = to_tensor_5d(features_dict, idx_map, num_with_unk, unk_idx)

        # Concatenate with mastery (already on CPU)
        tensor_full = torch.cat([tensor_5d, mastery_matrix], dim=1)  # [num_students, 5 + num_kcs]

        return tensor_full

    def normalize(tensor):
        mean = tensor.mean(dim=0, keepdim=True)
        std = tensor.std(dim=0, keepdim=True) + 1e-8
        return (tensor - mean) / std

    # Compute 5D stats for all entity types
    stu_feats = compute_features_for_type(df_train, 'student_id')
    q_feats = compute_features_for_type(df_train, 'question_id')
    t_feats = compute_features_for_type(df_train, 'step_id')
    c_feats = compute_features_for_type(df_train, 'kc_id')

    # Students: [5D + mastery] - use CPU version
    student_features = to_tensor_with_mastery(
        stu_feats, mappings['stu2idx'],
        entity_counts['num_students'], unk_indices['student'],
        mastery_cpu  # ‚Üê Use CPU version
    )

    # Others: 5D only
    feat_tensors = {
        'student': normalize(student_features),  # [num_students, 5 + num_kcs]
        'question': normalize(to_tensor_5d(q_feats, mappings['q2idx'], entity_counts['num_questions'], unk_indices['question'])),
        'step': normalize(to_tensor_5d(t_feats, mappings['t2idx'], entity_counts['num_steps'], unk_indices['step'])),
        'kc': normalize(to_tensor_5d(c_feats, mappings['c2idx'], entity_counts['num_kcs'], unk_indices['kc'])),
    }

    return feat_tensors


# ============================================================
# FUNCTION 4: Model Components
# ============================================================

class NodeEncoder(nn.Module):
    """Encode node features to embedding space"""
    def __init__(self, input_dim, embed_dim, dropout=0.1):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, embed_dim),
            nn.LayerNorm(embed_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(embed_dim, embed_dim),
            nn.LayerNorm(embed_dim)
        )

    def forward(self, x):
        return self.encoder(x)


class MasteryMessagePassing(MessagePassing):
    """
    Message passing layer with mastery-weighted edges
    Used for Student ‚Üî KC knowledge propagation
    """
    def __init__(self, embed_dim, gating='identity'):
        super().__init__(aggr='mean')
        self.lin = nn.Linear(embed_dim, embed_dim)
        self.gating = gating

    def forward(self, h_nodes, edge_index, mastery_weights):
        """
        Args:
            h_nodes: [num_nodes, embed_dim] - all nodes (students + KCs)
            edge_index: [2, num_edges] - S‚ÜîC edges
            mastery_weights: [num_edges] - mastery weights ‚àà [0,1]

        Returns:
            h_out: [num_nodes, embed_dim] - updated embeddings
        """
        out = self.propagate(edge_index, x=h_nodes, mastery=mastery_weights)
        return out

    def message(self, x_j, mastery):
        """
        x_j: neighbor features
        mastery: edge weights
        """
        return mastery.unsqueeze(-1) * self.lin(x_j)


# ============================================================
# MODEL: GraphKT M3 FINAL (Features + Weights + Dual Propagation)
# ============================================================

class GraphKTMinimal(nn.Module):
    """
    GraphKT M3 FINAL with:
    - Student features: [5D + mastery] (from M3a)
    - Block A: Structural Propagation (Q,T,C via RGCN) (from M3b)
    - Block B: Knowledge Propagation (S‚ÜîC weighted by mastery) (from M3b)
    - Dynamic feature updates each epoch (from M3a)
    """

    NUM_RELATIONS_STRUCT = 4  # Q‚ÜîT, T‚ÜîC (4 relation types)

    def __init__(self, num_students, num_questions, num_steps, num_kcs,
                 feature_dim=5, embed_dim=32, hidden_dim=64,
                 num_gnn_layers=2, dropout=0.2,
                 mastery_init=0.5, mastery_gating='identity'):
        super().__init__()

        self.num_students = num_students
        self.num_questions = num_questions
        self.num_steps = num_steps
        self.num_kcs = num_kcs
        self.embed_dim = embed_dim

        # === Node Encoders ===
        # ‚úÖ FIX: Student encoder includes UNK KC
        self.student_encoder = NodeEncoder(feature_dim + num_kcs, embed_dim, dropout)  # 5 + num_kcs
        self.question_encoder = NodeEncoder(feature_dim, embed_dim, dropout)  # 5D
        self.step_encoder = NodeEncoder(feature_dim, embed_dim, dropout)  # 5D
        self.kc_encoder = NodeEncoder(feature_dim, embed_dim, dropout)  # 5D

        # === BLOCK A: Structural Propagation (Q,T,C) ===
        self.structural_rgcn = nn.ModuleList([
            RGCNConv(
                embed_dim, embed_dim,
                num_relations=self.NUM_RELATIONS_STRUCT,
                num_bases=2
            )
            for _ in range(num_gnn_layers)
        ])
        self.struct_norms = nn.ModuleList([
            nn.LayerNorm(embed_dim) for _ in range(num_gnn_layers)
        ])
        self.struct_dropout = nn.Dropout(dropout)

        # === BLOCK B: Knowledge Propagation (S‚ÜîC with mastery) ===
        self.knowledge_conv = MasteryMessagePassing(
            embed_dim,
            gating=mastery_gating
        )

        # === Fusion Layers ===
        # Student: [h_s_base, h_s_knowledge] ‚Üí embed_dim
        self.fusion_student = nn.Sequential(
            nn.Linear(embed_dim * 2, embed_dim),
            nn.LayerNorm(embed_dim),
            nn.ReLU(),
            nn.Dropout(dropout)
        )

        # KC: [h_c_structural, h_c_knowledge] ‚Üí embed_dim
        self.fusion_kc = nn.Sequential(
            nn.Linear(embed_dim * 2, embed_dim),
            nn.LayerNorm(embed_dim),
            nn.ReLU(),
            nn.Dropout(dropout)
        )

        # === Prediction Head ===
        pred_input_dim = embed_dim * 3  # [h_s*, h_t_struct, h_c*]
        self.prediction_head = nn.Sequential(
            nn.Linear(pred_input_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim // 2, 1)
        )

        # === Mastery Matrix ===
        self.mastery_init = mastery_init
        self.mastery_gating = mastery_gating
        self.register_buffer('mastery_matrix', None)

        # Gating parameter for sigmoid
        if mastery_gating == 'sigmoid':
            self.gating_alpha = nn.Parameter(torch.tensor(2.0))

        # Cache
        self._cached_struct_edges = None
        self._cached_struct_types = None

    def init_mastery_matrix(self, device):
        """Initialize mastery matrix (call once per fold)"""
        # ‚úÖ FIX: Include UNK KC in mastery matrix
        self.mastery_matrix = torch.full(
            (self.num_students, self.num_kcs),  # Include UNK
            self.mastery_init,
            dtype=torch.float32,
            device=device
        )
        print(f"  ‚úì Mastery matrix initialized: [{self.num_students}, {self.num_kcs}] with value={self.mastery_init}")

    def _build_structural_edges(self, hetero_data, device):
        """Build edge_index and edge_type for Q,T,C (BLOCK A)"""
        if self._cached_struct_edges is not None:
            return self._cached_struct_edges, self._cached_struct_types

        all_edges = []
        all_types = []

        # ‚úì LOCAL offsets for H_struct = [Q, T, C]
        question_offset_local = 0
        step_offset_local = self.num_questions
        kc_offset_local = self.num_questions + self.num_steps

        # Relation 0: Q ‚Üí T (contains)
        q_t = hetero_data['question', 'contains', 'step'].edge_index.clone()
        q_t[0] += question_offset_local
        q_t[1] += step_offset_local
        all_edges.append(q_t)
        all_types.append(torch.zeros(q_t.size(1), dtype=torch.long))

        # Relation 1: T ‚Üí Q (belongs_to)
        t_q = hetero_data['step', 'belongs_to', 'question'].edge_index.clone()
        t_q[0] += step_offset_local
        t_q[1] += question_offset_local
        all_edges.append(t_q)
        all_types.append(torch.ones(t_q.size(1), dtype=torch.long))

        # Relation 2: T ‚Üí C (requires)
        t_c = hetero_data['step', 'requires', 'kc'].edge_index.clone()
        t_c[0] += step_offset_local
        t_c[1] += kc_offset_local
        all_edges.append(t_c)
        all_types.append(torch.full((t_c.size(1),), 2, dtype=torch.long))

        # Relation 3: C ‚Üí T (required_by)
        c_t = hetero_data['kc', 'required_by', 'step'].edge_index.clone()
        c_t[0] += kc_offset_local
        c_t[1] += step_offset_local
        all_edges.append(c_t)
        all_types.append(torch.full((c_t.size(1),), 3, dtype=torch.long))

        self._cached_struct_edges = torch.cat(all_edges, dim=1).to(device)
        self._cached_struct_types = torch.cat(all_types, dim=0).to(device)

        return self._cached_struct_edges, self._cached_struct_types

    def _build_sc_edges_batch(self, student_idx, kc_idx, device):
        """
        Build S‚ÜîC edge_index for batch with mastery weights

        Returns:
            edge_index: [2, 2*batch_size]
            mastery_weights: [2*batch_size]
        """
        # ‚úì LOCAL offset for H_know = [S, C]
        kc_offset_local = self.num_students

        # S ‚Üí C edges
        edge_index_sc = torch.stack([student_idx, kc_idx + kc_offset_local], dim=0)
        # C ‚Üí S edges (reverse)
        edge_index_cs = torch.stack([kc_idx + kc_offset_local, student_idx], dim=0)

        # Concatenate
        edge_index = torch.cat([edge_index_sc, edge_index_cs], dim=1).to(device)

        # Lookup mastery weights (VECTORIZED)
        mastery_weights_sc = self.mastery_matrix[student_idx, kc_idx]
        mastery_weights_cs = mastery_weights_sc.clone()

        mastery_weights = torch.cat([mastery_weights_sc, mastery_weights_cs])

        # Apply gating function
        if self.mastery_gating == 'sigmoid':
            mastery_weights = torch.sigmoid(
                self.gating_alpha * (mastery_weights - 0.5)
            )

        return edge_index, mastery_weights

    def forward(self, hetero_data, student_idx, step_idx, kc_idx, device):
        """
        Forward pass with Structural + Knowledge propagation
        NO event features to prevent leakage
        """
        # === 1. Encode Nodes ===
        h_s_base = self.student_encoder(hetero_data['student'].x.to(device))
        h_q = self.question_encoder(hetero_data['question'].x.to(device))
        h_t = self.step_encoder(hetero_data['step'].x.to(device))
        h_c_base = self.kc_encoder(hetero_data['kc'].x.to(device))

        # === 2. BLOCK A: Structural Propagation (Q,T,C) ===
        H_struct = torch.cat([h_q, h_t, h_c_base], dim=0)
        edge_index_struct, edge_type_struct = self._build_structural_edges(
            hetero_data, device
        )

        for layer, norm in zip(self.structural_rgcn, self.struct_norms):
            H_new = layer(H_struct, edge_index_struct, edge_type_struct)
            H_new = norm(H_new)
            H_new = F.relu(H_new)
            H_new = self.struct_dropout(H_new)
            H_struct = H_struct + H_new  # Residual connection

        # Extract h_t^struct, h_c^struct
        num_questions = h_q.size(0)
        num_steps = h_t.size(0)
        h_t_struct = H_struct[num_questions : num_questions + num_steps]
        h_c_struct = H_struct[num_questions + num_steps :]

        # === 3. BLOCK B: Knowledge Propagation (S‚ÜîC with mastery) ===
        edge_index_sc, mastery_weights = self._build_sc_edges_batch(
            student_idx, kc_idx, device
        )

        # Combine S + C nodes
        H_know = torch.cat([h_s_base, h_c_base], dim=0)

        # Message passing with mastery weighting
        H_know = self.knowledge_conv(H_know, edge_index_sc, mastery_weights)

        # Split back
        h_s_know = H_know[:self.num_students]
        h_c_know = H_know[self.num_students:]

        # === 4. Fusion ===
        # Student: [h_s_base, h_s_knowledge]
        h_s_batch = h_s_base[student_idx]
        h_s_know_batch = h_s_know[student_idx]
        h_s_fused = self.fusion_student(
            torch.cat([h_s_batch, h_s_know_batch], dim=-1)
        )

        # Step: structural only
        h_t_batch = h_t_struct[step_idx]

        # KC: [h_c_structural, h_c_knowledge]
        h_c_struct_batch = h_c_struct[kc_idx]
        h_c_know_batch = h_c_know[kc_idx]
        h_c_fused = self.fusion_kc(
            torch.cat([h_c_struct_batch, h_c_know_batch], dim=-1)
        )

        # === 5. Prediction ===
        combined = torch.cat([h_s_fused, h_t_batch, h_c_fused], dim=-1)
        logits = self.prediction_head(combined).squeeze(-1)

        return logits

    def update_mastery_online(self, student_idx, kc_idx, y_true, lambda_ema):
        """
        Update mastery AFTER observing outcome (prevents leakage)

        Args:
            student_idx: [batch] student indices
            kc_idx: [batch] KC indices
            y_true: [batch] outcomes (0 or 1)
            lambda_ema: learning rate for EMA update
        """
        with torch.no_grad():
            # EMA update: M ‚Üê (1-Œª)M + Œªy
            old_mastery = self.mastery_matrix[student_idx, kc_idx]
            new_mastery = (1 - lambda_ema) * old_mastery + lambda_ema * y_true
            self.mastery_matrix[student_idx, kc_idx] = new_mastery


# ============================================================
# FUNCTION 5: Dataset (UNCHANGED)
# ============================================================

class KTDatasetPure(Dataset):
    """Dataset for Knowledge Tracing"""
    def __init__(self, df, stu2idx, t2idx, c2idx,
                 unk_student_idx, unk_step_idx, unk_kc_idx):
        self.df = df.reset_index(drop=True)
        self.stu2idx = stu2idx
        self.t2idx = t2idx
        self.c2idx = c2idx
        self.unk_student_idx = unk_student_idx
        self.unk_step_idx = unk_step_idx
        self.unk_kc_idx = unk_kc_idx

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        student_idx = self.stu2idx.get(row['student_id'], self.unk_student_idx)
        step_idx = self.t2idx.get(row['step_id'], self.unk_step_idx)
        kc_idx = self.c2idx.get(row['kc_id'], self.unk_kc_idx)
        label = torch.tensor(row['correct'], dtype=torch.float32)
        return student_idx, step_idx, kc_idx, label


# ============================================================
# FUNCTION 6: Training & Evaluation (UNCHANGED)
# ============================================================

class EarlyStopping:
    """Early stopping with model state saving"""
    def __init__(self, patience=10, min_delta=0.001):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.best_state = None

    def __call__(self, score, model):
        if self.best_score is None or score > self.best_score + self.min_delta:
            self.best_score = score
            self.best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

    def load_best(self, model):
        if self.best_state:
            model.load_state_dict(self.best_state)


def train_epoch(model, loader, optimizer, criterion, hetero_data, device,
                config, grad_clip=1.0):
    """Train for one epoch with mastery update"""
    model.train()
    total_loss = 0
    all_preds, all_labels = [], []

    for batch in loader:
        student_idx, step_idx, kc_idx, labels = batch
        student_idx = student_idx.to(device)
        step_idx = step_idx.to(device)
        kc_idx = kc_idx.to(device)
        labels = labels.to(device)

        # 1. FORWARD (with current mastery - history up to t-1)
        logits = model(hetero_data, student_idx, step_idx, kc_idx, device)
        loss = criterion(logits, labels)

        # 2. BACKWARD (update model weights)
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        optimizer.step()

        # 3. UPDATE MASTERY (AFTER backward, no gradient)
        #    ‚ö†Ô∏è CRITICAL: This order prevents leakage
        if hasattr(model, 'update_mastery_online'):
            model.update_mastery_online(
                student_idx, kc_idx, labels,
                lambda_ema=config.LAMBDA_EMA
            )

        # Metrics
        total_loss += loss.item() * len(labels)
        all_preds.extend(torch.sigmoid(logits).detach().cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

    avg_loss = total_loss / len(loader.dataset)
    auc = roc_auc_score(all_labels, all_preds) if len(set(all_labels)) > 1 else 0.5
    acc = accuracy_score(all_labels, np.array(all_preds) > 0.5)

    return avg_loss, auc, acc


@torch.no_grad()
def evaluate(model, loader, criterion, hetero_data, device):
    """Evaluate without mastery update"""
    model.eval()
    total_loss = 0
    all_preds, all_labels = [], []

    for batch in loader:
        student_idx, step_idx, kc_idx, labels = batch
        student_idx = student_idx.to(device)
        step_idx = step_idx.to(device)
        kc_idx = kc_idx.to(device)
        labels = labels.to(device)

        logits = model(hetero_data, student_idx, step_idx, kc_idx, device)
        loss = criterion(logits, labels)

        total_loss += loss.item() * len(labels)
        all_preds.extend(torch.sigmoid(logits).cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

    avg_loss = total_loss / len(loader.dataset)
    auc = roc_auc_score(all_labels, all_preds) if len(set(all_labels)) > 1 else 0.5
    acc = accuracy_score(all_labels, np.array(all_preds) > 0.5)

    return avg_loss, auc, acc


# ============================================================
# FUNCTION 7: Full Single-Fold Pipeline (M3 FINAL - MODIFIED)
# ============================================================

def run_single_fold(df_train_fold, df_val_fold, config, fold_num=None, verbose=True):
    """
    M3 FINAL: Run complete training pipeline for one fold

    Key modification: RECALCULATE features each epoch with updated mastery
    """
    device = config.DEVICE
    prefix = f"[Fold {fold_num}] " if fold_num is not None else ""

    # --- Step 1: Entity mappings ---
    mappings, entity_counts, unk_indices = build_entity_mappings(df_train_fold)
    if verbose:
        print(f"{prefix}Entities: S={entity_counts['num_students']}, "
              f"Q={entity_counts['num_questions']}, "
              f"T={entity_counts['num_steps']}, "
              f"C={entity_counts['num_kcs']}")

    # --- Step 2: Build graph ---
    hetero_data, total_edges = build_graph(df_train_fold, mappings, entity_counts, unk_indices)
    if verbose:
        print(f"{prefix}Graph edges: {total_edges:,}")

    # --- Step 3: Model (init BEFORE features!) ---
    model = GraphKTMinimal(
        num_students=entity_counts['num_students'],
        num_questions=entity_counts['num_questions'],
        num_steps=entity_counts['num_steps'],
        num_kcs=entity_counts['num_kcs'],
        feature_dim=NUM_FEATURES,
        embed_dim=config.EMBED_DIM,
        hidden_dim=config.HIDDEN_DIM,
        num_gnn_layers=config.NUM_GNN_LAYERS,
        dropout=config.DROPOUT,
        mastery_init=config.MASTERY_INIT,
        mastery_gating=config.MASTERY_GATING
    ).to(device)

    # ‚≠ê M3 FINAL: Initialize mastery BEFORE computing features
    model.init_mastery_matrix(device)

    total_params = sum(p.numel() for p in model.parameters())
    if verbose:
        print(f"{prefix}Model params: {total_params:,}")

    # --- Step 4: Initial node features WITH mastery ---
    feat_tensors = compute_node_features_with_mastery(
        df_train_fold, mappings, entity_counts, unk_indices,
        model.mastery_matrix  # ‚Üê Pass current mastery
    )
    hetero_data['student'].x = feat_tensors['student']
    hetero_data['question'].x = feat_tensors['question']
    hetero_data['step'].x = feat_tensors['step']
    hetero_data['kc'].x = feat_tensors['kc']

    if verbose:
        print(f"{prefix}Student features: {hetero_data['student'].x.shape} (5D + mastery)")

    # --- Step 5: DataLoaders ---
    train_dataset = KTDatasetPure(
        df_train_fold, mappings['stu2idx'], mappings['t2idx'], mappings['c2idx'],
        unk_indices['student'], unk_indices['step'], unk_indices['kc']
    )
    val_dataset = KTDatasetPure(
        df_val_fold, mappings['stu2idx'], mappings['t2idx'], mappings['c2idx'],
        unk_indices['student'], unk_indices['step'], unk_indices['kc']
    )

    train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True,
                              pin_memory=True, num_workers=0)
    val_loader = DataLoader(val_dataset, batch_size=512, shuffle=False,
                            pin_memory=True, num_workers=0)

    if verbose:
        print(f"{prefix}Train: {len(train_loader.dataset):,} samples, "
              f"Val: {len(val_loader.dataset):,} samples")

    # --- Step 6: Training setup ---
    n_correct = df_train_fold['correct'].sum()
    n_incorrect = len(df_train_fold) - n_correct
    pos_weight = torch.tensor([n_incorrect / n_correct], dtype=torch.float32).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=config.LEARNING_RATE,
                                   weight_decay=config.WEIGHT_DECAY)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='max', factor=0.5, patience=5
    )
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    early_stopping = EarlyStopping(patience=config.PATIENCE)

    # --- Step 7: Training loop ---
    history = defaultdict(list)
    best_val_auc = 0

    if verbose:
        print(f"{prefix}Training (pos_weight={pos_weight.item():.4f})...")
        print("-" * 70)

    for epoch in range(config.EPOCHS):
        epoch_start = time.time()

        # ‚≠ê M3 FINAL: RECALCULATE features with updated mastery each epoch
        feat_tensors = compute_node_features_with_mastery(
            df_train_fold, mappings, entity_counts, unk_indices,
            model.mastery_matrix  # ‚Üê Updated mastery from previous epoch
        )
        hetero_data['student'].x = feat_tensors['student']
        hetero_data['question'].x = feat_tensors['question']
        hetero_data['step'].x = feat_tensors['step']
        hetero_data['kc'].x = feat_tensors['kc']

        train_loss, train_auc, train_acc = train_epoch(
            model, train_loader, optimizer, criterion, hetero_data, device, config, config.GRAD_CLIP
        )
        val_loss, val_auc, val_acc = evaluate(
            model, val_loader, criterion, hetero_data, device
        )

        epoch_time = time.time() - epoch_start

        history['train_loss'].append(train_loss)
        history['train_auc'].append(train_auc)
        history['val_loss'].append(val_loss)
        history['val_auc'].append(val_auc)
        history['val_acc'].append(val_acc)

        marker = " ‚òÖ" if val_auc > best_val_auc else ""
        if val_auc > best_val_auc:
            best_val_auc = val_auc

        if verbose:
            print(f"{prefix}Epoch {epoch+1:3d}/{config.EPOCHS} | "
                  f"Train AUC: {train_auc:.4f} | Val AUC: {val_auc:.4f} | "
                  f"Val Acc: {val_acc:.4f} | Time: {epoch_time:.1f}s{marker}")

        scheduler.step(val_auc)
        early_stopping(val_auc, model)

        if early_stopping.early_stop:
            if verbose:
                print(f"{prefix}Early stopping at epoch {epoch+1}")
            break

    early_stopping.load_best(model)

    final_val_loss, final_val_auc, final_val_acc = evaluate(
        model, val_loader, criterion, hetero_data, device
    )

    stopped_epoch = len(history['train_loss'])

    if verbose:
        print("-" * 70)
        print(f"{prefix}Best Val AUC: {final_val_auc:.4f} | "
              f"Val Acc: {final_val_acc:.4f} | Stopped at epoch: {stopped_epoch}")

    return {
        'val_auc': final_val_auc,
        'val_acc': final_val_acc,
        'val_loss': final_val_loss,
        'train_auc': history['train_auc'][-1],
        'stopped_epoch': stopped_epoch,
        'total_params': total_params,
        'history': dict(history),
        'model_state': early_stopping.best_state,
        'hetero_data': hetero_data,
        'mappings': mappings,
        'entity_counts': entity_counts,
        'unk_indices': unk_indices,
    }


# ============================================================
# Confirmation
# ============================================================
print("=" * 60)
print("‚úì ALL PIPELINE FUNCTIONS DEFINED (M3 FINAL VERSION - FIXED)")
print("=" * 60)
print("  - build_entity_mappings() (unchanged)")
print("  - build_graph() (unchanged)")
print("  - compute_node_features_with_mastery() ‚≠ê NEW + FIXED")
print("  - GraphKTMinimal ‚≠ê FIXED (student encoder + mastery matrix)")
print("  - MasteryMessagePassing (mastery-weighted edges)")
print("  - KTDatasetPure (unchanged)")
print("  - train_epoch() / evaluate() (unchanged)")
print("  - run_single_fold() ‚≠ê MODIFIED (recalc features each epoch)")
print("=" * 60)
print("FIXES APPLIED:")
print("  ‚úÖ mastery_matrix includes UNK KC: [num_students, num_kcs]")
print("  ‚úÖ student_encoder input_dim: feature_dim + num_kcs")
print("  ‚úÖ mastery moved to CPU before concatenation")
print("=" * 60)
print("M3 FINAL = M3a Features + M3b Weights + Dual Propagation")
print("=" * 60)
print("Ready for 5-Fold Cross-Validation! üöÄ")
print("=" * 60)

‚úì ALL PIPELINE FUNCTIONS DEFINED (M3 FINAL VERSION - FIXED)
  - build_entity_mappings() (unchanged)
  - build_graph() (unchanged)
  - compute_node_features_with_mastery() ‚≠ê NEW + FIXED
  - GraphKTMinimal ‚≠ê FIXED (student encoder + mastery matrix)
  - MasteryMessagePassing (mastery-weighted edges)
  - KTDatasetPure (unchanged)
  - train_epoch() / evaluate() (unchanged)
  - run_single_fold() ‚≠ê MODIFIED (recalc features each epoch)
FIXES APPLIED:
  ‚úÖ mastery_matrix includes UNK KC: [num_students, num_kcs]
  ‚úÖ student_encoder input_dim: feature_dim + num_kcs
  ‚úÖ mastery moved to CPU before concatenation
M3 FINAL = M3a Features + M3b Weights + Dual Propagation
Ready for 5-Fold Cross-Validation! üöÄ


In [7]:
# =============================================================================
# Cell 6: Run 5-Fold Student-Level Cross-Validation
# =============================================================================

print("=" * 60)
print("5-FOLD STUDENT-LEVEL CROSS-VALIDATION")
print("=" * 60)

print(f"""
Protocol:
  - {len(non_test_students)} non-test students split into 5 folds
  - Each fold: full pipeline rebuild (mappings, graph, features, model)
  - All design decisions FROZEN before CV
  - TEST set ({len(test_students)} students) completely untouched

Frozen hyperparameters:
  EMBED_DIM={config.EMBED_DIM}, HIDDEN_DIM={config.HIDDEN_DIM}
  NUM_GNN_LAYERS={config.NUM_GNN_LAYERS}, DROPOUT={config.DROPOUT}
  LR={config.LEARNING_RATE}, WEIGHT_DECAY={config.WEIGHT_DECAY}
  BATCH_SIZE=512, PATIENCE={config.PATIENCE}
""")

cv_results = []
cv_start = time.time()

for fold_idx in range(5):
    fold_start = time.time()

    print(f"\n{'='*60}")
    print(f"FOLD {fold_idx + 1} / 5")
    print(f"{'='*60}")

    # Get this fold's students
    train_students_fold = fold_assignments[fold_idx]['train_students']
    val_students_fold = fold_assignments[fold_idx]['val_students']

    # Create dataframes for this fold
    df_train_fold = df_non_test[df_non_test['student_id'].isin(train_students_fold)].copy()
    df_val_fold = df_non_test[df_non_test['student_id'].isin(val_students_fold)].copy()

    print(f"Train students: {len(train_students_fold)}, "
          f"Val students: {len(val_students_fold)}")
    print(f"Train interactions: {len(df_train_fold):,}, "
          f"Val interactions: {len(df_val_fold):,}")

    # Run full pipeline for this fold
    result = run_single_fold(
        df_train_fold, df_val_fold, config,
        fold_num=fold_idx + 1, verbose=True
    )

    fold_time = time.time() - fold_start
    result['fold_time'] = fold_time
    cv_results.append(result)

    print(f"\nFold {fold_idx + 1} completed in {fold_time:.1f}s")

total_cv_time = time.time() - cv_start

# ============================================================
# CV Summary
# ============================================================
print("\n" + "=" * 60)
print("5-FOLD CROSS-VALIDATION RESULTS")
print("=" * 60)

val_aucs = [r['val_auc'] for r in cv_results]
val_accs = [r['val_acc'] for r in cv_results]
stopped_epochs = [r['stopped_epoch'] for r in cv_results]

print(f"\n{'Fold':<8} {'Val AUC':<12} {'Val Acc':<12} {'Epochs':<10}")
print("-" * 42)
for i, r in enumerate(cv_results):
    print(f"Fold {i+1:<3} {r['val_auc']:<12.4f} {r['val_acc']:<12.4f} {r['stopped_epoch']:<10}")

print("-" * 42)
print(f"{'Mean':<8} {np.mean(val_aucs):<12.4f} {np.mean(val_accs):<12.4f} {np.mean(stopped_epochs):<10.1f}")
print(f"{'Std':<8} {np.std(val_aucs):<12.4f} {np.std(val_accs):<12.4f} {np.std(stopped_epochs):<10.1f}")
print(f"{'Min':<8} {np.min(val_aucs):<12.4f} {np.min(val_accs):<12.4f} {np.min(stopped_epochs):<10}")
print(f"{'Max':<8} {np.max(val_aucs):<12.4f} {np.max(val_accs):<12.4f} {np.max(stopped_epochs):<10}")

print(f"\n‚ïî‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó")
print(f"‚ïë  CV Val AUC: {np.mean(val_aucs):.4f} ¬± {np.std(val_aucs):.4f}              ‚ïë")
print(f"‚ïë  CV Val Acc: {np.mean(val_accs):.4f} ¬± {np.std(val_accs):.4f}              ‚ïë")
print(f"‚ïö‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïù")

print(f"\nTotal CV time: {total_cv_time:.1f}s ({total_cv_time/60:.1f} min)")

# Store average epochs for final training
avg_epochs_cv = int(np.mean(stopped_epochs))
print(f"\nAverage stopping epoch: {avg_epochs_cv} (will use for final training)")

5-FOLD STUDENT-LEVEL CROSS-VALIDATION

Protocol:
  - 487 non-test students split into 5 folds
  - Each fold: full pipeline rebuild (mappings, graph, features, model)
  - All design decisions FROZEN before CV
  - TEST set (87 students) completely untouched

Frozen hyperparameters:
  EMBED_DIM=32, HIDDEN_DIM=64
  NUM_GNN_LAYERS=2, DROPOUT=0.2
  LR=0.001, WEIGHT_DECAY=0.01
  BATCH_SIZE=512, PATIENCE=10


FOLD 1 / 5
Train students: 389, Val students: 98
Train interactions: 524,961, Val interactions: 166,472
[Fold 1] Entities: S=390, Q=1064, T=146327, C=397
[Fold 1] Graph edges: 662,288
  ‚úì Mastery matrix initialized: [390, 397] with value=0.5
[Fold 1] Model params: 38,353
[Fold 1] Student features: torch.Size([390, 402]) (5D + mastery)
[Fold 1] Train: 524,961 samples, Val: 166,472 samples
[Fold 1] Training (pos_weight=0.3041)...
----------------------------------------------------------------------
[Fold 1] Epoch   1/100 | Train AUC: 0.9036 | Val AUC: 0.7255 | Val Acc: 0.7284 | Time: 153

In [8]:
# =============================================================================
# Cell 7: Final TEST Set Evaluation (M3 FINAL)
# =============================================================================

print("=" * 60)
print("FINAL TEST EVALUATION (M3 FINAL)")
print("=" * 60)

print(f"""
Protocol:
  - Train on ALL {len(non_test_students)} non-test students (no validation split)
  - Train for {avg_epochs_cv} epochs (average from CV, no early stopping)
  - Evaluate on held-out TEST set ({len(test_students)} students)
  - Features recalculated each epoch with updated mastery (M3 FINAL)
  - This number appears in the paper as TEST performance
""")

# --- Train on all non-test data ---
df_train_final = df_non_test.copy()

print("[1/6] Building entity mappings from all non-test data...")
mappings_final, entity_counts_final, unk_indices_final = build_entity_mappings(df_train_final)
print(f"  S={entity_counts_final['num_students']}, "
      f"Q={entity_counts_final['num_questions']}, "
      f"T={entity_counts_final['num_steps']}, "
      f"C={entity_counts_final['num_kcs']}")

print("[2/6] Building graph...")
hetero_data_final, total_edges_final = build_graph(
    df_train_final, mappings_final, entity_counts_final, unk_indices_final
)
print(f"  Edges: {total_edges_final:,}")

# ‚≠ê M3 FINAL: Create model BEFORE features
print("[3/6] Creating model...")
device = config.DEVICE

model_final = GraphKTMinimal(
    num_students=entity_counts_final['num_students'],
    num_questions=entity_counts_final['num_questions'],
    num_steps=entity_counts_final['num_steps'],
    num_kcs=entity_counts_final['num_kcs'],
    feature_dim=NUM_FEATURES,
    embed_dim=config.EMBED_DIM,
    hidden_dim=config.HIDDEN_DIM,
    num_gnn_layers=config.NUM_GNN_LAYERS,
    dropout=config.DROPOUT,
    mastery_init=config.MASTERY_INIT,
    mastery_gating=config.MASTERY_GATING
).to(device)

# ‚≠ê M3 FINAL: Init mastery BEFORE features
model_final.init_mastery_matrix(device)

total_params_final = sum(p.numel() for p in model_final.parameters())
print(f"  Model params: {total_params_final:,}")

print("[4/6] Computing initial node features WITH mastery...")
feat_tensors_final = compute_node_features_with_mastery(
    df_train_final, mappings_final, entity_counts_final, unk_indices_final,
    model_final.mastery_matrix  # ‚Üê Pass mastery
)
hetero_data_final['student'].x = feat_tensors_final['student']
hetero_data_final['question'].x = feat_tensors_final['question']
hetero_data_final['step'].x = feat_tensors_final['step']
hetero_data_final['kc'].x = feat_tensors_final['kc']

print(f"  Student features: {hetero_data_final['student'].x.shape} (5D + mastery)")

print("[5/6] Creating dataloaders...")
train_dataset_final = KTDatasetPure(
    df_train_final,
    mappings_final['stu2idx'], mappings_final['t2idx'], mappings_final['c2idx'],
    unk_indices_final['student'], unk_indices_final['step'], unk_indices_final['kc']
)
test_dataset_final = KTDatasetPure(
    df_test_final,
    mappings_final['stu2idx'], mappings_final['t2idx'], mappings_final['c2idx'],
    unk_indices_final['student'], unk_indices_final['step'], unk_indices_final['kc']
)

train_loader_final = DataLoader(train_dataset_final, batch_size=512, shuffle=True,
                                 pin_memory=True, num_workers=0)
test_loader_final = DataLoader(test_dataset_final, batch_size=512, shuffle=False,
                                pin_memory=True, num_workers=0)

print(f"  Train: {len(train_dataset_final):,} samples")
print(f"  Test:  {len(test_dataset_final):,} samples")

print("[6/6] Training final model...")

# Class weights from full training set
n_correct = df_train_final['correct'].sum()
n_incorrect = len(df_train_final) - n_correct
pos_weight_final = torch.tensor([n_incorrect / n_correct], dtype=torch.float32).to(device)

optimizer_final = torch.optim.AdamW(model_final.parameters(), lr=config.LEARNING_RATE,
                                      weight_decay=config.WEIGHT_DECAY)
scheduler_final = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer_final, mode='min', factor=0.5, patience=5
)
criterion_final = nn.BCEWithLogitsLoss(pos_weight=pos_weight_final)

# Train for fixed number of epochs (from CV average)
print(f"\nTraining for {avg_epochs_cv} epochs (CV average)...")
print("-" * 70)

for epoch in range(avg_epochs_cv):
    epoch_start = time.time()

    # ‚≠ê M3 FINAL: RECALCULATE features with updated mastery each epoch
    feat_tensors_final = compute_node_features_with_mastery(
        df_train_final, mappings_final, entity_counts_final, unk_indices_final,
        model_final.mastery_matrix  # ‚Üê Updated mastery from previous epoch
    )
    hetero_data_final['student'].x = feat_tensors_final['student']
    hetero_data_final['question'].x = feat_tensors_final['question']
    hetero_data_final['step'].x = feat_tensors_final['step']
    hetero_data_final['kc'].x = feat_tensors_final['kc']

    train_loss, train_auc, train_acc = train_epoch(
        model_final, train_loader_final, optimizer_final, criterion_final,
        hetero_data_final, device, config, config.GRAD_CLIP
    )

    epoch_time = time.time() - epoch_start

    if (epoch + 1) % 5 == 0 or epoch == 0 or (epoch + 1) == avg_epochs_cv:
        print(f"Epoch {epoch+1:3d}/{avg_epochs_cv} | "
              f"Train Loss: {train_loss:.4f} | Train AUC: {train_auc:.4f} | "
              f"Train Acc: {train_acc:.4f} | Time: {epoch_time:.1f}s")

    scheduler_final.step(train_loss)

print("-" * 70)

# --- Final TEST evaluation ---
print("\nEvaluating on TEST set...")
test_loss, test_auc, test_acc = evaluate(
    model_final, test_loader_final, criterion_final,
    hetero_data_final, device
)

print(f"\n{'='*60}")
print(f"FINAL RESULTS (M3 FINAL)")
print(f"{'='*60}")
print(f"""
‚ïî‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó
‚ïë         GraphKT M3 FINAL (Features + Weights + Dual)         ‚ïë
‚ïë                    Algebra 2005-2006 Dataset                 ‚ïë
‚ï†‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ï£
‚ïë                                                              ‚ïë
‚ïë  5-Fold CV Validation:                                       ‚ïë
‚ïë    AUC:      {np.mean(val_aucs):.4f} ¬± {np.std(val_aucs):.4f}                              ‚ïë
‚ïë    Accuracy: {np.mean(val_accs):.4f} ¬± {np.std(val_accs):.4f}                              ‚ïë
‚ïë                                                              ‚ïë
‚ïë  Test Set (held-out, {len(test_students)} students):                          ‚ïë
‚ïë    AUC:      {test_auc:.4f}                                        ‚ïë
‚ïë    Accuracy: {test_acc:.4f}                                        ‚ïë
‚ïë                                                              ‚ïë
‚ïë  Model: {total_params_final:,} parameters                              ‚ïë
‚ïë  Training: {avg_epochs_cv} epochs (CV average)                           ‚ïë
‚ïë  Split: Student-level (Split B), no leakage                  ‚ïë
‚ïë                                                              ‚ïë
‚ïë  Architecture:                                               ‚ïë
‚ïë    ‚Ä¢ Student features: [5D + mastery] dynamic                ‚ïë
‚ïë    ‚Ä¢ Dual propagation (Structural + Knowledge)               ‚ïë
‚ïë    ‚Ä¢ Mastery in features AND weights                         ‚ïë
‚ïë                                                              ‚ïë
‚ïö‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïù
""")

FINAL TEST EVALUATION (M3 FINAL)

Protocol:
  - Train on ALL 487 non-test students (no validation split)
  - Train for 12 epochs (average from CV, no early stopping)
  - Evaluate on held-out TEST set (87 students)
  - Features recalculated each epoch with updated mastery (M3 FINAL)
  - This number appears in the paper as TEST performance

[1/6] Building entity mappings from all non-test data...
  S=488, Q=1081, T=184307, C=428
[2/6] Building graph...
  Edges: 839,032
[3/6] Creating model...
  ‚úì Mastery matrix initialized: [488, 428] with value=0.5
  Model params: 39,345
[4/6] Computing initial node features WITH mastery...
  Student features: torch.Size([488, 433]) (5D + mastery)
[5/6] Creating dataloaders...
  Train: 691,433 samples
  Test:  118,261 samples
[6/6] Training final model...

Training for 12 epochs (CV average)...
----------------------------------------------------------------------
Epoch   1/12 | Train Loss: 0.1760 | Train AUC: 0.9050 | Train Acc: 0.8091 | Time: 180.4s

In [9]:
# =============================================================================
# Cell 8: Paper-Ready Results Summary & Per-Fold Analysis (M3 FINAL)
# =============================================================================

print("=" * 60)
print("DETAILED RESULTS FOR PAPER (M3 FINAL)")
print("=" * 60)

# Per-fold table
print("\nTable 1: Per-Fold Cross-Validation Results")
print("-" * 55)
print(f"{'Fold':<6} {'Students':<10} {'Interactions':<14} {'Val AUC':<10} {'Val Acc':<10} {'Epochs':<8}")
print("-" * 55)
for i, r in enumerate(cv_results):
    n_val_stu = len(fold_assignments[i]['val_students'])
    df_val_f = df_non_test[df_non_test['student_id'].isin(fold_assignments[i]['val_students'])]
    n_val_int = len(df_val_f)
    print(f"{i+1:<6} {n_val_stu:<10} {n_val_int:<14,} {r['val_auc']:<10.4f} {r['val_acc']:<10.4f} {r['stopped_epoch']:<8}")

print("-" * 55)
print(f"{'Mean':<6} {'':10} {'':14} {np.mean(val_aucs):<10.4f} {np.mean(val_accs):<10.4f} {np.mean(stopped_epochs):<8.1f}")
print(f"{'¬±Std':<6} {'':10} {'':14} {np.std(val_aucs):<10.4f} {np.std(val_accs):<10.4f} {np.std(stopped_epochs):<8.1f}")

# Summary table for paper
print(f"\n\nTable 2: Model Comparison (for paper)")
print("-" * 65)
print(f"{'Model':<20} {'Val AUC':<16} {'Test AUC':<12} {'Params':<10}")
print("-" * 65)
print(f"{'GraphKT M3 Final':<20} {np.mean(val_aucs):.4f} ¬± {np.std(val_aucs):.4f}   {test_auc:<12.4f} {sum(p.numel() for p in model_final.parameters()):,}")

print("\n  Architecture: Features + Weights + Dual Propagation")
print("  - Student features: [5D + mastery] (dynamic)")
print("  - Dual propagation: Structural (Q‚ÜîT‚ÜîC) + Knowledge (S‚ÜîC weighted)")
print("  - Mastery in both features AND edge weights")

# Consistency check
auc_range = np.max(val_aucs) - np.min(val_aucs)
print(f"\n\nConsistency Analysis:")
print(f"  AUC range across folds: {auc_range:.4f}")
if auc_range < 0.03:
    print(f"  ‚úì Highly consistent (range < 0.03)")
elif auc_range < 0.05:
    print(f"  ~ Moderately consistent (range < 0.05)")
else:
    print(f"  ‚ö† High variance across folds - investigate fold differences")

print(f"\n  Coefficient of variation: {np.std(val_aucs)/np.mean(val_aucs)*100:.2f}%")

DETAILED RESULTS FOR PAPER (M3 FINAL)

Table 1: Per-Fold Cross-Validation Results
-------------------------------------------------------
Fold   Students   Interactions   Val AUC    Val Acc    Epochs  
-------------------------------------------------------
1      98         166,472        0.7251     0.7432     11      
2      98         112,205        0.7109     0.6711     13      
3      97         153,605        0.7634     0.7599     11      
4      97         133,779        0.7304     0.6259     14      
5      97         125,372        0.7271     0.7878     11      
-------------------------------------------------------
Mean                             0.7314     0.7176     12.0    
¬±Std                             0.0174     0.0599     1.3     


Table 2: Model Comparison (for paper)
-----------------------------------------------------------------
Model                Val AUC          Test AUC     Params    
-----------------------------------------------------------------
Gra