In [None]:
import pandas as pd
import numpy as np
import os
import joblib
import re
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from pyBKT.models import Model
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

# --- Paths and constants ---
# Directory that trained model artifacts are saved into.
MODELS_DIR = 'models/'
# Directory containing the model data file read by this notebook.
PROCESSED_DATA_DIR = 'data/processed/'

# The notebook reads the final modeling table directly from this parquet file
# rather than rebuilding it from earlier processing stages.
MODEL_DATA_FILE = os.path.join(PROCESSED_DATA_DIR, "LGBM_MODEL_DATA.parquet")

print("Setup complete. Ready to load pre-processed data and train final models.")

In [None]:
# Load the modeling table. When the parquet file is absent, report the problem
# and bind an empty DataFrame so that later cells can detect the condition via
# `df_model_data.empty` instead of failing on an undefined name.
if not os.path.exists(MODEL_DATA_FILE):
    print(f"FATAL: Final model data file not found at '{MODEL_DATA_FILE}'.")
    print("Please ensure you have downloaded the results from Colab and placed the file in the correct directory.")
    df_model_data = pd.DataFrame()
else:
    df_model_data = pd.read_parquet(MODEL_DATA_FILE)
    print(f"Successfully loaded {len(df_model_data):,} feature-rich interactions.")
    print("Columns available:", df_model_data.columns.tolist())

In [None]:
# Train, evaluate, persist and inspect the LightGBM success predictor.
# The whole cell is skipped when the load cell produced an empty DataFrame.
if not df_model_data.empty:
    print("\n--- Training ENRICHED LGBM 'Success Predictor' Model ---")

    # Embedding columns are discovered dynamically from the dataframe by prefix.
    embedding_cols = [col for col in df_model_data.columns if col.startswith('embed_')]

    base_features = ['prior_response_time', 'prior_is_correct', 'skill_id_encoded', 'skill_attempts', 'skill_correct_rate', 'question_length', 'bkt_prior_mastery']
    requested_features = base_features + embedding_cols
    target = 'is_correct'

    # Keep only the features that are actually present, but report the ones
    # that were dropped so a reduced feature set does not go unnoticed.
    features = [f for f in requested_features if f in df_model_data.columns]
    missing_features = [f for f in requested_features if f not in df_model_data.columns]
    if missing_features:
        print(f"WARNING: {len(missing_features)} expected feature(s) missing from the data and skipped: {missing_features}")
    if not features:
        raise ValueError("None of the expected feature columns are present in the model data; cannot train.")
    if target not in df_model_data.columns:
        raise KeyError(f"Target column '{target}' is missing from the model data; cannot train.")

    # NOTE(review): stratifying on student_id places rows of every student in
    # both splits, so the validation AUC reflects already-seen students.
    # Confirm this is intended; a group-wise split would be needed to measure
    # generalization to unseen students.
    try:
        train_df, val_df = train_test_split(df_model_data, test_size=0.2, random_state=42, stratify=df_model_data['student_id'])
    except ValueError as split_error:
        # train_test_split rejects stratification when a class (here: a
        # student) has too few rows or there are more classes than test rows.
        # Fall back to a plain random split instead of aborting the run.
        print(f"WARNING: Could not stratify the split by student_id ({split_error}). Falling back to an unstratified random split.")
        train_df, val_df = train_test_split(df_model_data, test_size=0.2, random_state=42)

    X_train, y_train = train_df[features], train_df[target]
    X_val, y_val = val_df[features], val_df[target]

    print(f"Training on {len(X_train):,} interactions with {len(features)} features.")

    # Create the output directory up front so a missing folder is discovered
    # before training rather than when the model is dumped afterwards.
    os.makedirs(MODELS_DIR, exist_ok=True)

    lgbm_predictor = lgb.LGBMClassifier(objective='binary', metric='auc', random_state=42)
    # Early stopping monitors the validation set passed via eval_set.
    lgbm_predictor.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(10, verbose=False)])

    # Column 1 of predict_proba is the predicted probability of the positive class.
    auc = roc_auc_score(y_val, lgbm_predictor.predict_proba(X_val)[:, 1])
    print(f"\nEnriched LGBM Model AUC on validation set: {auc:.4f}")

    lgbm_model_path = os.path.join(MODELS_DIR, 'lgbm_psych_predictor_enriched.joblib')
    joblib.dump(lgbm_predictor, lgbm_model_path)
    print(f"Enriched LGBM 'Tactician' model saved to: {lgbm_model_path}")

    print("\n--- Feature Importance for Enriched Q&A Tactician ---")
    if hasattr(lgbm_predictor, 'feature_importances_'):
        lgb.plot_importance(lgbm_predictor, figsize=(10, 8), max_num_features=20, importance_type='gain', title='Top 20 Feature Importances')
        plt.tight_layout()
        plt.show()
else:
    print("Modeling data not available. Skipping LGBM training.")

In [None]:
# Final status summary for the notebook.
# The end-to-end showcase logic is intentionally not included here: it depends
# on many variables being present in the notebook's state (e.g. `df_psych`,
# `skill_encoder`) and would require a full re-run to populate them. It will
# likely be moved to a dedicated 'inference.py' or 'app.py' script.
# The primary goal of this workflow is to train and save the models.
#
# `lgbm_model_path` is only bound when the training cell actually ran, so the
# success messages are printed only if the name exists and the file is on disk.
if 'lgbm_model_path' in globals() and os.path.exists(lgbm_model_path):
    print("Models have been trained and saved successfully.")
    print(f"LGBM predictor is at: {lgbm_model_path}")
    # NOTE(review): the BKT and skill encoder artifacts are not produced or
    # checked in the visible cells — confirm they exist in 'models/'.
    print("BKT and skill encoder models are in the 'models/' directory.")
    print("The project is now ready for inference or deployment.")
else:
    print("LGBM training was skipped or the saved model file is missing; no trained predictor is available from this run.")