## Imports and Initial Configurations

### Imports

In [None]:
# Make sure to install requirements.txt

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from multiprocessing import Pool, cpu_count
from functools import partial
import gc
import psutil
import os
import time
import json
import threading
from pathlib import Path
from typing import List, Dict, Optional, Tuple, Set
from dataclasses import dataclass
import warnings
import ast
from IPython.display import display
from joblib import Parallel, delayed
from tqdm import tqdm
from collections import Counter
warnings.filterwarnings('ignore')

### Configurations

In [None]:
# Plot appearance: seaborn dark-grid theme, automatic layout, bold axes titles
plt.style.use('seaborn-v0_8-darkgrid')
plt.rcParams["figure.autolayout"] = True
plt.rcParams["axes.titleweight"] = "bold"
sns.set_palette("husl")

# pandas display: show every column and render floats with four decimals
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.4f}'.format)

## Data Preparation

### Process Configuration Classes

In [None]:
@dataclass
class AnalysisConfig:
    """Apriori analysis parameters.

    Values are range-checked on construction so that a bad threshold fails
    here with a clear message instead of deep inside the mining step.

    Raises:
        ValueError: if any field is outside its valid range.
    """
    # Minimum itemset support, a fraction in (0, 1]
    min_support: float = 0.05
    # Minimum rule confidence, a fraction in [0, 1]
    min_confidence: float = 0.6
    # Minimum rule lift (lift is never negative)
    min_lift: float = 1.0
    # Number of top rules to keep/report; must be at least 1
    top_n_rules: int = 15

    def __post_init__(self) -> None:
        if not 0 < self.min_support <= 1:
            raise ValueError(
                f"min_support must be in (0, 1], got {self.min_support!r}"
            )
        if not 0 <= self.min_confidence <= 1:
            raise ValueError(
                f"min_confidence must be in [0, 1], got {self.min_confidence!r}"
            )
        if self.min_lift < 0:
            raise ValueError(
                f"min_lift must be non-negative, got {self.min_lift!r}"
            )
        if self.top_n_rules < 1:
            raise ValueError(
                f"top_n_rules must be at least 1, got {self.top_n_rules!r}"
            )

@dataclass
class ProcessingConfig:
    """Settings that control how the data is processed."""
    n_workers: Optional[int] = None
    max_memory_mb: int = 6144
    chunk_size_multiplier: int = 4
    base_chunk_size: int = 200_000
    sequential_mode: bool = False
    # File names skipped when loading CSVs (this one is reserved for modeling)
    excluded_files: tuple = ('evaluation_unit_test_scores.csv',)

    def get_chunk_size(self) -> int:
        """Return ``base_chunk_size`` divided by the multiplier, floored at 10,000.

        A multiplier below 1 is treated as 1, so the divisor is never zero.
        """
        divisor = self.chunk_size_multiplier if self.chunk_size_multiplier > 1 else 1
        scaled = self.base_chunk_size // divisor
        return scaled if scaled > 10_000 else 10_000

# Build the two configuration objects used by the rest of the notebook
print("Initializing process configurations...")
processing_config = ProcessingConfig(max_memory_mb=6144, n_workers=4)
analysis_config = AnalysisConfig()
print("Configurations initialized.")

### Utility Functions

In [None]:
def get_memory_usage_mb() -> float:
    """Return the resident set size (RSS) of the current process in MiB."""
    current_process = psutil.Process(os.getpid())
    rss_bytes = current_process.memory_info().rss
    return rss_bytes / 1024 / 1024

def log_memory(context: str = "") -> float:
    """Print the current process memory usage and return it.

    Args:
        context: Free-text label inserted into the printed line.

    Returns:
        The memory usage in MB, as reported by ``get_memory_usage_mb``.
    """
    usage_mb = get_memory_usage_mb()
    message = f"Memory {context}: {usage_mb:.2f} MB"
    print(message)
    return usage_mb

# Directory where cleaned dataframes are cached as parquet files
dataframes_saves = "../dataset/dataframes"
def save_dataframe(name: str, dataframe: pd.DataFrame):
    """Write ``dataframe`` to ``<dataframes_saves>/<name>.parquet``.

    Any extension already present on ``name`` is replaced by ``.parquet``.
    The target directory is created if it does not exist, and the index is
    not written.
    """
    os.makedirs(dataframes_saves, exist_ok=True)
    stem, _ = os.path.splitext(name)
    target = os.path.join(dataframes_saves, stem + '.parquet')
    dataframe.to_parquet(target, index=False, engine='pyarrow')

def load_dataframe(name: str) -> pd.DataFrame:
    """Read ``<dataframes_saves>/<name>.parquet`` and return it as a DataFrame.

    Any extension already present on ``name`` is replaced by ``.parquet``.
    """
    stem, _ = os.path.splitext(name)
    source = os.path.join(dataframes_saves, stem + '.parquet')
    return pd.read_parquet(source, engine='pyarrow')


### Dataset Files Selection

In [None]:
DATA_FOLDER = Path('../dataset')
DATA_FOLDER.mkdir(exist_ok=True)

# Every CSV in the data folder, in sorted order
all_csv_files = sorted(DATA_FOLDER.glob('*.csv'))
print(f"\nData folder: {DATA_FOLDER.absolute()}")
print(f"Found {len(all_csv_files)} CSV files:\n")

# Build the included list separately: removing items from a list while
# iterating over it skips the element that follows each removal.
csv_files = []
for f in all_csv_files:
    size_mb = f.stat().st_size / 1024 / 1024
    excluded = f.name in processing_config.excluded_files
    status = "EXCLUDED" if excluded else "INCLUDED"
    print(f"  {status} {f.name:<45} {size_mb:>10.2f} MB")
    if not excluded:
        csv_files.append(f)

### Loading and Initial Exploration of CSV files as DataFrames

#### Initialize DataFrames

In [None]:
# One DataFrame per selected CSV, keyed by the file name without its extension
data_frames = dict(
    (csv_path.stem, pd.read_csv(csv_path)) for csv_path in csv_files
)

#### Initial Exploration

In [None]:
def df_exploration(key: str, df: pd.DataFrame, head_n: int = 10):
    """Print and display a quick profile of ``df``.

    Shows a header with the shape, a per-column report (dtype, missing count
    and percentage, unique count), the number of duplicated rows and the
    first ``head_n`` rows.

    Args:
        key: Label printed in the header.
        df: DataFrame to profile.
        head_n: Number of leading rows to display.
    """
    banner = "=" * 40
    print(banner)
    print(f"{key} (shape: {df.shape})")
    print(banner)

    total_rows = len(df)

    def summarize(column_name):
        """Build the report row for a single column."""
        values = df[column_name]
        n_missing = values.isna().sum()

        # Unhashable cells (e.g. lists) make nunique raise; compare as text instead
        try:
            n_unique = values.nunique(dropna=True)
        except TypeError:
            n_unique = values.astype(str).nunique(dropna=True)

        if total_rows:
            missing_pct = f"{(n_missing / total_rows * 100):.2f}%"
        else:
            missing_pct = "0.00%"

        return {
            "Column": column_name,
            "Type": str(values.dtype),
            "Missing": int(n_missing),
            "Missing%": missing_pct,
            "Unique": int(n_unique),
        }

    report = [summarize(column_name) for column_name in df.columns]

    print("Report:")
    display(pd.DataFrame(report))

    # Same fallback as above for frames holding unhashable cells
    try:
        dup_count = df.duplicated().sum()
    except TypeError:
        dup_count = df.astype(str).duplicated().sum()

    print(f"  Duplicates: {dup_count}")
    print("  Sample Data:")

    display(df.head(head_n))


# Profile every loaded dataframe
for key in data_frames:
    df = data_frames[key]
    df_exploration(key, df)

### Data Cleaning Strategies

#### 'action_logs' dataframe

In [None]:
# Event types retained for the analysis; rows with any other action are dropped
KEEP_ACTIONS = {"correct_response", "wrong_response", "answer_requested"}

# Each step rebinds data_frames["action_logs"] so the previous frame can be
# released before the next one is built.
is_kept_action = data_frames["action_logs"]["action"].isin(KEEP_ACTIONS)
data_frames["action_logs"] = data_frames["action_logs"][is_kept_action]
del is_kept_action

# Epoch seconds -> datetime; values that cannot be parsed become NaT
data_frames["action_logs"]["timestamp"] = pd.to_datetime(
    data_frames["action_logs"]["timestamp"], unit="s", errors="coerce"
)

# Remove columns that are not used; a missing column is ignored, not an error
unused_columns = ["max_attempts", "score_viewable", "continuous_score_viewable"]
data_frames["action_logs"] = data_frames["action_logs"].drop(
    columns=unused_columns, errors="ignore"
)

# Order events within each assignment log chronologically (sequence analysis)
sort_keys = ["assignment_log_id", "timestamp"]
data_frames["action_logs"] = data_frames["action_logs"].sort_values(sort_keys)

# Store the action as a categorical to reduce memory
data_frames["action_logs"]["action"] = (
    data_frames["action_logs"]["action"].astype("category")
)

# Profile the cleaned frame, persist it, then release it
df_exploration("action_logs", data_frames["action_logs"])
save_dataframe("action_logs", data_frames["action_logs"])
del data_frames["action_logs"]
gc.collect()

#### 'assignment_details' dataframe

In [None]:
# Convert time measurements
data_frames['assignment_details']["assignment_start_time"] = pd.to_datetime(
    data_frames['assignment_details']["assignment_start_time"], unit="s", errors="coerce"
)
data_frames['assignment_details']["assignment_end_time"] = pd.to_datetime(
    data_frames['assignment_details']["assignment_end_time"], unit="s", errors="coerce"
)
data_frames['assignment_details']["assignment_release_date"] = pd.to_datetime(
    data_frames['assignment_details']["assignment_release_date"], unit="s"
)
data_frames['assignment_details']["assignment_due_date"] = pd.to_datetime(
    data_frames['assignment_details']["assignment_due_date"], unit="s", errors="coerce"
)

df_exploration('assignment_details', data_frames['assignment_details'])
save_dataframe('assignment_details',data_frames['assignment_details'])
del data_frames['assignment_details']
gc.collect()

#### 'assignment_relationships' dataframe

In [None]:
# Remove exact duplicate rows, then profile, persist and release the frame
deduplicated = data_frames['assignment_relationships'].drop_duplicates()
data_frames['assignment_relationships'] = deduplicated
del deduplicated

df_exploration('assignment_relationships', data_frames['assignment_relationships'])
save_dataframe('assignment_relationships', data_frames['assignment_relationships'])
del data_frames['assignment_relationships']
gc.collect()

#### 'explanation_details' dataframe

In [None]:
# Convert BERT PCA vectors to embeddings
try:
    data_frames['explanation_details']['explanation_embedding'] = data_frames['explanation_details']['explanation_text_bert_pca'] \
    .apply(lambda x: np.array(ast.literal_eval(x), dtype="float64"))

    data_frames['explanation_details'].drop(columns=['explanation_text_bert_pca'])
except:
    pass

df_exploration('explanation_details', data_frames['explanation_details'])
save_dataframe('explanation_details',data_frames['explanation_details'])
del data_frames['explanation_details']
gc.collect()

#### 'hint_details' dataframe

In [None]:
# Convert BERT PCA vectors to embeddings
try:
    data_frames['hint_details']['hint_embedding'] = data_frames['hint_details']['hint_text_bert_pca'] \
    .apply(lambda x: np.array(ast.literal_eval(x), dtype="float64"))

    data_frames['hint_details'].drop(columns=['hint_text_bert_pca'])
except:
    pass

df_exploration('hint_details', data_frames['hint_details'])
save_dataframe('hint_details',data_frames['hint_details'])
del data_frames['hint_details']
gc.collect()

#### 'problem_details' dataframe

In [None]:
# Flag columns: a missing value is treated as 0, then stored as int8
bool_cols = [
    "problem_contains_image",
    "problem_contains_equation",
    "problem_contains_video",
]
data_frames['problem_details'][bool_cols] = (
    data_frames['problem_details'][bool_cols].fillna(0).astype("int8")
)

# Skill columns: a missing value becomes the literal "Unknown" category.
# The round trip through object dtype lets fillna introduce the new label.
for skill_col in ("problem_skill_code", "problem_skill_description"):
    data_frames['problem_details'][skill_col] = (
        data_frames['problem_details'][skill_col]
        .astype("object")
        .fillna("Unknown")
        .astype("category")
    )

# Profile the cleaned frame, persist it, then release it
df_exploration('problem_details', data_frames['problem_details'])
save_dataframe('problem_details', data_frames['problem_details'])
del data_frames['problem_details']
gc.collect()

#### 'sequence_details' dataframe

In [None]:
def _parse_problem_ids(raw):
    """Split a bracketed, comma-separated string into a list of trimmed ids.

    Non-string input (e.g. NaN) yields an empty list. The first and last
    characters of the trimmed string are discarded as the enclosing brackets.
    """
    if not isinstance(raw, str):
        return []
    inner = raw.strip()[1:-1]
    return [piece.strip() for piece in inner.split(",") if piece.strip()]


data_frames["sequence_details"]["sequence_problem_ids"] = (
    data_frames["sequence_details"]["sequence_problem_ids"].apply(_parse_problem_ids)
)

# Folder-path levels: a missing level becomes the literal "NONE" category so
# that the columns can be used for grouping
folder_path_columns = [
    name for name in data_frames['sequence_details'].columns
    if name.startswith("sequence_folder_path")
]
for col in folder_path_columns:
    data_frames['sequence_details'][col] = (
        data_frames['sequence_details'][col]
        .astype('object')
        .fillna("NONE")
        .astype('category')
    )

# Profile the cleaned frame, persist it, then release it
df_exploration('sequence_details', data_frames['sequence_details'])
save_dataframe('sequence_details', data_frames['sequence_details'])
del data_frames['sequence_details']
gc.collect()

#### 'sequence_relationships' dataframe

In [None]:
# Remove exact duplicate rows, then profile, persist and release the frame
deduplicated = data_frames['sequence_relationships'].drop_duplicates()
data_frames['sequence_relationships'] = deduplicated
del deduplicated

df_exploration('sequence_relationships', data_frames['sequence_relationships'])
save_dataframe('sequence_relationships', data_frames['sequence_relationships'])
del data_frames['sequence_relationships']
gc.collect()

### EDA and Visualization

#### Action Logs

## Analysis

### Analysis 1: Student-Problem Behavior Co-occurrence

#### DataFrames Merging

In [None]:
# Reload the cleaned frames needed for this analysis
action_logs = load_dataframe('action_logs')
assignment_details = load_dataframe('assignment_details')
problem_details = load_dataframe('problem_details')

# Attach assignment-level attributes to every logged action
student_problems = action_logs.merge(
    assignment_details, how='left', on='assignment_log_id'
)
del assignment_details, action_logs
gc.collect()

# Keep only rows tied to a problem, ordered by time within each
# (assignment log, problem) pair
student_problems = student_problems.dropna(subset=['problem_id'])
student_problems = student_problems.sort_values(
    ['assignment_log_id', 'problem_id', 'timestamp']
)

# Response time = seconds since the previous event of the same
# (assignment log, problem) pair; the first event of each pair gets NaN
student_problems['response_time'] = (
    student_problems
    .groupby(['assignment_log_id', 'problem_id'])['timestamp']
    .diff()
    .dt.total_seconds()
)

# Bucket the response time; rows without one (NaN) are labelled "initial"
bins = [-1, 10, 30, 60, 180, float("inf")]
labels = ["very_fast", "fast", "moderate", "slow", "very_slow"]
response_time_bucket = pd.cut(
    student_problems["response_time"], bins=bins, labels=labels
)
student_problems["response_time_cat"] = (
    response_time_bucket.astype("object").fillna("initial")
)
del response_time_bucket

# Attach the problem type and skill code
problem_columns = ['problem_id', 'problem_type', 'problem_skill_code']
student_problems = student_problems.merge(
    problem_details[problem_columns], how='left', on='problem_id'
)
print(f"Processed {len(student_problems):,} student-problem interactions")
df_exploration('student_problems', student_problems)
del problem_details
gc.collect()

#### Transaction Encoding: Assignment-Problem Combinations

In [None]:
def process_group(args):
    """Turn one (assignment log, problem) group into a list of transaction items.

    Args:
        args: A ``((assignment_log_id, problem_id), group)`` pair as yielded
            by iterating a pandas groupby; ``group`` is the group's DataFrame.

    Returns:
        A list of item strings, or ``None`` when the group yields no items.
        Items are, in order: ``action_<x>`` for each distinct action,
        ``timing_<x>`` for each distinct response-time category other than
        ``initial``, ``type_<x>`` and ``tutoring_<x>`` built from the first
        non-null value of the respective column, then ``used_hint`` and
        ``used_explanation`` when any row has a hint / explanation id.
    """
    _, group = args
    transaction = []

    # Distinct action types seen in the group
    transaction.extend(f"action_{act}" for act in group['action'].unique())

    # Distinct response-time categories; "initial" only marks a first event
    transaction.extend(
        f"timing_{cat}"
        for cat in group['response_time_cat'].unique()
        if cat != 'initial'
    )

    # Problem type and tutoring availability: use the first NON-NULL value.
    # Reading .iloc[0] directly could pick up a NaN even though later rows
    # hold a value, producing bogus "type_nan" / "tutoring_nan" items.
    for column, prefix in (
        ('problem_type', 'type'),
        ('available_core_tutoring', 'tutoring'),
    ):
        present = group[column].dropna()
        if not present.empty:
            transaction.append(f"{prefix}_{present.iloc[0]}")

    # Hint / explanation usage flags
    if group['hint_id'].notna().any():
        transaction.append("used_hint")
    if group['explanation_id'].notna().any():
        transaction.append("used_explanation")

    return transaction or None

# Build one transaction per (assignment log, problem) group, in batches so
# that only a bounded number of group frames is held at once
print("Creating transactions in memory-efficient batches...")

groupby_obj = student_problems.groupby(['assignment_log_id', 'problem_id'])
total_groups = len(groupby_obj)
print(f"Total groups to process: {total_groups:,}")

batch_size = 50000  # number of groups handed to the workers per round
print(f"Processing with batch size {batch_size:,}")

transactions_behaviors = []
group_iterator = iter(groupby_obj)

with tqdm(total=total_groups, desc="Processing transactions") as pbar:
    iterator_exhausted = False
    while not iterator_exhausted:
        # Pull up to batch_size groups from the iterator
        batch_groups = []
        while len(batch_groups) < batch_size:
            try:
                batch_groups.append(next(group_iterator))
            except StopIteration:
                iterator_exhausted = True
                break

        # Nothing left to process
        if not batch_groups:
            break

        # Convert the batch in parallel with joblib
        batch_results = Parallel(n_jobs=2, verbose=0, batch_size=1000)(
            delayed(process_group)(group) for group in batch_groups
        )

        # Groups that produced no items come back as None and are skipped
        batch_results = [result for result in batch_results if result is not None]
        transactions_behaviors.extend(batch_results)

        pbar.update(len(batch_groups))

        # Release the batch before pulling the next one
        del batch_groups, batch_results
        gc.collect()

print(f"✓ Created {len(transactions_behaviors):,} behavior transactions")

# The merged frame is no longer needed once the transactions exist
del student_problems, groupby_obj, group_iterator
gc.collect()

# One-hot encode the transactions into a boolean matrix
# (rows = transactions, columns = items)
print("Encoding transactions in batches...")

# Pass 1: count every item so columns can be ordered by frequency
print("  Pass 1: Collecting unique items...")
all_items_counter = Counter()
for transaction in tqdm(transactions_behaviors, desc="Scanning items"):
    all_items_counter.update(transaction)

# Column order: most frequent item first
all_items = [item for item, _ in all_items_counter.most_common()]
print(f"  Found {len(all_items)} unique items")

print("\nTop 15 Most Frequent Items:")
for item, count in all_items_counter.most_common(15):
    print(f"  {item}: {count:,}")

del all_items_counter
gc.collect()

# Item -> column index
item_to_idx = {item: idx for idx, item in enumerate(all_items)}

# Pass 2: fill a single preallocated matrix.
# Collecting per-batch arrays and np.vstack-ing them briefly needs two full
# copies of the matrix and raises when there are no transactions at all;
# writing straight into the final array avoids both problems.
print("  Pass 2: Encoding transactions...")
encode_batch_size = 100000
n_transactions = len(transactions_behaviors)
te_behaviors_array = np.zeros((n_transactions, len(all_items)), dtype=bool)

for start_idx in tqdm(range(0, n_transactions, encode_batch_size),
                      desc="Encoding batches"):
    end_idx = min(start_idx + encode_batch_size, n_transactions)
    batch = transactions_behaviors[start_idx:end_idx]

    # Row index is global: the batch offset is carried by enumerate's start
    for row, transaction in enumerate(batch, start=start_idx):
        for item in transaction:
            te_behaviors_array[row, item_to_idx[item]] = True

    del batch

# The raw transactions are no longer needed once encoded
del transactions_behaviors
gc.collect()

# Wrap the matrix in a DataFrame with the item names as columns
print("Creating DataFrame...")
df_behaviors = pd.DataFrame(te_behaviors_array, columns=all_items)
del te_behaviors_array, all_items, item_to_idx
gc.collect()

print(f"Transaction matrix shape: {df_behaviors.shape}")
print(f"Unique items: {len(df_behaviors.columns)}")

# Item frequencies, recomputed from the encoded matrix
print("Calculating item frequencies...")
item_freq = df_behaviors.sum().sort_values(ascending=False)
print("\nTop 15 Most Frequent Items:")
print(item_freq.head(15))

gc.collect()

In [None]:
# Mine frequent itemsets and association rules from the behavior matrix.
# Support and confidence thresholds come from analysis_config, so changing
# the configuration actually affects this cell (the defaults equal the
# previously hard-coded 0.05 and 0.6).
print("Running Apriori...")
frequent_itemsets_behaviors = apriori(
    df_behaviors,
    min_support=analysis_config.min_support,
    use_colnames=True,
)
print(f"✓ Found {len(frequent_itemsets_behaviors)} frequent itemsets")

# Generate association rules
rules_behaviors = association_rules(
    frequent_itemsets_behaviors,
    metric="confidence",
    min_threshold=analysis_config.min_confidence,
)
# NOTE(review): AnalysisConfig.min_lift defaults to 1.0 but this cell has
# always filtered at 1.5 - confirm which value is intended before unifying.
rules_behaviors = rules_behaviors[rules_behaviors['lift'] >= 1.5]
rules_behaviors = rules_behaviors.sort_values('lift', ascending=False)

print(f"✓ Generated {len(rules_behaviors)} association rules")
if rules_behaviors.empty:
    # min()/max() of an empty column are NaN; say so instead of "nan - nan"
    print("✓ Lift range: n/a (no rules passed the thresholds)")
else:
    print(f"✓ Lift range: {rules_behaviors['lift'].min():.2f} - {rules_behaviors['lift'].max():.2f}")

# Display top rules
print("\nTop 10 Rules by Lift:\n")
for idx, row in rules_behaviors.head(10).iterrows():
    antecedents = ', '.join(list(row['antecedents']))
    consequents = ', '.join(list(row['consequents']))
    print(f"{antecedents} => {consequents}")
    print(f"  Support: {row['support']:.3f}, Confidence: {row['confidence']:.3f}, Lift: {row['lift']:.3f}\n")