## Configurations and Utility Functions

### Imports

In [1]:
# Make sure to install requirements.txt

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.frequent_patterns import apriori, association_rules
import gc
import re
import psutil
import os
from pathlib import Path
from dataclasses import dataclass
import warnings
from IPython.display import display
from mlxtend.preprocessing import TransactionEncoder
import glob
from collections import Counter
warnings.filterwarnings('ignore')

### Visualization Configs

In [2]:
plt.style.use('seaborn-v0_8-darkgrid')
plt.rcParams.update({
    "figure.autolayout": True,
    "axes.titleweight": "bold"
})
sns.set_palette("husl")
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: f'{x:.4f}')

### Process Configuration Classes

In [13]:
@dataclass
class AnalysisConfig:
    """Apriori analysis parameters"""
    min_support: float = 0.05
    min_confidence: float = 0.6
    min_lift: float = 1.0
    top_n_rules: int = 10

@dataclass
class ProcessingConfig:
    """Data processing configuration"""
    dataset_folder: Path = Path('../dataset')
    preprocessed_df_dir: Path = Path("../dataset/dataframes")
    analysis_df_dir: Path = Path('../dataset/tempdf')
    results_raw_dir: Path = Path('../results/raw')
    results_vis_dir: Path = Path('../results/vis')
    include_files: tuple = (
        'action_logs.csv', 
        'problem_details.csv',
        'training_unit_test_scores.csv',
        'assignment_relationships.csv'
    )

# Initialize
print("Initializing process configurations...")
analysis_config = AnalysisConfig()
processing_config = ProcessingConfig()
print("Configurations initialized.")

Initializing process configurations...
Configurations initialized.


### Utility Functions

In [4]:
def get_memory_usage_mb() -> float:
    return psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024

def log_memory(context: str = "") -> float:
    mem = get_memory_usage_mb()
    print(f"Memory {context}: {mem:.2f} MB")
    return mem

def save_dataframe(name: str, dataframe: pd.DataFrame):
    os.makedirs(ProcessingConfig.preprocessed_df_dir, exist_ok=True)
    # Remove any existing extension and add .parquet
    path = ProcessingConfig.preprocessed_df_dir / f'{name}.parquet'
    dataframe.to_parquet(path, index=False, engine='pyarrow')

def load_dataframe(name: str) -> pd.DataFrame:
    # Remove any existing extension and add .parquet
    path = ProcessingConfig.preprocessed_df_dir / f'{name}.parquet'
    return pd.read_parquet(path, engine='pyarrow')

def optimize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Optimize DataFrame memory usage by downcasting numeric types"""
    initial_memory = df.memory_usage(deep=True).sum() / 1024**2
    df = df.drop_duplicates()
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            # Integers
            if str(col_type).startswith('int'):
                c_min = df[col].min()
                c_max = df[col].max()
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
            
            # Floats
            elif str(col_type).startswith('float'):
                c_min = df[col].min()
                c_max = df[col].max()
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
    
    final_memory = df.memory_usage(deep=True).sum() / 1024**2
    print(f"  Memory reduced: {initial_memory:.2f} MB → {final_memory:.2f} MB "
          f"({100 * (initial_memory - final_memory) / initial_memory:.1f}% reduction)")
    
    return df

def df_exploration(key: str, df: pd.DataFrame, head_n: int = 10):
    print("=" * 40)
    print(f"{key} (shape: {df.shape})")
    print("=" * 40)

    report = []
    n_rows = len(df)

    for col in df.columns:
        series = df[col]
        # Missing values
        missing = series.isna().sum()
        # Unique values (safe for lists)
        try:
            unique = series.nunique(dropna=True)
        except TypeError:
            unique = series.astype(str).nunique(dropna=True)

        report.append({
            "Column": col,
            "Type": str(series.dtype),
            "Missing": int(missing),
            "Missing%": f"{(missing / n_rows * 100):.2f}%" if n_rows else "0.00%",
            "Unique": int(unique),
        })

    print("Report:")
    display(pd.DataFrame(report))

    try:
        dup_count = df.duplicated().sum()
    except TypeError:
        dup_count = df.astype(str).duplicated().sum()

    print(f"  Duplicates: {dup_count}")
    print(f"  Sample Data:")
    # Head display (safe for large & complex columns)
    display(df.head(head_n))

def col_exploration(column):
    return column.unique()

def get_resume_state(temp_dir, chunk_size):
    files = glob.glob(f"{temp_dir}/problem_attempts_part_*.parquet")
    if not files:
        return 0, 0

    indices = [
        int(re.search(r"part_(\d+)\.parquet", f).group(1))
        for f in files
    ]

    last_chunk_idx = max(indices)
    resume_row = (last_chunk_idx + 1) * chunk_size

    print(f"Resuming from chunk {last_chunk_idx + 1}, row {resume_row:,}")
    return resume_row, last_chunk_idx + 1

def print_top_association_results(
    frequent_itemsets,
    rules,
    config: AnalysisConfig,
    sort_itemsets_by='support',
    sort_rules_by=('lift', 'confidence')
):
    """
    Print top-N frequent itemsets and association rules in readable format.
    """

    print("\n" + "=" * 60)
    print(f"TOP {config.top_n_rules} FREQUENT ITEMSETS")
    print("=" * 60)

    top_itemsets = (
        frequent_itemsets
        .sort_values(sort_itemsets_by, ascending=False)
        .head(config.top_n_rules)
    )

    for _, row in top_itemsets.iterrows():
        items = ", ".join(sorted(row['itemsets']))
        print(f"{items}  (support={row['support']:.3f})")
    print(f"From {len(frequent_itemsets)} to {len(frequent_itemsets.sort_values(sort_itemsets_by, ascending=False))} items with support > {config.min_support}")
    print("\n" + "=" * 60)
    print(f"TOP {config.top_n_rules} ASSOCIATION RULES")
    print("=" * 60)

    top_rules = (
        rules
        .sort_values(list(sort_rules_by), ascending=False)
        .head(config.top_n_rules)
    )

    for _, row in top_rules.iterrows():
        print(
            f"{row['antecedents']} -> {row['consequents']}  "
            f"(support={row['support']:.3f}, "
            f"confidence={row['confidence']:.3f}, "
            f"lift={row['lift']:.3f})"
        )


## Exploration

### Dataset Files Selection

In [6]:
csv_files = sorted(ProcessingConfig.dataset_folder.glob('*.csv'))
print(f"\nData folder: {ProcessingConfig.dataset_folder.absolute()}")
print(f"Found {len(csv_files)} CSV files:\nOnly included {len(processing_config.include_files)} files.")

csv_files = [csv_file for csv_file in csv_files if csv_file.name in processing_config.include_files]
for f in csv_files:
    size_mb = f.stat().st_size / 1024 / 1024
    included = f.name in processing_config.include_files
    print(f"  {f.name:<45} {size_mb:>10.2f} MB")


Data folder: /mnt/41A664F31125B500/Personal/Academics/4th_Year/1st_Sem/CSC172_Data_Mining_and_Analysis/CSC172-AssociationMining-Bautista/notebooks/../dataset
Found 10 CSV files:
Only included 4 files.
  action_logs.csv                                  1371.44 MB
  assignment_relationships.csv                       14.25 MB
  problem_details.csv                                58.98 MB
  training_unit_test_scores.csv                      10.03 MB


### Loading and Initial Exploration of CSV files as DataFrames

#### Initialize DataFrames

In [7]:
data_frames = {f_name.stem: pd.read_csv(f_name) for f_name in csv_files}

#### Initial Exploration

In [7]:
for key, df in data_frames.items():
    df_exploration(key, df)
    for col in df.columns:
        print(f"Column `{col}`: {col_exploration(df[col])}")

action_logs (shape: (23932276, 10))
Report:


Unnamed: 0,Column,Type,Missing,Missing%,Unique
0,assignment_log_id,object,0,0.00%,638528
1,timestamp,float64,0,0.00%,23908418
2,problem_id,object,6136715,25.64%,57360
3,max_attempts,float64,18686416,78.08%,2
4,available_core_tutoring,object,18686416,78.08%,4
5,score_viewable,float64,18686416,78.08%,2
6,continuous_score_viewable,float64,18686416,78.08%,2
7,action,object,0,0.00%,14
8,hint_id,object,23858933,99.69%,9125
9,explanation_id,object,23911140,99.91%,4132


  Duplicates: 0
  Sample Data:


Unnamed: 0,assignment_log_id,timestamp,problem_id,max_attempts,available_core_tutoring,score_viewable,continuous_score_viewable,action,hint_id,explanation_id
0,2QV1F2GSBZ,1599150988.995,,,,,,assignment_started,,
1,2QV1F2GSBZ,1599150990.935,I2GX4OQIE,3.0,answer,1.0,1.0,problem_started,,
2,2QV1F2GSBZ,1599151065.758,I2GX4OQIE,,,,,wrong_response,,
3,2QV1F2GSBZ,1599151090.746,I2GX4OQIE,,,,,wrong_response,,
4,2QV1F2GSBZ,1599151096.323,I2GX4OQIE,,,,,answer_requested,,
5,2QV1F2GSBZ,1599151114.918,I2GX4OQIE,,,,,correct_response,,
6,2QV1F2GSBZ,1599151114.928,I2GX4OQIE,,,,,problem_finished,,
7,2QV1F2GSBZ,1599151116.212,,,,,,continue_selected,,
8,2QV1F2GSBZ,1599151116.407,HCTP9BOV,3.0,answer,1.0,1.0,problem_started,,
9,2QV1F2GSBZ,1599151121.866,HCTP9BOV,,,,,answer_requested,,


Column `assignment_log_id`: ['2QV1F2GSBZ' 'W4UD30NIA' '2DJ8MR8M7U' ... '249NA55LN0' '1E2UZDBLNE'
 '1VVEB3EAGF']
Column `timestamp`: [1.59915099e+09 1.59915099e+09 1.59915107e+09 ... 1.63491912e+09
 1.63491912e+09 1.63491912e+09]
Column `problem_id`: [nan 'I2GX4OQIE' 'HCTP9BOV' ... '1MIH1CDBYH' '2C59YVROJA' '58J703EA0']
Column `max_attempts`: [nan  3.  1.]
Column `available_core_tutoring`: [nan 'answer' 'no_tutoring' 'explanation' 'hint']
Column `score_viewable`: [nan  1.  0.]
Column `continuous_score_viewable`: [nan  1.  0.]
Column `action`: ['assignment_started' 'problem_started' 'wrong_response'
 'answer_requested' 'correct_response' 'problem_finished'
 'continue_selected' 'open_response' 'assignment_finished'
 'assignment_resumed' 'explanation_requested'
 'skill_related_video_requested' 'hint_requested' 'live_tutor_requested']
Column `hint_id`: [nan 'OEM5SD5F2' '4O45D3DCP' ... '1E23T5SSF6' 'GRAMRQ1B6' '3WLC5CBKC']
Column `explanation_id`: [nan '1RNFIMNB5O' '1L0I9NP35E' ... '20VH7GDZ

Unnamed: 0,Column,Type,Missing,Missing%,Unique
0,unit_test_assignment_log_id,object,0,0.00%,56577
1,in_unit_assignment_log_id,object,0,0.00%,638528


  Duplicates: 3048
  Sample Data:


Unnamed: 0,unit_test_assignment_log_id,in_unit_assignment_log_id
0,7FGC8P0F1,V6YXT3UG
1,15KQFID5U5,1TFFYMT814
2,QKDRPCXSG,1N2IFGUASM
3,1JOJIQXU1B,15W4ET3W62
4,2C9YZRVZT0,1WORTY787C
5,38M6IA4SS,2DQG3SWWLS
6,15XW17EHLW,Y3G0XTLMF
7,2C5IG7FC12,1HLYER60XW
8,F9OJCBCRM,1XB8H1OIF8
9,2OJ73SYFF6,1XB8H1OIF8


Column `unit_test_assignment_log_id`: ['7FGC8P0F1' '15KQFID5U5' 'QKDRPCXSG' ... '2PNRH0FF5C' '17EUNRKIC4'
 '28TD16LQU8']
Column `in_unit_assignment_log_id`: ['V6YXT3UG' '1TFFYMT814' '1N2IFGUASM' ... '251US25W27' '1QT606YWQ4'
 '35N2V2RP7']
problem_details (shape: (132738, 10))
Report:


Unnamed: 0,Column,Type,Missing,Missing%,Unique
0,problem_id,object,0,0.00%,132738
1,problem_multipart_id,object,0,0.00%,70108
2,problem_multipart_position,int64,0,0.00%,55
3,problem_type,object,0,0.00%,10
4,problem_skill_code,object,820,0.62%,541
5,problem_skill_description,object,820,0.62%,539
6,problem_contains_image,float64,5,0.00%,2
7,problem_contains_equation,float64,5,0.00%,2
8,problem_contains_video,float64,5,0.00%,2
9,problem_text_bert_pca,object,0,0.00%,85042


  Duplicates: 0
  Sample Data:


Unnamed: 0,problem_id,problem_multipart_id,problem_multipart_position,problem_type,problem_skill_code,problem_skill_description,problem_contains_image,problem_contains_equation,problem_contains_video,problem_text_bert_pca
0,10MFND3HAJ,2MHCTW1IIN,1,Multiple Choice,6.RP.A.3b,Unit Rate,0.0,0.0,1.0,"[0.53955209,-0.96322744,0.49725574,6.28795392,..."
1,IH3MOE7AF,1UEQMXOOFA,1,Multiple Choice,6.RP.A.3b,Unit Rate,0.0,0.0,0.0,"[-1.61147666,-1.50911536,0.52055446,6.01118343..."
2,14YC7CEE2N,1UEQMXOOFA,2,Ungraded Open Response,6.RP.A.3b,Unit Rate,0.0,0.0,0.0,"[-8.95361845,5.26005410,-4.41350451,-2.6751771..."
3,16L5KQWLN7,1W7DRPNEJL,1,Ungraded Open Response,6.RP.A.3b,Unit Rate,0.0,0.0,0.0,"[-2.89295465,1.73222701,-0.21075635,0.16314057..."
4,BU0LO0LDD,1Z6MGLD8VK,1,Ungraded Open Response,6.RP.A.3b,Unit Rate,0.0,0.0,0.0,"[-1.53959700,1.35386494,-1.56874727,0.89545312..."
5,W9WPQSAU5,MBYKGWG5L,1,Ungraded Open Response,6.RP.A.3b,Unit Rate,0.0,0.0,0.0,"[-3.20997122,0.64484637,-0.57017812,-0.4925776..."
6,2OHCH5C5BD,O0EI8SMXR,1,Number,6.RP.A.2,Expressing Unit Rate in Words,0.0,0.0,0.0,"[-1.47761510,-1.33452493,-0.32730713,0.4147120..."
7,9CB1OILA2,A1DWWVVLC,1,Ungraded Open Response,6.RP.A.3a,Making Equivalent Ratio Tables,0.0,0.0,0.0,"[-3.49009235,-4.33279096,1.77473598,-0.3124369..."
8,1JCPX2ZOXQ,K65VD17P2,1,Ungraded Open Response,6.RP.A.3a,Making Equivalent Ratio Tables,0.0,0.0,0.0,"[-1.43699869,-4.19686441,-0.77936048,4.2911283..."
9,AANYMYPL6,1K9KSMZ0FV,1,Multiple Choice,6.RP.A.2,Expressing Unit Rate in Words,0.0,0.0,0.0,"[-1.70709674,-2.89674901,-0.17309755,5.5549082..."


Column `problem_id`: ['10MFND3HAJ' 'IH3MOE7AF' '14YC7CEE2N' ... '16PQQU7TL4' 'HIGDEU75J'
 '1A23RM90NL']
Column `problem_multipart_id`: ['2MHCTW1IIN' '1UEQMXOOFA' '1W7DRPNEJL' ... '1Q6T5XK3TJ' '1ZHXWZ0J1S'
 'NQQHNDY3S']
Column `problem_multipart_position`: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 16 15 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
 49 50 51 52 53 54 55]
Column `problem_type`: ['Multiple Choice' 'Ungraded Open Response' 'Number'
 'Algebraic Expression' 'Numeric Expression' 'Check All That Apply'
 'Exact Match (ignore case)' 'Exact Fraction'
 'Exact Match (case sensitive)' 'Ordering']
Column `problem_skill_code`: ['6.RP.A.3b' '6.RP.A.2' '6.RP.A.3a' '6.RP.A.1' '7.RP.A.2a' '8.EE.A.1-1'
 '8.EE.A.1-2' '8.EE.A.1-3' '8.EE.A.3' '8.EE.A.4' '7.RP.A.2b' '7.RP.A.2c'
 '6.EE.C.9-2' '7.RP.A.2d' '7.RP.A.1' '6.NS.A.1' '6.NS.B.3-3' '7.RP.A.3'
 '5.NF.B.4a-2' '8.G.A.1b' '8.G.A.1a' '8.G.A.1c' '7.G.A.1' '5.G.A.2'
 '8.G.A.2' '7.NS.A.1a' '7.N

Unnamed: 0,Column,Type,Missing,Missing%,Unique
0,assignment_log_id,object,0,0.00%,42343
1,problem_id,object,0,0.00%,1835
2,score,int64,0,0.00%,2


  Duplicates: 0
  Sample Data:


Unnamed: 0,assignment_log_id,problem_id,score
0,1CEASUAUQJ,18J6436AS5,1
1,2IMKPEIL2Q,9RMI4CZU9,0
2,2IMKPEIL2Q,8F4U5WWTV,0
3,2IMKPEIL2Q,27D3I359NE,1
4,2IMKPEIL2Q,22DY4PFVMV,1
5,2IMKPEIL2Q,ZQMHFZJ53,1
6,2IMKPEIL2Q,1II2JVYEQV,0
7,2MZN9L748R,2N2SARA9Q6,1
8,2MZN9L748R,123M9UFYL2,0
9,2MZN9L748R,1WL078QSL4,0


Column `assignment_log_id`: ['1CEASUAUQJ' '2IMKPEIL2Q' '2MZN9L748R' ... 'DC3HIJPC9' '624996B53'
 '2OMX9AY8J6']
Column `problem_id`: ['18J6436AS5' '9RMI4CZU9' '8F4U5WWTV' ... '1VUW3WTLJE' 'SAI1LT0AQ'
 '1AJJUYE7LZ']
Column `score`: [1 0]


## Apriori Association Rule Mining

### Apriori Analysis Utility Functions

In [8]:
def eda_basket_summary(
    transactions_df: pd.DataFrame,
    plot_filename: str,
    output_dir: str,
    top_n: int = 5,
):
    """
    Perform EDA on transaction baskets (list-of-items format)
    and save item frequency plot in a folder.

    Parameters
    ----------
    transactions_df : pd.DataFrame
        Output of action_problem_transactions()
    top_n : int
        Number of top items to display
    output_dir : str
        Folder where plots will be saved
    plot_filename : str
        Name of the plot file
    """

    assert 'items' in transactions_df.columns, "`items` column not found"

    # --- Ensure output directory exists ---
    os.makedirs(output_dir, exist_ok=True)
    plot_path = os.path.join(output_dir, plot_filename)

    baskets = transactions_df['items']
    n_transactions = len(baskets)

    # --- Flatten items ---
    all_items = [item for basket in baskets for item in basket]

    item_counts = Counter(all_items)
    item_support_pct = {
        item: (count / n_transactions) * 100
        for item, count in item_counts.items()
    }

    item_support_pct = dict(
        sorted(item_support_pct.items(), key=lambda x: x[1], reverse=True)
    )

    # --- Basket statistics ---
    basket_sizes = baskets.apply(len)

    avg_basket_size = basket_sizes.mean()
    pct_1_to_3 = basket_sizes.between(1, 3).mean() * 100

    # --- Print summary ---
    print("\nEDA Progress")
    print(f"- Top {top_n} items:")
    for item, pct in list(item_support_pct.items())[:top_n]:
        print(f"  • {item} ({pct:.1f}%)")

    print(f"- Average basket size: {avg_basket_size:.1f} items")
    print(f"- {pct_1_to_3:.1f}% transactions contain 1–3 items")

    # --- Plot item frequency ---
    top_items = list(item_support_pct.items())[:20]
    labels, values = zip(*top_items)

    plt.figure(figsize=(10, 5))
    plt.bar(labels, values)
    plt.xticks(rotation=45, ha='right')
    plt.ylabel("Support (%)")
    plt.title("Item Frequency Distribution")
    plt.tight_layout()
    plt.savefig(plot_path)
    plt.close()

    print(f"\nPlot saved: {plot_path}")

    return {
        "item_support_pct": item_support_pct,
        "avg_basket_size": avg_basket_size,
        "pct_1_to_3": pct_1_to_3,
        "basket_sizes": basket_sizes
    }

def run_apriori(encoded_df):
    frequent_itemsets = apriori(
        encoded_df,
        min_support=analysis_config.min_support,
        use_colnames=True
    )

    frequent_itemsets = frequent_itemsets.sort_values(
        'support',
        ascending=False
    )

    rules = association_rules(
        frequent_itemsets,
        metric='confidence',
        min_threshold=analysis_config.min_confidence
    )
    rules = rules[rules['lift'] >= analysis_config.min_lift]

    rules = rules.sort_values(
        ['lift', 'confidence'],
        ascending=False
    )

    rules['antecedents'] = rules['antecedents'].apply(lambda x: ', '.join(list(x)))
    rules['consequents'] = rules['consequents'].apply(lambda x: ', '.join(list(x)))

    return frequent_itemsets, rules

### 1. Student-problem interactions (behavior co-occurrence)

#### Data Cleaning

##### 'action_logs' dataframe

In [8]:
print("Preprocessing `action_logs`")
DROP_COLS = [
    "max_attempts",
    "score_viewable",
    "continuous_score_viewable",
    "hint_id",
    "explanation_id"
]
print(f"    Dropping columns: {DROP_COLS}")
data_frames["action_logs"] = data_frames["action_logs"].drop(columns=DROP_COLS)

KEEP_ACTIONS = [
    "problem_started",
    "wrong_response",
    "correct_response",
    "hint_requested",
    "explanation_requested",
    "answer_requested",
    "problem_finished"
]
data_frames["action_logs"] = data_frames["action_logs"][data_frames["action_logs"]["action"].isin(KEEP_ACTIONS)]
print(f"    Removed rows with `action` not in: {KEEP_ACTIONS}")

# Sort for sequence analysis
data_frames["action_logs"] = data_frames["action_logs"].sort_values(
    ["assignment_log_id", "timestamp"]
)

print(f"    Optimizing dataframe...")
data_frames['action_logs'] = optimize_dataframe(data_frames['action_logs'])

df_exploration("action_logs", data_frames["action_logs"])
print(f"    Saving dataframe...")
save_dataframe("action_logs", data_frames["action_logs"])

print(f"    Cleaning...")
del data_frames["action_logs"], KEEP_ACTIONS, DROP_COLS
gc.collect()

Preprocessing `action_logs`
    Dropping columns: ['max_attempts', 'score_viewable', 'continuous_score_viewable', 'hint_id', 'explanation_id']
    Removed rows with `action` not in: ['problem_started', 'wrong_response', 'correct_response', 'hint_requested', 'explanation_requested', 'answer_requested', 'problem_finished']
    Optimizing dataframe...
  Memory reduced: 3682.53 MB → 3620.53 MB (1.7% reduction)
action_logs (shape: (16252841, 5))
Report:


Unnamed: 0,Column,Type,Missing,Missing%,Unique
0,assignment_log_id,object,0,0.00%,638201
1,timestamp,float32,0,0.00%,303758
2,problem_id,object,0,0.00%,57360
3,available_core_tutoring,object,11006981,67.72%,4
4,action,object,0,0.00%,7


  Duplicates: 552199
  Sample Data:


Unnamed: 0,assignment_log_id,timestamp,problem_id,available_core_tutoring,action
6144448,1000AQM2VK,1618344576.0,1IEH49XWH5,answer,problem_started
6144449,1000AQM2VK,1618344576.0,1IEH49XWH5,,correct_response
6144450,1000AQM2VK,1618344576.0,1IEH49XWH5,,problem_finished
6144452,1000AQM2VK,1618344576.0,27YYTVQK6K,no_tutoring,problem_started
6144454,1000AQM2VK,1618344704.0,27YYTVQK6K,,problem_finished
6144456,1000AQM2VK,1618344704.0,2K9KDM1BB5,answer,problem_started
6144457,1000AQM2VK,1618344704.0,2K9KDM1BB5,,correct_response
6144458,1000AQM2VK,1618344704.0,2K9KDM1BB5,,problem_finished
6144460,1000AQM2VK,1618344704.0,1HES7DVPEF,no_tutoring,problem_started
6144462,1000AQM2VK,1618344704.0,1HES7DVPEF,,problem_finished


    Saving dataframe...
    Cleaning...


1040

##### 'problem_details' dataframe

In [9]:
PROB_BOOL_COLS = ['problem_contains_image', 'problem_contains_equation', 'problem_contains_video']
print(f"    Filling missing boolean columns `{PROB_BOOL_COLS}` with 0.")
for col in PROB_BOOL_COLS:
    data_frames["problem_details"][col] = data_frames["problem_details"][col].fillna(0).astype(np.int8)

for col in ['problem_skill_code', 'problem_skill_description']:
    data_frames["problem_details"][col] = data_frames["problem_details"][col].fillna('Unknown')

print(f"    Dropping unused columns...")
DROP_COLS = [
    "problem_text_bert_pca",
    "problem_multipart_id",
    "problem_multipart_position",
    "problem_skill_description",
]
data_frames["problem_details"] = data_frames["problem_details"].drop(columns=DROP_COLS)

print(f"    Optimizing dataframe...")
data_frames["problem_details"] = optimize_dataframe(data_frames["problem_details"])

df_exploration('problem_details', data_frames['problem_details'])
print(f"    Saving dataframe...")
save_dataframe('problem_details',data_frames['problem_details'])

print(f"    Cleaning...")
del data_frames['problem_details'], PROB_BOOL_COLS, DROP_COLS
gc.collect()

    Filling missing boolean columns `['problem_contains_image', 'problem_contains_equation', 'problem_contains_video']` with 0.
    Dropping unused columns...
    Optimizing dataframe...
  Memory reduced: 23.16 MB → 23.16 MB (0.0% reduction)
problem_details (shape: (132738, 6))
Report:


Unnamed: 0,Column,Type,Missing,Missing%,Unique
0,problem_id,object,0,0.00%,132738
1,problem_type,object,0,0.00%,10
2,problem_skill_code,object,0,0.00%,542
3,problem_contains_image,int8,0,0.00%,2
4,problem_contains_equation,int8,0,0.00%,2
5,problem_contains_video,int8,0,0.00%,2


  Duplicates: 0
  Sample Data:


Unnamed: 0,problem_id,problem_type,problem_skill_code,problem_contains_image,problem_contains_equation,problem_contains_video
0,10MFND3HAJ,Multiple Choice,6.RP.A.3b,0,0,1
1,IH3MOE7AF,Multiple Choice,6.RP.A.3b,0,0,0
2,14YC7CEE2N,Ungraded Open Response,6.RP.A.3b,0,0,0
3,16L5KQWLN7,Ungraded Open Response,6.RP.A.3b,0,0,0
4,BU0LO0LDD,Ungraded Open Response,6.RP.A.3b,0,0,0
5,W9WPQSAU5,Ungraded Open Response,6.RP.A.3b,0,0,0
6,2OHCH5C5BD,Number,6.RP.A.2,0,0,0
7,9CB1OILA2,Ungraded Open Response,6.RP.A.3a,0,0,0
8,1JCPX2ZOXQ,Ungraded Open Response,6.RP.A.3a,0,0,0
9,AANYMYPL6,Multiple Choice,6.RP.A.2,0,0,0


    Saving dataframe...
    Cleaning...


0

#### Feature Engineering

##### Aggregation Function

In [9]:
def aggregate_attempts(group, assignment_log_id, problem_id):
    if not (group['action'] == 'problem_finished').any():
        return None

    started = group.loc[group['action'] == 'problem_started', 'timestamp'].min()
    finished = group.loc[group['action'] == 'problem_finished', 'timestamp'].max()

    if pd.isna(started) or pd.isna(finished):
        return None

    time_spent = finished - started

    hint_count = (group['action'] == 'hint_requested').sum()
    wrong_count = (group['action'] == 'wrong_response').sum()
    correct_count = (group['action'] == 'correct_response').sum()
    answer_requested = (group['action'] == 'answer_requested').any()
    explanation_requested = (group['action'] == 'explanation_requested').any()

    if correct_count > 0:
        if hint_count == 0 and wrong_count == 0 and not answer_requested:
            final_outcome = 'correct_first_try'
        else:
            final_outcome = 'correct_after_help'
    else:
        final_outcome = 'gave_up'

    return {
        'assignment_log_id': assignment_log_id,
        'problem_id': problem_id,
        'hint_count': hint_count,
        'wrong_count': wrong_count,
        'answer_requested': answer_requested,
        'explanation_requested': explanation_requested,
        'time_spent': time_spent,
        'final_outcome': final_outcome
    }

##### Feature Engineering Process w/ Chunking

In [None]:
print("Loading action_log dataframe...")
action_logs = load_dataframe('action_logs')

action_logs['timestamp'] = pd.to_datetime(action_logs['timestamp'])
action_logs['action'] = action_logs['action'].astype('category')

print("Sorting action logs...")
action_logs = action_logs.sort_values(
    ['assignment_log_id', 'problem_id', 'timestamp']
).reset_index(drop=True)

chunk_size = 1_000_000
temp_dir = str(processing_config.analysis_df_dir / "temp_action_problem")
start_row, chunk_idx = get_resume_state(temp_dir, chunk_size)
buffer = pd.DataFrame()

print("Starting chunked aggregation...")
for start in range(start_row, len(action_logs), chunk_size):
    end = min(start + chunk_size, len(action_logs))
    print(f"Processing rows {start:,} → {end:,}")

    chunk = action_logs.iloc[start:end]
    chunk = pd.concat([buffer, chunk], ignore_index=True)

    grouped = chunk.groupby(
        ['assignment_log_id', 'problem_id'],
        sort=False
    )

    keys = list(grouped.groups.keys())

    if len(keys) == 1:
        buffer = grouped.get_group(keys[0]).copy()
        continue

    complete_keys = keys[:-1]
    carry_key = keys[-1]

    results = []

    for assignment_log_id, problem_id in complete_keys:
        group = grouped.get_group((assignment_log_id, problem_id))
        out = aggregate_attempts(group, assignment_log_id, problem_id)
        if out is not None:
            results.append(out)

    if results:
        out_df = pd.DataFrame(results)
        out_path = f"{temp_dir}/problem_attempts_part_{chunk_idx:05d}.parquet"
        out_df.to_parquet(out_path, index=False)
        print(f"Saved {len(out_df):,} rows → {out_path}")
        chunk_idx += 1

    buffer = grouped.get_group(carry_key).copy()

    del chunk, grouped, results
    gc.collect()

print("Finalizing remaining buffer...")
final_results = []
grouped = buffer.groupby(
    ['assignment_log_id', 'problem_id'],
    sort=False
)
for assignment_log_id, problem_id in grouped.groups:
    group = grouped.get_group((assignment_log_id, problem_id))
    out = aggregate_attempts(group, assignment_log_id, problem_id)
    if out is not None:
        final_results.append(out)
if final_results:
    final_df = pd.DataFrame(final_results)
    out_path = f"{temp_dir}/problem_attempts_part_{chunk_idx:05d}.parquet"
    final_df.to_parquet(out_path, index=False)
    print(f"Saved final {len(final_df):,} rows")

del action_logs
del buffer
gc.collect()

print("Merging chunk files...")
chunk_files = sorted(glob.glob(f"{temp_dir}/problem_attempts_part_*.parquet"))
problem_attempts = pd.concat(
    (pd.read_parquet(f) for f in chunk_files),
    ignore_index=True
)

print(f"Total problem attempts: {len(problem_attempts):,}")
print("Loading problem details...")
problem_details = load_dataframe("problem_details")

action_problem_df = problem_attempts.merge(
    problem_details,
    on='problem_id',
    how='left'
)
save_dataframe("action_problem", action_problem_df)
print("Saved final action_problem dataframe")

print("Cleaning up temporary files...")
for f in chunk_files:
    os.remove(f)
os.rmdir(temp_dir)

del problem_attempts
del problem_details
del action_problem_df
del final_results
if 'final_df' in locals():
    del final_df
gc.collect()

print("Cleanup complete. Ready for next cell.")


Loading action_log dataframe...
Sorting action logs...
Starting chunked aggregation...
Processing rows 0 → 1,000,000
Saved 316,638 rows → problem_attempt_chunks/problem_attempts_part_00000.parquet
Processing rows 1,000,000 → 2,000,000
Saved 315,762 rows → problem_attempt_chunks/problem_attempts_part_00001.parquet
Processing rows 2,000,000 → 3,000,000
Saved 316,614 rows → problem_attempt_chunks/problem_attempts_part_00002.parquet
Processing rows 3,000,000 → 4,000,000
Saved 316,497 rows → problem_attempt_chunks/problem_attempts_part_00003.parquet
Processing rows 4,000,000 → 5,000,000
Saved 315,905 rows → problem_attempt_chunks/problem_attempts_part_00004.parquet
Processing rows 5,000,000 → 6,000,000
Saved 316,039 rows → problem_attempt_chunks/problem_attempts_part_00005.parquet
Processing rows 6,000,000 → 7,000,000
Saved 316,684 rows → problem_attempt_chunks/problem_attempts_part_00006.parquet
Processing rows 7,000,000 → 8,000,000
Saved 315,990 rows → problem_attempt_chunks/problem_attem

#### Transactions Creation

In [None]:
def action_problem_transactions(transactions):
    transactions = transactions.copy()

    transactions['hints'] = pd.cut(
        transactions['hint_count'],
        bins=[-0.1, 0, 2, np.inf],
        labels=['no_hints', 'few_hints', 'many_hints']
    )

    transactions['wrongs'] = pd.cut(
        transactions['wrong_count'],
        bins=[-0.1, 0, 2, np.inf],
        labels=['no_wrongs', 'few_wrongs', 'many_wrongs']
    )

    time_seconds = transactions['time_spent'].dt.total_seconds()

    q1, q2 = time_seconds.quantile([0.33, 0.66])

    transactions['time'] = pd.cut(
        time_seconds,
        bins=[-1, q1, q2, np.inf],
        labels=['fast', 'medium', 'slow']
    )


    transactions['answer'] = np.where(
        transactions['answer_requested'] == True,
        'answer_req',
        'no_answer'
    )

    transactions['explanation'] = np.where(
        transactions['explanation_requested'] == True,
        'explanation_req',
        'no_explanation'
    )

    if 'problem_type' in transactions.columns:
        type_mapping = {
            'Multiple Choice': 'mc',
            'Number': 'numeric',
            'Algebraic Expression': 'algebra',
            'Numeric Expression': 'numeric',
            'Ungraded Open Response': 'open',
            'Check All That Apply': 'mc',
            'Exact Match (ignore case)': 'text',
            'Exact Fraction': 'fraction',
            'Exact Match (case sensitive)': 'text',
            'Ordering': 'ordering'
        }
        transactions['prob_type'] = transactions['problem_type'].map(type_mapping).fillna('other')
    else:
        transactions['prob_type'] = 'unknown'

    transactions['transaction_id'] = (
        transactions['assignment_log_id'].astype(str) + '_' +
        transactions['problem_id'].astype(str)
    )

    item_columns = [
        'hints', 'wrongs', 'time', 'answer',
        'explanation', 'final_outcome', 'prob_type'
    ]

    transactions['items'] = transactions[item_columns].apply(
        lambda row: [str(item) for item in row if pd.notna(item)],
        axis=1
    )

    return transactions[['transaction_id', 'items']]

action_problem_df = load_dataframe('action_problem')

action_problem_t = action_problem_transactions(action_problem_df)
save_dataframe("action_problem_transactions", action_problem_t)

del action_problem_t, action_problem_df
gc.collect()

In [None]:
transactions_df = load_dataframe("action_problem_transactions")
for _, row in transactions_df.head(5).iterrows():
    print(row['transaction_id'], row['items'])

eda_stats = eda_basket_summary(
    transactions_df,
    top_n=5,
    output_dir=processing_config.results_vis_dir,
    plot_filename="A1_item_frequencies.png"
)

del transactions_df
gc.collect()

1000AQM2VK_154L9TTK7O ['no_hints', 'few_wrongs', 'medium', 'answer_req', 'no_explanation', 'correct_after_help', 'numeric']
1000AQM2VK_1D21SH7B18 ['no_hints', 'no_wrongs', 'slow', 'no_answer', 'no_explanation', 'gave_up', 'open']
1000AQM2VK_1H3QM3Z2VV ['no_hints', 'no_wrongs', 'fast', 'no_answer', 'no_explanation', 'gave_up', 'open']
1000AQM2VK_1H3ZH4WT00 ['no_hints', 'few_wrongs', 'fast', 'answer_req', 'no_explanation', 'correct_after_help', 'numeric']
1000AQM2VK_1HES7DVPEF ['no_hints', 'no_wrongs', 'fast', 'no_answer', 'no_explanation', 'gave_up', 'open']

## 2. EDA Progress
**Key Findings (so far):**

- Top 5 items:
  • no_explanation (99.6%)
  • no_hints (99.0%)
  • no_answer (88.4%)
  • no_wrongs (82.8%)
  • fast (55.9%)
- Average basket size: 7.0 items
- 0.0% transactions contain 1–3 items

![Item Frequency Distribution](../results/vis/A1_item_frequencies.png)


1381

#### Transactions Encoding

In [None]:
action_problem_t = load_dataframe("action_problem_transactions")
transactions = action_problem_t['items'].tolist()

te = TransactionEncoder()
te_array = te.fit(transactions).transform(transactions)

encoded_df = pd.DataFrame(
    te_array,
    columns=te.columns_
)

display(encoded_df)
save_dataframe("action_problem_encoded", encoded_df)
del action_problem_t, transactions, te, te_array, 
gc.collect()

Unnamed: 0,algebra,answer_req,correct_after_help,correct_first_try,explanation_req,fast,few_hints,few_wrongs,fraction,gave_up,many_hints,many_wrongs,mc,medium,no_answer,no_explanation,no_hints,no_wrongs,numeric,open,ordering,other,slow,text
0,False,True,True,False,False,False,False,True,False,False,False,False,False,True,False,True,True,False,True,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,True,True,True,False,True,False,False,True,False
2,False,False,False,False,False,True,False,False,False,True,False,False,False,False,True,True,True,True,False,True,False,False,False,False
3,False,True,True,False,False,True,False,True,False,False,False,False,False,False,False,True,True,False,True,False,False,False,False,False
4,False,False,False,False,False,True,False,False,False,True,False,False,False,False,True,True,True,True,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5140884,False,False,False,True,False,True,False,False,False,False,False,False,True,False,True,True,True,True,False,False,False,False,False,False
5140885,True,False,True,False,False,False,False,True,False,False,False,False,False,True,True,True,True,False,False,False,False,False,False,False
5140886,True,True,True,False,False,True,False,True,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False
5140887,False,False,False,True,False,True,False,False,False,False,False,False,True,False,True,True,True,True,False,False,False,False,False,False


6248

#### Top Frequent Items and Rules

In [8]:
encoded_df = load_dataframe('action_problem_encoded')
freq_items1, rules1 = run_apriori(encoded_df)
print_top_association_results(
    freq_items1,
    rules1,
    config=analysis_config
)

del encoded_df, freq_items1, rules1
gc.collect()


TOP 10 FREQUENT ITEMSETS
no_explanation  (support=0.996)
no_hints  (support=0.990)
no_explanation, no_hints  (support=0.986)
no_answer  (support=0.884)
no_answer, no_explanation  (support=0.881)
no_answer, no_hints  (support=0.879)
no_answer, no_explanation, no_hints  (support=0.876)
no_wrongs  (support=0.828)
no_explanation, no_wrongs  (support=0.827)
no_hints, no_wrongs  (support=0.825)
From 557 to 557 items with support > 0.05

TOP 10 ASSOCIATION RULES
correct_after_help, no_answer, no_hints, no_explanation -> few_wrongs  (support=0.083, confidence=0.919, lift=6.509)
correct_after_help, no_answer, no_hints -> few_wrongs  (support=0.084, confidence=0.913, lift=6.467)
correct_after_help, no_answer, no_hints -> few_wrongs, no_explanation  (support=0.083, confidence=0.899, lift=6.442)
correct_after_help, no_answer, no_explanation -> no_hints, few_wrongs  (support=0.083, confidence=0.867, lift=6.323)
no_hints, few_wrongs -> correct_after_help, no_answer, no_explanation  (support=0.083, 

279

### 2. Student-unit aggregations (success/failure prediction)

#### Data Loading

In [10]:
unit_test_scores = data_frames["training_unit_test_scores"]
save_dataframe("unit_test_scores", unit_test_scores)
del unit_test_scores
gc.collect()

assignment_relationships = data_frames['assignment_relationships']
save_dataframe("assignment_relationships", assignment_relationships)
del assignment_relationships
gc.collect()

0

#### Feature Engineering

##### Aggregation Function

In [16]:
def aggregate_assignment_behaviors(actions):
    actions = actions.copy()

    if len(actions) == 0:
        return None
    
    hint_count = (actions['action'] == 'hint_requested').sum()
    wrong_count = (actions['action'] == 'wrong_response').sum()
    correct_count = (actions['action'] == 'correct_response').sum()
    answer_count = (actions['action'] == 'answer_requested').sum()
    explanation_count = (actions['action'] == 'explanation_requested').sum()
    problems_started = (actions['action'] == 'problem_started').sum()
    problems_finished = (actions['action'] == 'problem_finished').sum()

    total_problems = max(problems_started, 1)  # Avoid division by zero
    completion_rate = problems_finished / total_problems if total_problems > 0 else 0
    help_per_problem = (hint_count + explanation_count + answer_count) / total_problems if total_problems > 0 else 0
    wrong_per_problem = wrong_count / total_problems if total_problems > 0 else 0

    return pd.Series({
        'problems_completed': problems_finished,
        'completion_rate': completion_rate,
        'help_per_problem': help_per_problem,
        'wrong_per_problem': wrong_per_problem,
        'hint_count': hint_count,
        'answer_count': answer_count
    })

##### Feature Engineering Process w/ Chunking

In [20]:
from concurrent.futures import ThreadPoolExecutor
# ------------------------------------------------------------------
# Step 0: Build unit ↔ in-unit mapping
# ------------------------------------------------------------------
unit_tests = load_dataframe("unit_test_scores")
unit_test_scores = unit_tests.groupby(
    'assignment_log_id'
).agg({
    'score': ['mean', 'sum', 'count']
}).reset_index()

unit_test_scores.columns = [
    'assignment_log_id',
    'avg_score',
    'total_correct',
    'total_problems'
]
print(f"  Found {len(unit_test_scores):,} unit tests")

assignment_relationships = load_dataframe("assignment_relationships")

unit_with_inunit = assignment_relationships.merge(
    unit_test_scores,
    left_on='unit_test_assignment_log_id',
    right_on='assignment_log_id',
    how='inner'
)

print(f"  Found {len(unit_with_inunit):,} unit test <-> in-unit assignment pairs")

del unit_tests, assignment_relationships, unit_test_scores
gc.collect()


# ------------------------------------------------------------------
# Step 1: Prepare assignment list
# ------------------------------------------------------------------
in_unit_assignments = (
    unit_with_inunit['in_unit_assignment_log_id']
    .drop_duplicates()
    .to_numpy()
)

print(f"  Processing {len(in_unit_assignments):,} unique in-unit assignments...")

batch_size = 10_000
flush_size = 50_000
num_workers = 2

temp_dir = processing_config.analysis_df_dir / "temp_in_unit"
temp_dir.mkdir(parents=True, exist_ok=True)

existing_parts = sorted(temp_dir.glob("in_unit_behaviors_part_*.parquet"))
chunk_idx = len(existing_parts)
start_idx = chunk_idx * flush_size

print(f"Resuming from assignment index {start_idx:,}")
print(f"Existing chunks: {chunk_idx}")

# -------------------------------------------------
# Load once, group once
# -------------------------------------------------
action_logs = load_dataframe("action_logs")
grouped_logs = action_logs.groupby("assignment_log_id", sort=False)
del action_logs
gc.collect()


# -------------------------------------------------
# Worker (THREAD SAFE)
# -------------------------------------------------
def process_assignment(assignment_id):
    try:
        group = grouped_logs.get_group(assignment_id)
    except KeyError:
        return None

    behavior = aggregate_assignment_behaviors(group)
    if behavior is None:
        return None

    behavior["in_unit_assignment_log_id"] = assignment_id
    return behavior


# -------------------------------------------------
# Chunked + threaded aggregation
# -------------------------------------------------
buffer = []
print("Starting in-unit behavior aggregation...")

with ThreadPoolExecutor(max_workers=num_workers) as executor:
    for i in range(start_idx, len(in_unit_assignments), batch_size):
        batch = in_unit_assignments[i:i + batch_size]

        if i % 50_000 == 0:
            print(f"Processed {i:,} / {len(in_unit_assignments):,} assignments")

        for behavior in executor.map(process_assignment, batch):
            if behavior is not None:
                buffer.append(behavior)

        if len(buffer) >= flush_size:
            out_df = pd.DataFrame(buffer)
            out_path = temp_dir / f"in_unit_behaviors_part_{chunk_idx:05d}.parquet"
            out_df.to_parquet(out_path, index=False)

            print(f"Saved {len(out_df):,} rows → {out_path.name}")

            chunk_idx += 1
            buffer.clear()
            del out_df
            gc.collect()


# -------------------------------------------------
# Final flush
# -------------------------------------------------
if buffer:
    out_df = pd.DataFrame(buffer)
    out_path = temp_dir / f"in_unit_behaviors_part_{chunk_idx:05d}.parquet"
    out_df.to_parquet(out_path, index=False)

    print(f"Saved final {len(out_df):,} rows → {out_path.name}")

    buffer.clear()
    del out_df
    gc.collect()


# -------------------------------------------------
# Merge → final DF
# -------------------------------------------------
print("Merging in-unit behavior chunks...")

chunk_files = sorted(temp_dir.glob("in_unit_behaviors_part_*.parquet"))

in_unit_df = pd.concat(
    (pd.read_parquet(f) for f in chunk_files),
    ignore_index=True
)

print(f"Aggregated behaviors for {len(in_unit_df):,} in-unit assignments")

save_dataframe("in_unit_behaviors", in_unit_df)
print("Saved final in_unit_behaviors dataframe")


# ------------------------------------------------------------------
# Step 6: Cleanup
# ------------------------------------------------------------------
del in_unit_df
for f in chunk_files:
    f.unlink()

temp_dir.rmdir()
gc.collect()

  Found 42,343 unit tests
  Found 533,297 unit test <-> in-unit assignment pairs
  Processing 485,956 unique in-unit assignments...
Resuming from assignment index 0
Existing chunks: 0
Starting in-unit behavior aggregation...
Processed 0 / 485,956 assignments
Processed 50,000 / 485,956 assignments
Saved 59,974 rows → in_unit_behaviors_part_00000.parquet
Processed 100,000 / 485,956 assignments
Saved 59,978 rows → in_unit_behaviors_part_00001.parquet
Processed 150,000 / 485,956 assignments
Saved 59,968 rows → in_unit_behaviors_part_00002.parquet
Processed 200,000 / 485,956 assignments
Saved 59,978 rows → in_unit_behaviors_part_00003.parquet
Processed 250,000 / 485,956 assignments
Saved 59,948 rows → in_unit_behaviors_part_00004.parquet
Processed 300,000 / 485,956 assignments
Processed 350,000 / 485,956 assignments
Saved 59,974 rows → in_unit_behaviors_part_00005.parquet
Processed 400,000 / 485,956 assignments
Saved 59,975 rows → in_unit_behaviors_part_00006.parquet
Processed 450,000 / 485

0

In [22]:
in_unit_df = load_dataframe("in_unit_behaviors")
df_exploration("in_unit_behaviors", in_unit_df, 10)
del in_unit_df
gc.collect()

in_unit_behaviors (shape: (485727, 7))
Report:


Unnamed: 0,Column,Type,Missing,Missing%,Unique
0,problems_completed,float64,0,0.00%,64
1,completion_rate,float64,0,0.00%,62
2,help_per_problem,float64,0,0.00%,805
3,wrong_per_problem,float64,0,0.00%,1474
4,hint_count,float64,0,0.00%,37
5,answer_count,float64,0,0.00%,45
6,in_unit_assignment_log_id,object,0,0.00%,485727


  Duplicates: 0
  Sample Data:


Unnamed: 0,problems_completed,completion_rate,help_per_problem,wrong_per_problem,hint_count,answer_count,in_unit_assignment_log_id
0,0.0,0.0,0.0,0.0,0.0,0.0,V6YXT3UG
1,0.0,0.0,0.0,0.0,0.0,0.0,1TFFYMT814
2,0.0,0.0,0.0,0.0,0.0,0.0,1N2IFGUASM
3,0.0,0.0,0.0,0.0,0.0,0.0,15W4ET3W62
4,4.0,1.0,0.0,0.25,0.0,0.0,2DQG3SWWLS
5,4.0,1.0,0.0,0.0,0.0,0.0,Y3G0XTLMF
6,4.0,1.0,0.0,0.25,0.0,0.0,1HLYER60XW
7,4.0,1.0,0.0,0.0,0.0,0.0,1XB8H1OIF8
8,4.0,1.0,0.0,0.25,0.0,0.0,RIFQE6J73
9,4.0,1.0,0.0,0.0,0.0,0.0,19QOOYY90X


0

In [None]:
in_unit_df = load_dataframe("in_unit_behaviors")

# Join behaviors with relationships
unit_with_behaviors = unit_with_inunit.merge(
    in_unit_df,
    on='in_unit_assignment_log_id',
    how='left'
)

# Aggregate per unit test
unit_test_agg = unit_with_behaviors.groupby('unit_test_assignment_log_id').agg({
    'avg_score': 'first',
    'total_correct': 'first',
    'total_problems': 'first',
    'problems_completed': 'mean',
    'completion_rate': 'mean',
    'help_per_problem': 'mean',
    'wrong_per_problem': 'mean',
    'in_unit_assignment_log_id': 'count'  # Count of in-unit assignments
}).reset_index()

unit_test_agg.columns = [
    'assignment_log_id', 'unit_test_score', 'total_correct', 'total_problems',
    'avg_problems_completed', 'avg_completion_rate', 'avg_help_seeking',
    'avg_wrong_attempts', 'num_in_unit_assignments'
]

print(f"  Aggregated {len(unit_test_agg):,} unit tests")

# Enrich with sequence metadata
print("\nStep 5: Enriching with metadata...")
if self.assignment_details is not None and self.sequence_details is not None:
    unit_test_agg = unit_test_agg.merge(
        self.assignment_details[['assignment_log_id', 'sequence_id']],
        on='assignment_log_id',
        how='left'
    )
    
    unit_test_agg = unit_test_agg.merge(
        self.sequence_details[['sequence_id', 'sequence_folder_path_level_1', 
                                'sequence_folder_path_level_2']],
        on='sequence_id',
        how='left'
    )


### 3. Student help-seeking patterns (mastery analysis)