## Imports and Initial Configurations

### Imports

In [2]:
# Make sure to install requirements.txt

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.frequent_patterns import apriori, association_rules
import gc
import psutil
import os
from pathlib import Path
from typing import Optional
from dataclasses import dataclass
import warnings
import ast
from IPython.display import display
from joblib import Parallel, delayed
from tqdm import tqdm
from collections import Counter
import pyarrow as pa
import pyarrow.parquet as pq
import uuid
import pyarrow.dataset as ds
import tempfile
import shutil
from joblib import Parallel, delayed
import json
warnings.filterwarnings('ignore')

### Configurations

In [3]:
plt.style.use('seaborn-v0_8-darkgrid')
plt.rcParams.update({
    "figure.autolayout": True,
    "axes.titleweight": "bold"
})
sns.set_palette("husl")
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: f'{x:.4f}')

## Data Preperation

### Process Configuration Classes

In [4]:
@dataclass
class AnalysisConfig:
    """Apriori analysis parameters"""
    min_support: float = 0.05
    min_confidence: float = 0.6
    min_lift: float = 1.0
    top_n_rules: int = 10

@dataclass
class ProcessingConfig:
    """Data processing configuration"""
    preprocessed_df_dir: Path = Path("../dataset/dataframes")
    dataset_folder: Path = Path('../dataset')
    include_files: tuple = ('action_logs.csv', 
                            'assignment_details.csv',
                            'assignment_relationships.csv',
                            'problem_details.csv',
                            'hint_details.csv',
                            'explanation_details.csv',
                            'sequence_details.csv',
                            'training_unit_test_scores.csv',
                            'sequence_relationships.csv'
                            )

# Initialize
print("Initializing process configurations...")
analysis_config = AnalysisConfig()
processing_config = ProcessingConfig()
print("Configurations initialized.")

Initializing process configurations...
Configurations initialized.


### Utility Functions

In [5]:
def get_memory_usage_mb() -> float:
    return psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024

def log_memory(context: str = "") -> float:
    mem = get_memory_usage_mb()
    print(f"Memory {context}: {mem:.2f} MB")
    return mem


def save_dataframe(name: str, dataframe: pd.DataFrame):
    os.makedirs(ProcessingConfig.preprocessed_df_dir, exist_ok=True)
    # Remove any existing extension and add .parquet
    path = ProcessingConfig.preprocessed_df_dir / f'{name}.parquet'
    dataframe.to_parquet(path, index=False, engine='pyarrow')

def load_dataframe(name: str) -> pd.DataFrame:
    # Remove any existing extension and add .parquet
    path = ProcessingConfig.preprocessed_df_dir / f'{name}.parquet'
    return pd.read_parquet(path, engine='pyarrow')

def optimize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Optimize DataFrame memory usage by downcasting numeric types"""
    initial_memory = df.memory_usage(deep=True).sum() / 1024**2
    df = df.drop_duplicates()
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            # Integers
            if str(col_type).startswith('int'):
                c_min = df[col].min()
                c_max = df[col].max()
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
            
            # Floats
            elif str(col_type).startswith('float'):
                c_min = df[col].min()
                c_max = df[col].max()
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
    
    final_memory = df.memory_usage(deep=True).sum() / 1024**2
    print(f"  Memory reduced: {initial_memory:.2f} MB → {final_memory:.2f} MB "
          f"({100 * (initial_memory - final_memory) / initial_memory:.1f}% reduction)")
    
    return df


### Dataset Files Selection

In [5]:
csv_files = sorted(ProcessingConfig.dataset_folder.glob('*.csv'))
print(f"\nData folder: {ProcessingConfig.dataset_folder.absolute()}")
print(f"Found {len(csv_files)} CSV files:\n")

for f in csv_files:
    size_mb = f.stat().st_size / 1024 / 1024
    included = f.name in processing_config.include_files
    status = "EXCLUDED" if not included else "INCLUDED"
    print(f"  {status} {f.name:<45} {size_mb:>10.2f} MB")
    if not included:
        csv_files.remove(f)


Data folder: /mnt/41A664F31125B500/Personal/Academics/4th_Year/1st_Sem/CSC172_Data_Mining_and_Analysis/CSC172-AssociationMining-Bautista/notebooks/../dataset
Found 10 CSV files:

  INCLUDED action_logs.csv                                  1371.44 MB
  INCLUDED assignment_details.csv                            921.42 MB
  INCLUDED assignment_relationships.csv                       14.25 MB
  EXCLUDED evaluation_unit_test_scores.csv                     3.36 MB
  INCLUDED hint_details.csv                                    3.21 MB
  INCLUDED problem_details.csv                                58.98 MB
  INCLUDED sequence_details.csv                                3.81 MB
  INCLUDED sequence_relationships.csv                          0.27 MB
  INCLUDED training_unit_test_scores.csv                      10.03 MB


### Loading and Initial Exploration of CSV files as DataFrames

#### Initialize DataFrames

In [6]:
data_frames = {f_name.stem: pd.read_csv(f_name) for f_name in csv_files}

#### Initial Exploration

In [7]:
def df_exploration(key: str, df: pd.DataFrame, head_n: int = 10):
    print("=" * 40)
    print(f"{key} (shape: {df.shape})")
    print("=" * 40)

    report = []

    n_rows = len(df)

    for col in df.columns:
        series = df[col]

        # Missing values
        missing = series.isna().sum()

        # Unique values (safe for lists)
        try:
            unique = series.nunique(dropna=True)
        except TypeError:
            unique = series.astype(str).nunique(dropna=True)

        report.append({
            "Column": col,
            "Type": str(series.dtype),
            "Missing": int(missing),
            "Missing%": f"{(missing / n_rows * 100):.2f}%" if n_rows else "0.00%",
            "Unique": int(unique),
        })

    print("Report:")
    display(pd.DataFrame(report))

    try:
        dup_count = df.duplicated().sum()
    except TypeError:
        dup_count = df.astype(str).duplicated().sum()

    print(f"  Duplicates: {dup_count}")
    print(f"  Sample Data:")
    # Head display (safe for large & complex columns)
    display(df.head(head_n))


for key, df in data_frames.items():
    df_exploration(key, df)

action_logs (shape: (23932276, 10))
Report:


Unnamed: 0,Column,Type,Missing,Missing%,Unique
0,assignment_log_id,object,0,0.00%,638528
1,timestamp,float64,0,0.00%,23908418
2,problem_id,object,6136715,25.64%,57360
3,max_attempts,float64,18686416,78.08%,2
4,available_core_tutoring,object,18686416,78.08%,4
5,score_viewable,float64,18686416,78.08%,2
6,continuous_score_viewable,float64,18686416,78.08%,2
7,action,object,0,0.00%,14
8,hint_id,object,23858933,99.69%,9125
9,explanation_id,object,23911140,99.91%,4132


  Duplicates: 0
  Sample Data:


Unnamed: 0,assignment_log_id,timestamp,problem_id,max_attempts,available_core_tutoring,score_viewable,continuous_score_viewable,action,hint_id,explanation_id
0,2QV1F2GSBZ,1599150988.995,,,,,,assignment_started,,
1,2QV1F2GSBZ,1599150990.935,I2GX4OQIE,3.0,answer,1.0,1.0,problem_started,,
2,2QV1F2GSBZ,1599151065.758,I2GX4OQIE,,,,,wrong_response,,
3,2QV1F2GSBZ,1599151090.746,I2GX4OQIE,,,,,wrong_response,,
4,2QV1F2GSBZ,1599151096.323,I2GX4OQIE,,,,,answer_requested,,
5,2QV1F2GSBZ,1599151114.918,I2GX4OQIE,,,,,correct_response,,
6,2QV1F2GSBZ,1599151114.928,I2GX4OQIE,,,,,problem_finished,,
7,2QV1F2GSBZ,1599151116.212,,,,,,continue_selected,,
8,2QV1F2GSBZ,1599151116.407,HCTP9BOV,3.0,answer,1.0,1.0,problem_started,,
9,2QV1F2GSBZ,1599151121.866,HCTP9BOV,,,,,answer_requested,,


assignment_details (shape: (9319676, 9))
Report:


Unnamed: 0,Column,Type,Missing,Missing%,Unique
0,assignment_log_id,object,0,0.00%,9319676
1,teacher_id,object,0,0.00%,23523
2,class_id,object,0,0.00%,47401
3,student_id,object,0,0.00%,651253
4,sequence_id,object,0,0.00%,8774
5,assignment_release_date,float64,0,0.00%,431169
6,assignment_due_date,float64,2744279,29.45%,150786
7,assignment_start_time,object,0,0.00%,9262758
8,assignment_end_time,object,1878016,20.15%,7386233


  Duplicates: 0
  Sample Data:


Unnamed: 0,assignment_log_id,teacher_id,class_id,student_id,sequence_id,assignment_release_date,assignment_due_date,assignment_start_time,assignment_end_time
0,2PLEB2KWK9,22OEQXISYV,133F5L5O95,L97DTM607,1FLYIHK4Q4,1539634782.402,1540066860.0,1539634866.476,
1,8G25XNCXN,2SKA2RTF6,2OL82EC95R,21S35PU5W2,CDLX4UJ84,1539871373.139,,1539871403.267,1539871641.345
2,266AW7UU1V,1FJ326JFAH,1WJWBO8XL4,IBO6BEHXA,2T42B3UC5,1539884662.992,,1539884690.684,
3,15SHL0U0E6,129LDU45TT,IBO6BEHXA,1CT2ERTNC7,7ZGYNOHS3,1539895814.081,1540241520.0,1539952545.055,
4,CQA32TBFI,1FJ326JFAH,1WJWBO8XL4,2JC4HHXU4M,2T42B3UC5,1539884662.992,,1539974068.802,
5,RNSUY1N30,1FJ326JFAH,1WJWBO8XL4,2JC4HHXU4M,116QWSQWM9,1541431922.967,,1541431936.621,
6,1SRBUROB4M,1A8U1KW3AV,2DFHHHY3AO,GJD8FQZUO,SZQ65NBOQ,1542217921.97,,1542218008.283,
7,1MOHSGSH4S,1A8U1KW3AV,2DFHHHY3AO,GJD8FQZUO,SZQ65NBOQ,1542217958.218,,1542218340.303,1542219741.687
8,22UBCQFR5,1A8U1KW3AV,2DFHHHY3AO,288F3DEIBP,SZQ65NBOQ,1542217958.218,,1542221221.258,1542222007.714
9,1PRB4L8LJ3,1A8U1KW3AV,2DFHHHY3AO,SAUMQVPOS,SZQ65NBOQ,1542217958.218,,1542221226.111,1542221972.925


assignment_relationships (shape: (702887, 2))
Report:


Unnamed: 0,Column,Type,Missing,Missing%,Unique
0,unit_test_assignment_log_id,object,0,0.00%,56577
1,in_unit_assignment_log_id,object,0,0.00%,638528


  Duplicates: 3048
  Sample Data:


Unnamed: 0,unit_test_assignment_log_id,in_unit_assignment_log_id
0,7FGC8P0F1,V6YXT3UG
1,15KQFID5U5,1TFFYMT814
2,QKDRPCXSG,1N2IFGUASM
3,1JOJIQXU1B,15W4ET3W62
4,2C9YZRVZT0,1WORTY787C
5,38M6IA4SS,2DQG3SWWLS
6,15XW17EHLW,Y3G0XTLMF
7,2C5IG7FC12,1HLYER60XW
8,F9OJCBCRM,1XB8H1OIF8
9,2OJ73SYFF6,1XB8H1OIF8


explanation_details (shape: (4132, 6))
Report:


Unnamed: 0,Column,Type,Missing,Missing%,Unique
0,explanation_id,object,0,0.00%,4132
1,explanation_creator_id,object,0,0.00%,14
2,explanation_contains_image,int64,0,0.00%,2
3,explanation_contains_equation,int64,0,0.00%,2
4,explanation_contains_video,int64,0,0.00%,2
5,explanation_text_bert_pca,object,0,0.00%,2367


  Duplicates: 0
  Sample Data:


Unnamed: 0,explanation_id,explanation_creator_id,explanation_contains_image,explanation_contains_equation,explanation_contains_video,explanation_text_bert_pca
0,F9LDM3F28,2QVSAY30Y4,0,0,1,"[-14.54676799,-0.10885940,-0.19061570,-0.07347..."
1,1LP7J12588,2QVSAY30Y4,0,0,1,"[-14.54676799,-0.10885940,-0.19061570,-0.07347..."
2,166NMZR8N1,2QVSAY30Y4,0,0,1,"[-14.54676799,-0.10885940,-0.19061570,-0.07347..."
3,BLPMAHQ1U,2QVSAY30Y4,0,0,1,"[-14.54676799,-0.10885940,-0.19061570,-0.07347..."
4,9BN3ZCYZB,2QVSAY30Y4,0,0,1,"[-14.54676799,-0.10885940,-0.19061570,-0.07347..."
5,1RZOX5Y06X,2QVSAY30Y4,0,0,1,"[-14.54676799,-0.10885940,-0.19061570,-0.07347..."
6,2OC8YHF2A2,2QVSAY30Y4,0,0,1,"[-14.54676799,-0.10885940,-0.19061570,-0.07347..."
7,13LRF9OH6O,2QVSAY30Y4,0,0,1,"[-14.54676799,-0.10885940,-0.19061570,-0.07347..."
8,CW8S6AU12,2QVSAY30Y4,0,0,1,"[-14.54676799,-0.10885940,-0.19061570,-0.07347..."
9,14QA68SKWP,2QVSAY30Y4,0,0,1,"[-14.54676799,-0.10885940,-0.19061570,-0.07347..."


hint_details (shape: (8381, 7))
Report:


Unnamed: 0,Column,Type,Missing,Missing%,Unique
0,hint_id,object,0,0.00%,8381
1,hint_creator_id,object,0,0.00%,17
2,hint_position,int64,0,0.00%,4
3,hint_contains_image,int64,0,0.00%,2
4,hint_contains_equation,int64,0,0.00%,2
5,hint_contains_video,int64,0,0.00%,1
6,hint_text_bert_pca,object,0,0.00%,6405


  Duplicates: 0
  Sample Data:


Unnamed: 0,hint_id,hint_creator_id,hint_position,hint_contains_image,hint_contains_equation,hint_contains_video,hint_text_bert_pca
0,27QACMUW18,19ETZUMD36,1,0,0,0,"[1.17008950,2.15066114,-3.16681823,9.14589015,..."
1,2A6J4ALXFX,19ETZUMD36,1,0,0,0,"[-1.29619265,1.00184040,-2.53840949,10.1349365..."
2,N2CKUB1LY,19ETZUMD36,1,0,0,0,"[-1.29619265,1.00184040,-2.53840949,10.1349365..."
3,10N2WE3WMP,19ETZUMD36,1,0,0,0,"[-1.29619265,1.00184040,-2.53840949,10.1349365..."
4,UJC2I281G,19ETZUMD36,1,0,0,0,"[-1.29619265,1.00184040,-2.53840949,10.1349365..."
5,K1HC9Z0XW,19ETZUMD36,1,0,0,0,"[4.08991516,1.02389493,-0.45771406,4.43437555,..."
6,2N0KOMXKDI,19ETZUMD36,2,0,1,0,"[-0.50686679,1.28997016,-4.69543767,5.13560538..."
7,1LU9FDMWST,19ETZUMD36,1,0,0,0,"[3.07992179,-1.11033570,-3.23417373,1.20729399..."
8,1KCNFLP3NG,19ETZUMD36,2,0,0,0,"[-1.79526883,-3.39402474,0.99609407,3.10209348..."
9,1TUCU89LRV,19ETZUMD36,3,0,0,0,"[0.23790174,1.02181451,-4.05085219,0.63962018,..."


problem_details (shape: (132738, 10))
Report:


Unnamed: 0,Column,Type,Missing,Missing%,Unique
0,problem_id,object,0,0.00%,132738
1,problem_multipart_id,object,0,0.00%,70108
2,problem_multipart_position,int64,0,0.00%,55
3,problem_type,object,0,0.00%,10
4,problem_skill_code,object,820,0.62%,541
5,problem_skill_description,object,820,0.62%,539
6,problem_contains_image,float64,5,0.00%,2
7,problem_contains_equation,float64,5,0.00%,2
8,problem_contains_video,float64,5,0.00%,2
9,problem_text_bert_pca,object,0,0.00%,85042


  Duplicates: 0
  Sample Data:


Unnamed: 0,problem_id,problem_multipart_id,problem_multipart_position,problem_type,problem_skill_code,problem_skill_description,problem_contains_image,problem_contains_equation,problem_contains_video,problem_text_bert_pca
0,10MFND3HAJ,2MHCTW1IIN,1,Multiple Choice,6.RP.A.3b,Unit Rate,0.0,0.0,1.0,"[0.53955209,-0.96322744,0.49725574,6.28795392,..."
1,IH3MOE7AF,1UEQMXOOFA,1,Multiple Choice,6.RP.A.3b,Unit Rate,0.0,0.0,0.0,"[-1.61147666,-1.50911536,0.52055446,6.01118343..."
2,14YC7CEE2N,1UEQMXOOFA,2,Ungraded Open Response,6.RP.A.3b,Unit Rate,0.0,0.0,0.0,"[-8.95361845,5.26005410,-4.41350451,-2.6751771..."
3,16L5KQWLN7,1W7DRPNEJL,1,Ungraded Open Response,6.RP.A.3b,Unit Rate,0.0,0.0,0.0,"[-2.89295465,1.73222701,-0.21075635,0.16314057..."
4,BU0LO0LDD,1Z6MGLD8VK,1,Ungraded Open Response,6.RP.A.3b,Unit Rate,0.0,0.0,0.0,"[-1.53959700,1.35386494,-1.56874727,0.89545312..."
5,W9WPQSAU5,MBYKGWG5L,1,Ungraded Open Response,6.RP.A.3b,Unit Rate,0.0,0.0,0.0,"[-3.20997122,0.64484637,-0.57017812,-0.4925776..."
6,2OHCH5C5BD,O0EI8SMXR,1,Number,6.RP.A.2,Expressing Unit Rate in Words,0.0,0.0,0.0,"[-1.47761510,-1.33452493,-0.32730713,0.4147120..."
7,9CB1OILA2,A1DWWVVLC,1,Ungraded Open Response,6.RP.A.3a,Making Equivalent Ratio Tables,0.0,0.0,0.0,"[-3.49009235,-4.33279096,1.77473598,-0.3124369..."
8,1JCPX2ZOXQ,K65VD17P2,1,Ungraded Open Response,6.RP.A.3a,Making Equivalent Ratio Tables,0.0,0.0,0.0,"[-1.43699869,-4.19686441,-0.77936048,4.2911283..."
9,AANYMYPL6,1K9KSMZ0FV,1,Multiple Choice,6.RP.A.2,Expressing Unit Rate in Words,0.0,0.0,0.0,"[-1.70709674,-2.89674901,-0.17309755,5.5549082..."


sequence_details (shape: (10774, 8))
Report:


Unnamed: 0,Column,Type,Missing,Missing%,Unique
0,sequence_id,object,0,0.00%,10228
1,sequence_folder_path_level_1,object,0,0.00%,2
2,sequence_folder_path_level_2,object,0,0.00%,17
3,sequence_folder_path_level_3,object,0,0.00%,177
4,sequence_folder_path_level_4,object,96,0.89%,2406
5,sequence_folder_path_level_5,object,8790,81.59%,719
6,sequence_name,object,0,0.00%,10225
7,sequence_problem_ids,object,0,0.00%,10677


  Duplicates: 0
  Sample Data:


Unnamed: 0,sequence_id,sequence_folder_path_level_1,sequence_folder_path_level_2,sequence_folder_path_level_3,sequence_folder_path_level_4,sequence_folder_path_level_5,sequence_name,sequence_problem_ids
0,K1U9M2PVF,EngageNY/Eureka Math (© by Great Minds®) *,Algebra I,Module 1 - Relationships Between Quantities an...,Module 1---Assessments,,End-of-Module---Alg 1.1 End-of-Module Assessment,"[AQ0ZKSP6D,2KTD380L98,7CPDNFDLD,2F9VV7RVWU,255..."
1,1XEPEYCPC3,EngageNY/Eureka Math (© by Great Minds®) *,Algebra I,Module 1 - Relationships Between Quantities an...,Module 1---Assessments,,Mid-Module---Alg1.1 Mid-Module Assessment,"[WS70M9DP1,13HDHY5VMI,24WQMJBRDX,1IFT888E81,F2..."
2,20SXJMMSRG,EngageNY/Eureka Math (© by Great Minds®) *,Algebra I,Module 1 - Relationships Between Quantities an...,Topic A---Lesson 1: Graphs of Piecewise Linear...,,"Problem Set---Algebra I, M1, Lesson 1 (N.Q.A.1...","[1D3AXDDMQ9,2HVIXDM2L5,1I9N9TMSO6,182WSU48H,Z6..."
3,1SMS0A4N5G,EngageNY/Eureka Math (© by Great Minds®) *,Algebra I,Module 1 - Relationships Between Quantities an...,Topic A---Lesson 2: Graphs of Quadratic Functions,,"Classwork---Algebra I, M1, Lesson 2 (N.Q.A.1, ...","[1X69IIUXB1,E083MYD2P]"
4,1BROMSHRRA,EngageNY/Eureka Math (© by Great Minds®) *,Algebra I,Module 1 - Relationships Between Quantities an...,Topic A---Lesson 2: Graphs of Quadratic Functions,,"Exit Ticket---Algebra 1, M1, Lesson 2 (N.Q.1, ...",[2BLJ83JUIM]
5,520QV3Q8S,EngageNY/Eureka Math (© by Great Minds®) *,Algebra I,Module 1 - Relationships Between Quantities an...,Topic A---Lesson 2: Graphs of Quadratic Functions,,"Problem Set---Algebra I, M1, Lesson 2 (N.Q.A.1...","[29VD2UIJTI,1M74LB6G3J,1772ID2XLH,1J69K3FEKK,1..."
6,2FMEH9Y63M,EngageNY/Eureka Math (© by Great Minds®) *,Algebra I,Module 1 - Relationships Between Quantities an...,Topic A---Lesson 3: Graphs of Exponential Func...,,"Classwork---Algebra I, M1, Lesson 3 (N.Q.1, N....","[1JHWLIKQDC,1VEVZAR2XS,OB0DO2ZS,RKO1EOZBD]"
7,1RASYU5JGC,EngageNY/Eureka Math (© by Great Minds®) *,Algebra I,Module 1 - Relationships Between Quantities an...,Topic A---Lesson 3: Graphs of Exponential Func...,,"Exit Ticket---Algebra 1, M1, Lesson 3 (N.Q.1, ...","[87T4WGIEB,2D5DN5AD90]"
8,2MZEXEMAEN,EngageNY/Eureka Math (© by Great Minds®) *,Algebra I,Module 1 - Relationships Between Quantities an...,Topic A---Lesson 3: Graphs of Exponential Func...,,"Problem Set---Algebra I, M1, Lesson 3 (N.Q.A.1...","[2FP1R2REA1,166VN2L2PC,2KKXLO5S5Q,1W7WB4TMPB,X..."
9,IZ1NEEPQP,EngageNY/Eureka Math (© by Great Minds®) *,Algebra I,Module 1 - Relationships Between Quantities an...,Topic A---Lesson 4: Analyzing Graphs—Water Usa...,,"Classwork---Algebra I, M1, Lesson 4 (N.Q.A.1, ...","[1CNJYSTUEH,2KULLSIT0Y,1MZ1MDC6AK,H319LYQXU,2H..."


sequence_relationships (shape: (13108, 2))
Report:


Unnamed: 0,Column,Type,Missing,Missing%,Unique
0,unit_test_sequence_id,object,0,0.00%,240
1,in_unit_sequence_id,object,0,0.00%,9186


  Duplicates: 544
  Sample Data:


Unnamed: 0,unit_test_sequence_id,in_unit_sequence_id
0,K1U9M2PVF,1XEPEYCPC3
1,K1U9M2PVF,20SXJMMSRG
2,K1U9M2PVF,1SMS0A4N5G
3,K1U9M2PVF,1BROMSHRRA
4,K1U9M2PVF,520QV3Q8S
5,K1U9M2PVF,2FMEH9Y63M
6,K1U9M2PVF,1RASYU5JGC
7,K1U9M2PVF,2MZEXEMAEN
8,K1U9M2PVF,IZ1NEEPQP
9,K1U9M2PVF,YG6VSZANE


training_unit_test_scores (shape: (452439, 3))
Report:


Unnamed: 0,Column,Type,Missing,Missing%,Unique
0,assignment_log_id,object,0,0.00%,42343
1,problem_id,object,0,0.00%,1835
2,score,int64,0,0.00%,2


  Duplicates: 0
  Sample Data:


Unnamed: 0,assignment_log_id,problem_id,score
0,1CEASUAUQJ,18J6436AS5,1
1,2IMKPEIL2Q,9RMI4CZU9,0
2,2IMKPEIL2Q,8F4U5WWTV,0
3,2IMKPEIL2Q,27D3I359NE,1
4,2IMKPEIL2Q,22DY4PFVMV,1
5,2IMKPEIL2Q,ZQMHFZJ53,1
6,2IMKPEIL2Q,1II2JVYEQV,0
7,2MZN9L748R,2N2SARA9Q6,1
8,2MZN9L748R,123M9UFYL2,0
9,2MZN9L748R,1WL078QSL4,0


### Data Cleaning Strategies

#### 'action_logs' dataframe

In [8]:
print("Preprocessing `action_logs`")
ACTION_COLS_TO_KEEP = [
    'assignment_log_id',
    'timestamp',
    'problem_id',
    'action',
    'hint_id',
    'explanation_id',
    'max_attempts',
    'available_core_tutoring',
    'score_viewable',
    'continuous_score_viewable'
]

ACTION_COLS_DROP = [c for c in data_frames['action_logs'].columns if c not in ACTION_COLS_TO_KEEP]
data_frames["action_logs"] = data_frames["action_logs"].drop(columns=ACTION_COLS_DROP)
print(f"    Droped columns: {ACTION_COLS_DROP}")

KEEP_ACTIONS = [
    'problem_started',
    'correct_response',
    'wrong_response',
    'hint_requested',
    'explanation_requested',
    'answer_requested',
    'problem_finished'
]
data_frames["action_logs"] = data_frames["action_logs"][data_frames["action_logs"]["action"].isin(KEEP_ACTIONS)]
print(f"    Removed rows with `action` not in: {KEEP_ACTIONS}")

# Convert timestamp to datetime (seconds → datetime)
data_frames["action_logs"]["timestamp"] = pd.to_datetime(
    data_frames["action_logs"]["timestamp"],
    unit="s",
    errors="coerce"
)
print(f"    Converted `timestamp` to datetime seconds")

# Sort for sequence analysis
data_frames["action_logs"] = data_frames["action_logs"].sort_values(
    ["assignment_log_id", "timestamp"]
)

print(f"    Optimizing dataframe...")
data_frames['action_logs'] = optimize_dataframe(data_frames['action_logs'])

# Explore
df_exploration("action_logs", data_frames["action_logs"])

# Save cleaned dataframe
save_dataframe("action_logs", data_frames["action_logs"])

# Free memory
del data_frames["action_logs"], ACTION_COLS_DROP, ACTION_COLS_TO_KEEP, KEEP_ACTIONS
gc.collect()

Preprocessing `action_logs`
    Droped columns: []
    Removed rows with `action` not in: ['problem_started', 'correct_response', 'wrong_response', 'hint_requested', 'explanation_requested', 'answer_requested', 'problem_finished']
    Converted `timestamp` to datetime seconds
    Optimizing dataframe...
  Memory reduced: 5048.92 MB → 4862.92 MB (3.7% reduction)
action_logs (shape: (16252841, 10))
Report:


Unnamed: 0,Column,Type,Missing,Missing%,Unique
0,assignment_log_id,object,0,0.00%,638201
1,timestamp,datetime64[ns],0,0.00%,16241543
2,problem_id,object,0,0.00%,57360
3,max_attempts,float32,11006981,67.72%,2
4,available_core_tutoring,object,11006981,67.72%,4
5,score_viewable,float32,11006981,67.72%,2
6,continuous_score_viewable,float32,11006981,67.72%,2
7,action,object,0,0.00%,7
8,hint_id,object,16179498,99.55%,9125
9,explanation_id,object,16231705,99.87%,4132


  Duplicates: 0
  Sample Data:


Unnamed: 0,assignment_log_id,timestamp,problem_id,max_attempts,available_core_tutoring,score_viewable,continuous_score_viewable,action,hint_id,explanation_id
6144448,1000AQM2VK,2021-04-13 20:09:24.598999977,1IEH49XWH5,3.0,answer,1.0,1.0,problem_started,,
6144449,1000AQM2VK,2021-04-13 20:09:48.509999989,1IEH49XWH5,,,,,correct_response,,
6144450,1000AQM2VK,2021-04-13 20:09:48.516000032,1IEH49XWH5,,,,,problem_finished,,
6144452,1000AQM2VK,2021-04-13 20:09:50.427000046,27YYTVQK6K,1.0,no_tutoring,1.0,0.0,problem_started,,
6144454,1000AQM2VK,2021-04-13 20:11:02.394999981,27YYTVQK6K,,,,,problem_finished,,
6144456,1000AQM2VK,2021-04-13 20:11:04.138000011,2K9KDM1BB5,3.0,answer,1.0,1.0,problem_started,,
6144457,1000AQM2VK,2021-04-13 20:11:22.243999958,2K9KDM1BB5,,,,,correct_response,,
6144458,1000AQM2VK,2021-04-13 20:11:22.246999979,2K9KDM1BB5,,,,,problem_finished,,
6144460,1000AQM2VK,2021-04-13 20:11:23.434000015,1HES7DVPEF,1.0,no_tutoring,1.0,0.0,problem_started,,
6144462,1000AQM2VK,2021-04-13 20:12:13.062000036,1HES7DVPEF,,,,,problem_finished,,


49

#### 'assignment_details' dataframe

In [9]:
print("Preprocessing `assignment_details`")
ASSIGNMENT_COLS_TO_KEEP = [
    'assignment_log_id',
    'student_id',
    'teacher_id',
    'class_id',
    'sequence_id',
    'assignment_start_time',
    'assignment_end_time',
]
ASSIGNMENT_COLS_DROP = [c for c in data_frames['assignment_details'].columns if c not in ASSIGNMENT_COLS_TO_KEEP]
data_frames["assignment_details"] = data_frames["assignment_details"].drop(columns=ASSIGNMENT_COLS_DROP)
print(f"    Droped columns: {ASSIGNMENT_COLS_DROP}")

# Convert time measurements
TO_CONVERT = ['assignment_start_time', 'assignment_end_time']
for col in TO_CONVERT:
    if col in data_frames["assignment_details"].columns:
        data_frames["assignment_details"][col] = pd.to_datetime(data_frames["assignment_details"][col], unit='s', errors='coerce')
print(f"    Converted `{TO_CONVERT}` to datetime seconds`")

print(f"    Optimizing dataframe...")
data_frames["assignment_details"] = optimize_dataframe(data_frames["assignment_details"])

df_exploration('assignment_details', data_frames['assignment_details'])
save_dataframe('assignment_details',data_frames['assignment_details'])
del data_frames['assignment_details'], ASSIGNMENT_COLS_TO_KEEP, ASSIGNMENT_COLS_DROP, TO_CONVERT
gc.collect()

Preprocessing `assignment_details`
    Droped columns: ['assignment_release_date', 'assignment_due_date']
    Converted `['assignment_start_time', 'assignment_end_time']` to datetime seconds`
    Optimizing dataframe...
  Memory reduced: 2747.76 MB → 2747.76 MB (0.0% reduction)
assignment_details (shape: (9319676, 7))
Report:


Unnamed: 0,Column,Type,Missing,Missing%,Unique
0,assignment_log_id,object,0,0.00%,9319676
1,teacher_id,object,0,0.00%,23523
2,class_id,object,0,0.00%,47401
3,student_id,object,0,0.00%,651253
4,sequence_id,object,0,0.00%,8774
5,assignment_start_time,datetime64[ns],53615,0.58%,9262757
6,assignment_end_time,datetime64[ns],1931631,20.73%,7386232


  Duplicates: 0
  Sample Data:


Unnamed: 0,assignment_log_id,teacher_id,class_id,student_id,sequence_id,assignment_start_time,assignment_end_time
0,2PLEB2KWK9,22OEQXISYV,133F5L5O95,L97DTM607,1FLYIHK4Q4,2018-10-15 20:21:06.476000071,NaT
1,8G25XNCXN,2SKA2RTF6,2OL82EC95R,21S35PU5W2,CDLX4UJ84,2018-10-18 14:03:23.266999959,2018-10-18 14:07:21.345000029
2,266AW7UU1V,1FJ326JFAH,1WJWBO8XL4,IBO6BEHXA,2T42B3UC5,2018-10-18 17:44:50.684000015,NaT
3,15SHL0U0E6,129LDU45TT,IBO6BEHXA,1CT2ERTNC7,7ZGYNOHS3,2018-10-19 12:35:45.055000067,NaT
4,CQA32TBFI,1FJ326JFAH,1WJWBO8XL4,2JC4HHXU4M,2T42B3UC5,2018-10-19 18:34:28.802000046,NaT
5,RNSUY1N30,1FJ326JFAH,1WJWBO8XL4,2JC4HHXU4M,116QWSQWM9,2018-11-05 15:32:16.621000051,NaT
6,1SRBUROB4M,1A8U1KW3AV,2DFHHHY3AO,GJD8FQZUO,SZQ65NBOQ,2018-11-14 17:53:28.282999992,NaT
7,1MOHSGSH4S,1A8U1KW3AV,2DFHHHY3AO,GJD8FQZUO,SZQ65NBOQ,2018-11-14 17:59:00.302999973,2018-11-14 18:22:21.687000036
8,22UBCQFR5,1A8U1KW3AV,2DFHHHY3AO,288F3DEIBP,SZQ65NBOQ,2018-11-14 18:47:01.257999896,2018-11-14 19:00:07.713999987
9,1PRB4L8LJ3,1A8U1KW3AV,2DFHHHY3AO,SAUMQVPOS,SZQ65NBOQ,2018-11-14 18:47:06.111000061,2018-11-14 18:59:32.924999952


0

#### 'assignment_relationships' dataframe

In [10]:
print("Preprocessing `assignment_relationships`")
print(f"    Optimizing dataframe...")
data_frames["assignment_relationships"] = optimize_dataframe(data_frames["assignment_relationships"])

df_exploration('assignment_relationships', data_frames['assignment_relationships'])
save_dataframe('assignment_relationships',data_frames['assignment_relationships'])
del data_frames['assignment_relationships']
gc.collect()

Preprocessing `assignment_relationships`
    Optimizing dataframe...
  Memory reduced: 78.60 MB → 83.60 MB (-6.4% reduction)
assignment_relationships (shape: (699839, 2))
Report:


Unnamed: 0,Column,Type,Missing,Missing%,Unique
0,unit_test_assignment_log_id,object,0,0.00%,56577
1,in_unit_assignment_log_id,object,0,0.00%,638528


  Duplicates: 0
  Sample Data:


Unnamed: 0,unit_test_assignment_log_id,in_unit_assignment_log_id
0,7FGC8P0F1,V6YXT3UG
1,15KQFID5U5,1TFFYMT814
2,QKDRPCXSG,1N2IFGUASM
3,1JOJIQXU1B,15W4ET3W62
4,2C9YZRVZT0,1WORTY787C
5,38M6IA4SS,2DQG3SWWLS
6,15XW17EHLW,Y3G0XTLMF
7,2C5IG7FC12,1HLYER60XW
8,F9OJCBCRM,1XB8H1OIF8
9,2OJ73SYFF6,1XB8H1OIF8


0

#### 'explanation_details' dataframe

In [11]:
print("Preprocessing `explanation_details`")
EXPL_COLS_TO_KEEP = [
    'explanation_id',
    'explanation_contains_image',
    'explanation_contains_equation',
    'explanation_contains_video'
]
EXPL_COLS_DROP = [c for c in data_frames['explanation_details'].columns if c not in EXPL_COLS_TO_KEEP]
data_frames["explanation_details"] = data_frames["explanation_details"].drop(columns=EXPL_COLS_DROP)
print(f"    Droped columns: {EXPL_COLS_DROP}")

print(f"    Optimizing dataframe...")
data_frames["explanation_details"] = optimize_dataframe(data_frames["explanation_details"])

df_exploration('explanation_details', data_frames['explanation_details'])
save_dataframe('explanation_details',data_frames['explanation_details'])
del data_frames['explanation_details'], EXPL_COLS_TO_KEEP, EXPL_COLS_DROP
gc.collect()

Preprocessing `explanation_details`
    Droped columns: ['explanation_creator_id', 'explanation_text_bert_pca']
    Optimizing dataframe...
  Memory reduced: 0.33 MB → 0.24 MB (25.4% reduction)
explanation_details (shape: (4132, 4))
Report:


Unnamed: 0,Column,Type,Missing,Missing%,Unique
0,explanation_id,object,0,0.00%,4132
1,explanation_contains_image,int8,0,0.00%,2
2,explanation_contains_equation,int8,0,0.00%,2
3,explanation_contains_video,int8,0,0.00%,2


  Duplicates: 0
  Sample Data:


Unnamed: 0,explanation_id,explanation_contains_image,explanation_contains_equation,explanation_contains_video
0,F9LDM3F28,0,0,1
1,1LP7J12588,0,0,1
2,166NMZR8N1,0,0,1
3,BLPMAHQ1U,0,0,1
4,9BN3ZCYZB,0,0,1
5,1RZOX5Y06X,0,0,1
6,2OC8YHF2A2,0,0,1
7,13LRF9OH6O,0,0,1
8,CW8S6AU12,0,0,1
9,14QA68SKWP,0,0,1


0

#### 'hint_details' dataframe

In [12]:
print("Preprocessing `hint_details`")
HINT_COLS_TO_KEEP = [
    'hint_id',
    'hint_position',
    'hint_contains_image',
    'hint_contains_equation',
    'hint_contains_video'
]

HINT_COLS_DROP = [c for c in data_frames['hint_details'].columns if c not in HINT_COLS_TO_KEEP]
data_frames["hint_details"] = data_frames["hint_details"].drop(columns=HINT_COLS_DROP)
print(f"    Droped columns: {HINT_COLS_DROP}")

print(f"    Optimizing dataframe...")
data_frames["hint_details"] = optimize_dataframe(data_frames["hint_details"])

df_exploration('hint_details', data_frames['hint_details'])
save_dataframe('hint_details',data_frames['hint_details'])
del data_frames['hint_details'], HINT_COLS_TO_KEEP, HINT_COLS_DROP
gc.collect()

Preprocessing `hint_details`
    Droped columns: ['hint_creator_id', 'hint_text_bert_pca']
    Optimizing dataframe...
  Memory reduced: 0.72 MB → 0.50 MB (30.9% reduction)
hint_details (shape: (8381, 5))
Report:


Unnamed: 0,Column,Type,Missing,Missing%,Unique
0,hint_id,object,0,0.00%,8381
1,hint_position,int8,0,0.00%,4
2,hint_contains_image,int8,0,0.00%,2
3,hint_contains_equation,int8,0,0.00%,2
4,hint_contains_video,int8,0,0.00%,1


  Duplicates: 0
  Sample Data:


Unnamed: 0,hint_id,hint_position,hint_contains_image,hint_contains_equation,hint_contains_video
0,27QACMUW18,1,0,0,0
1,2A6J4ALXFX,1,0,0,0
2,N2CKUB1LY,1,0,0,0
3,10N2WE3WMP,1,0,0,0
4,UJC2I281G,1,0,0,0
5,K1HC9Z0XW,1,0,0,0
6,2N0KOMXKDI,2,0,1,0
7,1LU9FDMWST,1,0,0,0
8,1KCNFLP3NG,2,0,0,0
9,1TUCU89LRV,3,0,0,0


0

#### 'problem_details' dataframe

In [13]:
print("Preprocessing `problem_details`")
PROBLEM_COLS_TO_KEEP = [
    'problem_id',
    'problem_type',
    'problem_skill_code',
    'problem_multipart_id',
    'problem_multipart_position',
    'problem_contains_image',
    'problem_contains_equation',
    'problem_contains_video'
]
PROBLEM_COLS_DROP = [c for c in data_frames['problem_details'].columns if c not in PROBLEM_COLS_TO_KEEP]
data_frames["problem_details"] = data_frames["problem_details"].drop(columns=PROBLEM_COLS_DROP)
print(f"    Droped columns: {PROBLEM_COLS_DROP}")

PROB_BOOL_COLS = ['problem_contains_image', 'problem_contains_equation', 'problem_contains_video']
print(f"    Filling missing boolean columns `{PROB_BOOL_COLS}` with 0.")
for col in PROB_BOOL_COLS:
    data_frames["problem_details"][col] = data_frames["problem_details"][col].fillna(0).astype(np.int8)

print(f"    Optimizing dataframe...")
data_frames["problem_details"] = optimize_dataframe(data_frames["problem_details"])

df_exploration('problem_details', data_frames['problem_details'])
save_dataframe('problem_details',data_frames['problem_details'])
del data_frames['problem_details'], PROBLEM_COLS_TO_KEEP, PROBLEM_COLS_DROP
gc.collect()

Preprocessing `problem_details`
    Droped columns: ['problem_skill_description', 'problem_text_bert_pca']
    Filling missing boolean columns `['problem_contains_image', 'problem_contains_equation', 'problem_contains_video']` with 0.
    Optimizing dataframe...
  Memory reduced: 31.58 MB → 30.69 MB (2.8% reduction)
problem_details (shape: (132738, 8))
Report:


Unnamed: 0,Column,Type,Missing,Missing%,Unique
0,problem_id,object,0,0.00%,132738
1,problem_multipart_id,object,0,0.00%,70108
2,problem_multipart_position,int8,0,0.00%,55
3,problem_type,object,0,0.00%,10
4,problem_skill_code,object,820,0.62%,541
5,problem_contains_image,int8,0,0.00%,2
6,problem_contains_equation,int8,0,0.00%,2
7,problem_contains_video,int8,0,0.00%,2


  Duplicates: 0
  Sample Data:


Unnamed: 0,problem_id,problem_multipart_id,problem_multipart_position,problem_type,problem_skill_code,problem_contains_image,problem_contains_equation,problem_contains_video
0,10MFND3HAJ,2MHCTW1IIN,1,Multiple Choice,6.RP.A.3b,0,0,1
1,IH3MOE7AF,1UEQMXOOFA,1,Multiple Choice,6.RP.A.3b,0,0,0
2,14YC7CEE2N,1UEQMXOOFA,2,Ungraded Open Response,6.RP.A.3b,0,0,0
3,16L5KQWLN7,1W7DRPNEJL,1,Ungraded Open Response,6.RP.A.3b,0,0,0
4,BU0LO0LDD,1Z6MGLD8VK,1,Ungraded Open Response,6.RP.A.3b,0,0,0
5,W9WPQSAU5,MBYKGWG5L,1,Ungraded Open Response,6.RP.A.3b,0,0,0
6,2OHCH5C5BD,O0EI8SMXR,1,Number,6.RP.A.2,0,0,0
7,9CB1OILA2,A1DWWVVLC,1,Ungraded Open Response,6.RP.A.3a,0,0,0
8,1JCPX2ZOXQ,K65VD17P2,1,Ungraded Open Response,6.RP.A.3a,0,0,0
9,AANYMYPL6,1K9KSMZ0FV,1,Multiple Choice,6.RP.A.2,0,0,0


0

#### 'sequence_details' dataframe

In [14]:
print("Preprocessing `sequence_details`")

SEQUENCE_COLS_TO_KEEP = [
    'sequence_id',
    'sequence_folder_path_level_1',
    'sequence_folder_path_level_2',
    'sequence_folder_path_level_3',
    'sequence_name'
]
SEQ_COLS_DROP = [c for c in data_frames['sequence_details'].columns if c not in SEQUENCE_COLS_TO_KEEP]
data_frames["sequence_details"] = data_frames["sequence_details"].drop(columns=SEQ_COLS_DROP)
print(f"    Droped columns: {SEQ_COLS_DROP}")

print(f"    Optimizing dataframe...")
data_frames["sequence_details"] = optimize_dataframe(data_frames["sequence_details"])


df_exploration('sequence_details', data_frames['sequence_details'])
save_dataframe('sequence_details',data_frames['sequence_details'])
del data_frames['sequence_details'], SEQUENCE_COLS_TO_KEEP, SEQ_COLS_DROP
gc.collect()

Preprocessing `sequence_details`
    Droped columns: ['sequence_folder_path_level_4', 'sequence_folder_path_level_5', 'sequence_problem_ids']
    Optimizing dataframe...
  Memory reduced: 4.51 MB → 4.36 MB (3.3% reduction)
sequence_details (shape: (10229, 5))
Report:


Unnamed: 0,Column,Type,Missing,Missing%,Unique
0,sequence_id,object,0,0.00%,10228
1,sequence_folder_path_level_1,object,0,0.00%,2
2,sequence_folder_path_level_2,object,0,0.00%,17
3,sequence_folder_path_level_3,object,0,0.00%,177
4,sequence_name,object,0,0.00%,10225


  Duplicates: 0
  Sample Data:


Unnamed: 0,sequence_id,sequence_folder_path_level_1,sequence_folder_path_level_2,sequence_folder_path_level_3,sequence_name
0,K1U9M2PVF,EngageNY/Eureka Math (© by Great Minds®) *,Algebra I,Module 1 - Relationships Between Quantities an...,End-of-Module---Alg 1.1 End-of-Module Assessment
1,1XEPEYCPC3,EngageNY/Eureka Math (© by Great Minds®) *,Algebra I,Module 1 - Relationships Between Quantities an...,Mid-Module---Alg1.1 Mid-Module Assessment
2,20SXJMMSRG,EngageNY/Eureka Math (© by Great Minds®) *,Algebra I,Module 1 - Relationships Between Quantities an...,"Problem Set---Algebra I, M1, Lesson 1 (N.Q.A.1..."
3,1SMS0A4N5G,EngageNY/Eureka Math (© by Great Minds®) *,Algebra I,Module 1 - Relationships Between Quantities an...,"Classwork---Algebra I, M1, Lesson 2 (N.Q.A.1, ..."
4,1BROMSHRRA,EngageNY/Eureka Math (© by Great Minds®) *,Algebra I,Module 1 - Relationships Between Quantities an...,"Exit Ticket---Algebra 1, M1, Lesson 2 (N.Q.1, ..."
5,520QV3Q8S,EngageNY/Eureka Math (© by Great Minds®) *,Algebra I,Module 1 - Relationships Between Quantities an...,"Problem Set---Algebra I, M1, Lesson 2 (N.Q.A.1..."
6,2FMEH9Y63M,EngageNY/Eureka Math (© by Great Minds®) *,Algebra I,Module 1 - Relationships Between Quantities an...,"Classwork---Algebra I, M1, Lesson 3 (N.Q.1, N...."
7,1RASYU5JGC,EngageNY/Eureka Math (© by Great Minds®) *,Algebra I,Module 1 - Relationships Between Quantities an...,"Exit Ticket---Algebra 1, M1, Lesson 3 (N.Q.1, ..."
8,2MZEXEMAEN,EngageNY/Eureka Math (© by Great Minds®) *,Algebra I,Module 1 - Relationships Between Quantities an...,"Problem Set---Algebra I, M1, Lesson 3 (N.Q.A.1..."
9,IZ1NEEPQP,EngageNY/Eureka Math (© by Great Minds®) *,Algebra I,Module 1 - Relationships Between Quantities an...,"Classwork---Algebra I, M1, Lesson 4 (N.Q.A.1, ..."


0

#### 'sequence_relationships' dataframe

In [15]:
print("Preprocessing `sequence_relationships`")
print(f"    Optimizing dataframe...")
data_frames['sequence_relationships'] = optimize_dataframe(data_frames['sequence_relationships'])

df_exploration('sequence_relationships', data_frames['sequence_relationships'])
save_dataframe('sequence_relationships',data_frames['sequence_relationships'])
del data_frames['sequence_relationships']
gc.collect()

Preprocessing `sequence_relationships`
    Optimizing dataframe...
  Memory reduced: 1.47 MB → 1.50 MB (-2.4% reduction)
sequence_relationships (shape: (12564, 2))
Report:


Unnamed: 0,Column,Type,Missing,Missing%,Unique
0,unit_test_sequence_id,object,0,0.00%,240
1,in_unit_sequence_id,object,0,0.00%,9186


  Duplicates: 0
  Sample Data:


Unnamed: 0,unit_test_sequence_id,in_unit_sequence_id
0,K1U9M2PVF,1XEPEYCPC3
1,K1U9M2PVF,20SXJMMSRG
2,K1U9M2PVF,1SMS0A4N5G
3,K1U9M2PVF,1BROMSHRRA
4,K1U9M2PVF,520QV3Q8S
5,K1U9M2PVF,2FMEH9Y63M
6,K1U9M2PVF,1RASYU5JGC
7,K1U9M2PVF,2MZEXEMAEN
8,K1U9M2PVF,IZ1NEEPQP
9,K1U9M2PVF,YG6VSZANE


0

#### 'training_unit_test_scores' dataframe

In [16]:
print("Preprocessing `training_unit_test_scores`")
print(f"    Optimizing dataframe...")
data_frames['training_unit_test_scores'] = optimize_dataframe(data_frames['training_unit_test_scores'])

df_exploration('training_unit_test_scores', data_frames['training_unit_test_scores'])
save_dataframe('training_unit_test_scores',data_frames['training_unit_test_scores'])
del data_frames['training_unit_test_scores']
gc.collect()

Preprocessing `training_unit_test_scores`
    Optimizing dataframe...
  Memory reduced: 54.04 MB → 51.02 MB (5.6% reduction)
training_unit_test_scores (shape: (452439, 3))
Report:


Unnamed: 0,Column,Type,Missing,Missing%,Unique
0,assignment_log_id,object,0,0.00%,42343
1,problem_id,object,0,0.00%,1835
2,score,int8,0,0.00%,2


  Duplicates: 0
  Sample Data:


Unnamed: 0,assignment_log_id,problem_id,score
0,1CEASUAUQJ,18J6436AS5,1
1,2IMKPEIL2Q,9RMI4CZU9,0
2,2IMKPEIL2Q,8F4U5WWTV,0
3,2IMKPEIL2Q,27D3I359NE,1
4,2IMKPEIL2Q,22DY4PFVMV,1
5,2IMKPEIL2Q,ZQMHFZJ53,1
6,2IMKPEIL2Q,1II2JVYEQV,0
7,2MZN9L748R,2N2SARA9Q6,1
8,2MZN9L748R,123M9UFYL2,0
9,2MZN9L748R,1WL078QSL4,0


0

### EDA and Visualization

#### Action Logs

## Analysis

### Analysis 1: Student-Problem Behavior Co-occurence

#### DataFrames Merging

In [6]:
action_logs = load_dataframe('action_logs')
assignment_details = load_dataframe('assignment_details')
problem_details = load_dataframe('problem_details')
hint_details = load_dataframe('hint_details')
explanation_details = load_dataframe('explanation_details')
sequence_details = load_dataframe('sequence_details')

print("\n[1/5] Merging action logs with assignment details...")

student_interactions = action_logs.merge(
    assignment_details[['assignment_log_id', 'student_id', 'teacher_id', 
                        'class_id', 'sequence_id', 'assignment_start_time']], 
    on='assignment_log_id', 
    how='left'
)

print(f"  Merged: {len(student_interactions):,} rows")
del action_logs
gc.collect()

print("[2/5] Merging with problem details (content features)...")

student_interactions = student_interactions.merge(
    problem_details[['problem_id', 'problem_type', 'problem_skill_code', 
                     'problem_multipart_position', 'problem_contains_image',
                     'problem_contains_equation', 'problem_contains_video']], 
    on='problem_id', 
    how='left'
)

print(f"  Merged: {len(student_interactions):,} rows")
del problem_details
gc.collect()

print("[3/5] Merging with hint details...")

student_interactions = student_interactions.merge(
    hint_details[['hint_id', 'hint_position', 'hint_contains_image',
                  'hint_contains_equation', 'hint_contains_video']], 
    on='hint_id', 
    how='left'
)

print(f"  Merged: {len(student_interactions):,} rows")
del hint_details
gc.collect()

print("[4/5] Merging with explanation details...")

student_interactions = student_interactions.merge(
    explanation_details[['explanation_id', 'explanation_contains_image',
                         'explanation_contains_equation', 'explanation_contains_video']], 
    on='explanation_id', 
    how='left'
)

print(f"  Merged: {len(student_interactions):,} rows")
del explanation_details
gc.collect()

print("[5/5] Merging with sequence details...")

student_interactions = student_interactions.merge(
    sequence_details[['sequence_id', 'sequence_folder_path_level_1',
                      'sequence_folder_path_level_2', 'sequence_folder_path_level_3']], 
    on='sequence_id', 
    how='left'
)

print(f"  Merged: {len(student_interactions):,} rows")
del sequence_details, assignment_details
gc.collect()

print(f"Final merged dataset: {len(student_interactions):,} records")
display(student_interactions)
save_dataframe('student_interactions', student_interactions)
del student_interactions
gc.collect()


[1/5] Merging action logs with assignment details...
  Merged: 16,252,841 rows
[2/5] Merging with problem details (content features)...
  Merged: 16,252,841 rows
[3/5] Merging with hint details...
  Merged: 16,252,841 rows
[4/5] Merging with explanation details...
  Merged: 16,252,841 rows
[5/5] Merging with sequence details...
  Merged: 16,252,841 rows
Final merged dataset: 16,252,841 records


Unnamed: 0,assignment_log_id,timestamp,problem_id,max_attempts,available_core_tutoring,score_viewable,continuous_score_viewable,action,hint_id,explanation_id,student_id,teacher_id,class_id,sequence_id,assignment_start_time,problem_type,problem_skill_code,problem_multipart_position,problem_contains_image,problem_contains_equation,problem_contains_video,hint_position,hint_contains_image,hint_contains_equation,hint_contains_video,explanation_contains_image,explanation_contains_equation,explanation_contains_video,sequence_folder_path_level_1,sequence_folder_path_level_2,sequence_folder_path_level_3
0,1000AQM2VK,2021-04-13 20:09:24.598999977,1IEH49XWH5,3.0000,answer,1.0000,1.0000,problem_started,,,1P7YNUPW8I,16HRFF60QL,3RHBOEMLK,1HGLJT7ITA,2021-04-13 20:09:24.022000074,Number,5.NF.B.4a-2,1.0000,0.0000,1.0000,0.0000,,,,,,,,EngageNY/Eureka Math (© by Great Minds®) *,Grade 5,Module 4 - Multiplying and Dividing Fractions ...
1,1000AQM2VK,2021-04-13 20:09:48.509999989,1IEH49XWH5,,,,,correct_response,,,1P7YNUPW8I,16HRFF60QL,3RHBOEMLK,1HGLJT7ITA,2021-04-13 20:09:24.022000074,Number,5.NF.B.4a-2,1.0000,0.0000,1.0000,0.0000,,,,,,,,EngageNY/Eureka Math (© by Great Minds®) *,Grade 5,Module 4 - Multiplying and Dividing Fractions ...
2,1000AQM2VK,2021-04-13 20:09:48.516000032,1IEH49XWH5,,,,,problem_finished,,,1P7YNUPW8I,16HRFF60QL,3RHBOEMLK,1HGLJT7ITA,2021-04-13 20:09:24.022000074,Number,5.NF.B.4a-2,1.0000,0.0000,1.0000,0.0000,,,,,,,,EngageNY/Eureka Math (© by Great Minds®) *,Grade 5,Module 4 - Multiplying and Dividing Fractions ...
3,1000AQM2VK,2021-04-13 20:09:50.427000046,27YYTVQK6K,1.0000,no_tutoring,1.0000,0.0000,problem_started,,,1P7YNUPW8I,16HRFF60QL,3RHBOEMLK,1HGLJT7ITA,2021-04-13 20:09:24.022000074,Ungraded Open Response,5.NF.B.4a-2,2.0000,0.0000,0.0000,0.0000,,,,,,,,EngageNY/Eureka Math (© by Great Minds®) *,Grade 5,Module 4 - Multiplying and Dividing Fractions ...
4,1000AQM2VK,2021-04-13 20:11:02.394999981,27YYTVQK6K,,,,,problem_finished,,,1P7YNUPW8I,16HRFF60QL,3RHBOEMLK,1HGLJT7ITA,2021-04-13 20:09:24.022000074,Ungraded Open Response,5.NF.B.4a-2,2.0000,0.0000,0.0000,0.0000,,,,,,,,EngageNY/Eureka Math (© by Great Minds®) *,Grade 5,Module 4 - Multiplying and Dividing Fractions ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16252836,ZZZYX8591,2021-03-08 13:37:04.628000021,2O5FUWLMRV,,,,,correct_response,,,1DSJ2JL7FM,1QN1NE65QL,1KCUESOSEH,21OPH9ZWGP,2021-03-08 13:30:22.384000063,Algebraic Expression,3.NF.A.1,6.0000,1.0000,0.0000,0.0000,,,,,,,,EngageNY/Eureka Math (© by Great Minds®) *,Grade 3,Module 5 - Fractions as Numbers on the Number ...
16252837,ZZZYX8591,2021-03-08 13:37:04.632999897,2O5FUWLMRV,,,,,problem_finished,,,1DSJ2JL7FM,1QN1NE65QL,1KCUESOSEH,21OPH9ZWGP,2021-03-08 13:30:22.384000063,Algebraic Expression,3.NF.A.1,6.0000,1.0000,0.0000,0.0000,,,,,,,,EngageNY/Eureka Math (© by Great Minds®) *,Grade 3,Module 5 - Fractions as Numbers on the Number ...
16252838,ZZZYX8591,2021-03-08 13:37:05.346999884,19IDWCF0I8,3.0000,hint,1.0000,1.0000,problem_started,,,1DSJ2JL7FM,1QN1NE65QL,1KCUESOSEH,21OPH9ZWGP,2021-03-08 13:30:22.384000063,Algebraic Expression,3.NF.A.1,1.0000,1.0000,0.0000,0.0000,,,,,,,,EngageNY/Eureka Math (© by Great Minds®) *,Grade 3,Module 5 - Fractions as Numbers on the Number ...
16252839,ZZZYX8591,2021-03-08 13:38:25.901000023,19IDWCF0I8,,,,,correct_response,,,1DSJ2JL7FM,1QN1NE65QL,1KCUESOSEH,21OPH9ZWGP,2021-03-08 13:30:22.384000063,Algebraic Expression,3.NF.A.1,1.0000,1.0000,0.0000,0.0000,,,,,,,,EngageNY/Eureka Math (© by Great Minds®) *,Grade 3,Module 5 - Fractions as Numbers on the Number ...


0

#### Feature Engineering

In [5]:
student_interactions = load_dataframe('student_interactions')

# Filter to problem-related actions only (exclude NaN problem_ids)
print("\n[1/8] Filtering to problem-related actions...")
initial_rows = len(student_interactions)
student_interactions = student_interactions[student_interactions['problem_id'].notna()]
print(f"  Filtered: {len(student_interactions):,} rows ({100 * len(student_interactions)/initial_rows:.1f}% retained)")

student_interactions = student_interactions.sort_values(
    ['assignment_log_id', 'problem_id', 'timestamp']
)

# Calculate response time
print("[2/8] Calculating response times...")
student_interactions['response_time'] = student_interactions.groupby(
    ['assignment_log_id', 'problem_id']
)['timestamp'].diff()
student_interactions['response_time'] = student_interactions['response_time'].dt.total_seconds()

# Categorize response times
bins = [-1, 10, 30, 60, 180, float("inf")]
labels = ["very_fast", "fast", "moderate", "slow", "very_slow"]
student_interactions["response_time_cat"] = pd.cut(
    student_interactions["response_time"],
    bins=bins,
    labels=labels
).astype("object").fillna("initial")

# Extract curriculum type
print("[3/8] Extracting curriculum information...")
student_interactions['curriculum'] = student_interactions['sequence_folder_path_level_1'].fillna('other')

# Extract grade level  
print("[4/8] Extracting grade level...")
student_interactions['grade_level'] = student_interactions['sequence_folder_path_level_2'].fillna('unknown')

# Extract unit information
print("[5/8] Extracting unit information...")
student_interactions['unit'] = student_interactions['sequence_folder_path_level_3'].fillna('unknown')

# Categorize skill complexity (based on skill code depth)
print("[6/8] Categorizing skill complexity...")
student_interactions['skill_depth'] = student_interactions['problem_skill_code'].apply(
    lambda x: str(x).count('.') if pd.notna(x) else 0
)
student_interactions['skill_complexity'] = pd.cut(
    student_interactions['skill_depth'],
    bins=[-1, 0, 1, 10],
    labels=['basic', 'intermediate', 'advanced']
)

# Problem content complexity
print("[7/8] Calculating problem content complexity...")
student_interactions['content_complexity'] = (
    student_interactions['problem_contains_image'].fillna(0) +
    student_interactions['problem_contains_equation'].fillna(0) +
    student_interactions['problem_contains_video'].fillna(0)
).astype(int)

# Multipart problem indicator
print("[8/8] Identifying multipart problems...")
student_interactions['is_multipart'] = student_interactions['problem_multipart_position'] > 1

print(f"\n✓ Feature engineering complete")
print(f"  Total interactions: {len(student_interactions):,}")
print(f"  Unique assignments: {student_interactions['assignment_log_id'].nunique():,}")
print(f"  Unique problems: {student_interactions['problem_id'].nunique():,}")

save_dataframe('student_interactions_fe', student_interactions)
del student_interactions
gc.collect()


[1/8] Filtering to problem-related actions...
  Filtered: 16,252,841 rows (100.0% retained)
[2/8] Calculating response times...
[3/8] Extracting curriculum information...
[4/8] Extracting grade level...
[5/8] Extracting unit information...
[6/8] Categorizing skill complexity...
[7/8] Calculating problem content complexity...
[8/8] Identifying multipart problems...

✓ Feature engineering complete
  Total interactions: 16,252,841
  Unique assignments: 638,201
  Unique problems: 57,360


76

#### Transaction Creation

In [5]:
def create_baskets(transaction):
    """
    Apply semantic basketing to reduce dimensionality while maintaining interpretability.
    Groups related items into baskets.
    """
    baskets = []
    
    # Track categories
    has_hints = False
    has_explanations = False
    has_images = False
    has_equations = False
    
    # Action aggregation
    action_types = set()
    
    for item in transaction:
        # Keep individual items but track for basket creation
        baskets.append(item)
        
        # Track hint usage
        if item.startswith('hint_'):
            has_hints = True
            if 'image' in item:
                has_images = True
            if 'equation' in item:
                has_equations = True
        
        # Track explanation usage
        elif item.startswith('expl_') or item == 'used_explanation':
            has_explanations = True
            if 'image' in item:
                has_images = True
            if 'equation' in item:
                has_equations = True
        
        # Track action types
        elif item.startswith('action_'):
            action_type = item.split('_')[1]  # Extract action type (correct, wrong, etc.)
            action_types.add(action_type)
    
    # Add basket indicators
    if has_hints:
        baskets.append('basket_used_hints')
    
    if has_explanations:
        baskets.append('basket_used_explanations')
    
    if has_hints or has_explanations:
        baskets.append('basket_used_help_resources')
    
    if has_images:
        baskets.append('basket_contains_images')
    
    if has_equations:
        baskets.append('basket_contains_equations')
    
    # Add action type baskets
    if 'correct' in action_types:
        baskets.append('basket_had_correct_response')
    
    if 'wrong' in action_types:
        baskets.append('basket_had_wrong_response')
    
    if 'hint' in action_types or 'explanation' in action_types or 'answer' in action_types:
        baskets.append('basket_requested_help')
    
    return list(set(baskets))  # Remove duplicates


def process_interaction_group(args):
    (asg_id, prob_id), group = args
    transaction = []

    actions = group['action'].value_counts().to_dict()

    # Action binning: Aggregate by frequency
    for action_type in [
        'correct_response', 'wrong_response', 'hint_requested',
        'explanation_requested', 'answer_requested'
    ]:
        count = actions.get(action_type, 0)
        if count == 1:
            transaction.append(f"action_{action_type}_once")
        elif 2 <= count <= 3:
            transaction.append(f"action_{action_type}_multiple")
        elif count > 3:
            transaction.append(f"action_{action_type}_many")

    # Response time binning (already binned in feature engineering)
    rt_cats = group['response_time_cat'].value_counts()
    if len(rt_cats):
        dominant = rt_cats.index[0]
        if dominant != 'initial':
            transaction.append(f"timing_{dominant}")

    def first_valid(col):
        vals = group[col].dropna()
        return vals.iloc[0] if len(vals) else None

    ptype = first_valid('problem_type')
    if ptype:
        transaction.append(f"probtype_{str(ptype).replace(' ', '_')[:20]}")

    # Content complexity binning
    complexity = first_valid('content_complexity')
    if complexity is not None:
        transaction.append(
            "content_simple" if complexity == 0 else
            "content_moderate" if complexity == 1 else
            "content_complex"
        )

    tutoring = first_valid('available_core_tutoring')
    if tutoring is not None:
        transaction.append(f"tutoring_{tutoring}")

    # Attempts binning
    attempts = first_valid('max_attempts')
    if attempts is not None:
        transaction.append(
            "attempts_limited" if attempts <= 2 else
            "attempts_moderate" if attempts <= 5 else
            "attempts_unlimited"
        )

    # Hint depth binning
    hint_rows = group[group['hint_id'].notna()]
    if len(hint_rows):
        max_hint = hint_rows['hint_position'].max()
        transaction.append(
            "hint_first_only" if max_hint == 1 else
            "hint_shallow" if max_hint <= 2 else
            "hint_deep"
        )
        if hint_rows['hint_contains_image'].sum() > 0:
            transaction.append("hint_has_image")
        if hint_rows['hint_contains_equation'].sum() > 0:
            transaction.append("hint_has_equation")

    if group['explanation_id'].notna().any():
        transaction.append("used_explanation")
        if group['explanation_contains_image'].sum() > 0:
            transaction.append("expl_has_image")
        if group['explanation_contains_equation'].sum() > 0:
            transaction.append("expl_has_equation")

    # Skill complexity (already binned)
    skill = first_valid('skill_complexity')
    if skill:
        transaction.append(f"skill_{skill}")

    # Curriculum - keep top level only
    curr = first_valid('curriculum')
    if curr and curr != 'other':
        transaction.append(f"curriculum_{curr.split('/')[0].replace(' ', '_')[:15]}")

    # Grade level - bin into broader categories
    grade = first_valid('grade_level')
    if grade and grade != 'unknown':
        grade_str = str(grade).lower()
        if any(x in grade_str for x in ['k', '1', '2', '3', '4']):
            transaction.append("grade_elementary")
        elif any(x in grade_str for x in ['5', '6', '7', '8']):
            transaction.append("grade_middle")
        elif any(x in grade_str for x in ['9', '10', '11', '12']):
            transaction.append("grade_high")
        else:
            transaction.append(f"grade_{grade_str.replace(' ', '_')[:15]}")

    if group['is_multipart'].any():
        transaction.append("multipart_problem")

    score = first_valid('score_viewable')
    if score is not None:
        transaction.append("score_visible" if score == 1 else "score_hidden")

    # Outcome binning
    if 'correct_response' in actions:
        transaction.append(
            "outcome_success_after_struggle"
            if 'wrong_response' in actions
            else "outcome_first_attempt_success"
        )
    elif 'wrong_response' in actions:
        transaction.append("outcome_failure")

    # Apply basketing to create higher-level groupings
    if len(transaction) >= 3:
        return create_baskets(transaction)
    else:
        return None


# ==============================================================================
# SEQUENTIAL PROCESSING WITH BINNING AND BASKETING
# ==============================================================================

A1_TRANSACTIONS_OUTPUT_DIR = "../dataset/A1_transactions_parquet"
os.makedirs(A1_TRANSACTIONS_OUTPUT_DIR, exist_ok=True)

input_path = ProcessingConfig.preprocessed_df_dir / 'student_interactions_fe.parquet'

print("="*70)
print("SEQUENTIAL TRANSACTION PROCESSING WITH BINNING & BASKETING")
print("="*70)
print(f"Reading from: {input_path}")
print(f"Writing to: {A1_TRANSACTIONS_OUTPUT_DIR}\n")

print("Binning strategies applied:")
print("  ✓ Actions: Frequency-based (once/multiple/many)")
print("  ✓ Response time: Categorical bins (fast/moderate/slow)")
print("  ✓ Content complexity: 3 bins (simple/moderate/complex)")
print("  ✓ Attempts: 3 bins (limited/moderate/unlimited)")
print("  ✓ Hint depth: 3 bins (first_only/shallow/deep)")
print("  ✓ Skill complexity: 3 bins (basic/intermediate/advanced)")
print("  ✓ Grade level: 3 bins (elementary/middle/high)")
print("\nBasket categories created:")
print("  ✓ basket_used_hints")
print("  ✓ basket_used_explanations")
print("  ✓ basket_used_help_resources")
print("  ✓ basket_contains_images")
print("  ✓ basket_contains_equations")
print("  ✓ basket_had_correct_response")
print("  ✓ basket_had_wrong_response")
print("  ✓ basket_requested_help\n")

# Open parquet file for streaming
parquet_file = pq.ParquetFile(input_path)
print(f"Total rows in input: {parquet_file.metadata.num_rows:,}\n")

# Configuration
CHUNK_SIZE = 200_000  # Rows to read at a time
OUTPUT_BUFFER_SIZE = 100_000  # Transactions to buffer before writing

records_buffer = []
file_counter = 0
total_transactions = 0
total_rows_processed = 0

# Track incomplete groups across chunks
current_group_key = None
current_group_rows = []

# Statistics tracking
basket_stats = {
    'with_baskets': 0,
    'basket_types': set()
}

print("Processing transactions sequentially...\n")

# Process parquet file in batches
for batch in tqdm(
    parquet_file.iter_batches(batch_size=CHUNK_SIZE),
    desc="Processing chunks",
    total=(parquet_file.metadata.num_rows // CHUNK_SIZE) + 1
):
    chunk_df = batch.to_pandas()
    total_rows_processed += len(chunk_df)
    
    # Sort within chunk to ensure grouping works correctly
    chunk_df = chunk_df.sort_values(
        ['assignment_log_id', 'problem_id', 'timestamp']
    ).reset_index(drop=True)
    
    # Process each row
    for idx, row in chunk_df.iterrows():
        group_key = (row['assignment_log_id'], row['problem_id'])
        
        # Check if we're continuing the same group or starting a new one
        if current_group_key == group_key:
            # Continue accumulating rows for current group
            current_group_rows.append(row)
        else:
            # Process the completed group (if any)
            if current_group_key is not None and current_group_rows:
                group_df = pd.DataFrame(current_group_rows)
                result = process_interaction_group((current_group_key, group_df))
                
                if result is not None:
                    # Track basket statistics
                    basket_items = [item for item in result if item.startswith('basket_')]
                    if basket_items:
                        basket_stats['with_baskets'] += 1
                        basket_stats['basket_types'].update(basket_items)
                    
                    records_buffer.append({
                        "assignment_log_id": current_group_key[0],
                        "problem_id": current_group_key[1],
                        "transaction": result
                    })
                    total_transactions += 1
                
                # Write buffer if it reaches the limit
                if len(records_buffer) >= OUTPUT_BUFFER_SIZE:
                    table = pa.Table.from_pylist(records_buffer)
                    pq.write_table(
                        table,
                        os.path.join(A1_TRANSACTIONS_OUTPUT_DIR, f"part-{file_counter:04d}.parquet"),
                        compression="snappy"
                    )
                    file_counter += 1
                    records_buffer = []
                    gc.collect()
            
            # Start new group
            current_group_key = group_key
            current_group_rows = [row]
    
    # Clear chunk from memory
    del chunk_df
    gc.collect()

# Process the final group
if current_group_key is not None and current_group_rows:
    group_df = pd.DataFrame(current_group_rows)
    result = process_interaction_group((current_group_key, group_df))
    
    if result is not None:
        # Track basket statistics
        basket_items = [item for item in result if item.startswith('basket_')]
        if basket_items:
            basket_stats['with_baskets'] += 1
            basket_stats['basket_types'].update(basket_items)
        
        records_buffer.append({
            "assignment_log_id": current_group_key[0],
            "problem_id": current_group_key[1],
            "transaction": result
        })
        total_transactions += 1

# Write remaining records
if records_buffer:
    table = pa.Table.from_pylist(records_buffer)
    pq.write_table(
        table,
        os.path.join(A1_TRANSACTIONS_OUTPUT_DIR, f"part-{file_counter:04d}.parquet"),
        compression="snappy"
    )
    file_counter += 1

# Final cleanup
del records_buffer, current_group_rows
gc.collect()

print(f"\n{'='*70}")
print(f"✓ PROCESSING COMPLETE!")
print(f"{'='*70}")
print(f"Total rows processed: {total_rows_processed:,}")
print(f"Total transactions created: {total_transactions:,}")
print(f"Transactions with baskets: {basket_stats['with_baskets']:,} ({100*basket_stats['with_baskets']/total_transactions:.1f}%)")
print(f"\nBasket types used:")
for basket_type in sorted(basket_stats['basket_types']):
    print(f"  • {basket_type}")
print(f"\nOutput directory: {A1_TRANSACTIONS_OUTPUT_DIR}")
print(f"Parquet files created: {file_counter}")
print(f"{'='*70}")

SEQUENTIAL TRANSACTION PROCESSING WITH BINNING & BASKETING
Reading from: ../dataset/dataframes/student_interactions_fe.parquet
Writing to: ../dataset/A1_transactions_parquet

Binning strategies applied:
  ✓ Actions: Frequency-based (once/multiple/many)
  ✓ Response time: Categorical bins (fast/moderate/slow)
  ✓ Content complexity: 3 bins (simple/moderate/complex)
  ✓ Attempts: 3 bins (limited/moderate/unlimited)
  ✓ Hint depth: 3 bins (first_only/shallow/deep)
  ✓ Skill complexity: 3 bins (basic/intermediate/advanced)
  ✓ Grade level: 3 bins (elementary/middle/high)

Basket categories created:
  ✓ basket_used_hints
  ✓ basket_used_explanations
  ✓ basket_used_help_resources
  ✓ basket_contains_images
  ✓ basket_contains_equations
  ✓ basket_had_correct_response
  ✓ basket_had_wrong_response
  ✓ basket_requested_help

Total rows in input: 16,252,841

Processing transactions sequentially...



Processing chunks:   0%|          | 0/82 [00:00<?, ?it/s]

Processing chunks: 97it [3:44:29, 138.86s/it]                          



✓ PROCESSING COMPLETE!
Total rows processed: 16,252,841
Total transactions created: 5,245,859
Transactions with baskets: 3,614,685 (68.9%)

Basket types used:
  • basket_contains_equations
  • basket_contains_images
  • basket_had_correct_response
  • basket_had_wrong_response
  • basket_requested_help
  • basket_used_explanations
  • basket_used_help_resources
  • basket_used_hints

Output directory: ../dataset/A1_transactions_parquet
Parquet files created: 53


#### Transaction Encoding

In [7]:
# Load dataset
A1_TRANSACTIONS_OUTPUT_DIR = "../dataset/A1_transactions_parquet"
dataset = ds.dataset(A1_TRANSACTIONS_OUTPUT_DIR, format="parquet")

print("="*70)
print("TRANSACTION ANALYSIS WITH BINNING & BASKETING")
print("="*70)

# Display sample transactions before processing
print("\nFetching sample transactions...")
sample_batch = next(dataset.to_batches(columns=["assignment_log_id", "problem_id", "transaction"], batch_size=10))
sample_df = sample_batch.to_pandas()

print(f"\nShowing first {len(sample_df)} transactions:\n")
for idx, row in sample_df.iterrows():
    # Separate baskets from regular items
    items = row['transaction']
    basket_items = [item for item in items if item.startswith('basket_')]
    regular_items = [item for item in items if not item.startswith('basket_')]
    
    print(f"Transaction {idx + 1}:")
    print(f"  Assignment ID: {row['assignment_log_id']}")
    print(f"  Problem ID: {row['problem_id']}")
    print(f"  Total items: {len(items)}")
    print(f"    Regular items ({len(regular_items)}): {regular_items}")
    print(f"    Basket items ({len(basket_items)}): {basket_items}")
    print()

print("="*70)
print("COUNTING ITEM FREQUENCIES")
print("="*70)

# ---- Pass 1: Count items and analyze binning/basketing ----
print("\nPass 1: Counting item frequencies...")
item_counter = Counter()
basket_counter = Counter()
regular_counter = Counter()
total_tx = 0

for batch in tqdm(dataset.to_batches(columns=["transaction"], batch_size=50_000), desc="Counting"):
    txs = batch.column("transaction").to_pylist()
    total_tx += len(txs)
    for t in txs:
        item_counter.update(t)
        # Separate baskets from regular items
        for item in t:
            if item.startswith('basket_'):
                basket_counter[item] += 1
            else:
                regular_counter[item] += 1

print(f"\n✓ Total transactions: {total_tx:,}")
print(f"✓ Unique items found: {len(item_counter)}")
print(f"  - Regular items: {len(regular_counter)}")
print(f"  - Basket items: {len(basket_counter)}")

# Analyze binning categories
print(f"\nBinning category breakdown:")
bin_categories = {}
for item in regular_counter.keys():
    prefix = item.split('_')[0] if '_' in item else 'other'
    if prefix not in bin_categories:
        bin_categories[prefix] = []
    bin_categories[prefix].append(item)

for category, items in sorted(bin_categories.items()):
    print(f"  {category}: {len(items)} items")

# Show basket statistics
print(f"\nBasket usage statistics:")
for basket, count in sorted(basket_counter.items()):
    percentage = 100 * count / total_tx
    print(f"  {basket}: {count:,} ({percentage:.1f}% of transactions)")

# Prune low-support items early
min_count = int(AnalysisConfig.min_support * total_tx)
items = [i for i, c in item_counter.items() if c >= min_count]
item_to_idx = {item: i for i, item in enumerate(items)}

# Separate filtered items
filtered_baskets = [i for i in items if i.startswith('basket_')]
filtered_regular = [i for i in items if not i.startswith('basket_')]

print(f"\n✓ Items after filtering (min_support={AnalysisConfig.min_support}):")
print(f"  Total kept: {len(items)}")
print(f"  - Regular items: {len(filtered_regular)}")
print(f"  - Basket items: {len(filtered_baskets)}")
print(f"  Removed: {len(item_counter) - len(items)} items")

# Show most common items by category
print(f"\nTop 10 most frequent regular items:")
regular_items_sorted = sorted(
    [(item, count) for item, count in regular_counter.items() if item in items],
    key=lambda x: x[1],
    reverse=True
)
for item, count in regular_items_sorted[:10]:
    support = count / total_tx
    print(f"  {item}: {count:,} ({support:.2%})")

print(f"\nAll basket items (ranked by frequency):")
basket_items_sorted = sorted(
    [(item, count) for item, count in basket_counter.items() if item in items],
    key=lambda x: x[1],
    reverse=True
)
for item, count in basket_items_sorted:
    support = count / total_tx
    print(f"  {item}: {count:,} ({support:.2%})")

# Show binning effectiveness
print(f"\nBinning effectiveness:")
for category, category_items in sorted(bin_categories.items()):
    kept = len([i for i in category_items if i in items])
    removed = len(category_items) - kept
    if len(category_items) > 0:
        print(f"  {category}: {kept} kept, {removed} removed (from {len(category_items)} total)")

print(f"\n{'='*70}")
print("ENCODING TRANSACTION MATRIX")
print(f"{'='*70}\n")

# ---- Pass 2: Encode (batch-wise, sparse-friendly) ----
print("Pass 2: Encoding transactions...")
encoded_chunks = []

for batch in tqdm(
    dataset.to_batches(columns=["transaction"], batch_size=25_000),
    desc="Encoding"
):
    txs = batch.column("transaction").to_pylist()
    arr = np.zeros((len(txs), len(items)), dtype=bool)

    for r, t in enumerate(txs):
        for item in t:
            idx = item_to_idx.get(item)
            if idx is not None:
                arr[r, idx] = True

    encoded_chunks.append(pd.DataFrame(arr, columns=items))
    del arr, txs
    gc.collect()

df_transactions = pd.concat(encoded_chunks, ignore_index=True)
del encoded_chunks
gc.collect()

print(f"\n{'='*70}")
print("ENCODING COMPLETE")
print(f"{'='*70}")
print(f"✓ Transaction matrix shape: {df_transactions.shape}")
print(f"  Rows (transactions): {df_transactions.shape[0]:,}")
print(f"  Columns (items): {df_transactions.shape[1]:,}")
print(f"    - Regular items: {len([c for c in df_transactions.columns if not c.startswith('basket_')])}")
print(f"    - Basket items: {len([c for c in df_transactions.columns if c.startswith('basket_')])}")
print(f"✓ Memory usage: {df_transactions.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# Calculate sparsity
total_cells = df_transactions.shape[0] * df_transactions.shape[1]
filled_cells = df_transactions.sum().sum()
sparsity = 1 - (filled_cells / total_cells)
print(f"✓ Matrix sparsity: {sparsity:.2%}")
print(f"  Filled cells: {filled_cells:,} / {total_cells:,}")

# Analyze basket vs regular item density
basket_cols = [c for c in df_transactions.columns if c.startswith('basket_')]
regular_cols = [c for c in df_transactions.columns if not c.startswith('basket_')]

if basket_cols:
    basket_density = df_transactions[basket_cols].sum().sum() / (len(df_transactions) * len(basket_cols))
    print(f"\nBasket item density: {basket_density:.2%}")
    print(f"  (Average basket items per transaction: {basket_density * len(basket_cols):.2f})")

if regular_cols:
    regular_density = df_transactions[regular_cols].sum().sum() / (len(df_transactions) * len(regular_cols))
    print(f"Regular item density: {regular_density:.2%}")
    print(f"  (Average regular items per transaction: {regular_density * len(regular_cols):.2f})")

# Show sample of encoded matrix
print(f"\nSample encoded transactions (first 5 rows):")
print("\nBasket columns:")
if basket_cols:
    print(df_transactions[basket_cols].head())
else:
    print("  (No basket columns after filtering)")

print("\nRegular columns (first 10):")
print(df_transactions[regular_cols[:10]].head())

print(f"\n{'='*70}")
print("SAVING ENCODED DATA")
print(f"{'='*70}\n")

# Save the encoded transaction matrix
ENCODED_MATRIX_PATH = "../dataset/A1_encoded_transactions.parquet"
print(f"Saving encoded matrix to: {ENCODED_MATRIX_PATH}")

df_transactions.to_parquet(
    ENCODED_MATRIX_PATH,
    engine='pyarrow',
    compression='snappy',
    index=False
)

print(f"✓ Matrix saved ({os.path.getsize(ENCODED_MATRIX_PATH) / 1024**2:.2f} MB)")

# Save comprehensive metadata for analysis
metadata = {
    # Transaction statistics
    'total_transactions': total_tx,
    'total_unique_items_before_filter': len(item_counter),
    'total_items_after_filter': len(items),
    'regular_items': len(filtered_regular),
    'basket_items': len(filtered_baskets),
    
    # Filtering parameters
    'min_support': AnalysisConfig.min_support,
    'min_count': min_count,
    'items_removed': len(item_counter) - len(items),
    
    # Matrix properties
    'matrix_shape': list(df_transactions.shape),
    'sparsity': float(sparsity),
    'memory_usage_mb': float(df_transactions.memory_usage(deep=True).sum() / 1024**2),
    
    # Density metrics
    'basket_density': float(basket_density) if basket_cols else 0.0,
    'regular_density': float(regular_density) if regular_cols else 0.0,
    'avg_basket_items_per_tx': float(basket_density * len(basket_cols)) if basket_cols else 0.0,
    'avg_regular_items_per_tx': float(regular_density * len(regular_cols)) if regular_cols else 0.0,
    
    # Item lists
    'all_items': items,
    'basket_items_list': filtered_baskets,
    'regular_items_list': filtered_regular,
    
    # Binning categories
    'bin_categories': {cat: len(cat_items) for cat, cat_items in bin_categories.items()},
    
    # Item frequencies (for reference)
    'item_frequencies': {item: int(count) for item, count in item_counter.items() if item in items},
    'basket_frequencies': {item: int(count) for item, count in basket_counter.items() if item in items},
    'regular_frequencies': {item: int(count) for item, count in regular_counter.items() if item in items},
    
    # Top items
    'top_10_regular_items': [(item, int(count), float(count/total_tx)) 
                              for item, count in regular_items_sorted[:10]],
    'all_basket_items_ranked': [(item, int(count), float(count/total_tx)) 
                                 for item, count in basket_items_sorted]
}

METADATA_PATH = "../dataset/A1_encoded_metadata.json"
print(f"Saving metadata to: {METADATA_PATH}")

with open(METADATA_PATH, 'w') as f:
    json.dump(metadata, f, indent=2)

print(f"✓ Metadata saved")

# Save human-readable summary
SUMMARY_PATH = "../dataset/A1_encoding_summary.txt"
print(f"Saving summary to: {SUMMARY_PATH}")

with open(SUMMARY_PATH, 'w') as f:
    f.write("="*70 + "\n")
    f.write("TRANSACTION ENCODING SUMMARY\n")
    f.write("="*70 + "\n\n")
    
    f.write("TRANSACTION STATISTICS\n")
    f.write("-"*70 + "\n")
    f.write(f"Total transactions: {total_tx:,}\n")
    f.write(f"Unique items (before filtering): {len(item_counter)}\n")
    f.write(f"Items kept (after filtering): {len(items)}\n")
    f.write(f"  - Regular items: {len(filtered_regular)}\n")
    f.write(f"  - Basket items: {len(filtered_baskets)}\n")
    f.write(f"Items removed: {len(item_counter) - len(items)}\n\n")
    
    f.write("FILTERING PARAMETERS\n")
    f.write("-"*70 + "\n")
    f.write(f"Min support: {AnalysisConfig.min_support}\n")
    f.write(f"Min count: {min_count}\n\n")
    
    f.write("MATRIX PROPERTIES\n")
    f.write("-"*70 + "\n")
    f.write(f"Shape: {df_transactions.shape[0]:,} × {df_transactions.shape[1]}\n")
    f.write(f"Sparsity: {sparsity:.2%}\n")
    f.write(f"Memory usage: {df_transactions.memory_usage(deep=True).sum() / 1024**2:.2f} MB\n\n")
    
    f.write("DENSITY METRICS\n")
    f.write("-"*70 + "\n")
    if basket_cols:
        f.write(f"Basket item density: {basket_density:.2%}\n")
        f.write(f"Avg basket items per transaction: {basket_density * len(basket_cols):.2f}\n")
    if regular_cols:
        f.write(f"Regular item density: {regular_density:.2%}\n")
        f.write(f"Avg regular items per transaction: {regular_density * len(regular_cols):.2f}\n")
    f.write("\n")
    
    f.write("BINNING CATEGORIES\n")
    f.write("-"*70 + "\n")
    for category, cat_items in sorted(bin_categories.items()):
        kept = len([i for i in cat_items if i in items])
        f.write(f"{category}: {kept} items kept\n")
    f.write("\n")
    
    f.write("TOP 10 REGULAR ITEMS\n")
    f.write("-"*70 + "\n")
    for item, count in regular_items_sorted[:10]:
        support = count / total_tx
        f.write(f"{item}: {count:,} ({support:.2%})\n")
    f.write("\n")
    
    f.write("ALL BASKET ITEMS (RANKED BY FREQUENCY)\n")
    f.write("-"*70 + "\n")
    for item, count in basket_items_sorted:
        support = count / total_tx
        f.write(f"{item}: {count:,} ({support:.2%})\n")

print(f"✓ Summary saved\n")

print(f"{'='*70}")
print("FILES SAVED")
print(f"{'='*70}")
print(f"1. Encoded matrix: {ENCODED_MATRIX_PATH}")
print(f"2. Metadata (JSON): {METADATA_PATH}")
print(f"3. Summary (TXT): {SUMMARY_PATH}")
print(f"\nTo load for Apriori, use:")
print(f"  df_transactions = pd.read_parquet('{ENCODED_MATRIX_PATH}')")
print(f"  with open('{METADATA_PATH}') as f:")
print(f"      metadata = json.load(f)")
print(f"{'='*70}\n")

# Clean up counters
del basket_counter, regular_counter, regular_items_sorted, basket_items_sorted
del item_counter, item_to_idx
gc.collect()

TRANSACTION ANALYSIS WITH BINNING & BASKETING

Fetching sample transactions...

Showing first 10 transactions:

Transaction 1:
  Assignment ID: 1000AQM2VK
  Problem ID: 154L9TTK7O
  Total items: 17
    Regular items (14): ['curriculum_EngageNY', 'tutoring_answer', 'action_wrong_response_once', 'probtype_Number', 'timing_very_fast', 'action_correct_response_once', 'content_simple', 'skill_advanced', 'grade_middle', 'attempts_moderate', 'multipart_problem', 'action_answer_requested_once', 'outcome_success_after_struggle', 'score_visible']
    Basket items (3): ['basket_requested_help', 'basket_had_correct_response', 'basket_had_wrong_response']

Transaction 2:
  Assignment ID: 1000AQM2VK
  Problem ID: 1D21SH7B18
  Total items: 9
    Regular items (9): ['tutoring_no_tutoring', 'curriculum_EngageNY', 'attempts_limited', 'skill_advanced', 'content_simple', 'probtype_Ungraded_Open_Respon', 'grade_middle', 'multipart_problem', 'score_visible']
    Basket items (0): []

Transaction 3:
  Assign

Counting: 105it [00:58,  1.81it/s]



✓ Total transactions: 5,245,859
✓ Unique items found: 67
  - Regular items: 59
  - Basket items: 8

Binning category breakdown:
  action: 11 items
  attempts: 2 items
  content: 3 items
  curriculum: 2 items
  expl: 2 items
  grade: 6 items
  hint: 5 items
  multipart: 1 items
  outcome: 3 items
  probtype: 10 items
  score: 2 items
  skill: 2 items
  timing: 5 items
  tutoring: 4 items
  used: 1 items

Basket usage statistics:
  basket_contains_equations: 20,186 (0.4% of transactions)
  basket_contains_images: 7,967 (0.2% of transactions)
  basket_had_correct_response: 3,587,500 (68.4% of transactions)
  basket_had_wrong_response: 895,728 (17.1% of transactions)
  basket_requested_help: 644,526 (12.3% of transactions)
  basket_used_explanations: 21,136 (0.4% of transactions)
  basket_used_help_resources: 71,459 (1.4% of transactions)
  basket_used_hints: 50,326 (1.0% of transactions)

✓ Items after filtering (min_support=0.05):
  Total kept: 28
  - Regular items: 25
  - Basket items:

Encoding: 210it [01:16,  2.73it/s]



ENCODING COMPLETE
✓ Transaction matrix shape: (5245859, 28)
  Rows (transactions): 5,245,859
  Columns (items): 28
    - Regular items: 25
    - Basket items: 3
✓ Memory usage: 140.08 MB
✓ Matrix sparsity: 60.02%
  Filled cells: 58,728,501 / 146,884,052

Basket item density: 32.58%
  (Average basket items per transaction: 0.98)
Regular item density: 40.87%
  (Average regular items per transaction: 10.22)

Sample encoded transactions (first 5 rows):

Basket columns:
   basket_requested_help  basket_had_correct_response  \
0                   True                         True   
1                  False                        False   
2                  False                        False   
3                   True                         True   
4                  False                        False   

   basket_had_wrong_response  
0                       True  
1                      False  
2                      False  
3                       True  
4                      False  


33

#### Apriori Analysis

In [12]:
print(f"\n{'='*70}")
print("RUNNING PROGRESSIVE APRIORI WITH ADAPTIVE SUPPORT")
print(f"{'='*70}\n")

print("="*70)
print("LOADING ENCODED DATA FOR APRIORI")
print("="*70)

# Load paths
ENCODED_MATRIX_PATH = "../dataset/A1_encoded_transactions.parquet"
METADATA_PATH = "../dataset/A1_encoded_metadata.json"

# Load encoded matrix
print(f"\nLoading matrix from: {ENCODED_MATRIX_PATH}")
df_transactions = pd.read_parquet(ENCODED_MATRIX_PATH)

# Load metadata
print(f"Loading metadata from: {METADATA_PATH}")
with open(METADATA_PATH, 'r') as f:
    metadata = json.load(f)

print(f"\n✓ Data loaded successfully\n")

print(f"Matrix info:")
print(f"  Transactions: {df_transactions.shape[0]:,}")
print(f"  Items: {df_transactions.shape[1]}")
print(f"  Target min_support: {AnalysisConfig.min_support}")
print(f"\nStarting with higher support and gradually reducing if needed...\n")

# Try progressively lower support values
support_levels = [0.10, 0.08, 0.06, 0.05, 0.04, 0.03]
frequent_itemsets = None
successful_min_support = None

for min_sup in support_levels:
    # Skip if below target minimum
    if min_sup < AnalysisConfig.min_support:
        print(f"Skipping min_support={min_sup} (below target {AnalysisConfig.min_support})")
        break
        
    print(f"{'─'*70}")
    print(f"Attempting with min_support={min_sup}...")
    
    try:
        frequent_itemsets = apriori(
            df_transactions,
            min_support=min_sup,
            use_colnames=True,
            verbose=1,
            low_memory=True
        )
        
        print(f"\n✓ Success! Found {len(frequent_itemsets)} itemsets with support >= {min_sup}")
        successful_min_support = min_sup
        
        # Analyze itemsets found
        if len(frequent_itemsets) > 0:
            itemset_sizes = frequent_itemsets['itemsets'].apply(len).value_counts().sort_index()
            print(f"  Itemset size distribution:")
            for size, count in itemset_sizes.items():
                print(f"    {size}-itemsets: {count}")
        
        if len(frequent_itemsets) >= 10:  # Found enough patterns
            print(f"\n✓ Sufficient itemsets found, proceeding with rules generation...")
            break
        else:
            print(f"  ⚠ Only {len(frequent_itemsets)} itemsets found")
            if min_sup > AnalysisConfig.min_support:
                print(f"  Trying lower support...\n")
            else:
                print(f"  At target support level, proceeding anyway...\n")
                break
            
    except MemoryError as e:
        print(f"\n✗ Memory error with min_support={min_sup}")
        print(f"  Cannot allocate required memory")
        print(f"  Trying higher support (fewer itemsets)...\n")
        frequent_itemsets = None
        gc.collect()
        continue
    except Exception as e:
        print(f"\n✗ Unexpected error with min_support={min_sup}")
        print(f"  Error: {type(e).__name__}: {e}")
        print(f"  Trying higher support...\n")
        frequent_itemsets = None
        gc.collect()
        continue

print(f"\n{'='*70}")
print("APRIORI RESULTS")
print(f"{'='*70}\n")

if frequent_itemsets is None or len(frequent_itemsets) == 0:
    print("✗ FAILED: No frequent itemsets found at any support level.")
    print("\nRecommendations:")
    print("  1. Use FP-Growth instead: fpgrowth(df_transactions, min_support=0.05)")
    print("  2. Further increase min_support (e.g., 0.15 or 0.20)")
    print("  3. Reduce number of items with more aggressive filtering")
    print("  4. Remove basket items and use only regular items")
    print("  5. Sample your data (use subset of transactions)")
    rules = pd.DataFrame()
else:
    print(f"✓ Frequent itemsets found: {len(frequent_itemsets)}")
    print(f"  Using min_support: {successful_min_support}")
    
    # Analyze frequent itemsets by type
    basket_itemsets = 0
    regular_itemsets = 0
    mixed_itemsets = 0
    
    for idx, row in frequent_itemsets.iterrows():
        items = list(row['itemsets'])
        has_basket = any(item.startswith('basket_') for item in items)
        has_regular = any(not item.startswith('basket_') for item in items)
        
        if has_basket and has_regular:
            mixed_itemsets += 1
        elif has_basket:
            basket_itemsets += 1
        else:
            regular_itemsets += 1
    
    print(f"\nFrequent itemset breakdown:")
    print(f"  Regular items only: {regular_itemsets}")
    print(f"  Basket items only: {basket_itemsets}")
    print(f"  Mixed (basket + regular): {mixed_itemsets}")
    
    # Show top itemsets by support
    print(f"\nTop 10 frequent itemsets by support:")
    top_itemsets = frequent_itemsets.nlargest(10, 'support')
    for idx, row in top_itemsets.iterrows():
        items = list(row['itemsets'])
        items_str = ', '.join(sorted(items))
        print(f"  {items_str}")
        print(f"    Support: {row['support']:.4f} ({int(row['support'] * len(df_transactions)):,} transactions)")
    
    print(f"\n{'='*70}")
    print("GENERATING ASSOCIATION RULES")
    print(f"{'='*70}\n")
    
    print("Generating association rules...")
    print(f"  min_confidence: {AnalysisConfig.min_confidence}")
    print(f"  min_lift: {AnalysisConfig.min_lift}")
    
    try:
        rules = association_rules(
            frequent_itemsets,
            metric="confidence",
            min_threshold=AnalysisConfig.min_confidence
        )

        rules = rules[rules["lift"] >= AnalysisConfig.min_lift]
        rules = rules.sort_values("lift", ascending=False)

        print(f"\n✓ Association rules generated: {len(rules)}")

        if len(rules):
            print(f"  Lift range: {rules['lift'].min():.2f} – {rules['lift'].max():.2f}")
            print(f"  Confidence range: {rules['confidence'].min():.2f} – {rules['confidence'].max():.2f}")
            print(f"  Support range: {rules['support'].min():.4f} – {rules['support'].max():.4f}")
            
            # Analyze rules by basket involvement
            basket_antecedent = 0
            basket_consequent = 0
            basket_both = 0
            no_basket = 0
            
            for idx, row in rules.iterrows():
                ant = list(row['antecedents'])
                cons = list(row['consequents'])
                
                has_basket_ant = any(item.startswith('basket_') for item in ant)
                has_basket_cons = any(item.startswith('basket_') for item in cons)
                
                if has_basket_ant and has_basket_cons:
                    basket_both += 1
                elif has_basket_ant:
                    basket_antecedent += 1
                elif has_basket_cons:
                    basket_consequent += 1
                else:
                    no_basket += 1
            
            print(f"\nRule breakdown by basket involvement:")
            print(f"  No baskets: {no_basket} ({100*no_basket/len(rules):.1f}%)")
            print(f"  Basket in antecedent only: {basket_antecedent} ({100*basket_antecedent/len(rules):.1f}%)")
            print(f"  Basket in consequent only: {basket_consequent} ({100*basket_consequent/len(rules):.1f}%)")
            print(f"  Baskets in both: {basket_both} ({100*basket_both/len(rules):.1f}%)")
        else:
            print("\n⚠ No rules met the confidence and lift thresholds.")
            print(f"  Try lowering min_confidence (current: {AnalysisConfig.min_confidence})")
            print(f"  or min_lift (current: {AnalysisConfig.min_lift})")
            
    except Exception as e:
        print(f"\n✗ Error generating rules: {type(e).__name__}: {e}")
        rules = pd.DataFrame()

print(f"\n{'='*70}\n")

gc.collect()


RUNNING PROGRESSIVE APRIORI WITH ADAPTIVE SUPPORT

LOADING ENCODED DATA FOR APRIORI

Loading matrix from: ../dataset/A1_encoded_transactions.parquet
Loading metadata from: ../dataset/A1_encoded_metadata.json

✓ Data loaded successfully

Matrix info:
  Transactions: 5,245,859
  Items: 28
  Target min_support: 0.05

Starting with higher support and gradually reducing if needed...

──────────────────────────────────────────────────────────────────────
Attempting with min_support=0.1...
Processing 33 combinations | Sampling itemset size 108

✓ Success! Found 6262 itemsets with support >= 0.1
  Itemset size distribution:
    1-itemsets: 26
    2-itemsets: 183
    3-itemsets: 633
    4-itemsets: 1287
    5-itemsets: 1661
    6-itemsets: 1404
    7-itemsets: 766
    8-itemsets: 254
    9-itemsets: 45
    10-itemsets: 3

✓ Sufficient itemsets found, proceeding with rules generation...

APRIORI RESULTS

✓ Frequent itemsets found: 6262
  Using min_support: 0.1

Frequent itemset breakdown:
  Reg

43

### Results

In [13]:
import os
import json
import gc

print("\n" + "=" * 80)
print(f"TOP {AnalysisConfig.top_n_rules} ASSOCIATION RULES BY LIFT")
print("=" * 80)

if len(rules) > 0:
    # Display top rules in compact format
    for i, (idx, row) in enumerate(rules.head(AnalysisConfig.top_n_rules).iterrows(), 1):
        antecedents_list = sorted(list(row['antecedents']))
        consequents_list = sorted(list(row['consequents']))
        
        # Format as: ant1, ant2, ant3 => cons1, cons2
        ant_str = ', '.join(antecedents_list)
        cons_str = ', '.join(consequents_list)
        
        # Single line format
        print(f"{i}. {ant_str} => {cons_str}")
        print(f"   Support: {row['support']:.3f}, Confidence: {row['confidence']:.3f}, Lift: {row['lift']:.3f}")
    
    # Show breakdown by basket involvement
    print("\n" + "=" * 80)
    print("RULE BREAKDOWN BY TYPE")
    print("=" * 80)
    
    basket_in_both = sum(1 for _, row in rules.iterrows() 
                         if any(x.startswith('basket_') for x in row['antecedents']) 
                         and any(x.startswith('basket_') for x in row['consequents']))
    basket_in_ant = sum(1 for _, row in rules.iterrows() 
                        if any(x.startswith('basket_') for x in row['antecedents']) 
                        and not any(x.startswith('basket_') for x in row['consequents']))
    basket_in_cons = sum(1 for _, row in rules.iterrows() 
                         if not any(x.startswith('basket_') for x in row['antecedents']) 
                         and any(x.startswith('basket_') for x in row['consequents']))
    no_baskets = len(rules) - basket_in_both - basket_in_ant - basket_in_cons
    
    print(f"\nTotal rules: {len(rules)}")
    print(f"  Regular items only: {no_baskets} ({100*no_baskets/len(rules):.1f}%)")
    print(f"  Baskets in antecedent: {basket_in_ant} ({100*basket_in_ant/len(rules):.1f}%)")
    print(f"  Baskets in consequent: {basket_in_cons} ({100*basket_in_cons/len(rules):.1f}%)")
    print(f"  Baskets in both: {basket_in_both} ({100*basket_in_both/len(rules):.1f}%)")
    
    # Show top basket-involved rules
    if basket_in_both + basket_in_ant + basket_in_cons > 0:
        basket_rules_df = rules[
            rules['antecedents'].apply(lambda x: any(item.startswith('basket_') for item in x)) |
            rules['consequents'].apply(lambda x: any(item.startswith('basket_') for item in x))
        ]
        
        print(f"\n" + "=" * 80)
        print(f"TOP 10 RULES WITH BASKETS (HIGH-LEVEL PATTERNS)")
        print("=" * 80)
        
        for i, (idx, row) in enumerate(basket_rules_df.head(10).iterrows(), 1):
            ant = ', '.join(sorted(list(row['antecedents'])))
            cons = ', '.join(sorted(list(row['consequents'])))
            print(f"{i}. {ant} => {cons}")
            print(f"   Support: {row['support']:.3f}, Confidence: {row['confidence']:.3f}, Lift: {row['lift']:.3f}")
    
    # Show top regular rules
    regular_rules_df = rules[
        ~(rules['antecedents'].apply(lambda x: any(item.startswith('basket_') for item in x)) |
          rules['consequents'].apply(lambda x: any(item.startswith('basket_') for item in x)))
    ]
    
    if len(regular_rules_df) > 0:
        print(f"\n" + "=" * 80)
        print(f"TOP 10 RULES WITHOUT BASKETS (SPECIFIC PATTERNS)")
        print("=" * 80)
        
        for i, (idx, row) in enumerate(regular_rules_df.head(10).iterrows(), 1):
            ant = ', '.join(sorted(list(row['antecedents'])))
            cons = ', '.join(sorted(list(row['consequents'])))
            print(f"{i}. {ant} => {cons}")
            print(f"   Support: {row['support']:.3f}, Confidence: {row['confidence']:.3f}, Lift: {row['lift']:.3f}")
    
    # Compare basket vs non-basket rule quality
    if basket_in_both + basket_in_ant + basket_in_cons > 0 and no_baskets > 0:
        basket_involved = rules[
            rules['antecedents'].apply(lambda x: any(item.startswith('basket_') for item in x)) |
            rules['consequents'].apply(lambda x: any(item.startswith('basket_') for item in x))
        ]
        no_basket_rules = rules[
            ~(rules['antecedents'].apply(lambda x: any(item.startswith('basket_') for item in x)) |
              rules['consequents'].apply(lambda x: any(item.startswith('basket_') for item in x)))
        ]
        
        print(f"\n" + "=" * 80)
        print("BASKET VS REGULAR ITEM RULE COMPARISON")
        print("=" * 80)
        print(f"\nRules with baskets ({len(basket_involved)} rules):")
        print(f"  Avg lift: {basket_involved['lift'].mean():.3f}")
        print(f"  Avg confidence: {basket_involved['confidence'].mean():.3f}")
        print(f"  Avg support: {basket_involved['support'].mean():.3f}")
        print(f"\nRules without baskets ({len(no_basket_rules)} rules):")
        print(f"  Avg lift: {no_basket_rules['lift'].mean():.3f}")
        print(f"  Avg confidence: {no_basket_rules['confidence'].mean():.3f}")
        print(f"  Avg support: {no_basket_rules['support'].mean():.3f}")
    
else:
    print("\n⚠ No rules found. Try lowering min_support, min_confidence, or min_lift.")

# Save results
print("\n" + "=" * 80)
print("SAVING RESULTS")
print("=" * 80)

os.makedirs('../results', exist_ok=True)

if len(rules) > 0:
    # Convert frozensets to strings for CSV export
    rules_export = rules.copy()
    
    # Add basket indicators
    rules_export['has_basket_antecedent'] = rules_export['antecedents'].apply(
        lambda x: any(item.startswith('basket_') for item in x)
    )
    rules_export['has_basket_consequent'] = rules_export['consequents'].apply(
        lambda x: any(item.startswith('basket_') for item in x)
    )
    rules_export['num_baskets'] = rules_export['antecedents'].apply(
        lambda x: sum(1 for item in x if item.startswith('basket_'))
    ) + rules_export['consequents'].apply(
        lambda x: sum(1 for item in x if item.startswith('basket_'))
    )
    
    # Separate basket and regular items in export
    rules_export['antecedents_baskets'] = rules_export['antecedents'].apply(
        lambda x: ', '.join(sorted([item for item in x if item.startswith('basket_')]))
    )
    rules_export['antecedents_regular'] = rules_export['antecedents'].apply(
        lambda x: ', '.join(sorted([item for item in x if not item.startswith('basket_')]))
    )
    rules_export['consequents_baskets'] = rules_export['consequents'].apply(
        lambda x: ', '.join(sorted([item for item in x if item.startswith('basket_')]))
    )
    rules_export['consequents_regular'] = rules_export['consequents'].apply(
        lambda x: ', '.join(sorted([item for item in x if not item.startswith('basket_')]))
    )
    
    # Create compact rule format
    rules_export['rule'] = rules_export.apply(
        lambda row: f"{', '.join(sorted(list(row['antecedents'])))} => {', '.join(sorted(list(row['consequents'])))}",
        axis=1
    )
    
    # Original columns
    rules_export['antecedents'] = rules_export['antecedents'].apply(
        lambda x: ', '.join(sorted(list(x)))
    )
    rules_export['consequents'] = rules_export['consequents'].apply(
        lambda x: ', '.join(sorted(list(x)))
    )
    
    rules_export.to_csv('../results/analysis1_behavior_rules.csv', index=False)
    print("✓ Saved rules to: ../results/analysis1_behavior_rules.csv")
    
    # Save frequent itemsets with basket info
    frequent_itemsets_export = frequent_itemsets.copy()
    frequent_itemsets_export['has_basket'] = frequent_itemsets_export['itemsets'].apply(
        lambda x: any(item.startswith('basket_') for item in x)
    )
    frequent_itemsets_export['num_baskets'] = frequent_itemsets_export['itemsets'].apply(
        lambda x: sum(1 for item in x if item.startswith('basket_'))
    )
    frequent_itemsets_export['itemsets'] = frequent_itemsets_export['itemsets'].apply(
        lambda x: ', '.join(sorted(list(x)))
    )
    frequent_itemsets_export.to_csv('../results/analysis1_frequent_itemsets.csv', index=False)
    print("✓ Saved itemsets to: ../results/analysis1_frequent_itemsets.csv")
    
    # Save top rules in readable text format
    TOP_RULES_TXT = '../results/analysis1_top_rules.txt'
    with open(TOP_RULES_TXT, 'w') as f:
        f.write("="*80 + "\n")
        f.write(f"TOP {AnalysisConfig.top_n_rules} ASSOCIATION RULES BY LIFT\n")
        f.write("="*80 + "\n\n")
        
        for i, (idx, row) in enumerate(rules.head(AnalysisConfig.top_n_rules).iterrows(), 1):
            ant = ', '.join(sorted(list(row['antecedents'])))
            cons = ', '.join(sorted(list(row['consequents'])))
            f.write(f"{i}. {ant} => {cons}\n")
            f.write(f"   Support: {row['support']:.3f}, Confidence: {row['confidence']:.3f}, Lift: {row['lift']:.3f}\n\n")
        
        if basket_in_both + basket_in_ant + basket_in_cons > 0:
            f.write("="*80 + "\n")
            f.write("TOP 10 RULES WITH BASKETS\n")
            f.write("="*80 + "\n\n")
            
            for i, (idx, row) in enumerate(basket_rules_df.head(10).iterrows(), 1):
                ant = ', '.join(sorted(list(row['antecedents'])))
                cons = ', '.join(sorted(list(row['consequents'])))
                f.write(f"{i}. {ant} => {cons}\n")
                f.write(f"   Support: {row['support']:.3f}, Confidence: {row['confidence']:.3f}, Lift: {row['lift']:.3f}\n\n")
        
        if len(regular_rules_df) > 0:
            f.write("="*80 + "\n")
            f.write("TOP 10 RULES WITHOUT BASKETS\n")
            f.write("="*80 + "\n\n")
            
            for i, (idx, row) in enumerate(regular_rules_df.head(10).iterrows(), 1):
                ant = ', '.join(sorted(list(row['antecedents'])))
                cons = ', '.join(sorted(list(row['consequents'])))
                f.write(f"{i}. {ant} => {cons}\n")
                f.write(f"   Support: {row['support']:.3f}, Confidence: {row['confidence']:.3f}, Lift: {row['lift']:.3f}\n\n")
    
    print(f"✓ Saved top rules text: {TOP_RULES_TXT}")
    
    # Save summary statistics
    summary_stats = {
        'total_rules': len(rules),
        'total_frequent_itemsets': len(frequent_itemsets),
        'rules_with_baskets': basket_in_both + basket_in_ant + basket_in_cons,
        'rules_without_baskets': no_baskets,
        'basket_in_antecedent': basket_in_ant,
        'basket_in_consequent': basket_in_cons,
        'basket_in_both': basket_in_both,
        'avg_lift': float(rules['lift'].mean()),
        'avg_confidence': float(rules['confidence'].mean()),
        'avg_support': float(rules['support'].mean()),
        'max_lift': float(rules['lift'].max()),
        'max_confidence': float(rules['confidence'].max()),
        'max_support': float(rules['support'].max()),
        'min_support_used': AnalysisConfig.min_support,
        'min_confidence_used': AnalysisConfig.min_confidence,
        'min_lift_used': AnalysisConfig.min_lift
    }
    
    with open('../results/analysis1_summary.json', 'w') as f:
        json.dump(summary_stats, f, indent=2)
    print("✓ Saved summary to: ../results/analysis1_summary.json")
    
    print(f"\nSummary statistics:")
    print(f"  Total rules: {len(rules)}")
    print(f"  Total frequent itemsets: {len(frequent_itemsets)}")
    print(f"  Rules with baskets: {basket_in_both + basket_in_ant + basket_in_cons} ({100*(basket_in_both + basket_in_ant + basket_in_cons)/len(rules):.1f}%)")
    print(f"  Average lift: {rules['lift'].mean():.3f}")
    print(f"  Average confidence: {rules['confidence'].mean():.3f}")
    print(f"  Average support: {rules['support'].mean():.3f}")
    
    print(f"\nFiles saved:")
    print(f"  1. ../results/analysis1_behavior_rules.csv (all rules with details)")
    print(f"  2. ../results/analysis1_frequent_itemsets.csv (frequent itemsets)")
    print(f"  3. ../results/analysis1_top_rules.txt (human-readable top rules)")
    print(f"  4. ../results/analysis1_summary.json (summary statistics)")
else:
    print("⚠ No results to save. Adjust parameters and try again.")

# Cleanup
gc.collect()

print("\n✓ Analysis complete!")
print("=" * 80)


TOP 10 ASSOCIATION RULES BY LIFT
1. basket_had_correct_response, basket_requested_help => action_answer_requested_once, action_correct_response_once
   Support: 0.113, Confidence: 0.927, Lift: 8.215
2. basket_had_correct_response, basket_requested_help => action_answer_requested_once, action_correct_response_once, score_visible
   Support: 0.113, Confidence: 0.927, Lift: 8.215
3. basket_had_correct_response, basket_requested_help, score_visible => action_answer_requested_once, action_correct_response_once
   Support: 0.113, Confidence: 0.927, Lift: 8.215
4. basket_had_correct_response, basket_requested_help => action_answer_requested_once, action_correct_response_once, score_visible, skill_advanced
   Support: 0.110, Confidence: 0.906, Lift: 8.215
5. action_answer_requested_once, action_correct_response_once, skill_advanced => basket_had_correct_response, basket_requested_help, score_visible
   Support: 0.110, Confidence: 1.000, Lift: 8.215
6. action_answer_requested_once, action_corr