## Configurations and Utility Functions

### Imports
Make sure to install requirements.txt

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mlxtend.frequent_patterns import apriori, association_rules
import gc
import re
import os
from pathlib import Path
from dataclasses import dataclass
import warnings
from IPython.display import display
from mlxtend.preprocessing import TransactionEncoder
import glob
from collections import Counter
from concurrent.futures import ThreadPoolExecutor
from itertools import combinations
warnings.filterwarnings('ignore')

### Process Configuration Classes

In [2]:
@dataclass
class AnalysisConfig:
    """Apriori analysis parameters"""
    min_support: float = 0.05
    min_confidence: float = 0.6
    min_lift: float = 1.0
    top_n_rules: int = 10

@dataclass
class ProcessingConfig:
    """Data processing configuration"""
    dataset_folder: Path = Path('../dataset')
    preprocessed_df_dir: Path = Path("../dataset/dataframes")
    analysis_df_dir: Path = Path('../dataset/tempdf')
    results_raw_dir: Path = Path('../results/raw')
    results_vis_dir: Path = Path('../results/vis')
    include_files: tuple = (    # the csv files needed to create the transactions
        'action_logs.csv', 
        'problem_details.csv',
        'training_unit_test_scores.csv',
        'assignment_relationships.csv',
        'assignment_details.csv',
        'sequence_details.csv'
    )

### Utility Functions

In [3]:
def save_dataframe(name: str, dataframe: pd.DataFrame):
    os.makedirs(ProcessingConfig.preprocessed_df_dir, exist_ok=True)
    path = ProcessingConfig.preprocessed_df_dir / f'{name}.parquet'
    dataframe.to_parquet(path, index=False, engine='pyarrow')

def load_dataframe(name: str) -> pd.DataFrame:
    path = ProcessingConfig.preprocessed_df_dir / f'{name}.parquet'
    return pd.read_parquet(path, engine='pyarrow')

def df_exploration(key: str, df: pd.DataFrame, head_n: int = 10):
    print("=" * 40)
    print(f"{key} (shape: {df.shape})")
    print("=" * 40)

    print("\nUnique items per column:")
    report = []
    n_rows = len(df)
    for col in df.columns:
        series = df[col]
        missing = series.isna().sum()
        unique = series.unique()
        print(f"    `{col}`: {unique}")

        report.append({
            "Column": col,
            "Type": str(series.dtype),
            "Missing": int(missing),
            "Missing%": f"{(missing / n_rows * 100):.2f}%" if n_rows else "0.00%",
            "Unique": len(unique),
        })

    print("Report:")
    display(pd.DataFrame(report))    

    dup_count = df.duplicated().sum()
    print(f"  Duplicates: {dup_count}")
    print(f"  ({head_n}) Sample Data:")
    display(df.head(head_n))

## Exploration

### Dataset Files Selection

In [8]:
csv_files = sorted(ProcessingConfig.dataset_folder.glob('*.csv'))
print(f"\nData folder: {ProcessingConfig.dataset_folder.absolute()}")
print(f"Found {len(csv_files)} CSV files:\nOnly included {len(ProcessingConfig.include_files)} files.")

csv_files = {csv_file.stem: csv_file for csv_file in csv_files if csv_file.name in ProcessingConfig.include_files}
for name, csv in csv_files.items():
    size_mb = csv.stat().st_size / 1024 / 1024
    print(f"  {csv.name:<45} {size_mb:>10.2f} MB")


Data folder: /mnt/41A664F31125B500/Personal/Academics/4th_Year/1st_Sem/CSC172_Data_Mining_and_Analysis/CSC172-AssociationMining-Bautista/notebooks/../dataset
Found 10 CSV files:
Only included 6 files.
  action_logs.csv                                  1371.44 MB
  assignment_details.csv                            921.42 MB
  assignment_relationships.csv                       14.25 MB
  problem_details.csv                                58.98 MB
  sequence_details.csv                                3.81 MB
  training_unit_test_scores.csv                      10.03 MB


### Initial Exploration of Dataset Files

#### Initial Exploration

In [None]:
for name, csv in csv_files.items():
    df = pd.read_csv(csv)
    df_exploration(name, df)

    del df
    gc.collect()

action_logs (shape: (23932276, 10))

Unique items per column:
    `assignment_log_id`: ['2QV1F2GSBZ' 'W4UD30NIA' '2DJ8MR8M7U' ... '249NA55LN0' '1E2UZDBLNE'
 '1VVEB3EAGF']
    `timestamp`: [1.59915099e+09 1.59915099e+09 1.59915107e+09 ... 1.63491912e+09
 1.63491912e+09 1.63491912e+09]
    `problem_id`: [nan 'I2GX4OQIE' 'HCTP9BOV' ... '1MIH1CDBYH' '2C59YVROJA' '58J703EA0']
    `max_attempts`: [nan  3.  1.]
    `available_core_tutoring`: [nan 'answer' 'no_tutoring' 'explanation' 'hint']
    `score_viewable`: [nan  1.  0.]
    `continuous_score_viewable`: [nan  1.  0.]
    `action`: ['assignment_started' 'problem_started' 'wrong_response'
 'answer_requested' 'correct_response' 'problem_finished'
 'continue_selected' 'open_response' 'assignment_finished'
 'assignment_resumed' 'explanation_requested'
 'skill_related_video_requested' 'hint_requested' 'live_tutor_requested']
    `hint_id`: [nan 'OEM5SD5F2' '4O45D3DCP' ... '1E23T5SSF6' 'GRAMRQ1B6' '3WLC5CBKC']
    `explanation_id`: [nan '1RNFIM

Unnamed: 0,Column,Type,Missing,Missing%,Unique
0,assignment_log_id,object,0,0.00%,638528
1,timestamp,float64,0,0.00%,23908418
2,problem_id,object,6136715,25.64%,57361
3,max_attempts,float64,18686416,78.08%,3
4,available_core_tutoring,object,18686416,78.08%,5
5,score_viewable,float64,18686416,78.08%,3
6,continuous_score_viewable,float64,18686416,78.08%,3
7,action,object,0,0.00%,14
8,hint_id,object,23858933,99.69%,9126
9,explanation_id,object,23911140,99.91%,4133


  Duplicates: 0
  (10) Sample Data:


Unnamed: 0,assignment_log_id,timestamp,problem_id,max_attempts,available_core_tutoring,score_viewable,continuous_score_viewable,action,hint_id,explanation_id
0,2QV1F2GSBZ,1599151000.0,,,,,,assignment_started,,
1,2QV1F2GSBZ,1599151000.0,I2GX4OQIE,3.0,answer,1.0,1.0,problem_started,,
2,2QV1F2GSBZ,1599151000.0,I2GX4OQIE,,,,,wrong_response,,
3,2QV1F2GSBZ,1599151000.0,I2GX4OQIE,,,,,wrong_response,,
4,2QV1F2GSBZ,1599151000.0,I2GX4OQIE,,,,,answer_requested,,
5,2QV1F2GSBZ,1599151000.0,I2GX4OQIE,,,,,correct_response,,
6,2QV1F2GSBZ,1599151000.0,I2GX4OQIE,,,,,problem_finished,,
7,2QV1F2GSBZ,1599151000.0,,,,,,continue_selected,,
8,2QV1F2GSBZ,1599151000.0,HCTP9BOV,3.0,answer,1.0,1.0,problem_started,,
9,2QV1F2GSBZ,1599151000.0,HCTP9BOV,,,,,answer_requested,,


assignment_details (shape: (9319676, 9))

Unique items per column:
    `assignment_log_id`: ['2PLEB2KWK9' '8G25XNCXN' '266AW7UU1V' ... '2221LVOVLV' '1JLBSN4GU8'
 '1P0ZDDHF8I']
    `teacher_id`: ['22OEQXISYV' '2SKA2RTF6' '1FJ326JFAH' ... '8YXMUUV4' '2LPTVMN0DB'
 '15GKIINPO2']
    `class_id`: ['133F5L5O95' '2OL82EC95R' '1WJWBO8XL4' ... '2O5SON14LE' '1YUZ7AVX73'
 '2O49RBRTD5']
    `student_id`: ['L97DTM607' '21S35PU5W2' 'IBO6BEHXA' ... '2OMJLIM7FZ' '1XHKWR7U53'
 '1U4TW18HRT']
    `sequence_id`: ['1FLYIHK4Q4' 'CDLX4UJ84' '2T42B3UC5' ... 'GKMMKW3HA' 'AWXR2AIP7'
 '1I4NBTPRPM']
    `assignment_release_date`: [1.53963478e+09 1.53987137e+09 1.53988466e+09 ... 1.67164100e+09
 1.67164101e+09 1.67479830e+09]
    `assignment_due_date`: [1.54006686e+09            nan 1.54024152e+09 ... 1.67286960e+09
 1.67364000e+09 1.67547150e+09]
    `assignment_start_time`: ['1539634866.476' '1539871403.267' '1539884690.684' ... '1666883797.45'
 '1670539215.69' '1671258829.227']
    `assignment_end_time`: [nan '1

Unnamed: 0,Column,Type,Missing,Missing%,Unique
0,assignment_log_id,object,0,0.00%,9319676
1,teacher_id,object,0,0.00%,23523
2,class_id,object,0,0.00%,47401
3,student_id,object,0,0.00%,651253
4,sequence_id,object,0,0.00%,8774
5,assignment_release_date,float64,0,0.00%,431169
6,assignment_due_date,float64,2744279,29.45%,150787
7,assignment_start_time,object,0,0.00%,9262758
8,assignment_end_time,object,1878016,20.15%,7386234


  Duplicates: 0
  (10) Sample Data:


Unnamed: 0,assignment_log_id,teacher_id,class_id,student_id,sequence_id,assignment_release_date,assignment_due_date,assignment_start_time,assignment_end_time
0,2PLEB2KWK9,22OEQXISYV,133F5L5O95,L97DTM607,1FLYIHK4Q4,1539635000.0,1540067000.0,1539634866.476,
1,8G25XNCXN,2SKA2RTF6,2OL82EC95R,21S35PU5W2,CDLX4UJ84,1539871000.0,,1539871403.267,1539871641.345
2,266AW7UU1V,1FJ326JFAH,1WJWBO8XL4,IBO6BEHXA,2T42B3UC5,1539885000.0,,1539884690.684,
3,15SHL0U0E6,129LDU45TT,IBO6BEHXA,1CT2ERTNC7,7ZGYNOHS3,1539896000.0,1540242000.0,1539952545.055,
4,CQA32TBFI,1FJ326JFAH,1WJWBO8XL4,2JC4HHXU4M,2T42B3UC5,1539885000.0,,1539974068.802,
5,RNSUY1N30,1FJ326JFAH,1WJWBO8XL4,2JC4HHXU4M,116QWSQWM9,1541432000.0,,1541431936.621,
6,1SRBUROB4M,1A8U1KW3AV,2DFHHHY3AO,GJD8FQZUO,SZQ65NBOQ,1542218000.0,,1542218008.283,
7,1MOHSGSH4S,1A8U1KW3AV,2DFHHHY3AO,GJD8FQZUO,SZQ65NBOQ,1542218000.0,,1542218340.303,1542219741.687
8,22UBCQFR5,1A8U1KW3AV,2DFHHHY3AO,288F3DEIBP,SZQ65NBOQ,1542218000.0,,1542221221.258,1542222007.714
9,1PRB4L8LJ3,1A8U1KW3AV,2DFHHHY3AO,SAUMQVPOS,SZQ65NBOQ,1542218000.0,,1542221226.111,1542221972.925


assignment_relationships (shape: (702887, 2))

Unique items per column:
    `unit_test_assignment_log_id`: ['7FGC8P0F1' '15KQFID5U5' 'QKDRPCXSG' ... '2PNRH0FF5C' '17EUNRKIC4'
 '28TD16LQU8']
    `in_unit_assignment_log_id`: ['V6YXT3UG' '1TFFYMT814' '1N2IFGUASM' ... '251US25W27' '1QT606YWQ4'
 '35N2V2RP7']
Report:


Unnamed: 0,Column,Type,Missing,Missing%,Unique
0,unit_test_assignment_log_id,object,0,0.00%,56577
1,in_unit_assignment_log_id,object,0,0.00%,638528


  Duplicates: 3048
  (10) Sample Data:


Unnamed: 0,unit_test_assignment_log_id,in_unit_assignment_log_id
0,7FGC8P0F1,V6YXT3UG
1,15KQFID5U5,1TFFYMT814
2,QKDRPCXSG,1N2IFGUASM
3,1JOJIQXU1B,15W4ET3W62
4,2C9YZRVZT0,1WORTY787C
5,38M6IA4SS,2DQG3SWWLS
6,15XW17EHLW,Y3G0XTLMF
7,2C5IG7FC12,1HLYER60XW
8,F9OJCBCRM,1XB8H1OIF8
9,2OJ73SYFF6,1XB8H1OIF8


problem_details (shape: (132738, 10))

Unique items per column:
    `problem_id`: ['10MFND3HAJ' 'IH3MOE7AF' '14YC7CEE2N' ... '16PQQU7TL4' 'HIGDEU75J'
 '1A23RM90NL']
    `problem_multipart_id`: ['2MHCTW1IIN' '1UEQMXOOFA' '1W7DRPNEJL' ... '1Q6T5XK3TJ' '1ZHXWZ0J1S'
 'NQQHNDY3S']
    `problem_multipart_position`: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 16 15 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
 49 50 51 52 53 54 55]
    `problem_type`: ['Multiple Choice' 'Ungraded Open Response' 'Number'
 'Algebraic Expression' 'Numeric Expression' 'Check All That Apply'
 'Exact Match (ignore case)' 'Exact Fraction'
 'Exact Match (case sensitive)' 'Ordering']
    `problem_skill_code`: ['6.RP.A.3b' '6.RP.A.2' '6.RP.A.3a' '6.RP.A.1' '7.RP.A.2a' '8.EE.A.1-1'
 '8.EE.A.1-2' '8.EE.A.1-3' '8.EE.A.3' '8.EE.A.4' '7.RP.A.2b' '7.RP.A.2c'
 '6.EE.C.9-2' '7.RP.A.2d' '7.RP.A.1' '6.NS.A.1' '6.NS.B.3-3' '7.RP.A.3'
 '5.NF.B.4a-2' '8.G.A.1b' '8.G.A.1a' '8.G.A.1c

Unnamed: 0,Column,Type,Missing,Missing%,Unique
0,problem_id,object,0,0.00%,132738
1,problem_multipart_id,object,0,0.00%,70108
2,problem_multipart_position,int64,0,0.00%,55
3,problem_type,object,0,0.00%,10
4,problem_skill_code,object,820,0.62%,542
5,problem_skill_description,object,820,0.62%,540
6,problem_contains_image,float64,5,0.00%,3
7,problem_contains_equation,float64,5,0.00%,3
8,problem_contains_video,float64,5,0.00%,3
9,problem_text_bert_pca,object,0,0.00%,85042


  Duplicates: 0
  (10) Sample Data:


Unnamed: 0,problem_id,problem_multipart_id,problem_multipart_position,problem_type,problem_skill_code,problem_skill_description,problem_contains_image,problem_contains_equation,problem_contains_video,problem_text_bert_pca
0,10MFND3HAJ,2MHCTW1IIN,1,Multiple Choice,6.RP.A.3b,Unit Rate,0.0,0.0,1.0,"[0.53955209,-0.96322744,0.49725574,6.28795392,..."
1,IH3MOE7AF,1UEQMXOOFA,1,Multiple Choice,6.RP.A.3b,Unit Rate,0.0,0.0,0.0,"[-1.61147666,-1.50911536,0.52055446,6.01118343..."
2,14YC7CEE2N,1UEQMXOOFA,2,Ungraded Open Response,6.RP.A.3b,Unit Rate,0.0,0.0,0.0,"[-8.95361845,5.26005410,-4.41350451,-2.6751771..."
3,16L5KQWLN7,1W7DRPNEJL,1,Ungraded Open Response,6.RP.A.3b,Unit Rate,0.0,0.0,0.0,"[-2.89295465,1.73222701,-0.21075635,0.16314057..."
4,BU0LO0LDD,1Z6MGLD8VK,1,Ungraded Open Response,6.RP.A.3b,Unit Rate,0.0,0.0,0.0,"[-1.53959700,1.35386494,-1.56874727,0.89545312..."
5,W9WPQSAU5,MBYKGWG5L,1,Ungraded Open Response,6.RP.A.3b,Unit Rate,0.0,0.0,0.0,"[-3.20997122,0.64484637,-0.57017812,-0.4925776..."
6,2OHCH5C5BD,O0EI8SMXR,1,Number,6.RP.A.2,Expressing Unit Rate in Words,0.0,0.0,0.0,"[-1.47761510,-1.33452493,-0.32730713,0.4147120..."
7,9CB1OILA2,A1DWWVVLC,1,Ungraded Open Response,6.RP.A.3a,Making Equivalent Ratio Tables,0.0,0.0,0.0,"[-3.49009235,-4.33279096,1.77473598,-0.3124369..."
8,1JCPX2ZOXQ,K65VD17P2,1,Ungraded Open Response,6.RP.A.3a,Making Equivalent Ratio Tables,0.0,0.0,0.0,"[-1.43699869,-4.19686441,-0.77936048,4.2911283..."
9,AANYMYPL6,1K9KSMZ0FV,1,Multiple Choice,6.RP.A.2,Expressing Unit Rate in Words,0.0,0.0,0.0,"[-1.70709674,-2.89674901,-0.17309755,5.5549082..."


sequence_details (shape: (10774, 8))

Unique items per column:
    `sequence_id`: ['K1U9M2PVF' '1XEPEYCPC3' '20SXJMMSRG' ... 'RDY7KM9L2' '2JXW7XZREV'
 '20978UCBXP']
    `sequence_folder_path_level_1`: ['EngageNY/Eureka Math (© by Great Minds®) *'
 'Kendall Hunt Illustrative Mathematics']
    `sequence_folder_path_level_2`: ['Algebra I' 'Algebra II' 'Geometry' 'Grade 1' 'Grade 2' 'Grade 3'
 'Grade 4' 'Grade 5' 'Grade 6' 'Grade 7' 'Grade 8'
 'Pre-Calculus and Advanced Topics' 'Algebra 1' 'Algebra 1 Supports'
 'Algebra 2' 'Grade 6 Accelerated' 'Grade 7 Accelerated']
    `sequence_folder_path_level_3`: ['Module 1 - Relationships Between Quantities and Reasoning with Equations and Their Graphs'
 'Module 2 - Descriptive Statisitics'
 'Module 3 - Linear and Exponential Functions'
 'Module 4 - Polynomial and Quadratic Expressions, Equations, and Functions'
 'Module 5 - A Synthesis of Modeling with Equations and Functions'
 'Module 1 - Polynomial, Rational, and Radical Relationships'
 'Module 2

Unnamed: 0,Column,Type,Missing,Missing%,Unique
0,sequence_id,object,0,0.00%,10228
1,sequence_folder_path_level_1,object,0,0.00%,2
2,sequence_folder_path_level_2,object,0,0.00%,17
3,sequence_folder_path_level_3,object,0,0.00%,177
4,sequence_folder_path_level_4,object,96,0.89%,2407
5,sequence_folder_path_level_5,object,8790,81.59%,720
6,sequence_name,object,0,0.00%,10225
7,sequence_problem_ids,object,0,0.00%,10677


  Duplicates: 0
  (10) Sample Data:


Unnamed: 0,sequence_id,sequence_folder_path_level_1,sequence_folder_path_level_2,sequence_folder_path_level_3,sequence_folder_path_level_4,sequence_folder_path_level_5,sequence_name,sequence_problem_ids
0,K1U9M2PVF,EngageNY/Eureka Math (© by Great Minds®) *,Algebra I,Module 1 - Relationships Between Quantities an...,Module 1---Assessments,,End-of-Module---Alg 1.1 End-of-Module Assessment,"[AQ0ZKSP6D,2KTD380L98,7CPDNFDLD,2F9VV7RVWU,255..."
1,1XEPEYCPC3,EngageNY/Eureka Math (© by Great Minds®) *,Algebra I,Module 1 - Relationships Between Quantities an...,Module 1---Assessments,,Mid-Module---Alg1.1 Mid-Module Assessment,"[WS70M9DP1,13HDHY5VMI,24WQMJBRDX,1IFT888E81,F2..."
2,20SXJMMSRG,EngageNY/Eureka Math (© by Great Minds®) *,Algebra I,Module 1 - Relationships Between Quantities an...,Topic A---Lesson 1: Graphs of Piecewise Linear...,,"Problem Set---Algebra I, M1, Lesson 1 (N.Q.A.1...","[1D3AXDDMQ9,2HVIXDM2L5,1I9N9TMSO6,182WSU48H,Z6..."
3,1SMS0A4N5G,EngageNY/Eureka Math (© by Great Minds®) *,Algebra I,Module 1 - Relationships Between Quantities an...,Topic A---Lesson 2: Graphs of Quadratic Functions,,"Classwork---Algebra I, M1, Lesson 2 (N.Q.A.1, ...","[1X69IIUXB1,E083MYD2P]"
4,1BROMSHRRA,EngageNY/Eureka Math (© by Great Minds®) *,Algebra I,Module 1 - Relationships Between Quantities an...,Topic A---Lesson 2: Graphs of Quadratic Functions,,"Exit Ticket---Algebra 1, M1, Lesson 2 (N.Q.1, ...",[2BLJ83JUIM]
5,520QV3Q8S,EngageNY/Eureka Math (© by Great Minds®) *,Algebra I,Module 1 - Relationships Between Quantities an...,Topic A---Lesson 2: Graphs of Quadratic Functions,,"Problem Set---Algebra I, M1, Lesson 2 (N.Q.A.1...","[29VD2UIJTI,1M74LB6G3J,1772ID2XLH,1J69K3FEKK,1..."
6,2FMEH9Y63M,EngageNY/Eureka Math (© by Great Minds®) *,Algebra I,Module 1 - Relationships Between Quantities an...,Topic A---Lesson 3: Graphs of Exponential Func...,,"Classwork---Algebra I, M1, Lesson 3 (N.Q.1, N....","[1JHWLIKQDC,1VEVZAR2XS,OB0DO2ZS,RKO1EOZBD]"
7,1RASYU5JGC,EngageNY/Eureka Math (© by Great Minds®) *,Algebra I,Module 1 - Relationships Between Quantities an...,Topic A---Lesson 3: Graphs of Exponential Func...,,"Exit Ticket---Algebra 1, M1, Lesson 3 (N.Q.1, ...","[87T4WGIEB,2D5DN5AD90]"
8,2MZEXEMAEN,EngageNY/Eureka Math (© by Great Minds®) *,Algebra I,Module 1 - Relationships Between Quantities an...,Topic A---Lesson 3: Graphs of Exponential Func...,,"Problem Set---Algebra I, M1, Lesson 3 (N.Q.A.1...","[2FP1R2REA1,166VN2L2PC,2KKXLO5S5Q,1W7WB4TMPB,X..."
9,IZ1NEEPQP,EngageNY/Eureka Math (© by Great Minds®) *,Algebra I,Module 1 - Relationships Between Quantities an...,Topic A---Lesson 4: Analyzing Graphs—Water Usa...,,"Classwork---Algebra I, M1, Lesson 4 (N.Q.A.1, ...","[1CNJYSTUEH,2KULLSIT0Y,1MZ1MDC6AK,H319LYQXU,2H..."


training_unit_test_scores (shape: (452439, 3))

Unique items per column:
    `assignment_log_id`: ['1CEASUAUQJ' '2IMKPEIL2Q' '2MZN9L748R' ... 'DC3HIJPC9' '624996B53'
 '2OMX9AY8J6']
    `problem_id`: ['18J6436AS5' '9RMI4CZU9' '8F4U5WWTV' ... '1VUW3WTLJE' 'SAI1LT0AQ'
 '1AJJUYE7LZ']
    `score`: [1 0]
Report:


Unnamed: 0,Column,Type,Missing,Missing%,Unique
0,assignment_log_id,object,0,0.00%,42343
1,problem_id,object,0,0.00%,1835
2,score,int64,0,0.00%,2


  Duplicates: 0
  (10) Sample Data:


Unnamed: 0,assignment_log_id,problem_id,score
0,1CEASUAUQJ,18J6436AS5,1
1,2IMKPEIL2Q,9RMI4CZU9,0
2,2IMKPEIL2Q,8F4U5WWTV,0
3,2IMKPEIL2Q,27D3I359NE,1
4,2IMKPEIL2Q,22DY4PFVMV,1
5,2IMKPEIL2Q,ZQMHFZJ53,1
6,2IMKPEIL2Q,1II2JVYEQV,0
7,2MZN9L748R,2N2SARA9Q6,1
8,2MZN9L748R,123M9UFYL2,0
9,2MZN9L748R,1WL078QSL4,0


## Apriori Association Rule Mining

### Apriori Analysis Utility Functions

In [None]:
def basket_summary( transactions_df: pd.DataFrame, output_dir: str = ProcessingConfig.results_vis_dir, plot_filename: str = "apriori_eda.png", 
                       top_n: int = 5, heatmap_top_k: int = 10, rules_df: pd.DataFrame | None = None ):
    
    print("Sample Transactions: ")
    display(transactions_df.head(top_n))
    
    assert 'items' in transactions_df.columns, "`items` column not found"
    os.makedirs(output_dir, exist_ok=True)

    baskets = transactions_df['items']
    n_transactions = len(baskets)
    all_items = [item for basket in baskets for item in basket]

    item_counts = Counter(all_items)
    item_support = {
        item: count / n_transactions
        for item, count in item_counts.items()
    }
    item_support = dict(
        sorted(item_support.items(), key=lambda x: x[1], reverse=True)
    )

    basket_sizes = baskets.apply(len)
    print(f"Average Baskets: {basket_sizes.mean()}")
    print("\nEDA:")
    print(f"- Top {top_n} items:")
    for item, s in list(item_support.items())[:top_n]:
        print(f"  • {item} ({s*100:.1f}%)")

    # Co-occurrence matrix (top-K items)
    top_items_k = [item for item, _ in list(item_support.items())[:heatmap_top_k]]
    idx = {item: i for i, item in enumerate(top_items_k)}

    co_matrix = np.zeros((heatmap_top_k, heatmap_top_k), dtype=int)

    for basket in baskets:
        present = [item for item in basket if item in idx]
        for i, j in combinations(present, 2):
            co_matrix[idx[i], idx[j]] += 1
            co_matrix[idx[j], idx[i]] += 1
        for i in present:
            co_matrix[idx[i], idx[i]] += 1

    co_matrix_pct = co_matrix / n_transactions

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))

    # Item support (bar)
    top_items = list(item_support.items())[:20]
    labels, values = zip(*top_items)

    ax = axes[0, 0]
    ax.bar(labels, np.array(values) * 100)
    ax.set_title("Item Support (Top 20)")
    ax.set_ylabel("Support (%)")
    ax.tick_params(axis='x', rotation=45)

    # Co-occurrence heatmap
    ax = axes[0, 1]
    im = ax.imshow(co_matrix_pct * 100)
    ax.set_xticks(range(heatmap_top_k))
    ax.set_yticks(range(heatmap_top_k))
    ax.set_xticklabels(top_items_k, rotation=45, ha="right")
    ax.set_yticklabels(top_items_k)
    ax.set_title("Item Co-occurrence (%)")
    fig.colorbar(im, ax=ax, fraction=0.046)

    # Support vs Confidence (bubble = lift)
    if rules_df is not None:
        ax = axes[1, 0]
        ax.scatter(
            rules_df["support"],
            rules_df["confidence"],
            s=rules_df["lift"] * 25,
            alpha=0.6
        )
        ax.set_xlabel("Support")
        ax.set_ylabel("Confidence")
        ax.set_title("Rules: Support vs Confidence (size = Lift)")
        ax.grid(True, linestyle="--", alpha=0.4)
    else:
        axes[1, 0].axis("off")

    # Support vs Lift (Apriori sweet spot)
    if rules_df is not None:
        ax = axes[1, 1]
        ax.scatter(
            rules_df["support"],
            rules_df["lift"],
            alpha=0.6
        )
        ax.set_xlabel("Support")
        ax.set_ylabel("Lift")
        ax.set_title("Rules: Support vs Lift")
        ax.axhline(1.0, linestyle="--", linewidth=1)
        ax.grid(True, linestyle="--", alpha=0.4)
    else:
        axes[1, 1].axis("off")

    plt.tight_layout()
    plot_path = os.path.join(output_dir, plot_filename)
    plt.savefig(plot_path)
    plt.close()

    print(f"\nCombined EDA plot saved: {plot_path}")


def run_apriori(encoded_df):
    frequent_itemsets = apriori(
        encoded_df,
        min_support=AnalysisConfig.min_support,
        use_colnames=True
    )

    frequent_itemsets = frequent_itemsets.sort_values(
        'support',
        ascending=False
    )

    rules = association_rules(
        frequent_itemsets,
        metric='confidence',
        min_threshold=AnalysisConfig.min_confidence
    )
    rules = rules[rules['lift'] >= AnalysisConfig.min_lift]

    rules = rules.sort_values(
        ['lift', 'confidence'],
        ascending=False
    )

    rules['antecedents'] = rules['antecedents'].apply(lambda x: ', '.join(list(x)))
    rules['consequents'] = rules['consequents'].apply(lambda x: ', '.join(list(x)))

    return frequent_itemsets, rules

def print_top_association_results(
    frequent_itemsets: pd.DataFrame,
    rules: pd.DataFrame,
    config,
    sort_itemsets_by: str = "support",
    sort_rules_by: tuple = ("lift", "confidence"),
    output_dir: str = ProcessingConfig.results_raw_dir,
    base_filename: str = "apriori",
):
    os.makedirs(output_dir, exist_ok=True)

    print("\n" + "=" * 60)
    print(f"TOP {config.top_n_rules} FREQUENT ITEMSETS")
    print("=" * 60)

    itemsets_df = frequent_itemsets.copy()
    itemsets_df["itemset"] = itemsets_df["itemsets"].apply(
        lambda x: " | ".join(sorted(x))
    )
    itemsets_df["itemset_size"] = itemsets_df["itemsets"].apply(len)

    top_itemsets = (
        itemsets_df
        .sort_values(sort_itemsets_by, ascending=False)
        .head(config.top_n_rules)
    )

    for _, row in top_itemsets.iterrows():
        print(f"{row['itemset']}  (support={row['support']:.3f})")

    print(
        f"From {len(itemsets_df)} itemsets "
        f"with support ≥ {config.min_support}"
    )

    print("\n" + "=" * 60)
    print(f"TOP {config.top_n_rules} ASSOCIATION RULES")
    print("=" * 60)
    
    rules_df = rules.copy()
    rules_df["antecedent_size"] = rules_df["antecedents"].apply(len)
    rules_df["consequent_size"] = rules_df["consequents"].apply(len)
    rules_df["rule_size"] = (
        rules_df["antecedent_size"] + rules_df["consequent_size"]
    )

    top_rules = (
        rules_df
        .sort_values(list(sort_rules_by), ascending=False)
        .head(config.top_n_rules)
    )

    for _, row in top_rules.iterrows():
        print(
            f"{row['antecedents']} -> {row['consequents']}  "
            f"(support={row['support']:.3f}, "
            f"confidence={row['confidence']:.3f}, "
            f"lift={row['lift']:.3f})"
        )

    itemsets_path = os.path.join(output_dir, f"{base_filename}_itemsets.csv")
    itemsets_df.sort_values(sort_itemsets_by, ascending=False).drop(columns=["itemsets"]).to_csv(itemsets_path, index=False)

    rules_path = os.path.join(output_dir, f"{base_filename}_rules.csv")
    rules_df.sort_values(list(sort_rules_by), ascending=False).to_csv(rules_path, index=False)

    print("\nSaved results:")
    print(f"  - Itemsets: {itemsets_path}")
    print(f"  - Rules: {rules_path}")

### 1. Student-problem interactions (behavior co-occurrence)

#### Data Cleaning

##### 'action_logs' dataframe

In [10]:
print("Preprocessing `action_logs`")
action_logs: pd.DataFrame = pd.read_csv(csv_files['action_logs'])
DROP_COLS = [
    "max_attempts",
    "score_viewable",
    "continuous_score_viewable",
    "hint_id",
    "explanation_id"
]
print(f"    Dropping columns: {DROP_COLS}")
action_logs = action_logs.drop(columns=DROP_COLS)

KEEP_ACTIONS = [
    "problem_started",
    "wrong_response",
    "correct_response",
    "hint_requested",
    "explanation_requested",
    "answer_requested",
    "problem_finished"
]
action_logs = action_logs[action_logs["action"].isin(KEEP_ACTIONS)]
print(f"    Removed rows with `action` not in: {KEEP_ACTIONS}")

# Sort for sequence analysis
action_logs = action_logs.sort_values(
    ["assignment_log_id", "timestamp"]
)

df_exploration("action_logs", action_logs)
print(f"    Saving dataframe...")
save_dataframe("action_logs", action_logs)

print(f"    Cleaning...")
del action_logs, KEEP_ACTIONS, DROP_COLS
gc.collect()

Preprocessing `action_logs`
    Dropping columns: ['max_attempts', 'score_viewable', 'continuous_score_viewable', 'hint_id', 'explanation_id']
    Removed rows with `action` not in: ['problem_started', 'wrong_response', 'correct_response', 'hint_requested', 'explanation_requested', 'answer_requested', 'problem_finished']
action_logs (shape: (16252841, 5))

Unique items per column:
    `assignment_log_id`: ['1000AQM2VK' '1000PGZ66S' '1000REVJ68' ... 'ZZZBBDDJD' 'ZZZCJIY46'
 'ZZZYX8591']
    `timestamp`: [1.61834456e+09 1.61834459e+09 1.61834459e+09 ... 1.61521063e+09
 1.61521071e+09 1.61521071e+09]
    `problem_id`: ['1IEH49XWH5' '27YYTVQK6K' '2K9KDM1BB5' ... '1VCS0UUSIR' '2L4QWWQZN8'
 '27VCBJOL0B']
    `available_core_tutoring`: ['answer' nan 'no_tutoring' 'hint' 'explanation']
    `action`: ['problem_started' 'correct_response' 'problem_finished' 'wrong_response'
 'answer_requested' 'explanation_requested' 'hint_requested']
Report:


Unnamed: 0,Column,Type,Missing,Missing%,Unique
0,assignment_log_id,object,0,0.00%,638201
1,timestamp,float64,0,0.00%,16241543
2,problem_id,object,0,0.00%,57360
3,available_core_tutoring,object,11006981,67.72%,5
4,action,object,0,0.00%,7


  Duplicates: 0
  (10) Sample Data:


Unnamed: 0,assignment_log_id,timestamp,problem_id,available_core_tutoring,action
6144448,1000AQM2VK,1618345000.0,1IEH49XWH5,answer,problem_started
6144449,1000AQM2VK,1618345000.0,1IEH49XWH5,,correct_response
6144450,1000AQM2VK,1618345000.0,1IEH49XWH5,,problem_finished
6144452,1000AQM2VK,1618345000.0,27YYTVQK6K,no_tutoring,problem_started
6144454,1000AQM2VK,1618345000.0,27YYTVQK6K,,problem_finished
6144456,1000AQM2VK,1618345000.0,2K9KDM1BB5,answer,problem_started
6144457,1000AQM2VK,1618345000.0,2K9KDM1BB5,,correct_response
6144458,1000AQM2VK,1618345000.0,2K9KDM1BB5,,problem_finished
6144460,1000AQM2VK,1618345000.0,1HES7DVPEF,no_tutoring,problem_started
6144462,1000AQM2VK,1618345000.0,1HES7DVPEF,,problem_finished


    Saving dataframe...
    Cleaning...


896

##### 'problem_details' dataframe

In [11]:
problem_details: pd.DataFrame = pd.read_csv(csv_files['problem_details'])
PROB_BOOL_COLS = ['problem_contains_image', 'problem_contains_equation', 'problem_contains_video']
print(f"    Filling missing boolean columns `{PROB_BOOL_COLS}` with 0.")
for col in PROB_BOOL_COLS:
    problem_details[col] = problem_details[col].fillna(0).astype(np.int8)

for col in ['problem_skill_code', 'problem_skill_description']:
    problem_details[col] = problem_details[col].fillna('Unknown')

print(f"    Dropping unused columns...")
DROP_COLS = [
    "problem_text_bert_pca",
    "problem_multipart_id",
    "problem_multipart_position",
    "problem_skill_description",
]
problem_details = problem_details.drop(columns=DROP_COLS)

df_exploration('problem_details', problem_details)
print(f"    Saving dataframe...")
save_dataframe('problem_details',problem_details)

print(f"    Cleaning...")
del problem_details, PROB_BOOL_COLS, DROP_COLS
gc.collect()

    Filling missing boolean columns `['problem_contains_image', 'problem_contains_equation', 'problem_contains_video']` with 0.
    Dropping unused columns...
problem_details (shape: (132738, 6))

Unique items per column:
    `problem_id`: ['10MFND3HAJ' 'IH3MOE7AF' '14YC7CEE2N' ... '16PQQU7TL4' 'HIGDEU75J'
 '1A23RM90NL']
    `problem_type`: ['Multiple Choice' 'Ungraded Open Response' 'Number'
 'Algebraic Expression' 'Numeric Expression' 'Check All That Apply'
 'Exact Match (ignore case)' 'Exact Fraction'
 'Exact Match (case sensitive)' 'Ordering']
    `problem_skill_code`: ['6.RP.A.3b' '6.RP.A.2' '6.RP.A.3a' '6.RP.A.1' '7.RP.A.2a' '8.EE.A.1-1'
 '8.EE.A.1-2' '8.EE.A.1-3' '8.EE.A.3' '8.EE.A.4' '7.RP.A.2b' '7.RP.A.2c'
 '6.EE.C.9-2' '7.RP.A.2d' '7.RP.A.1' '6.NS.A.1' '6.NS.B.3-3' '7.RP.A.3'
 '5.NF.B.4a-2' '8.G.A.1b' '8.G.A.1a' '8.G.A.1c' '7.G.A.1' '5.G.A.2'
 '8.G.A.2' '7.NS.A.1a' '7.NS.A.1b' '7.NS.A.3-1' '7.NS.A.1d-1' '8.G.A.5-1'
 '8.G.A.5-2' '8.G.B.7' '7.EE.B.4a-1' '7.NS.A.1c' '7.NS.A.1d-3

Unnamed: 0,Column,Type,Missing,Missing%,Unique
0,problem_id,object,0,0.00%,132738
1,problem_type,object,0,0.00%,10
2,problem_skill_code,object,0,0.00%,542
3,problem_contains_image,int8,0,0.00%,2
4,problem_contains_equation,int8,0,0.00%,2
5,problem_contains_video,int8,0,0.00%,2


  Duplicates: 0
  (10) Sample Data:


Unnamed: 0,problem_id,problem_type,problem_skill_code,problem_contains_image,problem_contains_equation,problem_contains_video
0,10MFND3HAJ,Multiple Choice,6.RP.A.3b,0,0,1
1,IH3MOE7AF,Multiple Choice,6.RP.A.3b,0,0,0
2,14YC7CEE2N,Ungraded Open Response,6.RP.A.3b,0,0,0
3,16L5KQWLN7,Ungraded Open Response,6.RP.A.3b,0,0,0
4,BU0LO0LDD,Ungraded Open Response,6.RP.A.3b,0,0,0
5,W9WPQSAU5,Ungraded Open Response,6.RP.A.3b,0,0,0
6,2OHCH5C5BD,Number,6.RP.A.2,0,0,0
7,9CB1OILA2,Ungraded Open Response,6.RP.A.3a,0,0,0
8,1JCPX2ZOXQ,Ungraded Open Response,6.RP.A.3a,0,0,0
9,AANYMYPL6,Multiple Choice,6.RP.A.2,0,0,0


    Saving dataframe...
    Cleaning...


0

#### Feature Engineering

##### Aggregation Function

In [12]:
def aggregate_attempts(group, assignment_log_id, problem_id):
    if not (group['action'] == 'problem_finished').any():
        return None

    started = group.loc[group['action'] == 'problem_started', 'timestamp'].min()
    finished = group.loc[group['action'] == 'problem_finished', 'timestamp'].max()

    if pd.isna(started) or pd.isna(finished):
        return None

    time_spent = finished - started

    hint_count = (group['action'] == 'hint_requested').sum()
    wrong_count = (group['action'] == 'wrong_response').sum()
    correct_count = (group['action'] == 'correct_response').sum()
    answer_requested = (group['action'] == 'answer_requested').any()
    explanation_requested = (group['action'] == 'explanation_requested').any()

    if correct_count > 0:
        if hint_count == 0 and wrong_count == 0 and not answer_requested:
            final_outcome = 'correct_first_try'
        else:
            final_outcome = 'correct_after_help'
    else:
        final_outcome = 'gave_up'

    return {
        'assignment_log_id': assignment_log_id,
        'problem_id': problem_id,
        'hint_count': hint_count,
        'wrong_count': wrong_count,
        'answer_requested': answer_requested,
        'explanation_requested': explanation_requested,
        'time_spent': time_spent,
        'final_outcome': final_outcome
    }

def get_resume_state(temp_dir, chunk_size):
    files = glob.glob(f"{temp_dir}/problem_attempts_part_*.parquet")
    if not files:
        return 0, 0

    indices = [
        int(re.search(r"part_(\d+)\.parquet", f).group(1))
        for f in files
    ]

    last_chunk_idx = max(indices)
    resume_row = (last_chunk_idx + 1) * chunk_size

    print(f"Resuming from chunk {last_chunk_idx + 1}, row {resume_row:,}")
    return resume_row, last_chunk_idx + 1

##### Feature Engineering Process w/ Chunking

In [None]:
print("Loading action_log dataframe...")
action_logs = load_dataframe('action_logs')

action_logs['timestamp'] = pd.to_datetime(action_logs['timestamp'])
action_logs['action'] = action_logs['action'].astype('category')

print("Sorting action logs...")
action_logs = action_logs.sort_values(
    ['assignment_log_id', 'problem_id', 'timestamp']
).reset_index(drop=True)

chunk_size = 1_000_000
temp_dir = str(ProcessingConfig.analysis_df_dir / "temp_action_problem")
start_row, chunk_idx = get_resume_state(temp_dir, chunk_size)
buffer = pd.DataFrame()

print("Starting chunked aggregation...")
for start in range(start_row, len(action_logs), chunk_size):
    end = min(start + chunk_size, len(action_logs))
    print(f"Processing rows {start:,} → {end:,}")

    chunk = action_logs.iloc[start:end]
    chunk = pd.concat([buffer, chunk], ignore_index=True)

    grouped = chunk.groupby(
        ['assignment_log_id', 'problem_id'],
        sort=False
    )

    keys = list(grouped.groups.keys())

    if len(keys) == 1:
        buffer = grouped.get_group(keys[0]).copy()
        continue

    complete_keys = keys[:-1]
    carry_key = keys[-1]

    results = []

    for assignment_log_id, problem_id in complete_keys:
        group = grouped.get_group((assignment_log_id, problem_id))
        out = aggregate_attempts(group, assignment_log_id, problem_id)
        if out is not None:
            results.append(out)

    if results:
        out_df = pd.DataFrame(results)
        out_path = f"{temp_dir}/problem_attempts_part_{chunk_idx:05d}.parquet"
        out_df.to_parquet(out_path, index=False)
        print(f"Saved {len(out_df):,} rows → {out_path}")
        chunk_idx += 1

    buffer = grouped.get_group(carry_key).copy()

    del chunk, grouped, results
    gc.collect()

print("Finalizing remaining buffer...")
final_results = []
grouped = buffer.groupby(
    ['assignment_log_id', 'problem_id'],
    sort=False
)
for assignment_log_id, problem_id in grouped.groups:
    group = grouped.get_group((assignment_log_id, problem_id))
    out = aggregate_attempts(group, assignment_log_id, problem_id)
    if out is not None:
        final_results.append(out)
if final_results:
    final_df = pd.DataFrame(final_results)
    out_path = f"{temp_dir}/problem_attempts_part_{chunk_idx:05d}.parquet"
    final_df.to_parquet(out_path, index=False)
    print(f"Saved final {len(final_df):,} rows")

del action_logs
del buffer
gc.collect()

print("Merging chunk files...")
chunk_files = sorted(glob.glob(f"{temp_dir}/problem_attempts_part_*.parquet"))
problem_attempts = pd.concat(
    (pd.read_parquet(f) for f in chunk_files),
    ignore_index=True
)

print(f"Total problem attempts: {len(problem_attempts):,}")
print("Loading problem details...")
problem_details = load_dataframe("problem_details")

action_problem_df = problem_attempts.merge(
    problem_details,
    on='problem_id',
    how='left'
)
save_dataframe("action_problem", action_problem_df)
print("Saved final action_problem dataframe")

print("Cleaning up temporary files...")
for f in chunk_files:
    os.remove(f)
os.rmdir(temp_dir)

del problem_attempts
del problem_details
del action_problem_df
del final_results
if 'final_df' in locals():
    del final_df
gc.collect()

print("Cleanup complete. Ready for next cell.")


Loading action_log dataframe...
Sorting action logs...
Starting chunked aggregation...
Processing rows 0 → 1,000,000
Saved 316,638 rows → problem_attempt_chunks/problem_attempts_part_00000.parquet
Processing rows 1,000,000 → 2,000,000
Saved 315,762 rows → problem_attempt_chunks/problem_attempts_part_00001.parquet
Processing rows 2,000,000 → 3,000,000
Saved 316,614 rows → problem_attempt_chunks/problem_attempts_part_00002.parquet
Processing rows 3,000,000 → 4,000,000
Saved 316,497 rows → problem_attempt_chunks/problem_attempts_part_00003.parquet
Processing rows 4,000,000 → 5,000,000
Saved 315,905 rows → problem_attempt_chunks/problem_attempts_part_00004.parquet
Processing rows 5,000,000 → 6,000,000
Saved 316,039 rows → problem_attempt_chunks/problem_attempts_part_00005.parquet
Processing rows 6,000,000 → 7,000,000
Saved 316,684 rows → problem_attempt_chunks/problem_attempts_part_00006.parquet
Processing rows 7,000,000 → 8,000,000
Saved 315,990 rows → problem_attempt_chunks/problem_attem

#### Transactions Creation

In [10]:
def action_problem_transactions(transactions):
    transactions = transactions.copy()

    transactions['hints'] = pd.cut(
        transactions['hint_count'],
        bins=[-0.1, 0, 2, np.inf],
        labels=['no_hints', 'few_hints', 'many_hints']
    )

    transactions['wrongs'] = pd.cut(
        transactions['wrong_count'],
        bins=[-0.1, 0, 2, np.inf],
        labels=['no_wrongs', 'few_wrongs', 'many_wrongs']
    )

    time_seconds = transactions['time_spent'].dt.total_seconds()

    q1, q2 = time_seconds.quantile([0.33, 0.66])

    transactions['time'] = pd.cut(
        time_seconds,
        bins=[-1, q1, q2, np.inf],
        labels=['fast', 'medium', 'slow']
    )


    transactions['answer'] = np.where(
        transactions['answer_requested'] == True,
        'answer_req',
        'no_answer'
    )

    transactions['explanation'] = np.where(
        transactions['explanation_requested'] == True,
        'explanation_req',
        'no_explanation'
    )

    if 'problem_type' in transactions.columns:
        type_mapping = {
            'Multiple Choice': 'mc',
            'Number': 'numeric',
            'Algebraic Expression': 'algebra',
            'Numeric Expression': 'numeric',
            'Ungraded Open Response': 'open',
            'Check All That Apply': 'mc',
            'Exact Match (ignore case)': 'text',
            'Exact Fraction': 'fraction',
            'Exact Match (case sensitive)': 'text',
            'Ordering': 'ordering'
        }
        transactions['prob_type'] = transactions['problem_type'].map(type_mapping).fillna('other')
    else:
        transactions['prob_type'] = 'unknown'

    transactions['transaction_id'] = (
        transactions['assignment_log_id'].astype(str) + '_' +
        transactions['problem_id'].astype(str)
    )

    item_columns = [
        'hints', 'wrongs', 'time', 'answer',
        'explanation', 'final_outcome', 'prob_type'
    ]

    transactions['items'] = transactions[item_columns].apply(
        lambda row: [str(item) for item in row if pd.notna(item)],
        axis=1
    )

    return transactions[['transaction_id', 'items']]

action_problem_df = load_dataframe('action_problem')
action_problem_t = action_problem_transactions(action_problem_df)
save_dataframe("action_problem_transactions", action_problem_t)
del action_problem_t, action_problem_df
gc.collect()

24

#### Transactions Encoding

In [None]:
action_problem_t = load_dataframe("action_problem_transactions")
transactions = action_problem_t['items'].tolist()

te = TransactionEncoder()
te_array = te.fit(transactions).transform(transactions)

encoded_df = pd.DataFrame(
    te_array,
    columns=te.columns_
)

display(encoded_df)
save_dataframe("action_problem_encoded", encoded_df)
del action_problem_t, transactions, te, te_array, 
gc.collect()

Unnamed: 0,algebra,answer_req,correct_after_help,correct_first_try,explanation_req,fast,few_hints,few_wrongs,fraction,gave_up,many_hints,many_wrongs,mc,medium,no_answer,no_explanation,no_hints,no_wrongs,numeric,open,ordering,other,slow,text
0,False,True,True,False,False,False,False,True,False,False,False,False,False,True,False,True,True,False,True,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,True,True,True,False,True,False,False,True,False
2,False,False,False,False,False,True,False,False,False,True,False,False,False,False,True,True,True,True,False,True,False,False,False,False
3,False,True,True,False,False,True,False,True,False,False,False,False,False,False,False,True,True,False,True,False,False,False,False,False
4,False,False,False,False,False,True,False,False,False,True,False,False,False,False,True,True,True,True,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5140884,False,False,False,True,False,True,False,False,False,False,False,False,True,False,True,True,True,True,False,False,False,False,False,False
5140885,True,False,True,False,False,False,False,True,False,False,False,False,False,True,True,True,True,False,False,False,False,False,False,False
5140886,True,True,True,False,False,True,False,True,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False
5140887,False,False,False,True,False,True,False,False,False,False,False,False,True,False,True,True,True,True,False,False,False,False,False,False


6248

#### Top Frequent Items and Rules

In [5]:
print("Loading encoddings...")
encoded_df = load_dataframe('action_problem_encoded')
print("Running Apriori Algorithm...")
freq_items1, rules1 = run_apriori(encoded_df)
del encoded_df
gc.collect()

print_top_association_results(
    frequent_itemsets=freq_items1,
    rules = rules1,
    config=AnalysisConfig,
    base_filename="A1_analysis"
)

Loading encoddings...
Running Apriori Algorithm...

TOP 10 FREQUENT ITEMSETS
no_explanation  (support=0.996)
no_hints  (support=0.990)
no_explanation | no_hints  (support=0.986)
no_answer  (support=0.884)
no_answer | no_explanation  (support=0.881)
no_answer | no_hints  (support=0.879)
no_answer | no_explanation | no_hints  (support=0.876)
no_wrongs  (support=0.828)
no_explanation | no_wrongs  (support=0.827)
no_hints | no_wrongs  (support=0.825)
From 557 itemsets with support ≥ 0.05

TOP 10 ASSOCIATION RULES
correct_after_help, no_hints, no_explanation, no_answer -> few_wrongs  (support=0.083, confidence=0.919, lift=6.509)
correct_after_help, no_hints, no_answer -> few_wrongs  (support=0.084, confidence=0.913, lift=6.467)
correct_after_help, no_hints, no_answer -> few_wrongs, no_explanation  (support=0.083, confidence=0.899, lift=6.442)
correct_after_help, no_explanation, no_answer -> no_hints, few_wrongs  (support=0.083, confidence=0.867, lift=6.323)
no_hints, few_wrongs -> correct_a

#### EDA and Visualizations

In [6]:
transactions_df = load_dataframe("action_problem_transactions")
basket_summary(
    transactions_df,
    rules_df=rules1,
    top_n=5,
    output_dir=ProcessingConfig.results_vis_dir,
    plot_filename="A1_results_analysis.png"
)
del transactions_df, freq_items1, rules1
gc.collect()


Average Baskets: 7.0

EDA:
- Top 5 items:
  • no_explanation (99.6%)
  • no_hints (99.0%)
  • no_answer (88.4%)
  • no_wrongs (82.8%)
  • fast (55.9%)

Combined EDA plot saved: ../results/vis/A1_results_analysis.png


1676

### 2. Student-unit aggregations (success/failure prediction)

#### Data Loading

In [None]:
unit_test_scores: pd.DataFrame = pd.read_csv(csv_files['training_unit_test_scores'])
save_dataframe("unit_test_scores", unit_test_scores)
del unit_test_scores
gc.collect()

assignment_relationships: pd.DataFrame = pd.read_csv(csv_files['assignment_relationships'])
save_dataframe("assignment_relationships", assignment_relationships)
del assignment_relationships
gc.collect()

0

#### Feature Engineering

##### Aggregation Function

In [50]:
def aggregate_assignment_behaviors(actions):
    actions = actions.copy()

    if len(actions) == 0:
        return None
    
    hint_count = (actions['action'] == 'hint_requested').sum()
    wrong_count = (actions['action'] == 'wrong_response').sum()
    correct_count = (actions['action'] == 'correct_response').sum()
    answer_count = (actions['action'] == 'answer_requested').sum()
    explanation_count = (actions['action'] == 'explanation_requested').sum()
    problems_started = (actions['action'] == 'problem_started').sum()
    problems_finished = (actions['action'] == 'problem_finished').sum()

    total_problems = max(problems_started, 1)

    completion_rate = problems_finished / total_problems
    help_total = hint_count + explanation_count + answer_count
    help_per_problem = help_total / total_problems
    wrong_per_problem = wrong_count / total_problems


    struggle_ratio = (
        wrong_count / (correct_count + 1)
        if correct_count >= 0 else 0
    )

    help_to_wrong_ratio = (
        help_total / (wrong_count + 1)
        if wrong_count >= 0 else 0
    )

    effort_intensity = (
        (help_total + wrong_count + correct_count) / total_problems
    )

    return pd.Series({
        'problems_completed': problems_finished,
        'completion_rate': completion_rate,
        'help_per_problem': help_per_problem,
        'wrong_per_problem': wrong_per_problem,
        'hint_count': hint_count,
        'answer_count': answer_count,
        'struggle_ratio': struggle_ratio,
        'help_to_wrong_ratio': help_to_wrong_ratio,
        'effort_intensity': effort_intensity
    })


##### Feature Engineering Process w/ Chunking

In [None]:
# Unit -> in-unit mapping
unit_tests = load_dataframe("unit_test_scores")

# Balance the correct and wrong in the dataset
score_counts = unit_tests['score'].value_counts()
min_count = score_counts.min()
balanced_unit_tests = (
    unit_tests
    .groupby('score', group_keys=False)
    .sample(n=min_count, random_state=42)
)
unit_tests = balanced_unit_tests.reset_index(drop=True)
del balanced_unit_tests
gc.collect()

unit_test_scores = unit_tests.groupby(
    'assignment_log_id'
).agg({
    'score': ['mean', 'sum', 'count']
}).reset_index()
unit_test_scores.columns = [
    'assignment_log_id',
    'avg_score',
    'total_correct',
    'total_problems'
]
print(f"  Found {len(unit_test_scores):,} unit tests")

assignment_relationships = load_dataframe("assignment_relationships")
unit_with_inunit = assignment_relationships.merge(
    unit_test_scores,
    left_on='unit_test_assignment_log_id',
    right_on='assignment_log_id',
    how='inner'
)
print(f"  Found {len(unit_with_inunit):,} unit test <-> in-unit assignment pairs")
del unit_tests, assignment_relationships, unit_test_scores
gc.collect()

in_unit_assignments = (
    unit_with_inunit['in_unit_assignment_log_id']
    .drop_duplicates()
    .to_numpy()
)

print(f"  Processing {len(in_unit_assignments):,} unique in-unit assignments...")
batch_size = 10_000
flush_size = 50_000
num_workers = 5
temp_dir = ProcessingConfig.analysis_df_dir / "temp_in_unit"
temp_dir.mkdir(parents=True, exist_ok=True)
existing_parts = sorted(temp_dir.glob("in_unit_behaviors_part_*.parquet"))
chunk_idx = len(existing_parts)
start_idx = chunk_idx * flush_size

print(f"Resuming from assignment index {start_idx:,}")
print(f"Existing chunks: {chunk_idx}")
action_logs = load_dataframe("action_logs")
grouped_logs = action_logs.groupby("assignment_log_id", sort=False)
del action_logs
gc.collect()

def process_assignment(assignment_id):
    try:
        group = grouped_logs.get_group(assignment_id)
    except KeyError:
        return None
    behavior = aggregate_assignment_behaviors(group)
    if behavior is None:
        return None
    behavior["in_unit_assignment_log_id"] = assignment_id
    return behavior

buffer = []
print("Starting in-unit behavior aggregation...")
with ThreadPoolExecutor(max_workers=num_workers) as executor:
    for i in range(start_idx, len(in_unit_assignments), batch_size):
        batch = in_unit_assignments[i:i + batch_size]
        if i % 50_000 == 0:
            print(f"Processed {i:,} / {len(in_unit_assignments):,} assignments")

        for behavior in executor.map(process_assignment, batch):
            if behavior is not None:
                buffer.append(behavior)

        if len(buffer) >= flush_size:
            out_df = pd.DataFrame(buffer)
            out_path = temp_dir / f"in_unit_behaviors_part_{chunk_idx:05d}.parquet"
            out_df.to_parquet(out_path, index=False)
            print(f"Saved {len(out_df):,} rows → {out_path.name}")
            chunk_idx += 1
            buffer.clear()
            del out_df
            gc.collect()

# Check if there are items left in the buffer
if buffer:
    out_df = pd.DataFrame(buffer)
    out_path = temp_dir / f"in_unit_behaviors_part_{chunk_idx:05d}.parquet"
    out_df.to_parquet(out_path, index=False)
    print(f"Saved final {len(out_df):,} rows → {out_path.name}")
    buffer.clear()
    del out_df
    gc.collect()

print("Merging in-unit behavior chunks...")
chunk_files = sorted(temp_dir.glob("in_unit_behaviors_part_*.parquet"))
in_unit_df = pd.concat(
    (pd.read_parquet(f) for f in chunk_files),
    ignore_index=True
)
print(f"Aggregated behaviors for {len(in_unit_df):,} in-unit assignments")
save_dataframe("in_unit_behaviors", in_unit_df)
del grouped_logs
print("Saved final in_unit_behaviors dataframe")

# Join behaviors with relationships
unit_with_behaviors = unit_with_inunit.merge(
    in_unit_df,
    on='in_unit_assignment_log_id',
    how='left'
)
print("Cleaning temp files...")
del in_unit_df
gc.collect()
for f in chunk_files:
    f.unlink()
temp_dir.rmdir()
gc.collect()

print("Starting unit test aggregation...")
unit_test_agg = unit_with_behaviors.groupby('unit_test_assignment_log_id').agg({
    'avg_score': 'first',
    'total_correct': 'first',
    'total_problems': 'first',
    'problems_completed': 'mean',
    'completion_rate': 'mean',
    'help_per_problem': 'mean',
    'wrong_per_problem': 'mean',
    'struggle_ratio': 'mean',
    'help_to_wrong_ratio': 'mean',
    'effort_intensity': 'mean',
    'in_unit_assignment_log_id': 'count'
}).reset_index()
del unit_with_behaviors
gc.collect()

unit_test_agg.columns = [
    'assignment_log_id', 'unit_test_score', 'total_correct', 'total_problems',
    'avg_problems_completed', 'avg_completion_rate', 'avg_help_seeking', 'avg_wrong_attempts',
    'avg_struggle_ratio', 'avg_help_to_wrong_ratio', 'avg_effort_intensity', 'num_in_unit_assignments'
]
print(f"  Aggregated {len(unit_test_agg):,} unit tests")
save_dataframe('unit_test_df', unit_test_agg)
del unit_test_agg
gc.collect()

Original score distribution:
score
1    264919
0    187520
Name: count, dtype: int64
Balanced score distribution:
score
0    187520
1    187520
Name: count, dtype: int64
  Found 42,244 unit tests
  Found 532,013 unit test <-> in-unit assignment pairs
  Processing 484,881 unique in-unit assignments...
Resuming from assignment index 0
Existing chunks: 0
Starting in-unit behavior aggregation...
Processed 0 / 484,881 assignments
Processed 50,000 / 484,881 assignments
Saved 59,974 rows → in_unit_behaviors_part_00000.parquet
Processed 100,000 / 484,881 assignments
Saved 59,978 rows → in_unit_behaviors_part_00001.parquet
Processed 150,000 / 484,881 assignments
Saved 59,968 rows → in_unit_behaviors_part_00002.parquet
Processed 200,000 / 484,881 assignments
Saved 59,978 rows → in_unit_behaviors_part_00003.parquet
Processed 250,000 / 484,881 assignments
Saved 59,948 rows → in_unit_behaviors_part_00004.parquet
Processed 300,000 / 484,881 assignments
Processed 350,000 / 484,881 assignments
Saved 5

0

##### Transaction Creation

In [None]:
def student_unit_transactions(transactions):
    transactions = transactions.copy()

    transactions['score_cat'] = pd.cut(
        transactions['unit_test_score'],
        bins=[-0.1, 0.5, 1.01],
        labels=['low_score', 'high_score']
    )

    transactions['help_cat'] = pd.qcut(
        transactions['avg_help_seeking'],
        q=[0, .7, .9, 1],
        labels=['typical_help', 'high_help', 'very_high_help']
    )

    transactions['wrong_cat'] = pd.cut(
        transactions['avg_wrong_attempts'],
        bins=[-0.1, 1, 2, np.inf],
        labels=['low_wrongs', 'med_wrongs', 'high_wrongs']
    )
    
    transactions['completion_cat'] = pd.cut(
        transactions['avg_completion_rate'],
        bins=[-0.1, 0.7, 1.01],
        labels=['low_completion', 'high_completion']
    )
    
    transactions['problems_cat'] = pd.cut(
        transactions['avg_problems_completed'],
        bins=[-0.1, 5, 10, np.inf],
        labels=['few_problems', 'med_problems', 'many_problems']
    )
    
    transactions['struggle_cat'] = pd.qcut(
        transactions['avg_struggle_ratio'],
        q=[0, .7, .9, 1],
        labels=['low_struggle', 'med_struggle', 'high_struggle']
    )

    transactions['effort_cat'] = pd.qcut(
        transactions['avg_effort_intensity'],
        q=[0, .7, .9, 1],
        labels=['low_effort', 'med_effort', 'high_effort']
    )
    
    transactions['transaction_id'] = transactions['assignment_log_id']
    
    item_columns = [
        'score_cat',
        'help_cat',
        'wrong_cat',
        'completion_cat',
        'problems_cat',
        'struggle_cat',
        'effort_cat'
    ]

    transactions['items'] = transactions[item_columns].apply(
        lambda row: [str(item) for item in row if pd.notna(item) and str(item) != 'nan'],
        axis=1
    )
    
    result = transactions[['transaction_id', 'items']].copy()

    return result

unit_test_df = load_dataframe('unit_test_df')
unit_test_t = student_unit_transactions(unit_test_df)
save_dataframe('unit_test_transactions', unit_test_t)
del unit_test_df, unit_test_t
gc.collect()

count   42244.0000
mean        0.4535
std         0.3184
min         0.0000
25%         0.1818
50%         0.4545
75%         0.7143
max         1.0000
Name: unit_test_score, dtype: float64
unit_test_score
False    21644
True     20600
Name: count, dtype: int64


0

#### Transactions Encoding

In [60]:
unit_test_t = load_dataframe("unit_test_transactions")
transactions = unit_test_t['items'].tolist()

te = TransactionEncoder()
te_array = te.fit(transactions).transform(transactions)

encoded_df = pd.DataFrame(
    te_array,
    columns=te.columns_
)

display(encoded_df)
save_dataframe("unit_test_encoded", encoded_df)
del unit_test_t, transactions, te, te_array, 
gc.collect()

Unnamed: 0,few_problems,high_completion,high_effort,high_help,high_score,high_struggle,high_wrongs,low_completion,low_effort,low_score,low_struggle,low_wrongs,many_problems,med_effort,med_problems,med_struggle,med_wrongs,typical_help,very_high_help
0,True,True,False,False,True,False,False,False,True,False,True,True,False,False,False,False,False,True,False
1,False,True,False,False,False,False,False,False,False,True,True,True,False,True,True,False,False,False,True
2,True,True,False,False,False,False,False,False,True,True,True,True,False,False,False,False,False,True,False
3,False,True,False,True,False,False,False,False,True,True,True,True,True,False,False,False,False,False,False
4,False,True,False,True,False,False,False,False,False,True,False,True,True,True,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42239,True,True,False,False,False,False,False,False,True,True,False,True,False,False,False,True,False,True,False
42240,False,True,False,False,False,False,False,False,True,True,True,True,False,False,True,False,False,True,False
42241,False,True,False,False,False,False,False,False,True,True,False,True,False,False,True,True,False,True,False
42242,False,False,False,False,True,False,False,True,True,False,True,True,True,False,False,False,False,True,False


5518

#### Top Frequent Items and Rules

In [22]:
print("Loading encoddings...")
encoded_df = load_dataframe('unit_test_encoded')
print("Running Apriori Algorithm...")
freq_items2, rules2 = run_apriori(encoded_df)
del encoded_df
gc.collect()

print_top_association_results(
    frequent_itemsets=freq_items2,
    rules = rules2,
    config=AnalysisConfig,
    base_filename="A2_analysis"
)

Loading encoddings...
Running Apriori Algorithm...

TOP 10 FREQUENT ITEMSETS
low_wrongs  (support=0.959)
high_completion  (support=0.869)
high_completion | low_wrongs  (support=0.836)
low_effort | low_wrongs  (support=0.700)
low_effort  (support=0.700)
low_struggle  (support=0.700)
low_struggle | low_wrongs  (support=0.700)
typical_help  (support=0.700)
low_wrongs | typical_help  (support=0.689)
low_effort | typical_help  (support=0.614)
From 608 itemsets with support ≥ 0.05

TOP 10 ASSOCIATION RULES
low_wrongs, low_struggle, low_completion, typical_help -> low_effort, few_problems  (support=0.062, confidence=0.803, lift=3.258)
low_struggle, low_completion, typical_help -> low_wrongs, low_effort, few_problems  (support=0.062, confidence=0.803, lift=3.258)
low_struggle, low_completion, typical_help -> low_effort, few_problems  (support=0.062, confidence=0.803, lift=3.258)
low_struggle, low_completion, low_score -> low_effort, few_problems  (support=0.056, confidence=0.794, lift=3.221)
l

#### EDA and Visualizations

In [14]:
transactions_df = load_dataframe("unit_test_transactions")

basket_summary(
    transactions_df,
    rules_df=rules2,
    top_n=5,
    output_dir=ProcessingConfig.results_vis_dir,
    plot_filename="A2_results_analysis.png"
)

del transactions_df, freq_items2, rules2
gc.collect()

Average Baskets: 6.999502887984092

EDA:
- Top 5 items:
  • low_wrongs (95.9%)
  • high_completion (86.9%)
  • low_struggle (70.0%)
  • low_effort (70.0%)
  • typical_help (70.0%)

Combined EDA plot saved: ../results/vis/A2_results_analysis.png


1792

### 3. Student help-seeking patterns (mastery analysis)

In [63]:
sequence_details = pd.read_csv(ProcessingConfig.dataset_folder / 'sequence_details.csv') # data_frames['sequence_details'] if data_frames and 'sequence_details' in data_frames.columns else
sequence_details = sequence_details[['sequence_id', 'sequence_folder_path_level_2', 'sequence_folder_path_level_3']]
save_dataframe('sequence_details', sequence_details)
del sequence_details
gc.collect()

232351

#### Feature Engineering

##### Aggregation Function

In [15]:
HELP_ACTIONS = ['hint_requested', 'explanation_requested', 'answer_requested']
def aggregate_help_seeking(group):
    group = group.sort_values('timestamp')
    
    hint_count = (group['action'] == 'hint_requested').sum()
    explanation_count = (group['action'] == 'explanation_requested').sum()
    answer_count = (group['action'] == 'answer_requested').sum()
    
    problems_started = (group['action'] == 'problem_started').sum()
    problems_finished = (group['action'] == 'problem_finished').sum()
    correct_count = (group['action'] == 'correct_response').sum()
    wrong_count = (group['action'] == 'wrong_response').sum()
    
    if problems_started == 0:
        return None
    
    hints_per_problem = hint_count / problems_started
    help_total = hint_count + explanation_count + answer_count
    help_per_problem = help_total / problems_started
    
    first_wrong_idx = group[group['action'] == 'wrong_response'].index.min() if wrong_count > 0 else np.inf
    first_help_idx = group[group['action'].isin(HELP_ACTIONS)].index.min()
    wrong_before_help = first_wrong_idx < first_help_idx if not pd.isna(first_help_idx) else False
    
    success_rate = correct_count / problems_started if problems_started > 0 else 0
    
    self_correction = False
    prev_action = None
    for action in group['action'].values:
        if prev_action == 'wrong_response' and action == 'correct_response':
            self_correction = True
            break
        prev_action = action
    
    return pd.Series({
        'hint_count': hint_count,
        'explanation_count': explanation_count,
        'answer_count': answer_count,
        'problems_started': problems_started,
        'problems_finished': problems_finished,
        'hints_per_problem': hints_per_problem,
        'help_per_problem': help_per_problem,
        'wrong_before_help': wrong_before_help,
        'success_rate': success_rate,
        'self_correction': self_correction
    })

##### Aggregation Process

In [None]:
action_logs = load_dataframe("action_logs")

print("Filtering to assignments with help-seeking...")
help_seeking_assignments = action_logs[action_logs['action'].isin(HELP_ACTIONS)]['assignment_log_id'].unique()
print(f"  Found {len(help_seeking_assignments):,} assignments with help-seeking")

action_logs = action_logs[action_logs['assignment_log_id'].isin(help_seeking_assignments)]

print("\nAggregating help-seeking patterns per assignment...")
help_patterns = action_logs.groupby('assignment_log_id').apply(
    aggregate_help_seeking
).reset_index()
del action_logs
gc.collect()


help_patterns = help_patterns.dropna(subset=['hint_count'])
help_patterns = help_patterns[help_patterns['problems_started'] >= 3]

print(f"Aggregated {len(help_patterns):,} assignments with help-seeking")

assignment_details = load_dataframe('assignment_details')
help_patterns = help_patterns.merge(
    assignment_details,
    on='assignment_log_id',
    how='left'
)
del assignment_details
gc.collect()

sequence_details = load_dataframe('sequence_details')
help_patterns = help_patterns.merge(
    sequence_details,
    on='sequence_id',
    how='left'
)
del sequence_details
gc.collect()

save_dataframe('help_patterns', help_patterns)
del help_patterns
gc.collect()

Filtering to assignments with help-seeking...
  Found 231,148 assignments with help-seeking

Aggregating help-seeking patterns per assignment...
Aggregated 204,330 assignments with help-seeking


0

##### Transaction Creation

In [19]:
def help_patterns_transactions(transactions):
    transactions = help_patterns.copy()
        
    # Hint usage pattern
    transactions['hint_pattern'] = pd.cut(
        transactions['hints_per_problem'],
        bins=[-0.1, 0.5, 2, np.inf],
        labels=['rare_hints', 'moderate_hints', 'frequent_hints']
    )
    
    # Overall help seeking
    transactions['help_level'] = pd.cut(
        transactions['help_per_problem'],
        bins=[-0.1, 1, 3, np.inf],
        labels=['low_help', 'med_help', 'high_help']
    )
    
    # Explanation usage
    transactions['explanation_usage'] = pd.cut(
        transactions['explanation_count'],
        bins=[-0.1, 0, 2, np.inf],
        labels=['no_explanation', 'some_explanation', 'frequent_explanation']
    )
    
    # Answer requesting
    transactions['answer_pattern'] = pd.cut(
        transactions['answer_count'],
        bins=[-0.1, 0, 2, np.inf],
        labels=['no_answer_req', 'some_answer_req', 'frequent_answer_req']
    )
    
    # Help effectiveness
    transactions['help_effectiveness'] = pd.cut(
        transactions['success_rate'],
        bins=[-0.1, 0.5, 0.8, 1.01],
        labels=['low_success', 'med_success', 'high_success']
    )
    
    # Binary features
    transactions['tries_first'] = transactions['wrong_before_help'].map(
        {True: 'tries_before_help', False: 'help_first'}
    )
    
    transactions['self_corrects'] = transactions['self_correction'].map(
        {True: 'self_corrects', False: 'no_self_correction'}
    )
    
    # Skill type (simplified)
    if 'sequence_folder_path_level_2' in transactions.columns:
        transactions['grade_level'] = transactions['sequence_folder_path_level_2'].fillna('unknown')
        # Categorize into elementary, middle, high
        transactions.loc[
            transactions['grade_level'].str.contains('Grade [1-5]', case=False, na=False),
            'grade_cat'
        ] = 'elementary'
        transactions.loc[
            transactions['grade_level'].str.contains('Grade [6-8]', case=False, na=False),
            'grade_cat'
        ] = 'middle'
        transactions.loc[
            transactions['grade_level'].str.contains('Algebra|Geometry|Calculus', case=False, na=False),
            'grade_cat'
        ] = 'high_school'
        transactions['grade_cat'] = transactions['grade_cat'].fillna('unknown')
    else:
        transactions['grade_cat'] = 'unknown'
    
    # Subject from folder path level 3
    if 'sequence_folder_path_level_3' in transactions.columns:
        transactions['subject'] = 'unknown'
        transactions.loc[
            transactions['sequence_folder_path_level_3'].str.contains(
                'Ratio|Proportion', case=False, na=False
            ), 'subject'
        ] = 'ratios'
        transactions.loc[
            transactions['sequence_folder_path_level_3'].str.contains(
                'Equation|Expression|Algebra', case=False, na=False
            ), 'subject'
        ] = 'algebra'
        transactions.loc[
            transactions['sequence_folder_path_level_3'].str.contains(
                'Geometry|Shape|Angle', case=False, na=False
            ), 'subject'
        ] = 'geometry'
        transactions.loc[
            transactions['sequence_folder_path_level_3'].str.contains(
                'Statistics|Probability|Data', case=False, na=False
            ), 'subject'
        ] = 'statistics'
    else:
        transactions['subject'] = 'unknown'
    
    transactions['transaction_id'] = transactions['assignment_log_id']
    
    item_columns = ['hint_pattern', 'help_level', 'explanation_usage', 
                    'answer_pattern', 'help_effectiveness', 'tries_first',
                    'self_corrects', 'grade_cat', 'subject']
    
    transactions['items'] = transactions[item_columns].apply(
        lambda row: [str(item) for item in row if pd.notna(item) and str(item) != 'nan'],
        axis=1
    )
    
    result = transactions[['transaction_id', 'items']].copy()

    return result

help_patterns = load_dataframe('help_patterns')
help_patterns_t = help_patterns_transactions(help_patterns)
save_dataframe('help_patterns_transactions', help_patterns_t)

del help_patterns, help_patterns_t
gc.collect()

0

#### Transactions Encoding

In [68]:
help_patterns_t = load_dataframe("help_patterns_transactions")
transactions = help_patterns_t['items'].tolist()

te = TransactionEncoder()
te_array = te.fit(transactions).transform(transactions)

encoded_df = pd.DataFrame(
    te_array,
    columns=te.columns_
)

display(encoded_df)
save_dataframe("help_patterns_encoded", encoded_df)
del help_patterns_t, transactions, te, te_array, 
gc.collect()

Unnamed: 0,algebra,elementary,frequent_answer_req,frequent_explanation,frequent_hints,geometry,help_first,high_help,high_school,high_success,low_help,low_success,med_help,med_success,middle,moderate_hints,no_answer_req,no_explanation,no_self_correction,rare_hints,ratios,self_corrects,some_answer_req,some_explanation,statistics,tries_before_help,unknown
0,False,True,True,False,False,False,False,False,False,False,True,False,False,True,False,False,False,True,False,True,False,True,False,False,False,True,True
1,True,False,True,False,False,False,False,False,False,True,True,False,False,False,True,False,False,True,True,True,False,False,False,False,False,True,False
2,False,True,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,True,True,True,False,False,True,False,False,True,True
3,False,True,True,False,False,False,False,False,False,False,True,False,False,True,False,False,False,True,True,True,False,False,False,False,False,True,True
4,False,True,True,False,False,False,True,False,False,True,True,False,False,False,False,False,False,True,True,True,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205688,False,True,True,False,False,False,True,False,False,False,True,False,False,True,False,False,False,True,True,True,False,False,False,False,False,False,True
205689,False,True,False,False,False,False,False,False,False,False,True,False,False,True,False,False,True,True,True,True,False,False,False,False,False,True,True
205690,False,True,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,True,False,True,False,True,True,False,False,True,True
205691,False,True,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,True,False,True,False,True,True,False,False,True,True


5691

#### Top Frequent Items and Rules

In [28]:
print("Loading encoddings...")
encoded_df = load_dataframe('help_patterns_encoded')
print("Running Apriori Algorithm...")
freq_items3, rules3 = run_apriori(encoded_df)
del encoded_df
gc.collect()

print_top_association_results(
    frequent_itemsets=freq_items3,
    rules = rules3,
    config=AnalysisConfig,
    base_filename="A3_analysis"
)

Loading encoddings...
Running Apriori Algorithm...

TOP 10 FREQUENT ITEMSETS
low_help  (support=0.986)
rare_hints  (support=0.982)
low_help | rare_hints  (support=0.978)
no_explanation  (support=0.951)
low_help | no_explanation  (support=0.945)
no_explanation | rare_hints  (support=0.939)
low_help | no_explanation | rare_hints  (support=0.937)
tries_before_help  (support=0.771)
low_help | tries_before_help  (support=0.763)
rare_hints | tries_before_help  (support=0.759)
From 2399 itemsets with support ≥ 0.05

TOP 10 ASSOCIATION RULES
low_help, algebra -> rare_hints, middle  (support=0.050, confidence=0.764, lift=3.082)
rare_hints, algebra -> low_help, middle  (support=0.050, confidence=0.767, lift=3.050)
algebra -> low_help, rare_hints, middle  (support=0.050, confidence=0.748, lift=3.049)
algebra -> rare_hints, middle  (support=0.051, confidence=0.755, lift=3.045)
algebra -> low_help, middle  (support=0.051, confidence=0.758, lift=3.016)
algebra -> middle  (support=0.052, confidence=0

#### EDA and Visualizations

In [29]:
transactions_df = load_dataframe("help_patterns_transactions")

basket_summary(
    transactions_df,
    rules_df=rules3,
    top_n=5,
    output_dir=ProcessingConfig.results_vis_dir,
    plot_filename="A3_results_analysis.png"
)

del transactions_df, freq_items3, rules3
gc.collect()

Sample Transactions: 


Unnamed: 0,transaction_id,items
0,1000AQM2VK,"[rare_hints, low_help, no_explanation, frequen..."
1,10013WDY9H,"[rare_hints, low_help, no_explanation, frequen..."
2,1002VQV352,"[rare_hints, low_help, no_explanation, some_an..."
3,1003OIOXGA,"[rare_hints, low_help, no_explanation, frequen..."
4,10049KJE1W,"[rare_hints, low_help, no_explanation, frequen..."


Average Baskets: 9.0

EDA:
- Top 5 items:
  • low_help (98.6%)
  • rare_hints (98.2%)
  • no_explanation (95.1%)
  • tries_before_help (77.1%)
  • unknown (65.2%)

Combined EDA plot saved: ../results/vis/A3_results_analysis.png


1692