# Pre-requisites

In [1]:
pip install pybdm

Collecting pybdm
  Downloading pybdm-0.1.0-py2.py3-none-any.whl.metadata (8.2 kB)
Downloading pybdm-0.1.0-py2.py3-none-any.whl (39.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pybdm
Successfully installed pybdm-0.1.0


In [2]:
pip install -U transformers bitsandbytes accelerate

Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
cd /content/drive/MyDrive/Colab Notebooks/Legal

/content/drive/MyDrive/Colab Notebooks/Legal


# 3rd June

In [None]:
# @title Cell 1: Configuration

import os
from typing import List, Dict, Set
from collections import Counter

# --- Project Paths ---
BASE_PROJECT_DIR = '/content/drive/MyDrive/Colab Notebooks/Legal/'
PHASE2_OUTPUT_DIR = os.path.join(BASE_PROJECT_DIR, 'Phase2_PDF_Collated_Texts/')
P2_COLLATED_FILE = os.path.join(PHASE2_OUTPUT_DIR, 'phase2_collated_pdf_texts.json')

P3_QIDS_TO_PROCESS_THEMATICALLY = ["Q4"]

# --- BDM Configuration ---
MATRIX_SIZE_GLOBAL = (8, 8)
# MAX_TEXT_FOR_BDM_HASH = 2000 # This will now be controlled by BDM_SEGMENT_LENGTH
BDM_SEGMENT_LENGTH = 2000 # NEW: Length of segments for calculating full corpus BDM

# --- LLM Configuration ---
LOCAL_LLM_MODEL_ID = 'google/gemma-2b-it'
USE_QUANTIZATION_FOR_LOCAL_LLM = True
LLM_BATCH_SIZE_RESPONSES = 5
LLM_RETRY_ATTEMPTS = 2
MAX_TEXT_CHARS_PER_LLM_PROMPT_CHUNK = 7000
LLM_MAX_NEW_TOKENS_MOTIF_EXTRACTION = 1000
MAX_MOTIFS_PER_CHUNK = 3

# --- Token-Based L(H) Configuration (SFs cost zeroed for current experiment) ---
MOTIF_SYMBOLIC_LABEL_COST = 0.5
MOTIF_DESCRIPTION_TEXT_BASE_COST = 0.5
MOTIF_DESCRIPTION_TOKEN_COST = 0.1
MOTIF_SURFACE_FORMS_LIST_BASE_COST = 0.0
MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH = 0.0

# --- Surface Form Filtering Configuration ---
MIN_SF_FREQUENCY_IN_FULL_CORPUS = 2
MAX_SF_TOKEN_LENGTH_FOR_FINAL_MOTIF = 6

# --- Logging File ---
LLM_DEBUG_LOG_FILE = os.path.join(BASE_PROJECT_DIR, "llm_motif_debug_log_refactored_v10_segmented_bdm.txt")

print(f"Cell 1: Configuration loaded. LOCAL_LLM_MODEL_ID set to '{LOCAL_LLM_MODEL_ID}'.")
print(f"BDM Strategy: Segmented BDM with segment length = {BDM_SEGMENT_LENGTH}")
# ... (other existing print statements for config)

In [None]:
# @title Cell 4: Motif Processing and Validation

import json
import re
import time
from typing import List, Dict
# from collections import Counter # Defined in Cell 1, used by extract_actual_phrases (Cell 2)

# Assume constants from Cell 1 are in global scope
# Assume text_utils functions (preprocess_corpus_for_motif_extraction, count_sf_occurrences, tokenize_phrase) from Cell 2 are in scope
# Assume LLM interaction functions (create_enhanced_motif_prompt, call_local_llm_for_raw_response) from Cell 3 are in scope

try:
    LLM_DEBUG_LOG_FILE; LLM_RETRY_ATTEMPTS; MIN_SF_FREQUENCY_IN_FULL_CORPUS; MAX_SF_TOKEN_LENGTH_FOR_FINAL_MOTIF
except NameError:
    print("WARN (Cell 4): Key config constants not found from Cell 1. Using fallbacks.")
    LLM_DEBUG_LOG_FILE = "temp_llm_debug_log_cell4.txt"; LLM_RETRY_ATTEMPTS = 2
    MIN_SF_FREQUENCY_IN_FULL_CORPUS = 2; MAX_SF_TOKEN_LENGTH_FOR_FINAL_MOTIF = 6


def parse_and_validate_llm_json_response(
    llm_raw_response_text: str,
    qid_for_log:str,
    chunk_idx_for_log:int,
    prompt_sent_to_llm:str
    ) -> List[Dict]:
    """Parses LLM JSON, attempts label fixing, validates schema."""
    json_str_candidate = llm_raw_response_text.strip()

    # Attempt to remove markdown fences if LLM adds them
    if json_str_candidate.startswith("```json"):
        json_str_candidate = json_str_candidate[len("```json"):].strip()
    if json_str_candidate.startswith("```"):
        json_str_candidate = json_str_candidate[len("```"):].strip()
    # VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
    # CORRECTED TYPO HERE
    if json_str_candidate.endswith("```"):
        json_str_candidate = json_str_candidate[:-len("```")].strip() # Was json_str, now json_str_candidate
    # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

    # print(f"    DEBUG (parse_validate): QID {qid_for_log}, Chunk {chunk_idx_for_log}, JSON candidate for parsing:\n{json_str_candidate[:500]}...")

    if not json_str_candidate or json_str_candidate.lower() == "[]" or \
       "no_themes_found" in json_str_candidate.lower() or \
       "no clear motifs" in json_str_candidate.lower():
        return []

    # Attempt to fix common JSON error: trailing commas before closing brackets/braces
    json_str_candidate_fixed = re.sub(r',\s*([\}\]])', r'\1', json_str_candidate)
    if json_str_candidate_fixed != json_str_candidate:
        # print(f"    DEBUG (parse_validate): Applied trailing comma fix for QID {qid_for_log}, Chunk {chunk_idx_for_log}.")
        json_str_candidate = json_str_candidate_fixed

    try:
        parsed_data = json.loads(json_str_candidate)
        if isinstance(parsed_data, dict): parsed_data = [parsed_data]
        if not isinstance(parsed_data, list):
            raise ValueError("Parsed JSON from LLM is not a list or a single object.")

        valid_motifs_from_json = []
        for item_idx, item in enumerate(parsed_data):
            if not isinstance(item, dict) or not item: continue

            label_str_original = item.get('label', "")
            label_str_processed = ""
            if isinstance(label_str_original, str) and label_str_original.strip():
                temp_label_stripped = label_str_original.strip()
                match_strict_bracketed = re.fullmatch(r"\[([A-Z0-9_]+)\]", temp_label_stripped)
                if match_strict_bracketed:
                    label_str_processed = temp_label_stripped
                else:
                    match_bracketed_part = re.search(r"(\[[A-Z0-9_]+(?:_[A-Z0-9_]+)*\])", temp_label_stripped)
                    if match_bracketed_part:
                        label_str_processed = match_bracketed_part.group(1)
                    else:
                        sanitized_content = re.sub(r'\s+|-', '_', temp_label_stripped)
                        sanitized_content = re.sub(r'[^a-zA-Z0-9_]', '', sanitized_content).upper()
                        sanitized_content = "_".join(sanitized_content.split('_')[:4])
                        if sanitized_content:
                            label_str_processed = f"[{sanitized_content}]"

            item['label'] = label_str_processed

            label_to_validate = item.get('label',"")
            desc_str = item.get('description',"")
            sf_list = item.get('surface_forms', [])

            has_all_keys = all(k in item for k in ["label", "description", "surface_forms"])
            label_is_valid_format = isinstance(label_to_validate, str) and \
                                    bool(label_to_validate) and \
                                    label_to_validate.startswith('[') and \
                                    label_to_validate.endswith(']') and \
                                    re.fullmatch(r"\[[A-Z0-9_]+\]", label_to_validate)
            desc_is_valid = isinstance(item.get('description'), str)
            sfs_list_is_valid = isinstance(item.get('surface_forms'), list) and \
                                 all(isinstance(sf_item, str) for sf_item in item.get('surface_forms',[]))

            if has_all_keys and label_is_valid_format and desc_is_valid and sfs_list_is_valid:
                valid_motifs_from_json.append({
                    "label": label_to_validate,
                    "description": item['description'].strip(),
                    "surface_forms": [s.strip() for s in item['surface_forms'] if isinstance(s, str) and s.strip()]
                })
            else: # Item failed detailed schema validation
                print(f"    [WARN] Invalid motif object schema for QID {qid_for_log}, Chunk {chunk_idx_for_log}, Item {item_idx+1}. Skipping.")
                with open(LLM_DEBUG_LOG_FILE, "a", encoding="utf-8") as f:
                    f.write(f"\n--- QID: {qid_for_log} --- CHUNK: {chunk_idx_for_log} --- ITEM_SCHEMA_FAILURE (Item {item_idx+1}) ---\n")
                    f.write(f"Original Label: '{label_str_original}', Processed Label for Validation: '{label_to_validate}'\n")
                    f.write(f"Item Content: {json.dumps(item, indent=2)}\n")
                    f.write(f"Validation Checks: has_keys={has_all_keys}, label_valid_fmt={label_is_valid_format}, desc_valid={desc_is_valid}, sfs_list_valid={sfs_list_is_valid}\n")
        return valid_motifs_from_json
    except (json.JSONDecodeError, ValueError) as e:
        print(f"    [WARN] Motif JSON parsing or core structure issue for QID {qid_for_log}, Chunk {chunk_idx_for_log}: {e}")
        with open(LLM_DEBUG_LOG_FILE, "a", encoding="utf-8") as f:
            f.write(f"\n--- QID: {qid_for_log} --- CHUNK: {chunk_idx_for_log} --- JSON_PARSE_VALUE_ERROR ---\n")
            prompt_parts = prompt_sent_to_llm.split('Set of comments to analyze:')
            user_content_for_log = prompt_parts[1][:500] if len(prompt_parts) > 1 else (prompt_sent_to_llm[:500] if prompt_sent_to_llm else "PROMPT_EMPTY")
            f.write(f"PROMPT USER CONTENT (approx first 500 chars):\n{user_content_for_log}...\n")
            f.write(f"RAW LLM RESPONSE (Error: {type(e).__name__} - {str(e)}):\n{llm_raw_response_text}\n")
            f.write(f"EXTRACTED JSON CANDIDATE (Error: {type(e).__name__}):\n{json_str_candidate}\n")
        return []

def get_motifs_for_qid_batched(
    list_of_individual_response_texts: List[str],
    responses_per_batch: int,
    hf_pipeline_instance,
    hf_tokenizer_instance,
    qid_for_log: str
    ) -> List[Dict]:
    all_raw_motifs_from_chunks = []
    batched_text_chunks_for_llm = []
    for i in range(0, len(list_of_individual_response_texts), responses_per_batch):
        batch_responses = list_of_individual_response_texts[i:i + responses_per_batch]
        chunk_text_for_llm = preprocess_corpus_for_motif_extraction("\n\n<RSP_SEP>\n\n".join(batch_responses)) # from Cell 2
        batched_text_chunks_for_llm.append(chunk_text_for_llm)
    print(f"  QID {qid_for_log}: Processing {len(list_of_individual_response_texts)} responses in {len(batched_text_chunks_for_llm)} preprocessed chunks.")
    for chunk_idx, text_chunk_to_analyze_processed in enumerate(batched_text_chunks_for_llm):
        print(f"    Analyzing chunk {chunk_idx + 1}/{len(batched_text_chunks_for_llm)} for QID {qid_for_log} (len: {len(text_chunk_to_analyze_processed)} chars)...")
        if len(text_chunk_to_analyze_processed.strip()) < 50: print(f"      Chunk {chunk_idx+1} too short, skipping."); continue

        # create_enhanced_motif_prompt is from Cell 3
        prompt_for_llm = create_enhanced_motif_prompt(text_chunk_to_analyze_processed)

        motifs_from_this_chunk = []
        for attempt in range(LLM_RETRY_ATTEMPTS): # from config
            # call_local_llm_for_raw_response is from Cell 3
            raw_llm_response = call_local_llm_for_raw_response(
                prompt_for_llm, hf_pipeline_instance, hf_tokenizer_instance, qid_for_log, chunk_idx + 1
            )
            if not raw_llm_response:
                print(f"      LLM call attempt {attempt + 1} for chunk {chunk_idx+1} returned empty. Retrying...");
                if attempt < LLM_RETRY_ATTEMPTS - 1: time.sleep(1); continue

            # parse_and_validate_llm_json_response is defined in this cell
            parsed_motifs_from_this_attempt = parse_and_validate_llm_json_response(
                raw_llm_response, qid_for_log, chunk_idx+1, prompt_for_llm
            )
            if parsed_motifs_from_this_attempt:
                motifs_from_this_chunk = parsed_motifs_from_this_attempt; break
            else:
                print(f"      Motif parsing/validation attempt {attempt + 1} yielded no motifs for chunk {chunk_idx+1}. Retrying...");
                if attempt < LLM_RETRY_ATTEMPTS - 1: time.sleep(1)

        if motifs_from_this_chunk:
            print(f"      Extracted {len(motifs_from_this_chunk)} motifs from chunk {chunk_idx+1} (QID {qid_for_log}).")
            all_raw_motifs_from_chunks.extend(motifs_from_this_chunk)
        else:
            print(f"      No valid motifs from chunk {chunk_idx+1} (QID {qid_for_log}) after {LLM_RETRY_ATTEMPTS} attempts.")
    return all_raw_motifs_from_chunks

def consolidate_raw_motifs(list_of_all_raw_motifs: List[Dict]) -> List[Dict]:
    # (Same as last version)
    if not list_of_all_raw_motifs: return []
    consolidated_motifs_map = {}
    for motif_obj in list_of_all_raw_motifs:
        label = motif_obj.get("label","").strip(); description = motif_obj.get("description","").strip()
        surface_forms = motif_obj.get("surface_forms", [])
        if not (label and isinstance(surface_forms, list)): continue
        current_sfs_set = set(sf.lower().strip() for sf in surface_forms if isinstance(sf, str) and sf.strip())
        if label not in consolidated_motifs_map:
            consolidated_motifs_map[label] = {"label": label, "description": description, "surface_forms": sorted(list(current_sfs_set))}
        else:
            existing_sfs_set = set(consolidated_motifs_map[label].get("surface_forms", []))
            consolidated_motifs_map[label]["surface_forms"] = sorted(list(existing_sfs_set.union(current_sfs_set)))
    return list(consolidated_motifs_map.values())

def filter_surface_forms_by_global_frequency(
    consolidated_motifs_list: List[Dict],
    full_qid_corpus_text: str,
    min_global_freq: int = MIN_SF_FREQUENCY_IN_FULL_CORPUS,
    max_sf_len_tokens: int = MAX_SF_TOKEN_LENGTH_FOR_FINAL_MOTIF # Uses constant from Cell 1
    ) -> List[Dict]:
    """Filters SFs by global frequency AND token length."""
    if not consolidated_motifs_list: return []
    final_globally_filtered_motifs = []
    for motif_obj in consolidated_motifs_list:
        globally_frequent_and_short_sfs = []
        original_sfs_for_this_motif = motif_obj.get("surface_forms", [])
        for sf_str_lower in original_sfs_for_this_motif:
            count = count_sf_occurrences(full_qid_corpus_text, sf_str_lower) # from Cell 2
            # tokenize_phrase from Cell 2
            if count >= min_global_freq and len(tokenize_phrase(sf_str_lower)) <= max_sf_len_tokens:
                globally_frequent_and_short_sfs.append(sf_str_lower)
        if globally_frequent_and_short_sfs:
            filtered_motif_entry = motif_obj.copy()
            filtered_motif_entry["surface_forms"] = sorted(list(set(globally_frequent_and_short_sfs)))
            final_globally_filtered_motifs.append(filtered_motif_entry)
    return final_globally_filtered_motifs

print("Cell 4: Motif Processing and Validation Utilities loaded (with NameError fix in parse_and_validate).")

In [None]:
# @title Cell 5: MDL Calculations

import hashlib
import numpy as np
from pybdm import BDM
import re
from typing import List, Dict

# Assume constants from Cell 1 are in global scope
# Assume tokenize_phrase from Cell 2 is in global scope
try:
    MATRIX_SIZE_GLOBAL; MAX_TEXT_FOR_BDM_HASH; MOTIF_SYMBOLIC_LABEL_COST # Check a few
    MOTIF_DESCRIPTION_TEXT_BASE_COST; MOTIF_DESCRIPTION_TOKEN_COST
    MOTIF_SURFACE_FORMS_LIST_BASE_COST; MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH
    BDM_SEGMENT_LENGTH # From last config update
except NameError:
    print("WARN (Cell 5): Key config constants not found from Cell 1. Using fallbacks or expecting errors.")
    MATRIX_SIZE_GLOBAL = (8, 8); MAX_TEXT_FOR_BDM_HASH = 2000; BDM_SEGMENT_LENGTH = 2000
    MOTIF_SYMBOLIC_LABEL_COST = 0.5; MOTIF_DESCRIPTION_TEXT_BASE_COST = 0.5
    MOTIF_DESCRIPTION_TOKEN_COST = 0.1; MOTIF_SURFACE_FORMS_LIST_BASE_COST = 0.0
    MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH = 0.0


def initialize_bdm_instance():
    """Initializes and returns a BDM instance."""
    print("Initializing BDM instance...")
    try:
        bdm_instance = BDM(ndim=2)
        print("BDM instance initialized successfully (ndim=2, default CTM-based).")
        return bdm_instance
    except Exception as e_bdm_init:
        print(f"CRITICAL: Failed to initialize BDM instance: {e_bdm_init}")
        if "CTM data files" in str(e_bdm_init).lower() or "dataset" in str(e_bdm_init).lower():
            print("  BDM Error Hint: This might be related to missing/corrupted CTM data files for PyBDM.")
            print("  Ensure PyBDM is installed correctly and can access/download its data.")
            print("  You might need to run once: from pybdm import get_ctm_dataset; get_ctm_dataset()")
        return None

def text_to_binary_matrix(text_input_segment: str, size: tuple = MATRIX_SIZE_GLOBAL) -> np.ndarray:
    """Converts a text segment to a binary matrix using its SHA256 hash."""
    if not isinstance(text_input_segment, str) or not text_input_segment.strip():
        return np.zeros(size, dtype=int)
    hash_obj = hashlib.sha256(text_input_segment.encode('utf-8', 'ignore'))
    hash_digest = hash_obj.hexdigest()
    required_bits = size[0] * size[1]
    binary_string_from_hash = bin(int(hash_digest, 16))[2:].zfill(256)
    binary_string_for_matrix = binary_string_from_hash[:required_bits] if required_bits <= 256 else binary_string_from_hash.ljust(required_bits, '0')
    bits_for_matrix = [int(b) for b in binary_string_for_matrix]
    return np.array(bits_for_matrix).reshape(size)

def compute_bdm_for_text_segment(
    text_segment: str,
    bdm_instance: BDM,
    matrix_s: tuple = MATRIX_SIZE_GLOBAL
    ) -> float:
    """Computes BDM for a given text segment."""
    if not isinstance(text_segment, str) or not text_segment.strip() :
        return 0.0
    binary_matrix = text_to_binary_matrix(text_segment, size=matrix_s) # text_segment is passed directly
    try:
        bdm_value = bdm_instance.bdm(binary_matrix)
        return bdm_value
    except Exception as e_bdm:
        print(f"      Error during BDM segment calculation for text (segment len {len(text_segment)}): {e_bdm}")
        return -1.0

def calculate_full_corpus_bdm_segmented(
    corpus_str: str,
    bdm_instance: BDM,
    segment_len: int = BDM_SEGMENT_LENGTH,
    matrix_s: tuple = MATRIX_SIZE_GLOBAL
    ) -> float:
    """Calculates total BDM for a corpus by summing BDM of its segments."""
    if not isinstance(corpus_str, str) or not corpus_str.strip():
        return 0.0
    total_bdm = 0.0
    num_segments_processed = 0
    for i in range(0, len(corpus_str), segment_len):
        segment = corpus_str[i:i+segment_len]
        if segment.strip():
            bdm_val_segment = compute_bdm_for_text_segment(segment, bdm_instance, matrix_s)
            if bdm_val_segment < 0:
                print(f"    ERROR (calculate_full_corpus_bdm_segmented): BDM error in segment {num_segments_processed + 1}. Aborting QID.")
                return -1.0
            total_bdm += bdm_val_segment
            num_segments_processed += 1
    if num_segments_processed == 0 and len(corpus_str) > 0 :
        return 0.0
    return total_bdm

def calculate_L_H_token_based_structured(structured_motifs_list: List[Dict]) -> float:
    # (This function is the corrected one from the previous UnboundLocalError fix for 'current_motif_lh')
    if not structured_motifs_list: return 0.0
    total_lh_cost = 0.0
    for motif_obj in structured_motifs_list:
        if not isinstance(motif_obj, dict): continue
        current_motif_lh = 0.0
        label_str = motif_obj.get('label', "")
        if isinstance(label_str, str) and label_str.strip(): current_motif_lh += MOTIF_SYMBOLIC_LABEL_COST
        description_str = motif_obj.get('description', "")
        if isinstance(description_str, str) and description_str.strip():
            current_motif_lh += MOTIF_DESCRIPTION_TEXT_BASE_COST
            current_motif_lh += len(tokenize_phrase(description_str)) * MOTIF_DESCRIPTION_TOKEN_COST
        surface_forms_list = motif_obj.get('surface_forms', [])
        if isinstance(surface_forms_list, list) and surface_forms_list:
            valid_sfs_for_lh = [sf for sf in surface_forms_list if isinstance(sf, str) and sf.strip()]
            if valid_sfs_for_lh:
                current_motif_lh += MOTIF_SURFACE_FORMS_LIST_BASE_COST
                for sf_str in valid_sfs_for_lh:
                    current_motif_lh += len(tokenize_phrase(sf_str)) * MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH
        total_lh_cost += current_motif_lh
    return total_lh_cost

# VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
# CORRECTED VERSION OF THIS FUNCTION
# VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
def llm_compress_text_structured(text_to_compress: str, structured_motifs_list: List[Dict]) -> str:
    """Compresses text by replacing occurrences of motif surface forms with their symbolic labels."""
    if not isinstance(text_to_compress, str):
        return ""

    # Initialize compressed_text immediately after confirming text_to_compress is a string
    # and before checking if structured_motifs_list is empty.
    compressed_text = text_to_compress.lower()

    if not structured_motifs_list:
        return compressed_text # Return the lowercased original if no motifs

    # Now proceed with replacements
    for motif_obj in structured_motifs_list:
        if not isinstance(motif_obj, dict):
            continue

        label = motif_obj.get('label', None)
        surface_forms = motif_obj.get('surface_forms', [])

        if not (isinstance(label, str) and label.strip()) or \
           not (isinstance(surface_forms, list) and surface_forms):
            continue

        placeholder = label

        sorted_sfs_for_this_motif = sorted(
            [sf for sf in surface_forms if isinstance(sf, str) and sf.strip()],
            key=len,
            reverse=True
        )

        for sf_str_lower in sorted_sfs_for_this_motif: # Assumes sf_str is already lowercased
            try:
                # compressed_text is used on the right here, and assigned to itself
                compressed_text = re.sub(r'\b' + re.escape(sf_str_lower) + r'\b', placeholder, compressed_text)
            except re.error as re_e:
                print(f"    Regex error during compression for SF '{sf_str_lower}' of motif '{label}': {re_e}. Skipping.")
                continue
    return compressed_text
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# END OF CORRECTION
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

def compute_mdl_cost_for_text_block(
    full_qid_corpus_str: str,
    final_motifs_to_evaluate: List[Dict],
    bdm_instance: BDM,
    matrix_s: tuple = MATRIX_SIZE_GLOBAL,
    segment_len_for_bdm: int = BDM_SEGMENT_LENGTH # Use from config
    ) -> tuple[float, float, float]:
    """Computes L(H), L(D|H) (using segmented BDM), and Total MDL."""
    if not isinstance(full_qid_corpus_str, str): full_qid_corpus_str = ""
    l_h = calculate_L_H_token_based_structured(final_motifs_to_evaluate)
    compressed_text_block = llm_compress_text_structured(full_qid_corpus_str, final_motifs_to_evaluate)

    l_d_h = calculate_full_corpus_bdm_segmented( # Using segmented BDM
        compressed_text_block,
        bdm_instance,
        segment_len=segment_len_for_bdm,
        matrix_s=matrix_s
    )

    if l_d_h < 0: return l_h, -1.0, -1.0
    total_mdl_cost = l_h + l_d_h
    return l_h, l_d_h, total_mdl_cost

print("Cell 5: MDL Calculation Utilities loaded (with UnboundLocalError fix in llm_compress_text_structured and segmented BDM).")

In [None]:
# @title Cell 6: Main Pipeline Orchestration

# Assume functions from previous cells are defined
script_version_name_for_run_message = "Refactored MWP v10 (Segmented BDM)"

def main():
    script_version_name = "Refactored MWP v10 (Segmented BDM, Simpler Prompt, Label Fix, L(H) SF Cost Zero, SF Len Filter)"
    print(f"--- {script_version_name} ---")
    # ... (Print config params including BDM_SEGMENT_LENGTH - same as before) ...
    print(f"Timestamp: {time.asctime()}")
    print("\n--- Configuration Summary ---"); print(f"LLM Model: {LOCAL_LLM_MODEL_ID}, Quantization: {USE_QUANTIZATION_FOR_LOCAL_LLM}")
    print(f"LLM Batch Size: {LLM_BATCH_SIZE_RESPONSES}, Retries: {LLM_RETRY_ATTEMPTS}, Max Chars/Chunk: {MAX_TEXT_CHARS_PER_LLM_PROMPT_CHUNK}, Max New Tokens: {LLM_MAX_NEW_TOKENS_MOTIF_EXTRACTION}, Max Motifs/Chunk: {MAX_MOTIFS_PER_CHUNK}")
    print(f"L(H) Costs: Label={MOTIF_SYMBOLIC_LABEL_COST}, DescBase={MOTIF_DESCRIPTION_TEXT_BASE_COST}, DescToken={MOTIF_DESCRIPTION_TOKEN_COST}, SFListBase={MOTIF_SURFACE_FORMS_LIST_BASE_COST}, SFTokenInLH={MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH}")
    print(f"Global SF Filtering: MinFreq={MIN_SF_FREQUENCY_IN_FULL_CORPUS}, MaxSFLen={MAX_SF_TOKEN_LENGTH_FOR_FINAL_MOTIF}")
    print(f"BDM Segment Length: {BDM_SEGMENT_LENGTH}, BDM Matrix: {MATRIX_SIZE_GLOBAL}"); print(f"Debug Log File: {LLM_DEBUG_LOG_FILE}"); print("--- End Config ---\n")

    # ... (Initialize debug log file - same as before) ...
    # ... (Initialize LLM Pipeline - same as before) ...
    # ... (Initialize BDM - same as before) ...
    # ... (Load Phase 2 Data - same as before) ...
    # ... (QID selection logic - same as before) ...
    try:
        with open(LLM_DEBUG_LOG_FILE, "w", encoding="utf-8") as f: f.write(f"LLM Log - {time.asctime()}\nVersion: {script_version_name}\nModel: {LOCAL_LLM_MODEL_ID}\n---\n")
    except Exception as e_log: print(f"WARN: Log init failed: {e_log}")
    hf_pipeline_instance, hf_tokenizer_instance = initialize_llm_pipeline()
    if not hf_pipeline_instance: print("CRITICAL: LLM init failed."); return
    bdm_instance_main = initialize_bdm_instance()
    if not bdm_instance_main: print("CRITICAL: BDM init failed."); return
    if not os.path.exists(P2_COLLATED_FILE): print(f"ERROR: File {P2_COLLATED_FILE} not found."); return
    print(f"Loading data from: {P2_COLLATED_FILE}..."); phase2_data_content = None
    try:
        with open(P2_COLLATED_FILE, 'r', encoding='utf-8') as f: phase2_data_content = json.load(f)
    except Exception as e_load: print(f"Error loading {P2_COLLATED_FILE}: {e_load}"); return
    all_qid_mdl_results_list = []
    aggregated_content_by_qid_from_file = phase2_data_content.get("aggregated_pdf_content_by_qid", {})
    if not aggregated_content_by_qid_from_file: print(f"No 'aggregated_pdf_content_by_qid' in {P2_COLLATED_FILE}."); return
    qids_to_process_this_run = [qid for qid in P3_QIDS_TO_PROCESS_THEMATICALLY if qid in aggregated_content_by_qid_from_file] if (P3_QIDS_TO_PROCESS_THEMATICALLY and isinstance(P3_QIDS_TO_PROCESS_THEMATICALLY, list) and P3_QIDS_TO_PROCESS_THEMATICALLY) else list(aggregated_content_by_qid_from_file.keys())[:1]
    if not qids_to_process_this_run: print(f"No QIDs to process. Exiting."); return
    print(f"\nMDL analysis for QIDs: {qids_to_process_this_run}\n")


    for qid_identifier_str in qids_to_process_this_run:
        print(f"--- Analyzing Data for QID: {qid_identifier_str} ---")
        # ... (Get actual_response_texts_for_qid and full_corpus_text_for_qid) ...
        list_of_individual_response_structs = aggregated_content_by_qid_from_file.get(qid_identifier_str, [])
        actual_response_texts_for_qid = [item.get("text", "") for item in list_of_individual_response_structs if isinstance(item, dict) and isinstance(item.get("text"), str) and item.get("text","").strip()]
        if not actual_response_texts_for_qid: print(f"  No valid text for QID {qid_identifier_str}. Skipping."); print("-" * 50); continue
        full_corpus_text_for_qid = "\n\n<RSP_SEP>\n\n".join(actual_response_texts_for_qid)
        if len(full_corpus_text_for_qid.strip()) < 100: print(f"  Skipping QID {qid_identifier_str}: text too short."); print("-" * 50); continue
        num_total_responses_for_qid = len(actual_response_texts_for_qid)
        print(f"  Corpus for QID {qid_identifier_str}: {len(full_corpus_text_for_qid)} chars, {num_total_responses_for_qid} responses.")

        # MODIFIED: Calculate baseline L(D) using segmented BDM
        baseline_bdm_original_corpus = calculate_full_corpus_bdm_segmented(
            full_corpus_text_for_qid,
            bdm_instance_main,
            segment_len=BDM_SEGMENT_LENGTH, # Use constant from Cell 1
            matrix_s=MATRIX_SIZE_GLOBAL
        )
        if baseline_bdm_original_corpus < 0:
            print(f"  Error computing segmented baseline BDM for QID {qid_identifier_str}. Skipping this QID.")
            # Log error if needed
            continue
        current_qid_baseline_mdl_cost = baseline_bdm_original_corpus
        print(f"  Baseline MDL for QID {qid_identifier_str} (L(D_orig) - Segmented BDM): {current_qid_baseline_mdl_cost:.4f}")

        # ... (The rest of the loop: get_motifs_for_qid_batched, consolidation,
        #      global SF filtering with length, final MDL calc, logging results -
        #      this part remains the same as your last working main() function,
        #      but compute_mdl_cost_for_text_block will now internally use segmented BDM
        #      for L(D|H) because it calls calculate_full_corpus_bdm_segmented)
        raw_motifs_from_chunks = get_motifs_for_qid_batched(actual_response_texts_for_qid, LLM_BATCH_SIZE_RESPONSES, hf_pipeline_instance, hf_tokenizer_instance, qid_identifier_str)
        current_qid_result_entry = {
            "qid": qid_identifier_str, "corpus_len_chars": len(full_corpus_text_for_qid), "num_responses": num_total_responses_for_qid,
            "baseline_mdl": current_qid_baseline_mdl_cost, "final_refined_motifs": [], "l_h_final_motifs": 0.0,
            "l_d_h_final_motifs": current_qid_baseline_mdl_cost, "total_mdl_with_final_motifs": current_qid_baseline_mdl_cost,
            "compression_achieved": 0.0, "num_raw_motifs_extracted": len(raw_motifs_from_chunks),
            "num_consolidated_motifs": 0, "num_globally_refined_motifs": 0
        }
        if not raw_motifs_from_chunks: print(f"  No raw motifs by LLM for QID {qid_identifier_str}."); all_qid_mdl_results_list.append(current_qid_result_entry); print("-" * 50); continue
        print(f"  Extracted {len(raw_motifs_from_chunks)} raw motif objects for QID {qid_identifier_str}.")
        consolidated_motifs_list = consolidate_raw_motifs(raw_motifs_from_chunks)
        current_qid_result_entry["num_consolidated_motifs"] = len(consolidated_motifs_list)
        print(f"  Consolidated into {len(consolidated_motifs_list)} unique motifs for QID {qid_identifier_str}.")
        if not consolidated_motifs_list: print(f"  No unique motifs after consolidation for QID {qid_identifier_str}."); all_qid_mdl_results_list.append(current_qid_result_entry); print("-" * 50); continue

        globally_refined_motifs = filter_surface_forms_by_global_frequency(
            consolidated_motifs_list, full_corpus_text_for_qid,
            min_global_freq=MIN_SF_FREQUENCY_IN_FULL_CORPUS,
            max_sf_len_tokens=MAX_SF_TOKEN_LENGTH_FOR_FINAL_MOTIF # Pass this new arg
        )
        current_qid_result_entry["num_globally_refined_motifs"] = len(globally_refined_motifs)
        print(f"  Globally refined (freq & len) into {len(globally_refined_motifs)} motifs for QID {qid_identifier_str}.")
        if not globally_refined_motifs: print(f"  No motifs left after GLOBAL SF refinement for QID {qid_identifier_str}."); all_qid_mdl_results_list.append(current_qid_result_entry); print("-" * 50); continue

        print(f"  Final Globally Refined Motifs for QID {qid_identifier_str}:")
        for idx, mo_final in enumerate(globally_refined_motifs): print(f"    Refined Motif {idx+1}: L='{mo_final.get('label')}', D='{mo_final.get('description','N/A')[:60]}...', SFs({len(mo_final.get('surface_forms',[]))})='{mo_final.get('surface_forms',[])}'")

        l_h_final, l_d_h_final, total_mdl_final = compute_mdl_cost_for_text_block( # This now uses segmented BDM for L(D|H)
            full_corpus_text_for_qid, globally_refined_motifs, bdm_instance_main
        )
        current_qid_result_entry.update({
            "final_refined_motifs": globally_refined_motifs, "l_h_final_motifs": l_h_final,
            "l_d_h_final_motifs": l_d_h_final if l_d_h_final >=0 else "BDM_ERROR",
            "total_mdl_with_final_motifs": total_mdl_final if l_d_h_final >=0 else "BDM_ERROR",
            "compression_achieved": "BDM_ERROR" if l_d_h_final < 0 else (current_qid_baseline_mdl_cost - total_mdl_final)
        })
        if l_d_h_final < 0: print(f"  Error computing MDL cost (BDM error) for QID {qid_identifier_str}."); all_qid_mdl_results_list.append(current_qid_result_entry); print("-" * 50); continue
        print(f"  L(H) final motifs: {l_h_final:.4f} (SFs definition cost in L(H) is ZEROED)")
        print(f"  L(D|H) (Segmented BDM) compressed full corpus: {l_d_h_final:.4f}") # Clarify it's segmented
        print(f"  Total MDL cost with final motifs: {total_mdl_final:.4f}")
        compression_val = current_qid_result_entry["compression_achieved"]
        result_status_str = f"SUCCESS: Comp: {compression_val:.4f}" if isinstance(compression_val, float) and compression_val > 0.0001 else f"NOTE: No sig. comp. Diff: {compression_val if isinstance(compression_val, str) else compression_val:.4f}"
        print(f"  {result_status_str}"); all_qid_mdl_results_list.append(current_qid_result_entry); print("-" * 50)

    # --- Summary Printing and Saving Results ---
    # ... (Same summary logic as before, just ensure filename is updated) ...
    print(f"\n--- Overall QID-based MDL Analysis Summary ({script_version_name}) ---")
    if not all_qid_mdl_results_list: print("No QIDs processed.")
    else:
        valid_results = [r for r in all_qid_mdl_results_list if isinstance(r.get('compression_achieved'), float) and r.get('l_h_final_motifs', -1.0) >= 0]
        num_qids_ok = len(valid_results); num_comp = sum(1 for r in valid_results if r['compression_achieved'] > 0.0001)
        print(f"Targeted QIDs: {len(qids_to_process_this_run)}, Results logged: {len(all_qid_mdl_results_list)}, Valid MDL: {num_qids_ok}, QIDs compressed: {num_comp}")
        if num_comp > 0:
            comp_vals = [r['compression_achieved'] for r in valid_results if r['compression_achieved'] > 0.0001]
            print(f"  Avg compression: {np.mean(comp_vals):.4f}, Max compression: {np.max(comp_vals):.4f}")
        else: print("  No compression achieved.")
        output_filename = os.path.join(BASE_PROJECT_DIR, "mdl_analysis_refactored_v10_segmentedBDM.json")
        try:
            with open(output_filename, "w", encoding="utf-8") as f_out: json.dump(all_qid_mdl_results_list, f_out, indent=2, ensure_ascii=False)
            print(f"Detailed results saved to {output_filename}")
        except Exception as e_s: print(f"Error saving results: {e_s}")


if __name__ == "__main__":
    print(f"Executing main MDL pipeline ({script_version_name_for_run_message}) at {time.asctime()}...") # Define script_version_name_for_run_message or use fixed string
    main()
    print(f"Main MDL pipeline execution finished at {time.asctime()}.")

# Define this at the top of Cell 6 or globally if Cell 6 is the only one with if __name__ == "__main__":
script_version_name_for_run_message = "Refactored MWP v10 (Segmented BDM)"

# 3rd June
Thematic phrase extractor completed !

In [None]:
# @title Cell 1: Configuration

import os
from typing import List, Dict, Set
from collections import Counter

# --- Project Paths ---
BASE_PROJECT_DIR = '/content/drive/MyDrive/Colab Notebooks/Legal/'
PHASE2_OUTPUT_DIR = os.path.join(BASE_PROJECT_DIR, 'Phase2_PDF_Collated_Texts/')
P2_COLLATED_FILE = os.path.join(PHASE2_OUTPUT_DIR, 'phase2_collated_pdf_texts.json')

P3_QIDS_TO_PROCESS_THEMATICALLY = ["Q4"]

# --- BDM Configuration ---
MATRIX_SIZE_GLOBAL = (8, 8)
MAX_TEXT_FOR_BDM_HASH = 2000

# --- LLM Configuration ---
LOCAL_LLM_MODEL_ID = 'google/gemma-2b-it'
USE_QUANTIZATION_FOR_LOCAL_LLM = True
LLM_BATCH_SIZE_RESPONSES = 5
LLM_RETRY_ATTEMPTS = 2
MAX_TEXT_CHARS_PER_LLM_PROMPT_CHUNK = 7000
LLM_MAX_NEW_TOKENS_MOTIF_EXTRACTION = 1000 # Kept increased for JSON completeness
MAX_MOTIFS_PER_CHUNK = 3 # REDUCED: Ask LLM for fewer motifs per chunk

# --- Token-Based L(H) Configuration (SFs cost zeroed for current experiment) ---
MOTIF_SYMBOLIC_LABEL_COST = 0.5
MOTIF_DESCRIPTION_TEXT_BASE_COST = 0.5
MOTIF_DESCRIPTION_TOKEN_COST = 0.1
MOTIF_SURFACE_FORMS_LIST_BASE_COST = 0.0
MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH = 0.0

# --- Surface Form Filtering Configuration ---
MIN_SF_FREQUENCY_IN_FULL_CORPUS = 2

# --- Logging File ---
LLM_DEBUG_LOG_FILE = os.path.join(BASE_PROJECT_DIR, "llm_motif_debug_log_refactored_v8_strict_prompt.txt") # New log filename

print(f"Cell 1: Configuration loaded. LOCAL_LLM_MODEL_ID set to '{LOCAL_LLM_MODEL_ID}'.")
print(f"LLM_MAX_NEW_TOKENS_MOTIF_EXTRACTION: {LLM_MAX_NEW_TOKENS_MOTIF_EXTRACTION}, MAX_MOTIFS_PER_CHUNK: {MAX_MOTIFS_PER_CHUNK}")
print(f"L(H) SF Costs: Base={MOTIF_SURFACE_FORMS_LIST_BASE_COST}, Token={MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH}")
print(f"Debug log will be: {LLM_DEBUG_LOG_FILE}")
# ... (Path existence checks - same as before)

In [None]:
# @title Cell 2: Text Utilities

import re
from typing import List, Dict
from collections import Counter

try:
    MIN_SF_FREQ_IN_CHUNK_VALIDATION
except NameError:
    MIN_SF_FREQ_IN_CHUNK_VALIDATION = 2


def tokenize_phrase(phrase_text: str) -> List[str]:
    if not isinstance(phrase_text, str) or not phrase_text.strip():
        return []
    return phrase_text.lower().split()

def preprocess_corpus_for_motif_extraction(text_corpus: str) -> str:
    if not isinstance(text_corpus, str):
        return ""
    text = re.sub(r'\n{3,}', '\n\n', text_corpus)
    text = re.sub(r' {2,}', ' ', text)
    lines = text.split('\n')
    filtered_lines = [line.strip() for line in lines if len(line.strip()) > 10 or not line.strip()]
    return '\n'.join(filtered_lines)

def count_sf_occurrences(corpus_text: str, surface_form: str) -> int:
    if not corpus_text or not surface_form or \
       not isinstance(corpus_text, str) or not isinstance(surface_form, str) or \
       not surface_form.strip():
        return 0
    try:
        return len(re.findall(re.escape(surface_form.lower()), corpus_text.lower(), flags=re.IGNORECASE))
    except re.error as e:
        print(f"    [WARN] Regex error in count_sf_occurrences for SF '{surface_form}': {e}")
        return 0

def extract_actual_phrases_from_text(
    text: str,
    min_phrase_len: int = 2,
    max_phrase_len: int = 6,
    min_freq: int = MIN_SF_FREQ_IN_CHUNK_VALIDATION
    ) -> Dict[str, int]:
    if not isinstance(text, str) or not text.strip(): return {}
    text_cleaned = text.lower()
    text_cleaned = re.sub(r'[^\w\s\']', ' ', text_cleaned)
    text_cleaned = re.sub(r'\s+', ' ', text_cleaned).strip()
    words = text_cleaned.split()
    if not words or len(words) < min_phrase_len: return {}
    phrase_counts = Counter()
    for n in range(min_phrase_len, max_phrase_len + 1):
        if n > len(words): continue
        for i in range(len(words) - n + 1):
            phrase = ' '.join(words[i:i+n])
            if phrase: phrase_counts[phrase] += 1
    return {phrase: count for phrase, count in phrase_counts.items() if count >= min_freq}

print("Cell 2: Text Utilities loaded.")

In [None]:
# @title Cell 3: LLM Interaction

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import time
from typing import List, Dict
import os
import re

try:
    LOCAL_LLM_MODEL_ID; USE_QUANTIZATION_FOR_LOCAL_LLM; MAX_MOTIFS_PER_CHUNK
    MAX_TEXT_CHARS_PER_LLM_PROMPT_CHUNK; LLM_MAX_NEW_TOKENS_MOTIF_EXTRACTION; LLM_DEBUG_LOG_FILE
except NameError:
    print("WARN (Cell 3): Key config constants not found from Cell 1. Using fallbacks.")
    LOCAL_LLM_MODEL_ID = 'google/gemma-2b-it'; USE_QUANTIZATION_FOR_LOCAL_LLM = True
    MAX_MOTIFS_PER_CHUNK = 3; MAX_TEXT_CHARS_PER_LLM_PROMPT_CHUNK = 7000
    LLM_MAX_NEW_TOKENS_MOTIF_EXTRACTION = 1000; LLM_DEBUG_LOG_FILE = "temp_llm_debug_log_cell3.txt"


def initialize_llm_pipeline( # Same as last version
    model_id: str = LOCAL_LLM_MODEL_ID,
    use_quantization: bool = USE_QUANTIZATION_FOR_LOCAL_LLM,
    pipeline_return_full_text: bool = False
    ):
    print(f"--- Initializing LLM Pipeline (model: {model_id}, quantization: {use_quantization}, return_full_text: {pipeline_return_full_text}) ---")
    hf_pipeline_instance = None; hf_tokenizer_instance = None
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu"); print(f"Using device: {device}")
    try:
        print(f"Loading tokenizer for {model_id}..."); hf_tokenizer_instance = AutoTokenizer.from_pretrained(model_id)
        if hf_tokenizer_instance.pad_token is None:
            if hf_tokenizer_instance.eos_token is not None: print("Tokenizer setting pad_token = eos_token."); hf_tokenizer_instance.pad_token = hf_tokenizer_instance.eos_token
            else: print("WARN (initialize_llm): Tokenizer has no pad_token and no eos_token.")
        bnb_config = None; quant_active = False
        if use_quantization and torch.cuda.is_available():
            try:
                compute_dtype = torch.bfloat16 if (device.type == 'cuda' and torch.cuda.is_bf16_supported()) else torch.float16
                bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=compute_dtype, bnb_4bit_use_double_quant=True)
                quant_active = True; print(f"BNB config created, compute_dtype: {compute_dtype}.")
            except Exception as e_bnb: print(f"WARN: Failed BitsAndBytesConfig: {e_bnb}. Quantization may be disabled."); quant_active = False
        print(f"Loading model {model_id} (Quantization: {quant_active})..."); model_kwargs = {"device_map": "auto", "trust_remote_code": True}
        if quant_active and bnb_config: model_kwargs["quantization_config"] = bnb_config
        elif device.type == 'cuda': model_kwargs["torch_dtype"] = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
        hf_model_instance = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)
        if hf_tokenizer_instance.pad_token_id is not None:
            if hf_model_instance.config.pad_token_id is None or hf_model_instance.config.pad_token_id != hf_tokenizer_instance.pad_token_id:
                hf_model_instance.config.pad_token_id = hf_tokenizer_instance.pad_token_id
        hf_pipeline_instance = pipeline("text-generation", model=hf_model_instance, tokenizer=hf_tokenizer_instance, return_full_text=pipeline_return_full_text)
        print(f"LLM pipeline initialized for {model_id}."); return hf_pipeline_instance, hf_tokenizer_instance
    except Exception as e: print(f"CRITICAL: LLM pipeline init failed: {e}"); import traceback; traceback.print_exc(); return None, None


def create_enhanced_motif_prompt(text_corpus_chunk: str, max_motifs_to_extract: int = MAX_MOTIFS_PER_CHUNK) -> str:
    """Revised prompt with stronger emphasis on JSON syntax and label format."""
    if len(text_corpus_chunk) > MAX_TEXT_CHARS_PER_LLM_PROMPT_CHUNK:
        text_corpus_chunk = text_corpus_chunk[:MAX_TEXT_CHARS_PER_LLM_PROMPT_CHUNK]

    prompt = f"""You are a highly precise assistant for thematic analysis. Your task is to extract key recurring themes from the provided text.

STRICT OUTPUT REQUIREMENTS:
1.  Your entire response MUST be a single, valid JSON list.
2.  Each element in the list MUST be a JSON object.
3.  Each JSON object MUST contain exactly three keys: "label", "description", and "surface_forms".
4.  The value for "label" MUST be a string, IN ALL_CAPITAL_SNAKE_CASE, AND enclosed in square brackets. Example of a correct label: "[DATA_SECURITY_POLICY]". Example of an incorrect label: "Data Security Policy".
5.  The value for "description" MUST be a single, concise sentence (string).
6.  The value for "surface_forms" MUST be a JSON list of 2 to 3 short (2-6 words) VERBATIM phrases extracted DIRECTLY from the 'Text to analyze'. These phrases must be strong examples of the theme. If no suitable verbatim phrases are found, provide an empty list `[]`. The "surface_forms" key MUST always be present.
7.  Identify up to {max_motifs_to_extract} themes. If fewer are clear, provide fewer objects. Do NOT include empty JSON objects `{{}}` in the list.
8.  Do NOT include any text, explanations, apologies, or markdown (like ```json) before or after the main JSON list.
9.  CRITICAL JSON SYNTAX: Ensure all strings are double-quoted. All lists (like "surface_forms") MUST start with '[' and end with ']', with elements comma-separated. Do NOT use trailing commas before a closing ']' or '}}'. For example, `["item1", "item2",]` is WRONG; it should be `["item1", "item2"]`.

INSTRUCTIONS FOR THEME IDENTIFICATION:
- Focus on meaningful recurring concepts directly stated or strongly implied in the 'Text to analyze'.
- For 'surface_forms', prioritize phrases that appear to be REPEATED or are highly characteristic of the theme within THIS text.
- Avoid generic labels like [EXAMPLE_THEME] or [GENERAL_TOPIC]. Make labels specific.

Text to analyze:
\"\"\"
{text_corpus_chunk}
\"\"\"

Your valid JSON response (ONLY the JSON list):
"""
    return prompt.strip()

def call_local_llm_for_raw_response( # Using this name to match Cell 4 calls
    prompt_content_for_user_turn: str,
    hf_pipeline_instance,
    hf_tokenizer_instance,
    qid_for_log: str,
    chunk_idx_for_log: int
    ) -> str:
    # (Same as your last working version - uses do_sample=False)
    if not hf_pipeline_instance or not hf_tokenizer_instance:
        print(f"    ERROR (call_local_llm): LLM pipeline/tokenizer not initialized for QID {qid_for_log}, Chunk {chunk_idx_for_log}.")
        return ""
    messages_for_template = [{"role": "user", "content": prompt_content_for_user_turn}]
    try:
        prompt_formatted_for_llm = hf_tokenizer_instance.apply_chat_template(
            messages_for_template, tokenize=False, add_generation_prompt=True
        )
    except Exception as e_template:
        print(f"    ERROR (call_local_llm): Applying chat template for QID {qid_for_log}, Chunk {chunk_idx_for_log}: {e_template}")
        with open(LLM_DEBUG_LOG_FILE, "a", encoding="utf-8") as f:
            f.write(f"\n--- QID: {qid_for_log} --- CHUNK: {chunk_idx_for_log} (call_local_llm) ---\nERROR APPLYING CHAT TEMPLATE: {e_template}\nPrompt content (first 300): {prompt_content_for_user_turn[:300]}...\n")
        return ""
    generation_args = {
        "max_new_tokens": LLM_MAX_NEW_TOKENS_MOTIF_EXTRACTION,
        "do_sample": False,
        "pad_token_id": hf_tokenizer_instance.pad_token_id
    }
    try:
        outputs = hf_pipeline_instance(prompt_formatted_for_llm, **generation_args)
        if outputs and isinstance(outputs, list) and len(outputs) > 0 and \
           outputs[0] and isinstance(outputs[0], dict) and 'generated_text' in outputs[0]:
            return outputs[0]['generated_text'].strip()
        else:
            print(f"    WARN (call_local_llm): LLM pipeline unexpected structure for QID {qid_for_log}, Chunk {chunk_idx_for_log}. Output: {outputs}")
            return ""
    except Exception as e_pipeline:
        print(f"    ERROR (call_local_llm): Exception during hf_pipeline call for QID {qid_for_log}, Chunk {chunk_idx_for_log}: {e_pipeline}")
        with open(LLM_DEBUG_LOG_FILE, "a", encoding="utf-8") as f:
            f.write(f"\n--- QID: {qid_for_log} --- CHUNK: {chunk_idx_for_log} (call_local_llm) ---\nERROR PIPELINE CALL: {e_pipeline}\nFormatted prompt (first 300): {prompt_formatted_for_llm[:300]}...\n")
        return ""

print("Cell 3: LLM Interaction Utilities loaded (with stricter JSON prompt).")

In [None]:
# @title Cell 4: Motif Processing and Validation

import json
import re
import time
from typing import List, Dict
# from collections import Counter # Defined in Cell 1, used by extract_actual_phrases (Cell 2)

# Assume constants from Cell 1
# Assume text_utils functions from Cell 2
# Assume LLM interaction functions from Cell 3

try:
    LLM_DEBUG_LOG_FILE; LLM_RETRY_ATTEMPTS; MIN_SF_FREQUENCY_IN_FULL_CORPUS
except NameError:
    LLM_DEBUG_LOG_FILE = "temp_llm_debug_log_cell4.txt"; LLM_RETRY_ATTEMPTS = 2; MIN_SF_FREQUENCY_IN_FULL_CORPUS = 2

def parse_and_validate_llm_json_response(
    llm_raw_response_text: str,
    qid_for_log:str,
    chunk_idx_for_log:int,
    prompt_sent_to_llm:str
    ) -> List[Dict]:
    """Parses LLM JSON, attempts label fixing, validates schema."""
    json_str_candidate = llm_raw_response_text.strip()
    if json_str_candidate.startswith("```json"): json_str_candidate = json_str_candidate[len("```json"):].strip()
    if json_str_candidate.startswith("```"): json_str_candidate = json_str_candidate[len("```"):].strip()
    if json_str_candidate.endswith("```"): json_str_candidate = json_str_candidate[:-len("```")].strip()

    if not json_str_candidate or json_str_candidate.lower() == "[]" or \
       "no_themes_found" in json_str_candidate.lower() or \
       "no clear motifs" in json_str_candidate.lower():
        return []

    # Attempt to fix common JSON error: trailing commas before closing brackets/braces
    json_str_candidate_fixed = re.sub(r',\s*([\}\]])', r'\1', json_str_candidate)
    if json_str_candidate_fixed != json_str_candidate:
        # print(f"    DEBUG (parse_validate): Applied trailing comma fix for QID {qid_for_log}, Chunk {chunk_idx_for_log}.")
        json_str_candidate = json_str_candidate_fixed

    try:
        parsed_data = json.loads(json_str_candidate)
        if isinstance(parsed_data, dict): parsed_data = [parsed_data]
        if not isinstance(parsed_data, list):
            raise ValueError("Parsed JSON from LLM is not a list or a single object.")

        valid_motifs_from_json = []
        for item_idx, item in enumerate(parsed_data):
            if not isinstance(item, dict) or not item: continue

            label_str_original = item.get('label', "")
            label_str_processed = ""
            if isinstance(label_str_original, str) and label_str_original.strip():
                temp_label_stripped = label_str_original.strip()
                match_strict_bracketed = re.fullmatch(r"\[([A-Z0-9_]+)\]", temp_label_stripped)
                if match_strict_bracketed:
                    label_str_processed = temp_label_stripped
                else:
                    match_bracketed_part = re.search(r"(\[[A-Z0-9_]+(?:_[A-Z0-9_]+)*\])", temp_label_stripped) # Allow multiple words in snake case
                    if match_bracketed_part:
                        label_str_processed = match_bracketed_part.group(1)
                    else:
                        sanitized_content = re.sub(r'\s+|-', '_', temp_label_stripped)
                        sanitized_content = re.sub(r'[^a-zA-Z0-9_]', '', sanitized_content).upper()
                        sanitized_content = "_".join(sanitized_content.split('_')[:4])
                        if sanitized_content:
                            label_str_processed = f"[{sanitized_content}]"

            # Check if all essential keys are present before trying to access them
            if not all(k in item for k in ["label", "description", "surface_forms"]):
                print(f"    [WARN] Item missing required keys for QID {qid_for_log}, Chunk {chunk_idx_for_log}, Item {item_idx+1}. Item: {str(item)[:200]}...")
                # ... (log to file) ...
                continue # Skip this malformed item

            # Validate using the processed label
            label_to_validate = label_str_processed # Use the processed one
            desc_str = item.get('description',"")
            sf_list = item.get('surface_forms', [])

            label_is_valid_format = isinstance(label_to_validate, str) and \
                                    bool(label_to_validate) and \
                                    label_to_validate.startswith('[') and \
                                    label_to_validate.endswith(']') and \
                                    re.fullmatch(r"\[[A-Z0-9_]+\]", label_to_validate)
            desc_is_valid = isinstance(desc_str, str)
            sfs_list_is_valid = isinstance(sf_list, list) and \
                                 all(isinstance(sf_item, str) for sf_item in sf_list)

            if label_is_valid_format and desc_is_valid and sfs_list_is_valid:
                valid_motifs_from_json.append({
                    "label": label_to_validate,
                    "description": desc_str.strip(),
                    "surface_forms": [s.strip() for s in sf_list if isinstance(s, str) and s.strip()]
                })
            else:
                print(f"    [WARN] Invalid motif object schema after label processing for QID {qid_for_log}, Chunk {chunk_idx_for_log}, Item {item_idx+1}. Skipping.")
                # ... (log to file with details of which validation failed) ...
        return valid_motifs_from_json
    except (json.JSONDecodeError, ValueError) as e:
        print(f"    [WARN] Motif JSON parsing or core structure issue for QID {qid_for_log}, Chunk {chunk_idx_for_log}: {e}")
        # ... (log to file, include json_str_candidate) ...
        return []

def get_motifs_for_qid_batched(
    list_of_individual_response_texts: List[str],
    responses_per_batch: int,
    hf_pipeline_instance,
    hf_tokenizer_instance,
    qid_for_log: str
    ) -> List[Dict]:
    # (This function remains structurally the same, calling the updated helpers)
    all_raw_motifs_from_chunks = []
    batched_text_chunks_for_llm = []
    for i in range(0, len(list_of_individual_response_texts), responses_per_batch):
        batch_responses = list_of_individual_response_texts[i:i + responses_per_batch]
        chunk_text_for_llm = preprocess_corpus_for_motif_extraction("\n\n<RSP_SEP>\n\n".join(batch_responses))
        batched_text_chunks_for_llm.append(chunk_text_for_llm)
    print(f"  QID {qid_for_log}: Processing {len(list_of_individual_response_texts)} responses in {len(batched_text_chunks_for_llm)} preprocessed chunks.")
    for chunk_idx, text_chunk_to_analyze_processed in enumerate(batched_text_chunks_for_llm):
        print(f"    Analyzing chunk {chunk_idx + 1}/{len(batched_text_chunks_for_llm)} for QID {qid_for_log} (len: {len(text_chunk_to_analyze_processed)} chars)...")
        if len(text_chunk_to_analyze_processed.strip()) < 50: print(f"      Chunk {chunk_idx+1} too short, skipping."); continue
        prompt_for_llm = create_enhanced_motif_prompt(text_chunk_to_analyze_processed)
        motifs_from_this_chunk = []
        for attempt in range(LLM_RETRY_ATTEMPTS):
            raw_llm_response = call_local_llm_for_raw_response(
                prompt_for_llm, hf_pipeline_instance, hf_tokenizer_instance, qid_for_log, chunk_idx + 1
            )
            if not raw_llm_response:
                print(f"      LLM call attempt {attempt + 1} for chunk {chunk_idx+1} returned empty. Retrying...");
                if attempt < LLM_RETRY_ATTEMPTS - 1: time.sleep(1); continue
            parsed_motifs_from_this_attempt = parse_and_validate_llm_json_response(
                raw_llm_response, qid_for_log, chunk_idx+1, prompt_for_llm
            )
            if parsed_motifs_from_this_attempt: motifs_from_this_chunk = parsed_motifs_from_this_attempt; break
            else:
                print(f"      Motif parsing/validation attempt {attempt + 1} yielded no motifs for chunk {chunk_idx+1}. Retrying...");
                if attempt < LLM_RETRY_ATTEMPTS - 1: time.sleep(1)
        if motifs_from_this_chunk:
            print(f"      Extracted {len(motifs_from_this_chunk)} motifs from chunk {chunk_idx+1} (QID {qid_for_log}).")
            all_raw_motifs_from_chunks.extend(motifs_from_this_chunk)
        else:
            print(f"      No valid motifs from chunk {chunk_idx+1} (QID {qid_for_log}) after {LLM_RETRY_ATTEMPTS} attempts.")
    return all_raw_motifs_from_chunks

def consolidate_raw_motifs(list_of_all_raw_motifs: List[Dict]) -> List[Dict]:
    # (Same as last version)
    if not list_of_all_raw_motifs: return []
    consolidated_motifs_map = {}
    for motif_obj in list_of_all_raw_motifs:
        label = motif_obj.get("label","").strip(); description = motif_obj.get("description","").strip()
        surface_forms = motif_obj.get("surface_forms", [])
        if not (label and isinstance(surface_forms, list)): continue
        current_sfs_set = set(sf.lower().strip() for sf in surface_forms if isinstance(sf, str) and sf.strip())
        if label not in consolidated_motifs_map:
            consolidated_motifs_map[label] = {"label": label, "description": description, "surface_forms": sorted(list(current_sfs_set))}
        else:
            existing_sfs_set = set(consolidated_motifs_map[label].get("surface_forms", []))
            consolidated_motifs_map[label]["surface_forms"] = sorted(list(existing_sfs_set.union(current_sfs_set)))
    return list(consolidated_motifs_map.values())

def filter_surface_forms_by_global_frequency(
    consolidated_motifs_list: List[Dict],
    full_qid_corpus_text: str,
    min_global_freq: int = MIN_SF_FREQUENCY_IN_FULL_CORPUS
    ) -> List[Dict]:
    # (Same as last version)
    if not consolidated_motifs_list: return []
    final_globally_filtered_motifs = []
    for motif_obj in consolidated_motifs_list:
        globally_frequent_sfs_for_this_motif = []
        original_sfs_for_this_motif = motif_obj.get("surface_forms", [])
        for sf_str_lower in original_sfs_for_this_motif:
            count = count_sf_occurrences(full_qid_corpus_text, sf_str_lower)
            if count >= min_global_freq:
                globally_frequent_sfs_for_this_motif.append(sf_str_lower)
        if globally_frequent_sfs_for_this_motif:
            filtered_motif_entry = motif_obj.copy()
            filtered_motif_entry["surface_forms"] = sorted(list(set(globally_frequent_sfs_for_this_motif)))
            final_globally_filtered_motifs.append(filtered_motif_entry)
    return final_globally_filtered_motifs

print("Cell 4: Motif Processing and Validation Utilities loaded (with enhanced prompt strategy and robust parsing).")

In [None]:
# @title Cell 5: MDL Calculations

import hashlib
import numpy as np
from pybdm import BDM
import re
from typing import List, Dict

# Assume constants from Cell 1 are in global scope
# Assume tokenize_phrase from Cell 2 is in global scope
try:
    MATRIX_SIZE_GLOBAL; MAX_TEXT_FOR_BDM_HASH; MOTIF_SYMBOLIC_LABEL_COST # Check a few
    MOTIF_DESCRIPTION_TEXT_BASE_COST; MOTIF_DESCRIPTION_TOKEN_COST
    MOTIF_SURFACE_FORMS_LIST_BASE_COST; MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH
except NameError:
    print("WARN (Cell 5): Key config constants not found from Cell 1. Using fallbacks or expecting errors.")
    MATRIX_SIZE_GLOBAL = (8, 8); MAX_TEXT_FOR_BDM_HASH = 2000
    MOTIF_SYMBOLIC_LABEL_COST = 0.5; MOTIF_DESCRIPTION_TEXT_BASE_COST = 0.5
    MOTIF_DESCRIPTION_TOKEN_COST = 0.1; MOTIF_SURFACE_FORMS_LIST_BASE_COST = 0.0
    MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH = 0.0


def initialize_bdm_instance():
    """Initializes and returns a BDM instance."""
    print("Initializing BDM instance...")
    try:
        bdm_instance = BDM(ndim=2)
        print("BDM instance initialized successfully (ndim=2, default CTM-based).")
        return bdm_instance
    except Exception as e_bdm_init:
        print(f"CRITICAL: Failed to initialize BDM instance: {e_bdm_init}")
        if "CTM data files" in str(e_bdm_init).lower() or "dataset" in str(e_bdm_init).lower():
            print("  BDM Error Hint: This might be related to missing/corrupted CTM data files for PyBDM.")
            print("  Ensure PyBDM is installed correctly and can access/download its data.")
            print("  You might need to run once: from pybdm import get_ctm_dataset; get_ctm_dataset()")
        return None

def text_to_binary_matrix(text_input: str, size: tuple = MATRIX_SIZE_GLOBAL) -> np.ndarray:
    """Converts a text string to a binary matrix using its SHA256 hash."""
    if not isinstance(text_input, str) or not text_input.strip():
        return np.zeros(size, dtype=int)
    hash_obj = hashlib.sha256(text_input.encode('utf-8', 'ignore'))
    hash_digest = hash_obj.hexdigest()
    required_bits = size[0] * size[1]
    binary_string_from_hash = bin(int(hash_digest, 16))[2:].zfill(256)
    binary_string_for_matrix = binary_string_from_hash[:required_bits] if required_bits <= 256 else binary_string_from_hash.ljust(required_bits, '0')
    bits_for_matrix = [int(b) for b in binary_string_for_matrix]
    return np.array(bits_for_matrix).reshape(size)

def compute_bdm_for_text(text_input: str, bdm_instance: BDM, matrix_s: tuple = MATRIX_SIZE_GLOBAL) -> float:
    """Computes BDM for a given text string using a prefix for hashing."""
    if not isinstance(text_input, str) or not text_input.strip() :
        return 0.0
    text_for_hash = text_input[:MAX_TEXT_FOR_BDM_HASH] if len(text_input) > MAX_TEXT_FOR_BDM_HASH else text_input
    if not text_for_hash.strip():
        return 0.0
    binary_matrix = text_to_binary_matrix(text_for_hash, size=matrix_s)
    try:
        bdm_value = bdm_instance.bdm(binary_matrix)
        return bdm_value
    except Exception as e_bdm:
        print(f"      Error during BDM calculation for text (full len {len(text_input)}, hashed part len {len(text_for_hash)}): {e_bdm}")
        return -1.0 # Indicate error

# VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
# CORRECTED VERSION OF THIS FUNCTION
# VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
def calculate_L_H_token_based_structured(structured_motifs_list: List[Dict]) -> float:
    """Calculates L(H) - the cost of defining the list of structured motifs."""
    if not structured_motifs_list:
        return 0.0

    total_lh_cost = 0.0
    for motif_obj in structured_motifs_list: # For each motif
        if not isinstance(motif_obj, dict):
            continue

        current_motif_lh = 0.0 # INITIALIZE PER MOTIF

        label_str = motif_obj.get('label', "")
        if isinstance(label_str, str) and label_str.strip():
            current_motif_lh += MOTIF_SYMBOLIC_LABEL_COST

        description_str = motif_obj.get('description', "")
        if isinstance(description_str, str) and description_str.strip():
            current_motif_lh += MOTIF_DESCRIPTION_TEXT_BASE_COST
            current_motif_lh += len(tokenize_phrase(description_str)) * MOTIF_DESCRIPTION_TOKEN_COST # tokenize_phrase from Cell 2

        surface_forms_list = motif_obj.get('surface_forms', [])
        if isinstance(surface_forms_list, list) and surface_forms_list:
            valid_sfs_for_lh = [sf for sf in surface_forms_list if isinstance(sf, str) and sf.strip()]
            if valid_sfs_for_lh: # Only add base cost if there are actual SFs
                current_motif_lh += MOTIF_SURFACE_FORMS_LIST_BASE_COST # From Cell 1 config
                for sf_str in valid_sfs_for_lh:
                    current_motif_lh += len(tokenize_phrase(sf_str)) * MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH # From Cell 1 config

        total_lh_cost += current_motif_lh
    return total_lh_cost
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# END OF CORRECTION
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


def llm_compress_text_structured(text_to_compress: str, structured_motifs_list: List[Dict]) -> str:
    """Compresses text by replacing occurrences of motif surface forms with their symbolic labels."""
    if not isinstance(text_to_compress, str): return ""
    if not structured_motifs_list: return text_to_compress.lower()
    compressed_text = text_to_compress.lower()
    for motif_obj in structured_motifs_list:
        if not isinstance(motif_obj, dict): continue
        label = motif_obj.get('label', None)
        surface_forms = motif_obj.get('surface_forms', [])
        if not (isinstance(label, str) and label.strip()) or \
           not (isinstance(surface_forms, list) and surface_forms):
            continue
        placeholder = label
        sorted_sfs_for_this_motif = sorted(
            [sf for sf in surface_forms if isinstance(sf, str) and sf.strip()], key=len, reverse=True
        )
        for sf_str_lower in sorted_sfs_for_this_motif: # Assumes sf_str is already lowercased from consolidation/filtering
            try:
                # Using word boundaries for more precise replacement
                compressed_text = re.sub(r'\b' + re.escape(sf_str_lower) + r'\b', placeholder, compressed_text)
            except re.error as re_e:
                print(f"    Regex error during compression for SF '{sf_str_lower}' of motif '{label}': {re_e}. Skipping.")
                continue
    return compressed_text

def compute_mdl_cost_for_text_block(
    full_qid_corpus_str: str,
    final_motifs_to_evaluate: List[Dict],
    bdm_instance: BDM,
    matrix_s: tuple = MATRIX_SIZE_GLOBAL
    ) -> tuple[float, float, float]:
    """Computes L(H), L(D|H), and Total MDL for a text block given a final set of motifs."""
    if not isinstance(full_qid_corpus_str, str): full_qid_corpus_str = ""
    l_h = calculate_L_H_token_based_structured(final_motifs_to_evaluate) # Uses the corrected version
    compressed_text_block = llm_compress_text_structured(full_qid_corpus_str, final_motifs_to_evaluate)
    l_d_h = compute_bdm_for_text(compressed_text_block, bdm_instance, matrix_s)
    if l_d_h < 0: return l_h, -1.0, -1.0
    total_mdl_cost = l_h + l_d_h
    return l_h, l_d_h, total_mdl_cost

print("Cell 5: MDL Calculation Utilities loaded (with L(H) fix).")

In [None]:
# @title Cell 6: Main Pipeline Orchestration

# Assume functions from previous cells are defined

def main():
    script_version_name = "Refactored MWP v8 (Stricter Prompt, Robust Label Fix, L(H) SF Cost Zero)"
    print(f"--- {script_version_name} ---")
    print(f"Timestamp: {time.asctime()}")
    print("\n--- Configuration Summary ---") # ... (print all config constants from Cell 1)
    print(f"LLM Model: {LOCAL_LLM_MODEL_ID}, Quantization: {USE_QUANTIZATION_FOR_LOCAL_LLM}")
    print(f"LLM Batch Size (Responses): {LLM_BATCH_SIZE_RESPONSES}, Retries: {LLM_RETRY_ATTEMPTS}")
    print(f"Max Text Chars per LLM Prompt Chunk: {MAX_TEXT_CHARS_PER_LLM_PROMPT_CHUNK}")
    print(f"Max New Tokens for LLM Motif Extraction: {LLM_MAX_NEW_TOKENS_MOTIF_EXTRACTION}")
    print(f"Max Motifs to Request per Chunk: {MAX_MOTIFS_PER_CHUNK}")
    print(f"L(H) Costs: Label={MOTIF_SYMBOLIC_LABEL_COST}, DescBase={MOTIF_DESCRIPTION_TEXT_BASE_COST}, DescToken={MOTIF_DESCRIPTION_TOKEN_COST}, SFListBase={MOTIF_SURFACE_FORMS_LIST_BASE_COST}, SFTokenInLH={MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH}")
    print(f"Global SF Filtering Min Freq: {MIN_SF_FREQUENCY_IN_FULL_CORPUS}")
    print(f"BDM Hash Prefix Length: {MAX_TEXT_FOR_BDM_HASH}, BDM Matrix: {MATRIX_SIZE_GLOBAL}")
    print(f"Debug Log File: {LLM_DEBUG_LOG_FILE}")
    print("--- End Configuration Summary ---\n")

    try: # Initialize debug log
        with open(LLM_DEBUG_LOG_FILE, "w", encoding="utf-8") as f:
            f.write(f"LLM Motif Debug Log - Run Started: {time.asctime()}\n")
            f.write(f"Script Version: {script_version_name}\n")
            f.write(f"Model ID: {LOCAL_LLM_MODEL_ID}\n")
            f.write(f"Pipeline Config: return_full_text=False\n")
            f.write(f"L(H) SF Costs: Base={MOTIF_SURFACE_FORMS_LIST_BASE_COST}, Token={MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH}\n---\n")
    except Exception as e_log: print(f"WARN: Could not initialize debug log file {LLM_DEBUG_LOG_FILE}: {e_log}")

    hf_pipeline_instance, hf_tokenizer_instance = initialize_llm_pipeline() # From Cell 3
    if not hf_pipeline_instance: print("CRITICAL: Exiting: LLM init failure."); return
    bdm_instance_main = initialize_bdm_instance() # From Cell 5
    if not bdm_instance_main: print("CRITICAL: Exiting: BDM init failure."); return

    if not os.path.exists(P2_COLLATED_FILE): print(f"ERROR: File {P2_COLLATED_FILE} not found."); return
    print(f"Loading data from: {P2_COLLATED_FILE}..."); phase2_data_content = None
    try:
        with open(P2_COLLATED_FILE, 'r', encoding='utf-8') as f: phase2_data_content = json.load(f)
    except Exception as e_load: print(f"Error loading {P2_COLLATED_FILE}: {e_load}"); return

    all_qid_mdl_results_list = []
    aggregated_content_by_qid_from_file = phase2_data_content.get("aggregated_pdf_content_by_qid", {})
    if not aggregated_content_by_qid_from_file: print(f"No 'aggregated_pdf_content_by_qid' in {P2_COLLATED_FILE}."); return

    qids_to_process_this_run = [] # From Cell 1 (config)
    if P3_QIDS_TO_PROCESS_THEMATICALLY and isinstance(P3_QIDS_TO_PROCESS_THEMATICALLY, list) and P3_QIDS_TO_PROCESS_THEMATICALLY:
        qids_to_process_this_run = [qid for qid in P3_QIDS_TO_PROCESS_THEMATICALLY if qid in aggregated_content_by_qid_from_file]
        if not qids_to_process_this_run: print(f"Warning: QIDs {P3_QIDS_TO_PROCESS_THEMATICALLY} not found. Exiting."); return
    else:
        qids_to_process_limit_fallback = 1
        print(f"P3_QIDS_TO_PROCESS_THEMATICALLY not set/empty. Processing up to {qids_to_process_limit_fallback} QID(s) as fallback.")
        qids_to_process_this_run = list(aggregated_content_by_qid_from_file.keys())[:qids_to_process_limit_fallback]
        if not qids_to_process_this_run: print("No QIDs in data for fallback. Exiting."); return
    if not qids_to_process_this_run: print("No QIDs selected. Exiting."); return
    print(f"\nMDL analysis will run for QIDs: {qids_to_process_this_run}\n")

    for qid_identifier_str in qids_to_process_this_run:
        print(f"--- Analyzing Data for QID: {qid_identifier_str} ---")
        list_of_individual_response_structs = aggregated_content_by_qid_from_file.get(qid_identifier_str, [])
        actual_response_texts_for_qid = [item.get("text", "") for item in list_of_individual_response_structs if isinstance(item, dict) and isinstance(item.get("text"), str) and item.get("text","").strip()]
        if not actual_response_texts_for_qid: print(f"  No valid text for QID {qid_identifier_str}. Skipping."); print("-" * 50); continue
        full_corpus_text_for_qid = "\n\n<RSP_SEP>\n\n".join(actual_response_texts_for_qid)
        if len(full_corpus_text_for_qid.strip()) < 100: print(f"  Skipping QID {qid_identifier_str}: text too short."); print("-" * 50); continue
        num_total_responses_for_qid = len(actual_response_texts_for_qid)
        print(f"  Corpus for QID {qid_identifier_str}: {len(full_corpus_text_for_qid)} chars, {num_total_responses_for_qid} responses.")

        baseline_bdm_original_corpus = compute_bdm_for_text(full_corpus_text_for_qid, bdm_instance_main)
        if baseline_bdm_original_corpus < 0: print(f"  Error computing baseline BDM for QID {qid_identifier_str}. Skipping."); print("-" * 50); continue
        current_qid_baseline_mdl_cost = baseline_bdm_original_corpus
        print(f"  Baseline MDL for QID {qid_identifier_str} (L(D_orig)): {current_qid_baseline_mdl_cost:.4f}")

        raw_motifs_from_chunks = get_motifs_for_qid_batched( # From Cell 4
            actual_response_texts_for_qid, LLM_BATCH_SIZE_RESPONSES,
            hf_pipeline_instance, hf_tokenizer_instance, qid_identifier_str
        )
        current_qid_result_entry = { # Init result entry
            "qid": qid_identifier_str, "corpus_len_chars": len(full_corpus_text_for_qid), "num_responses": num_total_responses_for_qid,
            "baseline_mdl": current_qid_baseline_mdl_cost, "final_refined_motifs": [], "l_h_final_motifs": 0.0,
            "l_d_h_final_motifs": current_qid_baseline_mdl_cost, "total_mdl_with_final_motifs": current_qid_baseline_mdl_cost,
            "compression_achieved": 0.0, "num_raw_motifs_extracted": len(raw_motifs_from_chunks),
            "num_consolidated_motifs": 0, "num_globally_refined_motifs": 0
        }
        if not raw_motifs_from_chunks: print(f"  No raw motifs by LLM for QID {qid_identifier_str}."); all_qid_mdl_results_list.append(current_qid_result_entry); print("-" * 50); continue
        print(f"  Extracted {len(raw_motifs_from_chunks)} raw motif objects for QID {qid_identifier_str}.")

        consolidated_motifs_list = consolidate_raw_motifs(raw_motifs_from_chunks) # From Cell 4
        current_qid_result_entry["num_consolidated_motifs"] = len(consolidated_motifs_list)
        print(f"  Consolidated into {len(consolidated_motifs_list)} unique motifs for QID {qid_identifier_str}.")
        if not consolidated_motifs_list: print(f"  No unique motifs after consolidation for QID {qid_identifier_str}."); all_qid_mdl_results_list.append(current_qid_result_entry); print("-" * 50); continue

        # print(f"  Consolidated Motifs (BEFORE Global SF refinement):")
        # for idx, mo_con in enumerate(consolidated_motifs_list): print(f"    Cons. Motif {idx+1}: L='{mo_con.get('label')}', D='{mo_con.get('description','N/A')[:30]}...', SFs({len(mo_con.get('surface_forms',[]))})='{mo_con.get('surface_forms',[])[:2]}...'")

        globally_refined_motifs = filter_surface_forms_by_global_frequency( # From Cell 4
            consolidated_motifs_list, full_corpus_text_for_qid, MIN_SF_FREQUENCY_IN_FULL_CORPUS
        )
        current_qid_result_entry["num_globally_refined_motifs"] = len(globally_refined_motifs)
        print(f"  Globally refined into {len(globally_refined_motifs)} motifs for QID {qid_identifier_str}.")
        if not globally_refined_motifs: print(f"  No motifs left after GLOBAL SF refinement for QID {qid_identifier_str}."); all_qid_mdl_results_list.append(current_qid_result_entry); print("-" * 50); continue

        print(f"  Final Globally Refined Motifs for QID {qid_identifier_str}:")
        for idx, mo_final in enumerate(globally_refined_motifs): print(f"    Refined Motif {idx+1}: L='{mo_final.get('label')}', D='{mo_final.get('description','N/A')[:60]}...', SFs({len(mo_final.get('surface_forms',[]))})='{mo_final.get('surface_forms',[])}'")

        l_h_final, l_d_h_final, total_mdl_final = compute_mdl_cost_for_text_block( # From Cell 5
            full_corpus_text_for_qid, globally_refined_motifs, bdm_instance_main
        )
        current_qid_result_entry.update({
            "final_refined_motifs": globally_refined_motifs, "l_h_final_motifs": l_h_final,
            "l_d_h_final_motifs": l_d_h_final if l_d_h_final >=0 else "BDM_ERROR",
            "total_mdl_with_final_motifs": total_mdl_final if l_d_h_final >=0 else "BDM_ERROR",
            "compression_achieved": "BDM_ERROR" if l_d_h_final < 0 else (current_qid_baseline_mdl_cost - total_mdl_final)
        })
        if l_d_h_final < 0: print(f"  Error computing MDL cost (BDM error) for QID {qid_identifier_str}."); all_qid_mdl_results_list.append(current_qid_result_entry); print("-" * 50); continue

        print(f"  L(H) final motifs: {l_h_final:.4f} (SFs definition cost in L(H) is ZEROED for this run)")
        print(f"  L(D|H) compressed full corpus: {l_d_h_final:.4f}")
        print(f"  Total MDL cost with final motifs: {total_mdl_final:.4f}")
        compression_val = current_qid_result_entry["compression_achieved"]
        result_status_str = f"SUCCESS: Comp: {compression_val:.4f}" if isinstance(compression_val, float) and compression_val > 0.0001 else f"NOTE: No sig. comp. Diff: {compression_val if isinstance(compression_val, str) else compression_val:.4f}"
        print(f"  {result_status_str}"); all_qid_mdl_results_list.append(current_qid_result_entry); print("-" * 50)

    print("\n--- Overall QID-based MDL Analysis Summary ---")
    if not all_qid_mdl_results_list: print("No QIDs processed.")
    else:
        valid_results = [r for r in all_qid_mdl_results_list if isinstance(r.get('compression_achieved'), float) and r.get('l_h_final_motifs', -1.0) >= 0]
        num_qids_ok = len(valid_results); num_comp = sum(1 for r in valid_results if r['compression_achieved'] > 0.0001)
        print(f"Targeted QIDs: {len(qids_to_process_this_run)}, Results logged: {len(all_qid_mdl_results_list)}, Valid MDL: {num_qids_ok}, QIDs compressed: {num_comp}")
        if num_comp > 0:
            comp_vals = [r['compression_achieved'] for r in valid_results if r['compression_achieved'] > 0.0001]
            print(f"  Avg compression: {np.mean(comp_vals):.4f}, Max compression: {np.max(comp_vals):.4f}")
        else: print("  No compression achieved.")

        output_filename = os.path.join(BASE_PROJECT_DIR, "mdl_analysis_refactored_v8_strictprompt_LHSFzero.json")
        try:
            with open(output_filename, "w", encoding="utf-8") as f_out: json.dump(all_qid_mdl_results_list, f_out, indent=2, ensure_ascii=False)
            print(f"Detailed results saved to {output_filename}")
        except Exception as e_s: print(f"Error saving results: {e_s}")

if __name__ == "__main__":
    print(f"Executing main MDL pipeline (Refactored MWP v8 - Stricter Prompt, Label Fix, L(H) SF Cost Zero) at {time.asctime()}...")
    main()
    print(f"Main MDL pipeline execution finished at {time.asctime()}.")

# 3rd June

In [None]:
# @title Cell 1: Configuration

import os
from typing import List, Dict, Set
from collections import Counter

# --- Project Paths ---
BASE_PROJECT_DIR = '/content/drive/MyDrive/Colab Notebooks/Legal/'
PHASE2_OUTPUT_DIR = os.path.join(BASE_PROJECT_DIR, 'Phase2_PDF_Collated_Texts/')
P2_COLLATED_FILE = os.path.join(PHASE2_OUTPUT_DIR, 'phase2_collated_pdf_texts.json')

P3_QIDS_TO_PROCESS_THEMATICALLY = ["Q4"]

# --- BDM Configuration ---
MATRIX_SIZE_GLOBAL = (8, 8)
MAX_TEXT_FOR_BDM_HASH = 2000

# --- LLM Configuration ---
LOCAL_LLM_MODEL_ID = 'google/gemma-2b-it'
USE_QUANTIZATION_FOR_LOCAL_LLM = True
LLM_BATCH_SIZE_RESPONSES = 5
LLM_RETRY_ATTEMPTS = 2
MAX_TEXT_CHARS_PER_LLM_PROMPT_CHUNK = 7000
LLM_MAX_NEW_TOKENS_MOTIF_EXTRACTION = 1000 # INCREASED FOR TESTING JSON COMPLETENESS
MAX_MOTIFS_PER_CHUNK = 5

# --- Token-Based L(H) Configuration (SFs cost zeroed for current experiment) ---
MOTIF_SYMBOLIC_LABEL_COST = 0.5
MOTIF_DESCRIPTION_TEXT_BASE_COST = 0.5
MOTIF_DESCRIPTION_TOKEN_COST = 0.1
MOTIF_SURFACE_FORMS_LIST_BASE_COST = 0.0
MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH = 0.0

# --- Surface Form Filtering Configuration ---
MIN_SF_FREQUENCY_IN_FULL_CORPUS = 2

# --- Logging File ---
LLM_DEBUG_LOG_FILE = os.path.join(BASE_PROJECT_DIR, "llm_motif_debug_log_refactored_v7_prompt_jsonfix.txt")

print(f"Cell 1: Configuration loaded. LOCAL_LLM_MODEL_ID set to '{LOCAL_LLM_MODEL_ID}'.")
print(f"LLM_MAX_NEW_TOKENS_MOTIF_EXTRACTION set to: {LLM_MAX_NEW_TOKENS_MOTIF_EXTRACTION}")
print(f"L(H) SF Costs: Base={MOTIF_SURFACE_FORMS_LIST_BASE_COST}, Token={MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH}")
print(f"Debug log will be: {LLM_DEBUG_LOG_FILE}")
if not os.path.exists(BASE_PROJECT_DIR):
    print(f"WARNING: BASE_PROJECT_DIR '{BASE_PROJECT_DIR}' does not exist.")
if P2_COLLATED_FILE and not os.path.exists(P2_COLLATED_FILE):
     print(f"WARNING: P2_COLLATED_FILE '{P2_COLLATED_FILE}' does not exist. Data loading may fail.")

In [None]:
# @title Cell 2: Text Utilities

import re
from typing import List, Dict
from collections import Counter

# Constants this cell might use if run independently
try:
    MIN_SF_FREQ_IN_CHUNK_VALIDATION # Check if defined from config
except NameError:
    MIN_SF_FREQ_IN_CHUNK_VALIDATION = 2


def tokenize_phrase(phrase_text: str) -> List[str]:
    if not isinstance(phrase_text, str) or not phrase_text.strip():
        return []
    return phrase_text.lower().split()

def preprocess_corpus_for_motif_extraction(text_corpus: str) -> str:
    if not isinstance(text_corpus, str):
        return ""
    text = re.sub(r'\n{3,}', '\n\n', text_corpus)
    text = re.sub(r' {2,}', ' ', text)
    lines = text.split('\n')
    filtered_lines = [line.strip() for line in lines if len(line.strip()) > 10 or not line.strip()]
    return '\n'.join(filtered_lines)

def count_sf_occurrences(corpus_text: str, surface_form: str) -> int:
    if not corpus_text or not surface_form or \
       not isinstance(corpus_text, str) or not isinstance(surface_form, str) or \
       not surface_form.strip():
        return 0
    try:
        return len(re.findall(re.escape(surface_form.lower()), corpus_text.lower(), flags=re.IGNORECASE))
    except re.error as e:
        print(f"    [WARN] Regex error in count_sf_occurrences for SF '{surface_form}': {e}")
        return 0

def extract_actual_phrases_from_text( # Not actively used in current main flow
    text: str,
    min_phrase_len: int = 2,
    max_phrase_len: int = 6,
    min_freq: int = MIN_SF_FREQ_IN_CHUNK_VALIDATION
    ) -> Dict[str, int]:
    if not isinstance(text, str) or not text.strip(): return {}
    text_cleaned = text.lower()
    text_cleaned = re.sub(r'[^\w\s\']', ' ', text_cleaned)
    text_cleaned = re.sub(r'\s+', ' ', text_cleaned).strip()
    words = text_cleaned.split()
    if not words or len(words) < min_phrase_len: return {}
    phrase_counts = Counter()
    for n in range(min_phrase_len, max_phrase_len + 1):
        if n > len(words): continue
        for i in range(len(words) - n + 1):
            phrase = ' '.join(words[i:i+n])
            if phrase: phrase_counts[phrase] += 1
    return {phrase: count for phrase, count in phrase_counts.items() if count >= min_freq}

print("Cell 2: Text Utilities loaded.")

In [None]:
# @title Cell 3: LLM Interaction

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import time
from typing import List, Dict
import os
import re

try:
    LOCAL_LLM_MODEL_ID; USE_QUANTIZATION_FOR_LOCAL_LLM; MAX_MOTIFS_PER_CHUNK
    MAX_TEXT_CHARS_PER_LLM_PROMPT_CHUNK; LLM_MAX_NEW_TOKENS_MOTIF_EXTRACTION; LLM_DEBUG_LOG_FILE
except NameError:
    print("WARN (Cell 3): Key config constants not found from Cell 1. Using fallbacks.")
    LOCAL_LLM_MODEL_ID = 'google/gemma-2b-it'; USE_QUANTIZATION_FOR_LOCAL_LLM = True
    MAX_MOTIFS_PER_CHUNK = 5; MAX_TEXT_CHARS_PER_LLM_PROMPT_CHUNK = 7000
    LLM_MAX_NEW_TOKENS_MOTIF_EXTRACTION = 1000 # Matching Cell 1
    LLM_DEBUG_LOG_FILE = "temp_llm_debug_log_cell3.txt"


def initialize_llm_pipeline(
    model_id: str = LOCAL_LLM_MODEL_ID,
    use_quantization: bool = USE_QUANTIZATION_FOR_LOCAL_LLM,
    pipeline_return_full_text: bool = False
    ):
    # (Same as your last working version of this function)
    print(f"--- Initializing LLM Pipeline (model: {model_id}, quantization: {use_quantization}, return_full_text: {pipeline_return_full_text}) ---")
    hf_pipeline_instance = None; hf_tokenizer_instance = None
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu"); print(f"Using device: {device}")
    try:
        print(f"Loading tokenizer for {model_id}..."); hf_tokenizer_instance = AutoTokenizer.from_pretrained(model_id)
        if hf_tokenizer_instance.pad_token is None:
            if hf_tokenizer_instance.eos_token is not None: print("Tokenizer setting pad_token = eos_token."); hf_tokenizer_instance.pad_token = hf_tokenizer_instance.eos_token
            else: print("WARN (initialize_llm): Tokenizer has no pad_token and no eos_token.")
        bnb_config = None; quant_active = False
        if use_quantization and torch.cuda.is_available():
            try:
                compute_dtype = torch.bfloat16 if (device.type == 'cuda' and torch.cuda.is_bf16_supported()) else torch.float16
                bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=compute_dtype, bnb_4bit_use_double_quant=True)
                quant_active = True; print(f"BNB config created, compute_dtype: {compute_dtype}.")
            except Exception as e_bnb: print(f"WARN: Failed BitsAndBytesConfig: {e_bnb}. Quantization may be disabled."); quant_active = False
        print(f"Loading model {model_id} (Quantization: {quant_active})..."); model_kwargs = {"device_map": "auto", "trust_remote_code": True}
        if quant_active and bnb_config: model_kwargs["quantization_config"] = bnb_config
        elif device.type == 'cuda': model_kwargs["torch_dtype"] = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
        hf_model_instance = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)
        if hf_tokenizer_instance.pad_token_id is not None:
            if hf_model_instance.config.pad_token_id is None or hf_model_instance.config.pad_token_id != hf_tokenizer_instance.pad_token_id:
                hf_model_instance.config.pad_token_id = hf_tokenizer_instance.pad_token_id
        hf_pipeline_instance = pipeline("text-generation", model=hf_model_instance, tokenizer=hf_tokenizer_instance, return_full_text=pipeline_return_full_text)
        print(f"LLM pipeline initialized for {model_id}."); return hf_pipeline_instance, hf_tokenizer_instance
    except Exception as e: print(f"CRITICAL: LLM pipeline init failed: {e}"); import traceback; traceback.print_exc(); return None, None


def create_enhanced_motif_prompt(text_block_for_prompt: str, max_motifs_to_extract: int = MAX_MOTIFS_PER_CHUNK) -> str:
    """Revised prompt with stronger emphasis on JSON syntax and label format."""
    if len(text_block_for_prompt) > MAX_TEXT_CHARS_PER_LLM_PROMPT_CHUNK:
        text_block_for_prompt = text_block_for_prompt[:MAX_TEXT_CHARS_PER_LLM_PROMPT_CHUNK]

    prompt = f"""You are a precise assistant for thematic analysis. Your task is to extract key recurring themes from the provided text.

STRICT OUTPUT REQUIREMENTS:
1.  Your entire response MUST be a single, valid JSON list.
2.  Each element in the list MUST be a JSON object.
3.  Each JSON object MUST contain exactly three keys: "label", "description", and "surface_forms".
4.  The value for "label" MUST be a string, IN ALL_CAPITAL_SNAKE_CASE, AND enclosed in square brackets. Example of a correct label: "[DATA_SECURITY_POLICY]". Example of an incorrect label: "Data Security Policy".
5.  The value for "description" MUST be a single, concise sentence (string).
6.  The value for "surface_forms" MUST be a JSON list of 2 to 3 short (2-6 words) VERBATIM phrases extracted DIRECTLY from the 'Text to analyze'. These phrases must be strong examples of the theme. If no suitable verbatim phrases are found, provide an empty list `[]`.
7.  Identify up to {max_motifs_to_extract} themes. If fewer are clear, provide fewer objects. If no themes are clear, output an empty JSON list: `[]`.
8.  Do NOT include any text, explanations, apologies, or markdown (like ```json) before or after the main JSON list. Ensure all strings in the JSON are properly double-quoted and terminated, and all lists/objects are correctly bracketed with elements comma-separated. Avoid trailing commas before a closing ']' or '}}'.

INSTRUCTIONS FOR THEME IDENTIFICATION:
- Focus on meaningful recurring concepts directly stated or strongly implied in the 'Text to analyze'.
- For 'surface_forms', prioritize phrases that appear to be REPEATED or are highly characteristic of the theme within THIS text.
- Avoid generic labels like [EXAMPLE_THEME] or [GENERAL_TOPIC]. Make labels specific.

Text to analyze:
\"\"\"
{text_block_for_prompt}
\"\"\"

Your valid JSON response (ONLY the JSON list):
"""
    return prompt.strip()

def call_local_llm_for_raw_response(
    prompt_content_for_user_turn: str,
    hf_pipeline_instance,
    hf_tokenizer_instance,
    qid_for_log: str,
    chunk_idx_for_log: int
    ) -> str:
    # (Same as your last working version - uses do_sample=False)
    if not hf_pipeline_instance or not hf_tokenizer_instance:
        print(f"    ERROR (call_local_llm): LLM pipeline/tokenizer not initialized for QID {qid_for_log}, Chunk {chunk_idx_for_log}.")
        return ""
    messages_for_template = [{"role": "user", "content": prompt_content_for_user_turn}]
    try:
        prompt_formatted_for_llm = hf_tokenizer_instance.apply_chat_template(
            messages_for_template, tokenize=False, add_generation_prompt=True
        )
    except Exception as e_template:
        print(f"    ERROR (call_local_llm): Applying chat template for QID {qid_for_log}, Chunk {chunk_idx_for_log}: {e_template}")
        with open(LLM_DEBUG_LOG_FILE, "a", encoding="utf-8") as f:
            f.write(f"\n--- QID: {qid_for_log} --- CHUNK: {chunk_idx_for_log} (call_local_llm) ---\nERROR APPLYING CHAT TEMPLATE: {e_template}\nPrompt content (first 300): {prompt_content_for_user_turn[:300]}...\n")
        return ""
    generation_args = {
        "max_new_tokens": LLM_MAX_NEW_TOKENS_MOTIF_EXTRACTION,
        "do_sample": False,
        "pad_token_id": hf_tokenizer_instance.pad_token_id
    }
    try:
        outputs = hf_pipeline_instance(prompt_formatted_for_llm, **generation_args)
        if outputs and isinstance(outputs, list) and len(outputs) > 0 and \
           outputs[0] and isinstance(outputs[0], dict) and 'generated_text' in outputs[0]:
            return outputs[0]['generated_text'].strip()
        else:
            print(f"    WARN (call_local_llm): LLM pipeline unexpected structure for QID {qid_for_log}, Chunk {chunk_idx_for_log}. Output: {outputs}")
            return ""
    except Exception as e_pipeline:
        print(f"    ERROR (call_local_llm): Exception during hf_pipeline call for QID {qid_for_log}, Chunk {chunk_idx_for_log}: {e_pipeline}")
        with open(LLM_DEBUG_LOG_FILE, "a", encoding="utf-8") as f:
            f.write(f"\n--- QID: {qid_for_log} --- CHUNK: {chunk_idx_for_log} (call_local_llm) ---\nERROR PIPELINE CALL: {e_pipeline}\nFormatted prompt (first 300): {prompt_formatted_for_llm[:300]}...\n")
        return ""

print("Cell 3: LLM Interaction Utilities loaded (with more explicit JSON prompt).")

In [None]:
# @title Cell 4: Motif Processing and Validation

import json
import re
import time
from typing import List, Dict

# Assume constants (LLM_DEBUG_LOG_FILE, etc.) from Cell 1 are in global scope
# Assume text_utils (preprocess_corpus_for_motif_extraction, count_sf_occurrences) from Cell 2
# Assume LLM interaction (build_llm_prompt_for_motifs, call_local_llm_for_raw_response) from Cell 3

try:
    LLM_DEBUG_LOG_FILE; LLM_RETRY_ATTEMPTS; MIN_SF_FREQUENCY_IN_FULL_CORPUS
except NameError:
    LLM_DEBUG_LOG_FILE = "temp_llm_debug_log_cell4.txt"; LLM_RETRY_ATTEMPTS = 2; MIN_SF_FREQUENCY_IN_FULL_CORPUS = 2


def parse_and_validate_llm_json_response(
    llm_raw_response_text: str,
    qid_for_log:str,
    chunk_idx_for_log:int,
    prompt_sent_to_llm:str # For logging context
    ) -> List[Dict]:
    """Parses LLM JSON, attempts label fixing, validates schema."""
    json_str_candidate = llm_raw_response_text.strip()
    if json_str_candidate.startswith("```json"): json_str_candidate = json_str_candidate[len("```json"):].strip()
    if json_str_candidate.startswith("```"): json_str_candidate = json_str_candidate[len("```"):].strip()
    if json_str_candidate.endswith("```"): json_str_candidate = json_str_candidate[:-len("```")].strip()

    if not json_str_candidate or json_str_candidate.lower() == "[]" or \
       "no_themes_found" in json_str_candidate.lower() or \
       "no clear motifs" in json_str_candidate.lower():
        return []

    try:
        parsed_data = json.loads(json_str_candidate)
        if isinstance(parsed_data, dict): parsed_data = [parsed_data]
        if not isinstance(parsed_data, list):
            raise ValueError("Parsed JSON is not a list or single object.")

        valid_motifs_from_json = []
        for item_idx, item in enumerate(parsed_data):
            if not isinstance(item, dict) or not item: continue

            # --- Robust Label Processing ---
            label_str_original = item.get('label', "")
            label_str_processed = ""
            if isinstance(label_str_original, str) and label_str_original.strip():
                temp_label_stripped = label_str_original.strip()
                # Priority 1: Extract existing well-formed [UPPER_SNAKE_CASE]
                match_strict_bracketed = re.fullmatch(r"\[([A-Z0-9_]+)\]", temp_label_stripped)
                if match_strict_bracketed:
                    label_str_processed = temp_label_stripped
                else:
                    # Priority 2: Extract bracketed part even if extra text (e.g., "[LABEL] explanation")
                    match_bracketed_part = re.search(r"(\[[A-Z0-9_]+\])", temp_label_stripped)
                    if match_bracketed_part:
                        label_str_processed = match_bracketed_part.group(1)
                    else: # Priority 3: No brackets found, try to create one
                        sanitized_content = re.sub(r'\s+|-', '_', temp_label_stripped) # Replace space or hyphen
                        sanitized_content = re.sub(r'[^a-zA-Z0-9_]', '', sanitized_content).upper()
                        sanitized_content = "_".join(sanitized_content.split('_')[:4]) # Limit length
                        if sanitized_content:
                            label_str_processed = f"[{sanitized_content}]"
            item['label'] = label_str_processed # Use the processed label for the item
            # --- End Label Processing ---

            label_to_validate = item.get('label',"")
            desc_str = item.get('description',"")
            sf_list = item.get('surface_forms', [])

            has_all_keys = all(k in item for k in ["label", "description", "surface_forms"])
            label_is_valid_format = isinstance(label_to_validate, str) and \
                                    bool(label_to_validate) and \
                                    label_to_validate.startswith('[') and \
                                    label_to_validate.endswith(']') and \
                                    re.fullmatch(r"\[[A-Z0-9_]+\]", label_to_validate) # Final check on processed label
            desc_is_valid = isinstance(desc_str, str)
            sfs_list_is_valid = isinstance(sf_list, list) and \
                                 all(isinstance(sf_item, str) for sf_item in sf_list)

            if has_all_keys and label_is_valid_format and desc_is_valid and sfs_list_is_valid:
                valid_motifs_from_json.append({
                    "label": label_to_validate,
                    "description": desc_str.strip(),
                    "surface_forms": [s.strip() for s in sf_list if isinstance(s, str) and s.strip()]
                })
            else:
                print(f"    [WARN] Invalid motif object schema for QID {qid_for_log}, Chunk {chunk_idx_for_log}, Item {item_idx+1}. Skipping.")
                with open(LLM_DEBUG_LOG_FILE, "a", encoding="utf-8") as f:
                    f.write(f"\n--- QID: {qid_for_log} --- CHUNK: {chunk_idx_for_log} --- ITEM_SCHEMA_FAILURE (Item {item_idx+1}) ---\n")
                    f.write(f"Original Label: '{label_str_original}', Processed Label for Validation: '{label_to_validate}'\n")
                    f.write(f"Item Content After Label Proc: {json.dumps(item, indent=2)}\n")
                    f.write(f"Validation Checks: has_keys={has_all_keys}, label_valid_fmt={label_is_valid_format}, desc_valid={desc_is_valid}, sfs_list_valid={sfs_list_is_valid}\n")
        return valid_motifs_from_json
    except (json.JSONDecodeError, ValueError) as e:
        print(f"    [WARN] Motif JSON parsing/core structure issue for QID {qid_for_log}, Chunk {chunk_idx_for_log}: {e}")
        with open(LLM_DEBUG_LOG_FILE, "a", encoding="utf-8") as f:
            f.write(f"\n--- QID: {qid_for_log} --- CHUNK: {chunk_idx_for_log} --- JSON_PARSE_VALUE_ERROR ---\n")
            prompt_parts = prompt_sent_to_llm.split('Set of comments to analyze:')
            user_content_for_log = prompt_parts[1][:500] if len(prompt_parts) > 1 else (prompt_sent_to_llm[:500] if prompt_sent_to_llm else "PROMPT_EMPTY")
            f.write(f"PROMPT USER CONTENT (approx first 500 chars):\n{user_content_for_log}...\n")
            f.write(f"RAW LLM RESPONSE (Error: {type(e).__name__} - {str(e)}):\n{llm_raw_response_text}\n")
            f.write(f"EXTRACTED JSON CANDIDATE (Error: {type(e).__name__}):\n{json_str_candidate}\n")
        return []

def get_motifs_for_qid_batched(
    list_of_individual_response_texts: List[str],
    responses_per_batch: int,
    hf_pipeline_instance,
    hf_tokenizer_instance,
    qid_for_log: str
    ) -> List[Dict]:
    # (This function remains the same as your last correct version:
    #  It batches text, calls create_enhanced_motif_prompt, call_local_llm_for_raw_response,
    #  and parse_and_validate_llm_json_response with retries)
    all_raw_motifs_from_chunks = []
    batched_text_chunks_for_llm = []
    for i in range(0, len(list_of_individual_response_texts), responses_per_batch):
        batch_responses = list_of_individual_response_texts[i:i + responses_per_batch]
        chunk_text_for_llm = preprocess_corpus_for_motif_extraction("\n\n<RSP_SEP>\n\n".join(batch_responses)) # from Cell 2
        batched_text_chunks_for_llm.append(chunk_text_for_llm)
    print(f"  QID {qid_for_log}: Processing {len(list_of_individual_response_texts)} responses in {len(batched_text_chunks_for_llm)} preprocessed chunks (batch size: {responses_per_batch} responses).")
    for chunk_idx, text_chunk_to_analyze_processed in enumerate(batched_text_chunks_for_llm):
        print(f"    Analyzing chunk {chunk_idx + 1}/{len(batched_text_chunks_for_llm)} for QID {qid_for_log} (processed chunk len: {len(text_chunk_to_analyze_processed)} chars)...")
        if len(text_chunk_to_analyze_processed.strip()) < 50:
            print(f"      Chunk {chunk_idx+1} (QID {qid_for_log}) too short, skipping."); continue
        prompt_for_llm = create_enhanced_motif_prompt(text_chunk_to_analyze_processed) # from Cell 3
        motifs_from_this_chunk = []
        for attempt in range(LLM_RETRY_ATTEMPTS): # from config
            raw_llm_response = call_local_llm_for_raw_response( # from Cell 3
                prompt_for_llm, hf_pipeline_instance, hf_tokenizer_instance, qid_for_log, chunk_idx + 1
            )
            if not raw_llm_response:
                print(f"      LLM call attempt {attempt + 1} for chunk {chunk_idx+1} (QID {qid_for_log}) returned empty. Retrying if possible...");
                if attempt < LLM_RETRY_ATTEMPTS - 1: time.sleep(1); continue
            parsed_motifs_from_this_attempt = parse_and_validate_llm_json_response( # from this cell
                raw_llm_response, qid_for_log, chunk_idx+1, prompt_for_llm
            )
            if parsed_motifs_from_this_attempt:
                motifs_from_this_chunk = parsed_motifs_from_this_attempt; break
            else:
                print(f"      Motif parsing/validation attempt {attempt + 1} yielded no structured motifs for chunk {chunk_idx+1}. Retrying if possible...");
                if attempt < LLM_RETRY_ATTEMPTS - 1: time.sleep(1)
        if motifs_from_this_chunk:
            print(f"      Extracted {len(motifs_from_this_chunk)} structured motif objects from chunk {chunk_idx+1} (QID {qid_for_log}).")
            all_raw_motifs_from_chunks.extend(motifs_from_this_chunk)
        else:
            print(f"      No valid structured motifs from chunk {chunk_idx+1} (QID {qid_for_log}) after {LLM_RETRY_ATTEMPTS} attempts.")
    return all_raw_motifs_from_chunks


def consolidate_raw_motifs(list_of_all_raw_motifs: List[Dict]) -> List[Dict]:
    # (Same as your last correct version)
    if not list_of_all_raw_motifs: return []
    consolidated_motifs_map = {}
    for motif_obj in list_of_all_raw_motifs:
        label = motif_obj.get("label","").strip()
        description = motif_obj.get("description","").strip()
        surface_forms = motif_obj.get("surface_forms", [])
        if not (label and isinstance(surface_forms, list)): continue
        current_sfs_set = set(sf.lower().strip() for sf in surface_forms if isinstance(sf, str) and sf.strip())
        if label not in consolidated_motifs_map:
            consolidated_motifs_map[label] = {"label": label, "description": description, "surface_forms": sorted(list(current_sfs_set))}
        else:
            existing_sfs_set = set(consolidated_motifs_map[label].get("surface_forms", []))
            consolidated_motifs_map[label]["surface_forms"] = sorted(list(existing_sfs_set.union(current_sfs_set)))
    return list(consolidated_motifs_map.values())

def filter_surface_forms_by_global_frequency(
    consolidated_motifs_list: List[Dict],
    full_qid_corpus_text: str,
    min_global_freq: int = MIN_SF_FREQUENCY_IN_FULL_CORPUS # From config
    ) -> List[Dict]:
    # (Same as your last correct version - uses count_sf_occurrences from Cell 2)
    if not consolidated_motifs_list: return []
    final_globally_filtered_motifs = []
    for motif_obj in consolidated_motifs_list:
        globally_frequent_sfs_for_this_motif = []
        original_sfs_for_this_motif = motif_obj.get("surface_forms", [])
        for sf_str_lower in original_sfs_for_this_motif:
            count = count_sf_occurrences(full_qid_corpus_text, sf_str_lower) # from Cell 2
            if count >= min_global_freq:
                globally_frequent_sfs_for_this_motif.append(sf_str_lower)
        if globally_frequent_sfs_for_this_motif:
            filtered_motif_entry = motif_obj.copy()
            filtered_motif_entry["surface_forms"] = sorted(list(set(globally_frequent_sfs_for_this_motif)))
            final_globally_filtered_motifs.append(filtered_motif_entry)
    return final_globally_filtered_motifs

print("Cell 4: Motif Processing and Validation Utilities loaded (with enhanced label processing).")

In [None]:
# @title Cell 5: MDL Calculations

import hashlib
import numpy as np
from pybdm import BDM
import re
from typing import List, Dict

# Assume constants from Cell 1 are in global scope
# Assume tokenize_phrase from Cell 2 is in global scope
try:
    MATRIX_SIZE_GLOBAL; MAX_TEXT_FOR_BDM_HASH; MOTIF_SYMBOLIC_LABEL_COST
except NameError:
    print("WARN (Cell 5): Key config constants not found. Using fallbacks or expecting errors.")
    MATRIX_SIZE_GLOBAL = (8, 8); MAX_TEXT_FOR_BDM_HASH = 2000
    MOTIF_SYMBOLIC_LABEL_COST = 0.5; MOTIF_DESCRIPTION_TEXT_BASE_COST = 0.5
    MOTIF_DESCRIPTION_TOKEN_COST = 0.1; MOTIF_SURFACE_FORMS_LIST_BASE_COST = 0.0
    MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH = 0.0


def initialize_bdm_instance():
    print("Initializing BDM instance...")
    try:
        bdm_instance = BDM(ndim=2)
        print("BDM instance initialized successfully (ndim=2, default CTM-based).")
        return bdm_instance
    except Exception as e_bdm_init:
        print(f"CRITICAL: Failed to initialize BDM instance: {e_bdm_init}")
        if "CTM data files" in str(e_bdm_init).lower() or "dataset" in str(e_bdm_init).lower():
            print("  BDM Error Hint: This might be related to missing/corrupted CTM data files for PyBDM.")
        return None

def text_to_binary_matrix(text_input: str, size: tuple = MATRIX_SIZE_GLOBAL) -> np.ndarray:
    if not isinstance(text_input, str) or not text_input.strip():
        return np.zeros(size, dtype=int)
    hash_obj = hashlib.sha256(text_input.encode('utf-8', 'ignore'))
    hash_digest = hash_obj.hexdigest()
    required_bits = size[0] * size[1]
    binary_string_from_hash = bin(int(hash_digest, 16))[2:].zfill(256)
    binary_string_for_matrix = binary_string_from_hash[:required_bits] if required_bits <= 256 else binary_string_from_hash.ljust(required_bits, '0')
    bits_for_matrix = [int(b) for b in binary_string_for_matrix]
    return np.array(bits_for_matrix).reshape(size)

def compute_bdm_for_text(text_input: str, bdm_instance: BDM, matrix_s: tuple = MATRIX_SIZE_GLOBAL) -> float:
    if not isinstance(text_input, str) or not text_input.strip() : return 0.0
    text_for_hash = text_input[:MAX_TEXT_FOR_BDM_HASH] if len(text_input) > MAX_TEXT_FOR_BDM_HASH else text_input
    if not text_for_hash.strip(): return 0.0
    binary_matrix = text_to_binary_matrix(text_for_hash, size=matrix_s)
    try:
        return bdm_instance.bdm(binary_matrix)
    except Exception as e_bdm:
        print(f"      Error during BDM calculation for text (len {len(text_input)}, hashed part len {len(text_for_hash)}): {e_bdm}")
        return -1.0

def calculate_L_H_token_based_structured(structured_motifs_list: List[Dict]) -> float:
    # This function uses MOTIF_* constants from Cell 1.
    # The change to make SFs "free" in L(H) is done by setting those constants to 0 in Cell 1.
    if not structured_motifs_list: return 0.0
    total_lh_cost = 0.0
    for motif_obj in structured_motifs_list:
        if not isinstance(motif_obj, dict): continue
        current_motif_lh = 0.0
        label_str = motif_obj.get('label', "")
        if isinstance(label_str, str) and label_str.strip():
            current_motif_lh += MOTIF_SYMBOLIC_LABEL_COST
        description_str = motif_obj.get('description', "")
        if isinstance(description_str, str) and description_str.strip():
            current_motif_lh += MOTIF_DESCRIPTION_TEXT_BASE_COST
            current_motif_lh += len(tokenize_phrase(description_str)) * MOTIF_DESCRIPTION_TOKEN_COST # tokenize_phrase from Cell 2
        surface_forms_list = motif_obj.get('surface_forms', [])
        if isinstance(surface_forms_list, list) and surface_forms_list:
            valid_sfs_for_lh = [sf for sf in surface_forms_list if isinstance(sf, str) and sf.strip()]
            if valid_sfs_for_lh:
                current_motif_lh += MOTIF_SURFACE_FORMS_LIST_BASE_COST
                for sf_str in valid_sfs_for_lh:
                    current_motif_lh += len(tokenize_phrase(sf_str)) * MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH
        total_lh_cost += current_motif_lh
    return total_lh_cost

def llm_compress_text_structured(text_to_compress: str, structured_motifs_list: List[Dict]) -> str:
    if not isinstance(text_to_compress, str): return ""
    if not structured_motifs_list: return text_to_compress.lower()
    compressed_text = text_to_compress.lower()
    for motif_obj in structured_motifs_list:
        if not isinstance(motif_obj, dict): continue
        label = motif_obj.get('label', None)
        surface_forms = motif_obj.get('surface_forms', [])
        if not (isinstance(label, str) and label.strip()) or \
           not (isinstance(surface_forms, list) and surface_forms):
            continue
        placeholder = label
        sorted_sfs_for_this_motif = sorted(
            [sf for sf in surface_forms if isinstance(sf, str) and sf.strip()], key=len, reverse=True
        )
        for sf_str_lower in sorted_sfs_for_this_motif:
            try:
                compressed_text = re.sub(r'\b' + re.escape(sf_str_lower) + r'\b', placeholder, compressed_text)
            except re.error as re_e:
                print(f"    Regex error during compression for SF '{sf_str_lower}' of motif '{label}': {re_e}. Skipping.")
                continue
    return compressed_text

def compute_mdl_cost_for_text_block(
    full_qid_corpus_str: str,
    final_motifs_to_evaluate: List[Dict],
    bdm_instance: BDM,
    matrix_s: tuple = MATRIX_SIZE_GLOBAL
    ) -> tuple[float, float, float]:
    if not isinstance(full_qid_corpus_str, str): full_qid_corpus_str = ""
    l_h = calculate_L_H_token_based_structured(final_motifs_to_evaluate)
    compressed_text_block = llm_compress_text_structured(full_qid_corpus_str, final_motifs_to_evaluate)
    l_d_h = compute_bdm_for_text(compressed_text_block, bdm_instance, matrix_s)
    if l_d_h < 0: return l_h, -1.0, -1.0
    total_mdl_cost = l_h + l_d_h
    return l_h, l_d_h, total_mdl_cost

print("Cell 5: MDL Calculation Utilities loaded.")

In [None]:
# @title Cell 6: Main Pipeline Orchestration

# Assume functions from previous cells are defined

def main():
    script_version_name = "Refactored MWP v7 (Simpler Prompt, Enhanced Label Fix, L(H) SF Cost Zero)"
    print(f"--- {script_version_name} ---")
    print(f"Timestamp: {time.asctime()}")
    print("\n--- Configuration Summary ---")
    print(f"LLM Model: {LOCAL_LLM_MODEL_ID}, Quantization: {USE_QUANTIZATION_FOR_LOCAL_LLM}")
    print(f"LLM Batch Size (Responses): {LLM_BATCH_SIZE_RESPONSES}, Retries: {LLM_RETRY_ATTEMPTS}")
    print(f"Max Text Chars per LLM Prompt Chunk: {MAX_TEXT_CHARS_PER_LLM_PROMPT_CHUNK}")
    print(f"Max New Tokens for LLM Motif Extraction: {LLM_MAX_NEW_TOKENS_MOTIF_EXTRACTION}")
    print(f"Max Motifs to Request per Chunk: {MAX_MOTIFS_PER_CHUNK}")
    print(f"L(H) Costs: Label={MOTIF_SYMBOLIC_LABEL_COST}, DescBase={MOTIF_DESCRIPTION_TEXT_BASE_COST}, DescToken={MOTIF_DESCRIPTION_TOKEN_COST}, SFListBase={MOTIF_SURFACE_FORMS_LIST_BASE_COST}, SFTokenInLH={MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH}")
    print(f"Global SF Filtering Min Freq: {MIN_SF_FREQUENCY_IN_FULL_CORPUS}")
    print(f"BDM Hash Prefix Length: {MAX_TEXT_FOR_BDM_HASH}, BDM Matrix: {MATRIX_SIZE_GLOBAL}")
    print(f"Debug Log File: {LLM_DEBUG_LOG_FILE}")
    print("--- End Configuration Summary ---\n")

    try:
        with open(LLM_DEBUG_LOG_FILE, "w", encoding="utf-8") as f:
            f.write(f"LLM Motif Debug Log - Run Started: {time.asctime()}\n")
            f.write(f"Script Version: {script_version_name}\n")
            f.write(f"Model ID: {LOCAL_LLM_MODEL_ID}\n")
            f.write(f"Pipeline Config: return_full_text=False\n")
            f.write(f"L(H) SF Costs: Base={MOTIF_SURFACE_FORMS_LIST_BASE_COST}, Token={MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH}\n---\n")
    except Exception as e_log: print(f"WARN: Could not initialize debug log file {LLM_DEBUG_LOG_FILE}: {e_log}")

    hf_pipeline_instance, hf_tokenizer_instance = initialize_llm_pipeline()
    if not hf_pipeline_instance or not hf_tokenizer_instance: print("CRITICAL: Exiting: LLM init failure."); return

    bdm_instance_main = initialize_bdm_instance()
    if not bdm_instance_main: print("CRITICAL: Exiting: BDM init failure."); return

    if not os.path.exists(P2_COLLATED_FILE): print(f"ERROR: File {P2_COLLATED_FILE} not found."); return
    print(f"Loading data from: {P2_COLLATED_FILE}...")
    phase2_data_content = None
    try:
        with open(P2_COLLATED_FILE, 'r', encoding='utf-8') as f: phase2_data_content = json.load(f)
    except Exception as e_load: print(f"Error loading {P2_COLLATED_FILE}: {e_load}"); return

    all_qid_mdl_results_list = []
    aggregated_content_by_qid_from_file = phase2_data_content.get("aggregated_pdf_content_by_qid", {})
    if not aggregated_content_by_qid_from_file: print(f"No 'aggregated_pdf_content_by_qid' in {P2_COLLATED_FILE}."); return

    qids_to_process_this_run = []
    if P3_QIDS_TO_PROCESS_THEMATICALLY and isinstance(P3_QIDS_TO_PROCESS_THEMATICALLY, list) and P3_QIDS_TO_PROCESS_THEMATICALLY:
        qids_to_process_this_run = [qid for qid in P3_QIDS_TO_PROCESS_THEMATICALLY if qid in aggregated_content_by_qid_from_file]
        if not qids_to_process_this_run: print(f"Warning: QIDs {P3_QIDS_TO_PROCESS_THEMATICALLY} not found. Exiting."); return
    else:
        qids_to_process_limit_fallback = 1
        print(f"P3_QIDS_TO_PROCESS_THEMATICALLY not set/empty. Processing up to {qids_to_process_limit_fallback} QID(s) as fallback.")
        qids_to_process_this_run = list(aggregated_content_by_qid_from_file.keys())[:qids_to_process_limit_fallback]
        if not qids_to_process_this_run: print("No QIDs in data for fallback. Exiting."); return
    if not qids_to_process_this_run: print("No QIDs selected. Exiting."); return
    print(f"\nMDL analysis will run for QIDs: {qids_to_process_this_run}\n")

    for qid_identifier_str in qids_to_process_this_run:
        print(f"--- Analyzing Data for QID: {qid_identifier_str} ---")
        list_of_individual_response_structs = aggregated_content_by_qid_from_file.get(qid_identifier_str, [])
        actual_response_texts_for_qid = [item.get("text", "") for item in list_of_individual_response_structs if isinstance(item, dict) and isinstance(item.get("text"), str) and item.get("text","").strip()]
        if not actual_response_texts_for_qid: print(f"  No valid text for QID {qid_identifier_str}. Skipping."); print("-" * 50); continue
        full_corpus_text_for_qid = "\n\n<RSP_SEP>\n\n".join(actual_response_texts_for_qid)
        if len(full_corpus_text_for_qid.strip()) < 100: print(f"  Skipping QID {qid_identifier_str}: text too short."); print("-" * 50); continue
        num_total_responses_for_qid = len(actual_response_texts_for_qid)
        print(f"  Corpus for QID {qid_identifier_str}: {len(full_corpus_text_for_qid)} chars, {num_total_responses_for_qid} responses.")

        baseline_bdm_original_corpus = compute_bdm_for_text(full_corpus_text_for_qid, bdm_instance_main)
        if baseline_bdm_original_corpus < 0: print(f"  Error computing baseline BDM for QID {qid_identifier_str}. Skipping."); print("-" * 50); continue
        current_qid_baseline_mdl_cost = baseline_bdm_original_corpus
        print(f"  Baseline MDL for QID {qid_identifier_str} (L(D_orig)): {current_qid_baseline_mdl_cost:.4f}")

        raw_motifs_from_chunks = get_motifs_for_qid_batched(
            actual_response_texts_for_qid, LLM_BATCH_SIZE_RESPONSES,
            hf_pipeline_instance, hf_tokenizer_instance, qid_identifier_str
        )
        current_qid_result_entry = {
            "qid": qid_identifier_str, "corpus_len_chars": len(full_corpus_text_for_qid), "num_responses": num_total_responses_for_qid,
            "baseline_mdl": current_qid_baseline_mdl_cost, "final_refined_motifs": [], "l_h_final_motifs": 0.0,
            "l_d_h_final_motifs": current_qid_baseline_mdl_cost, "total_mdl_with_final_motifs": current_qid_baseline_mdl_cost,
            "compression_achieved": 0.0, "num_raw_motifs_extracted": len(raw_motifs_from_chunks),
            "num_consolidated_motifs": 0, "num_globally_refined_motifs": 0
        }
        if not raw_motifs_from_chunks: print(f"  No raw motifs by LLM for QID {qid_identifier_str}."); all_qid_mdl_results_list.append(current_qid_result_entry); print("-" * 50); continue
        print(f"  Extracted {len(raw_motifs_from_chunks)} raw motif objects for QID {qid_identifier_str}.")

        consolidated_motifs_list = consolidate_raw_motifs(raw_motifs_from_chunks)
        current_qid_result_entry["num_consolidated_motifs"] = len(consolidated_motifs_list)
        print(f"  Consolidated into {len(consolidated_motifs_list)} unique motifs for QID {qid_identifier_str}.")
        if not consolidated_motifs_list: print(f"  No unique motifs after consolidation for QID {qid_identifier_str}."); all_qid_mdl_results_list.append(current_qid_result_entry); print("-" * 50); continue

        # print(f"  Consolidated Motifs (BEFORE Global SF refinement):")
        # for idx, mo_con in enumerate(consolidated_motifs_list): print(f"    Cons. Motif {idx+1}: L='{mo_con.get('label')}', D='{mo_con.get('description','N/A')[:30]}...', SFs({len(mo_con.get('surface_forms',[]))})='{mo_con.get('surface_forms',[])[:2]}...'")

        globally_refined_motifs = filter_surface_forms_by_global_frequency(
            consolidated_motifs_list, full_corpus_text_for_qid, MIN_SF_FREQUENCY_IN_FULL_CORPUS
        )
        current_qid_result_entry["num_globally_refined_motifs"] = len(globally_refined_motifs)
        print(f"  Globally refined into {len(globally_refined_motifs)} motifs for QID {qid_identifier_str}.")
        if not globally_refined_motifs: print(f"  No motifs left after GLOBAL SF refinement for QID {qid_identifier_str}."); all_qid_mdl_results_list.append(current_qid_result_entry); print("-" * 50); continue

        print(f"  Final Globally Refined Motifs for QID {qid_identifier_str}:")
        for idx, mo_final in enumerate(globally_refined_motifs): print(f"    Refined Motif {idx+1}: L='{mo_final.get('label')}', D='{mo_final.get('description','N/A')[:60]}...', SFs({len(mo_final.get('surface_forms',[]))})='{mo_final.get('surface_forms',[])}'")

        l_h_final, l_d_h_final, total_mdl_final = compute_mdl_cost_for_text_block(
            full_corpus_text_for_qid, globally_refined_motifs, bdm_instance_main
        )
        current_qid_result_entry.update({
            "final_refined_motifs": globally_refined_motifs, "l_h_final_motifs": l_h_final,
            "l_d_h_final_motifs": l_d_h_final if l_d_h_final >=0 else "BDM_ERROR",
            "total_mdl_with_final_motifs": total_mdl_final if l_d_h_final >=0 else "BDM_ERROR",
            "compression_achieved": "BDM_ERROR" if l_d_h_final < 0 else (current_qid_baseline_mdl_cost - total_mdl_final)
        })
        if l_d_h_final < 0: print(f"  Error computing MDL cost (BDM error) for QID {qid_identifier_str}."); all_qid_mdl_results_list.append(current_qid_result_entry); print("-" * 50); continue

        print(f"  L(H) final motifs: {l_h_final:.4f} (SFs definition cost in L(H) is ZEROED for this run)")
        print(f"  L(D|H) compressed full corpus: {l_d_h_final:.4f}")
        print(f"  Total MDL cost with final motifs: {total_mdl_final:.4f}")
        compression_val = current_qid_result_entry["compression_achieved"]
        result_status_str = f"SUCCESS: Comp: {compression_val:.4f}" if isinstance(compression_val, float) and compression_val > 0.0001 else f"NOTE: No sig. comp. Diff: {compression_val if isinstance(compression_val, str) else compression_val:.4f}"
        print(f"  {result_status_str}"); all_qid_mdl_results_list.append(current_qid_result_entry); print("-" * 50)

    print("\n--- Overall QID-based MDL Analysis Summary ---")
    if not all_qid_mdl_results_list: print("No QIDs processed.")
    else:
        valid_results = [r for r in all_qid_mdl_results_list if isinstance(r.get('compression_achieved'), float) and r.get('l_h_final_motifs', -1.0) >= 0]
        num_qids_ok = len(valid_results); num_comp = sum(1 for r in valid_results if r['compression_achieved'] > 0.0001)
        print(f"Targeted QIDs: {len(qids_to_process_this_run)}, Results logged: {len(all_qid_mdl_results_list)}, Valid MDL: {num_qids_ok}, QIDs compressed: {num_comp}")
        if num_comp > 0:
            comp_vals = [r['compression_achieved'] for r in valid_results if r['compression_achieved'] > 0.0001]
            print(f"  Avg compression: {np.mean(comp_vals):.4f}, Max compression: {np.max(comp_vals):.4f}")
        else: print("  No compression achieved.")

        output_filename = os.path.join(BASE_PROJECT_DIR, "mdl_analysis_refactored_v7_LHSFzero_labelfix.json")
        try:
            with open(output_filename, "w", encoding="utf-8") as f_out: json.dump(all_qid_mdl_results_list, f_out, indent=2, ensure_ascii=False)
            print(f"Detailed results saved to {output_filename}")
        except Exception as e_s: print(f"Error saving results: {e_s}")

if __name__ == "__main__":
    print(f"Executing main MDL pipeline (Refactored MWP v7 - L(H) SF Cost Zero, Label Fix) at {time.asctime()}...")
    main()
    print(f"Main MDL pipeline execution finished at {time.asctime()}.")

# 2nd June

In [None]:
# @title Cell 1: Configuration

import os
from typing import List, Dict, Set # Moved typing here for early availability
from collections import Counter # Moved Counter here

# --- Project Paths ---
# !!! IMPORTANT: UPDATE BASE_PROJECT_DIR TO YOUR ACTUAL PATH !!!
BASE_PROJECT_DIR = '/content/drive/MyDrive/Colab Notebooks/Legal/'
# BASE_PROJECT_DIR = './' # For local testing if files are relative

PHASE2_OUTPUT_DIR = os.path.join(BASE_PROJECT_DIR, 'Phase2_PDF_Collated_Texts/')
P2_COLLATED_FILE = os.path.join(PHASE2_OUTPUT_DIR, 'phase2_collated_pdf_texts.json')

P3_QIDS_TO_PROCESS_THEMATICALLY = ["Q4"]

# --- BDM Configuration ---
MATRIX_SIZE_GLOBAL = (8, 8)
MAX_TEXT_FOR_BDM_HASH = 2000

# --- LLM Configuration ---
LOCAL_LLM_MODEL_ID = 'google/gemma-2b-it'
USE_QUANTIZATION_FOR_LOCAL_LLM = True
LLM_BATCH_SIZE_RESPONSES = 5
LLM_RETRY_ATTEMPTS = 2
MAX_TEXT_CHARS_PER_LLM_PROMPT_CHUNK = 7000
LLM_MAX_NEW_TOKENS_MOTIF_EXTRACTION = 700 # For simpler prompt
MAX_MOTIFS_PER_CHUNK = 5 # For simpler prompt

# --- Token-Based L(H) Configuration ---
MOTIF_SYMBOLIC_LABEL_COST = 0.5
MOTIF_DESCRIPTION_TEXT_BASE_COST = 0.5
MOTIF_DESCRIPTION_TOKEN_COST = 0.1
MOTIF_SURFACE_FORMS_LIST_BASE_COST = 0.0 # EXPERIMENT: Zero cost for SF list itself
MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH = 0.0 # EXPERIMENT: Zero cost for SF tokens in L(H)

# --- Surface Form Filtering Configuration ---
MIN_SF_FREQUENCY_IN_FULL_CORPUS = 2

# --- Logging File ---
LLM_DEBUG_LOG_FILE = os.path.join(BASE_PROJECT_DIR, "llm_motif_debug_log_refactored_v6_reset.txt")

print(f"Cell 1: Configuration loaded. LOCAL_LLM_MODEL_ID set to '{LOCAL_LLM_MODEL_ID}'.")
print(f"L(H) SF Costs: Base={MOTIF_SURFACE_FORMS_LIST_BASE_COST}, Token={MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH}")
print(f"Debug log will be: {LLM_DEBUG_LOG_FILE}")
if not os.path.exists(BASE_PROJECT_DIR):
    print(f"WARNING: BASE_PROJECT_DIR '{BASE_PROJECT_DIR}' does not exist.")
if P2_COLLATED_FILE and not os.path.exists(P2_COLLATED_FILE):
     print(f"WARNING: P2_COLLATED_FILE '{P2_COLLATED_FILE}' does not exist. Data loading may fail.")

In [None]:
# @title Cell 2: Text Utilities

import re
from typing import List, Dict # Redundant if Cell 1 has it, but good for cell independence
from collections import Counter # Redundant if Cell 1 has it

# Constants this cell might use if run independently (though ideally from Cell 1)
try:
    MIN_SF_FREQ_IN_CHUNK_VALIDATION # Check if defined from config
except NameError:
    MIN_SF_FREQ_IN_CHUNK_VALIDATION = 2 # Default if not from config (not used in current main flow)


def tokenize_phrase(phrase_text: str) -> List[str]:
    """Simple tokenizer for phrases, definitions, or surface forms."""
    if not isinstance(phrase_text, str) or not phrase_text.strip():
        return []
    return phrase_text.lower().split()

def preprocess_corpus_for_motif_extraction(text_corpus: str) -> str:
    """Preprocesses a text corpus before sending to LLM or for n-gram extraction."""
    if not isinstance(text_corpus, str):
        return ""
    text = re.sub(r'\n{3,}', '\n\n', text_corpus)
    text = re.sub(r' {2,}', ' ', text)
    lines = text.split('\n')
    filtered_lines = [line.strip() for line in lines if len(line.strip()) > 10 or not line.strip()]
    return '\n'.join(filtered_lines)

def count_sf_occurrences(corpus_text: str, surface_form: str) -> int:
    """Counts case-insensitive occurrences of a surface_form within the corpus_text."""
    if not corpus_text or not surface_form or \
       not isinstance(corpus_text, str) or not isinstance(surface_form, str) or \
       not surface_form.strip():
        return 0
    try:
        return len(re.findall(re.escape(surface_form.lower()), corpus_text.lower(), flags=re.IGNORECASE))
    except re.error as e:
        print(f"    [WARN] Regex error in count_sf_occurrences for SF '{surface_form}': {e}")
        return 0

# This function is defined here as a utility, but not actively called in the primary
# motif extraction flow of this "reverted simpler prompt" version.
def extract_actual_phrases_from_text(
    text: str,
    min_phrase_len: int = 2,
    max_phrase_len: int = 6,
    min_freq: int = MIN_SF_FREQ_IN_CHUNK_VALIDATION
    ) -> Dict[str, int]:
    """Extracts n-gram phrases and their frequencies from text."""
    if not isinstance(text, str) or not text.strip(): return {}
    text_cleaned = text.lower()
    text_cleaned = re.sub(r'[^\w\s\']', ' ', text_cleaned)
    text_cleaned = re.sub(r'\s+', ' ', text_cleaned).strip()
    words = text_cleaned.split()
    if not words or len(words) < min_phrase_len: return {}
    phrase_counts = Counter()
    for n in range(min_phrase_len, max_phrase_len + 1):
        if n > len(words): continue
        for i in range(len(words) - n + 1):
            phrase = ' '.join(words[i:i+n])
            if phrase: phrase_counts[phrase] += 1
    return {phrase: count for phrase, count in phrase_counts.items() if count >= min_freq}

print("Cell 2: Text Utilities loaded.")

In [None]:
# @title Cell 3: LLM Interaction

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import time
from typing import List, Dict
import os
import re # Needed for label sanitization if moved here, but parsing is in Cell 4

# Assume constants from Cell 1 are in global scope
try:
    LOCAL_LLM_MODEL_ID; USE_QUANTIZATION_FOR_LOCAL_LLM; MAX_MOTIFS_PER_CHUNK
    MAX_TEXT_CHARS_PER_LLM_PROMPT_CHUNK; LLM_MAX_NEW_TOKENS_MOTIF_EXTRACTION; LLM_DEBUG_LOG_FILE
except NameError:
    print("WARN (Cell 3): Key config constants not found from Cell 1. Using fallbacks.")
    LOCAL_LLM_MODEL_ID = 'google/gemma-2b-it'; USE_QUANTIZATION_FOR_LOCAL_LLM = True
    MAX_MOTIFS_PER_CHUNK = 5; MAX_TEXT_CHARS_PER_LLM_PROMPT_CHUNK = 7000
    LLM_MAX_NEW_TOKENS_MOTIF_EXTRACTION = 700; LLM_DEBUG_LOG_FILE = "temp_llm_debug_log_cell3.txt"


def initialize_llm_pipeline(
    model_id: str = LOCAL_LLM_MODEL_ID,
    use_quantization: bool = USE_QUANTIZATION_FOR_LOCAL_LLM,
    pipeline_return_full_text: bool = False
    ):
    """Initializes and returns the Hugging Face pipeline and tokenizer."""
    print(f"--- Initializing LLM Pipeline (model: {model_id}, quantization: {use_quantization}, return_full_text: {pipeline_return_full_text}) ---")
    hf_pipeline_instance = None; hf_tokenizer_instance = None
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu"); print(f"Using device: {device}")
    try:
        print(f"Loading tokenizer for {model_id}..."); hf_tokenizer_instance = AutoTokenizer.from_pretrained(model_id)
        if hf_tokenizer_instance.pad_token is None:
            if hf_tokenizer_instance.eos_token is not None: print("Tokenizer setting pad_token = eos_token."); hf_tokenizer_instance.pad_token = hf_tokenizer_instance.eos_token
            else: print("WARN (initialize_llm): Tokenizer has no pad_token and no eos_token.")
        bnb_config = None; quant_active = False
        if use_quantization and torch.cuda.is_available():
            try:
                compute_dtype = torch.bfloat16 if (device.type == 'cuda' and torch.cuda.is_bf16_supported()) else torch.float16
                bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=compute_dtype, bnb_4bit_use_double_quant=True)
                quant_active = True; print(f"BNB config created, compute_dtype: {compute_dtype}.")
            except Exception as e_bnb: print(f"WARN: Failed BitsAndBytesConfig: {e_bnb}. Quantization may be disabled."); quant_active = False
        print(f"Loading model {model_id} (Quantization: {quant_active})..."); model_kwargs = {"device_map": "auto", "trust_remote_code": True}
        if quant_active and bnb_config: model_kwargs["quantization_config"] = bnb_config
        elif device.type == 'cuda': model_kwargs["torch_dtype"] = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
        hf_model_instance = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)
        if hf_tokenizer_instance.pad_token_id is not None:
            if hf_model_instance.config.pad_token_id is None or hf_model_instance.config.pad_token_id != hf_tokenizer_instance.pad_token_id:
                hf_model_instance.config.pad_token_id = hf_tokenizer_instance.pad_token_id
        hf_pipeline_instance = pipeline("text-generation", model=hf_model_instance, tokenizer=hf_tokenizer_instance, return_full_text=pipeline_return_full_text)
        print(f"LLM pipeline initialized for {model_id}."); return hf_pipeline_instance, hf_tokenizer_instance
    except Exception as e: print(f"CRITICAL: LLM pipeline init failed: {e}"); import traceback; traceback.print_exc(); return None, None

def build_llm_prompt_for_motifs(text_block_for_prompt: str, max_motifs_to_extract: int = MAX_MOTIFS_PER_CHUNK) -> str:
    """Uses the simpler prompt structure for better label format adherence."""
    if len(text_block_for_prompt) > MAX_TEXT_CHARS_PER_LLM_PROMPT_CHUNK:
        text_block_for_prompt = text_block_for_prompt[:MAX_TEXT_CHARS_PER_LLM_PROMPT_CHUNK]

    prompt = f"""You will receive a set of comments from different people answering the same question.

Your task is to identify up to {max_motifs_to_extract} key recurring themes.

For each theme, provide:
- A short label like [DATA_PRIVACY]
- A 1-sentence description of the theme
- 2–3 short phrases that often appear in the text (surface forms)

Output MUST be a valid JSON list of objects, where each object has "label", "description", and "surface_forms" keys.
Example of one object in the list:
{{
  "label": "[EXAMPLE_LABEL]",
  "description": "A concise description of the example theme.",
  "surface_forms": ["short repeated phrase 1", "another short repeated phrase"]
}}
If no clear motifs are found, output an empty JSON list: `[]`.
Do not include any other text, explanations, or markdown code fences around the JSON.

Set of comments to analyze:
\"\"\"
{text_block_for_prompt}
\"\"\"

Valid JSON Output (ensure it's a list of objects, or an empty list [] if no themes):
"""
    return prompt.strip()

def call_local_llm_for_raw_response(
    prompt_content_for_user_turn: str,
    hf_pipeline_instance,
    hf_tokenizer_instance,
    qid_for_log: str,
    chunk_idx_for_log: int
    ) -> str:
    """Makes the LLM call and returns raw text string. Assumes pipeline return_full_text=False."""
    if not hf_pipeline_instance or not hf_tokenizer_instance:
        print(f"    ERROR (call_local_llm): LLM pipeline/tokenizer not initialized for QID {qid_for_log}, Chunk {chunk_idx_for_log}.")
        return ""
    messages_for_template = [{"role": "user", "content": prompt_content_for_user_turn}]
    try:
        prompt_formatted_for_llm = hf_tokenizer_instance.apply_chat_template(
            messages_for_template, tokenize=False, add_generation_prompt=True
        )
    except Exception as e_template:
        print(f"    ERROR (call_local_llm): Applying chat template for QID {qid_for_log}, Chunk {chunk_idx_for_log}: {e_template}")
        with open(LLM_DEBUG_LOG_FILE, "a", encoding="utf-8") as f:
            f.write(f"\n--- QID: {qid_for_log} --- CHUNK: {chunk_idx_for_log} (call_local_llm) ---\nERROR APPLYING CHAT TEMPLATE: {e_template}\nPrompt content (first 300): {prompt_content_for_user_turn[:300]}...\n")
        return ""

    generation_args = {
        "max_new_tokens": LLM_MAX_NEW_TOKENS_MOTIF_EXTRACTION,
        "do_sample": False,
        "pad_token_id": hf_tokenizer_instance.pad_token_id
    }
    try:
        outputs = hf_pipeline_instance(prompt_formatted_for_llm, **generation_args)
        if outputs and isinstance(outputs, list) and len(outputs) > 0 and \
           outputs[0] and isinstance(outputs[0], dict) and 'generated_text' in outputs[0]:
            return outputs[0]['generated_text'].strip()
        else:
            print(f"    WARN (call_local_llm): LLM pipeline unexpected structure for QID {qid_for_log}, Chunk {chunk_idx_for_log}. Output: {outputs}")
            return ""
    except Exception as e_pipeline:
        print(f"    ERROR (call_local_llm): Exception during hf_pipeline call for QID {qid_for_log}, Chunk {chunk_idx_for_log}: {e_pipeline}")
        with open(LLM_DEBUG_LOG_FILE, "a", encoding="utf-8") as f:
            f.write(f"\n--- QID: {qid_for_log} --- CHUNK: {chunk_idx_for_log} (call_local_llm) ---\nERROR PIPELINE CALL: {e_pipeline}\nFormatted prompt (first 300): {prompt_formatted_for_llm[:300]}...\n")
        return ""

print("Cell 3: LLM Interaction Utilities loaded (using simpler prompt).")

In [None]:
# @title Cell 4: Motif Processing and Validation

import json
import re
import time
from typing import List, Dict
# from collections import Counter # Not directly used here, but in Cell 2

# Assume constants from Cell 1 are in global scope
# Assume text_utils functions (preprocess_corpus_for_motif_extraction, count_sf_occurrences) from Cell 2 are in scope
# Assume LLM interaction functions (build_llm_prompt_for_motifs, call_local_llm_for_raw_response) from Cell 3 are in scope

try:
    LLM_DEBUG_LOG_FILE; LLM_RETRY_ATTEMPTS; MIN_SF_FREQUENCY_IN_FULL_CORPUS # Check some constants
except NameError:
    print("WARN (Cell 4): Key config constants not found. Using fallbacks.")
    LLM_DEBUG_LOG_FILE = "temp_llm_debug_log_cell4.txt"; LLM_RETRY_ATTEMPTS = 2; MIN_SF_FREQUENCY_IN_FULL_CORPUS = 2

def parse_and_validate_llm_json_response(
    llm_raw_response_text: str,
    qid_for_log:str,
    chunk_idx_for_log:int,
    prompt_sent_to_llm:str
    ) -> List[Dict]:
    """Parses LLM JSON, attempts label fixing, validates schema."""
    json_str_candidate = llm_raw_response_text.strip()
    if json_str_candidate.startswith("```json"): json_str_candidate = json_str_candidate[len("```json"):].strip()
    if json_str_candidate.startswith("```"): json_str_candidate = json_str_candidate[len("```"):].strip()
    if json_str_candidate.endswith("```"): json_str_candidate = json_str_candidate[:-len("```")].strip()

    if not json_str_candidate or json_str_candidate.lower() == "[]" or \
       "no_themes_found" in json_str_candidate.lower() or \
       "no clear motifs" in json_str_candidate.lower():
        return []

    try:
        parsed_data = json.loads(json_str_candidate)
        if isinstance(parsed_data, dict): parsed_data = [parsed_data]
        if not isinstance(parsed_data, list):
            raise ValueError("Parsed JSON is not a list or single object.")

        valid_motifs_from_json = []
        for item_idx, item in enumerate(parsed_data):
            if not isinstance(item, dict) or not item: continue

            label_str_original = item.get('label', "")
            label_str_processed = ""
            if isinstance(label_str_original, str) and label_str_original.strip():
                temp_label = label_str_original.strip()
                match = re.search(r"(\[[A-Z0-9_]+\])", temp_label)
                if match and match.group(1) == temp_label: label_str_processed = temp_label
                elif match: label_str_processed = match.group(1)
                elif not (temp_label.startswith('[') and temp_label.endswith(']')):
                    sanitized_content = re.sub(r'\s+', '_', temp_label)
                    sanitized_content = re.sub(r'[^a-zA-Z0-9_]', '', sanitized_content).upper()
                    sanitized_content = "_".join(sanitized_content.split('_')[:3])
                    if sanitized_content: label_str_processed = f"[{sanitized_content}]"

            # Update item with potentially fixed label for consistent validation and use
            # If label_str_processed is empty (e.g. original was empty or sanitization failed),
            # it will fail the bool(label_str_for_validation) check later.
            item['label'] = label_str_processed

            label_to_validate = item.get('label',"") # Already processed
            desc_str = item.get('description',"")
            sf_list = item.get('surface_forms', [])

            has_all_keys = all(k in item for k in ["label", "description", "surface_forms"])
            label_is_valid = isinstance(label_to_validate, str) and bool(label_to_validate) and \
                             label_to_validate.startswith('[') and label_to_validate.endswith(']') and \
                             re.fullmatch(r"\[[A-Z0-9_]+\]", label_to_validate)
            desc_is_valid = isinstance(desc_str, str)
            sfs_list_is_valid = isinstance(sf_list, list) and \
                                all(isinstance(sf_item, str) for sf_item in sf_list)

            if has_all_keys and label_is_valid and desc_is_valid and sfs_list_is_valid:
                valid_motifs_from_json.append({
                    "label": label_to_validate,
                    "description": desc_str.strip(),
                    "surface_forms": [s.strip() for s in sf_list if isinstance(s, str) and s.strip()]
                })
            else:
                print(f"    [WARN] Invalid motif object schema for QID {qid_for_log}, Chunk {chunk_idx_for_log}, Item {item_idx+1}. Skipping.")
                with open(LLM_DEBUG_LOG_FILE, "a", encoding="utf-8") as f:
                    f.write(f"\n--- QID: {qid_for_log} --- CHUNK: {chunk_idx_for_log} --- ITEM_SCHEMA_FAILURE (Item {item_idx+1}) ---\n")
                    f.write(f"Original Label: '{label_str_original}', Processed Label for Validation: '{label_to_validate}'\n")
                    f.write(f"Item Content: {json.dumps(item, indent=2)}\n")
                    f.write(f"Validation: keys={has_all_keys}, label={label_is_valid}, desc={desc_is_valid}, sfs_list={sfs_list_is_valid}\n")
        return valid_motifs_from_json
    except (json.JSONDecodeError, ValueError) as e:
        print(f"    [WARN] Motif JSON parsing/core structure issue for QID {qid_for_log}, Chunk {chunk_idx_for_log}: {e}")
        with open(LLM_DEBUG_LOG_FILE, "a", encoding="utf-8") as f:
            f.write(f"\n--- QID: {qid_for_log} --- CHUNK: {chunk_idx_for_log} --- JSON_PARSE_VALUE_ERROR ---\n")
            prompt_parts = prompt_sent_to_llm.split('Set of comments to analyze:')
            user_content_for_log = prompt_parts[1][:500] if len(prompt_parts) > 1 else (prompt_sent_to_llm[:500] if prompt_sent_to_llm else "PROMPT_EMPTY")
            f.write(f"PROMPT USER CONTENT (approx first 500 chars):\n{user_content_for_log}...\n")
            f.write(f"RAW LLM RESPONSE (Error: {type(e).__name__}):\n{llm_raw_response_text}\n")
            f.write(f"EXTRACTED JSON CANDIDATE (Error: {type(e).__name__}):\n{json_str_candidate}\n")
        return []

def get_motifs_for_qid_batched(
    list_of_individual_response_texts: List[str],
    responses_per_batch: int,
    hf_pipeline_instance,
    hf_tokenizer_instance,
    qid_for_log: str
    ) -> List[Dict]:
    all_raw_motifs_from_chunks = []
    batched_text_chunks_for_llm = []
    for i in range(0, len(list_of_individual_response_texts), responses_per_batch):
        batch_responses = list_of_individual_response_texts[i:i + responses_per_batch]
        chunk_text_for_llm = preprocess_corpus_for_motif_extraction("\n\n<RSP_SEP>\n\n".join(batch_responses))
        batched_text_chunks_for_llm.append(chunk_text_for_llm)

    print(f"  QID {qid_for_log}: Processing {len(list_of_individual_response_texts)} responses in {len(batched_text_chunks_for_llm)} preprocessed chunks (batch size: {responses_per_batch} responses).")

    for chunk_idx, text_chunk_to_analyze_processed in enumerate(batched_text_chunks_for_llm):
        print(f"    Analyzing chunk {chunk_idx + 1}/{len(batched_text_chunks_for_llm)} for QID {qid_for_log} (processed chunk len: {len(text_chunk_to_analyze_processed)} chars)...")
        if len(text_chunk_to_analyze_processed.strip()) < 50:
            print(f"      Chunk {chunk_idx+1} (QID {qid_for_log}) too short after preprocessing, skipping.")
            continue

        # build_llm_prompt_for_motifs is from Cell 3
        prompt_for_llm = build_llm_prompt_for_motifs(text_chunk_to_analyze_processed)

        motifs_from_this_chunk = []
        for attempt in range(LLM_RETRY_ATTEMPTS):
            # call_local_llm_for_raw_response is from Cell 3
            raw_llm_response = call_local_llm_for_raw_response(
                prompt_for_llm, hf_pipeline_instance, hf_tokenizer_instance, qid_for_log, chunk_idx + 1
            )
            if not raw_llm_response:
                print(f"      LLM call attempt {attempt + 1} for chunk {chunk_idx+1} (QID {qid_for_log}) returned empty. Retrying if possible...")
                if attempt < LLM_RETRY_ATTEMPTS - 1: time.sleep(1)
                continue

            # parse_and_validate_llm_json_response is defined in this cell
            parsed_and_validated_motifs = parse_and_validate_llm_json_response(
                raw_llm_response, qid_for_log, chunk_idx+1, prompt_for_llm
            )
            if parsed_and_validated_motifs:
                motifs_from_this_chunk = parsed_and_validated_motifs
                break
            else:
                print(f"      Motif parsing/validation attempt {attempt + 1} yielded no structured motifs for chunk {chunk_idx+1}. Retrying if possible...")
                if attempt < LLM_RETRY_ATTEMPTS - 1: time.sleep(1)

        if motifs_from_this_chunk:
            print(f"      Extracted {len(motifs_from_this_chunk)} structured motif objects from chunk {chunk_idx+1} (QID {qid_for_log}).")
            all_raw_motifs_from_chunks.extend(motifs_from_this_chunk)
        else:
            print(f"      No valid structured motifs extracted from chunk {chunk_idx+1} (QID {qid_for_log}) after {LLM_RETRY_ATTEMPTS} attempts.")

    return all_raw_motifs_from_chunks

def consolidate_raw_motifs(list_of_all_raw_motifs: List[Dict]) -> List[Dict]:
    """Consolidates motifs by label, merging surface forms (lowercased, unique, sorted)."""
    if not list_of_all_raw_motifs: return []
    consolidated_motifs_map = {}
    for motif_obj in list_of_all_raw_motifs:
        label = motif_obj.get("label","").strip() # Assumes label is already correctly formatted
        description = motif_obj.get("description","").strip()
        surface_forms = motif_obj.get("surface_forms", [])
        if not (label and isinstance(surface_forms, list)): continue

        current_sfs_set = set(sf.lower().strip() for sf in surface_forms if isinstance(sf, str) and sf.strip())
        if label not in consolidated_motifs_map:
            consolidated_motifs_map[label] = {
                "label": label,
                "description": description,
                "surface_forms": sorted(list(current_sfs_set))
            }
        else:
            existing_sfs_set = set(consolidated_motifs_map[label].get("surface_forms", []))
            consolidated_motifs_map[label]["surface_forms"] = sorted(list(existing_sfs_set.union(current_sfs_set)))
    return list(consolidated_motifs_map.values())

def filter_surface_forms_by_global_frequency(
    consolidated_motifs_list: List[Dict],
    full_qid_corpus_text: str,
    min_global_freq: int = MIN_SF_FREQUENCY_IN_FULL_CORPUS
    ) -> List[Dict]:
    """Filters SFs in consolidated motifs based on frequency in the full QID corpus."""
    if not consolidated_motifs_list: return []
    final_globally_filtered_motifs = []
    for motif_obj in consolidated_motifs_list:
        globally_frequent_sfs_for_this_motif = []
        # Surface forms from consolidation are already lowercased
        original_sfs_for_this_motif = motif_obj.get("surface_forms", [])
        for sf_str_lower in original_sfs_for_this_motif:
            # count_sf_occurrences (Cell 2) also lowercases corpus and sf for matching
            count = count_sf_occurrences(full_qid_corpus_text, sf_str_lower)
            if count >= min_global_freq:
                globally_frequent_sfs_for_this_motif.append(sf_str_lower)
        if globally_frequent_sfs_for_this_motif:
            filtered_motif_entry = motif_obj.copy()
            # SFs are already unique and lowercased, just sort
            filtered_motif_entry["surface_forms"] = sorted(globally_frequent_sfs_for_this_motif)
            final_globally_filtered_motifs.append(filtered_motif_entry)
    return final_globally_filtered_motifs

print("Cell 4: Motif Processing and Validation Utilities loaded (with label fixing).")

In [None]:
# @title Cell 5: MDL Calculations

import hashlib
import numpy as np
from pybdm import BDM
import re
from typing import List, Dict

# Assume constants from Cell 1 are in global scope
# Assume tokenize_phrase from Cell 2 is in global scope
try:
    MATRIX_SIZE_GLOBAL; MAX_TEXT_FOR_BDM_HASH; MOTIF_SYMBOLIC_LABEL_COST
except NameError:
    print("WARN (Cell 5): Key config constants not found. Using fallbacks or expecting errors.")
    MATRIX_SIZE_GLOBAL = (8, 8); MAX_TEXT_FOR_BDM_HASH = 2000
    MOTIF_SYMBOLIC_LABEL_COST = 0.5; MOTIF_DESCRIPTION_TEXT_BASE_COST = 0.5
    MOTIF_DESCRIPTION_TOKEN_COST = 0.1; MOTIF_SURFACE_FORMS_LIST_BASE_COST = 0.0 # Matching Cell 1 for this experiment
    MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH = 0.0 # Matching Cell 1 for this experiment


def initialize_bdm_instance():
    """Initializes and returns a BDM instance."""
    print("Initializing BDM instance...")
    try:
        bdm_instance = BDM(ndim=2)
        print("BDM instance initialized successfully (ndim=2, default CTM-based).")
        return bdm_instance
    except Exception as e_bdm_init:
        print(f"CRITICAL: Failed to initialize BDM instance: {e_bdm_init}")
        if "CTM data files" in str(e_bdm_init).lower() or "dataset" in str(e_bdm_init).lower():
            print("  BDM Error Hint: This might be related to missing/corrupted CTM data files for PyBDM.")
            print("  Ensure PyBDM is installed correctly and can access/download its data.")
            print("  You might need to run once: from pybdm import get_ctm_dataset; get_ctm_dataset()")
        return None

def text_to_binary_matrix(text_input: str, size: tuple = MATRIX_SIZE_GLOBAL) -> np.ndarray:
    """Converts a text string to a binary matrix using its SHA256 hash."""
    if not isinstance(text_input, str) or not text_input.strip():
        return np.zeros(size, dtype=int)
    hash_obj = hashlib.sha256(text_input.encode('utf-8', 'ignore'))
    hash_digest = hash_obj.hexdigest()
    required_bits = size[0] * size[1]
    binary_string_from_hash = bin(int(hash_digest, 16))[2:].zfill(256)
    binary_string_for_matrix = binary_string_from_hash[:required_bits] if required_bits <= 256 else binary_string_from_hash.ljust(required_bits, '0')
    bits_for_matrix = [int(b) for b in binary_string_for_matrix]
    return np.array(bits_for_matrix).reshape(size)

def compute_bdm_for_text(text_input: str, bdm_instance: BDM, matrix_s: tuple = MATRIX_SIZE_GLOBAL) -> float:
    """Computes BDM for a given text string using a prefix for hashing."""
    if not isinstance(text_input, str) or not text_input.strip() :
        return 0.0
    text_for_hash = text_input[:MAX_TEXT_FOR_BDM_HASH] if len(text_input) > MAX_TEXT_FOR_BDM_HASH else text_input
    if not text_for_hash.strip():
        return 0.0
    binary_matrix = text_to_binary_matrix(text_for_hash, size=matrix_s)
    try:
        bdm_value = bdm_instance.bdm(binary_matrix)
        return bdm_value
    except Exception as e_bdm:
        print(f"      Error during BDM calculation for text (len {len(text_input)}, hashed part len {len(text_for_hash)}): {e_bdm}")
        return -1.0

def calculate_L_H_token_based_structured(structured_motifs_list: List[Dict]) -> float:
    """Calculates L(H) - the cost of defining the list of structured motifs."""
    # This function uses MOTIF_* constants from Cell 1.
    # The change to make SFs "free" in L(H) is done by setting those constants to 0 in Cell 1.
    if not structured_motifs_list: return 0.0
    total_lh_cost = 0.0
    for motif_obj in structured_motifs_list:
        if not isinstance(motif_obj, dict): continue
        current_motif_lh = 0.0
        label_str = motif_obj.get('label', "")
        if isinstance(label_str, str) and label_str.strip():
            current_motif_lh += MOTIF_SYMBOLIC_LABEL_COST
        description_str = motif_obj.get('description', "")
        if isinstance(description_str, str) and description_str.strip():
            current_motif_lh += MOTIF_DESCRIPTION_TEXT_BASE_COST
            current_motif_lh += len(tokenize_phrase(description_str)) * MOTIF_DESCRIPTION_TOKEN_COST # tokenize_phrase from Cell 2
        surface_forms_list = motif_obj.get('surface_forms', [])
        if isinstance(surface_forms_list, list) and surface_forms_list:
            valid_sfs_for_lh = [sf for sf in surface_forms_list if isinstance(sf, str) and sf.strip()]
            if valid_sfs_for_lh:
                current_motif_lh += MOTIF_SURFACE_FORMS_LIST_BASE_COST # Will be 0 if constant is 0
                for sf_str in valid_sfs_for_lh:
                    current_motif_lh += len(tokenize_phrase(sf_str)) * MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH # Will be 0 if constant is 0
        total_lh_cost += current_motif_lh
    return total_lh_cost

def llm_compress_text_structured(text_to_compress: str, structured_motifs_list: List[Dict]) -> str:
    """Compresses text by replacing occurrences of motif surface forms with their symbolic labels."""
    if not isinstance(text_to_compress, str): return ""
    if not structured_motifs_list: return text_to_compress.lower()
    compressed_text = text_to_compress.lower()
    for motif_obj in structured_motifs_list:
        if not isinstance(motif_obj, dict): continue
        label = motif_obj.get('label', None)
        surface_forms = motif_obj.get('surface_forms', [])
        if not (isinstance(label, str) and label.strip()) or \
           not (isinstance(surface_forms, list) and surface_forms):
            continue
        placeholder = label
        sorted_sfs_for_this_motif = sorted(
            [sf for sf in surface_forms if isinstance(sf, str) and sf.strip()], key=len, reverse=True
        )
        for sf_str_lower in sorted_sfs_for_this_motif: # Assumes SFs are already lowercased from consolidation/filtering
            try:
                compressed_text = re.sub(r'\b' + re.escape(sf_str_lower) + r'\b', placeholder, compressed_text)
            except re.error as re_e:
                print(f"    Regex error during compression for SF '{sf_str_lower}' of motif '{label}': {re_e}. Skipping.")
                continue
    return compressed_text

def compute_mdl_cost_for_text_block(
    full_qid_corpus_str: str,
    final_motifs_to_evaluate: List[Dict],
    bdm_instance: BDM,
    matrix_s: tuple = MATRIX_SIZE_GLOBAL
    ) -> tuple[float, float, float]:
    """Computes L(H), L(D|H), and Total MDL for a text block given a final set of motifs."""
    if not isinstance(full_qid_corpus_str, str): full_qid_corpus_str = ""
    l_h = calculate_L_H_token_based_structured(final_motifs_to_evaluate)
    compressed_text_block = llm_compress_text_structured(full_qid_corpus_str, final_motifs_to_evaluate)
    l_d_h = compute_bdm_for_text(compressed_text_block, bdm_instance, matrix_s)
    if l_d_h < 0: return l_h, -1.0, -1.0
    total_mdl_cost = l_h + l_d_h
    return l_h, l_d_h, total_mdl_cost

print("Cell 5: MDL Calculation Utilities loaded.")

In [None]:
# @title Cell 6: Main Pipeline Orchestration

# Assume functions from previous cells are defined

def main():
    script_version_name = "Refactored MWP v6 (Simpler Prompt, Label Fix, L(H) SF Cost Zero)"
    print(f"--- {script_version_name} ---")
    print(f"Timestamp: {time.asctime()}")
    print("\n--- Configuration Summary ---")
    print(f"LLM Model: {LOCAL_LLM_MODEL_ID}, Quantization: {USE_QUANTIZATION_FOR_LOCAL_LLM}")
    print(f"LLM Batch Size (Responses): {LLM_BATCH_SIZE_RESPONSES}, Retries: {LLM_RETRY_ATTEMPTS}")
    print(f"Max Text Chars per LLM Prompt Chunk: {MAX_TEXT_CHARS_PER_LLM_PROMPT_CHUNK}")
    print(f"Max New Tokens for LLM Motif Extraction: {LLM_MAX_NEW_TOKENS_MOTIF_EXTRACTION}")
    print(f"Max Motifs to Request per Chunk: {MAX_MOTIFS_PER_CHUNK}")
    print(f"L(H) Costs: Label={MOTIF_SYMBOLIC_LABEL_COST}, DescBase={MOTIF_DESCRIPTION_TEXT_BASE_COST}, DescToken={MOTIF_DESCRIPTION_TOKEN_COST}, SFListBase={MOTIF_SURFACE_FORMS_LIST_BASE_COST}, SFTokenInLH={MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH}")
    print(f"Global SF Filtering Min Freq: {MIN_SF_FREQUENCY_IN_FULL_CORPUS}")
    print(f"BDM Hash Prefix Length: {MAX_TEXT_FOR_BDM_HASH}, BDM Matrix: {MATRIX_SIZE_GLOBAL}")
    print(f"Debug Log File: {LLM_DEBUG_LOG_FILE}")
    print("--- End Configuration Summary ---\n")

    try:
        with open(LLM_DEBUG_LOG_FILE, "w", encoding="utf-8") as f:
            f.write(f"LLM Motif Debug Log - Run Started: {time.asctime()}\n")
            f.write(f"Script Version: {script_version_name}\n")
            f.write(f"Model ID: {LOCAL_LLM_MODEL_ID}\n")
            f.write(f"Pipeline Config: return_full_text=False\n") # From initialize_llm_pipeline default
            f.write(f"L(H) SF Costs: Base={MOTIF_SURFACE_FORMS_LIST_BASE_COST}, Token={MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH}\n---\n")
    except Exception as e_log: print(f"WARN: Could not initialize debug log file {LLM_DEBUG_LOG_FILE}: {e_log}")

    hf_pipeline_instance, hf_tokenizer_instance = initialize_llm_pipeline() # From Cell 3
    if not hf_pipeline_instance or not hf_tokenizer_instance: print("CRITICAL: Exiting due to LLM pipeline init failure."); return

    bdm_instance_main = initialize_bdm_instance() # From Cell 5
    if not bdm_instance_main: print("CRITICAL: Exiting due to BDM init failure."); return

    if not os.path.exists(P2_COLLATED_FILE): print(f"ERROR: Phase 2 file {P2_COLLATED_FILE} not found."); return
    print(f"Loading Phase 2 data from: {P2_COLLATED_FILE}...")
    phase2_data_content = None
    try:
        with open(P2_COLLATED_FILE, 'r', encoding='utf-8') as f: phase2_data_content = json.load(f)
    except Exception as e_load: print(f"Error loading {P2_COLLATED_FILE}: {e_load}"); return

    all_qid_mdl_results_list = []
    aggregated_content_by_qid_from_file = phase2_data_content.get("aggregated_pdf_content_by_qid", {})
    if not aggregated_content_by_qid_from_file: print(f"No 'aggregated_pdf_content_by_qid' in {P2_COLLATED_FILE}."); return

    qids_to_process_this_run = [] # From Cell 1
    if P3_QIDS_TO_PROCESS_THEMATICALLY and isinstance(P3_QIDS_TO_PROCESS_THEMATICALLY, list) and P3_QIDS_TO_PROCESS_THEMATICALLY:
        qids_to_process_this_run = [qid for qid in P3_QIDS_TO_PROCESS_THEMATICALLY if qid in aggregated_content_by_qid_from_file]
        if not qids_to_process_this_run: print(f"Warning: QIDs {P3_QIDS_TO_PROCESS_THEMATICALLY} not found. Exiting."); return
    else:
        qids_to_process_limit_fallback = 1
        print(f"P3_QIDS_TO_PROCESS_THEMATICALLY not set/empty. Processing up to {qids_to_process_limit_fallback} QID(s) as fallback.")
        qids_to_process_this_run = list(aggregated_content_by_qid_from_file.keys())[:qids_to_process_limit_fallback]
        if not qids_to_process_this_run: print("No QIDs in data for fallback. Exiting."); return
    if not qids_to_process_this_run: print("No QIDs selected. Exiting."); return
    print(f"\nMDL analysis will run for QIDs: {qids_to_process_this_run}\n")

    for qid_identifier_str in qids_to_process_this_run:
        print(f"--- Analyzing Data for QID: {qid_identifier_str} ---")
        list_of_individual_response_structs = aggregated_content_by_qid_from_file.get(qid_identifier_str, [])
        actual_response_texts_for_qid = [item.get("text", "") for item in list_of_individual_response_structs if isinstance(item, dict) and isinstance(item.get("text"), str) and item.get("text","").strip()]
        if not actual_response_texts_for_qid: print(f"  No valid text for QID {qid_identifier_str}. Skipping."); print("-" * 50); continue
        full_corpus_text_for_qid = "\n\n<RSP_SEP>\n\n".join(actual_response_texts_for_qid)
        if len(full_corpus_text_for_qid.strip()) < 100: print(f"  Skipping QID {qid_identifier_str}: text too short."); print("-" * 50); continue
        num_total_responses_for_qid = len(actual_response_texts_for_qid)
        print(f"  Corpus for QID {qid_identifier_str}: {len(full_corpus_text_for_qid)} chars, {num_total_responses_for_qid} responses.")

        baseline_bdm_original_corpus = compute_bdm_for_text(full_corpus_text_for_qid, bdm_instance_main) # Removed matrix_s, uses default
        if baseline_bdm_original_corpus < 0: print(f"  Error computing baseline BDM for QID {qid_identifier_str}. Skipping."); print("-" * 50); continue
        current_qid_baseline_mdl_cost = baseline_bdm_original_corpus
        print(f"  Baseline MDL for QID {qid_identifier_str} (L(D_orig)): {current_qid_baseline_mdl_cost:.4f}")

        # get_motifs_for_qid_batched is from Cell 4
        raw_motifs_from_chunks = get_motifs_for_qid_batched(
            actual_response_texts_for_qid, LLM_BATCH_SIZE_RESPONSES,
            hf_pipeline_instance, hf_tokenizer_instance, qid_identifier_str
        )
        current_qid_result_entry = {
            "qid": qid_identifier_str, "corpus_len_chars": len(full_corpus_text_for_qid), "num_responses": num_total_responses_for_qid,
            "baseline_mdl": current_qid_baseline_mdl_cost, "final_refined_motifs": [], "l_h_final_motifs": 0.0,
            "l_d_h_final_motifs": current_qid_baseline_mdl_cost, "total_mdl_with_final_motifs": current_qid_baseline_mdl_cost,
            "compression_achieved": 0.0, "num_raw_motifs_extracted": len(raw_motifs_from_chunks),
            "num_consolidated_motifs": 0, "num_globally_refined_motifs": 0
        }
        if not raw_motifs_from_chunks: print(f"  No raw motifs by LLM for QID {qid_identifier_str}."); all_qid_mdl_results_list.append(current_qid_result_entry); print("-" * 50); continue
        print(f"  Extracted {len(raw_motifs_from_chunks)} raw motif objects for QID {qid_identifier_str}.")

        consolidated_motifs_list = consolidate_raw_motifs(raw_motifs_from_chunks) # From Cell 4
        current_qid_result_entry["num_consolidated_motifs"] = len(consolidated_motifs_list)
        print(f"  Consolidated into {len(consolidated_motifs_list)} unique motifs for QID {qid_identifier_str}.")
        if not consolidated_motifs_list: print(f"  No unique motifs after consolidation for QID {qid_identifier_str}."); all_qid_mdl_results_list.append(current_qid_result_entry); print("-" * 50); continue

        # print(f"  Consolidated Motifs (BEFORE Global SF refinement):")
        # for idx, mo_con in enumerate(consolidated_motifs_list): print(f"    Cons. Motif {idx+1}: L='{mo_con.get('label')}', D='{mo_con.get('description','N/A')[:30]}...', SFs({len(mo_con.get('surface_forms',[]))})='{mo_con.get('surface_forms',[])[:2]}...'")

        # filter_surface_forms_by_global_frequency from Cell 4
        globally_refined_motifs = filter_surface_forms_by_global_frequency(
            consolidated_motifs_list, full_corpus_text_for_qid, MIN_SF_FREQUENCY_IN_FULL_CORPUS
        )
        current_qid_result_entry["num_globally_refined_motifs"] = len(globally_refined_motifs)
        print(f"  Globally refined into {len(globally_refined_motifs)} motifs for QID {qid_identifier_str}.")
        if not globally_refined_motifs: print(f"  No motifs left after GLOBAL SF refinement for QID {qid_identifier_str}."); all_qid_mdl_results_list.append(current_qid_result_entry); print("-" * 50); continue

        print(f"  Final Globally Refined Motifs for QID {qid_identifier_str}:")
        for idx, mo_final in enumerate(globally_refined_motifs): print(f"    Refined Motif {idx+1}: L='{mo_final.get('label')}', D='{mo_final.get('description','N/A')[:60]}...', SFs({len(mo_final.get('surface_forms',[]))})='{mo_final.get('surface_forms',[])}'")

        # compute_mdl_cost_for_text_block from Cell 5
        l_h_final, l_d_h_final, total_mdl_final = compute_mdl_cost_for_text_block(
            full_corpus_text_for_qid, globally_refined_motifs, bdm_instance_main # Removed matrix_s, uses default
        )
        current_qid_result_entry.update({
            "final_refined_motifs": globally_refined_motifs, "l_h_final_motifs": l_h_final,
            "l_d_h_final_motifs": l_d_h_final if l_d_h_final >=0 else "BDM_ERROR",
            "total_mdl_with_final_motifs": total_mdl_final if l_d_h_final >=0 else "BDM_ERROR",
            "compression_achieved": "BDM_ERROR" if l_d_h_final < 0 else (current_qid_baseline_mdl_cost - total_mdl_final)
        })
        if l_d_h_final < 0: print(f"  Error computing MDL cost (BDM error) for QID {qid_identifier_str}."); all_qid_mdl_results_list.append(current_qid_result_entry); print("-" * 50); continue

        print(f"  L(H) final motifs: {l_h_final:.4f} (SFs definition cost in L(H) is ZEROED for this run)")
        print(f"  L(D|H) compressed full corpus: {l_d_h_final:.4f}")
        print(f"  Total MDL cost with final motifs: {total_mdl_final:.4f}")
        compression_val = current_qid_result_entry["compression_achieved"]
        result_status_str = f"SUCCESS: Comp: {compression_val:.4f}" if isinstance(compression_val, float) and compression_val > 0.0001 else f"NOTE: No sig. comp. Diff: {compression_val if isinstance(compression_val, str) else compression_val:.4f}"
        print(f"  {result_status_str}"); all_qid_mdl_results_list.append(current_qid_result_entry); print("-" * 50)

    print("\n--- Overall QID-based MDL Analysis Summary ---")
    if not all_qid_mdl_results_list: print("No QIDs processed.")
    else:
        valid_results = [r for r in all_qid_mdl_results_list if isinstance(r.get('compression_achieved'), float) and r.get('l_h_final_motifs', -1.0) >= 0]
        num_qids_ok = len(valid_results); num_comp = sum(1 for r in valid_results if r['compression_achieved'] > 0.0001)
        print(f"Targeted QIDs: {len(qids_to_process_this_run)}, Results logged: {len(all_qid_mdl_results_list)}, Valid MDL: {num_qids_ok}, QIDs compressed: {num_comp}")
        if num_comp > 0:
            comp_vals = [r['compression_achieved'] for r in valid_results if r['compression_achieved'] > 0.0001]
            print(f"  Avg compression: {np.mean(comp_vals):.4f}, Max compression: {np.max(comp_vals):.4f}")
        else: print("  No compression achieved.")

        output_filename = os.path.join(BASE_PROJECT_DIR, "mdl_analysis_refactored_v6_LHSFzero.json") # MODIFIED FILENAME
        try:
            with open(output_filename, "w", encoding="utf-8") as f_out: json.dump(all_qid_mdl_results_list, f_out, indent=2, ensure_ascii=False)
            print(f"Detailed results saved to {output_filename}")
        except Exception as e_s: print(f"Error saving results: {e_s}")

if __name__ == "__main__":
    print(f"Executing main MDL pipeline (Refactored v6 - L(H) SF Cost Zero) at {time.asctime()}...")
    main()
    print(f"Main MDL pipeline execution finished at {time.asctime()}.")

# 2nd June

## Continue from yesterday's refactored solution

In [None]:
# @title Cell 3: LLM Interaction (MODIFIED create_enhanced_motif_prompt)

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import time
from typing import List, Dict
import os # Added for LLM_DEBUG_LOG_FILE

# Assume constants like LOCAL_LLM_MODEL_ID, MAX_MOTIFS_PER_CHUNK, etc., are loaded from Cell 1
# For standalone testing of this cell, you might need to define them here or use try-except.
try:
    LOCAL_LLM_MODEL_ID
except NameError:
    print("WARN (Cell 3): Config constants not found from Cell 1. Using fallbacks.")
    LOCAL_LLM_MODEL_ID = 'google/gemma-2b-it'; USE_QUANTIZATION_FOR_LOCAL_LLM = True
    MAX_MOTIFS_PER_CHUNK = 5; MAX_TEXT_CHARS_PER_LLM_PROMPT_CHUNK = 7000
    LLM_MAX_NEW_TOKENS_MOTIF_EXTRACTION = 700; LLM_DEBUG_LOG_FILE = "temp_llm_debug_log_cell3.txt"


def initialize_llm_pipeline(
    model_id: str = LOCAL_LLM_MODEL_ID,
    use_quantization: bool = USE_QUANTIZATION_FOR_LOCAL_LLM,
    pipeline_return_full_text: bool = False
    ):
    # ... (keep this function exactly as in your last working version) ...
    # (It correctly sets pad_token, quantization, etc.)
    print(f"--- Initializing LLM Pipeline (model: {model_id}, quantization: {use_quantization}, return_full_text: {pipeline_return_full_text}) ---")
    hf_pipeline_instance = None; hf_tokenizer_instance = None
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu"); print(f"Using device: {device}")
    try:
        print(f"Loading tokenizer for {model_id}..."); hf_tokenizer_instance = AutoTokenizer.from_pretrained(model_id)
        if hf_tokenizer_instance.pad_token is None:
            if hf_tokenizer_instance.eos_token is not None: print("Tokenizer setting pad_token = eos_token."); hf_tokenizer_instance.pad_token = hf_tokenizer_instance.eos_token
            else: print("WARN (initialize_llm): Tokenizer has no pad_token and no eos_token.")
        bnb_config = None; quant_active = False
        if use_quantization and torch.cuda.is_available():
            try:
                compute_dtype = torch.bfloat16 if (device.type == 'cuda' and torch.cuda.is_bf16_supported()) else torch.float16
                bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=compute_dtype, bnb_4bit_use_double_quant=True)
                quant_active = True; print(f"BNB config created, compute_dtype: {compute_dtype}.")
            except Exception as e_bnb: print(f"WARN: Failed BitsAndBytesConfig: {e_bnb}. Quantization may be disabled."); quant_active = False
        print(f"Loading model {model_id} (Quantization: {quant_active})..."); model_kwargs = {"device_map": "auto", "trust_remote_code": True}
        if quant_active and bnb_config: model_kwargs["quantization_config"] = bnb_config
        elif device.type == 'cuda': model_kwargs["torch_dtype"] = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
        hf_model_instance = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)
        if hf_tokenizer_instance.pad_token_id is not None:
            if hf_model_instance.config.pad_token_id is None or hf_model_instance.config.pad_token_id != hf_tokenizer_instance.pad_token_id:
                hf_model_instance.config.pad_token_id = hf_tokenizer_instance.pad_token_id
                print(f"Model config pad_token_id set to: {hf_model_instance.config.pad_token_id}")
        hf_pipeline_instance = pipeline("text-generation", model=hf_model_instance, tokenizer=hf_tokenizer_instance, return_full_text=pipeline_return_full_text)
        print(f"LLM pipeline initialized for {model_id}."); return hf_pipeline_instance, hf_tokenizer_instance
    except Exception as e: print(f"CRITICAL: LLM pipeline init failed: {e}"); import traceback; traceback.print_exc(); return None, None


def create_enhanced_motif_prompt(text_corpus_chunk: str, max_motifs_to_extract: int = MAX_MOTIFS_PER_CHUNK) -> str: # Renamed for clarity
    """
    Revised prompt for extracting structured motifs, with very strong emphasis on JSON and label format.
    """
    if len(text_corpus_chunk) > MAX_TEXT_CHARS_PER_LLM_PROMPT_CHUNK:
        text_corpus_chunk = text_corpus_chunk[:MAX_TEXT_CHARS_PER_LLM_PROMPT_CHUNK]

    prompt = f"""You are a highly precise assistant for thematic analysis. Your task is to extract key recurring themes from the provided text.

STRICT OUTPUT REQUIREMENTS:
1.  Your entire response MUST be a single, valid JSON list.
2.  Each element in the list MUST be a JSON object.
3.  Each JSON object MUST contain exactly three keys: "label", "description", and "surface_forms".
4.  The value for "label" MUST be a string, IN ALL_CAPITAL_SNAKE_CASE, AND enclosed in square brackets. Example: "[DATA_SECURITY_POLICY]".
5.  The value for "description" MUST be a single, concise sentence (string).
6.  The value for "surface_forms" MUST be a JSON list of 2 to 3 short (2-6 words) VERBATIM phrases extracted DIRECTLY from the 'Text to analyze'. These phrases must be strong examples of the theme. If no suitable verbatim phrases are found, provide an empty list `[]`.
7.  Identify up to {max_motifs_to_extract} themes. If fewer than {max_motifs_to_extract} themes are clear, provide fewer objects. If no themes are clear, output an empty JSON list: `[]`.
8.  Do NOT include any text, explanations, apologies, or markdown (like ```json) before or after the main JSON list.

INSTRUCTIONS FOR THEME IDENTIFICATION:
- Focus on meaningful recurring concepts directly stated or strongly implied in the 'Text to analyze'.
- For 'surface_forms', prioritize phrases that appear to be REPEATED or are highly characteristic of the theme.
- Avoid generic labels like [EXAMPLE_THEME] or [GENERAL_TOPIC]. Make labels specific.

Text to analyze:
\"\"\"
{text_corpus_chunk}
\"\"\"

Your valid JSON response:
"""
    return prompt.strip()

def call_local_llm_for_raw_response(
    prompt_content_for_user_turn: str,
    hf_pipeline_instance,
    hf_tokenizer_instance,
    qid_for_log: str,
    chunk_idx_for_log: int
    ) -> str:
    # ... (keep this function exactly as in your last working version) ...
    # (It uses do_sample=False and should work with the above prompt)
    if not hf_pipeline_instance or not hf_tokenizer_instance:
        print(f"    ERROR (call_local_llm): LLM pipeline/tokenizer not initialized for QID {qid_for_log}, Chunk {chunk_idx_for_log}.")
        return ""
    messages_for_template = [{"role": "user", "content": prompt_content_for_user_turn}]
    try:
        prompt_formatted_for_llm = hf_tokenizer_instance.apply_chat_template(
            messages_for_template, tokenize=False, add_generation_prompt=True
        )
    except Exception as e_template:
        print(f"    ERROR (call_local_llm): Applying chat template failed for QID {qid_for_log}, Chunk {chunk_idx_for_log}: {e_template}")
        with open(LLM_DEBUG_LOG_FILE, "a", encoding="utf-8") as f:
            f.write(f"\n--- QID: {qid_for_log} --- CHUNK: {chunk_idx_for_log} (call_local_llm_for_raw_response) ---\nERROR APPLYING CHAT TEMPLATE: {e_template}\nUser prompt content (first 300 chars): {prompt_content_for_user_turn[:300]}...\n")
        return ""

    generation_args = {
        "max_new_tokens": LLM_MAX_NEW_TOKENS_MOTIF_EXTRACTION,
        "do_sample": False,
        "pad_token_id": hf_tokenizer_instance.pad_token_id
    }
    try:
        outputs = hf_pipeline_instance(prompt_formatted_for_llm, **generation_args)
        if outputs and isinstance(outputs, list) and len(outputs) > 0 and \
           outputs[0] and isinstance(outputs[0], dict) and 'generated_text' in outputs[0]:
            assistant_response_text = outputs[0]['generated_text'].strip()
            return assistant_response_text
        else:
            print(f"    WARN (call_local_llm): LLM pipeline returned unexpected structure for QID {qid_for_log}, Chunk {chunk_idx_for_log}. Output: {outputs}")
            return ""
    except Exception as e_pipeline:
        print(f"    ERROR (call_local_llm): Exception during hf_pipeline call for QID {qid_for_log}, Chunk {chunk_idx_for_log}: {e_pipeline}")
        with open(LLM_DEBUG_LOG_FILE, "a", encoding="utf-8") as f:
            f.write(f"\n--- QID: {qid_for_log} --- CHUNK: {chunk_idx_for_log} (call_local_llm_for_raw_response) ---\nERROR DURING PIPELINE CALL: {e_pipeline}\nFormatted prompt (first 300 chars): {prompt_formatted_for_llm[:300]}...\n")
        return ""

print("Cell 3: LLM Interaction Utilities loaded (with enhanced prompt).")

In [None]:
# @title Cell 4: Motif Processing and Validation (MODIFIED parse_and_validate_llm_json_response)

import json
import re
import time
from typing import List, Dict
from collections import Counter

# Assume constants like LLM_DEBUG_LOG_FILE, MIN_SF_FREQUENCY_IN_FULL_CORPUS etc. are loaded from Cell 1
# Assume text_utils functions like preprocess_corpus_for_motif_extraction, count_sf_occurrences are loaded from Cell 2
# Assume LLM interaction functions like create_enhanced_motif_prompt, call_local_llm_for_raw_response are loaded from Cell 3

# For standalone testing of this cell:
try:
    LLM_DEBUG_LOG_FILE
except NameError: LLM_DEBUG_LOG_FILE = "temp_llm_debug_log_cell4.txt"
try:
    MIN_SF_FREQUENCY_IN_FULL_CORPUS
except NameError: MIN_SF_FREQUENCY_IN_FULL_CORPUS = 2
try:
    LLM_RETRY_ATTEMPTS
except NameError: LLM_RETRY_ATTEMPTS = 2


def parse_and_validate_llm_json_response(
    llm_raw_response_text: str,
    qid_for_log:str,
    chunk_idx_for_log:int,
    prompt_sent_to_llm:str
    ) -> List[Dict]:
    """
    Parses the LLM's raw text response, attempts to fix labels if non-compliant,
    and validates schema. Returns a list of valid motif dictionaries.
    """
    json_str_candidate = llm_raw_response_text.strip()
    if json_str_candidate.startswith("```json"): json_str_candidate = json_str_candidate[len("```json"):].strip()
    if json_str_candidate.startswith("```"): json_str_candidate = json_str_candidate[len("```"):].strip()
    if json_str_candidate.endswith("```"): json_str_candidate = json_str_candidate[:-len("```")].strip()

    if not json_str_candidate or json_str_candidate.lower() == "[]" or \
       "no_themes_found" in json_str_candidate.lower() or \
       "no clear motifs" in json_str_candidate.lower():
        return []

    try:
        parsed_data = json.loads(json_str_candidate)
        if isinstance(parsed_data, dict): parsed_data = [parsed_data]
        if not isinstance(parsed_data, list):
            raise ValueError("Parsed JSON from LLM is not a list or a single object.")

        valid_motifs_from_json = []
        for item_idx, item in enumerate(parsed_data):
            if not isinstance(item, dict) or not item: # Skip non-dicts or empty dicts {}
                # print(f"    DEBUG (parse_validate): Skipping non-dict or empty item for QID {qid_for_log}, Chunk {chunk_idx_for_log}, Item {item_idx+1}: {item}")
                continue

            # --- Label Processing and Validation ---
            label_str_original = item.get('label', "")
            label_str_for_validation = "" # This will hold the label used for validation

            if isinstance(label_str_original, str) and label_str_original.strip():
                temp_label_stripped = label_str_original.strip()
                # Check if already correctly formatted
                if temp_label_stripped.startswith('[') and temp_label_stripped.endswith(']') and \
                   re.fullmatch(r"\[[A-Z0-9_]+\]", temp_label_stripped): # Strict check for [UPPER_SNAKE_CASE]
                    label_str_for_validation = temp_label_stripped
                else:
                    # Attempt to sanitize/fix if not perfectly formatted
                    # Extract potential bracketed part first
                    match_bracketed = re.search(r"(\[[A-Z0-9_]+\])", temp_label_stripped)
                    if match_bracketed:
                        label_str_for_validation = match_bracketed.group(1)
                        # print(f"    DEBUG (parse_validate): Extracted existing bracketed label '{label_str_for_validation}' from '{label_str_original}'")
                    else: # No bracketed part found, try to create one
                        sanitized_content = re.sub(r'\s+', '_', temp_label_stripped)
                        sanitized_content = re.sub(r'[^a-zA-Z0-9_]', '', sanitized_content).upper() # Allow a-z for initial capture then uppercase
                        sanitized_content = "_".join(sanitized_content.split('_')[:4]) # Limit length
                        if sanitized_content:
                            label_str_for_validation = f"[{sanitized_content}]"
                            # print(f"    DEBUG (parse_validate): Auto-formatted label from '{label_str_original}' to '{label_str_for_validation}'")
                        # If sanitization results in empty, label_str_for_validation remains ""

            # item['label'] = label_str_for_validation # Update the item's label with the processed one
                                                     # This is important if you reuse 'item' later

            # --- Schema Validation using processed label ---
            desc_str = item.get('description',"")
            sf_list = item.get('surface_forms', [])

            has_all_keys = all(k in item for k in ["label", "description", "surface_forms"])
            label_is_valid_format = isinstance(label_str_for_validation, str) and \
                                    bool(label_str_for_validation) and \
                                    label_str_for_validation.startswith('[') and \
                                    label_str_for_validation.endswith(']') and \
                                    re.fullmatch(r"\[[A-Z0-9_]+\]", label_str_for_validation)

            desc_is_valid = isinstance(desc_str, str) # Allow empty description string
            sfs_are_valid_list = isinstance(sf_list, list) and \
                                 all(isinstance(sf_item, str) for sf_item in sf_list)

            if has_all_keys and label_is_valid_format and desc_is_valid and sfs_are_valid_list:
                valid_motifs_from_json.append({
                    "label": label_str_for_validation,
                    "description": desc_str.strip(),
                    "surface_forms": [s.strip() for s in sf_list if isinstance(s, str) and s.strip()]
                })
            else: # Item failed detailed schema validation
                print(f"    [WARN] Invalid motif object schema after label processing for QID {qid_for_log}, Chunk {chunk_idx_for_log}, Item {item_idx+1}. Skipping.")
                # Log details of failure
                with open(LLM_DEBUG_LOG_FILE, "a", encoding="utf-8") as f:
                    f.write(f"\n--- QID: {qid_for_log} --- CHUNK: {chunk_idx_for_log} --- ITEM_SCHEMA_FAILURE (Item {item_idx+1}) ---\n")
                    f.write(f"Original Label: '{label_str_original}', Processed Label for Validation: '{label_str_for_validation}'\n")
                    f.write(f"Item Content: {json.dumps(item, indent=2)}\n") # Log the item state
                    f.write(f"Validation Details: has_keys={has_all_keys}, label_ok={label_is_valid_format}, desc_ok={desc_is_valid}, sf_list_ok={sfs_are_valid_list}\n")
        return valid_motifs_from_json

    except (json.JSONDecodeError, ValueError) as e:
        print(f"    [WARN] Motif JSON parsing or core structure issue for QID {qid_for_log}, Chunk {chunk_idx_for_log}: {e}")
        # ... (existing logging to file for JSONDecodeError) ...
        return []

def get_motifs_for_qid_batched( # This function remains structurally similar
    list_of_individual_response_texts: List[str],
    responses_per_batch: int,
    hf_pipeline_instance,
    hf_tokenizer_instance,
    qid_for_log: str
    ) -> List[Dict]:
    # ... (Same as your last correct version, calling create_enhanced_motif_prompt,
    #      call_local_llm_for_raw_response, and parse_and_validate_llm_json_response) ...
    all_raw_motifs_from_chunks = []
    batched_text_chunks_for_llm = []
    for i in range(0, len(list_of_individual_response_texts), responses_per_batch):
        batch_responses = list_of_individual_response_texts[i:i + responses_per_batch]
        chunk_text_for_llm = preprocess_corpus_for_motif_extraction("\n\n<RSP_SEP>\n\n".join(batch_responses)) # preprocess_corpus from Cell 2
        batched_text_chunks_for_llm.append(chunk_text_for_llm)
    print(f"  QID {qid_for_log}: Processing {len(list_of_individual_response_texts)} responses in {len(batched_text_chunks_for_llm)} preprocessed chunks (batch size: {responses_per_batch} responses).")
    for chunk_idx, text_chunk_to_analyze_processed in enumerate(batched_text_chunks_for_llm):
        print(f"    Analyzing chunk {chunk_idx + 1}/{len(batched_text_chunks_for_llm)} for QID {qid_for_log} (processed chunk len: {len(text_chunk_to_analyze_processed)} chars)...")
        if len(text_chunk_to_analyze_processed.strip()) < 50:
            print(f"      Chunk {chunk_idx+1} (QID {qid_for_log}) too short, skipping."); continue
        prompt_for_llm = create_enhanced_motif_prompt(text_chunk_to_analyze_processed) # create_enhanced_motif_prompt from Cell 3
        motifs_from_this_chunk = []
        for attempt in range(LLM_RETRY_ATTEMPTS): # LLM_RETRY_ATTEMPTS from config
            raw_llm_response = call_local_llm_for_raw_response( # call_local_llm_for_raw_response from Cell 3
                prompt_for_llm, hf_pipeline_instance, hf_tokenizer_instance, qid_for_log, chunk_idx + 1
            )
            if not raw_llm_response:
                print(f"      LLM call attempt {attempt + 1} for chunk {chunk_idx+1} (QID {qid_for_log}) returned empty. Retrying if possible...");
                if attempt < LLM_RETRY_ATTEMPTS - 1: time.sleep(1); continue
            parsed_motifs_from_this_attempt = parse_and_validate_llm_json_response( # Defined in this cell
                raw_llm_response, qid_for_log, chunk_idx+1, prompt_for_llm
            )
            if parsed_motifs_from_this_attempt:
                motifs_from_this_chunk = parsed_motifs_from_this_attempt; break
            else:
                print(f"      Motif parsing/validation attempt {attempt + 1} yielded no structured motifs for chunk {chunk_idx+1}. Retrying if possible...");
                if attempt < LLM_RETRY_ATTEMPTS - 1: time.sleep(1)
        if motifs_from_this_chunk:
            print(f"      Extracted {len(motifs_from_this_chunk)} structured motif objects from chunk {chunk_idx+1} (QID {qid_for_log}).")
            all_raw_motifs_from_chunks.extend(motifs_from_this_chunk)
        else:
            print(f"      No valid structured motifs from chunk {chunk_idx+1} (QID {qid_for_log}) after {LLM_RETRY_ATTEMPTS} attempts.")
    return all_raw_motifs_from_chunks

def consolidate_raw_motifs(list_of_all_raw_motifs: List[Dict]) -> List[Dict]:
    # ... (Keep this function exactly as in your last working version) ...
    # (It merges by label and combines/deduplicates surface_forms)
    if not list_of_all_raw_motifs: return []
    consolidated_motifs_map = {}
    for motif_obj in list_of_all_raw_motifs:
        label = motif_obj.get("label","").strip()
        description = motif_obj.get("description","").strip()
        surface_forms = motif_obj.get("surface_forms", [])
        if not (label and isinstance(surface_forms, list)): continue
        current_sfs_set = set(sf.lower().strip() for sf in surface_forms if isinstance(sf, str) and sf.strip())
        if label not in consolidated_motifs_map:
            consolidated_motifs_map[label] = {"label": label, "description": description, "surface_forms": sorted(list(current_sfs_set))}
        else:
            existing_sfs_set = set(consolidated_motifs_map[label].get("surface_forms", []))
            consolidated_motifs_map[label]["surface_forms"] = sorted(list(existing_sfs_set.union(current_sfs_set)))
    return list(consolidated_motifs_map.values())


def filter_surface_forms_by_global_frequency(
    consolidated_motifs_list: List[Dict],
    full_qid_corpus_text: str,
    min_global_freq: int = MIN_SF_FREQUENCY_IN_FULL_CORPUS # From config
    ) -> List[Dict]:
    # ... (Keep this function exactly as in your last working version) ...
    # (It uses count_sf_occurrences from Cell 2)
    if not consolidated_motifs_list: return []
    final_globally_filtered_motifs = []
    for motif_obj in consolidated_motifs_list:
        globally_frequent_sfs_for_this_motif = []
        original_sfs_for_this_motif = motif_obj.get("surface_forms", [])
        for sf_str in original_sfs_for_this_motif:
            count = count_sf_occurrences(full_qid_corpus_text, sf_str) # count_sf_occurrences from Cell 2
            if count >= min_global_freq:
                globally_frequent_sfs_for_this_motif.append(sf_str)
        if globally_frequent_sfs_for_this_motif:
            filtered_motif_entry = motif_obj.copy()
            filtered_motif_entry["surface_forms"] = sorted(list(set(globally_frequent_sfs_for_this_motif)))
            final_globally_filtered_motifs.append(filtered_motif_entry)
    return final_globally_filtered_motifs

print("Cell 4: Motif Processing and Validation Utilities loaded (with enhanced prompt and label fixing).")

# 1st June

## Refactored from New MWP below

In [5]:
# @title Cell 1: Configuration

import os

# --- Project Paths ---
# !!! IMPORTANT: UPDATE BASE_PROJECT_DIR TO YOUR ACTUAL PATH !!!
# Example for Google Colab:
# from google.colab import drive
# drive.mount('/content/drive')
# BASE_PROJECT_DIR = '/content/drive/MyDrive/YourFolder/LegalAnalysis/'
# Example for local:
BASE_PROJECT_DIR = '/content/drive/MyDrive/Colab Notebooks/Legal/' # Matching your last successful run
# Ensure BASE_PROJECT_DIR ends with a slash if it's a directory path

PHASE2_OUTPUT_DIR = os.path.join(BASE_PROJECT_DIR, 'Phase2_PDF_Collated_Texts/')
P2_COLLATED_FILE = os.path.join(PHASE2_OUTPUT_DIR, 'phase2_collated_pdf_texts.json')

P3_QIDS_TO_PROCESS_THEMATICALLY = ["Q4"]

# --- BDM Configuration ---
MATRIX_SIZE_GLOBAL = (8, 8)
MAX_TEXT_FOR_BDM_HASH = 2000

# --- LLM Configuration ---
LOCAL_LLM_MODEL_ID = 'google/gemma-2b-it'
USE_QUANTIZATION_FOR_LOCAL_LLM = True
LLM_BATCH_SIZE_RESPONSES = 5
LLM_RETRY_ATTEMPTS = 2
MAX_TEXT_CHARS_PER_LLM_PROMPT_CHUNK = 7000
LLM_MAX_NEW_TOKENS_MOTIF_EXTRACTION = 700
MAX_MOTIFS_PER_CHUNK = 5

# --- Token-Based L(H) Configuration ---
MOTIF_SYMBOLIC_LABEL_COST = 0.5
MOTIF_DESCRIPTION_TEXT_BASE_COST = 0.5
MOTIF_DESCRIPTION_TOKEN_COST = 0.1
MOTIF_SURFACE_FORMS_LIST_BASE_COST = 0.25
MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH = 0.1

# --- Surface Form Filtering Configuration ---
MIN_SF_FREQUENCY_IN_FULL_CORPUS = 2

# --- Logging File ---
LLM_DEBUG_LOG_FILE = os.path.join(BASE_PROJECT_DIR, "llm_motif_debug_log_refactored_v1.txt") # New log file for refactored version

print(f"Cell 1: Configuration loaded. LOCAL_LLM_MODEL_ID set to '{LOCAL_LLM_MODEL_ID}'.")
print(f"Debug log will be: {LLM_DEBUG_LOG_FILE}")
if not os.path.exists(BASE_PROJECT_DIR):
    print(f"WARNING: BASE_PROJECT_DIR '{BASE_PROJECT_DIR}' does not exist.")
if P2_COLLATED_FILE and not os.path.exists(P2_COLLATED_FILE):
     print(f"WARNING: P2_COLLATED_FILE '{P2_COLLATED_FILE}' does not exist. Data loading may fail.")

Cell 1: Configuration loaded. LOCAL_LLM_MODEL_ID set to 'google/gemma-2b-it'.
Debug log will be: /content/drive/MyDrive/Colab Notebooks/Legal/llm_motif_debug_log_refactored_v1.txt


In [6]:
# @title Cell 2: Text Utilities

import re
from typing import List, Dict # Added Dict for extract_actual_phrases
from collections import Counter # For extract_actual_phrases_from_text

# Import constants from Cell 1 (if running as separate cells in a notebook)
# If this were a .py file, it would be: from config import MIN_SF_FREQ_IN_CHUNK_VALIDATION
# For a notebook, we assume Cell 1's constants are in the global namespace after running it.
# However, to make cells more independent if run out of order or for clarity:
try:
    MIN_SF_FREQ_IN_CHUNK_VALIDATION # Check if defined
except NameError:
    print("WARN (Cell 2): MIN_SF_FREQ_IN_CHUNK_VALIDATION not found from config, using default 2.")
    MIN_SF_FREQ_IN_CHUNK_VALIDATION = 2


def tokenize_phrase(phrase_text: str) -> List[str]:
    """
    Simple tokenizer for phrases, definitions, or surface forms.
    Lowercases and splits by space.
    """
    if not isinstance(phrase_text, str) or not phrase_text.strip():
        return []
    return phrase_text.lower().split()

def preprocess_corpus_for_motif_extraction(text_corpus: str) -> str:
    """
    Preprocesses a text corpus (typically a chunk of joined responses)
    before sending to LLM or for n-gram extraction.
    - Consolidates excessive newlines and spaces.
    - Filters out very short lines (potential noise).
    """
    if not isinstance(text_corpus, str):
        return ""

    text = re.sub(r'\n{3,}', '\n\n', text_corpus)
    text = re.sub(r' {2,}', ' ', text)

    lines = text.split('\n')
    filtered_lines = [line.strip() for line in lines if len(line.strip()) > 10 or not line.strip()]

    return '\n'.join(filtered_lines)

def count_sf_occurrences(corpus_text: str, surface_form: str) -> int:
    """
    Counts case-insensitive occurrences of a surface_form within the corpus_text.
    """
    if not corpus_text or not surface_form or \
       not isinstance(corpus_text, str) or not isinstance(surface_form, str) or \
       not surface_form.strip(): # Don't count empty surface forms
        return 0
    try:
        # Using word boundaries \b to ensure we match whole phrases/words
        # This might be too restrictive if SFs are substrings, but generally safer
        # For simple phrase counting, \b might not be needed if sf is multi-word
        # Let's remove \b for now to match previous behavior of simple substring count
        return len(re.findall(re.escape(surface_form.lower()), corpus_text.lower(), flags=re.IGNORECASE))
    except re.error as e:
        print(f"    [WARN] Regex error in count_sf_occurrences for SF '{surface_form}': {e}")
        return 0

def extract_actual_phrases_from_text(
    text: str,
    min_phrase_len: int = 2,
    max_phrase_len: int = 6,
    min_freq: int = MIN_SF_FREQ_IN_CHUNK_VALIDATION
    ) -> Dict[str, int]:
    """
    Extracts n-gram phrases (2 to 6 words by default) and their frequencies from text.
    Only returns phrases meeting min_freq.
    NOTE: This function is NOT currently used in the main "reverted prompt" pipeline flow,
          but is kept as a utility for potential future chunk-level SF validation.
    """
    if not isinstance(text, str) or not text.strip():
        return {}

    text_cleaned = text.lower()
    text_cleaned = re.sub(r'[^\w\s\']', ' ', text_cleaned)
    text_cleaned = re.sub(r'\s+', ' ', text_cleaned).strip()

    words = text_cleaned.split()
    if not words or len(words) < min_phrase_len:
        return {}

    phrase_counts = Counter()
    for n in range(min_phrase_len, max_phrase_len + 1):
        if n > len(words): continue
        for i in range(len(words) - n + 1):
            phrase_tokens = words[i:i+n]
            phrase = ' '.join(phrase_tokens)
            if phrase:
                phrase_counts[phrase] += 1

    recurring_phrases = {phrase: count for phrase, count in phrase_counts.items() if count >= min_freq}
    return recurring_phrases

print("Cell 2: Text Utilities loaded.")

WARN (Cell 2): MIN_SF_FREQ_IN_CHUNK_VALIDATION not found from config, using default 2.
Cell 2: Text Utilities loaded.


In [7]:
# @title Cell 3: LLM Interaction

import torch # Should be imported before transformers in some environments
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import time # For potential sleeps if needed, and debug log timestamping
from typing import List # Already imported in Cell 1, but good practice for module independence

# Import constants from Cell 1 (if running as separate cells in a notebook)
# For a .py file, use: from config import LOCAL_LLM_MODEL_ID, ...
# For notebooks, assume Cell 1's constants are in global scope.
# Add checks or default values if a constant might be missing.
try:
    LOCAL_LLM_MODEL_ID
except NameError:
    print("WARN (Cell 3): Config constants like LOCAL_LLM_MODEL_ID not found. Using fallback or expecting errors.")
    LOCAL_LLM_MODEL_ID = 'google/gemma-2b-it' # Fallback
    USE_QUANTIZATION_FOR_LOCAL_LLM = True
    MAX_MOTIFS_PER_CHUNK = 5
    MAX_TEXT_CHARS_PER_LLM_PROMPT_CHUNK = 7000
    LLM_MAX_NEW_TOKENS_MOTIF_EXTRACTION = 700
    LLM_DEBUG_LOG_FILE = "llm_interaction_debug_temp.txt"


def initialize_llm_pipeline(
    model_id: str = LOCAL_LLM_MODEL_ID,
    use_quantization: bool = USE_QUANTIZATION_FOR_LOCAL_LLM,
    pipeline_return_full_text: bool = False # Defaulting to False as per our findings
    ):
    """Initializes and returns the Hugging Face pipeline and tokenizer."""
    print(f"--- Initializing LLM Pipeline (model: {model_id}, quantization: {use_quantization}, return_full_text: {pipeline_return_full_text}) ---")

    hf_pipeline_instance = None
    hf_tokenizer_instance = None
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    try:
        print(f"Loading tokenizer for {model_id}...")
        hf_tokenizer_instance = AutoTokenizer.from_pretrained(model_id)

        if hf_tokenizer_instance.pad_token is None:
            if hf_tokenizer_instance.eos_token is not None:
                print("Tokenizer does not have a pad_token; setting pad_token = eos_token.")
                hf_tokenizer_instance.pad_token = hf_tokenizer_instance.eos_token
            else:
                print("WARN (initialize_llm): Tokenizer has no pad_token and no eos_token. This might cause issues.")
                # Add a generic pad token if really needed, but usually models handle this with eos.
                # hf_tokenizer_instance.add_special_tokens({'pad_token': '[PAD]'})

        bnb_config = None
        quant_active = False
        if use_quantization and torch.cuda.is_available():
            try:
                compute_dtype = torch.bfloat16 if (device.type == 'cuda' and torch.cuda.is_bf16_supported()) else torch.float16
                bnb_config = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_quant_type="nf4",
                    bnb_4bit_compute_dtype=compute_dtype,
                    bnb_4bit_use_double_quant=True
                )
                quant_active = True
                print(f"BitsAndBytesConfig created for {model_id}, compute_dtype: {compute_dtype}.")
            except Exception as e_bnb:
                print(f"WARN: Failed to create BitsAndBytesConfig: {e_bnb}. Quantization may be disabled or fall back.")
                quant_active = False

        print(f"Loading local model {model_id} (Quantization active: {quant_active})...")
        model_kwargs = {"device_map": "auto", "trust_remote_code": True}

        if quant_active and bnb_config:
            model_kwargs["quantization_config"] = bnb_config
        elif device.type == 'cuda':
             model_kwargs["torch_dtype"] = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

        hf_model_instance = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)

        # Set model's pad_token_id if tokenizer's was set to eos_token_id (or another valid ID)
        if hf_tokenizer_instance.pad_token_id is not None:
            if hf_model_instance.config.pad_token_id is None or \
               hf_model_instance.config.pad_token_id != hf_tokenizer_instance.pad_token_id:
                hf_model_instance.config.pad_token_id = hf_tokenizer_instance.pad_token_id
                print(f"Model config pad_token_id set to: {hf_model_instance.config.pad_token_id}")
        else: # Should not happen if we set tokenizer.pad_token = tokenizer.eos_token and eos_token exists
            print("WARN (initialize_llm): Tokenizer pad_token_id is None after attempting to set. Pipeline might use default.")


        hf_pipeline_instance = pipeline(
            "text-generation",
            model=hf_model_instance,
            tokenizer=hf_tokenizer_instance,
            return_full_text=pipeline_return_full_text
        )
        print(f"Local LLM pipeline for {model_id} initialized successfully.")
        return hf_pipeline_instance, hf_tokenizer_instance
    except Exception as e:
        print(f"CRITICAL: Failed to initialize local LLM pipeline: {e}")
        import traceback
        traceback.print_exc()
        return None, None

def build_llm_prompt_for_motifs(text_block_for_prompt: str, max_motifs_to_extract: int = MAX_MOTIFS_PER_CHUNK) -> str:
    """Uses the simpler prompt structure that previously yielded bracketed labels."""
    if len(text_block_for_prompt) > MAX_TEXT_CHARS_PER_LLM_PROMPT_CHUNK:
        # print(f"    Note (build_prompt): Text block for LLM prompt analysis truncated from {len(text_block_for_prompt)} to {MAX_TEXT_CHARS_PER_LLM_PROMPT_CHUNK} chars.")
        text_block_for_prompt = text_block_for_prompt[:MAX_TEXT_CHARS_PER_LLM_PROMPT_CHUNK]

    # This is the prompt from the "Old Successful Code Cell"
    prompt = f"""You will receive a set of comments from different people answering the same question.

Your task is to identify up to {max_motifs_to_extract} key recurring themes.

For each theme, provide:
- A short label like [DATA_PRIVACY]
- A 1-sentence description of the theme
- 2–3 short phrases that often appear in the text (surface forms)

Output MUST be a valid JSON list of objects, where each object has "label", "description", and "surface_forms" keys.
Example of one object in the list:
{{
  "label": "[EXAMPLE_LABEL]",
  "description": "A concise description of the example theme.",
  "surface_forms": ["short repeated phrase 1", "another short repeated phrase"]
}}
If no clear motifs are found, output an empty JSON list: `[]`.
Do not include any other text, explanations, or markdown code fences around the JSON.

Set of comments to analyze:
\"\"\"
{text_block_for_prompt}
\"\"\"

Valid JSON Output (ensure it's a list of objects, or an empty list [] if no themes):
"""
    return prompt.strip()

def call_local_llm_for_raw_response(
    prompt_content_for_user_turn: str,
    hf_pipeline_instance,
    hf_tokenizer_instance,
    qid_for_log: str,
    chunk_idx_for_log: int
    ) -> str:
    """
    Makes the actual call to the local LLM pipeline using a pre-formatted user prompt string.
    Returns the raw text string generated by the LLM.
    Assumes pipeline is initialized with return_full_text=False.
    """
    if not hf_pipeline_instance or not hf_tokenizer_instance:
        print(f"    ERROR (call_local_llm): LLM pipeline/tokenizer not initialized for QID {qid_for_log}, Chunk {chunk_idx_for_log}.")
        return ""

    messages_for_template = [{"role": "user", "content": prompt_content_for_user_turn}]

    try:
        prompt_formatted_for_llm = hf_tokenizer_instance.apply_chat_template(
            messages_for_template, tokenize=False, add_generation_prompt=True
        )
    except Exception as e_template:
        print(f"    ERROR (call_local_llm): Applying chat template failed for QID {qid_for_log}, Chunk {chunk_idx_for_log}: {e_template}")
        with open(LLM_DEBUG_LOG_FILE, "a", encoding="utf-8") as f: # LLM_DEBUG_LOG_FILE from config
            f.write(f"\n--- QID: {qid_for_log} --- CHUNK: {chunk_idx_for_log} (call_local_llm_for_raw_response) ---\n")
            f.write(f"ERROR APPLYING CHAT TEMPLATE: {e_template}\n")
            f.write(f"User prompt content (first 300 chars): {prompt_content_for_user_turn[:300]}...\n")
        return ""

    generation_args = {
        "max_new_tokens": LLM_MAX_NEW_TOKENS_MOTIF_EXTRACTION, # From config
        "do_sample": False, # Key for consistency
        "pad_token_id": hf_tokenizer_instance.pad_token_id # Ensure this is properly set
    }
    # print(f"    DEBUG (call_local_llm): QID {qid_for_log}, Chunk {chunk_idx_for_log}, Prompt len: {len(prompt_formatted_for_llm)}, GenArgs: {generation_args}")

    try:
        outputs = hf_pipeline_instance(prompt_formatted_for_llm, **generation_args)

        # print(f"    DEBUG (call_local_llm): Raw 'outputs' from hf_pipeline for QID {qid_for_log}, Chunk {chunk_idx_for_log}:\n<<<<<\n{outputs}\n>>>>>")

        if outputs and isinstance(outputs, list) and len(outputs) > 0 and \
           outputs[0] and isinstance(outputs[0], dict) and 'generated_text' in outputs[0]:
            # Since pipeline is expected to be initialized with return_full_text=False,
            # outputs[0]['generated_text'] should only be the new tokens.
            assistant_response_text = outputs[0]['generated_text'].strip()
            # print(f"    DEBUG (call_local_llm): 'assistant_response_text' for QID {qid_for_log}, Chunk {chunk_idx_for_log} (len {len(assistant_response_text)}):\n<<<<<\n{assistant_response_text[:1000]}...\n>>>>>")
            return assistant_response_text
        else:
            print(f"    WARN (call_local_llm): LLM pipeline returned unexpected or empty structure for QID {qid_for_log}, Chunk {chunk_idx_for_log}. Output: {outputs}")
            return ""
    except Exception as e_pipeline:
        print(f"    ERROR (call_local_llm): Exception during hf_pipeline call for QID {qid_for_log}, Chunk {chunk_idx_for_log}: {e_pipeline}")
        with open(LLM_DEBUG_LOG_FILE, "a", encoding="utf-8") as f: # LLM_DEBUG_LOG_FILE from config
            f.write(f"\n--- QID: {qid_for_log} --- CHUNK: {chunk_idx_for_log} (call_local_llm_for_raw_response) ---\n")
            f.write(f"ERROR DURING PIPELINE CALL: {e_pipeline}\n")
            f.write(f"Formatted prompt (first 300 chars): {prompt_formatted_for_llm[:300]}...\n")
        return ""

print("Cell 3: LLM Interaction Utilities loaded.")

Cell 3: LLM Interaction Utilities loaded.


In [8]:
# @title Cell 4: Motif Processing and Validation

import json # Already imported in Cell 1, but good for module clarity
import re   # Already imported in Cell 1
import time # Already imported in Cell 1
from typing import List, Dict # Already imported in Cell 1
# from collections import Counter # Not directly used here, but extract_actual_phrases was in Cell 2

# Import functions/constants from previous cells if needed for standalone cell execution (for .py files)
# For notebooks, assume previous cells have run and defined them in global scope.
# from text_utils import preprocess_corpus_for_motif_extraction, count_sf_occurrences # If these were in a separate file
# from llm_interaction import build_llm_prompt_for_motifs, call_local_llm_for_raw_response # If these were in a separate file
# from config import LLM_RETRY_ATTEMPTS, MIN_SF_FREQUENCY_IN_FULL_CORPUS, LLM_DEBUG_LOG_FILE # etc.

# Add checks or default values for constants if a cell might be run independently
try:
    LLM_RETRY_ATTEMPTS
except NameError:
    print("WARN (Cell 4): LLM_RETRY_ATTEMPTS not found from config, using default 2.")
    LLM_RETRY_ATTEMPTS = 2
try:
    MIN_SF_FREQUENCY_IN_FULL_CORPUS
except NameError:
    print("WARN (Cell 4): MIN_SF_FREQUENCY_IN_FULL_CORPUS not found from config, using default 2.")
    MIN_SF_FREQUENCY_IN_FULL_CORPUS = 2
try:
    LLM_DEBUG_LOG_FILE
except NameError:
    LLM_DEBUG_LOG_FILE = "temp_llm_debug_log_cell4.txt"


def parse_and_validate_llm_json_response(
    llm_raw_response_text: str,
    qid_for_log:str,
    chunk_idx_for_log:int,
    prompt_sent_to_llm:str # For logging context if prompt caused error
    ) -> List[Dict]:
    """
    Parses the LLM's raw text response, attempts to fix labels, and validates schema.
    Returns a list of valid motif dictionaries, or an empty list on failure.
    """
    json_str_candidate = llm_raw_response_text.strip()

    # Attempt to remove markdown fences if LLM adds them
    if json_str_candidate.startswith("```json"):
        json_str_candidate = json_str_candidate[len("```json"):].strip()
    if json_str_candidate.startswith("```"): # More generic ```
        json_str_candidate = json_str_candidate[len("```"):].strip()
    if json_str_candidate.endswith("```"):
        json_str_candidate = json_str_candidate[:-len("```")].strip()

    # print(f"    DEBUG (parse_validate): QID {qid_for_log}, Chunk {chunk_idx_for_log}, JSON candidate for parsing:\n{json_str_candidate[:500]}...")

    # Handle cases where LLM explicitly says no themes or returns empty list string
    if not json_str_candidate or json_str_candidate.lower() == "[]" or \
       "no_themes_found" in json_str_candidate.lower() or \
       "no clear motifs" in json_str_candidate.lower():
        # print(f"    DEBUG (parse_validate): LLM indicated no themes or JSON was effectively empty for QID {qid_for_log}, Chunk {chunk_idx_for_log}.")
        return []

    try:
        parsed_data = json.loads(json_str_candidate)

        # Handle if LLM returns a single JSON object instead of a list
        if isinstance(parsed_data, dict):
            # print(f"    DEBUG (parse_validate): LLM returned a single JSON object, wrapping in list for QID {qid_for_log}, Chunk {chunk_idx_for_log}.")
            parsed_data = [parsed_data]

        if not isinstance(parsed_data, list):
            # This error will be caught by the outer except block
            raise ValueError("Parsed JSON from LLM is not a list (nor a single object that could be wrapped).")

        valid_motifs_from_json = []
        for item_idx, item in enumerate(parsed_data):
            if not item: # Skip empty dictionary {} items
                # print(f"    DEBUG (parse_validate): Skipping empty item object {{}} for QID {qid_for_log}, Chunk {chunk_idx_for_log}, Item {item_idx+1}")
                continue

            # --- Attempt to Fix/Sanitize Label ---
            label_str_original = item.get('label', "")
            label_str_processed = ""

            if isinstance(label_str_original, str) and label_str_original.strip():
                temp_label = label_str_original.strip()
                match = re.search(r"(\[[A-Z0-9_]+\])", temp_label) # Look for [UPPER_SNAKE_CASE]
                if match and match.group(1) == temp_label: # Whole string is correctly bracketed
                    label_str_processed = temp_label
                elif match: # Found a bracketed part within a longer string (e.g., "[LABEL] some text")
                    label_str_processed = match.group(1)
                    # print(f"    DEBUG (parse_validate): Extracted bracketed label '{label_str_processed}' from '{label_str_original}' for QID {qid_for_log}, Chunk {chunk_idx_for_log}, Item {item_idx+1}")
                elif not (temp_label.startswith('[') and temp_label.endswith(']')):
                    # If no brackets, or not solely bracketed, attempt to sanitize and add them
                    sanitized_content = re.sub(r'\s+', '_', temp_label) # Replace spaces with underscores
                    sanitized_content = re.sub(r'[^\w_]', '', sanitized_content).upper() # Keep only alphanum & underscore, then uppercase
                    sanitized_content = "_".join(sanitized_content.split('_')[:3]) # Limit to first 3 "words"
                    if sanitized_content: # Ensure not empty after sanitization
                        label_str_processed = f"[{sanitized_content}]"
                        # print(f"    DEBUG (parse_validate): Auto-formatted label from '{label_str_original}' to '{label_str_processed}' for QID {qid_for_log}, Chunk {chunk_idx_for_log}, Item {item_idx+1}")
            item['label'] = label_str_processed # Update item with processed label for validation
            # --- End Label Fix/Sanitize ---

            # --- Schema Validation ---
            current_item_label_for_validation = item.get('label',"") # Already processed and stripped if it was string
            desc_str = item.get('description',"")
            sf_list = item.get('surface_forms', [])

            is_dict_val = isinstance(item, dict)
            has_all_keys_val = all(k in item for k in ["label", "description", "surface_forms"])
            is_label_str_val = isinstance(current_item_label_for_validation, str) and bool(current_item_label_for_validation)
            label_starts_bracket_val = current_item_label_for_validation.startswith('[') if is_label_str_val else False
            label_ends_bracket_val = current_item_label_for_validation.endswith(']') if is_label_str_val else False
            is_desc_str_val = isinstance(desc_str, str)
            is_sf_list_val = isinstance(sf_list, list)
            sfs_are_strings_val = all(isinstance(sf_item, str) for sf_item in sf_list) if is_sf_list_val else False

            if is_dict_val and has_all_keys_val and \
               is_label_str_val and label_starts_bracket_val and label_ends_bracket_val and \
               is_desc_str_val and is_sf_list_val and sfs_are_strings_val:
                valid_motifs_from_json.append({
                    "label": current_item_label_for_validation,
                    "description": desc_str.strip(),
                    "surface_forms": [s.strip() for s in sf_list if isinstance(s, str) and s.strip()]
                })
            else: # Item failed schema validation
                print(f"    [WARN] Invalid motif object structure after label processing for QID {qid_for_log}, Chunk {chunk_idx_for_log}, Item {item_idx+1}. Skipping item.")
                with open(LLM_DEBUG_LOG_FILE, "a", encoding="utf-8") as f:
                    f.write(f"\n--- QID: {qid_for_log} --- CHUNK: {chunk_idx_for_log} --- ITEM_VALIDATION_FAILURE (Item {item_idx+1}) ---\n")
                    prompt_parts = prompt_sent_to_llm.split('Set of comments to analyze:')
                    user_content_for_log = "USER_CONTENT_SPLIT_FAILED_OR_PROMPT_EMPTY"
                    if len(prompt_parts) > 1: user_content_for_log = prompt_parts[1][:500]
                    elif prompt_sent_to_llm: user_content_for_log = prompt_sent_to_llm[:500]
                    f.write(f"PROMPT USER CONTENT (approx first 500 chars):\n{user_content_for_log}...\n")
                    f.write(f"RAW LLM RESPONSE (where validation failed for item):\n{llm_raw_response_text}\n")
                    f.write(f"PARSED ITEM THAT FAILED SCHEMA (after label processing attempt):\n{json.dumps(item, indent=2)}\n") # Log the item that failed
        return valid_motifs_from_json

    except (json.JSONDecodeError, ValueError) as e:
        print(f"    [WARN] Motif JSON parsing or core structure validation failed for QID {qid_for_log}, Chunk {chunk_idx_for_log}: {e}")
        with open(LLM_DEBUG_LOG_FILE, "a", encoding="utf-8") as f:
            f.write(f"\n--- QID: {qid_for_log} --- CHUNK: {chunk_idx_for_log} --- JSON_PARSE_ERROR_OR_VALUE_ERROR ---\n")
            prompt_parts = prompt_sent_to_llm.split('Set of comments to analyze:')
            user_content_for_log = "USER_CONTENT_SPLIT_FAILED_OR_PROMPT_EMPTY"
            if len(prompt_parts) > 1: user_content_for_log = prompt_parts[1][:500]
            elif prompt_sent_to_llm: user_content_for_log = prompt_sent_to_llm[:500]
            f.write(f"PROMPT USER CONTENT (approx first 500 chars):\n{user_content_for_log}...\n")
            f.write(f"RAW LLM RESPONSE (Error: {type(e).__name__}):\n{llm_raw_response_text}\n")
            f.write(f"EXTRACTED JSON STRING CANDIDATE (Error: {type(e).__name__}):\n{json_str_candidate}\n")
        return []

def get_motifs_for_qid_batched(
    list_of_individual_response_texts: List[str],
    responses_per_batch: int,
    hf_pipeline_instance,  # From Cell 3
    hf_tokenizer_instance, # From Cell 3
    qid_for_log: str
    ) -> List[Dict]:
    """
    Orchestrates batched LLM calls for a QID to get raw, structurally validated motif candidates.
    Does NOT perform chunk-level SF n-gram validation in this version.
    """
    all_raw_motifs_from_chunks = []

    # Create text chunks for LLM processing
    batched_text_chunks_for_llm = []
    for i in range(0, len(list_of_individual_response_texts), responses_per_batch):
        batch_responses = list_of_individual_response_texts[i:i + responses_per_batch]
        # Preprocess each chunk before building the prompt
        chunk_text_for_llm = preprocess_corpus_for_motif_extraction("\n\n<RSP_SEP>\n\n".join(batch_responses)) # preprocess_corpus_... from Cell 2
        batched_text_chunks_for_llm.append(chunk_text_for_llm)

    print(f"  QID {qid_for_log}: Processing {len(list_of_individual_response_texts)} responses in {len(batched_text_chunks_for_llm)} preprocessed chunks (batch size: {responses_per_batch} responses).")

    for chunk_idx, text_chunk_to_analyze_processed in enumerate(batched_text_chunks_for_llm):
        print(f"    Analyzing chunk {chunk_idx + 1}/{len(batched_text_chunks_for_llm)} for QID {qid_for_log} (processed chunk len: {len(text_chunk_to_analyze_processed)} chars)...")
        if len(text_chunk_to_analyze_processed.strip()) < 50: # Arbitrary threshold for very short/empty chunks
            print(f"      Chunk {chunk_idx+1} (QID {qid_for_log}) too short after preprocessing, skipping.")
            continue

        # build_llm_prompt_for_motifs is from Cell 3
        prompt_for_llm = build_llm_prompt_for_motifs(text_chunk_to_analyze_processed)

        motifs_from_this_chunk = [] # Store successfully parsed motifs for this chunk
        for attempt in range(LLM_RETRY_ATTEMPTS): # LLM_RETRY_ATTEMPTS from config
            # call_local_llm_for_raw_response is from Cell 3
            raw_llm_response = call_local_llm_for_raw_response(
                prompt_for_llm,
                hf_pipeline_instance,
                hf_tokenizer_instance,
                qid_for_log,
                chunk_idx + 1
            )

            if not raw_llm_response: # LLM call failed or returned empty
                print(f"      LLM call attempt {attempt + 1} for chunk {chunk_idx+1} (QID {qid_for_log}) returned empty string. Retrying if possible...")
                if attempt < LLM_RETRY_ATTEMPTS - 1: time.sleep(1) # Small delay before retry
                continue # Go to next attempt

            # Parse and validate the structure of the JSON response from LLM
            parsed_and_validated_motifs = parse_and_validate_llm_json_response(
                raw_llm_response,
                qid_for_log,
                chunk_idx+1,
                prompt_for_llm # Pass the prompt for logging context on error
            )

            if parsed_and_validated_motifs: # If parsing and basic validation were successful
                motifs_from_this_chunk = parsed_and_validated_motifs
                break # Success for this chunk, exit retry loop
            else:
                # This means JSON parsing failed, or it parsed to an empty list, or all items failed schema validation
                print(f"      Motif parsing/validation attempt {attempt + 1} yielded no structured motifs for chunk {chunk_idx+1} (QID {qid_for_log}). Retrying if possible...")
                if attempt < LLM_RETRY_ATTEMPTS - 1: time.sleep(1)

        if motifs_from_this_chunk:
            print(f"      Extracted {len(motifs_from_this_chunk)} structured motif objects from chunk {chunk_idx+1} (QID {qid_for_log}).")
            all_raw_motifs_from_chunks.extend(motifs_from_this_chunk)
        else:
            print(f"      No valid structured motifs extracted from chunk {chunk_idx+1} (QID {qid_for_log}) after {LLM_RETRY_ATTEMPTS} attempts.")

    return all_raw_motifs_from_chunks

def consolidate_raw_motifs(list_of_all_raw_motifs: List[Dict]) -> List[Dict]:
    """Consolidates motifs extracted from all chunks, primarily by label, merging surface forms."""
    if not list_of_all_raw_motifs:
        return []

    consolidated_motifs_map = {}
    for motif_obj in list_of_all_raw_motifs:
        # Ensure motif_obj is a dict and has a label (already validated by parse_and_validate)
        if not isinstance(motif_obj, dict) or not motif_obj.get("label"):
            continue

        label = motif_obj["label"] # Known to exist and be stripped from parse_and_validate
        description = motif_obj.get("description","").strip() # Also stripped
        surface_forms = motif_obj.get("surface_forms", []) # Already list of stripped strings

        # Normalize SFs for merging (lowercase, strip, unique)
        current_sfs_set = set(sf.lower() for sf in surface_forms if sf) # Assumes SFs are already stripped

        if label not in consolidated_motifs_map:
            consolidated_motifs_map[label] = {
                "label": label,
                "description": description, # Takes description from first encounter of this label
                "surface_forms": sorted(list(current_sfs_set))
            }
        else: # Label exists, merge surface forms
            existing_sfs_set = set(consolidated_motifs_map[label].get("surface_forms", [])) # These are already lowercased
            consolidated_motifs_map[label]["surface_forms"] = sorted(list(existing_sfs_set.union(current_sfs_set)))
            # Optionally, could append descriptions or choose the longest, etc. For now, keeps first.

    return list(consolidated_motifs_map.values())

def filter_surface_forms_by_global_frequency(
    consolidated_motifs_list: List[Dict],
    full_qid_corpus_text: str, # The entire original text for the QID
    min_global_freq: int = MIN_SF_FREQUENCY_IN_FULL_CORPUS # Uses config constant
    ) -> List[Dict]:
    """
    Filters surface forms in consolidated motifs based on their frequency
    in the full QID corpus text.
    Motifs are discarded if they have no SFs left after filtering.
    """
    if not consolidated_motifs_list:
        return []

    final_globally_filtered_motifs = []
    # print(f"  Filtering SFs from {len(consolidated_motifs_list)} consolidated motifs (min global freq: {min_global_freq})...")

    for motif_obj in consolidated_motifs_list:
        globally_frequent_sfs_for_this_motif = []
        # Surface forms in consolidated_motifs_list are already lowercased and unique
        original_sfs_for_this_motif = motif_obj.get("surface_forms", [])

        for sf_str_lower in original_sfs_for_this_motif:
            # count_sf_occurrences performs its own lowercasing of corpus and sf
            count = count_sf_occurrences(full_qid_corpus_text, sf_str_lower)

            if count >= min_global_freq:
                globally_frequent_sfs_for_this_motif.append(sf_str_lower) # Keep the lowercased SF
                # print(f"    SF '{sf_str_lower}' (label '{motif_obj.get('label')}') kept, global freq: {count}")
            # else:
                # print(f"    SF '{sf_str_lower}' (label '{motif_obj.get('label')}') filtered out, global freq: {count} (min_req: {min_global_freq})")

        if globally_frequent_sfs_for_this_motif: # Only keep motif if it has at least one globally frequent SF
            filtered_motif_entry = motif_obj.copy() # Make a copy to modify
            # SFs are already lowercased and unique from consolidation, just sort them
            filtered_motif_entry["surface_forms"] = sorted(list(set(globally_frequent_sfs_for_this_motif)))
            final_globally_filtered_motifs.append(filtered_motif_entry)
        # else:
            # print(f"    Motif '{motif_obj.get('label')}' discarded (no globally frequent SFs after filtering).")

    return final_globally_filtered_motifs

print("Cell 4: Motif Processing and Validation Utilities loaded.")

Cell 4: Motif Processing and Validation Utilities loaded.


In [9]:
# @title Cell 5: MDL Calculations

import hashlib # Already imported in Cell 1
import numpy as np # Already imported in Cell 1
from pybdm import BDM # Already imported in Cell 1
import re # Already imported in Cell 1
from typing import List, Dict # Already imported in Cell 1

# Import functions/constants from previous cells if needed (for .py files)
# For notebooks, assume previous cells have run and defined them in global scope.
# from config import MATRIX_SIZE_GLOBAL, MAX_TEXT_FOR_BDM_HASH, MOTIF_SYMBOLIC_LABEL_COST, ...
# from text_utils import tokenize_phrase

# Add checks or default values for constants if a cell might be run independently
try:
    MATRIX_SIZE_GLOBAL; MAX_TEXT_FOR_BDM_HASH; MOTIF_SYMBOLIC_LABEL_COST # Check a few
except NameError:
    print("WARN (Cell 5): Key config constants not found. Using fallback or expecting errors.")
    MATRIX_SIZE_GLOBAL = (8, 8); MAX_TEXT_FOR_BDM_HASH = 2000
    MOTIF_SYMBOLIC_LABEL_COST = 0.5; MOTIF_DESCRIPTION_TEXT_BASE_COST = 0.5
    MOTIF_DESCRIPTION_TOKEN_COST = 0.1; MOTIF_SURFACE_FORMS_LIST_BASE_COST = 0.25
    MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH = 0.1


def initialize_bdm_instance():
    """Initializes and returns a BDM instance."""
    print("Initializing BDM instance...")
    try:
        bdm_instance = BDM(ndim=2) # Corrected: Use default CTM-based NKS
        print("BDM instance initialized successfully (ndim=2, default CTM-based).")
        return bdm_instance
    except Exception as e_bdm_init:
        print(f"CRITICAL: Failed to initialize BDM instance: {e_bdm_init}")
        if "CTM data files" in str(e_bdm_init).lower() or "dataset" in str(e_bdm_init).lower():
            print("  BDM Error Hint: This might be related to missing/corrupted CTM data files for PyBDM.")
            print("  Ensure PyBDM is installed correctly and can access/download its data.")
            print("  You might need to run once: from pybdm import get_ctm_dataset; get_ctm_dataset()")
        return None

def text_to_binary_matrix(text_input: str, size: tuple = MATRIX_SIZE_GLOBAL) -> np.ndarray:
    """Converts a text string to a binary matrix using its SHA256 hash."""
    if not isinstance(text_input, str) or not text_input.strip():
        return np.zeros(size, dtype=int)

    hash_obj = hashlib.sha256(text_input.encode('utf-8', 'ignore'))
    hash_digest = hash_obj.hexdigest()
    required_bits = size[0] * size[1]
    binary_string_from_hash = bin(int(hash_digest, 16))[2:].zfill(256)

    binary_string_for_matrix = binary_string_from_hash[:required_bits] if required_bits <= 256 else binary_string_from_hash.ljust(required_bits, '0')

    bits_for_matrix = [int(b) for b in binary_string_for_matrix]
    return np.array(bits_for_matrix).reshape(size)

def compute_bdm_for_text(text_input: str, bdm_instance: BDM, matrix_s: tuple = MATRIX_SIZE_GLOBAL) -> float:
    """Computes BDM for a given text string using a prefix for hashing."""
    if not isinstance(text_input, str) or not text_input.strip() :
        return 0.0

    text_for_hash = text_input[:MAX_TEXT_FOR_BDM_HASH] if len(text_input) > MAX_TEXT_FOR_BDM_HASH else text_input
    if not text_for_hash.strip():
        return 0.0

    binary_matrix = text_to_binary_matrix(text_for_hash, size=matrix_s)
    try:
        bdm_value = bdm_instance.bdm(binary_matrix)
        return bdm_value
    except Exception as e_bdm:
        print(f"      Error during BDM calculation for text (full len {len(text_input)}, hashed part len {len(text_for_hash)}): {e_bdm}")
        return -1.0 # Indicate error

def calculate_L_H_token_based_structured(structured_motifs_list: List[Dict]) -> float:
    """Calculates L(H) - the cost of defining the list of structured motifs."""
    if not structured_motifs_list:
        return 0.0

    total_lh_cost = 0.0
    for motif_obj in structured_motifs_list:
        if not isinstance(motif_obj, dict):
            continue

        current_motif_lh = 0.0

        label_str = motif_obj.get('label', "")
        if isinstance(label_str, str) and label_str.strip(): # Label must exist
            current_motif_lh += MOTIF_SYMBOLIC_LABEL_COST

        description_str = motif_obj.get('description', "")
        if isinstance(description_str, str) and description_str.strip(): # Description can be empty but contributes if present
            current_motif_lh += MOTIF_DESCRIPTION_TEXT_BASE_COST
            current_motif_lh += len(tokenize_phrase(description_str)) * MOTIF_DESCRIPTION_TOKEN_COST # tokenize_phrase from Cell 2

        surface_forms_list = motif_obj.get('surface_forms', [])
        if isinstance(surface_forms_list, list) and surface_forms_list:
            valid_sfs_for_lh = [sf for sf in surface_forms_list if isinstance(sf, str) and sf.strip()]
            if valid_sfs_for_lh: # Only add base cost if there are actual SFs
                current_motif_lh += MOTIF_SURFACE_FORMS_LIST_BASE_COST
                for sf_str in valid_sfs_for_lh:
                    current_motif_lh += len(tokenize_phrase(sf_str)) * MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH

        total_lh_cost += current_motif_lh
    return total_lh_cost

def llm_compress_text_structured(text_to_compress: str, structured_motifs_list: List[Dict]) -> str:
    """Compresses text by replacing occurrences of motif surface forms with their symbolic labels."""
    if not isinstance(text_to_compress, str):
        return "" # Return empty if input is not string
    if not structured_motifs_list:
        return text_to_compress.lower()

    compressed_text = text_to_compress.lower()

    for motif_obj in structured_motifs_list:
        if not isinstance(motif_obj, dict):
            continue

        label = motif_obj.get('label', None)
        surface_forms = motif_obj.get('surface_forms', []) # SFs should be already lowercased by consolidation/filtering

        if not (isinstance(label, str) and label.strip()) or \
           not (isinstance(surface_forms, list) and surface_forms):
            continue # Skip motif if label is bad or no surface forms

        placeholder = label

        # Surface forms should be already lowercased and stripped from previous processing steps
        # Sort this motif's own surface forms by length (descending) for greedy matching
        sorted_sfs_for_this_motif = sorted(
            [sf for sf in surface_forms if isinstance(sf, str) and sf.strip()], # Ensure SFs are strings
            key=len,
            reverse=True
        )

        for sf_str_lower in sorted_sfs_for_this_motif: # Assumes sf_str is already lowercase
            try:
                # Use word boundaries for more precise replacement
                compressed_text = re.sub(r'\b' + re.escape(sf_str_lower) + r'\b', placeholder, compressed_text)
            except re.error as re_e:
                print(f"    Regex error during compression for SF '{sf_str_lower}' of motif '{label}': {re_e}. Skipping this SF.")
                continue
    return compressed_text

def compute_mdl_cost_for_text_block(
    full_qid_corpus_str: str,
    final_motifs_to_evaluate: List[Dict],
    bdm_instance: BDM,
    matrix_s: tuple = MATRIX_SIZE_GLOBAL
    ) -> tuple[float, float, float]:
    """Computes L(H), L(D|H), and Total MDL for a text block given a final set of motifs."""
    if not isinstance(full_qid_corpus_str, str):
        full_qid_corpus_str = ""

    l_h = calculate_L_H_token_based_structured(final_motifs_to_evaluate)

    compressed_text_block = llm_compress_text_structured(full_qid_corpus_str, final_motifs_to_evaluate)
    l_d_h = compute_bdm_for_text(compressed_text_block, bdm_instance, matrix_s)

    if l_d_h < 0: # Indicates a BDM computation error
        return l_h, -1.0, -1.0

    total_mdl_cost = l_h + l_d_h
    return l_h, l_d_h, total_mdl_cost

print("Cell 5: MDL Calculation Utilities loaded.")

Cell 5: MDL Calculation Utilities loaded.


In [10]:
# @title Cell 6: Main Pipeline Orchestration

# Assumes functions from previous cells (Cells 1-5 content) are defined or imported:
# From Cell 1: All configuration constants (BASE_PROJECT_DIR, LOCAL_LLM_MODEL_ID, etc.)
# From Cell 2: preprocess_corpus_for_motif_extraction, count_sf_occurrences
# From Cell 3: initialize_llm_pipeline (this should be defined here if not imported)
# From Cell 4: get_motifs_for_qid_batched, consolidate_raw_motifs, filter_surface_forms_by_global_frequency
# From Cell 5: initialize_bdm_instance, compute_bdm_for_text, compute_mdl_cost_for_text_block

# For clarity in a notebook, if you are truly running these as separate cells,
# you might add specific imports at the top of this cell from the "modules" like:
# from config import *
# from text_utils import preprocess_corpus_for_motif_extraction, count_sf_occurrences
# from llm_interaction import initialize_llm_pipeline # (and other llm functions if get_motifs_for_qid_batched was also there)
# from motif_processing import get_motifs_for_qid_batched, consolidate_raw_motifs, filter_surface_forms_by_global_frequency
# from mdl_calculations import initialize_bdm_instance, compute_bdm_for_text, compute_mdl_cost_for_text_block

# However, if all previous cells were run in order, their functions/constants are in scope.

def main():
    # --- Initial Setup and Welcome Message ---
    script_version_name = "Single Cell MWP (Reverted Simpler Prompt v5, Label Fix, Global SF Filter)"
    print(f"--- {script_version_name} ---")
    print(f"Timestamp: {time.asctime()}")
    print("\n--- Configuration Summary ---")
    print(f"LLM Model: {LOCAL_LLM_MODEL_ID}, Quantization: {USE_QUANTIZATION_FOR_LOCAL_LLM}")
    print(f"LLM Batch Size (Responses): {LLM_BATCH_SIZE_RESPONSES}, Retries: {LLM_RETRY_ATTEMPTS}")
    print(f"Max Text Chars per LLM Prompt Chunk: {MAX_TEXT_CHARS_PER_LLM_PROMPT_CHUNK}")
    print(f"Max New Tokens for LLM Motif Extraction: {LLM_MAX_NEW_TOKENS_MOTIF_EXTRACTION}")
    print(f"Max Motifs to Request per Chunk: {MAX_MOTIFS_PER_CHUNK}")
    print(f"L(H) Costs: Label={MOTIF_SYMBOLIC_LABEL_COST}, DescBase={MOTIF_DESCRIPTION_TEXT_BASE_COST}, DescToken={MOTIF_DESCRIPTION_TOKEN_COST}, SFListBase={MOTIF_SURFACE_FORMS_LIST_BASE_COST}, SFTokenInLH={MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH}")
    print(f"Global SF Filtering Min Freq: {MIN_SF_FREQUENCY_IN_FULL_CORPUS}")
    print(f"BDM Hash Prefix Length: {MAX_TEXT_FOR_BDM_HASH}, BDM Matrix: {MATRIX_SIZE_GLOBAL}")
    print(f"Debug Log File: {LLM_DEBUG_LOG_FILE}")
    print("--- End Configuration Summary ---\n")

    # --- Initialize Debug Log File ---
    try:
        with open(LLM_DEBUG_LOG_FILE, "w", encoding="utf-8") as f: # Overwrite for new run
            f.write(f"LLM Motif Debug Log - Run Started: {time.asctime()}\n")
            f.write(f"Script Version: {script_version_name}\n")
            f.write(f"Model ID: {LOCAL_LLM_MODEL_ID}\n")
            f.write(f"Pipeline Config: return_full_text=False (Implicit in current setup)\n")
            f.write(f"Prompt Strategy: Reverted Simpler Prompt with Label Fix Attempt Active\n---\n")
    except Exception as e_log:
        print(f"WARN: Could not initialize debug log file {LLM_DEBUG_LOG_FILE}: {e_log}")

    # --- Initialize LLM and BDM ---
    # These functions are expected to be defined (e.g., from Cell 3 and Cell 5 content)
    hf_pipeline_instance, hf_tokenizer_instance = initialize_llm_pipeline()
    if not hf_pipeline_instance or not hf_tokenizer_instance:
        print("CRITICAL: Exiting due to LLM pipeline initialization failure.")
        return

    bdm_instance_main = initialize_bdm_instance()
    if not bdm_instance_main:
        print("CRITICAL: Exiting due to BDM initialization failure.")
        return

    # --- Load Phase 2 Collated Data ---
    if not os.path.exists(P2_COLLATED_FILE):
        print(f"ERROR: Phase 2 output file not found: {P2_COLLATED_FILE}")
        return
    print(f"Loading Phase 2 output from: {P2_COLLATED_FILE}...")
    phase2_data_content = None
    try:
        with open(P2_COLLATED_FILE, 'r', encoding='utf-8') as f:
            phase2_data_content = json.load(f)
    except Exception as e_load:
        print(f"Error loading or parsing {P2_COLLATED_FILE}: {e_load}")
        return

    all_qid_mdl_results_list = []

    # --- Determine QIDs to Process ---
    aggregated_content_by_qid_from_file = phase2_data_content.get("aggregated_pdf_content_by_qid", {})
    if not aggregated_content_by_qid_from_file:
        print(f"No 'aggregated_pdf_content_by_qid' key found or data is empty in {P2_COLLATED_FILE}. Exiting.")
        return

    qids_to_process_this_run = []
    if P3_QIDS_TO_PROCESS_THEMATICALLY and \
       isinstance(P3_QIDS_TO_PROCESS_THEMATICALLY, list) and \
       P3_QIDS_TO_PROCESS_THEMATICALLY: # Check if the list is not empty

        qids_to_process_this_run = [
            qid for qid in P3_QIDS_TO_PROCESS_THEMATICALLY
            if qid in aggregated_content_by_qid_from_file
        ]
        if not qids_to_process_this_run:
            print(f"Warning: None of the specified QIDs {P3_QIDS_TO_PROCESS_THEMATICALLY} were found in the loaded data's QIDs. Exiting.")
            return
    else:
        qids_to_process_limit_fallback = 1
        print(f"P3_QIDS_TO_PROCESS_THEMATICALLY not set or empty. Processing up to {qids_to_process_limit_fallback} QID(s) from data as a fallback.")
        qids_to_process_this_run = list(aggregated_content_by_qid_from_file.keys())[:qids_to_process_limit_fallback]
        if not qids_to_process_this_run:
            print("No QIDs found in data to process based on the fallback limit. Exiting.")
            return

    if not qids_to_process_this_run: # Final safeguard
        print("No QIDs selected for processing after all checks. Exiting.")
        return
    print(f"\nMDL analysis will run for QIDs: {qids_to_process_this_run}\n")

    # --- Main QID Processing Loop ---
    for qid_identifier_str in qids_to_process_this_run:
        print(f"--- Analyzing Data for QID: {qid_identifier_str} ---")

        list_of_individual_response_structs = aggregated_content_by_qid_from_file.get(qid_identifier_str, [])
        actual_response_texts_for_qid = [
            item.get("text", "") for item in list_of_individual_response_structs
            if isinstance(item, dict) and isinstance(item.get("text"), str) and item.get("text","").strip()
        ]
        if not actual_response_texts_for_qid:
            print(f"  No valid text strings extracted from responses for QID {qid_identifier_str}. Skipping.");
            print("-" * 50); continue

        full_corpus_text_for_qid = "\n\n<RSP_SEP>\n\n".join(actual_response_texts_for_qid)

        if len(full_corpus_text_for_qid.strip()) < 100:
            print(f"  Skipping QID {qid_identifier_str}: combined text too short ({len(full_corpus_text_for_qid)} chars).")
            print("-" * 50); continue

        num_total_responses_for_qid = len(actual_response_texts_for_qid)
        print(f"  Corpus for QID {qid_identifier_str}: {len(full_corpus_text_for_qid)} chars, {num_total_responses_for_qid} responses.")

        baseline_bdm_original_corpus = compute_bdm_for_text(full_corpus_text_for_qid, bdm_instance_main, MATRIX_SIZE_GLOBAL)
        if baseline_bdm_original_corpus < 0:
            print(f"  Error computing baseline BDM for QID {qid_identifier_str}. Skipping this QID.")
            # Log an error entry
            error_entry = {"qid": qid_identifier_str, "status": "ERROR_BASELINE_BDM", "baseline_mdl": -1.0}
            all_qid_mdl_results_list.append(error_entry)
            print("-" * 50); continue

        current_qid_baseline_mdl_cost = baseline_bdm_original_corpus
        print(f"  Baseline MDL for QID {qid_identifier_str} (L(D_orig)): {current_qid_baseline_mdl_cost:.4f}")

        # Get raw motifs (these have passed initial JSON parsing and schema validation from parse_and_validate_llm_json_response)
        # get_motifs_for_qid_batched is from Cell 4 equivalent
        raw_motifs_from_chunks = get_motifs_for_qid_batched(
            actual_response_texts_for_qid,
            LLM_BATCH_SIZE_RESPONSES,
            hf_pipeline_instance,
            hf_tokenizer_instance,
            qid_identifier_str
        )

        current_qid_result_entry = { # Initialize result dict for this QID
            "qid": qid_identifier_str,
            "corpus_len_chars": len(full_corpus_text_for_qid),
            "num_responses": num_total_responses_for_qid,
            "baseline_mdl": current_qid_baseline_mdl_cost,
            "final_refined_motifs": [],
            "l_h_final_motifs": 0.0,
            "l_d_h_final_motifs": current_qid_baseline_mdl_cost,
            "total_mdl_with_final_motifs": current_qid_baseline_mdl_cost,
            "compression_achieved": 0.0,
            "num_raw_motifs_extracted": len(raw_motifs_from_chunks),
            "num_consolidated_motifs": 0,
            "num_globally_refined_motifs": 0
        }

        if not raw_motifs_from_chunks:
            print(f"  No raw motifs extracted by LLM for QID {qid_identifier_str} from any chunk.")
            all_qid_mdl_results_list.append(current_qid_result_entry)
            print("-" * 50); continue

        print(f"  Extracted {len(raw_motifs_from_chunks)} raw motif objects from LLM for QID {qid_identifier_str} (across all chunks).")

        # Consolidate motifs (from Cell 4 equivalent)
        consolidated_motifs_list = consolidate_raw_motifs(raw_motifs_from_chunks)
        current_qid_result_entry["num_consolidated_motifs"] = len(consolidated_motifs_list)
        print(f"  Consolidated into {len(consolidated_motifs_list)} unique motifs (by label) for QID {qid_identifier_str}.")

        if not consolidated_motifs_list:
            print(f"  No unique motifs left after consolidation for QID {qid_identifier_str}.")
            all_qid_mdl_results_list.append(current_qid_result_entry)
            print("-" * 50); continue

        # print(f"  Consolidated Motifs for QID {qid_identifier_str} (BEFORE Global SF refinement):")
        # for idx, mo_con in enumerate(consolidated_motifs_list):
        #     print(f"    Cons. Motif {idx+1}: L='{mo_con.get('label')}', D='{mo_con.get('description','N/A')[:30]}...', SFs({len(mo_con.get('surface_forms',[]))})='{mo_con.get('surface_forms',[])[:2]}...'")

        # Globally filter surface forms (from Cell 4 equivalent)
        globally_refined_motifs = filter_surface_forms_by_global_frequency(
            consolidated_motifs_list,
            full_corpus_text_for_qid, # Filter against the full original corpus for this QID
            min_global_freq=MIN_SF_FREQUENCY_IN_FULL_CORPUS
        )
        current_qid_result_entry["num_globally_refined_motifs"] = len(globally_refined_motifs)
        print(f"  Globally refined into {len(globally_refined_motifs)} motifs for QID {qid_identifier_str}.")

        if not globally_refined_motifs:
            print(f"  No motifs left after GLOBAL surface form frequency refinement for QID {qid_identifier_str}.")
            all_qid_mdl_results_list.append(current_qid_result_entry)
            print("-" * 50); continue

        print(f"  Final Globally Refined Motifs for QID {qid_identifier_str} (for MDL eval):")
        for idx, mo_final in enumerate(globally_refined_motifs):
            print(f"    Refined Motif {idx+1}: L='{mo_final.get('label')}', D='{mo_final.get('description','N/A')[:60]}...', SFs({len(mo_final.get('surface_forms',[]))})='{mo_final.get('surface_forms',[])}'")

        # Final MDL Calculation (from Cell 5 equivalent)
        l_h_final_val, l_d_h_final_val, total_mdl_final_val = compute_mdl_cost_for_text_block(
            full_corpus_text_for_qid,
            globally_refined_motifs,
            bdm_instance_main,
            MATRIX_SIZE_GLOBAL
        )

        current_qid_result_entry["final_refined_motifs"] = globally_refined_motifs
        current_qid_result_entry["l_h_final_motifs"] = l_h_final_val

        if l_d_h_final_val < 0: # BDM error
            print(f"  Error computing MDL cost with final refined motifs for QID {qid_identifier_str} (BDM error in L(D|H)).")
            current_qid_result_entry.update({"l_d_h_final_motifs": -1.0, "total_mdl_with_final_motifs": -1.0, "compression_achieved": "BDM_ERROR"})
        else:
            current_qid_result_entry["l_d_h_final_motifs"] = l_d_h_final_val
            current_qid_result_entry["total_mdl_with_final_motifs"] = total_mdl_final_val
            compression_final_val = current_qid_baseline_mdl_cost - total_mdl_final_val
            current_qid_result_entry["compression_achieved"] = compression_final_val

        all_qid_mdl_results_list.append(current_qid_result_entry) # Append result for this QID

        # Print QID-specific MDL outcome
        if l_d_h_final_val >=0 :
            print(f"  L(H) final motifs: {l_h_final_val:.4f}")
            print(f"  L(D|H) compressed full corpus: {l_d_h_final_val:.4f}")
            print(f"  Total MDL cost with final motifs: {total_mdl_final_val:.4f}")
            compression_val = current_qid_result_entry["compression_achieved"]
            result_status_str = f"SUCCESS: Comp: {compression_val:.4f}" if isinstance(compression_val, float) and compression_val > 0.0001 else f"NOTE: No sig. comp. Diff: {compression_val if isinstance(compression_val, str) else compression_val:.4f}"
            print(f"  {result_status_str}")
        print("-" * 50)

    # --- Summary Printing and Saving Results ---
    print("\n--- Overall QID-based MDL Analysis Summary ---")
    if not all_qid_mdl_results_list:
        print("No QIDs were processed or no valid results were generated to summarize.")
    else:
        valid_results_for_stats = [
            r for r in all_qid_mdl_results_list
            if isinstance(r.get('compression_achieved'), float) and r.get('l_h_final_motifs', -1.0) >= 0
        ]
        num_qids_processed_total = len(all_qid_mdl_results_list)
        num_qids_with_valid_mdl_calc = len(valid_results_for_stats)
        num_qids_achieving_compression = sum(1 for r in valid_results_for_stats if r['compression_achieved'] > 0.0001)

        print(f"Total QIDs targeted for analysis: {len(qids_to_process_this_run)}")
        print(f"Total QID result entries logged: {num_qids_processed_total}")
        print(f"Number of QIDs with valid MDL calculations: {num_qids_with_valid_mdl_calc}")
        print(f"Number of QIDs where compression was achieved: {num_qids_achieving_compression}")

        if num_qids_achieving_compression > 0:
            successful_compressions_values = [r['compression_achieved'] for r in valid_results_for_stats if r['compression_achieved'] > 0.0001]
            avg_compression_val = np.mean(successful_compressions_values)
            max_compression_val = np.max(successful_compressions_values)
            print(f"  Average compression (for successful cases): {avg_compression_val:.4f}")
            print(f"  Maximum compression achieved across QIDs: {max_compression_val:.4f}")
        else:
            print("  No compression achieved for any QID in this run.")

        output_filename_final = os.path.join(BASE_PROJECT_DIR, "mdl_analysis_single_cell_reverted_prompt_v5_final.json")
        try:
            with open(output_filename_final, "w", encoding="utf-8") as f_out:
                json.dump(all_qid_mdl_results_list, f_out, indent=2, ensure_ascii=False)
            print(f"Detailed QID-based results saved to {output_filename_final}")
        except Exception as e_save:
            print(f"Error saving QID-based results to {output_filename_final}: {e_save}")

if __name__ == "__main__":
    print(f"Executing main MDL pipeline (Single Cell Reverted Prompt V5 with Label Fix & UnboundLocalError fix) at {time.asctime()}...")
    main()
    print(f"Main MDL pipeline execution finished at {time.asctime()}.")

Executing main MDL pipeline (Single Cell Reverted Prompt V5 with Label Fix & UnboundLocalError fix) at Sun Jun  1 12:34:21 2025...
--- Single Cell MWP (Reverted Simpler Prompt v5, Label Fix, Global SF Filter) ---
Timestamp: Sun Jun  1 12:34:21 2025

--- Configuration Summary ---
LLM Model: google/gemma-2b-it, Quantization: True
LLM Batch Size (Responses): 5, Retries: 2
Max Text Chars per LLM Prompt Chunk: 7000
Max New Tokens for LLM Motif Extraction: 700
Max Motifs to Request per Chunk: 5
L(H) Costs: Label=0.5, DescBase=0.5, DescToken=0.1, SFListBase=0.25, SFTokenInLH=0.1
Global SF Filtering Min Freq: 2
BDM Hash Prefix Length: 2000, BDM Matrix: (8, 8)
Debug Log File: /content/drive/MyDrive/Colab Notebooks/Legal/llm_motif_debug_log_refactored_v1.txt
--- End Configuration Summary ---

--- Initializing LLM Pipeline (model: google/gemma-2b-it, quantization: True, return_full_text: False) ---
Using device: cuda
Loading tokenizer for google/gemma-2b-it...


tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

BitsAndBytesConfig created for google/gemma-2b-it, compute_dtype: torch.bfloat16.
Loading local model google/gemma-2b-it (Quantization active: True)...


config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Device set to use cuda:0


Local LLM pipeline for google/gemma-2b-it initialized successfully.
Initializing BDM instance...
BDM instance initialized successfully (ndim=2, default CTM-based).
Loading Phase 2 output from: /content/drive/MyDrive/Colab Notebooks/Legal/Phase2_PDF_Collated_Texts/phase2_collated_pdf_texts.json...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



MDL analysis will run for QIDs: ['Q4']

--- Analyzing Data for QID: Q4 ---
  Corpus for QID Q4: 129501 chars, 209 responses.
  Baseline MDL for QID Q4 (L(D_orig)): 121.3693
  QID Q4: Processing 209 responses in 42 preprocessed chunks (batch size: 5 responses).
    Analyzing chunk 1/42 for QID Q4 (processed chunk len: 3158 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 5 structured motif objects from chunk 1 (QID Q4).
    Analyzing chunk 2/42 for QID Q4 (processed chunk len: 3262 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 5 structured motif objects from chunk 2 (QID Q4).
    Analyzing chunk 3/42 for QID Q4 (processed chunk len: 3219 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 3 (QID Q4).
    Analyzing chunk 4/42 for QID Q4 (processed chunk len: 3134 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 4 (QID Q4).
    Analyzing chunk 5/42 for QID Q4 (processed chunk len: 3142 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 5 (QID Q4).
    Analyzing chunk 6/42 for QID Q4 (processed chunk len: 2924 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 6 (QID Q4).
    Analyzing chunk 7/42 for QID Q4 (processed chunk len: 3325 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


    [WARN] Invalid motif object structure after label processing for QID Q4, Chunk 7, Item 3. Skipping item.
      Extracted 3 structured motif objects from chunk 7 (QID Q4).
    Analyzing chunk 8/42 for QID Q4 (processed chunk len: 2977 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 8 (QID Q4).
    Analyzing chunk 9/42 for QID Q4 (processed chunk len: 3084 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 9 (QID Q4).
    Analyzing chunk 10/42 for QID Q4 (processed chunk len: 3218 chars)...


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 5 structured motif objects from chunk 10 (QID Q4).
    Analyzing chunk 11/42 for QID Q4 (processed chunk len: 2930 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 11 (QID Q4).
    Analyzing chunk 12/42 for QID Q4 (processed chunk len: 3101 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 12 (QID Q4).
    Analyzing chunk 13/42 for QID Q4 (processed chunk len: 3277 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 5 structured motif objects from chunk 13 (QID Q4).
    Analyzing chunk 14/42 for QID Q4 (processed chunk len: 3023 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 14 (QID Q4).
    Analyzing chunk 15/42 for QID Q4 (processed chunk len: 3354 chars)...
    [WARN] Motif JSON parsing or core structure validation failed for QID Q4, Chunk 15: Expecting ',' delimiter: line 16 column 3 (char 1787)
      Motif parsing/validation attempt 1 yielded no structured motifs for chunk 15 (QID Q4). Retrying if possible...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


    [WARN] Motif JSON parsing or core structure validation failed for QID Q4, Chunk 15: Expecting ',' delimiter: line 16 column 3 (char 1787)
      Motif parsing/validation attempt 2 yielded no structured motifs for chunk 15 (QID Q4). Retrying if possible...
      No valid structured motifs extracted from chunk 15 (QID Q4) after 2 attempts.
    Analyzing chunk 16/42 for QID Q4 (processed chunk len: 3108 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 16 (QID Q4).
    Analyzing chunk 17/42 for QID Q4 (processed chunk len: 3043 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 17 (QID Q4).
    Analyzing chunk 18/42 for QID Q4 (processed chunk len: 3019 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 18 (QID Q4).
    Analyzing chunk 19/42 for QID Q4 (processed chunk len: 2844 chars)...
      Motif parsing/validation attempt 1 yielded no structured motifs for chunk 19 (QID Q4). Retrying if possible...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Motif parsing/validation attempt 2 yielded no structured motifs for chunk 19 (QID Q4). Retrying if possible...
      No valid structured motifs extracted from chunk 19 (QID Q4) after 2 attempts.
    Analyzing chunk 20/42 for QID Q4 (processed chunk len: 2528 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 20 (QID Q4).
    Analyzing chunk 21/42 for QID Q4 (processed chunk len: 2857 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 21 (QID Q4).
    Analyzing chunk 22/42 for QID Q4 (processed chunk len: 3198 chars)...
    [WARN] Motif JSON parsing or core structure validation failed for QID Q4, Chunk 22: Expecting value: line 6 column 3 (char 371)
      Motif parsing/validation attempt 1 yielded no structured motifs for chunk 22 (QID Q4). Retrying if possible...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


    [WARN] Motif JSON parsing or core structure validation failed for QID Q4, Chunk 22: Expecting value: line 6 column 3 (char 371)
      Motif parsing/validation attempt 2 yielded no structured motifs for chunk 22 (QID Q4). Retrying if possible...
      No valid structured motifs extracted from chunk 22 (QID Q4) after 2 attempts.
    Analyzing chunk 23/42 for QID Q4 (processed chunk len: 3311 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 23 (QID Q4).
    Analyzing chunk 24/42 for QID Q4 (processed chunk len: 3323 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 5 structured motif objects from chunk 24 (QID Q4).
    Analyzing chunk 25/42 for QID Q4 (processed chunk len: 2882 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 5 structured motif objects from chunk 25 (QID Q4).
    Analyzing chunk 26/42 for QID Q4 (processed chunk len: 2450 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 26 (QID Q4).
    Analyzing chunk 27/42 for QID Q4 (processed chunk len: 2412 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 27 (QID Q4).
    Analyzing chunk 28/42 for QID Q4 (processed chunk len: 2441 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 5 structured motif objects from chunk 28 (QID Q4).
    Analyzing chunk 29/42 for QID Q4 (processed chunk len: 2999 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 5 structured motif objects from chunk 29 (QID Q4).
    Analyzing chunk 30/42 for QID Q4 (processed chunk len: 3353 chars)...
    [WARN] Motif JSON parsing or core structure validation failed for QID Q4, Chunk 30: Expecting ',' delimiter: line 24 column 71 (char 1483)
      Motif parsing/validation attempt 1 yielded no structured motifs for chunk 30 (QID Q4). Retrying if possible...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


    [WARN] Motif JSON parsing or core structure validation failed for QID Q4, Chunk 30: Expecting ',' delimiter: line 24 column 71 (char 1483)
      Motif parsing/validation attempt 2 yielded no structured motifs for chunk 30 (QID Q4). Retrying if possible...
      No valid structured motifs extracted from chunk 30 (QID Q4) after 2 attempts.
    Analyzing chunk 31/42 for QID Q4 (processed chunk len: 3095 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 31 (QID Q4).
    Analyzing chunk 32/42 for QID Q4 (processed chunk len: 3258 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 5 structured motif objects from chunk 32 (QID Q4).
    Analyzing chunk 33/42 for QID Q4 (processed chunk len: 2497 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 33 (QID Q4).
    Analyzing chunk 34/42 for QID Q4 (processed chunk len: 2522 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 34 (QID Q4).
    Analyzing chunk 35/42 for QID Q4 (processed chunk len: 3148 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 35 (QID Q4).
    Analyzing chunk 36/42 for QID Q4 (processed chunk len: 3052 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 36 (QID Q4).
    Analyzing chunk 37/42 for QID Q4 (processed chunk len: 3303 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 5 structured motif objects from chunk 37 (QID Q4).
    Analyzing chunk 38/42 for QID Q4 (processed chunk len: 3141 chars)...
      Motif parsing/validation attempt 1 yielded no structured motifs for chunk 38 (QID Q4). Retrying if possible...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Motif parsing/validation attempt 2 yielded no structured motifs for chunk 38 (QID Q4). Retrying if possible...
      No valid structured motifs extracted from chunk 38 (QID Q4) after 2 attempts.
    Analyzing chunk 39/42 for QID Q4 (processed chunk len: 3279 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 5 structured motif objects from chunk 39 (QID Q4).
    Analyzing chunk 40/42 for QID Q4 (processed chunk len: 3326 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 5 structured motif objects from chunk 40 (QID Q4).
    Analyzing chunk 41/42 for QID Q4 (processed chunk len: 3164 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 5 structured motif objects from chunk 41 (QID Q4).
    Analyzing chunk 42/42 for QID Q4 (processed chunk len: 2617 chars)...
      Extracted 5 structured motif objects from chunk 42 (QID Q4).
  Extracted 161 raw motif objects from LLM for QID Q4 (across all chunks).


NameError: name 'current_s_set' is not defined

## Old to New MWP

In [None]:
# @title Enhanced Prompt Single-Cell MWP
# --- Imports ---
import json
import os
import hashlib
import numpy as np
from pybdm import BDM
import re
import time
from typing import List, Dict, Set
from collections import Counter

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline

# --- Configuration ---
# !!! IMPORTANT: UPDATE BASE_PROJECT_DIR TO YOUR ACTUAL PATH !!!
BASE_PROJECT_DIR = '/content/drive/MyDrive/Colab Notebooks/Legal/'
# BASE_PROJECT_DIR = './' # For local testing if files are relative

PHASE2_OUTPUT_DIR = os.path.join(BASE_PROJECT_DIR, 'Phase2_PDF_Collated_Texts/')
P2_COLLATED_FILE = os.path.join(PHASE2_OUTPUT_DIR, 'phase2_collated_pdf_texts.json')

P3_QIDS_TO_PROCESS_THEMATICALLY = ["Q4"]

MATRIX_SIZE_GLOBAL = (8, 8)
LOCAL_LLM_MODEL_ID = 'google/gemma-2b-it'
USE_QUANTIZATION_FOR_LOCAL_LLM = True
LLM_BATCH_SIZE_RESPONSES = 5
LLM_RETRY_ATTEMPTS = 2
MAX_TEXT_CHARS_PER_LLM_PROMPT_CHUNK = 7000
LLM_MAX_NEW_TOKENS_MOTIF_EXTRACTION = 700
MAX_MOTIFS_PER_CHUNK = 5 # Max motifs to ask LLM per chunk
MAX_TEXT_FOR_BDM_HASH = 2000 # Max characters from text to use for BDM hash input


MOTIF_SYMBOLIC_LABEL_COST = 0.5
MOTIF_DESCRIPTION_TEXT_BASE_COST = 0.5
MOTIF_DESCRIPTION_TOKEN_COST = 0.1
MOTIF_SURFACE_FORMS_LIST_BASE_COST = 0.25
MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH = 0.1

MIN_SF_FREQUENCY_IN_FULL_CORPUS = 2

LLM_DEBUG_LOG_FILE = os.path.join(BASE_PROJECT_DIR, "llm_motif_debug_log_single_cell_v5_label_fix.txt")

# --- Helper Function Definitions ---

def tokenize_phrase(phrase_text: str) -> List[str]:
    if not isinstance(phrase_text, str) or not phrase_text.strip(): return []
    return phrase_text.lower().split()

def calculate_L_H_token_based_structured(structured_motifs_list: List[Dict]) -> float:
    # (Same as before - calculates L(H) based on config constants and motif structure)
    if not structured_motifs_list: return 0.0
    total_lh_cost = 0.0
    for motif_obj in structured_motifs_list:
        if not isinstance(motif_obj, dict): continue
        current_motif_lh = 0.0
        label_str = motif_obj.get('label', "")
        if isinstance(label_str, str) and label_str.strip():
            current_motif_lh += MOTIF_SYMBOLIC_LABEL_COST
        description_str = motif_obj.get('description', "")
        if isinstance(description_str, str) and description_str.strip():
            current_motif_lh += MOTIF_DESCRIPTION_TEXT_BASE_COST
            current_motif_lh += len(tokenize_phrase(description_str)) * MOTIF_DESCRIPTION_TOKEN_COST
        surface_forms_list = motif_obj.get('surface_forms', [])
        if isinstance(surface_forms_list, list) and surface_forms_list:
            valid_sfs_for_lh = [sf for sf in surface_forms_list if isinstance(sf, str) and sf.strip()]
            if valid_sfs_for_lh:
                current_motif_lh += MOTIF_SURFACE_FORMS_LIST_BASE_COST
                for sf_str in valid_sfs_for_lh:
                    current_motif_lh += len(tokenize_phrase(sf_str)) * MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH
        total_lh_cost += current_motif_lh
    return total_lh_cost

def llm_compress_text_structured(text_to_compress: str, structured_motifs_list: List[Dict]) -> str:
    # (Same as before - compresses text using motif labels for surface forms)
    if not isinstance(text_to_compress, str): return ""
    if not structured_motifs_list: return text_to_compress.lower()
    compressed_text = text_to_compress.lower()
    for motif_obj in structured_motifs_list:
        if not isinstance(motif_obj, dict): continue
        label = motif_obj.get('label', None)
        surface_forms = motif_obj.get('surface_forms', [])
        if not (isinstance(label, str) and label.strip()) or \
           not (isinstance(surface_forms, list) and surface_forms):
            continue
        placeholder = label
        sorted_sfs_for_this_motif = sorted(
            [sf for sf in surface_forms if isinstance(sf, str) and sf.strip()], key=len, reverse=True
        )
        for sf_str in sorted_sfs_for_this_motif:
            sf_lower = sf_str.lower()
            try:
                compressed_text = re.sub(r'\b' + re.escape(sf_lower) + r'\b', placeholder, compressed_text)
            except re.error as re_e:
                print(f"    Regex error during compression for SF '{sf_str}' of motif '{label}': {re_e}. Skipping.")
                continue
    return compressed_text

def text_to_binary_matrix(text_input: str, size: tuple = MATRIX_SIZE_GLOBAL) -> np.ndarray:
    # (Same as before)
    if not isinstance(text_input, str) or not text_input.strip(): return np.zeros(size, dtype=int)
    hash_obj = hashlib.sha256(text_input.encode('utf-8', 'ignore'))
    hash_digest = hash_obj.hexdigest()
    required_bits = size[0] * size[1]
    binary_string_from_hash = bin(int(hash_digest, 16))[2:].zfill(256)
    binary_string_for_matrix = binary_string_from_hash[:required_bits] if required_bits <= 256 else binary_string_from_hash.ljust(required_bits, '0')
    bits_for_matrix = [int(b) for b in binary_string_for_matrix]
    return np.array(bits_for_matrix).reshape(size)

def compute_bdm_for_text(text_input: str, bdm_instance: BDM, matrix_s: tuple = MATRIX_SIZE_GLOBAL) -> float:
    # (Same as before)
    if not isinstance(text_input, str) or not text_input.strip() : return 0.0
    text_for_hash = text_input[:MAX_TEXT_FOR_BDM_HASH] if len(text_input) > MAX_TEXT_FOR_BDM_HASH else text_input
    if not text_for_hash.strip(): return 0.0
    binary_matrix = text_to_binary_matrix(text_for_hash, size=matrix_s)
    try:
        return bdm_instance.bdm(binary_matrix)
    except Exception as e_bdm:
        print(f"      Error during BDM calculation for text (len {len(text_input)}, hashed part len {len(text_for_hash)}): {e_bdm}")
        return -1.0

def compute_mdl_cost_for_text_block(full_qid_corpus_str: str,
                                    final_motifs_to_evaluate: List[Dict],
                                    bdm_instance: BDM,
                                    matrix_s: tuple = MATRIX_SIZE_GLOBAL) -> tuple[float, float, float]:
    # (Same as before)
    if not isinstance(full_qid_corpus_str, str) : full_qid_corpus_str = ""
    l_h = calculate_L_H_token_based_structured(final_motifs_to_evaluate)
    compressed_text_block = llm_compress_text_structured(full_qid_corpus_str, final_motifs_to_evaluate)
    l_d_h = compute_bdm_for_text(compressed_text_block, bdm_instance, matrix_s)
    if l_d_h < 0: return l_h, -1.0, -1.0
    return l_h, l_d_h, l_h + l_d_h

def preprocess_corpus_for_motif_extraction(text_corpus: str) -> str:
    # (Same as before)
    if not isinstance(text_corpus, str): return ""
    text = re.sub(r'\n{3,}', '\n\n', text_corpus)
    text = re.sub(r' {2,}', ' ', text)
    lines = text.split('\n')
    filtered_lines = [line.strip() for line in lines if len(line.strip()) > 10 or not line.strip()]
    return '\n'.join(filtered_lines)

def count_sf_occurrences(corpus_text: str, surface_form: str) -> int:
    # (Same as before)
    if not corpus_text or not surface_form or not isinstance(corpus_text, str) or not isinstance(surface_form, str):
        return 0
    try:
        return len(re.findall(re.escape(surface_form.lower()), corpus_text.lower(), flags=re.IGNORECASE))
    except re.error as e:
        print(f"    [WARN] Regex error in count_sf_occurrences for SF '{surface_form}': {e}")
        return 0

# --- LLM Interaction Functions ---
def build_llm_prompt_for_motifs(text_block_for_prompt: str, max_motifs_to_extract: int = MAX_MOTIFS_PER_CHUNK) -> str:
    """Reverted to the simpler prompt structure, hoping for better label format adherence."""
    if len(text_block_for_prompt) > MAX_TEXT_CHARS_PER_LLM_PROMPT_CHUNK:
        text_block_for_prompt = text_block_for_prompt[:MAX_TEXT_CHARS_PER_LLM_PROMPT_CHUNK]

    # This is the prompt from the "Old Successful Code Cell" that seemed to get bracketed labels
    prompt = f"""You will receive a set of comments from different people answering the same question.

Your task is to identify up to {max_motifs_to_extract} key recurring themes.

For each theme, provide:
- A short label like [DATA_PRIVACY]
- A 1-sentence description of the theme
- 2–3 short phrases that often appear in the text (surface forms)

Output MUST be a valid JSON list of objects, where each object has "label", "description", and "surface_forms" keys.
Example of one object in the list:
{{
  "label": "[EXAMPLE_LABEL]",
  "description": "A concise description of the example theme.",
  "surface_forms": ["short repeated phrase 1", "another short repeated phrase"]
}}
If no clear motifs are found, output an empty JSON list: `[]`.
Do not include any other text, explanations, or markdown code fences around the JSON.

Set of comments to analyze:
\"\"\"
{text_block_for_prompt}
\"\"\"

Valid JSON Output (ensure it's a list of objects, or an empty list [] if no themes):
"""
    return prompt.strip()

def call_local_llm_for_raw_response(
    prompt_content_for_user_turn: str,
    hf_pipeline_instance,
    hf_tokenizer_instance,
    qid_for_log: str,
    chunk_idx_for_log: int
    ) -> str:
    if not hf_pipeline_instance or not hf_tokenizer_instance:
        print(f"    ERROR (call_local_llm): LLM pipeline/tokenizer not initialized for QID {qid_for_log}, Chunk {chunk_idx_for_log}.")
        return ""
    messages_for_template = [{"role": "user", "content": prompt_content_for_user_turn}]
    try:
        prompt_formatted_for_llm = hf_tokenizer_instance.apply_chat_template(
            messages_for_template, tokenize=False, add_generation_prompt=True
        )
    except Exception as e_template:
        print(f"    ERROR (call_local_llm): Applying chat template failed for QID {qid_for_log}, Chunk {chunk_idx_for_log}: {e_template}")
        with open(LLM_DEBUG_LOG_FILE, "a", encoding="utf-8") as f:
            f.write(f"\n--- QID: {qid_for_log} --- CHUNK: {chunk_idx_for_log} (call_local_llm_for_raw_response) ---\nERROR APPLYING CHAT TEMPLATE: {e_template}\nUser prompt content (first 300 chars): {prompt_content_for_user_turn[:300]}...\n")
        return ""

    generation_args = { # Using args for deterministic output
        "max_new_tokens": LLM_MAX_NEW_TOKENS_MOTIF_EXTRACTION,
        "do_sample": False,
        "pad_token_id": hf_tokenizer_instance.pad_token_id
    }
    # print(f"    DEBUG (call_local_llm): QID {qid_for_log}, Chunk {chunk_idx_for_log}, Prompt len: {len(prompt_formatted_for_llm)}, GenArgs: {generation_args}")

    try:
        outputs = hf_pipeline_instance(prompt_formatted_for_llm, **generation_args)
        if outputs and isinstance(outputs, list) and len(outputs) > 0 and \
           outputs[0] and isinstance(outputs[0], dict) and 'generated_text' in outputs[0]:
            assistant_response_text = outputs[0]['generated_text'].strip()
            # print(f"    DEBUG (call_local_llm): QID {qid_for_log}, Chunk {chunk_idx_for_log}, Raw LLM Output:\n{assistant_response_text[:500]}...")
            return assistant_response_text
        else:
            print(f"    WARN (call_local_llm): LLM pipeline returned unexpected structure for QID {qid_for_log}, Chunk {chunk_idx_for_log}. Output: {outputs}")
            return ""
    except Exception as e_pipeline:
        print(f"    ERROR (call_local_llm): Exception during hf_pipeline call for QID {qid_for_log}, Chunk {chunk_idx_for_log}: {e_pipeline}")
        with open(LLM_DEBUG_LOG_FILE, "a", encoding="utf-8") as f:
            f.write(f"\n--- QID: {qid_for_log} --- CHUNK: {chunk_idx_for_log} (call_local_llm_for_raw_response) ---\nERROR DURING PIPELINE CALL: {e_pipeline}\nFormatted prompt (first 300 chars): {prompt_formatted_for_llm[:300]}...\n")
        return ""

def parse_and_validate_llm_json_response(
    llm_raw_response_text: str,
    qid_for_log:str,
    chunk_idx_for_log:int,
    prompt_sent_to_llm:str
    ) -> List[Dict]:
    json_str_candidate = llm_raw_response_text.strip()
    if json_str_candidate.startswith("```json"): json_str_candidate = json_str_candidate[len("```json"):].strip()
    if json_str_candidate.startswith("```"): json_str_candidate = json_str_candidate[len("```"):].strip()
    if json_str_candidate.endswith("```"): json_str_candidate = json_str_candidate[:-len("```")].strip()

    # print(f"    DEBUG (parse_validate): QID {qid_for_log}, Chunk {chunk_idx_for_log}, JSON candidate for parsing:\n{json_str_candidate[:500]}...")

    if not json_str_candidate or json_str_candidate.lower() == "[]" or "no_themes_found" in json_str_candidate.lower() or "no clear motifs" in json_str_candidate.lower():
        return []
    try:
        parsed_data = json.loads(json_str_candidate)
        if isinstance(parsed_data, dict): parsed_data = [parsed_data]
        if not isinstance(parsed_data, list): raise ValueError("Parsed JSON is not a list or single object.")

        valid_motifs_from_json = []
        for item_idx, item in enumerate(parsed_data):
            # --- Start: Added logic to handle empty item dict {} ---
            if not item: # If item is an empty dictionary
                # print(f"    DEBUG (parse_validate): Skipping empty item object {{}} for QID {qid_for_log}, Chunk {chunk_idx_for_log}, Item {item_idx+1}")
                continue
            # --- End: Added logic to handle empty item dict {} ---

            label_str_original = item.get('label', "") # Get original label
            label_str_processed = "" # This will store the potentially fixed label

            if isinstance(label_str_original, str) and label_str_original.strip():
                temp_label = label_str_original.strip()
                # Try to extract a bracketed part if it exists and there's other text
                match = re.search(r"(\[[A-Z0-9_]+\])", temp_label)
                if match and match.group(1) == temp_label: # The whole string is correctly bracketed
                    label_str_processed = temp_label
                elif match: # Found a bracketed part within a longer string
                    label_str_processed = match.group(1)
                    # print(f"    DEBUG (parse_validate): Extracted bracketed label '{label_str_processed}' from '{label_str_original}' for QID {qid_for_log}, Chunk {chunk_idx_for_log}, Item {item_idx+1}")
                elif not (temp_label.startswith('[') and temp_label.endswith(']')):
                    # If no brackets or not solely bracketed, attempt to sanitize and add them
                    sanitized_content = re.sub(r'\s+', '_', temp_label)
                    sanitized_content = re.sub(r'[^\w_]', '', sanitized_content).upper()
                    # Take first 3 "words" (snake_case parts) for label if too long
                    sanitized_content = "_".join(sanitized_content.split('_')[:3])
                    if sanitized_content:
                        label_str_processed = f"[{sanitized_content}]"
                        # print(f"    DEBUG (parse_validate): Auto-formatted label from '{label_str_original}' to '{label_str_processed}' for QID {qid_for_log}, Chunk {chunk_idx_for_log}, Item {item_idx+1}")
                    # else label_str_processed remains ""

            # Use the processed label for validation
            current_item_label_for_validation = label_str_processed
            item['label'] = current_item_label_for_validation # Update item with processed label for consistency if it passes

            desc_str = item.get('description',"")
            sf_list = item.get('surface_forms', [])

            is_dict_val = isinstance(item, dict)
            has_all_keys_val = all(k in item for k in ["label", "description", "surface_forms"])

            is_label_str_val = isinstance(current_item_label_for_validation, str) and bool(current_item_label_for_validation)
            label_starts_bracket_val = current_item_label_for_validation.startswith('[') if is_label_str_val else False
            label_ends_bracket_val = current_item_label_for_validation.endswith(']') if is_label_str_val else False

            is_desc_str_val = isinstance(desc_str, str) # Allow empty description string
            is_sf_list_val = isinstance(sf_list, list)
            sfs_are_strings_val = all(isinstance(sf_item, str) for sf_item in sf_list) if is_sf_list_val else False

            # --- UNCOMMENT FOR DETAILED VALIDATION DEBUG ---
            # print(f"      DEBUG (parse_validate): Validating item {item_idx+1} for QID {qid_for_log}, Chunk {chunk_idx_for_log}:")
            # print(f"        ITEM_CONTENT_AFTER_LABEL_PROCESSING: {json.dumps(item, indent=2)}")
            # print(f"        is_dict: {is_dict_val}")
            # print(f"        has_all_keys: {has_all_keys_val}")
            # print(f"        is_label_str: {is_label_str_val}, label_content_for_validation: '{current_item_label_for_validation}'")
            # print(f"        label_starts_bracket: {label_starts_bracket_val}")
            # print(f"        label_ends_bracket: {label_ends_bracket_val}")
            # print(f"        is_desc_str: {is_desc_str_val}")
            # print(f"        is_sf_list: {is_sf_list_val}")
            # print(f"        sfs_are_strings: {sfs_are_strings_val}")
            # --- END DETAILED VALIDATION DEBUG ---

            if is_dict_val and has_all_keys_val and \
               is_label_str_val and label_starts_bracket_val and label_ends_bracket_val and \
               is_desc_str_val and is_sf_list_val and sfs_are_strings_val:
                valid_motifs_from_json.append({
                    "label": current_item_label_for_validation,
                    "description": desc_str.strip(),
                    "surface_forms": [s.strip() for s in sf_list if isinstance(s, str) and s.strip()]
                })
                # print("        RESULT: Item PASSED validation.")
            else:
                print(f"    [WARN] Invalid motif object structure in LLM JSON for QID {qid_for_log}, Chunk {chunk_idx_for_log}, Item {item_idx+1}. Skipping item.")
                # ... (logging to file) ...
        return valid_motifs_from_json
    except (json.JSONDecodeError, ValueError) as e:
        print(f"    [WARN] Motif JSON parsing or core structure validation failed for QID {qid_for_log}, Chunk {chunk_idx_for_log}: {e}")
        # ... (logging to file) ...
        return []

def get_motifs_for_qid_batched(
    list_of_individual_response_texts: List[str],
    responses_per_batch: int,
    hf_pipeline_instance,
    hf_tokenizer_instance,
    qid_for_log: str
    ) -> List[Dict]:
    # (This function remains largely the same as your last correct version, calling the above helpers)
    all_raw_motifs_from_chunks = []
    batched_text_chunks_for_llm = []
    for i in range(0, len(list_of_individual_response_texts), responses_per_batch):
        batch_responses = list_of_individual_response_texts[i:i + responses_per_batch]
        chunk_text_for_llm = preprocess_corpus_for_motif_extraction("\n\n<RSP_SEP>\n\n".join(batch_responses))
        batched_text_chunks_for_llm.append(chunk_text_for_llm)

    print(f"  QID {qid_for_log}: Processing {len(list_of_individual_response_texts)} responses in {len(batched_text_chunks_for_llm)} preprocessed chunks (batch size: {responses_per_batch} responses).")

    for chunk_idx, text_chunk_to_analyze_processed in enumerate(batched_text_chunks_for_llm):
        print(f"    Analyzing chunk {chunk_idx + 1}/{len(batched_text_chunks_for_llm)} for QID {qid_for_log} (processed chunk len: {len(text_chunk_to_analyze_processed)} chars)...")
        if len(text_chunk_to_analyze_processed.strip()) < 50:
            print(f"      Chunk {chunk_idx+1} (QID {qid_for_log}) too short after preprocessing, skipping.")
            continue
        prompt_for_llm = build_llm_prompt_for_motifs(text_chunk_to_analyze_processed)
        motifs_from_this_chunk = []
        for attempt in range(LLM_RETRY_ATTEMPTS):
            raw_llm_response = call_local_llm_for_raw_response(
                prompt_for_llm, hf_pipeline_instance, hf_tokenizer_instance, qid_for_log, chunk_idx + 1
            )
            if not raw_llm_response:
                print(f"      LLM call attempt {attempt + 1} for chunk {chunk_idx+1} (QID {qid_for_log}) returned empty string. Retrying if possible...")
                if attempt < LLM_RETRY_ATTEMPTS - 1: time.sleep(1)
                continue
            parsed_motifs_from_this_attempt = parse_and_validate_llm_json_response(
                raw_llm_response, qid_for_log, chunk_idx+1, prompt_for_llm
            )
            if parsed_motifs_from_this_attempt:
                motifs_from_this_chunk = parsed_motifs_from_this_attempt
                break
            else:
                print(f"      Motif parsing/validation attempt {attempt + 1} yielded no structured motifs for chunk {chunk_idx+1} (QID {qid_for_log}). Retrying if possible...")
                if attempt < LLM_RETRY_ATTEMPTS - 1: time.sleep(1)
        if motifs_from_this_chunk:
            print(f"      Extracted {len(motifs_from_this_chunk)} structured motif objects from chunk {chunk_idx+1} (QID {qid_for_log}).")
            all_raw_motifs_from_chunks.extend(motifs_from_this_chunk)
        else:
            print(f"      No valid structured motifs extracted from chunk {chunk_idx+1} (QID {qid_for_log}) after {LLM_RETRY_ATTEMPTS} attempts.")
    return all_raw_motifs_from_chunks


def consolidate_raw_motifs(list_of_all_raw_motifs: List[Dict]) -> List[Dict]:
    # (Same as before)
    if not list_of_all_raw_motifs: return []
    consolidated_motifs_map = {}
    for motif_obj in list_of_all_raw_motifs:
        label = motif_obj.get("label","").strip()
        description = motif_obj.get("description","").strip()
        surface_forms = motif_obj.get("surface_forms", [])
        if not (label and isinstance(surface_forms, list)): continue # Description can be empty string but label must exist
        current_sfs_set = set(sf.lower().strip() for sf in surface_forms if isinstance(sf, str) and sf.strip())
        if label not in consolidated_motifs_map:
            consolidated_motifs_map[label] = {
                "label": label, "description": description, "surface_forms": sorted(list(current_sfs_set))
            }
        else:
            existing_sfs_set = set(consolidated_motifs_map[label].get("surface_forms", []))
            consolidated_motifs_map[label]["surface_forms"] = sorted(list(existing_sfs_set.union(current_sfs_set)))
    return list(consolidated_motifs_map.values())

def filter_surface_forms_by_global_frequency(
    consolidated_motifs_list: List[Dict],
    full_qid_corpus_text: str,
    min_global_freq: int = MIN_SF_FREQUENCY_IN_FULL_CORPUS
    ) -> List[Dict]:
    # (Same as before)
    if not consolidated_motifs_list: return []
    final_globally_filtered_motifs = []
    for motif_obj in consolidated_motifs_list:
        globally_frequent_sfs_for_this_motif = []
        original_sfs_for_this_motif = motif_obj.get("surface_forms", [])
        for sf_str in original_sfs_for_this_motif:
            count = count_sf_occurrences(full_qid_corpus_text, sf_str)
            if count >= min_global_freq:
                globally_frequent_sfs_for_this_motif.append(sf_str)
        if globally_frequent_sfs_for_this_motif:
            filtered_motif_entry = motif_obj.copy()
            filtered_motif_entry["surface_forms"] = sorted(list(set(globally_frequent_sfs_for_this_motif)))
            final_globally_filtered_motifs.append(filtered_motif_entry)
    return final_globally_filtered_motifs

# This function should be defined BEFORE your main() function

def initialize_llm_pipeline(
    model_id: str = LOCAL_LLM_MODEL_ID,
    use_quantization: bool = USE_QUANTIZATION_FOR_LOCAL_LLM,
    pipeline_return_full_text: bool = False # Defaulting to False
    ):
    """Initializes and returns the Hugging Face pipeline and tokenizer."""
    print(f"--- Initializing LLM Pipeline (model: {model_id}, quantization: {use_quantization}, return_full_text: {pipeline_return_full_text}) ---")

    hf_pipeline_instance = None
    hf_tokenizer_instance = None
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    try:
        print(f"Loading tokenizer for {model_id}...")
        hf_tokenizer_instance = AutoTokenizer.from_pretrained(model_id)

        if hf_tokenizer_instance.pad_token is None:
            print("Tokenizer does not have a pad_token; setting pad_token = eos_token.")
            hf_tokenizer_instance.pad_token = hf_tokenizer_instance.eos_token
            # Note: model.config.pad_token_id will be set below after model loading

        bnb_config = None
        quant_active = False
        if use_quantization and torch.cuda.is_available():
            try:
                compute_dtype = torch.bfloat16 if (device.type == 'cuda' and torch.cuda.is_bf16_supported()) else torch.float16
                bnb_config = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_quant_type="nf4",
                    bnb_4bit_compute_dtype=compute_dtype,
                    bnb_4bit_use_double_quant=True
                )
                quant_active = True
                print(f"BitsAndBytesConfig created for {model_id}, compute_dtype: {compute_dtype}.")
            except Exception as e_bnb:
                print(f"WARN: Failed to create BitsAndBytesConfig: {e_bnb}. Quantization may be disabled or fall back.")
                quant_active = False

        print(f"Loading local model {model_id} (Quantization active: {quant_active})...")
        model_kwargs = {"device_map": "auto", "trust_remote_code": True}

        if quant_active and bnb_config:
            model_kwargs["quantization_config"] = bnb_config
        elif device.type == 'cuda':
             model_kwargs["torch_dtype"] = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

        hf_model_instance = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)

        if hf_tokenizer_instance.pad_token_id == hf_tokenizer_instance.eos_token_id:
             hf_model_instance.config.pad_token_id = hf_model_instance.config.eos_token_id

        hf_pipeline_instance = pipeline(
            "text-generation",
            model=hf_model_instance,
            tokenizer=hf_tokenizer_instance,
            return_full_text=pipeline_return_full_text
        )
        print(f"Local LLM pipeline for {model_id} initialized successfully.")
        return hf_pipeline_instance, hf_tokenizer_instance
    except Exception as e:
        print(f"CRITICAL: Failed to initialize local LLM pipeline: {e}")
        import traceback
        traceback.print_exc()
        return None, None

# --- BDM Initialization (should also be defined before main) ---
def initialize_bdm_instance():
    """Initializes and returns a BDM instance."""
    print("Initializing BDM instance...")
    try:
        bdm_instance = BDM(ndim=2)
        print("BDM instance initialized successfully (ndim=2, default CTM-based).")
        return bdm_instance
    except Exception as e_bdm_init:
        print(f"CRITICAL: Failed to initialize BDM instance: {e_bdm_init}")
        if "CTM data files" in str(e_bdm_init).lower() or "dataset" in str(e_bdm_init).lower():
            print("  BDM Error Hint: This might be related to missing/corrupted CTM data files for PyBDM.")
            print("  Ensure PyBDM is installed correctly and can access/download its data.")
            print("  You might need to run the following once in your environment:")
            print("  from pybdm import get_ctm_dataset; get_ctm_dataset(force=False)")
        return None

# Now your main() function can be defined and called
# def main():
#   ...
#   hf_pipeline_instance, hf_tokenizer_instance = initialize_llm_pipeline()
#   bdm_instance_main = initialize_bdm_instance()
#   ...

# if __name__ == "__main__":
#   main()


# --- Main Execution Logic ---
def main():
    print("--- MWP Single Cell (Reverted Simpler Prompt v4, Label Fix Attempt, Global SF Filter, Batched LLM, Token-L(H), BDM L(D|H)) ---")
    # ... (Print config params - same as before) ...
    # ... (Initialize debug log file - same as before) ...
    # ... (Initialize LLM Pipeline - same as before, ensure return_full_text=False) ...
    # ... (Initialize BDM - same as before, ensure BDM(ndim=2)) ...
    # ... (Load Phase 2 Data - same as before) ...
    # ... (QID selection logic with UnboundLocalError fix - same as before) ...
    # --- Full main() logic from previous complete cell goes here ---
    # --- It will call the updated functions defined above ---
    print(f"Timestamp: {time.asctime()}")
    print("\n--- Configuration Summary ---")
    print(f"LLM Model: {LOCAL_LLM_MODEL_ID}, Quantization: {USE_QUANTIZATION_FOR_LOCAL_LLM}")
    print(f"LLM Batch Size (Responses): {LLM_BATCH_SIZE_RESPONSES}, Retries: {LLM_RETRY_ATTEMPTS}")
    print(f"Max Text Chars per LLM Prompt Chunk: {MAX_TEXT_CHARS_PER_LLM_PROMPT_CHUNK}")
    print(f"Max New Tokens for LLM Motif Extraction: {LLM_MAX_NEW_TOKENS_MOTIF_EXTRACTION}")
    print(f"Max Motifs to Request per Chunk: {MAX_MOTIFS_PER_CHUNK}")
    print(f"L(H) Costs: Label={MOTIF_SYMBOLIC_LABEL_COST}, DescBase={MOTIF_DESCRIPTION_TEXT_BASE_COST}, DescToken={MOTIF_DESCRIPTION_TOKEN_COST}, SFListBase={MOTIF_SURFACE_FORMS_LIST_BASE_COST}, SFTokenInLH={MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH}")
    print(f"Global SF Filtering Min Freq: {MIN_SF_FREQUENCY_IN_FULL_CORPUS}")
    print(f"BDM Hash Prefix Length: {MAX_TEXT_FOR_BDM_HASH}, BDM Matrix: {MATRIX_SIZE_GLOBAL}")
    print(f"Debug Log File: {LLM_DEBUG_LOG_FILE}")
    print("--- End Configuration Summary ---\n")

    try:
        with open(LLM_DEBUG_LOG_FILE, "w", encoding="utf-8") as f:
            f.write(f"LLM Motif Debug Log - Run Started: {time.asctime()}\nModel ID: {LOCAL_LLM_MODEL_ID}\nPipeline Config: return_full_text=False\nReverted Simpler Prompt with Label Fix Attempt Active\n---\n")
    except Exception as e_log: print(f"WARN: Could not initialize debug log file {LLM_DEBUG_LOG_FILE}: {e_log}")

    try:
        with open(LLM_DEBUG_LOG_FILE, "a", encoding="utf-8") as f: # Use "a" to append to existing header
            f.write(f"\n--- Test Log Entry from main() at {time.asctime()} --m`, `llm_raw_response_text`, `item`) might contain some character or be of a type that causes `f.write()` to fail silently or throw an error that's caught broadly elsewhere without specific indication. (Less-\n")
            f.flush()
        print(f"DEBUG: Successfully wrote test entry to {LLM_DEBUG_LOG_FILE}")
    except Exception as e_test_log:
        print(f"CRITICAL DEBUG: FAILED TO WRITE TEST ENTRY TO LOG FILE {LLM_DEBUG_LOG_FILE}: {e_test_log}")

    hf_pipeline_instance, hf_tokenizer_instance = initialize_llm_pipeline()
    if not hf_pipeline_instance or not hf_tokenizer_instance: print("CRITICAL: Exiting due to LLM pipeline initialization failure."); return

    bdm_instance_main = initialize_bdm_instance()
    if not bdm_instance_main: print("CRITICAL: Exiting due to BDM initialization failure."); return

    if not os.path.exists(P2_COLLATED_FILE): print(f"ERROR: Phase 2 file not found: {P2_COLLATED_FILE}"); return
    print(f"Loading Phase 2 data from: {P2_COLLATED_FILE}...")
    phase2_data_content = None
    try:
        with open(P2_COLLATED_FILE, 'r', encoding='utf-8') as f: phase2_data_content = json.load(f)
    except Exception as e_load: print(f"Error loading {P2_COLLATED_FILE}: {e_load}"); return

    all_qid_mdl_results_list = []
    aggregated_content_by_qid_from_file = phase2_data_content.get("aggregated_pdf_content_by_qid", {})
    if not aggregated_content_by_qid_from_file: print(f"No 'aggregated_pdf_content_by_qid' in {P2_COLLATED_FILE}."); return

    qids_to_process_this_run = []
    if P3_QIDS_TO_PROCESS_THEMATICALLY and isinstance(P3_QIDS_TO_PROCESS_THEMATICALLY, list) and P3_QIDS_TO_PROCESS_THEMATICALLY:
        qids_to_process_this_run = [qid for qid in P3_QIDS_TO_PROCESS_THEMATICALLY if qid in aggregated_content_by_qid_from_file]
        if not qids_to_process_this_run:
            print(f"Warning: None of specified QIDs {P3_QIDS_TO_PROCESS_THEMATICALLY} found in loaded data. Exiting.")
            return
    else:
        qids_to_process_limit_fallback = 1
        print(f"P3_QIDS_TO_PROCESS_THEMATICALLY not set or empty. Processing up to {qids_to_process_limit_fallback} QID(s) from data as fallback.")
        qids_to_process_this_run = list(aggregated_content_by_qid_from_file.keys())[:qids_to_process_limit_fallback]
        if not qids_to_process_this_run:
            print("No QIDs available in data to process based on fallback. Exiting.")
            return

    if not qids_to_process_this_run:
        print("No QIDs selected for processing. Exiting.")
        return
    print(f"\nMDL analysis will run for QIDs: {qids_to_process_this_run}\n")

    for qid_identifier_str in qids_to_process_this_run:
        print(f"--- Analyzing Data for QID: {qid_identifier_str} ---")
        list_of_individual_response_structs = aggregated_content_by_qid_from_file.get(qid_identifier_str, [])
        actual_response_texts_for_qid = [item.get("text", "") for item in list_of_individual_response_structs if isinstance(item, dict) and isinstance(item.get("text"), str) and item.get("text","").strip()]
        if not actual_response_texts_for_qid: print(f"  No valid text for QID {qid_identifier_str}. Skipping."); print("-" * 50); continue

        full_corpus_text_for_qid = "\n\n<RSP_SEP>\n\n".join(actual_response_texts_for_qid)
        if len(full_corpus_text_for_qid.strip()) < 100: print(f"  Skipping QID {qid_identifier_str}: text too short."); print("-" * 50); continue

        num_total_responses_for_qid = len(actual_response_texts_for_qid)
        print(f"  Corpus for QID {qid_identifier_str}: {len(full_corpus_text_for_qid)} chars, {num_total_responses_for_qid} responses.")

        baseline_bdm_original_corpus = compute_bdm_for_text(full_corpus_text_for_qid, bdm_instance_main, MATRIX_SIZE_GLOBAL)
        if baseline_bdm_original_corpus < 0: print(f"  Error computing baseline BDM for QID {qid_identifier_str}. Skipping."); print("-" * 50); continue
        current_qid_baseline_mdl_cost = baseline_bdm_original_corpus
        print(f"  Baseline MDL for QID {qid_identifier_str} (L(D_orig)): {current_qid_baseline_mdl_cost:.4f}")

        raw_motifs_from_chunks = get_motifs_for_qid_batched(
            actual_response_texts_for_qid, LLM_BATCH_SIZE_RESPONSES,
            hf_pipeline_instance, hf_tokenizer_instance, qid_identifier_str
        )
        current_qid_result_entry = {
            "qid": qid_identifier_str, "corpus_len_chars": len(full_corpus_text_for_qid), "num_responses": num_total_responses_for_qid,
            "baseline_mdl": current_qid_baseline_mdl_cost, "final_refined_motifs": [], "l_h_final_motifs": 0.0,
            "l_d_h_final_motifs": current_qid_baseline_mdl_cost, "total_mdl_with_final_motifs": current_qid_baseline_mdl_cost,
            "compression_achieved": 0.0, "num_raw_motifs_extracted": len(raw_motifs_from_chunks),
            "num_consolidated_motifs": 0, "num_globally_refined_motifs": 0
        }
        if not raw_motifs_from_chunks: print(f"  No raw motifs by LLM for QID {qid_identifier_str}."); all_qid_mdl_results_list.append(current_qid_result_entry); print("-" * 50); continue
        print(f"  Extracted {len(raw_motifs_from_chunks)} raw motif objects for QID {qid_identifier_str}.")

        consolidated_motifs_list = consolidate_raw_motifs(raw_motifs_from_chunks)
        current_qid_result_entry["num_consolidated_motifs"] = len(consolidated_motifs_list)
        print(f"  Consolidated into {len(consolidated_motifs_list)} unique motifs for QID {qid_identifier_str}.")
        if not consolidated_motifs_list: print(f"  No unique motifs after consolidation for QID {qid_identifier_str}."); all_qid_mdl_results_list.append(current_qid_result_entry); print("-" * 50); continue

        print(f"  Consolidated Motifs for QID {qid_identifier_str} (BEFORE Global SF refinement):")
        for idx, mo_con in enumerate(consolidated_motifs_list): print(f"    Cons. Motif {idx+1}: L='{mo_con.get('label')}', D='{mo_con.get('description','N/A')[:30]}...', SFs({len(mo_con.get('surface_forms',[]))})='{mo_con.get('surface_forms',[])[:2]}...'")

        globally_refined_motifs = filter_surface_forms_by_global_frequency(
            consolidated_motifs_list, full_corpus_text_for_qid, MIN_SF_FREQUENCY_IN_FULL_CORPUS
        )
        current_qid_result_entry["num_globally_refined_motifs"] = len(globally_refined_motifs)
        print(f"  Globally refined into {len(globally_refined_motifs)} motifs for QID {qid_identifier_str}.")
        if not globally_refined_motifs: print(f"  No motifs left after GLOBAL SF refinement for QID {qid_identifier_str}."); all_qid_mdl_results_list.append(current_qid_result_entry); print("-" * 50); continue

        print(f"  Final Globally Refined Motifs for QID {qid_identifier_str}:")
        for idx, mo_final in enumerate(globally_refined_motifs): print(f"    Refined Motif {idx+1}: L='{mo_final.get('label')}', D='{mo_final.get('description','N/A')[:60]}...', SFs({len(mo_final.get('surface_forms',[]))})='{mo_final.get('surface_forms',[])}'")

        l_h_final, l_d_h_final, total_mdl_final = compute_mdl_cost_for_text_block(
            full_corpus_text_for_qid, globally_refined_motifs, bdm_instance_main, MATRIX_SIZE_GLOBAL
        )
        current_qid_result_entry.update({
            "final_refined_motifs": globally_refined_motifs, "l_h_final_motifs": l_h_final,
            "l_d_h_final_motifs": l_d_h_final if l_d_h_final >=0 else "BDM_ERROR",
            "total_mdl_with_final_motifs": total_mdl_final if l_d_h_final >=0 else "BDM_ERROR",
            "compression_achieved": "BDM_ERROR" if l_d_h_final < 0 else (current_qid_baseline_mdl_cost - total_mdl_final)
        })
        if l_d_h_final < 0: print(f"  Error computing MDL cost (BDM error) for QID {qid_identifier_str}."); all_qid_mdl_results_list.append(current_qid_result_entry); print("-" * 50); continue

        print(f"  L(H) final motifs: {l_h_final:.4f}, L(D|H) compressed: {l_d_h_final:.4f}, Total MDL: {total_mdl_final:.4f}")
        compression_val = current_qid_result_entry["compression_achieved"]
        result_status_str = f"SUCCESS: Comp: {compression_val:.4f}" if isinstance(compression_val, float) and compression_val > 0.0001 else f"NOTE: No sig. comp. Diff: {compression_val if isinstance(compression_val, str) else compression_val:.4f}"
        print(f"  {result_status_str}"); all_qid_mdl_results_list.append(current_qid_result_entry); print("-" * 50)

    print("\n--- Overall QID-based MDL Analysis Summary ---")
    if not all_qid_mdl_results_list: print("No QIDs processed.")
    else:
        valid_results = [r for r in all_qid_mdl_results_list if isinstance(r.get('compression_achieved'), float) and r.get('l_h_final_motifs', -1.0) >= 0]
        num_qids_ok = len(valid_results); num_comp = sum(1 for r in valid_results if r['compression_achieved'] > 0.0001)
        print(f"Targeted QIDs: {len(qids_to_process_this_run)}, Results logged: {len(all_qid_mdl_results_list)}, Valid MDL: {num_qids_ok}, QIDs compressed: {num_comp}")
        if num_comp > 0:
            comp_vals = [r['compression_achieved'] for r in valid_results if r['compression_achieved'] > 0.0001]
            print(f"  Avg compression: {np.mean(comp_vals):.4f}, Max compression: {np.max(comp_vals):.4f}")
        else: print("  No compression achieved.")

        output_filename = os.path.join(BASE_PROJECT_DIR, "mdl_analysis_single_cell_reverted_prompt_v4_labelfix.json")
        try:
            with open(output_filename, "w", encoding="utf-8") as f_out: json.dump(all_qid_mdl_results_list, f_out, indent=2, ensure_ascii=False)
            print(f"Detailed results saved to {output_filename}")
        except Exception as e_s: print(f"Error saving results: {e_s}")

if __name__ == "__main__":
    print(f"Executing main MDL pipeline (Single Cell Reverted Prompt V4 with Label Fix Attempt) at {time.asctime()}...")
    main()
    print(f"Main MDL pipeline execution finished at {time.asctime()}.")

Executing main MDL pipeline (Single Cell Reverted Prompt V4 with Label Fix Attempt) at Sun Jun  1 10:22:04 2025...
--- MWP Single Cell (Reverted Simpler Prompt v4, Label Fix Attempt, Global SF Filter, Batched LLM, Token-L(H), BDM L(D|H)) ---
Timestamp: Sun Jun  1 10:22:04 2025

--- Configuration Summary ---
LLM Model: google/gemma-2b-it, Quantization: True
LLM Batch Size (Responses): 5, Retries: 2
Max Text Chars per LLM Prompt Chunk: 7000
Max New Tokens for LLM Motif Extraction: 700
Max Motifs to Request per Chunk: 5
L(H) Costs: Label=0.5, DescBase=0.5, DescToken=0.1, SFListBase=0.25, SFTokenInLH=0.1
Global SF Filtering Min Freq: 2
BDM Hash Prefix Length: 2000, BDM Matrix: (8, 8)
Debug Log File: /content/drive/MyDrive/Colab Notebooks/Legal/llm_motif_debug_log_single_cell_v5_label_fix.txt
--- End Configuration Summary ---

DEBUG: Successfully wrote test entry to /content/drive/MyDrive/Colab Notebooks/Legal/llm_motif_debug_log_single_cell_v5_label_fix.txt
--- Initializing LLM Pipeline (m

tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

BitsAndBytesConfig created for google/gemma-2b-it, compute_dtype: torch.bfloat16.
Loading local model google/gemma-2b-it (Quantization active: True)...


config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Device set to use cuda:0


Local LLM pipeline for google/gemma-2b-it initialized successfully.
Initializing BDM instance...
BDM instance initialized successfully (ndim=2, default CTM-based).
Loading Phase 2 data from: /content/drive/MyDrive/Colab Notebooks/Legal/Phase2_PDF_Collated_Texts/phase2_collated_pdf_texts.json...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



MDL analysis will run for QIDs: ['Q4']

--- Analyzing Data for QID: Q4 ---
  Corpus for QID Q4: 129501 chars, 209 responses.
  Baseline MDL for QID Q4 (L(D_orig)): 121.3693
  QID Q4: Processing 209 responses in 42 preprocessed chunks (batch size: 5 responses).
    Analyzing chunk 1/42 for QID Q4 (processed chunk len: 3158 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 5 structured motif objects from chunk 1 (QID Q4).
    Analyzing chunk 2/42 for QID Q4 (processed chunk len: 3262 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 5 structured motif objects from chunk 2 (QID Q4).
    Analyzing chunk 3/42 for QID Q4 (processed chunk len: 3219 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 3 (QID Q4).
    Analyzing chunk 4/42 for QID Q4 (processed chunk len: 3134 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 4 (QID Q4).
    Analyzing chunk 5/42 for QID Q4 (processed chunk len: 3142 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 5 (QID Q4).
    Analyzing chunk 6/42 for QID Q4 (processed chunk len: 2924 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 6 (QID Q4).
    Analyzing chunk 7/42 for QID Q4 (processed chunk len: 3325 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


    [WARN] Invalid motif object structure in LLM JSON for QID Q4, Chunk 7, Item 3. Skipping item.
      Extracted 3 structured motif objects from chunk 7 (QID Q4).
    Analyzing chunk 8/42 for QID Q4 (processed chunk len: 2977 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 8 (QID Q4).
    Analyzing chunk 9/42 for QID Q4 (processed chunk len: 3084 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 9 (QID Q4).
    Analyzing chunk 10/42 for QID Q4 (processed chunk len: 3218 chars)...


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 5 structured motif objects from chunk 10 (QID Q4).
    Analyzing chunk 11/42 for QID Q4 (processed chunk len: 2930 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 11 (QID Q4).
    Analyzing chunk 12/42 for QID Q4 (processed chunk len: 3101 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 12 (QID Q4).
    Analyzing chunk 13/42 for QID Q4 (processed chunk len: 3277 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 5 structured motif objects from chunk 13 (QID Q4).
    Analyzing chunk 14/42 for QID Q4 (processed chunk len: 3023 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 14 (QID Q4).
    Analyzing chunk 15/42 for QID Q4 (processed chunk len: 3354 chars)...
    [WARN] Motif JSON parsing or core structure validation failed for QID Q4, Chunk 15: Expecting ',' delimiter: line 16 column 3 (char 1787)
      Motif parsing/validation attempt 1 yielded no structured motifs for chunk 15 (QID Q4). Retrying if possible...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


    [WARN] Motif JSON parsing or core structure validation failed for QID Q4, Chunk 15: Expecting ',' delimiter: line 16 column 3 (char 1787)
      Motif parsing/validation attempt 2 yielded no structured motifs for chunk 15 (QID Q4). Retrying if possible...
      No valid structured motifs extracted from chunk 15 (QID Q4) after 2 attempts.
    Analyzing chunk 16/42 for QID Q4 (processed chunk len: 3108 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 16 (QID Q4).
    Analyzing chunk 17/42 for QID Q4 (processed chunk len: 3043 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 17 (QID Q4).
    Analyzing chunk 18/42 for QID Q4 (processed chunk len: 3019 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 18 (QID Q4).
    Analyzing chunk 19/42 for QID Q4 (processed chunk len: 2844 chars)...
      Motif parsing/validation attempt 1 yielded no structured motifs for chunk 19 (QID Q4). Retrying if possible...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Motif parsing/validation attempt 2 yielded no structured motifs for chunk 19 (QID Q4). Retrying if possible...
      No valid structured motifs extracted from chunk 19 (QID Q4) after 2 attempts.
    Analyzing chunk 20/42 for QID Q4 (processed chunk len: 2528 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 20 (QID Q4).
    Analyzing chunk 21/42 for QID Q4 (processed chunk len: 2857 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 21 (QID Q4).
    Analyzing chunk 22/42 for QID Q4 (processed chunk len: 3198 chars)...
    [WARN] Motif JSON parsing or core structure validation failed for QID Q4, Chunk 22: Expecting value: line 6 column 3 (char 371)
      Motif parsing/validation attempt 1 yielded no structured motifs for chunk 22 (QID Q4). Retrying if possible...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


    [WARN] Motif JSON parsing or core structure validation failed for QID Q4, Chunk 22: Expecting value: line 6 column 3 (char 371)
      Motif parsing/validation attempt 2 yielded no structured motifs for chunk 22 (QID Q4). Retrying if possible...
      No valid structured motifs extracted from chunk 22 (QID Q4) after 2 attempts.
    Analyzing chunk 23/42 for QID Q4 (processed chunk len: 3311 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 23 (QID Q4).
    Analyzing chunk 24/42 for QID Q4 (processed chunk len: 3323 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 5 structured motif objects from chunk 24 (QID Q4).
    Analyzing chunk 25/42 for QID Q4 (processed chunk len: 2882 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 5 structured motif objects from chunk 25 (QID Q4).
    Analyzing chunk 26/42 for QID Q4 (processed chunk len: 2450 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 26 (QID Q4).
    Analyzing chunk 27/42 for QID Q4 (processed chunk len: 2412 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 27 (QID Q4).
    Analyzing chunk 28/42 for QID Q4 (processed chunk len: 2441 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 5 structured motif objects from chunk 28 (QID Q4).
    Analyzing chunk 29/42 for QID Q4 (processed chunk len: 2999 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 5 structured motif objects from chunk 29 (QID Q4).
    Analyzing chunk 30/42 for QID Q4 (processed chunk len: 3353 chars)...
    [WARN] Motif JSON parsing or core structure validation failed for QID Q4, Chunk 30: Expecting ',' delimiter: line 24 column 71 (char 1483)
      Motif parsing/validation attempt 1 yielded no structured motifs for chunk 30 (QID Q4). Retrying if possible...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


    [WARN] Motif JSON parsing or core structure validation failed for QID Q4, Chunk 30: Expecting ',' delimiter: line 24 column 71 (char 1483)
      Motif parsing/validation attempt 2 yielded no structured motifs for chunk 30 (QID Q4). Retrying if possible...
      No valid structured motifs extracted from chunk 30 (QID Q4) after 2 attempts.
    Analyzing chunk 31/42 for QID Q4 (processed chunk len: 3095 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 31 (QID Q4).
    Analyzing chunk 32/42 for QID Q4 (processed chunk len: 3258 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 5 structured motif objects from chunk 32 (QID Q4).
    Analyzing chunk 33/42 for QID Q4 (processed chunk len: 2497 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 33 (QID Q4).
    Analyzing chunk 34/42 for QID Q4 (processed chunk len: 2522 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 34 (QID Q4).
    Analyzing chunk 35/42 for QID Q4 (processed chunk len: 3148 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 35 (QID Q4).
    Analyzing chunk 36/42 for QID Q4 (processed chunk len: 3052 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 structured motif objects from chunk 36 (QID Q4).
    Analyzing chunk 37/42 for QID Q4 (processed chunk len: 3303 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 5 structured motif objects from chunk 37 (QID Q4).
    Analyzing chunk 38/42 for QID Q4 (processed chunk len: 3141 chars)...
      Motif parsing/validation attempt 1 yielded no structured motifs for chunk 38 (QID Q4). Retrying if possible...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Motif parsing/validation attempt 2 yielded no structured motifs for chunk 38 (QID Q4). Retrying if possible...
      No valid structured motifs extracted from chunk 38 (QID Q4) after 2 attempts.
    Analyzing chunk 39/42 for QID Q4 (processed chunk len: 3279 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 5 structured motif objects from chunk 39 (QID Q4).
    Analyzing chunk 40/42 for QID Q4 (processed chunk len: 3326 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 5 structured motif objects from chunk 40 (QID Q4).
    Analyzing chunk 41/42 for QID Q4 (processed chunk len: 3164 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 5 structured motif objects from chunk 41 (QID Q4).
    Analyzing chunk 42/42 for QID Q4 (processed chunk len: 2617 chars)...
      Extracted 5 structured motif objects from chunk 42 (QID Q4).
  Extracted 161 raw motif objects for QID Q4.
  Consolidated into 17 unique motifs for QID Q4.
  Consolidated Motifs for QID Q4 (BEFORE Global SF refinement):
    Cons. Motif 1: L='[DATA_PRIVACY]', D='A concern regarding the potent...', SFs(140)='["a 3-sentence summary of the excerpt's main points, addressing the question about exceptions in the employment context", 'a brief summary of the topic']...'
    Cons. Motif 2: L='[EXAMPLE_LABEL]', D='A concise summary of the excer...', SFs(23)='['a concise description of the example theme.', 'a focus on consumer expectations and current privacy laws']...'
    Cons. Motif 3: L='[GDPR_BENCHMARK]', D='The GDPR provides a valuable b...', SFs(1)='['focus on robust privacy controls and careful consideration of exceptions']...'
    Cons. Motif 4

In [None]:
# @title Enhanced Prompt Single-Cell MWP
# --- Imports ---
import json
import os
import hashlib
import numpy as np
from pybdm import BDM
import re
import time
from typing import List, Dict, Set
from collections import Counter

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline

# --- Configuration ---
BASE_PROJECT_DIR = '/content/drive/MyDrive/Colab Notebooks/Legal/'
PHASE2_OUTPUT_DIR = os.path.join(BASE_PROJECT_DIR, 'Phase2_PDF_Collated_Texts/')
P2_COLLATED_FILE = os.path.join(PHASE2_OUTPUT_DIR, 'phase2_collated_pdf_texts.json')

P3_QIDS_TO_PROCESS_THEMATICALLY = ["Q4"]

MATRIX_SIZE_GLOBAL = (8, 8)
LOCAL_LLM_MODEL_ID = 'google/gemma-2b-it'
USE_QUANTIZATION_FOR_LOCAL_LLM = True
LLM_BATCH_SIZE_RESPONSES = 5
LLM_RETRY_ATTEMPTS = 2
MAX_TEXT_CHARS_PER_LLM_PROMPT_CHUNK = 7000
LLM_MAX_NEW_TOKENS_MOTIF_EXTRACTION = 700 # Back to 700 as per old successful script
MAX_MOTIFS_PER_CHUNK = 5 # Back to 5 as per old successful script's prompt

MOTIF_SYMBOLIC_LABEL_COST = 0.5
MOTIF_DESCRIPTION_TEXT_BASE_COST = 0.5
MOTIF_DESCRIPTION_TOKEN_COST = 0.1
MOTIF_SURFACE_FORMS_LIST_BASE_COST = 0.25
MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH = 0.1

MIN_SF_FREQUENCY_IN_FULL_CORPUS = 2

LLM_DEBUG_LOG_FILE = os.path.join(BASE_PROJECT_DIR, "llm_motif_debug_log_single_cell_v2_revert_prompt.txt")

# --- Helper Function Definitions ---

def tokenize_phrase(phrase_text: str) -> List[str]:
    if not isinstance(phrase_text, str) or not phrase_text.strip(): return []
    return phrase_text.lower().split()

def calculate_L_H_token_based_structured(structured_motifs_list: List[Dict]) -> float:
    if not structured_motifs_list: return 0.0
    total_lh_cost = 0.0
    for motif_obj in structured_motifs_list:
        if not isinstance(motif_obj, dict): continue
        current_motif_lh = 0.0
        label_str = motif_obj.get('label', "")
        if isinstance(label_str, str) and label_str.strip():
            current_motif_lh += MOTIF_SYMBOLIC_LABEL_COST
        description_str = motif_obj.get('description', "")
        if isinstance(description_str, str) and description_str.strip():
            current_motif_lh += MOTIF_DESCRIPTION_TEXT_BASE_COST
            current_motif_lh += len(tokenize_phrase(description_str)) * MOTIF_DESCRIPTION_TOKEN_COST
        surface_forms_list = motif_obj.get('surface_forms', [])
        if isinstance(surface_forms_list, list) and surface_forms_list:
            valid_sfs_for_lh = [sf for sf in surface_forms_list if isinstance(sf, str) and sf.strip()]
            if valid_sfs_for_lh:
                current_motif_lh += MOTIF_SURFACE_FORMS_LIST_BASE_COST
                for sf_str in valid_sfs_for_lh:
                    current_motif_lh += len(tokenize_phrase(sf_str)) * MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH
        total_lh_cost += current_motif_lh
    return total_lh_cost

def llm_compress_text_structured(text_to_compress: str, structured_motifs_list: List[Dict]) -> str:
    if not isinstance(text_to_compress, str): return ""
    if not structured_motifs_list: return text_to_compress.lower()
    compressed_text = text_to_compress.lower()
    for motif_obj in structured_motifs_list:
        if not isinstance(motif_obj, dict): continue
        label = motif_obj.get('label', None)
        surface_forms = motif_obj.get('surface_forms', [])
        if not (isinstance(label, str) and label.strip()) or \
           not (isinstance(surface_forms, list) and surface_forms):
            continue
        placeholder = label
        sorted_sfs_for_this_motif = sorted(
            [sf for sf in surface_forms if isinstance(sf, str) and sf.strip()], key=len, reverse=True
        )
        for sf_str in sorted_sfs_for_this_motif:
            sf_lower = sf_str.lower()
            try:
                compressed_text = re.sub(r'\b' + re.escape(sf_lower) + r'\b', placeholder, compressed_text)
            except re.error as re_e:
                print(f"    Regex error during compression for SF '{sf_str}' of motif '{label}': {re_e}. Skipping.")
                continue
    return compressed_text

def text_to_binary_matrix(text_input: str, size: tuple = MATRIX_SIZE_GLOBAL) -> np.ndarray:
    if not isinstance(text_input, str) or not text_input.strip(): return np.zeros(size, dtype=int)
    hash_obj = hashlib.sha256(text_input.encode('utf-8', 'ignore'))
    hash_digest = hash_obj.hexdigest()
    required_bits = size[0] * size[1]
    binary_string_from_hash = bin(int(hash_digest, 16))[2:].zfill(256)
    binary_string_for_matrix = binary_string_from_hash[:required_bits] if required_bits <= 256 else binary_string_from_hash.ljust(required_bits, '0')
    bits_for_matrix = [int(b) for b in binary_string_for_matrix]
    return np.array(bits_for_matrix).reshape(size)

def compute_bdm_for_text(text_input: str, bdm_instance: BDM, matrix_s: tuple = MATRIX_SIZE_GLOBAL) -> float:
    if not isinstance(text_input, str) or not text_input.strip() : return 0.0
    text_for_hash = text_input[:MAX_TEXT_FOR_BDM_HASH] if len(text_input) > MAX_TEXT_FOR_BDM_HASH else text_input
    if not text_for_hash.strip(): return 0.0
    binary_matrix = text_to_binary_matrix(text_for_hash, size=matrix_s)
    try:
        return bdm_instance.bdm(binary_matrix)
    except Exception as e_bdm:
        print(f"      Error during BDM calculation for text (len {len(text_input)}, hashed part len {len(text_for_hash)}): {e_bdm}")
        return -1.0

def compute_mdl_cost_for_text_block(full_qid_corpus_str: str,
                                    final_motifs_to_evaluate: List[Dict],
                                    bdm_instance: BDM,
                                    matrix_s: tuple = MATRIX_SIZE_GLOBAL) -> tuple[float, float, float]:
    if not isinstance(full_qid_corpus_str, str) : full_qid_corpus_str = ""
    l_h = calculate_L_H_token_based_structured(final_motifs_to_evaluate)
    compressed_text_block = llm_compress_text_structured(full_qid_corpus_str, final_motifs_to_evaluate)
    l_d_h = compute_bdm_for_text(compressed_text_block, bdm_instance, matrix_s)
    if l_d_h < 0: return l_h, -1.0, -1.0
    return l_h, l_d_h, l_h + l_d_h

def preprocess_corpus_for_motif_extraction(text_corpus: str) -> str:
    if not isinstance(text_corpus, str): return ""
    text = re.sub(r'\n{3,}', '\n\n', text_corpus)
    text = re.sub(r' {2,}', ' ', text)
    lines = text.split('\n')
    filtered_lines = [line.strip() for line in lines if len(line.strip()) > 10 or not line.strip()]
    return '\n'.join(filtered_lines)

def count_sf_occurrences(corpus_text: str, surface_form: str) -> int:
    if not corpus_text or not surface_form or not isinstance(corpus_text, str) or not isinstance(surface_form, str):
        return 0
    try:
        return len(re.findall(re.escape(surface_form.lower()), corpus_text.lower(), flags=re.IGNORECASE))
    except re.error as e:
        print(f"    [WARN] Regex error in count_sf_occurrences for SF '{surface_form}': {e}")
        return 0

# --- LLM Interaction Functions ---
def build_llm_prompt_for_motifs(text_block_for_prompt: str, max_motifs_to_extract: int = MAX_MOTIFS_PER_CHUNK) -> str:
    """Reverted to the simpler prompt that previously yielded bracketed labels."""
    if len(text_block_for_prompt) > MAX_TEXT_CHARS_PER_LLM_PROMPT_CHUNK:
        text_block_for_prompt = text_block_for_prompt[:MAX_TEXT_CHARS_PER_LLM_PROMPT_CHUNK]

    # This is the prompt from the "Old Successful Code Cell"
    prompt = f"""You will receive a set of comments from different people answering the same question.

Your task is to identify up to {max_motifs_to_extract} key recurring themes.

For each theme, provide:
- A short label like [DATA_PRIVACY]
- A 1-sentence description of the theme
- 2–3 short phrases that often appear in the text (surface forms)

Output MUST be a valid JSON list of objects, where each object has "label", "description", and "surface_forms" keys.
Example of one object in the list:
{{
  "label": "[EXAMPLE_LABEL]",
  "description": "A concise description of the example theme.",
  "surface_forms": ["short repeated phrase 1", "another short repeated phrase"]
}}
If no clear motifs are found, output an empty JSON list: `[]`.
Do not include any other text, explanations, or markdown code fences around the JSON.

Set of comments to analyze:
\"\"\"
{text_block_for_prompt}
\"\"\"

Valid JSON Output (ensure it's a list of objects, or an empty list [] if no themes):
"""
    return prompt.strip()

def call_local_llm_for_raw_response(
    prompt_content_for_user_turn: str,
    hf_pipeline_instance,
    hf_tokenizer_instance,
    qid_for_log: str,
    chunk_idx_for_log: int
    ) -> str:
    if not hf_pipeline_instance or not hf_tokenizer_instance:
        print(f"    ERROR (call_local_llm): LLM pipeline/tokenizer not initialized for QID {qid_for_log}, Chunk {chunk_idx_for_log}.")
        return ""
    messages_for_template = [{"role": "user", "content": prompt_content_for_user_turn}]
    try:
        prompt_formatted_for_llm = hf_tokenizer_instance.apply_chat_template(
            messages_for_template, tokenize=False, add_generation_prompt=True
        )
    except Exception as e_template:
        print(f"    ERROR (call_local_llm): Applying chat template failed for QID {qid_for_log}, Chunk {chunk_idx_for_log}: {e_template}")
        # ... (logging to file) ...
        return ""

    # Using generation_args consistent with "Old Successful Code"
    generation_args = {
        "max_new_tokens": LLM_MAX_NEW_TOKENS_MOTIF_EXTRACTION,
        "do_sample": False,
        "pad_token_id": hf_tokenizer_instance.pad_token_id
    }
    # print(f"    DEBUG (call_local_llm): QID {qid_for_log}, Chunk {chunk_idx_for_log}, Prompt len: {len(prompt_formatted_for_llm)}, GenArgs: {generation_args}")

    try:
        outputs = hf_pipeline_instance(prompt_formatted_for_llm, **generation_args)
        if outputs and isinstance(outputs, list) and len(outputs) > 0 and \
           outputs[0] and isinstance(outputs[0], dict) and 'generated_text' in outputs[0]:
            assistant_response_text = outputs[0]['generated_text'].strip()
            # print(f"    DEBUG (call_local_llm): QID {qid_for_log}, Chunk {chunk_idx_for_log}, Raw LLM Output:\n{assistant_response_text[:500]}...")
            return assistant_response_text
        else: # ... (warn and return "") ...
            print(f"    WARN (call_local_llm): LLM pipeline returned unexpected structure for QID {qid_for_log}, Chunk {chunk_idx_for_log}. Output: {outputs}")
            return ""
    except Exception as e_pipeline: # ... (error handling and logging) ...
        print(f"    ERROR (call_local_llm): Exception during hf_pipeline call for QID {qid_for_log}, Chunk {chunk_idx_for_log}: {e_pipeline}")
        # ... (logging to file) ...
        return ""

def parse_and_validate_llm_json_response(
    llm_raw_response_text: str,
    qid_for_log:str,
    chunk_idx_for_log:int,
    prompt_sent_to_llm:str
    ) -> List[Dict]:
    json_str_candidate = llm_raw_response_text.strip()
    if json_str_candidate.startswith("```json"): json_str_candidate = json_str_candidate[len("```json"):].strip()
    if json_str_candidate.startswith("```"): json_str_candidate = json_str_candidate[len("```"):].strip()
    if json_str_candidate.endswith("```"): json_str_candidate = json_str_candidate[:-len("```")].strip()

    # print(f"    DEBUG (parse_validate): QID {qid_for_log}, Chunk {chunk_idx_for_log}, JSON candidate:\n{json_str_candidate[:500]}...")

    if not json_str_candidate or json_str_candidate.lower() == "[]" or "no_themes_found" in json_str_candidate.lower() or "no clear motifs" in json_str_candidate.lower():
        return []
    try:
        parsed_data = json.loads(json_str_candidate)
        if isinstance(parsed_data, dict): parsed_data = [parsed_data]
        if not isinstance(parsed_data, list): raise ValueError("Parsed JSON is not a list or single object.")

        valid_motifs_from_json = []
        for item_idx, item in enumerate(parsed_data):
            # print(f"    DEBUG (parse_validate): Validating item {item_idx+1} for QID {qid_for_log}, Chunk {chunk_idx_for_log}:")
            # print(f"      ITEM_RAW_CONTENT: {item}")

            label_str = item.get('label',"").strip()
            desc_str = item.get('description',"").strip()
            sf_list = item.get('surface_forms', [])

            is_dict_val = isinstance(item, dict)
            has_all_keys_val = all(k in item for k in ["label", "description", "surface_forms"])
            is_label_str_val = isinstance(label_str, str) and bool(label_str) # Ensure label is not empty
            label_starts_bracket_val = label_str.startswith('[')
            label_ends_bracket_val = label_str.endswith(']')
            is_desc_str_val = isinstance(desc_str, str) # Allow empty description
            is_sf_list_val = isinstance(sf_list, list)
            sfs_are_strings_val = all(isinstance(sf_item, str) for sf_item in sf_list)

            # print(f"      VALIDATION CHECKS: is_dict={is_dict_val}, has_all_keys={has_all_keys_val}, is_label_str={is_label_str_val}, label_content='{label_str}', starts_bracket={label_starts_bracket_val}, ends_bracket={label_ends_bracket_val}, is_desc_str={is_desc_str_val}, is_sf_list={is_sf_list_val}, sfs_are_strings={sfs_are_strings_val}")

            if is_dict_val and has_all_keys_val and \
               is_label_str_val and label_starts_bracket_val and label_ends_bracket_val and \
               is_desc_str_val and is_sf_list_val and sfs_are_strings_val:
                valid_motifs_from_json.append({
                    "label": label_str,
                    "description": desc_str,
                    "surface_forms": [s.strip() for s in sf_list if s.strip()]
                })
                # print("        RESULT: Item PASSED validation.")
            else:
                print(f"    [WARN] Invalid motif object structure in LLM JSON for QID {qid_for_log}, Chunk {chunk_idx_for_log}, Item {item_idx+1}. Skipping item.")
                # print(f"      ITEM CONTENT: {item}") # Keep this for deeper debug if needed
                # ... (logging to file for failed item structure) ...
        return valid_motifs_from_json
    except (json.JSONDecodeError, ValueError) as e:
        print(f"    [WARN] Motif JSON parsing or core structure validation failed for QID {qid_for_log}, Chunk {chunk_idx_for_log}: {e}")
        # ... (logging to file for JSONDecodeError) ...
        return []

def get_motifs_for_qid_batched(
    list_of_individual_response_texts: List[str],
    responses_per_batch: int,
    hf_pipeline_instance,
    hf_tokenizer_instance,
    qid_for_log: str
    ) -> List[Dict]:
    all_raw_motifs_from_chunks = []
    batched_text_chunks_for_llm = []
    for i in range(0, len(list_of_individual_response_texts), responses_per_batch):
        batch_responses = list_of_individual_response_texts[i:i + responses_per_batch]
        chunk_text_for_llm = preprocess_corpus_for_motif_extraction("\n\n<RSP_SEP>\n\n".join(batch_responses))
        batched_text_chunks_for_llm.append(chunk_text_for_llm)

    print(f"  QID {qid_for_log}: Processing {len(list_of_individual_response_texts)} responses in {len(batched_text_chunks_for_llm)} preprocessed chunks (batch size: {responses_per_batch} responses).")

    for chunk_idx, text_chunk_to_analyze_processed in enumerate(batched_text_chunks_for_llm):
        print(f"    Analyzing chunk {chunk_idx + 1}/{len(batched_text_chunks_for_llm)} for QID {qid_for_log} (processed chunk len: {len(text_chunk_to_analyze_processed)} chars)...")
        if len(text_chunk_to_analyze_processed.strip()) < 50:
            print(f"      Chunk {chunk_idx+1} (QID {qid_for_log}) too short after preprocessing, skipping.")
            continue
        prompt_for_llm = build_llm_prompt_for_motifs(text_chunk_to_analyze_processed)
        motifs_from_this_chunk = []
        for attempt in range(LLM_RETRY_ATTEMPTS):
            raw_llm_response = call_local_llm_for_raw_response(
                prompt_for_llm, hf_pipeline_instance, hf_tokenizer_instance, qid_for_log, chunk_idx + 1
            )
            if not raw_llm_response:
                print(f"      LLM call attempt {attempt + 1} for chunk {chunk_idx+1} (QID {qid_for_log}) returned empty string. Retrying if possible...")
                if attempt < LLM_RETRY_ATTEMPTS - 1: time.sleep(1)
                continue
            parsed_motifs_from_this_attempt = parse_and_validate_llm_json_response(
                raw_llm_response, qid_for_log, chunk_idx+1, prompt_for_llm
            )
            if parsed_motifs_from_this_attempt:
                motifs_from_this_chunk = parsed_motifs_from_this_attempt
                break
            else:
                print(f"      Motif parsing/validation attempt {attempt + 1} yielded no structured motifs for chunk {chunk_idx+1} (QID {qid_for_log}). Retrying if possible...")
                if attempt < LLM_RETRY_ATTEMPTS - 1: time.sleep(1)
        if motifs_from_this_chunk:
            print(f"      Extracted {len(motifs_from_this_chunk)} structured motif objects from chunk {chunk_idx+1} (QID {qid_for_log}).")
            all_raw_motifs_from_chunks.extend(motifs_from_this_chunk)
        else:
            print(f"      No valid structured motifs extracted from chunk {chunk_idx+1} (QID {qid_for_log}) after {LLM_RETRY_ATTEMPTS} attempts.")
    return all_raw_motifs_from_chunks

def consolidate_raw_motifs(list_of_all_raw_motifs: List[Dict]) -> List[Dict]:
    if not list_of_all_raw_motifs: return []
    consolidated_motifs_map = {}
    for motif_obj in list_of_all_raw_motifs:
        label = motif_obj.get("label","").strip()
        description = motif_obj.get("description","").strip()
        surface_forms = motif_obj.get("surface_forms", [])
        if not (label and isinstance(surface_forms, list)): continue # Description can be empty
        current_sfs_set = set(sf.lower().strip() for sf in surface_forms if isinstance(sf, str) and sf.strip())
        if label not in consolidated_motifs_map:
            consolidated_motifs_map[label] = {
                "label": label, "description": description, "surface_forms": sorted(list(current_sfs_set))
            }
        else:
            existing_sfs_set = set(consolidated_motifs_map[label].get("surface_forms", []))
            consolidated_motifs_map[label]["surface_forms"] = sorted(list(existing_sfs_set.union(current_sfs_set)))
    return list(consolidated_motifs_map.values())

def filter_surface_forms_by_global_frequency(
    consolidated_motifs_list: List[Dict],
    full_qid_corpus_text: str,
    min_global_freq: int = MIN_SF_FREQUENCY_IN_FULL_CORPUS
    ) -> List[Dict]:
    if not consolidated_motifs_list: return []
    final_globally_filtered_motifs = []
    for motif_obj in consolidated_motifs_list:
        globally_frequent_sfs_for_this_motif = []
        original_sfs_for_this_motif = motif_obj.get("surface_forms", [])
        for sf_str in original_sfs_for_this_motif:
            count = count_sf_occurrences(full_qid_corpus_text, sf_str)
            if count >= min_global_freq:
                globally_frequent_sfs_for_this_motif.append(sf_str)
        if globally_frequent_sfs_for_this_motif:
            filtered_motif_entry = motif_obj.copy()
            filtered_motif_entry["surface_forms"] = sorted(list(set(globally_frequent_sfs_for_this_motif)))
            final_globally_filtered_motifs.append(filtered_motif_entry)
    return final_globally_filtered_motifs

# --- Main Execution Logic ---
def main():
    print("--- MWP Single Cell (Reverted Prompt, Global SF Filter, Batched LLM, Token-L(H), BDM L(D|H)) ---") # Updated title
    # ... (Print config params - same as before) ...
    # ... (Initialize debug log file - same as before) ...
    # ... (Initialize LLM Pipeline - same as before, ensure return_full_text=False) ...
    # ... (Initialize BDM - same as before, ensure BDM(ndim=2)) ...
    # ... (Load Phase 2 Data - same as before) ...
    # ... (Determine qids_to_process_this_run - same as before) ...
    # --- Main Execution (Copied from previous complete cell, with GDB lines removed for brevity here) ---
    # (The long main loop from your previous "complete revised single cell MWP now" post would go here,
    # ensuring it calls the updated build_llm_prompt_for_motifs and updated parse_and_validate_llm_json_response)
    # ... it will look very similar to the last main() I provided, just ensure it uses the functions defined in *this* cell.

    # For brevity, I'll sketch the main loop parts that changed or are key:
    # --- Initialize LLM and BDM ---
    hf_pipeline_instance, hf_tokenizer_instance = initialize_llm_pipeline()
    if not hf_pipeline_instance: return
    bdm_instance_main = initialize_bdm_instance() # Using new init function name
    if not bdm_instance_main: return
    # --- Load Data --- (as before)
    # --- Determine QIDs --- (as before)
    # --- Loop QIDs ---
    for qid_identifier_str in qids_to_process_this_run:
        # --- Get QID text and baseline BDM --- (as before)
        # ...
        # --- Call get_motifs_for_qid_batched (which uses the new prompt and parsing) ---
        raw_motifs_from_chunks = get_motifs_for_qid_batched(
            actual_response_texts_for_qid,
            LLM_BATCH_SIZE_RESPONSES, # Use correct constant
            hf_pipeline_instance,
            hf_tokenizer_instance,
            qid_identifier_str
        )
        # --- Initialize qid_result_entry --- (as before)
        # --- Handle no raw motifs --- (as before)
        # --- Consolidate motifs --- (as before, uses consolidate_raw_motifs)
        # --- Handle no consolidated motifs --- (as before)
        # --- Print consolidated motifs (optional detailed) --- (as before)
        # --- Global SF Frequency Filtering --- (as before, uses filter_surface_forms_by_global_frequency)
        # --- Handle no refined motifs --- (as before)
        # --- Print refined motifs --- (as before)
        # --- Final MDL Calculation --- (as before, uses compute_mdl_cost_for_text_block)
        # --- Log results for QID --- (as before)
    # --- Final Summary and Save --- (as before)

    # Placeholder for the actual main loop structure from the previous full cell
    # This is just to indicate that the main logic flow remains, but it now
    # calls the functions defined within *this specific cell*.
    # You would copy the full main() from the previous complete cell I provided,
    # ensuring it uses the `build_llm_prompt_for_motifs` and
    # `parse_and_validate_llm_json_response` defined *here*.

    # --- THIS IS THE FULL MAIN() from the previous "complete revised code cell" ---
    # --- It should work with the functions now defined above in *this* cell ---
    print(f"Timestamp: {time.asctime()}")
    print("\n--- Configuration Summary ---") # ... (print all config constants)
    with open(LLM_DEBUG_LOG_FILE, "w", encoding="utf-8") as f: f.write(f"LLM Log - {time.asctime()}\nModel: {LOCAL_LLM_MODEL_ID}\n")

    hf_pipeline_instance, hf_tokenizer_instance = initialize_llm_pipeline()
    if not hf_pipeline_instance: return
    bdm_instance_main = initialize_bdm_instance()
    if not bdm_instance_main: return

    if not os.path.exists(P2_COLLATED_FILE): print(f"ERROR: File {P2_COLLATED_FILE} not found."); return
    print(f"Loading data from: {P2_COLLATED_FILE}...")
    phase2_data_content = None
    try:
        with open(P2_COLLATED_FILE, 'r', encoding='utf-8') as f: phase2_data_content = json.load(f)
    except Exception as e: print(f"Error loading {P2_COLLATED_FILE}: {e}"); return

    all_qid_mdl_results_list = []
    aggregated_content_by_qid_from_file = phase2_data_content.get("aggregated_pdf_content_by_qid", {})
    if not aggregated_content_by_qid_from_file: print(f"No 'aggregated_pdf_content_by_qid' in {P2_COLLATED_FILE}."); return

    qids_to_process_this_run = [qid for qid in P3_QIDS_TO_PROCESS_THEMATICALLY if qid in aggregated_content_by_qid_from_file] if (P3_QIDS_TO_PROCESS_THEMATICALLY and isinstance(P3_QIDS_TO_PROCESS_THEMATICALLY, list) and P3_QIDS_TO_PROCESS_THEMATICALLY) else list(aggregated_content_by_qid_from_file.keys())[:1]
    if not qids_to_process_this_run: print(f"No QIDs to process. Exiting."); return
    print(f"\nMDL analysis for QIDs: {qids_to_process_this_run}\n")

    for qid_identifier_str in qids_to_process_this_run:
        print(f"--- Analyzing QID: {qid_identifier_str} ---")
        list_of_individual_response_structs = aggregated_content_by_qid_from_file.get(qid_identifier_str, [])
        actual_response_texts_for_qid = [item.get("text", "") for item in list_of_individual_response_structs if isinstance(item, dict) and isinstance(item.get("text"), str) and item.get("text","").strip()]
        if not actual_response_texts_for_qid: print(f"  No valid text for QID {qid_identifier_str}. Skipping."); print("-" * 50); continue

        full_corpus_text_for_qid = "\n\n<RSP_SEP>\n\n".join(actual_response_texts_for_qid)
        if len(full_corpus_text_for_qid.strip()) < 100: print(f"  Skipping QID {qid_identifier_str}: text too short."); print("-" * 50); continue

        num_total_responses_for_qid = len(actual_response_texts_for_qid)
        print(f"  Corpus for QID {qid_identifier_str}: {len(full_corpus_text_for_qid)} chars, {num_total_responses_for_qid} responses.")

        baseline_bdm_original_corpus = compute_bdm_for_text(full_corpus_text_for_qid, bdm_instance_main, MATRIX_SIZE_GLOBAL)
        if baseline_bdm_original_corpus < 0: print(f"  Error computing baseline BDM for QID {qid_identifier_str}. Skipping."); print("-" * 50); continue
        current_qid_baseline_mdl_cost = baseline_bdm_original_corpus
        print(f"  Baseline MDL for QID {qid_identifier_str} (L(D_orig)): {current_qid_baseline_mdl_cost:.4f}")

        raw_motifs_from_chunks = get_motifs_for_qid_batched(
            actual_response_texts_for_qid, LLM_BATCH_SIZE_RESPONSES,
            hf_pipeline_instance, hf_tokenizer_instance, qid_identifier_str
        )
        current_qid_result_entry = {
            "qid": qid_identifier_str, "corpus_len_chars": len(full_corpus_text_for_qid), "num_responses": num_total_responses_for_qid,
            "baseline_mdl": current_qid_baseline_mdl_cost, "final_refined_motifs": [], "l_h_final_motifs": 0.0,
            "l_d_h_final_motifs": current_qid_baseline_mdl_cost, "total_mdl_with_final_motifs": current_qid_baseline_mdl_cost,
            "compression_achieved": 0.0, "num_raw_motifs_extracted": len(raw_motifs_from_chunks),
            "num_consolidated_motifs": 0, "num_globally_refined_motifs": 0
        }
        if not raw_motifs_from_chunks: print(f"  No raw motifs by LLM for QID {qid_identifier_str}."); all_qid_mdl_results_list.append(current_qid_result_entry); print("-" * 50); continue
        print(f"  Extracted {len(raw_motifs_from_chunks)} raw motif objects for QID {qid_identifier_str}.")

        consolidated_motifs_list = consolidate_raw_motifs(raw_motifs_from_chunks)
        current_qid_result_entry["num_consolidated_motifs"] = len(consolidated_motifs_list)
        print(f"  Consolidated into {len(consolidated_motifs_list)} unique motifs for QID {qid_identifier_str}.")
        if not consolidated_motifs_list: print(f"  No unique motifs after consolidation for QID {qid_identifier_str}."); all_qid_mdl_results_list.append(current_qid_result_entry); print("-" * 50); continue

        # print(f"  Consolidated Motifs (BEFORE Global SF refinement):") # Optional detailed print
        # for idx, mo_con in enumerate(consolidated_motifs_list): print(f"    Cons. Motif {idx+1}: L='{mo_con.get('label')}', D='{mo_con.get('description','N/A')[:30]}...', SFs({len(mo_con.get('surface_forms',[]))})='{mo_con.get('surface_forms',[])[:2]}...'")

        globally_refined_motifs = filter_surface_forms_by_global_frequency(
            consolidated_motifs_list, full_corpus_text_for_qid, MIN_SF_FREQUENCY_IN_FULL_CORPUS
        )
        current_qid_result_entry["num_globally_refined_motifs"] = len(globally_refined_motifs)
        print(f"  Globally refined into {len(globally_refined_motifs)} motifs for QID {qid_identifier_str}.")
        if not globally_refined_motifs: print(f"  No motifs left after GLOBAL SF refinement for QID {qid_identifier_str}."); all_qid_mdl_results_list.append(current_qid_result_entry); print("-" * 50); continue

        print(f"  Final Globally Refined Motifs for QID {qid_identifier_str}:")
        for idx, mo_final in enumerate(globally_refined_motifs): print(f"    Refined Motif {idx+1}: L='{mo_final.get('label')}', D='{mo_final.get('description','N/A')[:60]}...', SFs({len(mo_final.get('surface_forms',[]))})='{mo_final.get('surface_forms',[])}'")

        l_h_final, l_d_h_final, total_mdl_final = compute_mdl_cost_for_text_block(
            full_corpus_text_for_qid, globally_refined_motifs, bdm_instance_main, MATRIX_SIZE_GLOBAL
        )
        current_qid_result_entry.update({
            "final_refined_motifs": globally_refined_motifs, "l_h_final_motifs": l_h_final,
            "l_d_h_final_motifs": l_d_h_final if l_d_h_final >=0 else "BDM_ERROR",
            "total_mdl_with_final_motifs": total_mdl_final if l_d_h_final >=0 else "BDM_ERROR",
            "compression_achieved": "BDM_ERROR" if l_d_h_final < 0 else (current_qid_baseline_mdl_cost - total_mdl_final)
        })
        if l_d_h_final < 0: print(f"  Error computing MDL cost (BDM error) for QID {qid_identifier_str}."); all_qid_mdl_results_list.append(current_qid_result_entry); print("-" * 50); continue

        print(f"  L(H) final motifs: {l_h_final:.4f}, L(D|H) compressed: {l_d_h_final:.4f}, Total MDL: {total_mdl_final:.4f}")
        compression_val = current_qid_result_entry["compression_achieved"]
        result_status_str = f"SUCCESS: Comp: {compression_val:.4f}" if isinstance(compression_val, float) and compression_val > 0.0001 else f"NOTE: No sig. comp. Diff: {compression_val if isinstance(compression_val, str) else compression_val:.4f}"
        print(f"  {result_status_str}"); all_qid_mdl_results_list.append(current_qid_result_entry); print("-" * 50)

    print("\n--- Overall QID-based MDL Analysis Summary ---")
    if not all_qid_mdl_results_list: print("No QIDs processed.")
    else:
        valid_results = [r for r in all_qid_mdl_results_list if isinstance(r.get('compression_achieved'), float) and r.get('l_h_final_motifs', -1.0) >= 0]
        num_qids_ok = len(valid_results); num_comp = sum(1 for r in valid_results if r['compression_achieved'] > 0.0001)
        print(f"Targeted QIDs: {len(qids_to_process_this_run)}, Results logged: {len(all_qid_mdl_results_list)}, Valid MDL: {num_qids_ok}, QIDs compressed: {num_comp}")
        if num_comp > 0:
            comp_vals = [r['compression_achieved'] for r in valid_results if r['compression_achieved'] > 0.0001]
            print(f"  Avg compression: {np.mean(comp_vals):.4f}, Max compression: {np.max(comp_vals):.4f}")
        else: print("  No compression achieved.")

        output_filename = os.path.join(BASE_PROJECT_DIR, "mdl_analysis_single_cell_reverted_prompt_v1.json")
        try:
            with open(output_filename, "w", encoding="utf-8") as f_out: json.dump(all_qid_mdl_results_list, f_out, indent=2, ensure_ascii=False)
            print(f"Detailed results saved to {output_filename}")
        except Exception as e_s: print(f"Error saving results: {e_s}")


if __name__ == "__main__":
    print(f"Executing main MDL pipeline (Single Cell Reverted Prompt Version) at {time.asctime()}...")
    main()
    print(f"Main MDL pipeline execution finished at {time.asctime()}.")

Executing main MDL pipeline (Single Cell Reverted Prompt Version) at Sun Jun  1 06:57:27 2025...
--- MWP Single Cell (Reverted Prompt, Global SF Filter, Batched LLM, Token-L(H), BDM L(D|H)) ---
--- Initializing LLM Pipeline (model: google/gemma-2b-it, quantization: True, return_full_text: False) ---
Using device: cuda
Loading tokenizer for google/gemma-2b-it...
BitsAndBytesConfig created for google/gemma-2b-it, compute_dtype: torch.bfloat16.
Loading local model google/gemma-2b-it (Quantization active: True)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Local LLM pipeline for google/gemma-2b-it initialized successfully.
Initializing BDM instance...
BDM instance initialized successfully (ndim=2, default CTM-based).


UnboundLocalError: cannot access local variable 'qids_to_process_this_run' where it is not associated with a value

## Revised after Claude's inputs

In [None]:
# @title 1. Configuration
# Purpose: Centralize all global constants and configuration parameters.
# Cell 1: Imports and Configuration

# --- Standard Library Imports ---
import json
import os
import hashlib
import re
import time
from typing import List, Dict, Set # For type hinting
from collections import Counter

# --- Third-party Library Imports ---
import numpy as np
from pybdm import BDM
import torch # Ensure torch is imported before transformers for some setups
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline

# --- Configuration Constants ---

# --- Project Paths ---
# !!! IMPORTANT: UPDATE BASE_PROJECT_DIR TO YOUR ACTUAL PATH !!!
# Example for Google Colab:
# from google.colab import drive
# drive.mount('/content/drive')
# BASE_PROJECT_DIR = '/content/drive/MyDrive/YourFolder/LegalAnalysis/'
# Example for local:
BASE_PROJECT_DIR = './' # Assumes files are in current or subdirectories
# Ensure BASE_PROJECT_DIR ends with a slash if it's a directory path

PHASE2_OUTPUT_DIR = os.path.join(BASE_PROJECT_DIR, 'Phase2_PDF_Collated_Texts/')
P2_COLLATED_FILE = os.path.join(PHASE2_OUTPUT_DIR, 'phase2_collated_pdf_texts.json') # Example filename

P3_QIDS_TO_PROCESS_THEMATICALLY = ["Q4"] # QIDs to process, e.g., ["Q4", "Q12"] or None to process fallback

# --- BDM Configuration ---
MATRIX_SIZE_GLOBAL = (8, 8) # For BDM text_to_binary_matrix representation
MAX_TEXT_FOR_BDM_HASH = 2000 # Max characters from text to use for BDM hash input

# --- LLM Configuration ---
LOCAL_LLM_MODEL_ID = 'google/gemma-2b-it'
USE_QUANTIZATION_FOR_LOCAL_LLM = True # Set to False if no GPU or issues
LLM_BATCH_SIZE_RESPONSES = 5 # Number of individual responses to batch together for one LLM call
LLM_RETRY_ATTEMPTS = 2
MAX_TEXT_CHARS_PER_LLM_PROMPT_CHUNK = 7000 # Max chars for the actual text block within the LLM prompt
LLM_MAX_NEW_TOKENS_ENHANCED_MOTIF = 800 # Max new tokens LLM should generate for motif extraction output
MAX_MOTIFS_PER_CHUNK = 3 # Ask LLM for up to this many motifs per processed chunk

# --- Token-Based L(H) Configuration ---
MOTIF_SYMBOLIC_LABEL_COST = 0.5
MOTIF_DESCRIPTION_TEXT_BASE_COST = 0.5
MOTIF_DESCRIPTION_TOKEN_COST = 0.1
MOTIF_SURFACE_FORMS_LIST_BASE_COST = 0.25
MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH = 0.1

# --- Surface Form Filtering Configuration ---
# For validate_and_refine_surface_forms_in_chunk (applied to SFs from one LLM call on one chunk)
MIN_SF_FREQ_IN_CHUNK_VALIDATION = 2
# For filter_surface_forms_by_global_frequency (applied to consolidated motifs, against full QID corpus)
MIN_SF_FREQ_FOR_FINAL_MOTIFS = 2

# --- Logging ---
LLM_DEBUG_LOG_FILE = os.path.join(BASE_PROJECT_DIR, "llm_motif_debug_log_mwp_enhanced.txt")

# --- Derived Constants (Optional, for clarity if paths get complex) ---
# (None needed for now)

print("Cell 1: Imports and Configuration loaded.")
print(f"Base project directory set to: {BASE_PROJECT_DIR}")
print(f"LLM Debug Log will be at: {LLM_DEBUG_LOG_FILE}")
if not os.path.exists(BASE_PROJECT_DIR):
    print(f"WARNING: BASE_PROJECT_DIR '{BASE_PROJECT_DIR}' does not exist. Please create it or update the path.")
if not os.path.exists(PHASE2_OUTPUT_DIR):
    print(f"WARNING: PHASE2_OUTPUT_DIR '{PHASE2_OUTPUT_DIR}' does not exist. Will attempt to create if needed by data loading.")


Cell 1: Imports and Configuration loaded.
Base project directory set to: ./
LLM Debug Log will be at: ./llm_motif_debug_log_mwp_enhanced.txt


In [None]:
# @title 2. Text Utilities
# Purpose: General text processing and utility functions not specific to LLMs or MDL.
# Cell 2: Text Utilities

def tokenize_phrase(phrase_text: str) -> List[str]:
    """
    Simple tokenizer for phrases, definitions, or surface forms.
    Lowercases and splits by space.
    """
    if not isinstance(phrase_text, str) or not phrase_text.strip():
        return []
    # More advanced tokenization could be used here if needed (e.g., handling punctuation differently)
    # For now, simple split and lowercase is consistent with other parts.
    return phrase_text.lower().split()

def preprocess_corpus_for_motif_extraction(text_corpus: str) -> str:
    """
    Preprocesses a text corpus (typically a chunk of joined responses)
    before sending to LLM or for n-gram extraction.
    - Consolidates excessive newlines and spaces.
    - Filters out very short lines (potential noise).
    """
    if not isinstance(text_corpus, str):
        return ""

    # Consolidate multiple newlines to a maximum of two (to preserve paragraph-like breaks)
    text = re.sub(r'\n{3,}', '\n\n', text_corpus)
    # Consolidate multiple spaces into a single space
    text = re.sub(r' {2,}', ' ', text)

    lines = text.split('\n')
    # Filter out lines that are too short to likely contain meaningful themes,
    # but keep intentionally blank lines (which result from '\n\n').
    # This threshold is heuristic.
    filtered_lines = [line.strip() for line in lines if len(line.strip()) > 10 or not line.strip()]

    return '\n'.join(filtered_lines)

def count_sf_occurrences(corpus_text: str, surface_form: str) -> int:
    """
    Counts case-insensitive occurrences of a surface_form within the corpus_text.
    """
    if not corpus_text or not surface_form or not isinstance(corpus_text, str) or not isinstance(surface_form, str):
        return 0
    # Ensure search is case-insensitive and handles regex special characters in the surface_form
    try:
        return len(re.findall(re.escape(surface_form.lower()), corpus_text.lower(), flags=re.IGNORECASE))
    except re.error as e:
        print(f"    [WARN] Regex error in count_sf_occurrences for SF '{surface_form}': {e}")
        return 0

def extract_actual_phrases_from_text(
    text: str,
    min_phrase_len: int = 2,
    max_phrase_len: int = 6,
    min_freq: int = MIN_SF_FREQ_IN_CHUNK_VALIDATION # Uses config constant
    ) -> Dict[str, int]:
    """
    Extracts n-gram phrases (2 to 6 words by default) and their frequencies from text.
    Only returns phrases meeting min_freq.
    Designed for validating LLM-generated surface forms against actual recurring content in a text CHUNK.
    """
    if not isinstance(text, str) or not text.strip():
        return {}

    # Basic cleaning: lowercase, remove punctuation (except intra-word apostrophes if any), split into words
    # This cleaning should be somewhat consistent with how surface forms are expected/matched.
    text_cleaned = text.lower()
    text_cleaned = re.sub(r'[^\w\s\']', ' ', text_cleaned) # Keep apostrophes, replace other non-alphanum with space
    text_cleaned = re.sub(r'\s+', ' ', text_cleaned).strip() # Consolidate whitespace

    words = text_cleaned.split()
    if not words or len(words) < min_phrase_len:
        return {}

    phrase_counts = Counter()
    for n in range(min_phrase_len, max_phrase_len + 1):
        if n > len(words): continue # Cannot form n-gram if n > num words
        for i in range(len(words) - n + 1):
            phrase_tokens = words[i:i+n]
            # Optional: filter out phrases composed only of stopwords, or starting/ending with them
            # For now, keeping it simple.
            phrase = ' '.join(phrase_tokens)
            if phrase: # Ensure not empty after join (e.g. if words list had empty strings)
                phrase_counts[phrase] += 1

    recurring_phrases = {phrase: count for phrase, count in phrase_counts.items() if count >= min_freq}
    return recurring_phrases

print("Cell 2: Text Utilities loaded.")

Cell 2: Text Utilities loaded.


In [None]:
# @title 3. LLM Interaction
# Purpose: Functions directly related to interacting with the LLM (prompt building, LLM calls, basic response parsing).
# Cell 3: LLM Interaction

def initialize_llm_pipeline(
    model_id: str = LOCAL_LLM_MODEL_ID,
    use_quantization: bool = USE_QUANTIZATION_FOR_LOCAL_LLM,
    pipeline_return_full_text: bool = False # Defaulting to False as per our findings
    ):
    """Initializes and returns the Hugging Face pipeline and tokenizer."""
    print(f"--- Initializing LLM Pipeline (model: {model_id}, quantization: {use_quantization}, return_full_text: {pipeline_return_full_text}) ---")

    hf_pipeline_instance = None
    hf_tokenizer_instance = None
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    try:
        print(f"Loading tokenizer for {model_id}...")
        hf_tokenizer_instance = AutoTokenizer.from_pretrained(model_id)

        if hf_tokenizer_instance.pad_token is None:
            print("Tokenizer does not have a pad_token; setting pad_token = eos_token.")
            hf_tokenizer_instance.pad_token = hf_tokenizer_instance.eos_token
            # Note: model.config.pad_token_id will be set below after model loading

        bnb_config = None
        quant_active = False
        if use_quantization and torch.cuda.is_available():
            try:
                # Ensure bfloat16 is available if trying to use it
                compute_dtype = torch.bfloat16 if (device.type == 'cuda' and torch.cuda.is_bf16_supported()) else torch.float16
                bnb_config = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_quant_type="nf4",
                    bnb_4bit_compute_dtype=compute_dtype,
                    bnb_4bit_use_double_quant=True
                )
                quant_active = True
                print(f"BitsAndBytesConfig created for {model_id}, compute_dtype: {compute_dtype}.")
            except Exception as e_bnb:
                print(f"WARN: Failed to create BitsAndBytesConfig: {e_bnb}. Quantization may be disabled or fall back.")
                quant_active = False # Ensure it's False if config fails

        print(f"Loading local model {model_id} (Quantization active: {quant_active})...")
        model_kwargs = {"device_map": "auto", "trust_remote_code": True} # trust_remote_code for some models

        if quant_active and bnb_config:
            model_kwargs["quantization_config"] = bnb_config
        elif device.type == 'cuda': # If not quantizing but on GPU, use appropriate dtype
             model_kwargs["torch_dtype"] = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

        hf_model_instance = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)

        # Set model's pad_token_id if tokenizer's was set to eos_token_id
        if hf_tokenizer_instance.pad_token_id == hf_tokenizer_instance.eos_token_id:
             hf_model_instance.config.pad_token_id = hf_model_instance.config.eos_token_id

        hf_pipeline_instance = pipeline(
            "text-generation",
            model=hf_model_instance,
            tokenizer=hf_tokenizer_instance,
            return_full_text=pipeline_return_full_text
        )
        print(f"Local LLM pipeline for {model_id} initialized successfully.")
        return hf_pipeline_instance, hf_tokenizer_instance
    except Exception as e:
        print(f"CRITICAL: Failed to initialize local LLM pipeline: {e}")
        import traceback
        traceback.print_exc()
        return None, None

def create_enhanced_motif_prompt(text_corpus_chunk: str, max_motifs_to_extract: int = MAX_MOTIFS_PER_CHUNK) -> str:
    """
    Creates the enhanced prompt for the LLM to extract structured motifs.
    Truncates text_corpus_chunk if it exceeds MAX_TEXT_CHARS_PER_LLM_PROMPT_CHUNK.
    """
    # Truncate the input text chunk if it's too long for the prompt context
    # This truncation happens *before* it's embedded in the larger prompt template
    if len(text_corpus_chunk) > MAX_TEXT_CHARS_PER_LLM_PROMPT_CHUNK:
        # print(f"    Note: Text block for LLM prompt analysis truncated from {len(text_corpus_chunk)} to {MAX_TEXT_CHARS_PER_LLM_PROMPT_CHUNK} chars.")
        text_corpus_chunk = text_corpus_chunk[:MAX_TEXT_CHARS_PER_LLM_PROMPT_CHUNK]

    prompt = f"""You are analyzing text to find recurring themes and the specific phrases that signal these themes.

CRITICAL INSTRUCTIONS:
1. Surface forms MUST be SHORT PHRASES (2-6 words) that appear to recur or are representative of how the theme is actually stated in the provided text.
2. Do NOT create generic descriptions or full sentences as surface forms - find phrases that people actually used.
3. Focus on meaningful recurring concepts, not obvious topics that span the entire text.
4. Labels should be specific and descriptive, in [UPPER_SNAKE_CASE_WITH_BRACKETS] (e.g., "[REGULATORY_COMPLIANCE]", "[EMPLOYEE_MONITORING]"). Avoid generic labels like [EXAMPLE_LABEL] or [THEME_A].

Your task: Identify up to {max_motifs_to_extract} recurring themes from the "Text to analyze" below.

For each theme, provide:
- label: A specific, descriptive label.
- description: One concise sentence explaining what this theme represents in the context of the analyzed text.
- surface_forms: A JSON list of 2 to 4 short (2-6 words) phrases that are strong examples of how this theme is expressed in the analyzed text. These should ideally be phrases you see or infer as recurring from the provided text.

IMPORTANT: Surface forms should be actual phrases indicative of the theme, not summaries.

Examples of GOOD surface forms (if they were recurring in analyzed text):
- "data protection measures"
- "employee privacy rights"
- "compliance requirements"

Examples of BAD surface forms (AVOID these):
- "A comprehensive approach to privacy" (too long, a description)
- "The need for better policies" (generic, not a specific phrase from text)
- "Various stakeholders expressed concerns about data handling and privacy" (a full sentence summary)

Text to analyze:
\"\"\"
{text_corpus_chunk}
\"\"\"

Output MUST be a valid JSON list of objects. Each object must have "label", "description", and "surface_forms" keys.
If no clear themes with appropriate surface forms are found, output an empty JSON list: `[]`.
Ensure the entire output is ONLY the JSON list, with no other surrounding text or markdown.

Valid JSON Output:"""
# Removed the explicit JSON example from the end of the prompt to prevent copying,
# relying on the description and "Output MUST be a valid JSON list..."
    return prompt.strip()

def call_local_llm_for_raw_response( # Renamed for clarity: gets RAW response
    prompt_content_for_user_turn: str,
    hf_pipeline_instance,
    hf_tokenizer_instance,
    qid_for_log: str, # For logging context
    chunk_idx_for_log: int # For logging context
    ) -> str:
    """
    Makes the actual call to the local LLM pipeline using a pre-formatted user prompt string.
    Returns the raw text string generated by the LLM.
    Assumes pipeline is initialized with return_full_text=False.
    """
    if not hf_pipeline_instance or not hf_tokenizer_instance:
        print(f"    ERROR (call_local_llm): LLM pipeline/tokenizer not initialized for QID {qid_for_log}, Chunk {chunk_idx_for_log}.")
        return ""

    messages_for_template = [{"role": "user", "content": prompt_content_for_user_turn}]

    try:
        prompt_formatted_for_llm = hf_tokenizer_instance.apply_chat_template(
            messages_for_template, tokenize=False, add_generation_prompt=True
        )
    except Exception as e_template:
        print(f"    ERROR (call_local_llm): Applying chat template failed for QID {qid_for_log}, Chunk {chunk_idx_for_log}: {e_template}")
        # Log this error too
        with open(LLM_DEBUG_LOG_FILE, "a", encoding="utf-8") as f:
            f.write(f"\n--- QID: {qid_for_log} --- CHUNK: {chunk_idx_for_log} (call_local_llm_for_raw_response) ---\n")
            f.write(f"ERROR APPLYING CHAT TEMPLATE: {e_template}\n")
            f.write(f"User prompt content (first 300 chars): {prompt_content_for_user_turn[:300]}...\n")
        return ""

    generation_args = {
        "max_new_tokens": LLM_MAX_NEW_TOKENS_ENHANCED_MOTIF, # From config
        "do_sample": False, # True,
        "temperature": 0.3,
        "top_p": 0.9,
        "repetition_penalty": 1.1,
        "pad_token_id": hf_tokenizer_instance.pad_token_id
    }
    # print(f"    DEBUG (call_local_llm): Sending prompt to LLM for QID {qid_for_log}, Chunk {chunk_idx_for_log}. Prompt length for pipeline: {len(prompt_formatted_for_llm)}")

    try:
        outputs = hf_pipeline_instance(prompt_formatted_for_llm, **generation_args)

        # print(f"    DEBUG (call_local_llm): Raw 'outputs' from hf_pipeline for QID {qid_for_log}, Chunk {chunk_idx_for_log}:\n<<<<<\n{outputs}\n>>>>>")

        if outputs and isinstance(outputs, list) and len(outputs) > 0 and \
           outputs[0] and isinstance(outputs[0], dict) and 'generated_text' in outputs[0]:
            # Since pipeline is expected to be initialized with return_full_text=False,
            # outputs[0]['generated_text'] is only the new tokens.
            assistant_response_text = outputs[0]['generated_text'].strip()
            # print(f"    DEBUG (call_local_llm): 'assistant_response_text' (return_full_text=False) for QID {qid_for_log}, Chunk {chunk_idx_for_log} (len {len(assistant_response_text)}):\n<<<<<\n{assistant_response_text[:1000]}...\n>>>>>")
            return assistant_response_text
        else:
            print(f"    WARN (call_local_llm): LLM pipeline returned unexpected or empty structure for QID {qid_for_log}, Chunk {chunk_idx_for_log}. Output: {outputs}")
            return ""
    except Exception as e_pipeline:
        print(f"    ERROR (call_local_llm): Exception during hf_pipeline call for QID {qid_for_log}, Chunk {chunk_idx_for_log}: {e_pipeline}")
        with open(LLM_DEBUG_LOG_FILE, "a", encoding="utf-8") as f:
            f.write(f"\n--- QID: {qid_for_log} --- CHUNK: {chunk_idx_for_log} (call_local_llm_for_raw_response) ---\n")
            f.write(f"ERROR DURING PIPELINE CALL: {e_pipeline}\n")
            f.write(f"Formatted prompt (first 300 chars): {prompt_formatted_for_llm[:300]}...\n")
        return ""

print("Cell 3: LLM Interaction Utilities loaded.")

Cell 3: LLM Interaction Utilities loaded.


In [None]:
# @title 4. Motif Processing & Validation
# Purpose: Functions related to processing, validating, and refining motifs after they've been initially extracted by the LLM.
# Cell 4: Motif Processing and Validation

def parse_and_validate_llm_json_response(
    llm_raw_response_text: str,
    qid_for_log: str,
    chunk_idx_for_log: int,
    prompt_sent_to_llm: str # For logging context if error
    ) -> List[Dict]:
    """
    Parses the LLM's raw text response (expected to be JSON or contain JSON),
    validates the basic structure of extracted motif objects.
    Returns a list of valid motif dictionaries, or an empty list on failure.
    """
    # print(f"    DEBUG (parse_validate): Raw LLM response for QID {qid_for_log}, Chunk {chunk_idx_for_log}:\n<<<<<\n{llm_raw_response_text}\n>>>>>")

    json_str_candidate = llm_raw_response_text.strip()

    # Attempt to remove markdown fences if LLM adds them
    if json_str_candidate.startswith("```json"):
        json_str_candidate = json_str_candidate[len("```json"):].strip()
    if json_str_candidate.startswith("```"):
        json_str_candidate = json_str_candidate[len("```"):].strip()
    if json_str_candidate.endswith("```"):
        json_str_candidate = json_str_candidate[:-len("```")].strip()

    # print(f"    DEBUG (parse_validate): Final json_str candidate for QID {qid_for_log}, Chunk {chunk_idx_for_log}:\n<<<<<\n{json_str_candidate}\n>>>>>")

    if not json_str_candidate or json_str_candidate.lower() == "[]":
        # print(f"    LLM indicated no themes or JSON was effectively empty for QID {qid_for_log}, Chunk {chunk_idx_for_log}.")
        return []
    # Check for explicit "no themes found" type messages that are not JSON '[]'
    if "no_themes_found" in json_str_candidate.lower() or "no clear motifs" in json_str_candidate.lower():
        # print(f"    LLM explicitly stated no themes found for QID {qid_for_log}, Chunk {chunk_idx_for_log}.")
        return []

    try:
        parsed_data = json.loads(json_str_candidate)

        # Handle if LLM returns a single JSON object instead of a list (as seen in tests)
        if isinstance(parsed_data, dict):
            # print(f"    DEBUG (parse_validate): LLM returned a single JSON object, wrapping in list for QID {qid_for_log}, Chunk {chunk_idx_for_log}.")
            parsed_data = [parsed_data]

        if not isinstance(parsed_data, list):
            raise ValueError("Parsed JSON from LLM is not a list (nor a single object that could be wrapped).")

        # Validate structure of each item in the parsed list
        valid_motifs_from_json = []
        for item_idx, item in enumerate(parsed_data):
            if isinstance(item, dict) and \
               all(k in item for k in ["label", "description", "surface_forms"]) and \
               isinstance(item['label'], str) and item['label'].strip().startswith('[') and item['label'].strip().endswith(']') and \
               isinstance(item['description'], str) and \
               isinstance(item['surface_forms'], list) and \
               all(isinstance(sf_item, str) for sf_item in item['surface_forms']):

                valid_motifs_from_json.append({
                    "label": item['label'].strip(),
                    "description": item['description'].strip(),
                    "surface_forms": [s.strip() for s in item['surface_forms'] if s.strip()] # Clean SFs
                })
            else:
                print(f"    [WARN] Invalid motif object structure in LLM JSON for QID {qid_for_log}, Chunk {chunk_idx_for_log}, Item {item_idx+1}. Skipping item: {str(item)[:200]}...")
                with open(LLM_DEBUG_LOG_FILE, "a", encoding="utf-8") as f:
                    f.write(f"\n--- QID: {qid_for_log} --- CHUNK: {chunk_idx_for_log} (parse_and_validate_llm_json_response) ---\n")
                    f.write(f"INVALID MOTIF ITEM STRUCTURE (Item {item_idx+1}):\n{item}\n")
                    f.write(f"FULL PROBLEMATIC JSON CANDIDATE:\n{json_str_candidate}\n")


        return valid_motifs_from_json

    except (json.JSONDecodeError, ValueError) as e:
        print(f"    [WARN] Motif JSON parsing or structure validation failed for QID {qid_for_log}, Chunk {chunk_idx_for_log}: {e}")
        with open(LLM_DEBUG_LOG_FILE, "a", encoding="utf-8") as f:
            f.write(f"\n--- QID: {qid_for_log} --- CHUNK: {chunk_idx_for_log} (parse_and_validate_llm_json_response) ---\n")
            f.write(f"PROMPT SENT (first 500 chars of user content):\n{prompt_sent_to_llm.split('Text to analyze:')[1][:500]}...\n") # Log relevant part of prompt
            f.write(f"RAW LLM RESPONSE (Error: {type(e).__name__}):\n{llm_raw_response_text}\n")
            f.write(f"EXTRACTED JSON STRING CANDIDATE (Error: {type(e).__name__}):\n{json_str_candidate}\n")
        return []

def validate_and_refine_surface_forms_in_chunk(
    llm_parsed_motifs: List[Dict],
    text_chunk_analyzed_by_llm: str # The specific text chunk LLM saw
    ) -> List[Dict]:
    """
    Validates LLM-generated SFs against actual recurring phrases in the *specific text chunk*.
    Uses extract_actual_phrases_from_text (from text_utils cell/module).
    """
    if not llm_parsed_motifs:
        return []

    # Get recurring phrases (n-grams) from the *specific text chunk* the LLM analyzed
    # These n-grams are already lowercased by extract_actual_phrases_from_text
    actual_recurring_phrases_in_chunk_map = extract_actual_phrases_from_text(
        text_chunk_analyzed_by_llm,
        min_freq=MIN_SF_FREQ_IN_CHUNK_VALIDATION # Uses config constant
    )

    if not actual_recurring_phrases_in_chunk_map:
        # print(f"    No recurring phrases (min_freq={MIN_SF_FREQ_IN_CHUNK_VALIDATION}) found in chunk for SF validation.")
        # If no actual recurring phrases, LLM SFs cannot be validated against them.
        # We might choose to return motifs with their original SFs or discard them.
        # For now, let's require SFs to be found in recurring phrases.
        return [] # Or return llm_parsed_motifs if you want to skip this validation if no n-grams found

    refined_motifs_for_this_chunk = []
    for motif_dict in llm_parsed_motifs:
        llm_proposed_sfs_list = motif_dict.get('surface_forms', [])
        if not isinstance(llm_proposed_sfs_list, list): llm_proposed_sfs_list = []

        validated_sfs_for_this_motif = set()

        for sf_candidate_from_llm in llm_proposed_sfs_list:
            if not isinstance(sf_candidate_from_llm, str) or not sf_candidate_from_llm.strip():
                continue
            sf_lower_candidate = sf_candidate_from_llm.lower().strip()

            # Check for exact match (case-insensitive) in the recurring n-grams from this chunk
            if sf_lower_candidate in actual_recurring_phrases_in_chunk_map:
                validated_sfs_for_this_motif.add(sf_lower_candidate)
            # else:
                # Optional: Implement fuzzy matching here if desired
                # print(f"      SF '{sf_lower_candidate}' from LLM not found in chunk's recurring phrases.")

        if validated_sfs_for_this_motif: # Only keep motif if it has at least one SF validated against chunk's n-grams
            refined_motif_entry = motif_dict.copy()
            refined_motif_entry['surface_forms'] = sorted(list(validated_sfs_for_this_motif))
            refined_motifs_for_this_chunk.append(refined_motif_entry)
        # else:
            # print(f"    Motif '{motif_dict.get('label')}' discarded for chunk (no SFs validated against recurring n-grams in chunk).")

    return refined_motifs_for_this_chunk

def enhanced_motif_extraction_per_chunk(
    text_chunk_to_analyze: str,
    hf_pipeline_instance,
    hf_tokenizer_instance,
    qid_for_log: str,
    chunk_idx_for_log: int
    ) -> List[Dict]:
    """
    Orchestrates LLM call for motif extraction from a single text chunk,
    parses the JSON response, and performs chunk-level SF validation.
    """
    # 1. Build the prompt using the text_chunk_to_analyze
    prompt_str_for_llm = create_enhanced_motif_prompt(text_chunk_to_analyze, MAX_MOTIFS_PER_CHUNK)

    raw_llm_response_text = "" # Initialize for logging/return
    parsed_and_validated_motifs_from_chunk = []

    for attempt in range(LLM_RETRY_ATTEMPTS):
        # print(f"      LLM Call attempt {attempt + 1} for chunk {chunk_idx_for_log} (QID {qid_for_log})...")
        raw_llm_response_text = call_local_llm_for_raw_response( # From llm_interaction cell
            prompt_str_for_llm,
            hf_pipeline_instance,
            hf_tokenizer_instance,
            qid_for_log,
            chunk_idx_for_log
        )

        if not raw_llm_response_text:
            print(f"      LLM call attempt {attempt + 1} for chunk {chunk_idx_for_log} (QID {qid_for_log}) returned empty string. Retrying if possible...")
            if attempt < LLM_RETRY_ATTEMPTS - 1: time.sleep(1) # Small delay before retry
            continue # Go to next attempt

        # 2. Parse LLM's raw response into list of motif dicts (basic structure validation)
        llm_parsed_motifs = parse_and_validate_llm_json_response(
            raw_llm_response_text,
            qid_for_log,
            chunk_idx_for_log,
            prompt_str_for_llm # Pass full prompt for logging context on error
        )

        if llm_parsed_motifs: # If parsing was successful and returned some motif objects
            # 3. Validate the SFs of these parsed motifs against the current text_chunk_to_analyze
            # print(f"      Validating SFs for {len(llm_parsed_motifs)} LLM-parsed motifs from chunk {chunk_idx_for_log}...")
            motifs_validated_in_chunk = validate_and_refine_surface_forms_in_chunk(
                llm_parsed_motifs,
                text_chunk_to_analyze # Validate against the text chunk LLM saw
            )

            if motifs_validated_in_chunk:
                # print(f"      Found {len(motifs_validated_in_chunk)} motifs with SFs validated against current chunk's n-grams.")
                parsed_and_validated_motifs_from_chunk = motifs_validated_in_chunk
                break # Successful extraction and validation for this chunk, exit retry loop
            else:
                print(f"      Chunk {chunk_idx_for_log} (QID {qid_for_log}): LLM provided motifs, but none had SFs validated against chunk's recurring n-grams (Attempt {attempt+1}). Retrying if possible...")
                if attempt < LLM_RETRY_ATTEMPTS - 1: time.sleep(1)
        else: # Parsing itself failed or returned empty list (e.g., LLM output `[]`)
            print(f"      Motif JSON parsing/validation attempt {attempt + 1} yielded no structured motifs for chunk {chunk_idx_for_log} (QID {qid_for_log}). Retrying if possible...")
            if attempt < LLM_RETRY_ATTEMPTS - 1: time.sleep(1)

    if not parsed_and_validated_motifs_from_chunk:
        print(f"      No valid & validated motifs extracted from chunk {chunk_idx_for_log} (QID {qid_for_log}) after {LLM_RETRY_ATTEMPTS} attempts.")

    return parsed_and_validated_motifs_from_chunk


def consolidate_raw_motifs(list_of_all_raw_motifs: List[Dict]) -> List[Dict]:
    """Consolidates motifs extracted from all chunks, primarily by label, merging surface forms."""
    if not list_of_all_raw_motifs:
        return []

    consolidated_motifs_map = {}
    for motif_obj in list_of_all_raw_motifs:
        label = motif_obj.get("label")
        description = motif_obj.get("description") # Assuming description from first encounter is fine
        surface_forms = motif_obj.get("surface_forms", [])

        if not isinstance(label, str) or not label.strip() or \
           not isinstance(description, str) or \
           not isinstance(surface_forms, list):
            # print(f"    [WARN] Skipping malformed raw motif during consolidation: {motif_obj}")
            continue # Skip malformed motif objects

        if label not in consolidated_motifs_map:
            consolidated_motifs_map[label] = {
                "label": label,
                "description": description,
                "surface_forms": sorted(list(set(sf.lower().strip() for sf in surface_forms if sf.strip()))) # Store unique, lowercased SFs
            }
        else: # Label exists, merge surface forms
            existing_sfs_set = set(consolidated_motifs_map[label].get("surface_forms", []))
            new_sfs_set = set(sf.lower().strip() for sf in surface_forms if sf.strip())
            consolidated_motifs_map[label]["surface_forms"] = sorted(list(existing_sfs_set.union(new_sfs_set)))
            # Could also choose to update/average description, but simplest is to keep the first one.

    return list(consolidated_motifs_map.values())

def filter_surface_forms_by_global_frequency(
    consolidated_motifs_list: List[Dict],
    full_qid_corpus_text: str, # The entire original text for the QID
    min_global_freq: int = MIN_SF_FREQ_FOR_FINAL_MOTIFS # Uses config
    ) -> List[Dict]:
    """
    Filters surface forms in consolidated motifs based on their frequency
    in the full QID corpus text.
    """
    if not consolidated_motifs_list:
        return []

    final_globally_filtered_motifs = []
    # print(f"  Filtering SFs from {len(consolidated_motifs_list)} consolidated motifs (min global freq: {min_global_freq})...")

    # Pre-calculate frequencies of all potential SFs for efficiency if many motifs share SFs
    # For now, count per SF as it appears.

    for motif_obj in consolidated_motifs_list:
        globally_frequent_sfs_for_this_motif = []
        original_sfs_for_this_motif = motif_obj.get("surface_forms", [])

        for sf_str in original_sfs_for_this_motif:
            # Count occurrences in the *full* QID corpus text
            # count_sf_occurrences handles case-insensitivity and re.escape
            count = count_sf_occurrences(full_qid_corpus_text, sf_str)

            if count >= min_global_freq:
                globally_frequent_sfs_for_this_motif.append(sf_str) # Keep original casing from consolidated list
                # print(f"    SF '{sf_str}' (label '{motif_obj.get('label')}') kept, global freq: {count}")
            # else:
                # print(f"    SF '{sf_str}' (label '{motif_obj.get('label')}') filtered out, global freq: {count} (min_req: {min_global_freq})")

        if globally_frequent_sfs_for_this_motif: # Only keep motif if it has at least one globally frequent SF
            filtered_motif_entry = motif_obj.copy() # Make a copy to modify
            filtered_motif_entry["surface_forms"] = sorted(list(set(globally_frequent_sfs_for_this_motif))) # Ensure unique and sorted
            final_globally_filtered_motifs.append(filtered_motif_entry)
        # else:
            # print(f"    Motif '{motif_obj.get('label')}' discarded (no globally frequent SFs after filtering).")

    return final_globally_filtered_motifs

print("Cell 4: Motif Processing and Validation Utilities loaded.")

Cell 4: Motif Processing and Validation Utilities loaded.


In [None]:
# @title 5. MDL Calculations
# Purpose: All functions related to MDL cost calculations (L(H), L(D|H)) and BDM.
# Cell 5: MDL Calculations

def initialize_bdm_instance():
    """Initializes and returns a BDM instance."""
    print("Initializing BDM instance...")
    try:
        # Using default CTM-based NKS for 2D data
        bdm_instance = BDM(ndim=2)
        print("BDM instance initialized successfully (ndim=2, default CTM-based).")
        return bdm_instance
    except Exception as e_bdm_init:
        print(f"CRITICAL: Failed to initialize BDM instance: {e_bdm_init}")
        if "CTM data files" in str(e_bdm_init).lower() or "dataset" in str(e_bdm_init).lower():
            print("  BDM Error Hint: This might be related to missing/corrupted CTM data files for PyBDM.")
            print("  Ensure PyBDM is installed correctly and can access/download its data.")
            print("  You might need to run the following once in your environment:")
            print("  from pybdm import get_ctm_dataset; get_ctm_dataset(force=False)") # Add force=True if re-download needed
        return None

def text_to_binary_matrix(text_input: str, size: tuple = MATRIX_SIZE_GLOBAL) -> np.ndarray:
    """
    Converts a text string to a binary matrix using its SHA256 hash.
    The BDM calculation is sensitive to this representation.
    """
    if not isinstance(text_input, str) or not text_input.strip():
        # Return a zero matrix for empty or invalid input to ensure BDM gets a matrix
        # BDM of a zero matrix will be low, reflecting low complexity.
        return np.zeros(size, dtype=int)

    # Using SHA256 hash of the text
    hash_obj = hashlib.sha256(text_input.encode('utf-8', 'ignore')) # Ignore encoding errors for robustness
    hash_digest = hash_obj.hexdigest() # 64 hex characters = 256 bits

    required_bits = size[0] * size[1]

    # Convert hex digest to a binary string
    binary_string_from_hash = bin(int(hash_digest, 16))[2:].zfill(256) # Ensure it's 256 bits long

    # Handle matrix size vs. hash size
    if required_bits <= 256:
        # Take the first 'required_bits' from the hash's binary string
        binary_string_for_matrix = binary_string_from_hash[:required_bits]
    else:
        # If matrix is larger than hash bits, pad with zeros (or could repeat hash, but zero-padding is simpler)
        # print(f"    WARN (text_to_binary_matrix): Matrix size {size} requires {required_bits} bits, SHA256 provides 256. Padding with zeros.")
        binary_string_for_matrix = binary_string_from_hash.ljust(required_bits, '0')

    bits_for_matrix = [int(b) for b in binary_string_for_matrix]
    return np.array(bits_for_matrix).reshape(size)

def compute_bdm_for_text(text_input: str, bdm_instance: BDM, matrix_s: tuple = MATRIX_SIZE_GLOBAL) -> float:
    """
    Computes BDM for a given text string.
    Uses a truncated prefix of the text for hashing if text is too long,
    as BDM on very large matrices derived from full text can be slow and BDM
    is often used on fixed-size representations.
    """
    if not isinstance(text_input, str) or not text_input.strip():
        return 0.0 # BDM of effectively nothing is low complexity

    # Using a prefix of the text for BDM calculation to keep it manageable
    # and to compare changes in a consistent part of the data.
    text_for_hash = text_input[:MAX_TEXT_FOR_BDM_HASH] if len(text_input) > MAX_TEXT_FOR_BDM_HASH else text_input

    if not text_for_hash.strip(): # If the prefix is also empty/whitespace
        return 0.0

    binary_matrix = text_to_binary_matrix(text_for_hash, size=matrix_s)
    try:
        bdm_value = bdm_instance.bdm(binary_matrix)
        return bdm_value
    except Exception as e_bdm:
        print(f"      Error during BDM calculation for text (full len {len(text_input)}, hashed part len {len(text_for_hash)}): {e_bdm}")
        return -1.0 # Indicate error

def calculate_L_H_token_based_structured(structured_motifs_list: List[Dict]) -> float:
    """
    Calculates L(H) - the cost of defining the list of structured motifs.
    Uses token-based costs defined in the configuration.
    """
    if not structured_motifs_list:
        return 0.0

    total_lh_cost = 0.0
    for motif_obj in structured_motifs_list:
        if not isinstance(motif_obj, dict):
            continue # Skip malformed entries

        current_motif_lh = 0.0

        # Cost for the symbolic label itself
        label_str = motif_obj.get('label', "")
        if isinstance(label_str, str) and label_str.strip():
            current_motif_lh += MOTIF_SYMBOLIC_LABEL_COST

        # Cost for the textual description
        description_str = motif_obj.get('description', "")
        if isinstance(description_str, str) and description_str.strip():
            current_motif_lh += MOTIF_DESCRIPTION_TEXT_BASE_COST
            current_motif_lh += len(tokenize_phrase(description_str)) * MOTIF_DESCRIPTION_TOKEN_COST

        # Cost for listing the surface forms in the L(H) definition
        surface_forms_list = motif_obj.get('surface_forms', [])
        if isinstance(surface_forms_list, list) and surface_forms_list: # Check if list is not empty
            valid_sfs_for_lh = [sf for sf in surface_forms_list if isinstance(sf, str) and sf.strip()]
            if valid_sfs_for_lh:
                current_motif_lh += MOTIF_SURFACE_FORMS_LIST_BASE_COST
                for sf_str in valid_sfs_for_lh:
                    current_motif_lh += len(tokenize_phrase(sf_str)) * MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH

        total_lh_cost += current_motif_lh
    return total_lh_cost

def llm_compress_text_structured(text_to_compress: str, structured_motifs_list: List[Dict]) -> str:
    """
    Compresses text by replacing occurrences of motif surface forms with their symbolic labels.
    Text and surface forms are lowercased for matching.
    """
    if not isinstance(text_to_compress, str):
        return ""
    if not structured_motifs_list:
        return text_to_compress.lower() # Return lowercased original if no motifs

    # Start with the lowercased version of the text to compress
    compressed_text = text_to_compress.lower()

    # It can be beneficial to process motifs that have longer surface forms first,
    # or motifs that are more specific. For now, we process in given order.
    # A more advanced strategy might sort structured_motifs_list here.

    for motif_obj in structured_motifs_list:
        if not isinstance(motif_obj, dict):
            continue

        label = motif_obj.get('label', None)
        surface_forms = motif_obj.get('surface_forms', [])

        if not (isinstance(label, str) and label.strip()) or \
           not (isinstance(surface_forms, list) and surface_forms): # Ensure label is valid and SF list is not empty
            continue # Skip motif if label is bad or no surface forms

        placeholder = label # Use the actual symbolic label as the placeholder

        # Sort this motif's own surface forms by length (descending)
        # to ensure longer matches are prioritized over shorter sub-matches.
        sorted_sfs_for_this_motif = sorted(
            [sf for sf in surface_forms if isinstance(sf, str) and sf.strip()],
            key=len,
            reverse=True
        )

        for sf_str in sorted_sfs_for_this_motif:
            sf_lower = sf_str.lower() # Surface forms are already lowercased during consolidation & filtering in this pipeline
            try:
                # Replace all occurrences of this surface form (case-insensitive due to prior lowercasing)
                # re.escape handles any special regex characters in sf_lower
                compressed_text = re.sub(r'\b' + re.escape(sf_lower) + r'\b', placeholder, compressed_text)
                # Using word boundaries \b to avoid partial word matches, e.g. 'cat' in 'caterpillar'
            except re.error as re_e:
                print(f"    Regex error during compression for SF '{sf_str}' of motif '{label}': {re_e}. Skipping this SF.")
                continue # Skip this surface form, try next
    return compressed_text

def compute_mdl_cost_for_text_block(
    full_qid_corpus_str: str,
    final_motifs_to_evaluate: List[Dict], # Renamed for clarity
    bdm_instance: BDM,
    matrix_s: tuple = MATRIX_SIZE_GLOBAL
    ) -> tuple[float, float, float]:
    """
    Computes L(H), L(D|H), and Total MDL for a text block given a final set of motifs.
    """
    if not isinstance(full_qid_corpus_str, str):
        full_qid_corpus_str = "" # Ensure it's a string

    # Calculate L(H): Cost of defining the final_motifs_to_evaluate
    l_h = calculate_L_H_token_based_structured(final_motifs_to_evaluate)

    # Calculate L(D|H): Compress the full corpus and then compute BDM
    compressed_text_block = llm_compress_text_structured(full_qid_corpus_str, final_motifs_to_evaluate)
    l_d_h = compute_bdm_for_text(compressed_text_block, bdm_instance, matrix_s)

    if l_d_h < 0: # Indicates a BDM computation error
        # print(f"    WARN (compute_mdl_cost): BDM error for L(D|H). L(H) was {l_h:.4f}")
        return l_h, -1.0, -1.0 # Propagate error for L(D|H) and total

    total_mdl_cost = l_h + l_d_h
    return l_h, l_d_h, total_mdl_cost

print("Cell 5: MDL Calculation Utilities loaded.")

Cell 5: MDL Calculation Utilities loaded.


In [None]:
# @title 6. Main Orchestration
# Purpose: The top-level script that orchestrates the entire workflow, importing functions from other modules. This would contain your main() function.
# Cell 6: Main Pipeline Orchestration

def main():
    # --- Initial Setup and Welcome Message ---
    print("--- MWP Enhanced: Batched LLM, Validated SFs, Structured Motifs, Token-L(H), BDM L(D|H) ---")
    print(f"Timestamp: {time.asctime()}")
    print("\n--- Configuration Summary ---")
    print(f"LLM Model: {LOCAL_LLM_MODEL_ID}, Quantization: {USE_QUANTIZATION_FOR_LOCAL_LLM}")
    print(f"LLM Batch Size (Responses): {LLM_BATCH_SIZE_RESPONSES}, Retries: {LLM_RETRY_ATTEMPTS}")
    print(f"Max Text Chars per LLM Prompt Chunk: {MAX_TEXT_CHARS_PER_LLM_PROMPT_CHUNK}")
    print(f"Max New Tokens for LLM Motif Extraction: {LLM_MAX_NEW_TOKENS_ENHANCED_MOTIF}")
    print(f"Max Motifs to Request per Chunk: {MAX_MOTIFS_PER_CHUNK}")
    print(f"L(H) Costs: Label={MOTIF_SYMBOLIC_LABEL_COST}, DescBase={MOTIF_DESCRIPTION_TEXT_BASE_COST}, DescToken={MOTIF_DESCRIPTION_TOKEN_COST}, SFListBase={MOTIF_SURFACE_FORMS_LIST_BASE_COST}, SFTokenInLH={MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH}")
    print(f"SF Validation (Chunk N-grams) Min Freq: {MIN_SF_FREQ_IN_CHUNK_VALIDATION}")
    print(f"SF Filtering (Global Corpus) Min Freq: {MIN_SF_FREQ_FOR_FINAL_MOTIFS}")
    print(f"BDM Hash Prefix Length: {MAX_TEXT_FOR_BDM_HASH}, BDM Matrix: {MATRIX_SIZE_GLOBAL}")
    print(f"Debug Log File: {LLM_DEBUG_LOG_FILE}")
    print("--- End Configuration Summary ---\n")

    # --- Initialize Debug Log File ---
    try:
        with open(LLM_DEBUG_LOG_FILE, "w", encoding="utf-8") as f: # Overwrite for new run
            f.write(f"LLM Motif Debug Log - Run Started: {time.asctime()}\n")
            f.write(f"Model ID: {LOCAL_LLM_MODEL_ID}\n")
            f.write(f"Pipeline Config: return_full_text=False (Implicit in enhanced_motif_extraction_per_chunk)\n")
            f.write("Enhanced Prompting, Chunk-level SF Validation, and Global SF Filtering Active\n---\n")
    except Exception as e_log:
        print(f"WARN: Could not initialize debug log file {LLM_DEBUG_LOG_FILE}: {e_log}")

    # --- Initialize LLM and BDM ---
    # Functions are expected to be defined in previous cells:
    # initialize_llm_pipeline (from llm_interaction.py / Cell 3)
    # initialize_bdm_instance (from mdl_calculations.py / Cell 5)

    hf_pipeline_instance, hf_tokenizer_instance = initialize_llm_pipeline() # Uses defaults from config
    if not hf_pipeline_instance or not hf_tokenizer_instance:
        print("CRITICAL: Exiting due to LLM pipeline initialization failure.")
        return

    bdm_instance_main = initialize_bdm_instance()
    if not bdm_instance_main:
        print("CRITICAL: Exiting due to BDM initialization failure.")
        return

    # --- Load Phase 2 Collated Data ---
    if not os.path.exists(P2_COLLATED_FILE):
        print(f"ERROR: Phase 2 output file not found: {P2_COLLATED_FILE}")
        return
    print(f"Loading Phase 2 output from: {P2_COLLATED_FILE}...")
    phase2_data_content = None
    try:
        with open(P2_COLLATED_FILE, 'r', encoding='utf-8') as f:
            phase2_data_content = json.load(f)
    except Exception as e_load:
        print(f"Error loading or parsing {P2_COLLATED_FILE}: {e_load}")
        return

    all_qid_mdl_results_list = [] # Stores result dict for each QID

    # --- Determine QIDs to Process ---
    aggregated_content_by_qid_from_file = phase2_data_content.get("aggregated_pdf_content_by_qid", {})
    if not aggregated_content_by_qid_from_file:
        print(f"No 'aggregated_pdf_content_by_qid' key found or data is empty in {P2_COLLATED_FILE}.")
        return

    qids_to_process_this_run = []
    if P3_QIDS_TO_PROCESS_THEMATICALLY and isinstance(P3_QIDS_TO_PROCESS_THEMATICALLY, list) and P3_QIDS_TO_PROCESS_THEMATICALLY:
        qids_to_process_this_run = [qid for qid in P3_QIDS_TO_PROCESS_THEMATICALLY if qid in aggregated_content_by_qid_from_file]
        if not qids_to_process_this_run:
            print(f"Warning: None of specified QIDs {P3_QIDS_TO_PROCESS_THEMATICALLY} found in loaded data. Exiting.")
            return
    else: # Fallback if P3_QIDS_TO_PROCESS_THEMATICALLY is not set or empty
        qids_to_process_limit_fallback = 1 # Process only the first QID found in data as a fallback
        print(f"P3_QIDS_TO_PROCESS_THEMATICALLY not set or empty. Processing up to {qids_to_process_limit_fallback} QID(s) from data as fallback.")
        qids_to_process_this_run = list(aggregated_content_by_qid_from_file.keys())[:qids_to_process_limit_fallback]
        if not qids_to_process_this_run:
            print("No QIDs available in data to process based on fallback. Exiting.")
            return
    print(f"\nMDL analysis will run for these QIDs: {qids_to_process_this_run}\n")

    # --- Main QID Processing Loop ---
    for qid_identifier_str in qids_to_process_this_run:
        print(f"--- Analyzing Data for QID: {qid_identifier_str} ---")

        list_of_individual_response_structs = aggregated_content_by_qid_from_file.get(qid_identifier_str, [])
        # Extract actual text strings from the list of response dicts/structs
        actual_response_texts_for_qid = [
            item.get("text", "") for item in list_of_individual_response_structs
            if isinstance(item, dict) and isinstance(item.get("text"), str) and item.get("text","").strip()
        ]
        if not actual_response_texts_for_qid:
            print(f"  No valid text strings extracted from responses for QID {qid_identifier_str}. Skipping.")
            print("-" * 50); continue

        # Create the full corpus for this QID (used for baseline BDM and final L(D|H))
        full_corpus_text_for_qid = "\n\n<RSP_SEP>\n\n".join(actual_response_texts_for_qid)
        # Optional: Preprocess the full corpus if SF counting and compression should operate on preprocessed text
        # For now, SF counting and compression will use full_corpus_text_for_qid.lower()

        if len(full_corpus_text_for_qid.strip()) < 100: # Arbitrary threshold for meaningful analysis
            print(f"  Skipping QID {qid_identifier_str}: combined text too short ({len(full_corpus_text_for_qid)} chars).")
            print("-" * 50); continue

        num_total_responses_for_qid = len(actual_response_texts_for_qid)
        print(f"  Combined corpus for QID {qid_identifier_str} has {len(full_corpus_text_for_qid)} chars from {num_total_responses_for_qid} individual responses.")

        # Calculate baseline L(D) for the entire QID's text
        baseline_bdm_original_corpus = compute_bdm_for_text(full_corpus_text_for_qid, bdm_instance_main, MATRIX_SIZE_GLOBAL)
        if baseline_bdm_original_corpus < 0: # BDM error
            print(f"  Error computing baseline BDM for QID {qid_identifier_str}. Skipping this QID.")
            # Log an error entry for this QID
            error_entry = {"qid": qid_identifier_str, "status": "ERROR_BASELINE_BDM", "baseline_mdl": -1.0}
            all_qid_mdl_results_list.append(error_entry)
            print("-" * 50); continue

        current_qid_baseline_mdl_cost = baseline_bdm_original_corpus # L(H) is 0 for baseline
        print(f"  Baseline MDL for QID {qid_identifier_str} (L(D_orig)): {current_qid_baseline_mdl_cost:.4f}")

        # --- Batched Enhanced Motif Extraction ---
        # Create text chunks where each chunk is a string of joined individual responses
        batched_text_chunks_for_llm_input = []
        for i in range(0, len(actual_response_texts_for_qid), LLM_BATCH_SIZE_RESPONSES):
            batch_of_responses = actual_response_texts_for_qid[i:i + LLM_BATCH_SIZE_RESPONSES]
            # Preprocess the joined text of the chunk before sending to LLM
            chunk_text_for_llm = preprocess_corpus_for_motif_extraction("\n\n<RSP_SEP>\n\n".join(batch_of_responses))
            batched_text_chunks_for_llm_input.append(chunk_text_for_llm)

        print(f"  QID {qid_identifier_str}: Processing {num_total_responses_for_qid} responses in {len(batched_text_chunks_for_llm_input)} preprocessed chunks (batch size: {LLM_BATCH_SIZE_RESPONSES} responses).")

        # Accumulate motifs from all chunks (these have passed chunk-level SF validation)
        raw_motifs_chunk_validated = []
        for chunk_idx, text_chunk_being_analyzed in enumerate(batched_text_chunks_for_llm_input):
            print(f"    Analyzing chunk {chunk_idx + 1}/{len(batched_text_chunks_for_llm_input)} for QID {qid_identifier_str} (processed chunk len: {len(text_chunk_being_analyzed)} chars)...")
            if len(text_chunk_being_analyzed.strip()) < 50: # Skip very small/empty chunks after preprocessing
                print(f"      Chunk {chunk_idx+1} (QID {qid_identifier_str}) too short after preprocessing, skipping.")
                continue

            # enhanced_motif_extraction_per_chunk is from motif_processing.py / Cell 4
            motifs_from_this_chunk_list = enhanced_motif_extraction_per_chunk(
                text_chunk_being_analyzed,
                hf_pipeline_instance,
                hf_tokenizer_instance,
                qid_identifier_str,
                chunk_idx + 1
            )
            if motifs_from_this_chunk_list:
                print(f"      Extracted {len(motifs_from_this_chunk_list)} chunk-validated motif objects from chunk {chunk_idx+1} (QID {qid_identifier_str}).")
                raw_motifs_chunk_validated.extend(motifs_from_this_chunk_list)
            else:
                print(f"      No valid & chunk-validated motifs extracted from chunk {chunk_idx+1} (QID {qid_identifier_str}).")

        # --- Initialize result entry for this QID ---
        current_qid_result_entry = {
            "qid": qid_identifier_str,
            "corpus_len_chars": len(full_corpus_text_for_qid),
            "num_responses": num_total_responses_for_qid,
            "baseline_mdl": current_qid_baseline_mdl_cost,
            "final_refined_motifs": [], # Will hold globally refined motifs
            "l_h_final_motifs": 0.0,
            "l_d_h_final_motifs": current_qid_baseline_mdl_cost, # Default to baseline if no motifs
            "total_mdl_with_final_motifs": current_qid_baseline_mdl_cost, # Default to baseline
            "compression_achieved": 0.0,
            "num_raw_motifs_chunk_validated": len(raw_motifs_chunk_validated),
            "num_consolidated_motifs": 0,
            "num_globally_refined_motifs": 0
        }

        if not raw_motifs_chunk_validated:
            print(f"  No raw & chunk-validated motifs extracted by LLM for QID {qid_identifier_str} from any chunk.")
            all_qid_mdl_results_list.append(current_qid_result_entry)
            print("-" * 50); continue # Go to next QID

        print(f"  Total {len(raw_motifs_chunk_validated)} raw (chunk-validated) motifs extracted from LLM for QID {qid_identifier_str}.")

        # --- Motif Consolidation Step ---
        # consolidate_raw_motifs is from motif_processing.py / Cell 4
        consolidated_motifs_list = consolidate_raw_motifs(raw_motifs_chunk_validated)
        current_qid_result_entry["num_consolidated_motifs"] = len(consolidated_motifs_list)
        print(f"  Consolidated into {len(consolidated_motifs_list)} unique motifs (by label) for QID {qid_identifier_str}.")

        if not consolidated_motifs_list:
            print(f"  No unique motifs left after consolidation for QID {qid_identifier_str}.")
            all_qid_mdl_results_list.append(current_qid_result_entry)
            print("-" * 50); continue

        # --- Global Surface Form Frequency Filtering ---
        print(f"  Refining {len(consolidated_motifs_list)} consolidated motifs by GLOBAL SF frequency (min freq: {MIN_SF_FREQ_FOR_FINAL_MOTIFS})...")
        # filter_surface_forms_by_global_frequency is from motif_processing.py / Cell 4
        globally_refined_motifs_for_mdl = filter_surface_forms_by_global_frequency(
            consolidated_motifs_list,
            full_corpus_text_for_qid, # Filter against the full original corpus for this QID
            min_global_freq=MIN_SF_FREQ_FOR_FINAL_MOTIFS
        )
        current_qid_result_entry["num_globally_refined_motifs"] = len(globally_refined_motifs_for_mdl)
        print(f"  Globally refined into {len(globally_refined_motifs_for_mdl)} motifs for QID {qid_identifier_str}.")

        if not globally_refined_motifs_for_mdl:
            print(f"  No motifs left after GLOBAL surface form frequency refinement for QID {qid_identifier_str}.")
            all_qid_mdl_results_list.append(current_qid_result_entry)
            print("-" * 50); continue

        print(f"  Final Globally Refined Motifs for QID {qid_identifier_str} (for MDL eval):")
        for idx, mo_final in enumerate(globally_refined_motifs_for_mdl): # Iterate through globally_refined_motifs_for_mdl
            print(f"    --- Refined Motif {idx+1} ---")
            print(f"      Label: {mo_final.get('label', 'N/A')}, Desc: {mo_final.get('description','N/A')[:70]}..., SFs ({len(mo_final.get('surface_forms',[]))}): {mo_final.get('surface_forms',[])}")

        # --- Final MDL Calculation ---
        # compute_mdl_cost_for_text_block is from mdl_calculations.py / Cell 5
        l_h_final_val, l_d_h_final_val, total_mdl_with_final_motifs_val = compute_mdl_cost_for_text_block(
            full_corpus_text_for_qid,
            globally_refined_motifs_for_mdl, # Use globally refined motifs
            bdm_instance_main,
            MATRIX_SIZE_GLOBAL
        )

        current_qid_result_entry["final_refined_motifs"] = globally_refined_motifs_for_mdl # Store the motifs used
        current_qid_result_entry["l_h_final_motifs"] = l_h_final_val

        if l_d_h_final_val < 0: # BDM error during L(D|H) calculation
            print(f"  Error computing MDL cost with final refined motifs for QID {qid_identifier_str} (BDM error in L(D|H)).")
            current_qid_result_entry.update({"l_d_h_final_motifs": -1.0, "total_mdl_with_final_motifs": -1.0, "compression_achieved": "BDM_ERROR"})
            all_qid_mdl_results_list.append(current_qid_result_entry)
            print("-" * 50); continue

        current_qid_result_entry["l_d_h_final_motifs"] = l_d_h_final_val
        current_qid_result_entry["total_mdl_with_final_motifs"] = total_mdl_with_final_motifs_val
        compression_final_val = current_qid_baseline_mdl_cost - total_mdl_with_final_motifs_val
        current_qid_result_entry["compression_achieved"] = compression_final_val

        print(f"  L(H) for final motifs: {l_h_final_val:.4f}")
        print(f"  L(D|H) compressed full corpus: {l_d_h_final_val:.4f}")
        print(f"  Total MDL cost with final motifs: {total_mdl_with_final_motifs_val:.4f}")

        result_status_str = f"SUCCESS: Compression: {compression_final_val:.4f}" if compression_final_val > 0.0001 else f"NOTE: No sig. comp. Diff: {compression_final_val:.4f}"
        print(f"  {result_status_str}")
        all_qid_mdl_results_list.append(current_qid_result_entry)
        print("-" * 50)

    # --- Summary Printing and Saving Results ---
    print("\n--- Overall QID-based MDL Analysis Summary (Enhanced Pipeline) ---")
    if not all_qid_mdl_results_list:
        print("No QIDs were processed or no valid results generated.")
    else:
        valid_results_for_stats = [r for r in all_qid_mdl_results_list if isinstance(r.get('compression_achieved'), float) and r.get('l_h_final_motifs', -1.0) >= 0]
        num_qids_processed = len(all_qid_mdl_results_list)
        num_qids_with_valid_mdl = len(valid_results_for_stats)
        num_compressed_qids = sum(1 for r in valid_results_for_stats if r['compression_achieved'] > 0.0001)

        successful_compressions_values = [r['compression_achieved'] for r in valid_results_for_stats if r['compression_achieved'] > 0.0001]
        avg_compression_val = np.mean(successful_compressions_values) if successful_compressions_values else 0
        max_compression_val = np.max(successful_compressions_values) if successful_compressions_values else 0

        print(f"Total QIDs targeted for analysis: {len(qids_to_process_this_run)}")
        print(f"Total QID result entries logged: {num_qids_processed}")
        print(f"Number of QIDs with valid MDL calculations: {num_qids_with_valid_mdl}")
        print(f"Number of QIDs where compression was achieved: {num_compressed_qids}")
        if num_compressed_qids > 0:
            print(f"  Average compression (for successful cases): {avg_compression_val:.4f}")
            print(f"  Maximum compression achieved across QIDs: {max_compression_val:.4f}")
        else:
            print("  No compression achieved for any QID in this run.")

        output_filename_qids_final = os.path.join(BASE_PROJECT_DIR, "mdl_analysis_per_qid_enhanced_pipeline_vLatest.json")
        try:
            with open(output_filename_qids_final, "w", encoding="utf-8") as f_out:
                json.dump(all_qid_mdl_results_list, f_out, indent=2, ensure_ascii=False) # ensure_ascii=False for non-latin chars
            print(f"Detailed QID-based results saved to {output_filename_qids_final}")
        except Exception as e_save:
            print(f"Error saving QID-based results to {output_filename_qids_final}: {e_save}")

if __name__ == "__main__":
    # Ensure this script is run after cells 1-5 have defined their constants and functions
    print("Executing main MDL pipeline...")
    main()
    print("Main MDL pipeline execution finished.")

Executing main MDL pipeline...
--- MWP Enhanced: Batched LLM, Validated SFs, Structured Motifs, Token-L(H), BDM L(D|H) ---
Timestamp: Sun Jun  1 06:25:13 2025

--- Configuration Summary ---
LLM Model: google/gemma-2b-it, Quantization: True
LLM Batch Size (Responses): 5, Retries: 2
Max Text Chars per LLM Prompt Chunk: 7000
Max New Tokens for LLM Motif Extraction: 800
Max Motifs to Request per Chunk: 3
L(H) Costs: Label=0.5, DescBase=0.5, DescToken=0.1, SFListBase=0.25, SFTokenInLH=0.1
SF Validation (Chunk N-grams) Min Freq: 2
SF Filtering (Global Corpus) Min Freq: 2
BDM Hash Prefix Length: 2000, BDM Matrix: (8, 8)
Debug Log File: ./llm_motif_debug_log_mwp_enhanced.txt
--- End Configuration Summary ---

--- Initializing LLM Pipeline (model: google/gemma-2b-it, quantization: True, return_full_text: False) ---
Using device: cuda
Loading tokenizer for google/gemma-2b-it...


tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

BitsAndBytesConfig created for google/gemma-2b-it, compute_dtype: torch.bfloat16.
Loading local model google/gemma-2b-it (Quantization active: True)...


config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Device set to use cuda:0


Local LLM pipeline for google/gemma-2b-it initialized successfully.
Initializing BDM instance...
BDM instance initialized successfully (ndim=2, default CTM-based).
Loading Phase 2 output from: ./Phase2_PDF_Collated_Texts/phase2_collated_pdf_texts.json...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



MDL analysis will run for these QIDs: ['Q4']

--- Analyzing Data for QID: Q4 ---
  Combined corpus for QID Q4 has 129501 chars from 209 individual responses.
  Baseline MDL for QID Q4 (L(D_orig)): 121.3693
  QID Q4: Processing 209 responses in 42 preprocessed chunks (batch size: 5 responses).
    Analyzing chunk 1/42 for QID Q4 (processed chunk len: 3158 chars)...
    [WARN] Invalid motif object structure in LLM JSON for QID Q4, Chunk 1, Item 1. Skipping item: {'label': 'Exceptions to Individual Rights', 'description': 'The text discusses the need for exceptions to certain individual rights in the employment context due to competing public interests.', 'sur...
      Motif JSON parsing/validation attempt 1 yielded no structured motifs for chunk 1 (QID Q4). Retrying if possible...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


    [WARN] Invalid motif object structure in LLM JSON for QID Q4, Chunk 1, Item 1. Skipping item: {'label': 'Exceptions to Individual Rights', 'description': 'The text discusses the need for exceptions to certain individual rights in the employment context due to competing public interests.', 'sur...
      Motif JSON parsing/validation attempt 2 yielded no structured motifs for chunk 1 (QID Q4). Retrying if possible...
      No valid & validated motifs extracted from chunk 1 (QID Q4) after 2 attempts.
      No valid & chunk-validated motifs extracted from chunk 1 (QID Q4).
    Analyzing chunk 2/42 for QID Q4 (processed chunk len: 3262 chars)...
    [WARN] Invalid motif object structure in LLM JSON for QID Q4, Chunk 2, Item 1. Skipping item: {'label': 'Exceptions to Data Rights in Employment Context', 'description': 'The excerpt argues for stronger data rights for Australians, highlighting their current limitations compared to internation...
    [WARN] Invalid motif object structure in 

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


    [WARN] Invalid motif object structure in LLM JSON for QID Q4, Chunk 2, Item 1. Skipping item: {'label': 'Exceptions to Data Rights in Employment Context', 'description': 'The excerpt argues for stronger data rights for Australians, highlighting their current limitations compared to internation...
    [WARN] Invalid motif object structure in LLM JSON for QID Q4, Chunk 2, Item 2. Skipping item: {'label': 'Exceptions for Employee Rights', 'description': 'The excerpt emphasizes the need for careful consideration when granting specific exceptions to individual rights in the employment context, ...
    [WARN] Invalid motif object structure in LLM JSON for QID Q4, Chunk 2, Item 3. Skipping item: {}...
      Motif JSON parsing/validation attempt 2 yielded no structured motifs for chunk 2 (QID Q4). Retrying if possible...
      No valid & validated motifs extracted from chunk 2 (QID Q4) after 2 attempts.
      No valid & chunk-validated motifs extracted from chunk 2 (QID Q4).
    Analyzing 

KeyboardInterrupt: 

# SEPARATOR

---

In [None]:
# @title Revised MWP
# --- Imports ---
import json
import os
import hashlib
import numpy as np
from pybdm import BDM
import re
import time
from typing import List, Dict # For type hinting

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline

# --- Configuration ---
BASE_PROJECT_DIR = '/content/drive/MyDrive/Colab Notebooks/Legal/' # !!! EXAMPLE - UPDATE THIS PATH !!!
PHASE2_OUTPUT_DIR = os.path.join(BASE_PROJECT_DIR, 'Phase2_PDF_Collated_Texts/')
P2_COLLATED_FILE = os.path.join(PHASE2_OUTPUT_DIR, 'phase2_collated_pdf_texts.json')

P3_QIDS_TO_PROCESS_THEMATICALLY = ["Q4"] # Process only Q4 for this example

# --- BDM and LLM Model Configuration ---
MATRIX_SIZE_GLOBAL = (8, 8)
LOCAL_LLM_MODEL_ID = 'google/gemma-2b-it' # Or your verified 'gemma-3n-e4b-it'
USE_QUANTIZATION_FOR_LOCAL_LLM = True
LLM_BATCH_SIZE = 5
LLM_RETRY_ATTEMPTS = 2
MAX_TEXT_PER_LLM_PROMPT_CHUNK = 7000 # Max characters for the text_block to be analyzed within a single LLM prompt
LLM_MAX_NEW_TOKENS_MOTIF_EXTRACTION = 700 # Max tokens for LLM to generate for motif extraction

# --- Token-Based L(H) Configuration ---
MOTIF_SYMBOLIC_LABEL_COST = 0.5
MOTIF_DESCRIPTION_TEXT_BASE_COST = 0.5
MOTIF_DESCRIPTION_TOKEN_COST = 0.1
MOTIF_SURFACE_FORMS_LIST_BASE_COST = 0.25
MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH = 0.1

# --- Logging File ---
LLM_DEBUG_LOG_FILE = "llm_motif_debug_log_mwp.txt" # New log file for this version

# --- Helper Function Definitions (Tokenization, L(H), BDM, Compression) ---

def tokenize_phrase(phrase_text: str) -> List[str]:
    if not isinstance(phrase_text, str): return []
    phrase_text = phrase_text.lower()
    tokens = phrase_text.split()
    return [t for t in tokens if t]

def calculate_L_H_token_based_structured(structured_motifs_list: List[Dict]) -> float:
    if not structured_motifs_list: return 0.0
    total_lh_cost = 0.0
    for motif_obj in structured_motifs_list:
        if not isinstance(motif_obj, dict): continue
        current_motif_lh = 0
        label_str = motif_obj.get('label', "")
        if label_str and isinstance(label_str, str) and label_str.strip():
            current_motif_lh += MOTIF_SYMBOLIC_LABEL_COST

        description_str = motif_obj.get('description', "")
        if description_str and isinstance(description_str, str) and description_str.strip():
            current_motif_lh += MOTIF_DESCRIPTION_TEXT_BASE_COST
            current_motif_lh += len(tokenize_phrase(description_str)) * MOTIF_DESCRIPTION_TOKEN_COST

        surface_forms_list = motif_obj.get('surface_forms', [])
        if surface_forms_list and isinstance(surface_forms_list, list):
            valid_sfs_for_lh = [sf for sf in surface_forms_list if isinstance(sf, str) and sf.strip()]
            if valid_sfs_for_lh:
                current_motif_lh += MOTIF_SURFACE_FORMS_LIST_BASE_COST
                for sf_str in valid_sfs_for_lh:
                    current_motif_lh += len(tokenize_phrase(sf_str)) * MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH
        total_lh_cost += current_motif_lh
    return total_lh_cost

def llm_compress_text_structured(text_to_compress: str, structured_motifs_list: List[Dict]) -> str:
    if not isinstance(text_to_compress, str): return ""
    if not structured_motifs_list: return text_to_compress.lower()

    compressed_text = text_to_compress.lower()

    for motif_obj in structured_motifs_list:
        if not isinstance(motif_obj, dict): continue

        label = motif_obj.get('label', None)
        surface_forms = motif_obj.get('surface_forms', [])

        if not label or not surface_forms or not isinstance(surface_forms, list):
            continue

        placeholder = label

        sorted_sfs_for_this_motif = sorted(
            [sf for sf in surface_forms if isinstance(sf, str) and sf.strip()],
            key=len,
            reverse=True
        )

        for sf_str in sorted_sfs_for_this_motif:
            try:
                compressed_text = re.sub(re.escape(sf_str.lower()), placeholder, compressed_text, flags=re.IGNORECASE)
            except re.error as re_e:
                print(f"    Regex error during compression for SF '{sf_str}' of motif '{label}': {re_e}. Skipping.")
                continue
    return compressed_text

def text_to_binary_matrix(text_input: str, size=MATRIX_SIZE_GLOBAL) -> np.ndarray:
    if not text_input or not isinstance(text_input, str) or not text_input.strip():
        return np.zeros(size, dtype=int)
    hash_obj = hashlib.sha256(text_input.encode('utf-8', 'ignore'))
    hash_digest = hash_obj.hexdigest()
    required_bits = size[0] * size[1]
    binary_string = bin(int(hash_digest, 16))[2:].zfill(256)
    binary_string_padded = binary_string.ljust(required_bits, '0')
    bits = [int(b) for b in binary_string_padded[:required_bits]]
    return np.array(bits).reshape(size)

def compute_bdm_for_text(text_input: str, bdm_instance: BDM, matrix_s=MATRIX_SIZE_GLOBAL) -> float:
    if not text_input or not isinstance(text_input, str) or not text_input.strip() : return 0.0
    MAX_TEXT_FOR_BDM_HASH = 2000
    text_for_hash = text_input if len(text_input) <= MAX_TEXT_FOR_BDM_HASH else text_input[:MAX_TEXT_FOR_BDM_HASH]
    matrix = text_to_binary_matrix(text_for_hash, size=matrix_s)
    try:
        bdm_value = bdm_instance.bdm(matrix)
        return bdm_value
    except Exception as e_bdm:
        print(f"      Error during BDM calculation for text (len {len(text_input)}, hashed part len {len(text_for_hash)}): {e_bdm}")
        return -1.0

def compute_mdl_cost_for_text_block(full_qid_corpus_str: str,
                                    final_consolidated_motifs: List[Dict],
                                    bdm_instance: BDM,
                                    matrix_s=MATRIX_SIZE_GLOBAL) -> tuple[float, float, float]:
    if not isinstance(full_qid_corpus_str, str) : full_qid_corpus_str = ""
    l_h = calculate_L_H_token_based_structured(final_consolidated_motifs)
    compressed_text_block = llm_compress_text_structured(full_qid_corpus_str, final_consolidated_motifs)
    l_d_h = compute_bdm_for_text(compressed_text_block, bdm_instance, matrix_s)
    if l_d_h < 0: return l_h, -1.0, -1.0
    return l_h, l_d_h, l_h + l_d_h

# --- LLM Motif Extraction ---

def build_llm_prompt_for_motifs(text_block_for_prompt: str) -> str:
    if len(text_block_for_prompt) > MAX_TEXT_PER_LLM_PROMPT_CHUNK:
        # print(f"    Note: Text block for LLM prompt truncated from {len(text_block_for_prompt)} to {MAX_TEXT_PER_LLM_PROMPT_CHUNK} chars.")
        text_block_for_prompt = text_block_for_prompt[:MAX_TEXT_PER_LLM_PROMPT_CHUNK]

    prompt = f"""You will receive a set of comments from different people answering the same question.

Your task is to identify up to 5 key recurring themes.

For each theme, provide:
- A short label like [DATA_PRIVACY]
- A 1-sentence description of the theme
- 2–3 short phrases that often appear in the text (surface forms)

Output MUST be a valid JSON list of objects, where each object has "label", "description", and "surface_forms" keys.
Example of one object in the list:
{{
  "label": "[EXAMPLE_LABEL]",
  "description": "A concise description of the example theme.",
  "surface_forms": ["short repeated phrase 1", "another short repeated phrase"]
}}
If no clear motifs are found, output an empty JSON list: `[]`.
Do not include any other text, explanations, or markdown code fences around the JSON.

Set of comments to analyze:
\"\"\"
{text_block_for_prompt}
\"\"\"

Valid JSON Output (ensure it's a list of objects, or an empty list [] if no themes):
"""
    return prompt.strip()

def call_local_llm_for_motifs(prompt_str: str, hf_pipeline, hf_tokenizer, qid_for_log: str, chunk_idx_for_log: int) -> str:
    messages_for_template = [{"role": "user", "content": prompt_str}]
    prompt_formatted_for_llm = hf_tokenizer.apply_chat_template(
        messages_for_template, tokenize=False, add_generation_prompt=True
    )

    generation_args = {
        "max_new_tokens": LLM_MAX_NEW_TOKENS_MOTIF_EXTRACTION,
        "do_sample": False,
        "pad_token_id": hf_tokenizer.pad_token_id
    }
    # print(f"    DEBUG (call_local_llm): Sending prompt to LLM for QID {qid_for_log}, Chunk {chunk_idx_for_log}. Prompt length for pipeline: {len(prompt_formatted_for_llm)}")

    outputs = hf_pipeline(prompt_formatted_for_llm, **generation_args)

    # print(f"    DEBUG (call_local_llm): Raw 'outputs' from hf_pipeline for QID {qid_for_log}, Chunk {chunk_idx_for_log}:\n<<<<<\n{outputs}\n>>>>>")

    if outputs and isinstance(outputs, list) and len(outputs) > 0 and \
       outputs[0] and isinstance(outputs[0], dict) and 'generated_text' in outputs[0]:
        # Since pipeline initialized with return_full_text=False, generated_text is only new tokens
        assistant_response_text = outputs[0]['generated_text'].strip()
        # print(f"    DEBUG (call_local_llm): 'assistant_response_text' (return_full_text=False) for QID {qid_for_log}, Chunk {chunk_idx_for_log} (len {len(assistant_response_text)}):\n<<<<<\n{assistant_response_text[:1000]}...\n>>>>>")
        return assistant_response_text
    else:
        print(f"    WARN (call_local_llm): LLM pipeline returned unexpected or empty structure for QID {qid_for_log}, Chunk {chunk_idx_for_log}.")
        return ""

def extract_motifs_from_llm_response(llm_response_str: str, qid_for_log:str, chunk_idx_for_log:int, prompt_sent:str) -> List[Dict]:
    # print(f"    DEBUG (extract_motifs): Raw LLM response for QID {qid_for_log}, Chunk {chunk_idx_for_log}:\n<<<<<\n{llm_response_str}\n>>>>>")
    json_str_candidate = llm_response_str.strip()

    if json_str_candidate.startswith("```json"):
        json_str_candidate = json_str_candidate[len("```json"):].strip()
    if json_str_candidate.startswith("```"):
        json_str_candidate = json_str_candidate[len("```"):].strip()
    if json_str_candidate.endswith("```"):
        json_str_candidate = json_str_candidate[:-len("```")].strip()

    # print(f"    DEBUG (extract_motifs): Final json_str candidate for QID {qid_for_log}, Chunk {chunk_idx_for_log}:\n<<<<<\n{json_str_candidate}\n>>>>>")

    if not json_str_candidate or json_str_candidate.lower() == "[]" or "no_themes_found" in json_str_candidate.lower() or "no clear motifs" in json_str_candidate.lower():
        return []

    try:
        parsed_data = json.loads(json_str_candidate)
        if isinstance(parsed_data, dict): # Handle if LLM returns a single object instead of a list
            # print(f"    DEBUG (extract_motifs): LLM returned a single JSON object, wrapping in list for QID {qid_for_log}, Chunk {chunk_idx_for_log}.")
            parsed_data = [parsed_data]

        if not isinstance(parsed_data, list):
            raise ValueError("Parsed JSON is not a list (or a single object that could be wrapped).")

        valid_motifs_from_json = []
        for item in parsed_data:
            if isinstance(item, dict) and \
               'label' in item and isinstance(item['label'], str) and \
               'description' in item and isinstance(item['description'], str) and \
               item['label'].strip().startswith('[') and item['label'].strip().endswith(']'): # Check for brackets after stripping

                sf = item.get('surface_forms', [])
                if not isinstance(sf, list) or not all(isinstance(s, str) for s in sf):
                    sf = []

                valid_motifs_from_json.append({
                    "label": item['label'].strip(),
                    "description": item['description'].strip(),
                    "surface_forms": [s.strip() for s in sf if s.strip()]
                })
        return valid_motifs_from_json
    except (json.JSONDecodeError, ValueError) as e:
        print(f"    [WARN] Motif JSON parsing/validation failed for QID {qid_for_log}, Chunk {chunk_idx_for_log}: {e}")
        with open(LLM_DEBUG_LOG_FILE, "a", encoding="utf-8") as f:
            f.write(f"\n--- QID: {qid_for_log} --- CHUNK: {chunk_idx_for_log} ---\n")
            f.write(f"PROMPT SENT (first 500 chars):\n{prompt_sent[:500]}...\n")
            f.write(f"RAW LLM RESPONSE (Error: {type(e).__name__}):\n{llm_response_str}\n")
            f.write(f"EXTRACTED JSON STRING CANDIDATE (Error: {type(e).__name__}):\n{json_str_candidate}\n")
        return []

def get_motifs_for_text_chunks(
    list_of_response_strings: List[str],
    batch_size: int,
    hf_pipeline,
    hf_tokenizer,
    qid_for_log: str
    ) -> List[Dict]:
    all_extracted_motifs_from_all_chunks = []
    batched_text_blocks = []
    for i in range(0, len(list_of_response_strings), batch_size):
        current_batch_responses = list_of_response_strings[i:i + batch_size]
        text_block_for_chunk = "\n\n<RSP_SEP>\n\n".join(current_batch_responses)
        batched_text_blocks.append(text_block_for_chunk)

    print(f"  QID {qid_for_log}: Processing {len(list_of_response_strings)} responses in {len(batched_text_blocks)} chunks (batch size: {batch_size} responses).")

    for chunk_idx, text_chunk_to_analyze in enumerate(batched_text_blocks):
        print(f"    Analyzing chunk {chunk_idx + 1}/{len(batched_text_blocks)} for QID {qid_for_log} (len: {len(text_chunk_to_analyze)} chars)...")
        if len(text_chunk_to_analyze.strip()) < 50:
            print(f"      Chunk {chunk_idx+1} for QID {qid_for_log} too short, skipping.")
            continue

        prompt_for_llm = build_llm_prompt_for_motifs(text_chunk_to_analyze)
        motifs_from_this_chunk = []
        for attempt in range(LLM_RETRY_ATTEMPTS):
            try:
                raw_llm_response_text = call_local_llm_for_motifs(prompt_for_llm, hf_pipeline, hf_tokenizer, qid_for_log, chunk_idx + 1)
                if not raw_llm_response_text:
                    print(f"      LLM call attempt {attempt + 1} for chunk {chunk_idx+1} (QID {qid_for_log}) returned empty. Retrying if possible...")
                    if attempt < LLM_RETRY_ATTEMPTS - 1: time.sleep(1)
                    continue

                motifs_attempt_from_chunk = extract_motifs_from_llm_response(raw_llm_response_text, qid_for_log, chunk_idx+1, prompt_for_llm)
                if motifs_attempt_from_chunk:
                    motifs_from_this_chunk = motifs_attempt_from_chunk
                    break
                else:
                    print(f"      Motif extraction/parsing attempt {attempt + 1} yielded no valid motifs for chunk {chunk_idx+1} (QID {qid_for_log}). Retrying if possible...")
                    if attempt < LLM_RETRY_ATTEMPTS - 1: time.sleep(1)
            except Exception as e_call:
                 print(f"      Critical error during LLM call/parsing attempt {attempt + 1} for chunk {chunk_idx+1} (QID {qid_for_log}): {e_call}")
                 if attempt < LLM_RETRY_ATTEMPTS - 1: time.sleep(1)
                 with open(LLM_DEBUG_LOG_FILE, "a", encoding="utf-8") as f: # Log critical errors too
                    f.write(f"\n--- QID: {qid_for_log} --- CHUNK: {chunk_idx+1} --- ATTEMPT: {attempt+1} ---\n")
                    f.write(f"PROMPT SENT (first 500 chars):\n{prompt_for_llm[:500]}...\n")
                    f.write(f"CRITICAL LLM CALL/PARSING ERROR:\n{e_call}\n")
        if motifs_from_this_chunk:
            print(f"      Extracted {len(motifs_from_this_chunk)} motif objects from chunk {chunk_idx+1} (QID {qid_for_log}).")
            all_extracted_motifs_from_all_chunks.extend(motifs_from_this_chunk)
        else:
            print(f"      No valid motifs extracted from chunk {chunk_idx+1} (QID {qid_for_log}) after {LLM_RETRY_ATTEMPTS} attempts.")
    return all_extracted_motifs_from_all_chunks



# (Keep all your existing imports, constants, and helper functions as they are)
# ... (calculate_L_H_token_based_structured, llm_compress_text_structured, etc.) ...
# ... (get_motifs_for_text_chunks, build_llm_prompt_for_motifs, etc.) ...

# --- NEW CONSTANT for Surface Form Filtering ---
MIN_SF_FREQUENCY_IN_FULL_CORPUS = 2 # Surface form must appear at least this many times in the full QID corpus

def count_sf_occurrences(corpus_text: str, surface_form: str) -> int:
    """Counts case-insensitive occurrences of a surface form in the corpus text."""
    if not corpus_text or not surface_form:
        return 0
    return len(re.findall(re.escape(surface_form.lower()), corpus_text.lower(), flags=re.IGNORECASE))

# --- Main Execution Logic ---
def main():
    print("--- MWP: Batched LLM Motif Extraction, Refined SFs, Structured Motifs, Token-L(H), BDM L(D|H) ---") # Updated title
    print(f"Using L(H) Cost Params: Label={MOTIF_SYMBOLIC_LABEL_COST}, DescBase={MOTIF_DESCRIPTION_TEXT_BASE_COST}, DescToken={MOTIF_DESCRIPTION_TOKEN_COST}, SFListBase={MOTIF_SURFACE_FORMS_LIST_BASE_COST}, SFTokenInLH={MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH}")
    print(f"LLM: {LOCAL_LLM_MODEL_ID}, Batch Size: {LLM_BATCH_SIZE} responses, Retries: {LLM_RETRY_ATTEMPTS}, Max Text per Prompt: {MAX_TEXT_PER_LLM_PROMPT_CHUNK}, Max New Tokens: {LLM_MAX_NEW_TOKENS_MOTIF_EXTRACTION}")
    print(f"Min SF Frequency for Refinement: {MIN_SF_FREQUENCY_IN_FULL_CORPUS}")

    with open(LLM_DEBUG_LOG_FILE, "w", encoding="utf-8") as f:
        f.write(f"LLM Motif Debug Log - {time.asctime()}\nModel: {LOCAL_LLM_MODEL_ID}\nPipeline return_full_text=False\n")

    # ... (LLM Pipeline Initialization - same as your last complete cell) ...
    local_llm_pipeline_instance = None
    local_llm_tokenizer_instance = None
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    try:
        print(f"Loading tokenizer for {LOCAL_LLM_MODEL_ID}...")
        local_llm_tokenizer_instance = AutoTokenizer.from_pretrained(LOCAL_LLM_MODEL_ID)
        if local_llm_tokenizer_instance.pad_token is None:
            print("Tokenizer setting pad_token = eos_token.")
            local_llm_tokenizer_instance.pad_token = local_llm_tokenizer_instance.eos_token

        bnb_config = None
        quant_active = False
        if USE_QUANTIZATION_FOR_LOCAL_LLM and torch.cuda.is_available():
            try:
                compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
                bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=compute_dtype, bnb_4bit_use_double_quant=True)
                quant_active = True
                print(f"BNB config created for {LOCAL_LLM_MODEL_ID}, compute_dtype: {compute_dtype}.")
            except Exception as e_bnb:
                print(f"WARN: Failed to create BitsAndBytesConfig: {e_bnb}. Quantization disabled.")

        print(f"Loading local model {LOCAL_LLM_MODEL_ID} (Quantization: {quant_active})...")
        model_kwargs = {"device_map": "auto", "trust_remote_code": True}
        if quant_active: model_kwargs["quantization_config"] = bnb_config
        else:
            if device.type == 'cuda': model_kwargs["torch_dtype"] = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

        local_llm_model_instance = AutoModelForCausalLM.from_pretrained(LOCAL_LLM_MODEL_ID, **model_kwargs)
        if local_llm_tokenizer_instance.pad_token_id == local_llm_tokenizer_instance.eos_token_id:
             local_llm_model_instance.config.pad_token_id = local_llm_model_instance.config.eos_token_id

        local_llm_pipeline_instance = pipeline(
            "text-generation", model=local_llm_model_instance, tokenizer=local_llm_tokenizer_instance, return_full_text=False
        )
        print(f"Local LLM pipeline for {LOCAL_LLM_MODEL_ID} initialized successfully (return_full_text=False).")
    except Exception as e:
        print(f"CRITICAL: Failed to initialize local LLM pipeline: {e}")
        return

    # ... (BDM Initialization - same as your last complete cell, ensure it uses BDM(ndim=2)) ...
    try:
        bdm_instance_main = BDM(ndim=2)
        print("BDM instance initialized successfully (ndim=2, default CTM-based).")
    except Exception as e_bdm_init:
        print(f"CRITICAL: Failed to initialize BDM instance: {e_bdm_init}")
        if "CTM data files" in str(e_bdm_init):
            print("  BDM Error Hint: This might be related to missing CTM data files for PyBDM.")
        return

    # ... (Loading Phase 2 Data - same as your last complete cell) ...
    if not os.path.exists(P2_COLLATED_FILE):
        print(f"ERROR: Phase 2 output file not found: {P2_COLLATED_FILE}")
        return
    print(f"Loading Phase 2 output from: {P2_COLLATED_FILE}...")
    phase2_data_content = None
    try:
        with open(P2_COLLATED_FILE, 'r', encoding='utf-8') as f:
            phase2_data_content = json.load(f)
    except Exception as e:
        print(f"Error loading or parsing {P2_COLLATED_FILE}: {e}")
        return

    # --- ADD DEBUG PRINT HERE ---
    if phase2_data_content:
        print(f"DEBUG: Type of phase2_data_content: {type(phase2_data_content)}")
        if isinstance(phase2_data_content, dict):
            print(f"DEBUG: Keys in phase2_data_content: {list(phase2_data_content.keys())}")
            aggregated_content_map_debug = phase2_data_content.get("aggregated_pdf_content_by_qid")
            if aggregated_content_map_debug:
                print(f"DEBUG: 'aggregated_pdf_content_by_qid' found. Type: {type(aggregated_content_map_debug)}")
                print(f"DEBUG: QIDs available in 'aggregated_pdf_content_by_qid': {list(aggregated_content_map_debug.keys())}")
                if "Q4" in aggregated_content_map_debug:
                    print(f"DEBUG: Number of raw response items for Q4 in loaded file: {len(aggregated_content_map_debug['Q4'])}")
                else:
                    print("DEBUG: Q4 not found in 'aggregated_pdf_content_by_qid' at this stage.")
            else:
                print("DEBUG: 'aggregated_pdf_content_by_qid' key NOT found in loaded data.")
    else:
        print("DEBUG: phase2_data_content is None after attempting to load.")
    # --- END DEBUG PRINT ---


    all_qid_mdl_results = []
    qids_to_process_final = [] # Renamed from qids_to_target
    aggregated_content_by_qid_map = {}

    if phase2_data_content:
        aggregated_content_by_qid_map = phase2_data_content.get("aggregated_pdf_content_by_qid", {})
    if not aggregated_content_by_qid_map:
        print(f"No 'aggregated_pdf_content_by_qid' key found or data is empty in {P2_COLLATED_FILE}.")
        return

    if P3_QIDS_TO_PROCESS_THEMATICALLY and isinstance(P3_QIDS_TO_PROCESS_THEMATICALLY, list) and P3_QIDS_TO_PROCESS_THEMATICALLY:
        qids_to_process_final = [qid for qid in P3_QIDS_TO_PROCESS_THEMATICALLY if qid in aggregated_content_by_qid_map]
        if not qids_to_process_final:
            print(f"Warning: None of QIDs {P3_QIDS_TO_PROCESS_THEMATICALLY} found in loaded data.")
            return
    else:
        qids_to_process_limit_fallback = 1
        qids_to_process_final = list(aggregated_content_by_qid_map.keys())[:qids_to_process_limit_fallback]
        if not qids_to_process_final:
            print("No QIDs available to process based on fallback.")
            return
    print(f"\nMDL analysis will run for these QIDs: {qids_to_process_final}\n")


    for qid_str in qids_to_process_final:
        list_of_response_item_dicts = aggregated_content_by_qid_map.get(qid_str, [])
        actual_response_text_strings = [
            item.get("text", "") for item in list_of_response_item_dicts
            if isinstance(item, dict) and isinstance(item.get("text"), str) and item.get("text","").strip()
        ]
        if not actual_response_text_strings:
            print(f"No valid text strings extracted from responses for QID {qid_str}. Skipping.")
            continue

        print(f"--- Analyzing Aggregated Text for QID: {qid_str} ---")
        full_corpus_for_qid_str = "\n\n<RSP_SEP>\n\n".join(actual_response_text_strings)
        if len(full_corpus_for_qid_str.strip()) < 100:
            print(f"  Skipping QID {qid_str}: combined text too short ({len(full_corpus_for_qid_str)} chars).")
            continue

        num_responses_for_qid = len(actual_response_text_strings)
        print(f"  Combined corpus for QID {qid_str} has {len(full_corpus_for_qid_str)} chars from {num_responses_for_qid} individual responses.")

        baseline_l_d_original = compute_bdm_for_text(full_corpus_for_qid_str, bdm_instance_main, MATRIX_SIZE_GLOBAL)
        if baseline_l_d_original < 0:
            print(f"  Error computing baseline BDM for QID {qid_str}. Skipping this QID.")
            continue
        baseline_total_mdl_cost = baseline_l_d_original
        print(f"  Baseline MDL for QID {qid_str} (L(D_orig)): {baseline_total_mdl_cost:.4f}")

        raw_motifs_from_all_chunks = get_motifs_for_text_chunks(
            actual_response_text_strings, LLM_BATCH_SIZE,
            local_llm_pipeline_instance, local_llm_tokenizer_instance, qid_str
        )

        qid_result_entry = { # Initialize with baseline values
            "qid": qid_str, "corpus_len_for_qid": len(full_corpus_for_qid_str),
            "num_responses": num_responses_for_qid,
            "baseline_mdl": baseline_total_mdl_cost, "final_motifs": [],
            "l_h_motifs": 0.0, "l_d_h_motifs": baseline_total_mdl_cost,
            "total_mdl_motifs": baseline_total_mdl_cost,
            "compression_achieved": 0.0,
            "num_raw_motifs_extracted": len(raw_motifs_from_all_chunks),
            "num_consolidated_motifs": 0,
            "num_refined_motifs": 0 # For motifs after SF frequency filtering
        }

        if not raw_motifs_from_all_chunks:
            print(f"  No raw motifs extracted by LLM for QID {qid_str} from any chunk.")
            all_qid_mdl_results.append(qid_result_entry)
            print("-" * 40)
            continue

        print(f"  Extracted {len(raw_motifs_from_all_chunks)} raw motif objects from LLM for QID {qid_str} (across all chunks).")

        # --- Motif Consolidation Step ---
        consolidated_motifs_temp_dict = {}
        for motif_obj_raw in raw_motifs_from_all_chunks:
            label = motif_obj_raw.get("label")
            description = motif_obj_raw.get("description") # Ensure your LLM output uses this key
            surface_forms = motif_obj_raw.get("surface_forms")
            if label and description and surface_forms is not None:
                if label not in consolidated_motifs_temp_dict:
                    consolidated_motifs_temp_dict[label] = motif_obj_raw
                else:
                    existing_sfs_set = set(consolidated_motifs_temp_dict[label].get("surface_forms", []))
                    new_sfs_set = set(surface_forms)
                    consolidated_motifs_temp_dict[label]["surface_forms"] = sorted(list(dict.fromkeys(list(existing_sfs_set.union(new_sfs_set))))) # Deduplicate merged SFs

        consolidated_motifs_list = list(consolidated_motifs_temp_dict.values())
        qid_result_entry["num_consolidated_motifs"] = len(consolidated_motifs_list)
        print(f"  Consolidated into {len(consolidated_motifs_list)} unique motifs (by label) for QID {qid_str}.")

        if not consolidated_motifs_list:
            print(f"  No unique motifs left after consolidation for QID {qid_str}.")
            all_qid_mdl_results.append(qid_result_entry)
            print("-" * 40)
            continue

        print(f"  Consolidated Motifs for QID {qid_str} (BEFORE SF refinement):")
        for idx, mo_con in enumerate(consolidated_motifs_list):
            print(f"    --- Cons. Motif {idx+1} ---")
            print(f"      Label: {mo_con.get('label', 'N/A')}")
            print(f"      Description: {mo_con.get('description','N/A')[:70]}...") # Truncate for brevity
            print(f"      Surface Forms ({len(mo_con.get('surface_forms',[]))}): {mo_con.get('surface_forms',[])[:3]}...") # Print first 3 SFs

        # --- NEW: Refine Consolidated Motifs by Surface Form Frequency ---
        print(f"  Refining {len(consolidated_motifs_list)} consolidated motifs by SF frequency (min freq: {MIN_SF_FREQUENCY_IN_FULL_CORPUS})...")
        refined_motifs_for_mdl = []
        for motif_obj_consolidated in consolidated_motifs_list:
            original_sfs = motif_obj_consolidated.get("surface_forms", [])
            frequent_sfs = []
            for sf_str in original_sfs:
                if count_sf_occurrences(full_corpus_for_qid_str, sf_str) >= MIN_SF_FREQUENCY_IN_FULL_CORPUS:
                    frequent_sfs.append(sf_str)

            if frequent_sfs: # Only keep motif if it has at least one frequent SF
                refined_motif = motif_obj_consolidated.copy() # Create a copy to modify
                refined_motif["surface_forms"] = frequent_sfs
                refined_motifs_for_mdl.append(refined_motif)
            # else:
                # print(f"    Motif '{motif_obj_consolidated.get('label')}' discarded after SF refinement (no frequent SFs).")

        qid_result_entry["num_refined_motifs"] = len(refined_motifs_for_mdl)
        print(f"  Refined into {len(refined_motifs_for_mdl)} motifs with frequent surface forms for QID {qid_str}.")

        if not refined_motifs_for_mdl:
            print(f"  No motifs left after surface form frequency refinement for QID {qid_str}.")
            all_qid_mdl_results.append(qid_result_entry) # Save with 0 refined motifs
            print("-" * 40)
            continue

        print(f"  Final Refined Motifs for QID {qid_str} (for MDL eval):")
        for idx, mo_final in enumerate(refined_motifs_for_mdl):
            print(f"    --- Refined Motif {idx+1} ---")
            print(f"      Label: {mo_final.get('label', 'N/A')}")
            print(f"      Description: {mo_final.get('description','N/A')[:70]}...")
            print(f"      Surface Forms ({len(mo_final.get('surface_forms',[]))}): {mo_final.get('surface_forms',[])}")

        # Calculate MDL cost using the full original corpus for this QID and the *refined* motifs
        l_h_final, l_d_h_final, total_mdl_with_final_motifs = compute_mdl_cost_for_text_block(
            full_corpus_for_qid_str,
            refined_motifs_for_mdl, # Use refined motifs
            bdm_instance_main,
            MATRIX_SIZE_GLOBAL
        )

        qid_result_entry["final_motifs"] = refined_motifs_for_mdl # Store refined motifs
        qid_result_entry["l_h_motifs"] = l_h_final

        if l_d_h_final < 0:
            print(f"  Error computing MDL cost with final refined motifs for QID {qid_str} (BDM error in L(D|H)).")
            # ... (update qid_result_entry for error) ...
            all_qid_mdl_results.append(qid_result_entry)
            print("-" * 40)
            continue

        # ... (update qid_result_entry with final MDL costs and compression_final - same as before) ...
        qid_result_entry["l_d_h_motifs"] = l_d_h_final
        qid_result_entry["total_mdl_motifs"] = total_mdl_with_final_motifs
        compression_final = baseline_total_mdl_cost - total_mdl_with_final_motifs
        qid_result_entry["compression_achieved"] = compression_final

        print(f"  L(H) (Token-based Structured) for final refined motifs of QID {qid_str}: {l_h_final:.4f}")
        print(f"  L(D|H) (BDM-based) compressed full corpus complexity for QID {qid_str}: {l_d_h_final:.4f}")
        print(f"  Total MDL cost with final refined motifs for QID {qid_str}: {total_mdl_with_final_motifs:.4f}")

        result_status_str = ""
        if compression_final > 0.0001:
            result_status_str = f"SUCCESS: Compression achieved: {compression_final:.4f}"
        else:
            result_status_str = f"NOTE: No significant compression (or cost increased). Diff: {compression_final:.4f}"
        print(f"  {result_status_str}")
        all_qid_mdl_results.append(qid_result_entry)
        print("-" * 40)

    # --- Summary Printing and Saving Results ---
    # (This part remains largely the same)
    print("\n--- Overall QID-based MDL Analysis Summary (Batched LLM, Refined SFs, Structured Motifs, Token-L(H)) ---") # Updated title
    # ... (rest of summary and saving logic) ...
    if not all_qid_mdl_results:
        print("No QIDs were processed or no valid results generated.")
    else:
        # ... (same summary logic as before, ensure keys match qid_result_entry) ...
        output_filename_qids_final = "mdl_analysis_per_qid_batched_llm_refinedSF_v1.json" # New filename
        # ... (save to file) ...

if __name__ == "__main__":
    main()

--- MWP: Batched LLM Motif Extraction, Refined SFs, Structured Motifs, Token-L(H), BDM L(D|H) ---
Using L(H) Cost Params: Label=0.5, DescBase=0.5, DescToken=0.1, SFListBase=0.25, SFTokenInLH=0.1
LLM: google/gemma-2b-it, Batch Size: 5 responses, Retries: 2, Max Text per Prompt: 7000, Max New Tokens: 700
Min SF Frequency for Refinement: 2
Using device: cuda
Loading tokenizer for google/gemma-2b-it...
BNB config created for google/gemma-2b-it, compute_dtype: torch.bfloat16.
Loading local model google/gemma-2b-it (Quantization: True)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Local LLM pipeline for google/gemma-2b-it initialized successfully (return_full_text=False).
BDM instance initialized successfully (ndim=2, default CTM-based).
Loading Phase 2 output from: /content/drive/MyDrive/Colab Notebooks/Legal/Phase2_PDF_Collated_Texts/phase2_collated_pdf_texts.json...
DEBUG: Type of phase2_data_content: <class 'dict'>
DEBUG: Keys in phase2_data_content: ['metadata', 'aggregated_pdf_content_by_qid']
DEBUG: 'aggregated_pdf_content_by_qid' found. Type: <class 'dict'>
DEBUG: QIDs available in 'aggregated_pdf_content_by_qid': ['Q4', 'Q5', 'Q6', 'Q7', 'Q10', 'Q13', 'Q19', 'Q28', 'Q31', 'Q1', 'Q2', 'Q3', 'Q8', 'Q9', 'Q11', 'Q12', 'Q14', 'Q16', 'Q17', 'Q21', 'Q22', 'Q23', 'Q24', 'Q27', 'Q29', 'Q30', 'Q33', 'Q15', 'Q18', 'Q20', 'Q25', 'Q26', 'Q34', 'Q36', 'Q32', 'Q35']
DEBUG: Number of raw response items for Q4 in loaded file: 209

MDL analysis will run for these QIDs: ['Q4']

--- Analyzing Aggregated Text for QID: Q4 ---
  Combined corpus for QID Q4 has 129501 chars fr

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 1 motif objects from chunk 1 (QID Q4).
    Analyzing chunk 2/42 for QID Q4 (len: 3302 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 5 motif objects from chunk 2 (QID Q4).
    Analyzing chunk 3/42 for QID Q4 (len: 3259 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 5 motif objects from chunk 3 (QID Q4).
    Analyzing chunk 4/42 for QID Q4 (len: 3174 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 2 motif objects from chunk 4 (QID Q4).
    Analyzing chunk 5/42 for QID Q4 (len: 3182 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 motif objects from chunk 5 (QID Q4).
    Analyzing chunk 6/42 for QID Q4 (len: 2964 chars)...
      Motif extraction/parsing attempt 1 yielded no valid motifs for chunk 6 (QID Q4). Retrying if possible...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Motif extraction/parsing attempt 2 yielded no valid motifs for chunk 6 (QID Q4). Retrying if possible...
      No valid motifs extracted from chunk 6 (QID Q4) after 2 attempts.
    Analyzing chunk 7/42 for QID Q4 (len: 3365 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 5 motif objects from chunk 7 (QID Q4).
    Analyzing chunk 8/42 for QID Q4 (len: 3017 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 motif objects from chunk 8 (QID Q4).
    Analyzing chunk 9/42 for QID Q4 (len: 3124 chars)...


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 2 motif objects from chunk 9 (QID Q4).
    Analyzing chunk 10/42 for QID Q4 (len: 3258 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 5 motif objects from chunk 10 (QID Q4).
    Analyzing chunk 11/42 for QID Q4 (len: 2970 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 2 motif objects from chunk 11 (QID Q4).
    Analyzing chunk 12/42 for QID Q4 (len: 3141 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 2 motif objects from chunk 12 (QID Q4).
    Analyzing chunk 13/42 for QID Q4 (len: 3317 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 motif objects from chunk 13 (QID Q4).
    Analyzing chunk 14/42 for QID Q4 (len: 3063 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 5 motif objects from chunk 14 (QID Q4).
    Analyzing chunk 15/42 for QID Q4 (len: 3394 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 1 motif objects from chunk 15 (QID Q4).
    Analyzing chunk 16/42 for QID Q4 (len: 3148 chars)...
    [WARN] Motif JSON parsing/validation failed for QID Q4, Chunk 16: Expecting value: line 6 column 3 (char 369)
      Motif extraction/parsing attempt 1 yielded no valid motifs for chunk 16 (QID Q4). Retrying if possible...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


    [WARN] Motif JSON parsing/validation failed for QID Q4, Chunk 16: Expecting value: line 6 column 3 (char 369)
      Motif extraction/parsing attempt 2 yielded no valid motifs for chunk 16 (QID Q4). Retrying if possible...
      No valid motifs extracted from chunk 16 (QID Q4) after 2 attempts.
    Analyzing chunk 17/42 for QID Q4 (len: 3083 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 5 motif objects from chunk 17 (QID Q4).
    Analyzing chunk 18/42 for QID Q4 (len: 3059 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 5 motif objects from chunk 18 (QID Q4).
    Analyzing chunk 19/42 for QID Q4 (len: 2884 chars)...
      Motif extraction/parsing attempt 1 yielded no valid motifs for chunk 19 (QID Q4). Retrying if possible...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Motif extraction/parsing attempt 2 yielded no valid motifs for chunk 19 (QID Q4). Retrying if possible...
      No valid motifs extracted from chunk 19 (QID Q4) after 2 attempts.
    Analyzing chunk 20/42 for QID Q4 (len: 2568 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 motif objects from chunk 20 (QID Q4).
    Analyzing chunk 21/42 for QID Q4 (len: 2897 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 motif objects from chunk 21 (QID Q4).
    Analyzing chunk 22/42 for QID Q4 (len: 3238 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 motif objects from chunk 22 (QID Q4).
    Analyzing chunk 23/42 for QID Q4 (len: 3351 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 motif objects from chunk 23 (QID Q4).
    Analyzing chunk 24/42 for QID Q4 (len: 3363 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 2 motif objects from chunk 24 (QID Q4).
    Analyzing chunk 25/42 for QID Q4 (len: 2922 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 5 motif objects from chunk 25 (QID Q4).
    Analyzing chunk 26/42 for QID Q4 (len: 2490 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 motif objects from chunk 26 (QID Q4).
    Analyzing chunk 27/42 for QID Q4 (len: 2452 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 motif objects from chunk 27 (QID Q4).
    Analyzing chunk 28/42 for QID Q4 (len: 2481 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 5 motif objects from chunk 28 (QID Q4).
    Analyzing chunk 29/42 for QID Q4 (len: 3039 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 2 motif objects from chunk 29 (QID Q4).
    Analyzing chunk 30/42 for QID Q4 (len: 3393 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 2 motif objects from chunk 30 (QID Q4).
    Analyzing chunk 31/42 for QID Q4 (len: 3135 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 motif objects from chunk 31 (QID Q4).
    Analyzing chunk 32/42 for QID Q4 (len: 3298 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 5 motif objects from chunk 32 (QID Q4).
    Analyzing chunk 33/42 for QID Q4 (len: 2537 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 1 motif objects from chunk 33 (QID Q4).
    Analyzing chunk 34/42 for QID Q4 (len: 2562 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 motif objects from chunk 34 (QID Q4).
    Analyzing chunk 35/42 for QID Q4 (len: 3188 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 2 motif objects from chunk 35 (QID Q4).
    Analyzing chunk 36/42 for QID Q4 (len: 3092 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 motif objects from chunk 36 (QID Q4).
    Analyzing chunk 37/42 for QID Q4 (len: 3343 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 5 motif objects from chunk 37 (QID Q4).
    Analyzing chunk 38/42 for QID Q4 (len: 3181 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 motif objects from chunk 38 (QID Q4).
    Analyzing chunk 39/42 for QID Q4 (len: 3319 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 4 motif objects from chunk 39 (QID Q4).
    Analyzing chunk 40/42 for QID Q4 (len: 3366 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 5 motif objects from chunk 40 (QID Q4).
    Analyzing chunk 41/42 for QID Q4 (len: 3204 chars)...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


      Extracted 5 motif objects from chunk 41 (QID Q4).
    Analyzing chunk 42/42 for QID Q4 (len: 2647 chars)...
      Extracted 4 motif objects from chunk 42 (QID Q4).
  Extracted 144 raw motif objects from LLM for QID Q4 (across all chunks).
  Consolidated into 4 unique motifs (by label) for QID Q4.
  Consolidated Motifs for QID Q4 (BEFORE SF refinement):
    --- Cons. Motif 1 ---
      Label: [DATA_PRIVACY]
      Description: A concern regarding the potential erosion of privacy rights due to the...
      Surface Forms (96): ['A breach of privacy when displaying images of private residences', 'A concise summary of the excerpt', 'A concise summary of the excerpt, focusing on exceptions to privacy rights in the employment context']...
    --- Cons. Motif 2 ---
      Label: [EXAMPLE_LABEL]
      Description: A concise summary of the excerpt's argument for stronger data rights f...
      Surface Forms (43): ['Advocate for a consistent age of 18 for defining a child', 'Australian perspec

In [None]:
# @title Revised return_full_text=False
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import time

# --- Configuration (Adjust as needed) ---
LOCAL_LLM_MODEL_ID = 'google/gemma-2b-it' # Or your 'gemma-3n-e4b-it'
USE_QUANTIZATION_FOR_LOCAL_LLM = True
MAX_NEW_TOKENS_TEST = 150 # Default for simple tests

# --- LLM Initialization ---
def initialize_llm_pipeline_for_test(return_full_text_setting: bool):
    print(f"--- Initializing LLM for Test (return_full_text={return_full_text_setting}) ---")
    local_llm_pipeline_instance = None
    local_llm_tokenizer_instance = None
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    try:
        print(f"Loading tokenizer for {LOCAL_LLM_MODEL_ID}...")
        local_llm_tokenizer_instance = AutoTokenizer.from_pretrained(LOCAL_LLM_MODEL_ID)

        if local_llm_tokenizer_instance.pad_token is None:
            print("Tokenizer does not have a pad_token; setting pad_token = eos_token.")
            local_llm_tokenizer_instance.pad_token = local_llm_tokenizer_instance.eos_token
            # Important: The model's config might also need pad_token_id set if it's used during generation
            # For pipeline, usually handled if tokenizer has it.

        bnb_config = None
        quant_active = False
        if USE_QUANTIZATION_FOR_LOCAL_LLM and torch.cuda.is_available():
            try:
                compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
                bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=compute_dtype, bnb_4bit_use_double_quant=True)
                quant_active = True
                print(f"BNB config created for {LOCAL_LLM_MODEL_ID}, compute_dtype: {compute_dtype}.")
            except Exception as e_bnb:
                print(f"WARN: Failed to create BitsAndBytesConfig: {e_bnb}. Quantization disabled.")

        print(f"Loading local model {LOCAL_LLM_MODEL_ID} (Quantization: {quant_active})...")
        model_kwargs = {"device_map": "auto", "trust_remote_code": True}
        if quant_active: model_kwargs["quantization_config"] = bnb_config
        else:
            if device.type == 'cuda': model_kwargs["torch_dtype"] = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

        local_llm_model_instance = AutoModelForCausalLM.from_pretrained(LOCAL_LLM_MODEL_ID, **model_kwargs)

        # Explicitly set pad_token_id in model config if tokenizer's was None
        if local_llm_tokenizer_instance.pad_token_id == local_llm_tokenizer_instance.eos_token_id:
             local_llm_model_instance.config.pad_token_id = local_llm_model_instance.config.eos_token_id


        local_llm_pipeline_instance = pipeline(
            "text-generation",
            model=local_llm_model_instance,
            tokenizer=local_llm_tokenizer_instance,
            return_full_text=return_full_text_setting
        )
        print(f"Local LLM pipeline for {LOCAL_LLM_MODEL_ID} initialized successfully.")
        return local_llm_pipeline_instance, local_llm_tokenizer_instance
    except Exception as e:
        print(f"CRITICAL: Failed to initialize local LLM pipeline for test: {e}")
        return None, None

# --- Test Function ---
def run_llm_test(test_name: str,
                 user_prompt_content: str,
                 hf_pipeline,
                 hf_tokenizer,
                 is_pipeline_returning_full_text: bool, # Argument to know the pipeline's setting
                 max_new_tok=MAX_NEW_TOKENS_TEST):
    print(f"\n--- Running Test: {test_name} (Pipeline return_full_text={is_pipeline_returning_full_text}) ---")
    if not hf_pipeline or not hf_tokenizer:
        print("LLM Pipeline not initialized. Skipping test.")
        return

    messages_for_template = [{"role": "user", "content": user_prompt_content.strip()}]

    try:
        # For Gemma, apply_chat_template adds the generation prompt for the model turn
        prompt_formatted_for_llm = hf_tokenizer.apply_chat_template(
            messages_for_template,
            tokenize=False,
            add_generation_prompt=True
        )
    except Exception as e_template:
        print(f"ERROR applying chat template: {e_template}")
        # This can happen if the tokenizer doesn't have a chat_template defined
        # or if the messages format is wrong.
        print("Ensure your tokenizer has a chat_template (e.g., tokenizer.chat_template).")
        print("For Gemma, messages should be like: [{'role': 'user', 'content': '...'}, {'role': 'model', 'content': '...'}]")
        return

    print(f"Formatted Prompt (sent to pipeline):\n<<<<<\n{prompt_formatted_for_llm}\n>>>>>")

    generation_args = {
        "max_new_tokens": max_new_tok,
        "do_sample": False, # Keep False for predictable output during debugging
        "pad_token_id": hf_tokenizer.pad_token_id # Use the tokenizer's pad_token_id
                                                # which we ensured is set to eos_token_id if originally None
    }

    try:
        start_time = time.time()
        outputs = hf_pipeline(prompt_formatted_for_llm, **generation_args)
        end_time = time.time()
        print(f"LLM call took {end_time - start_time:.2f} seconds.")

        print(f"Raw 'outputs' from hf_pipeline:\n<<<<<\n{outputs}\n>>>>>")

        if outputs and isinstance(outputs, list) and len(outputs) > 0 and \
           outputs[0] and isinstance(outputs[0], dict) and 'generated_text' in outputs[0]:

            generated_text_full = outputs[0]['generated_text']
            print(f"'generated_text' from LLM (len {len(generated_text_full)}):\n<<<<<\n{generated_text_full}\n>>>>>")

            assistant_response = ""
            if is_pipeline_returning_full_text:
                print("Attempting to strip prompt (since is_pipeline_returning_full_text=True)...")
                # Try to strip the exact formatted prompt first
                if generated_text_full.startswith(prompt_formatted_for_llm):
                    assistant_response = generated_text_full[len(prompt_formatted_for_llm):].strip()
                    print(f"Isolated Assistant Response (exact prompt strip):\n<<<<<\n{assistant_response}\n>>>>>")
                else:
                    # Fallback: Look for the model's turn marker if the start doesn't match exactly
                    # This is common if the echoed prompt has slight variations or only part is echoed
                    model_turn_start_token_default = "<start_of_turn>model" # Common for Gemma

                    # Try to get the actual model turn start from chat template if possible
                    # This is more robust if the tokenizer has a proper template
                    try:
                        model_turn_template_parts = hf_tokenizer.apply_chat_template(
                            [{"role": "assistant", "content": ""}], # Empty assistant message
                            tokenize=False,
                            add_generation_prompt=False # We only want the prefix for assistant
                        )
                        # If it adds a newline or space after model token, strip it for matching
                        model_turn_start_token = model_turn_template_parts.strip()
                        if not model_turn_start_token: # Fallback if template gives empty for assistant prefix
                            model_turn_start_token = model_turn_start_token_default
                    except: # If apply_chat_template fails for assistant role, use default
                        model_turn_start_token = model_turn_start_token_default

                    idx = generated_text_full.rfind(model_turn_start_token) # Find the last model turn
                    if idx != -1:
                        potential_response = generated_text_full[idx + len(model_turn_start_token):].strip()
                        if potential_response.startswith("\n"): # Clean leading newline often added by models
                            potential_response = potential_response[1:].strip()

                        # Heuristic: if the stripped part is significantly shorter than original, it's likely the response
                        if len(potential_response) < len(generated_text_full) or not generated_text_full.strip().endswith(prompt_formatted_for_llm.strip()):
                             assistant_response = potential_response
                             print(f"Isolated Assistant Response (using model turn token '{model_turn_start_token}'):\n<<<<<\n{assistant_response}\n>>>>>")
                        else:
                            print(f"WARN: Model turn token '{model_turn_start_token}' found, but stripping did not significantly shorten text. This might be an issue.")
                            assistant_response = potential_response # Keep it for inspection
                            print(f"Potentially problematic isolated response:\n<<<<<\n{assistant_response}\n>>>>>")
                    else:
                        print(f"WARN: Could not reliably strip prompt. Model turn token '{model_turn_start_token}' not found as expected in 'generated_text'.")
                        assistant_response = generated_text_full # Fallback to full text if stripping fails
            else: # if is_pipeline_returning_full_text=False
                assistant_response = generated_text_full.strip() # generated_text is already just the new tokens
                print(f"Assistant Response (since is_pipeline_returning_full_text=False):\n<<<<<\n{assistant_response}\n>>>>>")

            # Final check for empty response after stripping
            if not assistant_response:
                print("WARN: Assistant response is empty after processing.")

        else:
            print("LLM output structure was not as expected (e.g., no 'generated_text' or empty list).")

    except Exception as e:
        print(f"ERROR during LLM test call or processing: {e}")
        import traceback
        traceback.print_exc()


# --- Main Test Execution ---
if __name__ == "__main__":
    # Ensure API key is set if your model/SDK needs it (not for local HF transformers usually)
    # if 'GEMINI_API_KEY' not in os.environ and 'HF_TOKEN' not in os.environ:
    #     print("Warning: Relevant API key (e.g., GEMINI_API_KEY or HF_TOKEN for gated models) not found in environment.")

    # --- Test with return_full_text=False first ---
    print("\n" + "#"*10 + " TESTING WITH pipeline(return_full_text=False) " + "#"*10)
    test_pipeline_rff_false, test_tokenizer_rff_false = initialize_llm_pipeline_for_test(return_full_text_setting=False)

    if test_pipeline_rff_false and test_tokenizer_rff_false:
        prompt1 = "What is 2+2?"
        run_llm_test("Test 1 (Simple Q&A)",
                     prompt1, test_pipeline_rff_false, test_tokenizer_rff_false,
                     is_pipeline_returning_full_text=False, max_new_tok=20)

        prompt2 = "List three primary colors."
        run_llm_test("Test 2 (Simple Instruction)",
                     prompt2, test_pipeline_rff_false, test_tokenizer_rff_false,
                     is_pipeline_returning_full_text=False, max_new_tok=30)

        prompt3 = """
        Provide a JSON object with a "fruit" key and a "color" key.
        Example: {"fruit": "apple", "color": "red"}
        Your JSON:
        """
        run_llm_test("Test 3 (Simple JSON, no context)",
                     prompt3, test_pipeline_rff_false, test_tokenizer_rff_false,
                     is_pipeline_returning_full_text=False, max_new_tok=50)

        short_context_for_test4 = "The user expressed concerns about data privacy. Another user mentioned data security. Access control was also discussed as important."
        motif_prompt_template_for_test4 = f"""You will receive a set of comments from different people answering the same question.
Your task is to identify up to 2 key recurring themes.
For each theme, provide:
- A short label like [DATA_PRIVACY]
- A 1-sentence definition
- 1-2 short phrases that often appear in the text (surface forms)

Output MUST be a valid JSON list of objects, where each object has "label", "description", and "surface_forms" keys.
Example of one object in the list:
{{
  "label": "[EXAMPLE_LABEL]",
  "description": "A concise description of the example theme.",
  "surface_forms": ["short repeated phrase 1", "another short repeated phrase"]
}}
If no clear motifs are found, output an empty JSON list: `[]`.
Do not include any other text, explanations, or markdown code fences around the JSON.

Set of comments to analyze:
\"\"\"
{short_context_for_test4}
\"\"\"

Valid JSON Output (ensure it's a list of objects, or an empty list [] if no themes):
"""
        run_llm_test("Test 4 (Motif Prompt, Short Context)",
                     motif_prompt_template_for_test4, test_pipeline_rff_false, test_tokenizer_rff_false,
                     is_pipeline_returning_full_text=False, max_new_tok=300)
    else:
        print("Skipping tests for return_full_text=False due to pipeline init failure.")

    print("\n" + "="*50 + "\n")

    # --- Test with return_full_text=True ---
    print("\n" + "#"*10 + " TESTING WITH pipeline(return_full_text=True) " + "#"*10)
    test_pipeline_rff_true, test_tokenizer_rff_true = initialize_llm_pipeline_for_test(return_full_text_setting=True)
    if test_pipeline_rff_true and test_tokenizer_rff_true:
        # Re-use prompt1 from above for brevity
        prompt1_reused = "What is 2+2?"
        run_llm_test("Test 5 (Simple Q&A, RFF=True)",
                     prompt1_reused, test_pipeline_rff_true, test_tokenizer_rff_true,
                     is_pipeline_returning_full_text=True, max_new_tok=100) # Increased max_new_tokens slightly to accommodate echoed prompt

        # Re-use motif_prompt_template_for_test4
        run_llm_test("Test 6 (Motif Prompt, Short Context, RFF=True)",
                     motif_prompt_template_for_test4, test_pipeline_rff_true, test_tokenizer_rff_true,
                     is_pipeline_returning_full_text=True, max_new_tok=1000) # Increased max_new_tokens significantly
    else:
        print("Skipping tests for return_full_text=True due to pipeline init failure.")

    # Clean up (optional, if GPU memory is an issue between runs)
    # print("Cleaning up model and tokenizer objects...")
    # del test_pipeline_rff_false, test_tokenizer_rff_false, test_pipeline_rff_true, test_tokenizer_rff_true
    # if torch.cuda.is_available():
    #    torch.cuda.empty_cache()
    # print("Cleanup complete.")


########## TESTING WITH pipeline(return_full_text=False) ##########
--- Initializing LLM for Test (return_full_text=False) ---
Using device: cuda
Loading tokenizer for google/gemma-2b-it...
BNB config created for google/gemma-2b-it, compute_dtype: torch.bfloat16.
Loading local model google/gemma-2b-it (Quantization: True)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Local LLM pipeline for google/gemma-2b-it initialized successfully.

--- Running Test: Test 1 (Simple Q&A) (Pipeline return_full_text=False) ---
Formatted Prompt (sent to pipeline):
<<<<<
<bos><start_of_turn>user
What is 2+2?<end_of_turn>
<start_of_turn>model

>>>>>


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


LLM call took 1.47 seconds.
Raw 'outputs' from hf_pipeline:
<<<<<
[{'generated_text': 'The answer is 4. 2+2 is a simple addition problem that can be solved by'}]
>>>>>
'generated_text' from LLM (len 71):
<<<<<
The answer is 4. 2+2 is a simple addition problem that can be solved by
>>>>>
Assistant Response (since is_pipeline_returning_full_text=False):
<<<<<
The answer is 4. 2+2 is a simple addition problem that can be solved by
>>>>>

--- Running Test: Test 2 (Simple Instruction) (Pipeline return_full_text=False) ---
Formatted Prompt (sent to pipeline):
<<<<<
<bos><start_of_turn>user
List three primary colors.<end_of_turn>
<start_of_turn>model

>>>>>


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


LLM call took 1.32 seconds.
Raw 'outputs' from hf_pipeline:
<<<<<
[{'generated_text': 'Sure, here are the three primary colors:\n\n1. Red\n2. Yellow\n3. Blue'}]
>>>>>
'generated_text' from LLM (len 66):
<<<<<
Sure, here are the three primary colors:

1. Red
2. Yellow
3. Blue
>>>>>
Assistant Response (since is_pipeline_returning_full_text=False):
<<<<<
Sure, here are the three primary colors:

1. Red
2. Yellow
3. Blue
>>>>>

--- Running Test: Test 3 (Simple JSON, no context) (Pipeline return_full_text=False) ---
Formatted Prompt (sent to pipeline):
<<<<<
<bos><start_of_turn>user
Provide a JSON object with a "fruit" key and a "color" key.
        Example: {"fruit": "apple", "color": "red"}
        Your JSON:<end_of_turn>
<start_of_turn>model

>>>>>


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


LLM call took 1.86 seconds.
Raw 'outputs' from hf_pipeline:
<<<<<
[{'generated_text': '```json\n{\n  "fruit": "banana",\n  "color": "yellow"\n}\n```'}]
>>>>>
'generated_text' from LLM (len 56):
<<<<<
```json
{
  "fruit": "banana",
  "color": "yellow"
}
```
>>>>>
Assistant Response (since is_pipeline_returning_full_text=False):
<<<<<
```json
{
  "fruit": "banana",
  "color": "yellow"
}
```
>>>>>

--- Running Test: Test 4 (Motif Prompt, Short Context) (Pipeline return_full_text=False) ---
Formatted Prompt (sent to pipeline):
<<<<<
<bos><start_of_turn>user
You will receive a set of comments from different people answering the same question.
Your task is to identify up to 2 key recurring themes.
For each theme, provide:
- A short label like [DATA_PRIVACY]
- A 1-sentence definition
- 1-2 short phrases that often appear in the text (surface forms)

Output MUST be a valid JSON list of objects, where each object has "label", "description", and "surface_forms" keys.
Example of one object in the

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Local LLM pipeline for google/gemma-2b-it initialized successfully.

--- Running Test: Test 5 (Simple Q&A, RFF=True) (Pipeline return_full_text=True) ---
Formatted Prompt (sent to pipeline):
<<<<<
<bos><start_of_turn>user
What is 2+2?<end_of_turn>
<start_of_turn>model

>>>>>


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


LLM call took 1.73 seconds.
Raw 'outputs' from hf_pipeline:
<<<<<
[{'generated_text': '<bos><start_of_turn>user\nWhat is 2+2?<end_of_turn>\n<start_of_turn>model\nThe answer is 4. 2+2 is a simple addition problem that can be solved by adding the two numbers together.'}]
>>>>>
'generated_text' from LLM (len 176):
<<<<<
<bos><start_of_turn>user
What is 2+2?<end_of_turn>
<start_of_turn>model
The answer is 4. 2+2 is a simple addition problem that can be solved by adding the two numbers together.
>>>>>
Attempting to strip prompt (since is_pipeline_returning_full_text=True)...
Isolated Assistant Response (exact prompt strip):
<<<<<
The answer is 4. 2+2 is a simple addition problem that can be solved by adding the two numbers together.
>>>>>

--- Running Test: Test 6 (Motif Prompt, Short Context, RFF=True) (Pipeline return_full_text=True) ---
Formatted Prompt (sent to pipeline):
<<<<<
<bos><start_of_turn>user
You will receive a set of comments from different people answering the same question.

In [None]:
# @title simple llm prompting test
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import time

# --- Configuration (Adjust as needed) ---
LOCAL_LLM_MODEL_ID = 'google/gemma-2b-it' # Or your 'gemma-3n-e4b-it' if that's the one you are targeting
USE_QUANTIZATION_FOR_LOCAL_LLM = True
MAX_NEW_TOKENS_TEST = 150 # Start with a reasonable number for simple tests

# --- LLM Initialization (from your main script) ---
def initialize_llm_pipeline_for_test(return_full_text_setting):
    print(f"--- Initializing LLM for Test (return_full_text={return_full_text_setting}) ---")
    local_llm_pipeline_instance = None
    local_llm_tokenizer_instance = None
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    try:
        print(f"Loading tokenizer for {LOCAL_LLM_MODEL_ID}...")
        local_llm_tokenizer_instance = AutoTokenizer.from_pretrained(LOCAL_LLM_MODEL_ID)

        # Ensure pad_token is set if tokenizer doesn't have one (Gemma usually doesn't)
        if local_llm_tokenizer_instance.pad_token is None:
            print("Tokenizer does not have a pad_token, setting it to eos_token.")
            local_llm_tokenizer_instance.pad_token = local_llm_tokenizer_instance.eos_token

        bnb_config = None
        quant_active = False
        if USE_QUANTIZATION_FOR_LOCAL_LLM and torch.cuda.is_available():
            try:
                compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
                bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=compute_dtype, bnb_4bit_use_double_quant=True)
                quant_active = True
                print(f"BNB config created for {LOCAL_LLM_MODEL_ID}, compute_dtype: {compute_dtype}.")
            except Exception as e_bnb:
                print(f"WARN: Failed to create BitsAndBytesConfig: {e_bnb}. Quantization disabled.")

        print(f"Loading local model {LOCAL_LLM_MODEL_ID} (Quantization: {quant_active})...")
        model_kwargs = {"device_map": "auto", "trust_remote_code": True}
        if quant_active: model_kwargs["quantization_config"] = bnb_config
        else:
            if device.type == 'cuda': model_kwargs["torch_dtype"] = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

        local_llm_model_instance = AutoModelForCausalLM.from_pretrained(LOCAL_LLM_MODEL_ID, **model_kwargs)

        local_llm_pipeline_instance = pipeline(
            "text-generation",
            model=local_llm_model_instance,
            tokenizer=local_llm_tokenizer_instance,
            return_full_text=return_full_text_setting # Configurable
        )
        print(f"Local LLM pipeline for {LOCAL_LLM_MODEL_ID} initialized successfully.")
        return local_llm_pipeline_instance, local_llm_tokenizer_instance
    except Exception as e:
        print(f"CRITICAL: Failed to initialize local LLM pipeline for test: {e}")
        return None, None

# --- Test Function ---
def run_llm_test(test_name, user_prompt_content, hf_pipeline, hf_tokenizer, max_new_tok=MAX_NEW_TOKENS_TEST):
    print(f"\n--- Running Test: {test_name} ---")
    if not hf_pipeline or not hf_tokenizer:
        print("LLM Pipeline not initialized. Skipping test.")
        return

    messages_for_template = [{"role": "user", "content": user_prompt_content.strip()}]

    # Important: For Gemma, apply_chat_template often expects a list of dicts with 'role' and 'content'
    # and add_generation_prompt=True is crucial to add the '<start_of_turn>model\n' token.
    try:
        prompt_formatted_for_llm = hf_tokenizer.apply_chat_template(
            messages_for_template,
            tokenize=False,
            add_generation_prompt=True # This adds the turn for the model to start generating
        )
    except Exception as e_template:
        print(f"ERROR applying chat template: {e_template}")
        print("Make sure your tokenizer has a chat template configured (e.g., tokenizer.chat_template).")
        return

    print(f"Formatted Prompt (sent to pipeline):\n<<<<<\n{prompt_formatted_for_llm}\n>>>>>")

    generation_args = {
        "max_new_tokens": max_new_tok,
        "do_sample": False, # Keep False for predictable output during debugging
        "pad_token_id": hf_tokenizer.eos_token_id
    }

    try:
        start_time = time.time()
        outputs = hf_pipeline(prompt_formatted_for_llm, **generation_args)
        end_time = time.time()
        print(f"LLM call took {end_time - start_time:.2f} seconds.")

        print(f"Raw 'outputs' from hf_pipeline:\n<<<<<\n{outputs}\n>>>>>")

        if outputs and isinstance(outputs, list) and outputs[0] and isinstance(outputs[0], dict) and 'generated_text' in outputs[0]:
            generated_text_full = outputs[0]['generated_text']
            print(f"'generated_text' from LLM (len {len(generated_text_full)}):\n<<<<<\n{generated_text_full}\n>>>>>")

            # Attempt to isolate assistant's response if prompt was echoed (when return_full_text=True)
            if hf_pipeline.return_full_text: # Check the setting of the pipeline
                if generated_text_full.startswith(prompt_formatted_for_llm):
                    assistant_response = generated_text_full[len(prompt_formatted_for_llm):].strip()
                    print(f"Isolated Assistant Response (if prompt echoed):\n<<<<<\n{assistant_response}\n>>>>>")
                else:
                    # Fallback for Gemma template if prompt not exactly at start
                    model_turn_start_token = "<start_of_turn>model"
                    if model_turn_start_token in generated_text_full:
                        last_occurrence_index = generated_text_full.rfind(model_turn_start_token)
                        assistant_response = generated_text_full[last_occurrence_index + len(model_turn_start_token):].strip()
                        if assistant_response.startswith("\n"): assistant_response = assistant_response[1:].strip()
                        print(f"Isolated Assistant Response (Gemma template heuristic):\n<<<<<\n{assistant_response}\n>>>>>")
                    else:
                        print("Could not reliably strip prompt from 'generated_text'. The full text is shown above.")
            else: # if return_full_text=False, generated_text is already just the new tokens
                print(f"Assistant Response (since return_full_text=False):\n<<<<<\n{generated_text_full}\n>>>>>")

        else:
            print("LLM output structure was not as expected (e.g., no 'generated_text').")

    except Exception as e:
        print(f"ERROR during LLM test call or processing: {e}")

# --- Main Test Execution ---
if __name__ == "__main__":
    # --- Test with return_full_text=False first ---
    test_pipeline_rff_false, test_tokenizer_rff_false = initialize_llm_pipeline_for_test(return_full_text_setting=False)

    if test_pipeline_rff_false:
        # Test 1: Simplest possible prompt
        prompt1 = "What is 2+2?"
        run_llm_test("Test 1 (Simple Q&A, return_full_text=False)", prompt1, test_pipeline_rff_false, test_tokenizer_rff_false, max_new_tok=20)

        # Test 2: Simple instruction following
        prompt2 = "List three colors."
        run_llm_test("Test 2 (Simple Instruction, return_full_text=False)", prompt2, test_pipeline_rff_false, test_tokenizer_rff_false, max_new_tok=30)

        # Test 3: Instruction to produce structured output (no context text yet)
        prompt3 = """
        Provide a JSON object with a "fruit" key and a "color" key.
        Example: {"fruit": "apple", "color": "red"}
        Your JSON:
        """
        run_llm_test("Test 3 (Simple JSON, no context, return_full_text=False)", prompt3, test_pipeline_rff_false, test_tokenizer_rff_false, max_new_tok=50)

        # Test 4: Your motif prompt with VERY SHORT placeholder context
        short_context = "Privacy is important. Data security is a concern. We need good access control."
        motif_prompt_template = f"""You will receive a set of comments from different people answering the same question.

Your task is to identify up to 2 key recurring themes.

For each theme, provide:
- A short label like [DATA_PRIVACY]
- A 1-sentence definition
- 1-2 short phrases that often appear in the text (surface forms)

Output MUST be a valid JSON list of objects, where each object has "label", "description", and "surface_forms" keys.
Example of one object in the list:
{{
  "label": "[EXAMPLE_LABEL]",
  "description": "A concise description of the example theme.",
  "surface_forms": ["short repeated phrase 1", "another short repeated phrase"]
}}
If no clear motifs are found, output an empty JSON list: `[]`.
Do not include any other text, explanations, or markdown code fences around the JSON.

Set of comments to analyze:
\"\"\"
{short_context}
\"\"\"

Valid JSON Output (ensure it's a list of objects, or an empty list [] if no themes):
"""
        run_llm_test("Test 4 (Motif Prompt, Short Context, return_full_text=False)", motif_prompt_template, test_pipeline_rff_false, test_tokenizer_rff_false, max_new_tok=300) # Increased max_new_tokens for JSON
    else:
        print("Skipping tests for return_full_text=False due to pipeline init failure.")

    print("\n" + "="*50 + "\n")

    # --- Test with return_full_text=True ---
    # test_pipeline_rff_true, test_tokenizer_rff_true = initialize_llm_pipeline_for_test(return_full_text_setting=True)
    # if test_pipeline_rff_true:
    #     run_llm_test("Test 5 (Simple Q&A, return_full_text=True)", prompt1, test_pipeline_rff_true, test_tokenizer_rff_true, max_new_tok=20)
    #     run_llm_test("Test 6 (Motif Prompt, Short Context, return_full_text=True)", motif_prompt_template, test_pipeline_rff_true, test_tokenizer_rff_true, max_new_tok=300)
    # else:
    #     print("Skipping tests for return_full_text=True due to pipeline init failure.")

    # Clean up (optional, if GPU memory is an issue between runs)
    # del test_pipeline_rff_false, test_tokenizer_rff_false, test_pipeline_rff_true, test_tokenizer_rff_true
    # torch.cuda.empty_cache()

--- Initializing LLM for Test (return_full_text=False) ---
Using device: cuda
Loading tokenizer for google/gemma-2b-it...


tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

BNB config created for google/gemma-2b-it, compute_dtype: torch.bfloat16.
Loading local model google/gemma-2b-it (Quantization: True)...


config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Local LLM pipeline for google/gemma-2b-it initialized successfully.

--- Running Test: Test 1 (Simple Q&A, return_full_text=False) ---
Formatted Prompt (sent to pipeline):
<<<<<
<bos><start_of_turn>user
What is 2+2?<end_of_turn>
<start_of_turn>model

>>>>>


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


LLM call took 1.80 seconds.
Raw 'outputs' from hf_pipeline:
<<<<<
[{'generated_text': 'The answer is 4. 2+2 is a simple addition problem that can be solved by'}]
>>>>>
'generated_text' from LLM (len 71):
<<<<<
The answer is 4. 2+2 is a simple addition problem that can be solved by
>>>>>
ERROR during LLM test call or processing: 'TextGenerationPipeline' object has no attribute 'return_full_text'

--- Running Test: Test 2 (Simple Instruction, return_full_text=False) ---
Formatted Prompt (sent to pipeline):
<<<<<
<bos><start_of_turn>user
List three colors.<end_of_turn>
<start_of_turn>model

>>>>>


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


LLM call took 1.19 seconds.
Raw 'outputs' from hf_pipeline:
<<<<<
[{'generated_text': 'Sure, here are three colors:\n\n1. Red\n2. Yellow\n3. Blue'}]
>>>>>
'generated_text' from LLM (len 54):
<<<<<
Sure, here are three colors:

1. Red
2. Yellow
3. Blue
>>>>>
ERROR during LLM test call or processing: 'TextGenerationPipeline' object has no attribute 'return_full_text'

--- Running Test: Test 3 (Simple JSON, no context, return_full_text=False) ---
Formatted Prompt (sent to pipeline):
<<<<<
<bos><start_of_turn>user
Provide a JSON object with a "fruit" key and a "color" key.
        Example: {"fruit": "apple", "color": "red"}
        Your JSON:<end_of_turn>
<start_of_turn>model

>>>>>


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


LLM call took 1.44 seconds.
Raw 'outputs' from hf_pipeline:
<<<<<
[{'generated_text': '```json\n{\n  "fruit": "banana",\n  "color": "yellow"\n}\n```'}]
>>>>>
'generated_text' from LLM (len 56):
<<<<<
```json
{
  "fruit": "banana",
  "color": "yellow"
}
```
>>>>>
ERROR during LLM test call or processing: 'TextGenerationPipeline' object has no attribute 'return_full_text'

--- Running Test: Test 4 (Motif Prompt, Short Context, return_full_text=False) ---
Formatted Prompt (sent to pipeline):
<<<<<
<bos><start_of_turn>user
You will receive a set of comments from different people answering the same question.

Your task is to identify up to 2 key recurring themes.

For each theme, provide:
- A short label like [DATA_PRIVACY]
- A 1-sentence definition
- 1-2 short phrases that often appear in the text (surface forms)

Output MUST be a valid JSON list of objects, where each object has "label", "description", and "surface_forms" keys.
Example of one object in the list:
{
  "label": "[EXAMPLE_LAB

# 1st June

Debugging empty motifs returned

In [None]:
# @title revised call_local_llm_for_motifs()
def call_local_llm_for_motifs(prompt_str: str, hf_pipeline, hf_tokenizer, qid_for_log: str, chunk_idx_for_log: int) -> str: # Added qid and chunk_idx for logging
    """Makes the actual call to the local LLM pipeline and returns raw text."""
    messages_for_template = [{"role": "user", "content": prompt_str}]
    prompt_formatted_for_llm = hf_tokenizer.apply_chat_template(
        messages_for_template, tokenize=False, add_generation_prompt=True
    )

    generation_args = {
        "max_new_tokens": 700,
        "do_sample": False,    # Keep this False for deterministic output for now
        "pad_token_id": hf_tokenizer.eos_token_id,
        # The warning "generation flags are not valid" for temp, top_p, top_k is fine when do_sample=False
    }

    print(f"    DEBUG (call_local_llm): Sending prompt to LLM for QID {qid_for_log}, Chunk {chunk_idx_for_log} (Prompt length: {len(prompt_formatted_for_llm)} chars). First 300 chars of formatted prompt:\n<<<<<\n{prompt_formatted_for_llm[:300]}...\n>>>>>") # Log part of the prompt

    outputs = hf_pipeline(prompt_formatted_for_llm, **generation_args)

    print(f"    DEBUG (call_local_llm): Raw 'outputs' from hf_pipeline for QID {qid_for_log}, Chunk {chunk_idx_for_log}:\n<<<<<\n{outputs}\n>>>>>") # VERY IMPORTANT DEBUG LINE

    if not outputs or not isinstance(outputs, list) or not outputs[0] or not isinstance(outputs[0], dict) or 'generated_text' not in outputs[0] or not outputs[0]['generated_text']:
        print(f"    WARN (call_local_llm): LLM pipeline returned unexpected or empty structure for QID {qid_for_log}, Chunk {chunk_idx_for_log}.")
        return "" # Return empty string on pipeline failure

    generated_text_full = outputs[0]['generated_text']
    print(f"    DEBUG (call_local_llm): 'generated_text_full' for QID {qid_for_log}, Chunk {chunk_idx_for_log} (len {len(generated_text_full)}):\n<<<<<\n{generated_text_full[:1000]}...\n>>>>>") # Log first 1000 chars

    # Extract only the assistant's response part
    assistant_response_text = ""
    # Logic for stripping prompt if return_full_text=True (which is typical for pipeline)
    if generated_text_full.startswith(prompt_formatted_for_llm):
        assistant_response_text = generated_text_full[len(prompt_formatted_for_llm):].strip()
    else:
        # Fallback for models/templates that might structure output differently
        # (e.g., with specific role tokens like Gemma's <start_of_turn>model)
        model_turn_start_token = "<start_of_turn>model"
        if model_turn_start_token in generated_text_full:
            # Find the *last* occurrence of the model turn start, in case of complex multi-turn internal thoughts
            last_occurrence_index = generated_text_full.rfind(model_turn_start_token)
            potential_response = generated_text_full[last_occurrence_index + len(model_turn_start_token):].strip()
            if potential_response.startswith("\n"):
                potential_response = potential_response[1:].strip()

            # Heuristic: If this potential response is much shorter than the full text, it's likely correct
            # Also check if it doesn't look like the prompt itself
            if len(potential_response) < len(generated_text_full) * 0.8 and "TEXT TO ANALYZE:" not in potential_response:
                 assistant_response_text = potential_response
            else: # If heuristic fails, maybe the prompt wasn't echoed as expected
                 assistant_response_text = generated_text_full.strip()
                 print(f"    WARN (call_local_llm): Could not reliably strip prompt/template. Using full output as response for QID {qid_for_log}, Chunk {chunk_idx_for_log}.")
        else:
            assistant_response_text = generated_text_full.strip()
            print(f"    WARN (call_local_llm): Prompt/template start not found. Using full output as response for QID {qid_for_log}, Chunk {chunk_idx_for_log}.")

    print(f"    DEBUG (call_local_llm): 'assistant_response_text' (after stripping) for QID {qid_for_log}, Chunk {chunk_idx_for_log} (len {len(assistant_response_text)}):\n<<<<<\n{assistant_response_text[:1000]}...\n>>>>>")
    return assistant_response_text

# 31th May

In [None]:
# @title Batching and Revised Prompting
# --- Imports ---
import json
import os
import hashlib
import numpy as np
from pybdm import BDM
import re
import time
from typing import List, Dict # For type hinting

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline

# --- Configuration ---
BASE_PROJECT_DIR = '/content/drive/MyDrive/Colab Notebooks/Legal/' # !!! EXAMPLE - UPDATE THIS PATH !!!
PHASE2_OUTPUT_DIR = os.path.join(BASE_PROJECT_DIR, 'Phase2_PDF_Collated_Texts/')
P2_COLLATED_FILE = os.path.join(PHASE2_OUTPUT_DIR, 'phase2_collated_pdf_texts.json')

P3_QIDS_TO_PROCESS_THEMATICALLY = ["Q4"] # Process only Q4 for this example

# --- BDM and LLM Model Configuration ---
MATRIX_SIZE_GLOBAL = (8, 8)
LOCAL_LLM_MODEL_ID = 'google/gemma-3-4b-it'
USE_QUANTIZATION_FOR_LOCAL_LLM = True
LLM_BATCH_SIZE = 5 # Number of responses per chunk for LLM processing
LLM_RETRY_ATTEMPTS = 2 # Number of retries for LLM calls
MAX_TEXT_PER_LLM_PROMPT_CHUNK = 7500 # Max characters for the text_block in a single LLM prompt

# --- Token-Based L(H) Configuration ---
MOTIF_SYMBOLIC_LABEL_COST = 0.5
MOTIF_DESCRIPTION_TEXT_BASE_COST = 0.5
MOTIF_DESCRIPTION_TOKEN_COST = 0.1
MOTIF_SURFACE_FORMS_LIST_BASE_COST = 0.25
MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH = 0.1

# --- Logging File ---
LLM_DEBUG_LOG_FILE = "llm_motif_debug_log.txt"

# --- Helper Function Definitions (Tokenization, L(H), BDM, Compression) ---

def tokenize_phrase(phrase_text): # Renamed for clarity
    """Simple tokenizer for phrases/definitions."""
    if not isinstance(phrase_text, str): return []
    phrase_text = phrase_text.lower()
    # Basic cleaning, can be expanded
    tokens = phrase_text.split()
    return [t for t in tokens if t]

def calculate_L_H_token_based_structured(structured_motifs_list: List[Dict]) -> float:
    """Calculates L(H) using token-based costs from structured motif objects."""
    if not structured_motifs_list: return 0.0
    total_lh_cost = 0.0
    for motif_obj in structured_motifs_list:
        if not isinstance(motif_obj, dict): continue
        current_motif_lh = 0
        label_str = motif_obj.get('label', "")
        if label_str and isinstance(label_str, str) and label_str.strip():
            current_motif_lh += MOTIF_SYMBOLIC_LABEL_COST

        description_str = motif_obj.get('description', "") # Using "description" key
        if description_str and isinstance(description_str, str) and description_str.strip():
            current_motif_lh += MOTIF_DESCRIPTION_TEXT_BASE_COST
            current_motif_lh += len(tokenize_phrase(description_str)) * MOTIF_DESCRIPTION_TOKEN_COST

        surface_forms_list = motif_obj.get('surface_forms', [])
        if surface_forms_list and isinstance(surface_forms_list, list):
            valid_sfs_for_lh = [sf for sf in surface_forms_list if isinstance(sf, str) and sf.strip()]
            if valid_sfs_for_lh:
                current_motif_lh += MOTIF_SURFACE_FORMS_LIST_BASE_COST
                for sf_str in valid_sfs_for_lh:
                    current_motif_lh += len(tokenize_phrase(sf_str)) * MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH
        total_lh_cost += current_motif_lh
    return total_lh_cost

def llm_compress_text_structured(text_to_compress: str, structured_motifs_list: List[Dict]) -> str:
    """Compresses text by replacing surface forms with their motif labels."""
    if not isinstance(text_to_compress, str): return ""
    if not structured_motifs_list: return text_to_compress.lower()

    compressed_text = text_to_compress.lower()

    # For this version, iterate through motifs as provided.
    # More advanced: sort motifs by some priority (e.g. length of longest SF)
    for motif_obj in structured_motifs_list:
        if not isinstance(motif_obj, dict): continue

        label = motif_obj.get('label', None)
        surface_forms = motif_obj.get('surface_forms', [])

        if not label or not surface_forms or not isinstance(surface_forms, list):
            continue

        placeholder = label # Use the actual symbolic label

        # Sort this motif's own surface forms by length (descending) for greedy matching
        sorted_sfs_for_this_motif = sorted(
            [sf for sf in surface_forms if isinstance(sf, str) and sf.strip()],
            key=len,
            reverse=True
        )

        for sf_str in sorted_sfs_for_this_motif:
            try:
                # Important: Ensure surface forms are also lowercased for matching
                compressed_text = re.sub(re.escape(sf_str.lower()), placeholder, compressed_text, flags=re.IGNORECASE)
            except re.error as re_e:
                print(f"    Regex error during compression for SF '{sf_str}' of motif '{label}': {re_e}. Skipping.")
                continue
    return compressed_text

def text_to_binary_matrix(text_input: str, size=MATRIX_SIZE_GLOBAL) -> np.ndarray:
    """Converts text to a binary matrix using SHA256 hash."""
    if not text_input or not isinstance(text_input, str) or not text_input.strip():
        return np.zeros(size, dtype=int)
    hash_obj = hashlib.sha256(text_input.encode('utf-8', 'ignore'))
    hash_digest = hash_obj.hexdigest()
    required_bits = size[0] * size[1]
    binary_string = bin(int(hash_digest, 16))[2:].zfill(256) # SHA256 produces 256 bits
    binary_string_padded = binary_string.ljust(required_bits, '0') # Pad if required_bits > 256
    bits = [int(b) for b in binary_string_padded[:required_bits]]
    return np.array(bits).reshape(size)

def compute_bdm_for_text(text_input: str, bdm_instance: BDM, matrix_s=MATRIX_SIZE_GLOBAL) -> float:
    """Computes BDM for a given text string."""
    if not text_input or not isinstance(text_input, str) or not text_input.strip() : return 0.0

    MAX_TEXT_FOR_BDM_HASH = 2000 # Consider if this truncation is still desired
    text_for_hash = text_input if len(text_input) <= MAX_TEXT_FOR_BDM_HASH else text_input[:MAX_TEXT_FOR_BDM_HASH]

    matrix = text_to_binary_matrix(text_for_hash, size=matrix_s)
    try:
        bdm_value = bdm_instance.bdm(matrix)
        return bdm_value
    except Exception as e_bdm:
        print(f"      Error during BDM calculation for text (len {len(text_input)}, hashed part len {len(text_for_hash)}): {e_bdm}")
        return -1.0 # Indicate error

def compute_mdl_cost_for_text_block(full_qid_corpus_str: str,
                                    final_consolidated_motifs: List[Dict],
                                    bdm_instance: BDM,
                                    matrix_s=MATRIX_SIZE_GLOBAL) -> tuple[float, float, float]:
    """Computes L(H), L(D|H), and Total MDL for a text block given consolidated motifs."""
    if not isinstance(full_qid_corpus_str, str) : full_qid_corpus_str = ""

    l_h = calculate_L_H_token_based_structured(final_consolidated_motifs)

    compressed_text_block = llm_compress_text_structured(full_qid_corpus_str, final_consolidated_motifs)
    l_d_h = compute_bdm_for_text(compressed_text_block, bdm_instance, matrix_s)

    if l_d_h < 0: # BDM error
        return l_h, -1.0, -1.0

    return l_h, l_d_h, l_h + l_d_h

# --- LLM Motif Extraction (Batched, with your prompt) ---

def build_llm_prompt_for_motifs(text_block_for_prompt: str) -> str:
    """Builds the prompt for the LLM to extract motifs."""
    # Ensure text_block_for_prompt is not excessively long for the prompt itself
    if len(text_block_for_prompt) > MAX_TEXT_PER_LLM_PROMPT_CHUNK:
        text_block_for_prompt = text_block_for_prompt[:MAX_TEXT_PER_LLM_PROMPT_CHUNK]
        # print(f"    Note: Text block for LLM prompt truncated to {MAX_TEXT_PER_LLM_PROMPT_CHUNK} chars.")

    prompt = f"""You will receive a set of comments from different people answering the same question.

Your task is to identify up to 5 key recurring themes.

For each theme, provide:
- A short label like [DATA_PRIVACY]
- A 1-sentence definition
- 2–3 short phrases that often appear in the text (surface forms)

Output MUST be a valid JSON list of objects, where each object has "label", "description", and "surface_forms" keys.
Example of one object in the list:
{{
  "label": "[EXAMPLE_LABEL]",
  "description": "A concise description of the example theme.",
  "surface_forms": ["short repeated phrase 1", "another short repeated phrase"]
}}
If no clear motifs are found, output an empty JSON list: `[]`.
Do not include any other text, explanations, or markdown code fences around the JSON.

Set of comments to analyze:
'''
{text_block_for_prompt}
'''

Valid JSON Output (ensure it's a list of objects, or an empty list [] if no themes):
"""
    return prompt.strip()


# def call_local_llm_for_motifs(prompt_str: str, hf_pipeline, hf_tokenizer) -> str:
#     """Makes the actual call to the local LLM pipeline and returns raw text."""
#     messages_for_template = [{"role": "user", "content": prompt_str}]
#     prompt_formatted_for_llm = hf_tokenizer.apply_chat_template(
#         messages_for_template, tokenize=False, add_generation_prompt=True
#     )

#     generation_args = {
#         "max_new_tokens": 700, # Allow space for JSON with multiple motifs
#         "do_sample": False,
#         "pad_token_id": hf_tokenizer.eos_token_id,
#     }
#     outputs = hf_pipeline(prompt_formatted_for_llm, **generation_args)

#     if not outputs or not isinstance(outputs, list) or not outputs[0].get('generated_text'):
#         # print(f"    LLM pipeline returned unexpected/empty output.") # Logged by caller
#         return "" # Return empty string on pipeline failure

#     generated_text_full = outputs[0]['generated_text']

#     # Extract only the assistant's response part
#     assistant_response_text = ""
#     if generated_text_full.startswith(prompt_formatted_for_llm): # Check if prompt is echoed
#         assistant_response_text = generated_text_full[len(prompt_formatted_for_llm):].strip()
#     else:
#         model_turn_start_token = "<start_of_turn>model" # Common for Gemma instruct
#         if model_turn_start_token in generated_text_full:
#             last_occurrence_index = generated_text_full.rfind(model_turn_start_token)
#             assistant_response_text = generated_text_full[last_occurrence_index + len(model_turn_start_token):].strip()
#             if assistant_response_text.startswith("\n"): # Remove leading newline if present
#                 assistant_response_text = assistant_response_text[1:].strip()
#         else:
#             assistant_response_text = generated_text_full.strip() # Fallback
#     return assistant_response_text


def extract_motifs_from_llm_response(llm_response_str: str, qid_for_log:str, chunk_idx_for_log:int, prompt_sent:str) -> List[Dict]:
    """Parses the LLM's raw text response to extract structured motifs."""
    # print(f"    DEBUG (extract_motifs): Raw LLM response for QID {qid_for_log}, Chunk {chunk_idx_for_log}:\n<<<<<\n{llm_response_str}\n>>>>>")

    json_str_candidate = llm_response_str # Start with the full response

    # Heuristic to find JSON block if LLM wraps it (though prompt asks not to)
    json_block_match_markdown = re.search(r"```json\s*([\s\S]*?)\s*```", llm_response_str)
    if json_block_match_markdown:
        json_str_candidate = json_block_match_markdown.group(1).strip()
    else:
        # If no markdown, try to find first '[' and last ']' as a simpler heuristic
        first_bracket = llm_response_str.find('[')
        last_bracket = llm_response_str.rfind(']')
        if first_bracket != -1 and last_bracket != -1 and last_bracket > first_bracket:
            json_str_candidate = llm_response_str[first_bracket : last_bracket+1].strip()
        # Else, assume the whole response is the JSON string (might fail if it's not)

    # print(f"    DEBUG (extract_motifs): Final json_str candidate for QID {qid_for_log}, Chunk {chunk_idx_for_log}:\n<<<<<\n{json_str_candidate}\n>>>>>")

    if not json_str_candidate or json_str_candidate.lower() == "[]":
        # print(f"    LLM indicated no themes or JSON was empty for QID {qid_for_log}, Chunk {chunk_idx_for_log}.")
        return []

    try:
        parsed_json = json.loads(json_str_candidate)
        if not isinstance(parsed_json, list):
            # print(f"    WARN (extract_motifs): LLM output was not a JSON list for QID {qid_for_log}, Chunk {chunk_idx_for_log}: {parsed_json}")
            raise ValueError("Response was not a list")

        valid_motifs_from_json = []
        for item in parsed_json:
            if isinstance(item, dict) and \
               'label' in item and isinstance(item['label'], str) and \
               'description' in item and isinstance(item['description'], str) and \
               item['label'].startswith('[') and item['label'].endswith(']'):

                sf = item.get('surface_forms', [])
                if not isinstance(sf, list) or not all(isinstance(s, str) for s in sf):
                    sf = []

                valid_motifs_from_json.append({
                    "label": item['label'].strip(),
                    "description": item['description'].strip(),
                    "surface_forms": [s.strip() for s in sf if s.strip()]
                })
            # else:
                # print(f"    WARN (extract_motifs): Invalid item structure in LLM JSON for QID {qid_for_log}, Chunk {chunk_idx_for_log}: {item}")
        return valid_motifs_from_json

    except json.JSONDecodeError as e:
        print(f"    [WARN] Motif JSON parsing failed for QID {qid_for_log}, Chunk {chunk_idx_for_log}: {e}")
        # Log the problematic parts for debugging
        with open(LLM_DEBUG_LOG_FILE, "a", encoding="utf-8") as f:
            f.write(f"\n--- QID: {qid_for_log} --- CHUNK: {chunk_idx_for_log} ---\n")
            f.write(f"PROMPT SENT (first 500 chars):\n{prompt_sent[:500]}...\n")
            f.write(f"RAW LLM RESPONSE (导致 JSONDecodeError):\n{llm_response_str}\n")
            f.write(f"EXTRACTED JSON STRING CANDIDATE (导致 JSONDecodeError):\n{json_str_candidate}\n")
        return []
    except ValueError as ve: # Handles "Response was not a list"
        print(f"    [WARN] Motif structure validation failed for QID {qid_for_log}, Chunk {chunk_idx_for_log}: {ve}")
        with open(LLM_DEBUG_LOG_FILE, "a", encoding="utf-8") as f:
            f.write(f"\n--- QID: {qid_for_log} --- CHUNK: {chunk_idx_for_log} ---\n")
            f.write(f"PROMPT SENT (first 500 chars):\n{prompt_sent[:500]}...\n")
            f.write(f"RAW LLM RESPONSE (导致 ValueError):\n{llm_response_str}\n")
            f.write(f"EXTRACTED JSON STRING CANDIDATE (导致 ValueError):\n{json_str_candidate}\n")
        return []


def get_motifs_for_text_chunks(
    list_of_response_strings: List[str],
    batch_size: int,
    hf_pipeline,
    hf_tokenizer,
    qid_for_log: str # For logging context
    ) -> List[Dict]:
    """Processes responses in batches, calls LLM for each, aggregates motifs."""

    all_extracted_motifs_from_all_chunks = []

    # Create chunks of text, where each chunk is a join of 'batch_size' responses
    batched_text_blocks = []
    for i in range(0, len(list_of_response_strings), batch_size):
        current_batch_responses = list_of_response_strings[i:i + batch_size]
        # Join individual responses within a batch with a clear separator
        # This text_block is what goes into the LLM prompt
        text_block_for_chunk = "\n\n<RSP_SEP>\n\n".join(current_batch_responses)
        batched_text_blocks.append(text_block_for_chunk)

    print(f"  QID {qid_for_log}: Processing {len(list_of_response_strings)} responses in {len(batched_text_blocks)} chunks (batch size: {batch_size} responses).")

    for chunk_idx, text_chunk_to_analyze in enumerate(batched_text_blocks):
        print(f"    Analyzing chunk {chunk_idx + 1}/{len(batched_text_blocks)} for QID {qid_for_log} (len: {len(text_chunk_to_analyze)} chars)...")

        if len(text_chunk_to_analyze.strip()) < 50: # Skip very small chunks
            print(f"      Chunk {chunk_idx+1} for QID {qid_for_log} too short, skipping.")
            continue

        prompt_for_llm = build_llm_prompt_for_motifs(text_chunk_to_analyze)
        motifs_from_this_chunk = []

        for attempt in range(LLM_RETRY_ATTEMPTS):
            try:
                # raw_llm_response_text = call_local_llm_for_motifs(prompt_for_llm, hf_pipeline, hf_tokenizer)
                # VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
                # FIX IS HERE: Pass qid_for_log and chunk_idx + 1
                # VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
                raw_llm_response_text = call_local_llm_for_motifs(
                    prompt_for_llm,
                    hf_pipeline,
                    hf_tokenizer,
                    qid_for_log,      # Pass the QID for this whole batch operation
                    chunk_idx + 1     # Pass the current chunk number (1-based for logging)
                )
                # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                # END OF FIX
                # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

                if not raw_llm_response_text:
                    print(f"      LLM call attempt {attempt + 1} for chunk {chunk_idx+1} (QID {qid_for_log}) returned empty. Retrying if possible...")
                    if attempt < LLM_RETRY_ATTEMPTS - 1: time.sleep(1)
                    continue # Go to next attempt or fail

                motifs_attempt_from_chunk = extract_motifs_from_llm_response(raw_llm_response_text, qid_for_log, chunk_idx+1, prompt_for_llm)

                if motifs_attempt_from_chunk: # If non-empty list (successful parse and valid structure)
                    motifs_from_this_chunk = motifs_attempt_from_chunk
                    break # Success, exit retry loop
                else: # Parsing failed or LLM returned empty valid JSON like []
                    print(f"      Motif extraction/parsing attempt {attempt + 1} yielded no valid motifs for chunk {chunk_idx+1} (QID {qid_for_log}). Retrying if possible...")
                    if attempt < LLM_RETRY_ATTEMPTS - 1: time.sleep(1)

            except Exception as e_call: # Catch errors from call_local_llm_for_motifs itself
                 print(f"      Critical error during LLM call attempt {attempt + 1} for chunk {chunk_idx+1} (QID {qid_for_log}): {e_call}")
                 if attempt < LLM_RETRY_ATTEMPTS - 1: time.sleep(1)
                 # Log error to debug file
                 with open(LLM_DEBUG_LOG_FILE, "a", encoding="utf-8") as f:
                    f.write(f"\n--- QID: {qid_for_log} --- CHUNK: {chunk_idx+1} --- ATTEMPT: {attempt+1} ---\n")
                    f.write(f"PROMPT SENT (first 500 chars):\n{prompt_for_llm[:500]}...\n")
                    f.write(f"CRITICAL LLM CALL ERROR:\n{e_call}\n")


        if motifs_from_this_chunk:
            print(f"      Extracted {len(motifs_from_this_chunk)} motif objects from chunk {chunk_idx+1} (QID {qid_for_log}).")
            all_extracted_motifs_from_all_chunks.extend(motifs_from_this_chunk)
        else:
            print(f"      No valid motifs extracted from chunk {chunk_idx+1} (QID {qid_for_log}) after {LLM_RETRY_ATTEMPTS} attempts.")

    return all_extracted_motifs_from_all_chunks


# --- Main Execution Logic ---
def main():
    print("--- MDL Prototype: Batched LLM Motif Extraction, Structured Motifs, Token-L(H) ---")
    print(f"Using L(H) Cost Params: Label={MOTIF_SYMBOLIC_LABEL_COST}, DescBase={MOTIF_DESCRIPTION_TEXT_BASE_COST}, DescToken={MOTIF_DESCRIPTION_TOKEN_COST}, SFListBase={MOTIF_SURFACE_FORMS_LIST_BASE_COST}, SFTokenInLH={MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH}")
    print(f"LLM Batch Size: {LLM_BATCH_SIZE} responses. LLM Retries: {LLM_RETRY_ATTEMPTS}. Max Text per Prompt Chunk: {MAX_TEXT_PER_LLM_PROMPT_CHUNK}")

    # Clear or create debug log file
    with open(LLM_DEBUG_LOG_FILE, "w", encoding="utf-8") as f:
        f.write(f"LLM Motif Debug Log - {time.asctime()}\n")
        f.write(f"Model: {LOCAL_LLM_MODEL_ID}\n")

    local_llm_pipeline_instance = None
    local_llm_tokenizer_instance = None
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    try:
        print(f"Loading tokenizer for {LOCAL_LLM_MODEL_ID}...")
        local_llm_tokenizer_instance = AutoTokenizer.from_pretrained(LOCAL_LLM_MODEL_ID)
        bnb_config = None
        quant_active = False
        if USE_QUANTIZATION_FOR_LOCAL_LLM and torch.cuda.is_available():
            try:
                compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
                bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=compute_dtype, bnb_4bit_use_double_quant=True)
                quant_active = True
                print(f"BNB config created for {LOCAL_LLM_MODEL_ID}, compute_dtype: {compute_dtype}.")
            except Exception as e_bnb:
                print(f"WARN: Failed to create BitsAndBytesConfig: {e_bnb}. Quantization disabled.")
        print(f"Loading local model {LOCAL_LLM_MODEL_ID} (Quantization: {quant_active})...")
        model_kwargs = {"device_map": "auto", "trust_remote_code": True}
        if quant_active: model_kwargs["quantization_config"] = bnb_config
        else:
            if device.type == 'cuda': model_kwargs["torch_dtype"] = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
        local_llm_model_instance = AutoModelForCausalLM.from_pretrained(LOCAL_LLM_MODEL_ID, **model_kwargs)
        local_llm_pipeline_instance = pipeline("text-generation", model=local_llm_model_instance, tokenizer=local_llm_tokenizer_instance, return_full_text=True) # return_full_text=True for easier splitting
        print(f"Local LLM pipeline for {LOCAL_LLM_MODEL_ID} initialized successfully.")
    except Exception as e:
        print(f"CRITICAL: Failed to initialize local LLM pipeline: {e}")
        return

    try:
        bdm_instance_main = BDM(ndim=2) # CTM often slower, NKS default is fine
        print("BDM instance initialized successfully.")
    except Exception as e_bdm_init:
        print(f"CRITICAL: Failed to initialize BDM instance: {e_bdm_init}")
        return

    if not os.path.exists(P2_COLLATED_FILE):
        print(f"ERROR: Phase 2 output file not found: {P2_COLLATED_FILE}")
        return
    print(f"Loading Phase 2 output from: {P2_COLLATED_FILE}...")
    phase2_data_content = None
    try:
        with open(P2_COLLATED_FILE, 'r', encoding='utf-8') as f:
            phase2_data_content = json.load(f)
    except Exception as e:
        print(f"Error loading or parsing {P2_COLLATED_FILE}: {e}")
        return

    all_qid_mdl_results = []
    qids_to_process_final = []
    aggregated_content_by_qid_map = {}

    if phase2_data_content:
        aggregated_content_by_qid_map = phase2_data_content.get("aggregated_pdf_content_by_qid", {})

    if not aggregated_content_by_qid_map:
        print(f"No 'aggregated_pdf_content_by_qid' key found or data is empty in {P2_COLLATED_FILE}.")
        return

    if P3_QIDS_TO_PROCESS_THEMATICALLY and isinstance(P3_QIDS_TO_PROCESS_THEMATICALLY, list) and P3_QIDS_TO_PROCESS_THEMATICALLY:
        qids_to_process_final = [qid for qid in P3_QIDS_TO_PROCESS_THEMATICALLY if qid in aggregated_content_by_qid_map]
        print(f"Targeting specific QIDs from P3_QIDS_TO_PROCESS_THEMATICALLY: {qids_to_process_final}")
        if not qids_to_process_final:
            print(f"Warning: None of QIDs {P3_QIDS_TO_PROCESS_THEMATICALLY} found in loaded data.")
            return
    else:
        # Fallback to process a limited number of QIDs if P3_QIDS_TO_PROCESS_THEMATICALLY is not set
        qids_to_process_limit_fallback = 1
        print(f"P3_QIDS_TO_PROCESS_THEMATICALLY not set or empty. Processing up to {qids_to_process_limit_fallback} QID(s) as a fallback.")
        qids_to_process_final = list(aggregated_content_by_qid_map.keys())[:qids_to_process_limit_fallback]
        if not qids_to_process_final:
            print("No QIDs available to process based on fallback.")
            return
    print(f"\nMDL analysis will run for these QIDs: {qids_to_process_final}\n")

    for qid_str in qids_to_process_final:
        list_of_individual_responses = aggregated_content_by_qid_map.get(qid_str) # This should be a list of text dicts

        if not list_of_individual_responses or not isinstance(list_of_individual_responses, list):
            print(f"No valid responses found for QID {qid_str}, or format is incorrect. Skipping.")
            continue

        # Extract just the text from the response dicts
        actual_response_text_strings = [
            item.get("text", "") for item in list_of_individual_responses
            if isinstance(item, dict) and isinstance(item.get("text"), str) and item.get("text","").strip()
        ]

        if not actual_response_text_strings:
            print(f"No valid text strings extracted from responses for QID {qid_str}. Skipping.")
            continue

        print(f"--- Analyzing Aggregated Text for QID: {qid_str} ---")
        # The full corpus for this QID is needed for baseline and final L(D|H)
        full_corpus_for_qid_str = "\n\n<RSP_SEP>\n\n".join(actual_response_text_strings)

        if len(full_corpus_for_qid_str.strip()) < 100: # Arbitrary short corpus threshold
            print(f"  Skipping QID {qid_str}: combined text too short ({len(full_corpus_for_qid_str)} chars).")
            continue

        num_responses_for_qid = len(actual_response_text_strings)
        print(f"  Combined corpus for QID {qid_str} has {len(full_corpus_for_qid_str)} chars from {num_responses_for_qid} individual responses.")

        # Calculate baseline L(D) for the entire QID's text
        baseline_l_d_original = compute_bdm_for_text(full_corpus_for_qid_str, bdm_instance_main, MATRIX_SIZE_GLOBAL)
        if baseline_l_d_original < 0:
            print(f"  Error computing baseline BDM for QID {qid_str}. Skipping this QID.")
            continue
        baseline_total_mdl_cost = baseline_l_d_original # L(H) is 0 for baseline
        print(f"  Baseline MDL for QID {qid_str} (L(D_orig)): {baseline_total_mdl_cost:.4f}")

        # --- New Batched Motif Extraction ---
        raw_motifs_from_all_chunks = get_motifs_for_text_chunks(
            actual_response_text_strings, # Pass list of individual response strings
            LLM_BATCH_SIZE,
            local_llm_pipeline_instance,
            local_llm_tokenizer_instance,
            qid_str # For logging
        )

        if not raw_motifs_from_all_chunks:
            print(f"  No raw motifs extracted by LLM for QID {qid_str} from any chunk.")
            # Log baseline result
            all_qid_mdl_results.append({
                "qid": qid_str, "corpus_len_for_qid": len(full_corpus_for_qid_str), "num_responses": num_responses_for_qid,
                "baseline_mdl": baseline_total_mdl_cost, "final_motifs": [],
                "l_h_motifs": 0.0, "l_d_h_motifs": baseline_total_mdl_cost, "total_mdl_motifs": baseline_total_mdl_cost,
                "compression_achieved": 0.0, "num_raw_motifs_extracted": 0, "num_consolidated_motifs": 0
            })
            print("-" * 40)
            continue

        print(f"  Extracted {len(raw_motifs_from_all_chunks)} raw motif objects from LLM for QID {qid_str} (across all chunks).")

        # --- Motif Consolidation Step ---
        consolidated_motifs_temp_dict = {}
        for motif_obj_raw in raw_motifs_from_all_chunks:
            label = motif_obj_raw.get("label")
            description = motif_obj_raw.get("description")
            surface_forms = motif_obj_raw.get("surface_forms")

            if label and description and surface_forms is not None: # Basic validation
                if label not in consolidated_motifs_temp_dict:
                    consolidated_motifs_temp_dict[label] = motif_obj_raw
                else: # Label exists, merge surface forms (simple union)
                    existing_sfs_set = set(consolidated_motifs_temp_dict[label].get("surface_forms", []))
                    new_sfs_set = set(surface_forms)
                    # Keep original description from first encounter of label
                    consolidated_motifs_temp_dict[label]["surface_forms"] = sorted(list(existing_sfs_set.union(new_sfs_set)))
            # else:
                # print(f"    [WARN] Invalid raw motif object structure skipped during QID {qid_str} consolidation: {motif_obj_raw}")

        final_motifs_for_qid = list(consolidated_motifs_temp_dict.values())
        print(f"  Consolidated into {len(final_motifs_for_qid)} unique motifs (by label) for QID {qid_str}.")

        if not final_motifs_for_qid:
            print(f"  No unique motifs left after consolidation for QID {qid_str}.")
            # Log baseline result
            all_qid_mdl_results.append({
                "qid": qid_str, "corpus_len_for_qid": len(full_corpus_for_qid_str), "num_responses": num_responses_for_qid,
                "baseline_mdl": baseline_total_mdl_cost, "final_motifs": [],
                "l_h_motifs": 0.0, "l_d_h_motifs": baseline_total_mdl_cost, "total_mdl_motifs": baseline_total_mdl_cost,
                "compression_achieved": 0.0, "num_raw_motifs_extracted": len(raw_motifs_from_all_chunks), "num_consolidated_motifs": 0
            })
            print("-" * 40)
            continue

        print(f"  Final Consolidated Motifs for QID {qid_str} (for MDL eval):")
        for mo_final in final_motifs_for_qid:
            print(f"    Label: {mo_final.get('label')}, Desc: {mo_final.get('description','N/A')[:60]}..., SFs: {mo_final.get('surface_forms',[])}")


        # Calculate MDL cost using the full original corpus for this QID and the consolidated motifs
        l_h_final, l_d_h_final, total_mdl_with_final_motifs = compute_mdl_cost_for_text_block(
            full_corpus_for_qid_str,
            final_motifs_for_qid,
            bdm_instance_main,
            MATRIX_SIZE_GLOBAL
        )

        if l_d_h_final < 0: # Check if BDM error occurred for L(D|H)
            print(f"  Error computing MDL cost with final motifs for QID {qid_str} (BDM error in L(D|H)). Skipping result.")
            # Log error result
            all_qid_mdl_results.append({
                "qid": qid_str, "corpus_len_for_qid": len(full_corpus_for_qid_str), "num_responses": num_responses_for_qid,
                "baseline_mdl": baseline_total_mdl_cost, "final_motifs": final_motifs_for_qid, # Store attempted motifs
                "l_h_motifs": l_h_final if l_h_final >=0 else "L(H)_OK_BDM_ERR_IN_L(D|H)",
                "l_d_h_motifs": -1.0, "total_mdl_motifs": -1.0,
                "compression_achieved": "BDM_ERROR_IN_L(D|H)",
                "num_raw_motifs_extracted": len(raw_motifs_from_all_chunks),
                "num_consolidated_motifs": len(final_motifs_for_qid)
            })
            print("-" * 40)
            continue

        print(f"  L(H) (Token-based Structured) for final motifs of QID {qid_str}: {l_h_final:.4f}")
        print(f"  L(D|H) (BDM-based) compressed full corpus complexity for QID {qid_str}: {l_d_h_final:.4f}")
        print(f"  Total MDL cost with final motifs for QID {qid_str}: {total_mdl_with_final_motifs:.4f}")

        compression_final = baseline_total_mdl_cost - total_mdl_with_final_motifs
        result_status_str = ""
        if compression_final > 0.0001: # Using a small threshold for "significant"
            result_status_str = f"SUCCESS: Compression achieved: {compression_final:.4f}"
        else:
            result_status_str = f"NOTE: No significant compression (or cost increased). Diff: {compression_final:.4f}"
        print(f"  {result_status_str}")

        all_qid_mdl_results.append({
            "qid": qid_str,
            "corpus_len_for_qid": len(full_corpus_for_qid_str),
            "num_responses": num_responses_for_qid,
            "baseline_mdl": baseline_total_mdl_cost,
            "final_motifs": final_motifs_for_qid,
            "l_h_motifs": l_h_final,
            "l_d_h_motifs": l_d_h_final,
            "total_mdl_motifs": total_mdl_with_final_motifs,
            "compression_achieved": compression_final,
            "num_raw_motifs_extracted": len(raw_motifs_from_all_chunks),
            "num_consolidated_motifs": len(final_motifs_for_qid)
        })
        print("-" * 40)

    # --- Summary Printing and Saving Results ---
    print("\n--- Overall QID-based MDL Analysis Summary (Batched Local LLM, Structured Motifs, Token-L(H)) ---")
    if not all_qid_mdl_results:
        print("No QIDs were processed or no valid results generated.")
    else:
        valid_results_for_stats = [r for r in all_qid_mdl_results if isinstance(r.get('compression_achieved'), float) and r.get('l_h_motifs', -1.0) >= 0]
        num_compressed_qids = sum(1 for r in valid_results_for_stats if r['compression_achieved'] > 0.0001)
        successful_compressions_values = [r['compression_achieved'] for r in valid_results_for_stats if r['compression_achieved'] > 0.0001]

        avg_compression_val = np.mean(successful_compressions_values) if successful_compressions_values else 0
        max_compression_val = np.max(successful_compressions_values) if successful_compressions_values else 0

        print(f"Total QIDs targeted for analysis: {len(qids_to_process_final)}")
        print(f"Total QID results logged: {len(all_qid_mdl_results)}")
        print(f"Number of QIDs where compression was achieved (from valid results): {num_compressed_qids}")
        if num_compressed_qids > 0:
            print(f"  Average compression (for successful cases): {avg_compression_val:.4f}")
            print(f"  Maximum compression achieved across QIDs: {max_compression_val:.4f}")

        output_filename_qids_final = "mdl_analysis_per_qid_batched_llm_v1.json"
        try:
            with open(output_filename_qids_final, "w", encoding="utf-8") as f_out:
                json.dump(all_qid_mdl_results, f_out, indent=2)
            print(f"Detailed QID-based results saved to {output_filename_qids_final}")
        except Exception as e_save:
            print(f"Error saving QID-based results to {output_filename_qids_final}: {e_save}")

if __name__ == "__main__":
    main()

--- MDL Prototype: Batched LLM Motif Extraction, Structured Motifs, Token-L(H) ---
Using L(H) Cost Params: Label=0.5, DescBase=0.5, DescToken=0.1, SFListBase=0.25, SFTokenInLH=0.1
LLM Batch Size: 5 responses. LLM Retries: 2. Max Text per Prompt Chunk: 7500
Using device: cuda
Loading tokenizer for google/gemma-3-4b-it...
BNB config created for google/gemma-3-4b-it, compute_dtype: torch.bfloat16.
Loading local model google/gemma-3-4b-it (Quantization: True)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Local LLM pipeline for google/gemma-3-4b-it initialized successfully.
BDM instance initialized successfully.
Loading Phase 2 output from: /content/drive/MyDrive/Colab Notebooks/Legal/Phase2_PDF_Collated_Texts/phase2_collated_pdf_texts.json...
Targeting specific QIDs from P3_QIDS_TO_PROCESS_THEMATICALLY: ['Q4']

MDL analysis will run for these QIDs: ['Q4']

--- Analyzing Aggregated Text for QID: Q4 ---
  Combined corpus for QID Q4 has 30351 chars from 26 individual responses.
  Baseline MDL for QID Q4 (L(D_orig)): 122.0250
  QID Q4: Processing 26 responses in 6 chunks (batch size: 5 responses).
    Analyzing chunk 1/6 for QID Q4 (len: 4702 chars)...
    DEBUG (call_local_llm): Sending prompt to LLM for QID Q4, Chunk 1 (Prompt length: 5661 chars). First 300 chars of formatted prompt:
<<<<<
<bos><start_of_turn>user
You will receive a set of comments from different people answering the same question.

Your task is to identify up to 5 key recurring themes.

For each theme, provide:
- A shor

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


    DEBUG (call_local_llm): Sending prompt to LLM for QID Q4, Chunk 1 (Prompt length: 5661 chars). First 300 chars of formatted prompt:
<<<<<
<bos><start_of_turn>user
You will receive a set of comments from different people answering the same question.

Your task is to identify up to 5 key recurring themes.

For each theme, provide:
- A short label like [DATA_PRIVACY]
- A 1-sentence definition
- 2–3 short phrases that often appear in the ...
>>>>>


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


    DEBUG (call_local_llm): Raw 'outputs' from hf_pipeline for QID Q4, Chunk 1:
<<<<<
[{'generated_text': '<bos><start_of_turn>user\nYou will receive a set of comments from different people answering the same question.\n\nYour task is to identify up to 5 key recurring themes.\n\nFor each theme, provide:\n- A short label like [DATA_PRIVACY]\n- A 1-sentence definition\n- 2–3 short phrases that often appear in the text (surface forms)\n\nOutput MUST be a valid JSON list of objects, where each object has "label", "description", and "surface_forms" keys.\nExample of one object in the list:\n{\n  "label": "[EXAMPLE_LABEL]",\n  "description": "A concise description of the example theme.",\n  "surface_forms": ["short repeated phrase 1", "another short repeated phrase"]\n}\nIf no clear motifs are found, output an empty JSON list: `[]`.\nDo not include any other text, explanations, or markdown code fences around the JSON.\n\nSet of comments to analyze:\n\'\'\'\nThis excerpt highlights a signific

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


    DEBUG (call_local_llm): Sending prompt to LLM for QID Q4, Chunk 2 (Prompt length: 6938 chars). First 300 chars of formatted prompt:
<<<<<
<bos><start_of_turn>user
You will receive a set of comments from different people answering the same question.

Your task is to identify up to 5 key recurring themes.

For each theme, provide:
- A short label like [DATA_PRIVACY]
- A 1-sentence definition
- 2–3 short phrases that often appear in the ...
>>>>>


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


    DEBUG (call_local_llm): Raw 'outputs' from hf_pipeline for QID Q4, Chunk 2:
<<<<<
[{'generated_text': '<bos><start_of_turn>user\nYou will receive a set of comments from different people answering the same question.\n\nYour task is to identify up to 5 key recurring themes.\n\nFor each theme, provide:\n- A short label like [DATA_PRIVACY]\n- A 1-sentence definition\n- 2–3 short phrases that often appear in the text (surface forms)\n\nOutput MUST be a valid JSON list of objects, where each object has "label", "description", and "surface_forms" keys.\nExample of one object in the list:\n{\n  "label": "[EXAMPLE_LABEL]",\n  "description": "A concise description of the example theme.",\n  "surface_forms": ["short repeated phrase 1", "another short repeated phrase"]\n}\nIf no clear motifs are found, output an empty JSON list: `[]`.\nDo not include any other text, explanations, or markdown code fences around the JSON.\n\nSet of comments to analyze:\n\'\'\'\nHere’s a concise summary of the ex

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


    DEBUG (call_local_llm): Sending prompt to LLM for QID Q4, Chunk 3 (Prompt length: 7362 chars). First 300 chars of formatted prompt:
<<<<<
<bos><start_of_turn>user
You will receive a set of comments from different people answering the same question.

Your task is to identify up to 5 key recurring themes.

For each theme, provide:
- A short label like [DATA_PRIVACY]
- A 1-sentence definition
- 2–3 short phrases that often appear in the ...
>>>>>


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


    DEBUG (call_local_llm): Raw 'outputs' from hf_pipeline for QID Q4, Chunk 3:
<<<<<
[{'generated_text': '<bos><start_of_turn>user\nYou will receive a set of comments from different people answering the same question.\n\nYour task is to identify up to 5 key recurring themes.\n\nFor each theme, provide:\n- A short label like [DATA_PRIVACY]\n- A 1-sentence definition\n- 2–3 short phrases that often appear in the text (surface forms)\n\nOutput MUST be a valid JSON list of objects, where each object has "label", "description", and "surface_forms" keys.\nExample of one object in the list:\n{\n  "label": "[EXAMPLE_LABEL]",\n  "description": "A concise description of the example theme.",\n  "surface_forms": ["short repeated phrase 1", "another short repeated phrase"]\n}\nIf no clear motifs are found, output an empty JSON list: `[]`.\nDo not include any other text, explanations, or markdown code fences around the JSON.\n\nSet of comments to analyze:\n\'\'\'\nHere’s a concise summary of the ex

KeyboardInterrupt: 

In [None]:
# @title Token-based L(H)
# --- Imports ---
import json
import os
import hashlib
import numpy as np
from pybdm import BDM
import re
import time
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline

# --- Configuration ---
BASE_PROJECT_DIR = '/content/drive/MyDrive/Colab Notebooks/Legal/' # !!! EXAMPLE - UPDATE THIS PATH !!!
PHASE2_OUTPUT_DIR = os.path.join(BASE_PROJECT_DIR, 'Phase2_PDF_Collated_Texts/')
P2_COLLATED_FILE = os.path.join(PHASE2_OUTPUT_DIR, 'phase2_collated_pdf_texts.json')

P3_QIDS_TO_PROCESS_THEMATICALLY = ["Q4"] # EXAMPLE: Process only Q4

MATRIX_SIZE_GLOBAL = (8, 8)
LOCAL_LLM_MODEL_ID = 'google/gemma-2b-it'
USE_QUANTIZATION_FOR_LOCAL_LLM = True

# --- Token-Based L(H) Costs for Structured Motifs ---
MOTIF_SYMBOLIC_LABEL_COST = 0.5
MOTIF_DEFINITION_TEXT_BASE_COST = 0.25
MOTIF_DEFINITION_TOKEN_COST = 0.05
MOTIF_SURFACE_FORMS_LIST_BASE_COST = 0.1
MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH = 0.05

# --- Helper Function Definitions ---

def llm_extract_motifs_local_hf(text_to_analyze, hf_pipeline, hf_tokenizer):
    MAX_TEXT_FOR_LLM = 7000 # Or your chosen limit
    text_for_llm_step1 = text_to_analyze
    if len(text_to_analyze) > MAX_TEXT_FOR_LLM:
        text_for_llm_step1 = text_to_analyze[:MAX_TEXT_FOR_LLM]

    # --- Step 1: Extract Candidate Phrases ---
    print("    Executing LLM Step 1: Extracting candidate phrases...")
    prompt_step1_content = f"""
    From the text below, extract up to 7 distinct, short, recurring keyword phrases or noun phrases (typically 2 to 5 words each) that seem to represent key themes or concepts.
    Focus on phrases that you believe are used multiple times or are central to the discussion.
    List each distinct phrase on a new line, with no other formatting or explanations.
    If no such phrases are found, output the exact text: NO_PHRASES_FOUND

    TEXT TO ANALYZE:
    '''
    {text_for_llm_step1}
    '''

    KEY PHRASES (each on a new line):
    """
    messages_step1 = [{"role": "user", "content": prompt_step1_content.strip()}]
    prompt_formatted_step1 = hf_tokenizer.apply_chat_template(messages_step1, tokenize=False, add_generation_prompt=True)

    generation_args_step1 = {"max_new_tokens": 150, "do_sample": False, "pad_token_id": hf_tokenizer.eos_token_id}

    candidate_phrases = []
    try:
        outputs_step1 = hf_pipeline(prompt_formatted_step1, **generation_args_step1)
        if outputs_step1 and outputs_step1[0].get('generated_text'):
            full_response_step1 = outputs_step1[0]['generated_text']
            # Extract only the assistant's response part
            assistant_response_step1 = ""
            if full_response_step1.startswith(prompt_formatted_step1):
                assistant_response_step1 = full_response_step1[len(prompt_formatted_step1):].strip()
            else: # Fallback for models that might not echo prompt fully
                model_turn_start_token = "<start_of_turn>model"
                if model_turn_start_token in full_response_step1:
                    last_occurrence_index = full_response_step1.rfind(model_turn_start_token)
                    assistant_response_step1 = full_response_step1[last_occurrence_index + len(model_turn_start_token):].strip()
                    if assistant_response_step1.startswith("\n"):
                        assistant_response_step1 = assistant_response_step1[1:].strip()
                else:
                     assistant_response_step1 = full_response_step1.strip()

            print(f"    DEBUG Step 1 LLM Raw Output: '{assistant_response_step1}'")
            if "NO_PHRASES_FOUND" not in assistant_response_step1.upper():
                candidate_phrases = [p.strip() for p in assistant_response_step1.split('\n') if p.strip() and len(p.strip().split()) >= 2 and len(p.strip().split()) <= 5] # Ensure phrases meet length criteria
                candidate_phrases = list(dict.fromkeys(candidate_phrases)) # Deduplicate
                print(f"    LLM Step 1 Extracted Candidate Phrases: {candidate_phrases}")
            else:
                print("    LLM Step 1 indicated NO_PHRASES_FOUND.")
        else:
            print("    Error: LLM Step 1 returned empty/unexpected output.")
    except Exception as e_step1:
        print(f"    Error in LLM Step 1 (Extracting Phrases): {e_step1}")
        return [] # Return empty if step 1 fails

    if not candidate_phrases:
        return []

    # --- Step 2: Structure Each Phrase into a Motif Object ---
    structured_motifs_list = []
    # Provide a smaller context snippet for step 2 to avoid overwhelming the LLM again with long text
    context_snippet_for_step2 = text_for_llm_step1[:1000] # e.g., first 1000 chars of the (potentially truncated) text

    for i, phrase in enumerate(candidate_phrases[:5]): # Process up to 5 phrases from step 1
        print(f"    Executing LLM Step 2: Structuring phrase '{phrase}'...")
        prompt_step2_content = f"""
        You are an expert in thematic analysis.
        The following KEY PHRASE was identified as potentially important from a larger document: "{phrase}"

        For context, here is a brief snippet from the beginning of the larger document:
        '''
        {context_snippet_for_step2}
        '''

        Your task is to structure this KEY PHRASE into a thematic motif. Provide:
        1. A short, symbolic label in [UPPER_SNAKE_CASE_WITH_BRACKETS] that captures the essence of the KEY PHRASE.
        2. A concise, 1-sentence human-readable description of the theme represented by this KEY PHRASE.
        3. A list called "surface_forms". This list MUST include the original KEY PHRASE: "{phrase}". You can optionally add 1 or 2 other very close variations or synonyms if they strongly represent the same immediate concept. Prefer short, common variations.

        Output MUST be a single valid JSON object with "label", "description", and "surface_forms" keys.
        Do not include any other text, explanations, or markdown code fences around the JSON.

        Example of ONE JSON object:
        {{
          "label": "[EXAMPLE_LABEL]",
          "description": "A concise description of the theme related to the input phrase.",
          "surface_forms": ["original input phrase", "very close variant 1"]
        }}

        JSON Output for KEY PHRASE "{phrase}":
        """
        messages_step2 = [{"role": "user", "content": prompt_step2_content.strip()}]
        prompt_formatted_step2 = hf_tokenizer.apply_chat_template(messages_step2, tokenize=False, add_generation_prompt=True)

        generation_args_step2 = {"max_new_tokens": 250, "do_sample": False, "pad_token_id": hf_tokenizer.eos_token_id} # Max tokens for a single JSON object

        try:
            outputs_step2 = hf_pipeline(prompt_formatted_step2, **generation_args_step2)
            if outputs_step2 and outputs_step2[0].get('generated_text'):
                full_response_step2 = outputs_step2[0]['generated_text']
                assistant_response_step2 = ""
                if full_response_step2.startswith(prompt_formatted_step2):
                     assistant_response_step2 = full_response_step2[len(prompt_formatted_step2):].strip()
                else: # Fallback for models that might not echo prompt fully
                    model_turn_start_token = "<start_of_turn>model"
                    if model_turn_start_token in full_response_step2:
                        last_occurrence_index = full_response_step2.rfind(model_turn_start_token)
                        assistant_response_step2 = full_response_step2[last_occurrence_index + len(model_turn_start_token):].strip()
                        if assistant_response_step2.startswith("\n"):
                             assistant_response_step2 = assistant_response_step2[1:].strip()
                    else:
                        assistant_response_step2 = full_response_step2.strip()

                print(f"    DEBUG Step 2 LLM Raw Output for '{phrase}': '{assistant_response_step2}'")

                json_str_step2 = assistant_response_step2
                # Minimal JSON extraction (assuming LLM is mostly compliant)
                first_bracket_s2 = json_str_step2.find('{')
                last_bracket_s2 = json_str_step2.rfind('}')
                if first_bracket_s2 != -1 and last_bracket_s2 != -1 and last_bracket_s2 > first_bracket_s2:
                    json_str_step2 = json_str_step2[first_bracket_s2 : last_bracket_s2+1].strip()

                # print(f"    DEBUG Step 2 Extracted JSON string for '{phrase}': '{json_str_step2}'")

                try:
                    parsed_json_obj = json.loads(json_str_step2)
                    if isinstance(parsed_json_obj, dict) and \
                       'label' in parsed_json_obj and isinstance(parsed_json_obj['label'], str) and \
                       'description' in parsed_json_obj and isinstance(parsed_json_obj['description'], str) and \
                       parsed_json_obj['label'].startswith('[') and parsed_json_obj['label'].endswith(']'):

                        sf = parsed_json_obj.get('surface_forms', [])
                        if not isinstance(sf, list) or not all(isinstance(s, str) for s in sf):
                            sf = [phrase] # Default to the input phrase if SFs are bad
                        elif phrase.lower() not in [s.lower() for s in sf]: # Ensure original phrase is in SFs
                            sf.insert(0, phrase)

                        structured_motifs_list.append({
                            "label": parsed_json_obj['label'].strip(),
                            "description": parsed_json_obj['description'].strip(),
                            "surface_forms": list(dict.fromkeys([s.strip() for s in sf if s.strip()])) # Deduplicate SFs
                        })
                        print(f"    LLM Step 2 Structured Motif: {structured_motifs_list[-1]}")
                    # else:
                        # print(f"    WARN: Invalid JSON object structure from LLM Step 2 for '{phrase}': {parsed_json_obj}")
                except json.JSONDecodeError as e_s2:
                    print(f"    Error decoding JSON from LLM Step 2 for '{phrase}': {e_s2}")
                    print(f"    Problematic JSON string was: '{json_str_step2}'")
            else:
                print(f"    Error: LLM Step 2 returned empty/unexpected output for phrase '{phrase}'.")
        except Exception as e_step2:
            print(f"    Error in LLM Step 2 (Structuring Phrase '{phrase}'): {e_step2}")
            # Continue to next phrase even if one fails

    return structured_motifs_list

def calculate_L_H_token_based_structured(structured_motifs_list):
    if not structured_motifs_list: return 0.0
    total_lh_cost = 0.0
    for motif_obj in structured_motifs_list:
        if not isinstance(motif_obj, dict): continue
        current_motif_lh = 0
        label_str = motif_obj.get('label', "")
        if label_str and isinstance(label_str, str) and label_str.strip():
            current_motif_lh += MOTIF_SYMBOLIC_LABEL_COST
        definition_str = motif_obj.get('definition', "")
        if definition_str and isinstance(definition_str, str) and definition_str.strip():
            current_motif_lh += MOTIF_DEFINITION_TEXT_BASE_COST
            current_motif_lh += len(definition_str.split()) * MOTIF_DEFINITION_TOKEN_COST
        surface_forms_list = motif_obj.get('surface_forms', [])
        if surface_forms_list and isinstance(surface_forms_list, list):
            valid_sfs_for_lh = [sf for sf in surface_forms_list if isinstance(sf, str) and sf.strip()]
            if valid_sfs_for_lh:
                current_motif_lh += MOTIF_SURFACE_FORMS_LIST_BASE_COST
                for sf_str in valid_sfs_for_lh:
                    current_motif_lh += len(sf_str.split()) * MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH
        total_lh_cost += current_motif_lh
    return total_lh_cost

def llm_compress_text_structured(text_to_compress, structured_motifs_list):
    if not isinstance(text_to_compress, str): return ""
    compressed = text_to_compress.lower()
    if not structured_motifs_list: return compressed
    for motif_obj in structured_motifs_list:
        if not isinstance(motif_obj, dict): continue
        label = motif_obj.get('label', "[UNKNOWN_LABEL]")
        surface_forms = motif_obj.get('surface_forms', [])
        if not label or not surface_forms or not isinstance(surface_forms, list): continue
        sorted_sfs_for_this_motif = sorted(
            [sf for sf in surface_forms if isinstance(sf, str) and sf.strip()],
            key=len, reverse=True
        )
        placeholder = label
        for sf in sorted_sfs_for_this_motif:
            try:
                compressed = re.sub(re.escape(sf.lower()), placeholder, compressed, flags=re.IGNORECASE)
            except re.error as re_e:
                print(f"    Regex error for SF '{sf}' of motif '{label}': {re_e}. Skipping.")
                continue
    return compressed

def text_to_binary_matrix(text_input, size=MATRIX_SIZE_GLOBAL):
    if not text_input or not isinstance(text_input, str): return np.zeros(size, dtype=int)
    hash_digest = hashlib.sha256(text_input.encode('utf-8', 'ignore')).hexdigest()
    required_bits = size[0] * size[1]
    binary_string = bin(int(hash_digest, 16))[2:].zfill(required_bits)
    binary_string_padded = binary_string.ljust(required_bits, '0')
    bits = [int(b) for b in binary_string_padded[:required_bits]]
    return np.array(bits).reshape(size)

def compute_bdm_for_text(text_input, bdm_instance, matrix_s=MATRIX_SIZE_GLOBAL):
    if not text_input or not isinstance(text_input, str) : return 0.0
    if not text_input.strip(): return 0.0
    MAX_TEXT_FOR_BDM_HASH = 2000
    text_for_hash = text_input if len(text_input) <= MAX_TEXT_FOR_BDM_HASH else text_input[:MAX_TEXT_FOR_BDM_HASH]
    matrix = text_to_binary_matrix(text_for_hash, size=matrix_s)
    try:
        return bdm_instance.bdm(matrix)
    except Exception as e_bdm:
        print(f"      Error during BDM calculation for text (len {len(text_input)}, hashed part len {len(text_for_hash)}): {e_bdm}")
        return -1.0

def compute_mdl_cost_for_text_block(text_block_str, structured_motifs_list, bdm_instance, matrix_s=MATRIX_SIZE_GLOBAL):
    if not isinstance(text_block_str, str) : text_block_str = ""
    l_h = calculate_L_H_token_based_structured(structured_motifs_list)
    compressed_text_block = llm_compress_text_structured(text_block_str, structured_motifs_list)
    l_d_h = compute_bdm_for_text(compressed_text_block, bdm_instance, matrix_s)
    if l_d_h < 0: return l_h, -1.0, -1.0
    return l_h, l_d_h, l_h + l_d_h

# --- Main Execution Logic ---
def main():
    print("--- MDL Prototype: Analyzing Per-QID Aggregated Text (Local LLM, Structured Motifs, Token-L(H)) ---")
    print(f"Using L(H) Cost Params: Label={MOTIF_SYMBOLIC_LABEL_COST}, DefBase={MOTIF_DEFINITION_TEXT_BASE_COST}, DefToken={MOTIF_DEFINITION_TOKEN_COST}, SFListBase={MOTIF_SURFACE_FORMS_LIST_BASE_COST}, SFTokenInLH={MOTIF_SURFACE_FORM_TOKEN_COST_IN_LH}\n")

    local_llm_pipeline = None
    local_llm_tokenizer = None
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    try:
        print(f"Loading tokenizer for {LOCAL_LLM_MODEL_ID}...")
        local_llm_tokenizer = AutoTokenizer.from_pretrained(LOCAL_LLM_MODEL_ID)
        bnb_config = None; quant_active = False
        if USE_QUANTIZATION_FOR_LOCAL_LLM and torch.cuda.is_available():
            try:
                compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
                bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=compute_dtype, bnb_4bit_use_double_quant=True)
                quant_active = True; print(f"BNB config created for {LOCAL_LLM_MODEL_ID}, compute_dtype: {compute_dtype}.")
            except Exception as e_bnb: print(f"WARN: Failed to create BitsAndBytesConfig: {e_bnb}. Quantization disabled.")

        print(f"Loading local model {LOCAL_LLM_MODEL_ID} (Quantization: {quant_active})...")
        model_kwargs = {"device_map": "auto", "trust_remote_code": True}
        if quant_active: model_kwargs["quantization_config"] = bnb_config
        else:
            if device.type == 'cuda': model_kwargs["torch_dtype"] = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
        local_llm_model = AutoModelForCausalLM.from_pretrained(LOCAL_LLM_MODEL_ID, **model_kwargs)
        local_llm_pipeline = pipeline("text-generation", model=local_llm_model, tokenizer=local_llm_tokenizer)
        print(f"Local LLM pipeline for {LOCAL_LLM_MODEL_ID} initialized successfully.")
    except Exception as e: print(f"CRITICAL: Failed to initialize local LLM pipeline: {e}"); return

    try:
        bdm_instance = BDM(ndim=2); print("BDM instance initialized successfully.")
    except Exception as e_bdm_init: print(f"CRITICAL: Failed to initialize BDM instance: {e_bdm_init}"); return

    if not os.path.exists(P2_COLLATED_FILE): print(f"ERROR: Phase 2 output file not found: {P2_COLLATED_FILE}"); return
    print(f"Loading Phase 2 output from: {P2_COLLATED_FILE}...")
    phase2_data_content = None
    try:
        with open(P2_COLLATED_FILE, 'r', encoding='utf-8') as f: phase2_data_content = json.load(f)
    except Exception as e: print(f"Error loading or parsing {P2_COLLATED_FILE}: {e}"); return

    all_qid_results = []; qids_to_target = []; aggregated_content_by_qid = {}
    if phase2_data_content: aggregated_content_by_qid = phase2_data_content.get("aggregated_pdf_content_by_qid", {})
    if not aggregated_content_by_qid: print(f"No 'aggregated_pdf_content_by_qid' key found or data loaded from {P2_COLLATED_FILE}."); return

    if P3_QIDS_TO_PROCESS_THEMATICALLY and isinstance(P3_QIDS_TO_PROCESS_THEMATICALLY, list) and len(P3_QIDS_TO_PROCESS_THEMATICALLY) > 0:
        qids_to_target = [qid for qid in P3_QIDS_TO_PROCESS_THEMATICALLY if qid in aggregated_content_by_qid]
        print(f"Targeting specific QIDs: {qids_to_target}")
        if not qids_to_target: print(f"WARN: Specified QIDs not found."); return
    else:
        qids_to_process_limit_fallback = 1; print(f"Processing up to {qids_to_process_limit_fallback} QID(s).")
        temp_qids_to_target = [qid_key for q_idx, qid_key in enumerate(aggregated_content_by_qid.keys()) if q_idx < qids_to_process_limit_fallback]
        qids_to_target = temp_qids_to_target
        if not qids_to_target: print("No QIDs to process."); return
    print(f"\nMDL analysis will run for QIDs: {qids_to_target}\n")

    for qid in qids_to_target:
        text_items_list = aggregated_content_by_qid.get(qid)
        if not text_items_list or not isinstance(text_items_list, list): continue
        print(f"--- Analyzing Aggregated Text for QID: {qid} ---")
        all_texts_for_qid = [item.get("text", "") for item in text_items_list if isinstance(item, dict) and isinstance(item.get("text"), str) and item.get("text","").strip()]
        corpus_for_qid = "\n\n<RSP_SEP>\n\n".join(all_texts_for_qid)
        if len(corpus_for_qid.strip()) < 100: print(f"  Skipping QID {qid}: combined text too short."); continue

        num_responses_for_qid = len(all_texts_for_qid)
        print(f"  Combined corpus for QID {qid} has {len(corpus_for_qid)} chars from {num_responses_for_qid} responses.")
        baseline_l_d_h = compute_bdm_for_text(corpus_for_qid, bdm_instance, MATRIX_SIZE_GLOBAL)
        if baseline_l_d_h < 0: print(f"  Error computing baseline BDM for QID {qid}. Skipping."); continue
        baseline_total_mdl = baseline_l_d_h
        print(f"  Baseline MDL (L(D_orig)): {baseline_total_mdl:.4f}")

        structured_motifs = llm_extract_motifs_local_hf_json(corpus_for_qid, local_llm_pipeline, local_llm_tokenizer)
        print(f"  Local LLM Extracted Structured Motifs for QID {qid}:")
        if structured_motifs:
            for mo in structured_motifs: print(f"    Label: {mo.get('label')}, Def: {mo.get('definition', '')[:60]}..., SFs: {mo.get('surface_forms')}")
        else: print("    None extracted.")

        if not structured_motifs:
            print("  No valid structured motifs extracted by Local LLM.")
            all_qid_results.append({"qid": qid, "baseline_mdl": baseline_total_mdl, "motifs": [], "l_h_motifs": 0.0, "l_d_h_motifs": baseline_total_mdl, "total_mdl_motifs": baseline_total_mdl, "compression_achieved": 0.0})
            print("-" * 40); continue

        l_h, l_d_h, total_mdl_with_motifs = compute_mdl_cost_for_text_block(corpus_for_qid, structured_motifs, bdm_instance, MATRIX_SIZE_GLOBAL)
        if l_d_h < 0:
            print(f"  Error computing MDL L(D|H) with motifs. Skipping."); all_qid_results.append({"qid": qid, "baseline_mdl": baseline_total_mdl, "motifs": structured_motifs, "l_h_motifs": l_h, "l_d_h_motifs": -1.0, "total_mdl_motifs": -1.0, "compression_achieved": "BDM_ERROR_LDH"})
            print("-" * 40); continue

        print(f"  L(H) (Token-based Structured) motif complexity for QID {qid}: {l_h:.4f}")
        print(f"  L(D|H) (BDM-based) compressed corpus complexity for QID {qid}: {l_d_h:.4f}")
        print(f"  Total MDL cost with motifs for QID {qid}: {total_mdl_with_motifs:.4f}")
        compression = baseline_total_mdl - total_mdl_with_motifs
        result_status = f"NOTE: No significant compression. Diff: {compression:.4f}"
        # Fixed typo in variable name for SUCCESS case:
        if compression > 0.0001: result_status = f"SUCCESS: Compression achieved: {compression:.4f}"
        print(f"  {result_status}")
        all_qid_results.append({"qid": qid, "baseline_mdl": baseline_total_mdl, "motifs": structured_motifs, "l_h_motifs": l_h, "l_d_h_motifs": l_d_h, "total_mdl_motifs": total_mdl_with_motifs, "compression_achieved": compression})
        print("-" * 40)

    print("\n--- Overall QID-based MDL Analysis Summary (Local LLM, Structured Motifs, Token-L(H)) ---")
    if not all_qid_results: print("No QIDs processed/results generated.")
    else:
        valid_results = [r for r in all_qid_results if isinstance(r.get('compression_achieved', 'error'), float)]
        num_compressed = sum(1 for r in valid_results if r['compression_achieved'] > 0.0001)
        success_comps = [r['compression_achieved'] for r in valid_results if r['compression_achieved'] > 0.0001]
        avg_comp = np.mean(success_comps) if success_comps else 0
        max_comp = np.max(success_comps) if success_comps else 0
        print(f"Total QIDs targeted: {len(qids_to_target)}, Results logged: {len(all_qid_results)}")
        print(f"QIDs with compression: {num_compressed}")
        if num_compressed > 0: print(f"  Avg compression: {avg_comp:.4f}, Max compression: {max_comp:.4f}")
        output_filename = "mdl_analysis_per_qid_local_llm_structured_placeholders_v1.json" # New filename
        try:
            with open(output_filename, "w", encoding="utf-8") as f_out: json.dump(all_qid_results, f_out, indent=2)
            print(f"Detailed results saved to {output_filename}")
        except Exception as e_save: print(f"Error saving results: {e_save}")

if __name__ == "__main__":
    main()

--- MDL Prototype: Analyzing Per-QID Aggregated Text (Local LLM, Structured Motifs, Token-L(H)) ---
Using L(H) Cost Params: Label=0.5, DefBase=0.25, DefToken=0.05, SFListBase=0.1, SFTokenInLH=0.05

Using device: cuda
Loading tokenizer for google/gemma-2b-it...
BNB config created for google/gemma-2b-it, compute_dtype: torch.bfloat16.
Loading local model google/gemma-2b-it (Quantization: True)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Local LLM pipeline for google/gemma-2b-it initialized successfully.
BDM instance initialized successfully.
Loading Phase 2 output from: /content/drive/MyDrive/Colab Notebooks/Legal/Phase2_PDF_Collated_Texts/phase2_collated_pdf_texts.json...
Targeting specific QIDs: ['Q4']

MDL analysis will run for QIDs: ['Q4']

--- Analyzing Aggregated Text for QID: Q4 ---
  Combined corpus for QID Q4 has 129501 chars from 209 responses.
  Baseline MDL (L(D_orig)): 121.3693
  Local LLM Extracted Structured Motifs for QID Q4:
    None extracted.
  No valid structured motifs extracted by Local LLM.
----------------------------------------

--- Overall QID-based MDL Analysis Summary (Local LLM, Structured Motifs, Token-L(H)) ---
Total QIDs targeted: 1, Results logged: 1
QIDs with compression: 0
Detailed results saved to mdl_analysis_per_qid_local_llm_structured_placeholders_v1.json


In [None]:
import json
from typing import List, Dict

def build_prompt(response_texts: List[str]) -> str:
    text_block = "\n\n".join(response_texts)
    prompt = f"""You will receive a set of responses to the same question.

Your task is to identify up to 5 key recurring themes.

For each theme, provide:
- A short label like [DATA_PRIVACY]
- A 1-sentence definition
- 2–3 short phrases that often appear in the text (surface forms)

Respond in JSON format like this:
[
  {{
    "label": "[LABEL]",
    "description": "Short definition...",
    "surface_forms": ["form1", "form2"]
  }}
]

Responses:
{text_block}
"""
    return prompt


def extract_structured_motifs_from_responses(all_responses: List[str], batch_size: int = 10) -> List[Dict]:
    all_motifs = []

    for i in range(0, len(all_responses), batch_size):
        batch = all_responses[i:i + batch_size]
        prompt = build_prompt(batch)

        try:
            # Replace this with your actual LLM call (local model or remote)
            llm_response = local_llm.generate(prompt)
            motifs = json.loads(llm_response)

            if isinstance(motifs, list):
                all_motifs.extend(motifs)
            else:
                raise ValueError("Response was not a list")

        except Exception as e:
            print(f"[WARN] Failed to extract motifs for batch {i//batch_size + 1}: {e}")
            with open("llm_motif_debug.txt", "a") as f:
                f.write(f"\n--- Batch {i//batch_size + 1} Prompt ---\n{prompt}")
                f.write(f"\n--- LLM Response ---\n{llm_response}\n")

    return all_motifs


In [None]:
structured_motifs = extract_structured_motifs_from_responses(qid_responses["Q4"], batch_size=10)

# Proceed to MDL cost estimation
l_h = calculate_L_H_token_based(structured_motifs)
l_d_given_h = estimate_bdm_cost_with_placeholders(corpus, structured_motifs)


# 30th May

In [None]:
pip install pybdm nltk tqdm


Collecting pybdm
  Downloading pybdm-0.1.0-py2.py3-none-any.whl.metadata (8.2 kB)
Downloading pybdm-0.1.0-py2.py3-none-any.whl (39.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pybdm
Successfully installed pybdm-0.1.0


In [None]:
import numpy as np
from pybdm import BDM
import hashlib

# Initialize the BDM estimator for 2D matrices
bdm = BDM(ndim=2)

def text_to_binary_matrix(text, size=(8, 8)):
    """
    Converts a text string into a binary matrix via SHA-256 hashing.
    This ensures fixed-length binary output suitable for BDM input.
    """
    # Create SHA-256 hash and convert to binary string
    hash_digest = hashlib.sha256(text.encode()).hexdigest()
    binary_string = bin(int(hash_digest, 16))[2:].zfill(256)  # Ensure 256 bits

    # Take only enough bits to fill the matrix
    binary_list = [int(bit) for bit in binary_string[:size[0] * size[1]]]

    return np.array(binary_list).reshape(size)

# Example usage
motif = "privacy rights"
matrix = text_to_binary_matrix(motif)
complexity = bdm.bdm(matrix)

print("Motif:", motif)
print("Matrix:\n", matrix)
print("BDM Complexity:", complexity)


Motif: privacy rights
Matrix:
 [[0 0 1 0 1 0 0 1]
 [0 1 1 0 1 0 1 1]
 [1 0 0 1 1 1 0 1]
 [0 0 0 1 1 0 0 1]
 [1 1 1 1 0 1 0 1]
 [0 0 1 0 1 0 1 1]
 [1 0 1 0 0 0 1 1]
 [0 0 1 0 1 0 0 1]]
BDM Complexity: 129.57258675661535


In [None]:
import json
import re
import numpy as np
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from pybdm import BDM
import hashlib
import nltk

nltk.download('punkt')
nltk.download('punkt_tab')

# Initialize BDM (2D)
bdm = BDM(ndim=2)
MATRIX_SIZE = (8, 8)
MIN_NGRAM = 2
MAX_NGRAM = 4

def normalize_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    tokens = word_tokenize(text)
    return tokens

def discover_motifs(tokens, min_freq=2, n_range=(2,4)):
    counts = Counter()
    for n in range(n_range[0], n_range[1]+1):
        counts.update(ngrams(tokens, n))
    motifs = { ' '.join(k): v for k, v in counts.items() if v >= min_freq }
    return motifs

def text_to_binary_matrix(text, size=MATRIX_SIZE):
    hash_digest = hashlib.sha256(text.encode()).hexdigest()
    binary_string = bin(int(hash_digest, 16))[2:].zfill(size[0]*size[1])
    bits = [int(b) for b in binary_string[:size[0]*size[1]]]
    return np.array(bits).reshape(size)

def encode_text_as_motifs(tokens, motifs):
    encoded = []
    motif_keys = sorted(motifs.keys(), key=lambda x: -len(x.split()))
    i = 0
    while i < len(tokens):
        matched = False
        for motif in motif_keys:
            motif_tokens = motif.split()
            if tokens[i:i+len(motif_tokens)] == motif_tokens:
                encoded.append(f"<{motif}>")
                i += len(motif_tokens)
                matched = True
                break
        if not matched:
            encoded.append(tokens[i])
            i += 1
    return encoded

def chunk_sequence_to_matrices(sequence, chunk_len=64):
    # Convert sequence (list of strings) into chunks, then each chunk to binary matrices
    matrices = []
    # Join sequence into a string with space separators for hashing
    seq_str = ' '.join(sequence)
    for i in range(0, len(seq_str), chunk_len):
        chunk = seq_str[i:i+chunk_len]
        matrices.append(text_to_binary_matrix(chunk))
    return matrices

def mdl_cost(motifs, encoded_seq):
    # Calculate L(H)
    l_h = sum(bdm.bdm(text_to_binary_matrix(m)) for m in motifs.keys())

    # Calculate L(D|H)
    matrices = chunk_sequence_to_matrices(encoded_seq)
    l_d_h = sum(bdm.bdm(m) for m in matrices)

    return l_h, l_d_h, l_h + l_d_h

# Sample JSONL input for Q4
sample_jsonl_line = '''
{
  "Q4": {
    "question_text": "Noting the current individual rights contained in Australian Privacy Principles 12 and 13, and the proposed individual rights in proposals 18.1, 18.2 and 18.3, what specific exceptions (if any) should apply to these rights in the employment context?",
    "status": "success_summarized",
    "extracted_passages": [
      "13 March 2023 Attorney-Generals Department Email: To whom it may concern Thank you for the opportunity to provide feedback on the Privacy Act Review Report. I would like to raise an issue that undermines the privacy rights of Australians. Property data is available for everyone and websites such as Realestate.com.au display past and current pictures of your home, even though the property is off market."
    ],
    "top_passage_score": 0.6251732707023621,
    "summary": "This excerpt highlights a significant concern regarding privacy rights in Australia, specifically concerning the availability of property data. The Attorney-Generals Department points to the Privacy Act 2022, referencing principles 12 and 13, and proposes new rights for individuals in proposals 18.1, 18.2, and 18.3. The core issue is that real estate websites like Realestate.com.au are collecting and displaying property images, potentially violating individuals’ right to control their personal information and privacy."
  }
}
'''

def run_example(jsonl_line):
    data = json.loads(jsonl_line)
    qid = next(iter(data))
    passage = data[qid]['extracted_passages'][0]
    tokens = normalize_text(passage)
    motifs = discover_motifs(tokens)
    encoded_seq = encode_text_as_motifs(tokens, motifs)
    l_h, l_d_h, total = mdl_cost(motifs, encoded_seq)

    print(f"QID: {qid}")
    print(f"Motifs (L(H)):\n{motifs}")
    print(f"Encoded sequence:\n{encoded_seq}")
    print(f"L(H) (motifs complexity): {l_h}")
    print(f"L(D|H) (encoded text complexity): {l_d_h}")
    print(f"Total MDL cost: {total}")

run_example(sample_jsonl_line)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


QID: Q4
Motifs (L(H)):
{'the privacy': 2}
Encoded sequence:
['13', 'march', '2023', 'attorneygenerals', 'department', 'email', 'to', 'whom', 'it', 'may', 'concern', 'thank', 'you', 'for', 'the', 'opportunity', 'to', 'provide', 'feedback', 'on', '<the privacy>', 'act', 'review', 'report', 'i', 'would', 'like', 'to', 'raise', 'an', 'issue', 'that', 'undermines', '<the privacy>', 'rights', 'of', 'australians', 'property', 'data', 'is', 'available', 'for', 'everyone', 'and', 'websites', 'such', 'as', 'realestatecomau', 'display', 'past', 'and', 'current', 'pictures', 'of', 'your', 'home', 'even', 'though', 'the', 'property', 'is', 'off', 'market']
L(H) (motifs complexity): 125.47929777772686
L(D|H) (encoded text complexity): 868.968392491515
Total MDL cost: 994.4476902692419


## MWP using Q4

In [None]:
pip install google-genai



In [None]:
from google.colab import userdata

import os

os.environ["GEMINI_API_KEY"] = userdata.get('GEMINI_API_KEY')

In [None]:
import os
from google import genai
from google.genai import types
import numpy as np
import hashlib
from pybdm import BDM

bdm = BDM(ndim=2)
MATRIX_SIZE = (8, 8)

# Initialize Gemini client once (make sure GEMINI_API_KEY is set in your env)
client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY"))
model = "gemma-3n-e4b-it"

def llm_extract_motifs(text):
    prompt = f"""
    Extract a concise list of 5 key themes or motifs from the following text. List them separated by commas:

    Text:
    \"\"\"{text}\"\"\"
    """

    contents = [
        types.Content(
            role="user",
            parts=[types.Part.from_text(text=prompt)]
        ),
    ]

    config = types.GenerateContentConfig(response_mime_type="text/plain")

    response = client.models.generate_content(model=model, contents=contents, config=config)
    motifs_text = response.text.strip()

    motifs = [m.strip() for m in motifs_text.split(",") if m.strip()]
    return motifs

def llm_compress_text(text, motifs):
    compressed = text.lower()
    for motif in motifs:
        compressed = compressed.replace(motif, f"<{motif}>")
    return compressed

def text_to_binary_matrix(text, size=MATRIX_SIZE):
    hash_digest = hashlib.sha256(text.encode()).hexdigest()
    binary_string = bin(int(hash_digest, 16))[2:].zfill(size[0]*size[1])
    bits = [int(b) for b in binary_string[:size[0]*size[1]]]
    return np.array(bits).reshape(size)

def compute_mdl_cost(text, motifs):
    l_h = sum(bdm.bdm(text_to_binary_matrix(m)) for m in motifs)
    compressed_text = llm_compress_text(text, motifs)
    l_d_h = bdm.bdm(text_to_binary_matrix(compressed_text))
    return l_h, l_d_h, l_h + l_d_h

if __name__ == "__main__":
    sample_text = (
        "13 March 2023 Attorney-Generals Department Email: To whom it may concern "
        "Thank you for the opportunity to provide feedback on the Privacy Act Review Report. "
        "I would like to raise an issue that undermines the privacy rights of Australians. "
        "Property data is available for everyone and websites such as Realestate.com.au display "
        "past and current pictures of your home, even though the property is off market."
    )

    motifs = llm_extract_motifs(sample_text)
    print("Extracted motifs:", motifs)

    l_h, l_d_h, total = compute_mdl_cost(sample_text, motifs)
    print(f"L(H) motif complexity: {l_h:.4f}")
    print(f"L(D|H) compressed text complexity: {l_d_h:.4f}")
    print(f"Total MDL cost: {total:.4f}")


Extracted motifs: ['Privacy rights', 'Data accessibility', 'Property data exposure', 'Online information sharing', 'Off-market property information.']
L(H) motif complexity: 603.6927
L(D|H) compressed text complexity: 128.5227
Total MDL cost: 732.2154


## Input text extracted using existing Thematic Analysis Pipeline

In [None]:
# --- Previous BDM MDL Prototype Code (Imports, BDM init, text_to_binary_matrix, etc.) ---
import json
import os
import hashlib
import numpy as np
from pybdm import BDM
import re
from google import genai
from google.genai import types

# --- Configuration from your larger codebase (Cell 2) ---
BASE_PROJECT_DIR = '/content/drive/MyDrive/Colab Notebooks/Legal/' # EXAMPLE - UPDATE
PHASE2_OUTPUT_DIR = os.path.join(BASE_PROJECT_DIR, 'Phase2_PDF_Collated_Texts/')
P2_COLLATED_FILE = os.path.join(PHASE2_OUTPUT_DIR, 'phase2_collated_pdf_texts.json') # Corrected variable name

# --- Your existing BDM MDL functions (ensure they are defined above) ---
# (BDM, MATRIX_SIZE, client, model from your BDM prototype)
# Function definitions (llm_extract_motifs, llm_compress_text, text_to_binary_matrix,
# compute_bdm_for_text, compute_mdl_cost_for_text_block)

def llm_extract_motifs(text_to_analyze, genai_client, llm_model_name):
    # Your existing llm_extract_motifs function
    prompt = f"""
    From the following text, which comprises several responses to a single question,
    extract a concise list of up to 5 key recurring themes or motifs.
    List them separated by commas, with no introductory text.
    If no clear themes, output "NO_THEMES_FOUND".

    Text:
    \"\"\"{text_to_analyze}\"\"\"

    Recurring Themes/Motifs:
    """
    # Limit text_to_analyze to avoid exceeding LLM context window for this prototype
    # A more robust solution would involve chunking the text if it's too long.
    MAX_TEXT_FOR_LLM_MOTIF_EXTRACTION = 15000 # Approx characters, adjust based on model
    if len(text_to_analyze) > MAX_TEXT_FOR_LLM_MOTIF_EXTRACTION:
        print(f"    Warning: Text for motif extraction is long ({len(text_to_analyze)} chars), truncating to {MAX_TEXT_FOR_LLM_MOTIF_EXTRACTION} for LLM.")
        text_to_analyze = text_to_analyze[:MAX_TEXT_FOR_LLM_MOTIF_EXTRACTION]


    contents = [types.Content(role="user", parts=[types.Part.from_text(text=prompt)])]
    config = types.GenerateContentConfig(response_mime_type="text/plain")

    try:
        response = genai_client.models.generate_content(model=llm_model_name, contents=contents, config=config)
        motifs_text = response.text.strip()
        if "NO_THEMES_FOUND" in motifs_text or not motifs_text:
            return []
        motifs = [m.strip() for m in motifs_text.split(",") if m.strip() and len(m.strip()) > 3] # Slightly longer min motif
        return motifs[:5] # Max 5 motifs
    except Exception as e:
        print(f"Error in llm_extract_motifs for text starting with '{text_to_analyze[:50]}...': {e}")
        return []

def llm_compress_text(text_to_compress, motifs_list):
    # Your existing llm_compress_text function
    compressed = text_to_compress.lower() # Consistent case for replacement
    for motif in motifs_list:
        if motif and isinstance(motif, str): # Ensure motif is a non-empty string
            placeholder = f"<MOTIF_{motif.replace(' ', '_').upper().replace('[','').replace(']','').replace(':','_')[:20]}>" # Sanitize and shorten placeholder
            try:
                # Use re.escape to handle special characters in motifs
                compressed = re.sub(re.escape(motif.lower()), placeholder, compressed, flags=re.IGNORECASE)
            except re.error as re_e:
                print(f"    Regex error during compression for motif '{motif}': {re_e}. Skipping this motif for compression.")
                continue # Skip problematic motif
    return compressed

def text_to_binary_matrix(text_input, size=(8,8)):
    # Your existing text_to_binary_matrix function
    if not text_input or not isinstance(text_input, str):
        return np.zeros(size, dtype=int)
    hash_digest = hashlib.sha256(text_input.encode('utf-8', 'ignore')).hexdigest()
    required_bits = size[0] * size[1]
    binary_string = bin(int(hash_digest, 16))[2:].zfill(required_bits)
    binary_string_padded = binary_string.ljust(required_bits, '0')
    bits = [int(b) for b in binary_string_padded[:required_bits]]
    return np.array(bits).reshape(size)

def compute_bdm_for_text(text_input, bdm_instance, matrix_s=(8,8)):
    if not text_input: return 0.0
    # BDM can be slow for very long strings if we try to make one giant matrix.
    # For this prototype, we'll still use one matrix per text block (motif or compressed text).
    # A more advanced version might chunk long texts for BDM.
    MAX_BDM_TEXT_LEN = 2000 # Heuristic limit for direct BDM to avoid excessive slowness
    if len(text_input) > MAX_BDM_TEXT_LEN:
        # Simple strategy: take BDM of a representative sample (e.g. hash of truncated)
        # This is a simplification for the prototype.
        # print(f"      BDM input text long ({len(text_input)}), using BDM of truncated hash for speed.")
        text_input_for_bdm = text_input[:MAX_BDM_TEXT_LEN]
    else:
        text_input_for_bdm = text_input

    matrix = text_to_binary_matrix(text_input_for_bdm, size=matrix_s)
    return bdm_instance.bdm(matrix)

def compute_mdl_cost_for_text_block(text_block_str, motifs_list, bdm_instance, matrix_s=(8,8)):
    if not motifs_list:
        l_h = 0.0
    else:
        valid_motifs = [m for m in motifs_list if isinstance(m, str) and m.strip()]
        if not valid_motifs:
            l_h = 0.0
        else:
            l_h = sum(compute_bdm_for_text(m, bdm_instance, matrix_s) for m in valid_motifs)

    compressed_text_block = llm_compress_text(text_block_str, motifs_list)
    l_d_h = compute_bdm_for_text(compressed_text_block, bdm_instance, matrix_s)

    return l_h, l_d_h, l_h + l_d_h

# --- Main Processing Logic ---
if __name__ == "__main__":
    # --- Initialize BDM and LLM Client ---
    if 'GEMINI_API_KEY' not in os.environ:
        print("CRITICAL: GEMINI_API_KEY environment variable not set.")
        try:
            from google.colab import userdata
            os.environ['GEMINI_API_KEY'] = userdata.get('GEMINI_API_KEY')
            if 'GEMINI_API_KEY' not in os.environ or not os.environ['GEMINI_API_KEY']:
                 print("CRITICAL: GEMINI_API_KEY still not found after Colab check.")
                 exit()
            print("Loaded GEMINI_API_KEY from Colab secrets.")
        except ImportError:
            print("CRITICAL: Not in Colab and GEMINI_API_KEY not set directly.")
            exit()
        except Exception as e_key:
            print(f"CRITICAL: Error loading GEMINI_API_KEY from Colab secrets: {e_key}")
            exit()

    try:
        g_client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY"))
        g_model_name = "gemma-3n-e4b-it"
        MATRIX_SIZE_GLOBAL = (8, 8)
        print(f"Successfully initialized GenAI client for model {g_model_name}.")
    except Exception as e:
        print(f"CRITICAL: Failed to initialize GenAI client: {e}")
        exit()

    bdm_instance = BDM(ndim=2)

    # --- Load and Process Data from Phase 2 Output ---
    if not os.path.exists(P2_COLLATED_FILE):
        print(f"ERROR: Phase 2 output file not found: {P2_COLLATED_FILE}")
        exit()

    print(f"Loading Phase 2 output from: {P2_COLLATED_FILE}...")
    try:
        with open(P2_COLLATED_FILE, 'r', encoding='utf-8') as f:
            phase2_data = json.load(f)
    except Exception as e:
        print(f"Error loading or parsing {P2_COLLATED_FILE}: {e}")
        exit()

    all_qid_results = []
    # Limit QIDs to process for this prototype run
    qids_to_process_limit = 2 # Process first X QIDs found in the file

    print(f"\nProcessing up to {qids_to_process_limit} QIDs for MDL analysis...\n")

    qids_processed_count = 0
    aggregated_content_by_qid = phase2_data.get("aggregated_pdf_content_by_qid", {})
    if not aggregated_content_by_qid:
        print(f"No 'aggregated_pdf_content_by_qid' key found in {P2_COLLATED_FILE}. Check file structure.")
        exit()

    for qid, text_items_list in aggregated_content_by_qid.items():
        if qids_processed_count >= qids_to_process_limit:
            break

        if not text_items_list or not isinstance(text_items_list, list):
            # print(f"Skipping QID {qid}: no text items or not a list.")
            continue

        print(f"--- Analyzing Aggregated Text for QID: {qid} ---")

        # 1. Concatenate all text for this QID
        #    Prioritize 'pdf_passages' if available, else 'pdf_summary' if that's what Phase 2 collated
        #    Your Phase 2 logic seems to choose one or the other already.
        all_texts_for_qid = [item.get("text", "") for item in text_items_list if isinstance(item, dict) and item.get("text")]

        # Join with a clear separator that's unlikely to be part of normal text
        # This separator itself adds to the "complexity" of the original text block from BDM's perspective.
        corpus_for_qid = "\n\n<RESPONSE_SEPARATOR>\n\n".join(filter(None, all_texts_for_qid))

        if len(corpus_for_qid.strip()) < 100: # Skip if the combined corpus for this QID is too short
            print(f"  Skipping QID {qid}: combined text too short ({len(corpus_for_qid)} chars).")
            continue

        print(f"  Combined corpus for QID {qid} has {len(corpus_for_qid)} characters from {len(all_texts_for_qid)} responses.")
        # print(f"  Corpus Snippet (first 200 chars): {corpus_for_qid[:200].replace(chr(10), ' ')}...")


        # 2. Baseline MDL cost (original aggregated text for this QID)
        baseline_l_d_h = compute_bdm_for_text(corpus_for_qid, bdm_instance, MATRIX_SIZE_GLOBAL)
        baseline_total_mdl = baseline_l_d_h # L(H) is 0 for baseline
        print(f"  Baseline MDL for QID {qid} (L(D_orig_corpus_for_qid)): {baseline_total_mdl:.4f}")

        # 3. Extract motifs using LLM from this aggregated QID corpus
        #    The prompt for llm_extract_motifs is already geared towards "recurring themes"
        extracted_motifs = llm_extract_motifs(corpus_for_qid, g_client, g_model_name)
        print(f"  LLM Extracted Motifs for QID {qid}: {extracted_motifs}")

        if not extracted_motifs:
            print("  No motifs extracted by LLM for this QID, skipping MDL cost calculation with motifs.")
            all_qid_results.append({
                "qid": qid, "corpus_len_for_qid": len(corpus_for_qid),
                "baseline_mdl": baseline_total_mdl, "motifs": [],
                "l_h_motifs": 0, "l_d_h_motifs": baseline_total_mdl, "total_mdl_motifs": baseline_total_mdl,
                "compression_achieved": 0
            })
            qids_processed_count += 1
            continue

        # 4. Compute MDL cost with LLM-extracted motifs for this QID's corpus
        l_h, l_d_h, total_mdl_with_motifs = compute_mdl_cost_for_text_block(
            corpus_for_qid, extracted_motifs, bdm_instance, MATRIX_SIZE_GLOBAL
        )

        print(f"  L(H) motif complexity for QID {qid}: {l_h:.4f}")
        print(f"  L(D|H) compressed corpus complexity for QID {qid}: {l_d_h:.4f}")
        print(f"  Total MDL cost with motifs for QID {qid}: {total_mdl_with_motifs:.4f}")

        compression = baseline_total_mdl - total_mdl_with_motifs
        if compression > 0.0001: # Use a small epsilon
            print(f"  SUCCESS for QID {qid}: Compression achieved: {compression:.4f}")
        else:
            print(f"  NOTE for QID {qid}: No significant compression achieved (or cost increased). Diff: {compression:.4f}")

        all_qid_results.append({
            "qid": qid, "corpus_len_for_qid": len(corpus_for_qid),
            "baseline_mdl": baseline_total_mdl, "motifs": extracted_motifs,
            "l_h_motifs": l_h, "l_d_h_motifs": l_d_h, "total_mdl_motifs": total_mdl_with_motifs,
            "compression_achieved": compression
        })
        qids_processed_count += 1
        print("-" * 40)

    print("\n--- Overall QID-based MDL Analysis Summary ---")
    if not all_qid_results:
        print("No QIDs were processed or no valid results generated.")
    else:
        num_compressed_qids = sum(1 for r in all_qid_results if r['compression_achieved'] > 0.0001)
        successful_compressions = [r['compression_achieved'] for r in all_qid_results if r['compression_achieved'] > 0.0001]

        avg_compression = np.mean(successful_compressions) if successful_compressions else 0
        max_compression = np.max(successful_compressions) if successful_compressions else 0

        print(f"Total QIDs analyzed: {len(all_qid_results)}")
        print(f"Number of QIDs where compression was achieved: {num_compressed_qids}")
        if num_compressed_qids > 0:
            print(f"  Average compression (for successful QIDs): {avg_compression:.4f}")
            print(f"  Maximum compression achieved across QIDs: {max_compression:.4f}")

        output_filename_qids = "mdl_analysis_per_qid.json"
        try:
            with open(output_filename_qids, "w", encoding="utf-8") as f_out:
                json.dump(all_qid_results, f_out, indent=2)
            print(f"Detailed QID-based results saved to {output_filename_qids}")
        except Exception as e_save:
            print(f"Error saving QID-based results to {output_filename_qids}: {e_save}")

CRITICAL: GEMINI_API_KEY environment variable not set.
Loaded GEMINI_API_KEY from Colab secrets.
Successfully initialized GenAI client for model gemma-3n-e4b-it.
ERROR: Phase 2 output file not found: /content/drive/MyDrive/Colab Notebooks/Legal/Phase2_PDF_Collated_Texts/phase2_collated_pdf_texts.json
Loading Phase 2 output from: /content/drive/MyDrive/Colab Notebooks/Legal/Phase2_PDF_Collated_Texts/phase2_collated_pdf_texts.json...
Error loading or parsing /content/drive/MyDrive/Colab Notebooks/Legal/Phase2_PDF_Collated_Texts/phase2_collated_pdf_texts.json: [Errno 2] No such file or directory: '/content/drive/MyDrive/Colab Notebooks/Legal/Phase2_PDF_Collated_Texts/phase2_collated_pdf_texts.json'

Processing up to 2 QIDs for MDL analysis...



NameError: name 'phase2_data' is not defined

In [None]:
# --- Imports (ensure all necessary imports are at the top) ---
import json
import os
import hashlib
import numpy as np
from pybdm import BDM
import re
from google import genai # Assuming these are needed if not already imported from other cells
from google.genai import types # Assuming these are needed

# --- Configuration (Define paths and constants) ---
# These would typically be in your "Cell 2: Global Project Configuration"
# For this standalone cell, ensure they are defined or adjust paths as needed.
BASE_PROJECT_DIR = '/content/drive/MyDrive/Colab Notebooks/Legal/' # !!! EXAMPLE - UPDATE THIS PATH !!!
PHASE2_OUTPUT_DIR = os.path.join(BASE_PROJECT_DIR, 'Phase2_PDF_Collated_Texts/')
# This is the output file from your "Cell 4: Phase 2 - Collation" script
P2_COLLATED_FILE = os.path.join(PHASE2_OUTPUT_DIR, 'phase2_collated_pdf_texts.json')

# --- BDM and LLM Model Configuration ---
MATRIX_SIZE_GLOBAL = (8, 8) # For BDM text_to_binary_matrix

# --- Helper Function Definitions ---

def llm_extract_motifs(text_to_analyze, genai_client, llm_model_name):
    """Extracts motifs from text using the specified LLM."""
    prompt = f"""
    From the following text, which comprises several responses to a single question,
    extract a concise list of up to 5 key recurring themes or motifs.
    List them separated by commas, with no introductory text.
    If no clear themes or the text is too generic, output "NO_THEMES_FOUND".

    Text (analyse for recurring themes):
    \"\"\"{text_to_analyze}\"\"\"

    Recurring Themes/Motifs (comma-separated, max 5):
    """
    MAX_TEXT_FOR_LLM_MOTIF_EXTRACTION = 15000 # Approx characters
    original_len = len(text_to_analyze)
    if original_len > MAX_TEXT_FOR_LLM_MOTIF_EXTRACTION:
        # print(f"    Warning: Text for motif extraction is long ({original_len} chars), truncating to {MAX_TEXT_FOR_LLM_MOTIF_EXTRACTION} for LLM.")
        text_to_analyze = text_to_analyze[:MAX_TEXT_FOR_LLM_MOTIF_EXTRACTION]

    contents = [types.Content(role="user", parts=[types.Part.from_text(text=prompt)])]
    config = types.GenerateContentConfig(response_mime_type="text/plain")

    try:
        response = genai_client.models.generate_content(model=llm_model_name, contents=contents, config=config)
        motifs_text = response.text.strip()
        if "NO_THEMES_FOUND" in motifs_text.upper() or not motifs_text:
            return []
        motifs = [m.strip() for m in motifs_text.split(",") if m.strip() and len(m.strip()) > 3]
        return motifs[:5] # Max 5 motifs
    except Exception as e:
        print(f"    Error in llm_extract_motifs (text len {original_len}, truncated to {len(text_to_analyze)}): {e}")
        return []

def llm_compress_text(text_to_compress, motifs_list):
    """Compresses text by replacing motifs with placeholders."""
    if not isinstance(text_to_compress, str): return "" # Handle non-string input
    compressed = text_to_compress.lower()
    if not motifs_list: return compressed

    for motif in motifs_list:
        if motif and isinstance(motif, str) and motif.strip():
            # Sanitize motif for placeholder: replace non-alphanum with underscore, uppercase, limit length
            safe_placeholder_name = re.sub(r'\W+', '_', motif).upper()[:20]
            placeholder = f"<MOTIF_{safe_placeholder_name}>"
            try:
                compressed = re.sub(re.escape(motif.lower()), placeholder, compressed, flags=re.IGNORECASE)
            except re.error as re_e:
                print(f"    Regex error during compression for motif '{motif}': {re_e}. Skipping.")
                continue
    return compressed

def text_to_binary_matrix(text_input, size=MATRIX_SIZE_GLOBAL):
    """Converts text to a binary matrix via hashing for BDM."""
    if not text_input or not isinstance(text_input, str):
        return np.zeros(size, dtype=int)
    hash_digest = hashlib.sha256(text_input.encode('utf-8', 'ignore')).hexdigest()
    required_bits = size[0] * size[1]
    binary_string = bin(int(hash_digest, 16))[2:].zfill(required_bits)
    binary_string_padded = binary_string.ljust(required_bits, '0')
    bits = [int(b) for b in binary_string_padded[:required_bits]]
    return np.array(bits).reshape(size)

def compute_bdm_for_text(text_input, bdm_instance, matrix_s=MATRIX_SIZE_GLOBAL):
    """Computes BDM for a given text string."""
    if not text_input or not isinstance(text_input, str) : return 0.0 # Treat BDM of empty/non-string as 0

    # BDM complexity can be sensitive to very small variations if matrix is small
    # For consistency, ensure text_input is not just whitespace
    if not text_input.strip(): return 0.0

    # Simplification: BDM applied to a hash of the text (truncated if very long)
    # This makes BDM less sensitive to actual text length for L(D|H) but consistent.
    MAX_TEXT_FOR_BDM_HASH = 2000 # Apply hash to a potentially truncated version for BDM consistency
    text_for_hash = text_input if len(text_input) <= MAX_TEXT_FOR_BDM_HASH else text_input[:MAX_TEXT_FOR_BDM_HASH]

    matrix = text_to_binary_matrix(text_for_hash, size=matrix_s)
    try:
        return bdm_instance.bdm(matrix)
    except Exception as e_bdm:
        print(f"      Error during BDM calculation for text (len {len(text_input)}, hashed part len {len(text_for_hash)}): {e_bdm}")
        return -1.0 # Indicate BDM error

def compute_mdl_cost_for_text_block(text_block_str, motifs_list, bdm_instance, matrix_s=MATRIX_SIZE_GLOBAL):
    """Computes L(H), L(D|H), and total MDL cost for a text block and its motifs."""
    if not isinstance(text_block_str, str) : text_block_str = "" # Ensure it's a string

    l_h = 0.0
    valid_motifs_for_lh = []
    if motifs_list:
        valid_motifs_for_lh = [m for m in motifs_list if isinstance(m, str) and m.strip()]
        if valid_motifs_for_lh:
            l_h = sum(compute_bdm_for_text(m, bdm_instance, matrix_s) for m in valid_motifs_for_lh)

    # L(D|H): Cost of the data (text_block) compressed with these motifs
    # Use all original motifs_list for compression, even if some were empty/invalid for L(H)
    compressed_text_block = llm_compress_text(text_block_str, motifs_list)
    l_d_h = compute_bdm_for_text(compressed_text_block, bdm_instance, matrix_s)

    # Handle BDM error indication
    if l_h < 0 or l_d_h < 0:
        return -1.0, -1.0, -1.0 # Indicate error in MDL cost calculation

    return l_h, l_d_h, l_h + l_d_h

# --- Main Execution Logic ---
def main():
    print("--- MDL Prototype: Analyzing Per-QID Aggregated Text from Phase 2 ---")
    # --- Initialize BDM and LLM Client ---
    global g_client, g_model_name # Make them accessible if defined outside main

    if 'GEMINI_API_KEY' not in os.environ:
        print("CRITICAL: GEMINI_API_KEY environment variable not set.")
        try:
            from google.colab import userdata
            gemini_api_key_val = userdata.get('GEMINI_API_KEY')
            if not gemini_api_key_val:
                print("CRITICAL: GEMINI_API_KEY not found in Colab secrets.")
                return # Use return instead of exit() if in a function
            os.environ['GEMINI_API_KEY'] = gemini_api_key_val
            print("Loaded GEMINI_API_KEY from Colab secrets.")
        except ImportError:
            print("CRITICAL: Not in Colab and GEMINI_API_KEY not set directly.")
            return
        except Exception as e_key:
            print(f"CRITICAL: Error loading GEMINI_API_KEY from Colab secrets: {e_key}")
            return

    try:
        g_client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY"))
        g_model_name = "gemma-3n-e4b-it" # The model that worked for you
        print(f"Successfully initialized GenAI client for model {g_model_name}.")
    except Exception as e:
        print(f"CRITICAL: Failed to initialize GenAI client: {e}")
        return

    # bdm_instance = BDM(ndim=2, block_size=None, use_ctm=False) # Example BDM config, adjust if needed
    bdm_instance = BDM(ndim=2) # Example BDM config, adjust if needed

    # --- Load and Process Data from Phase 2 Output ---
    if not os.path.exists(P2_COLLATED_FILE):
        print(f"ERROR: Phase 2 output file not found: {P2_COLLATED_FILE}")
        return

    print(f"Loading Phase 2 output from: {P2_COLLATED_FILE}...")
    phase2_data_content = None
    try:
        with open(P2_COLLATED_FILE, 'r', encoding='utf-8') as f:
            phase2_data_content = json.load(f)
    except Exception as e:
        print(f"Error loading or parsing {P2_COLLATED_FILE}: {e}")
        return

    all_qid_results = []
    qids_to_process_limit = 3 # Process first X QIDs found in the file for speed

    print(f"\nProcessing up to {qids_to_process_limit} QIDs for MDL analysis...\n")

    qids_processed_count = 0

    if phase2_data_content is None:
        print(f"CRITICAL: Failed to load data from {P2_COLLATED_FILE}. Cannot proceed.")
        aggregated_content_by_qid = {}
    else:
        aggregated_content_by_qid = phase2_data_content.get("aggregated_pdf_content_by_qid", {})

    if not aggregated_content_by_qid:
        print(f"No 'aggregated_pdf_content_by_qid' key found or data loaded from {P2_COLLATED_FILE}.")
        return

    for qid, text_items_list in aggregated_content_by_qid.items():
        if qids_processed_count >= qids_to_process_limit:
            break

        if not text_items_list or not isinstance(text_items_list, list):
            continue

        print(f"--- Analyzing Aggregated Text for QID: {qid} ---")

        all_texts_for_qid = [
            item.get("text", "") for item in text_items_list
            if isinstance(item, dict) and isinstance(item.get("text"), str) and item.get("text","").strip()
        ]

        corpus_for_qid = "\n\n<RSP_SEP>\n\n".join(all_texts_for_qid) # Shorter separator

        if len(corpus_for_qid.strip()) < 100:
            print(f"  Skipping QID {qid}: combined text too short ({len(corpus_for_qid)} chars).")
            continue

        num_responses_for_qid = len(all_texts_for_qid)
        print(f"  Combined corpus for QID {qid} has {len(corpus_for_qid)} chars from {num_responses_for_qid} responses.")

        baseline_l_d_h = compute_bdm_for_text(corpus_for_qid, bdm_instance, MATRIX_SIZE_GLOBAL)
        if baseline_l_d_h < 0: # BDM Error
            print(f"  Error computing baseline BDM for QID {qid}. Skipping.")
            continue
        baseline_total_mdl = baseline_l_d_h
        print(f"  Baseline MDL for QID {qid} (L(D_orig_corpus_for_qid)): {baseline_total_mdl:.4f}")

        extracted_motifs = llm_extract_motifs(corpus_for_qid, g_client, g_model_name)
        print(f"  LLM Extracted Motifs for QID {qid}: {extracted_motifs}")

        if not extracted_motifs:
            print("  No valid motifs extracted by LLM for this QID.")
            # Store result even if no motifs, for completeness
            all_qid_results.append({
                "qid": qid, "corpus_len_for_qid": len(corpus_for_qid), "num_responses": num_responses_for_qid,
                "baseline_mdl": baseline_total_mdl, "motifs": [],
                "l_h_motifs": 0, "l_d_h_motifs": baseline_total_mdl, "total_mdl_motifs": baseline_total_mdl,
                "compression_achieved": 0.0
            })
            qids_processed_count += 1
            print("-" * 40)
            continue

        l_h, l_d_h, total_mdl_with_motifs = compute_mdl_cost_for_text_block(
            corpus_for_qid, extracted_motifs, bdm_instance, MATRIX_SIZE_GLOBAL
        )

        if total_mdl_with_motifs < 0: # BDM Error during motif MDL calculation
            print(f"  Error computing MDL cost with motifs for QID {qid}. Skipping this result.")
            # Still append a result indicating error if needed, or just skip
            all_qid_results.append({
                "qid": qid, "corpus_len_for_qid": len(corpus_for_qid), "num_responses": num_responses_for_qid,
                "baseline_mdl": baseline_total_mdl, "motifs": extracted_motifs,
                "l_h_motifs": -1.0, "l_d_h_motifs": -1.0, "total_mdl_motifs": -1.0,
                "compression_achieved": "BDM_ERROR"
            })
            qids_processed_count += 1
            print("-" * 40)
            continue

        print(f"  L(H) motif complexity for QID {qid}: {l_h:.4f}")
        print(f"  L(D|H) compressed corpus complexity for QID {qid}: {l_d_h:.4f}")
        print(f"  Total MDL cost with motifs for QID {qid}: {total_mdl_with_motifs:.4f}")

        compression = baseline_total_mdl - total_mdl_with_motifs
        result_status = ""
        if compression > 0.0001:
            result_status = f"SUCCESS: Compression achieved: {compression:.4f}"
        else:
            result_status = f"NOTE: No significant compression (or cost increased). Diff: {compression:.4f}"
        print(f"  {result_status}")

        all_qid_results.append({
            "qid": qid, "corpus_len_for_qid": len(corpus_for_qid), "num_responses": num_responses_for_qid,
            "baseline_mdl": baseline_total_mdl, "motifs": extracted_motifs,
            "l_h_motifs": l_h, "l_d_h_motifs": l_d_h, "total_mdl_motifs": total_mdl_with_motifs,
            "compression_achieved": compression
        })
        qids_processed_count += 1
        print("-" * 40)

    print("\n--- Overall QID-based MDL Analysis Summary ---")
    if not all_qid_results:
        print("No QIDs were processed or no valid results generated.")
    else:
        # Filter out error results before calculating stats
        valid_results_for_stats = [r for r in all_qid_results if not isinstance(r['compression_achieved'], str) and r['l_h_motifs'] >= 0]

        num_compressed_qids = sum(1 for r in valid_results_for_stats if r['compression_achieved'] > 0.0001)
        successful_compressions = [r['compression_achieved'] for r in valid_results_for_stats if r['compression_achieved'] > 0.0001]

        avg_compression = np.mean(successful_compressions) if successful_compressions else 0
        max_compression = np.max(successful_compressions) if successful_compressions else 0

        print(f"Total QIDs attempted: {qids_processed_count} (out of {len(aggregated_content_by_qid)} with content)")
        print(f"Total QID results logged: {len(all_qid_results)}")
        print(f"Number of QIDs where compression was achieved (from valid results): {num_compressed_qids}")
        if num_compressed_qids > 0:
            print(f"  Average compression (for successful cases): {avg_compression:.4f}")
            print(f"  Maximum compression achieved across QIDs: {max_compression:.4f}")

        output_filename_qids = "mdl_analysis_per_qid_v2.json" # Changed filename slightly
        try:
            with open(output_filename_qids, "w", encoding="utf-8") as f_out:
                json.dump(all_qid_results, f_out, indent=2)
            print(f"Detailed QID-based results saved to {output_filename_qids}")
        except Exception as e_save:
            print(f"Error saving QID-based results to {output_filename_qids}: {e_save}")

if __name__ == "__main__":
    # This structure allows defining g_client and g_model_name globally if needed by helper functions
    # without explicitly passing them, though passing is cleaner.
    # For this script, helper functions now accept client and model_name.
    main()

--- MDL Prototype: Analyzing Per-QID Aggregated Text from Phase 2 ---
Successfully initialized GenAI client for model gemma-3n-e4b-it.
Loading Phase 2 output from: /content/drive/MyDrive/Colab Notebooks/Legal/Phase2_PDF_Collated_Texts/phase2_collated_pdf_texts.json...

Processing up to 3 QIDs for MDL analysis...

--- Analyzing Aggregated Text for QID: Q4 ---
  Combined corpus for QID Q4 has 129501 chars from 209 responses.
  Baseline MDL for QID Q4 (L(D_orig_corpus_for_qid)): 121.3693
    Error in llm_extract_motifs (text len 129501, truncated to 15000): 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_input_toke

## Using Gemma 3 locally

In [None]:
# --- Add/Ensure these imports are at the top of your cell ---
import json
import os
import hashlib
import numpy as np
from pybdm import BDM
import re
import time # For potential sleeps if needed, though less critical with local model

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
# from google import genai # No longer needed for LLM calls if using local
# from google.genai import types # No longer needed for LLM calls

# --- Configuration ---
BASE_PROJECT_DIR = '/content/drive/MyDrive/Colab Notebooks/Legal/' # !!! EXAMPLE - UPDATE THIS PATH !!!
PHASE2_OUTPUT_DIR = os.path.join(BASE_PROJECT_DIR, 'Phase2_PDF_Collated_Texts/')
P2_COLLATED_FILE = os.path.join(PHASE2_OUTPUT_DIR, 'phase2_collated_pdf_texts.json')
MATRIX_SIZE_GLOBAL = (8, 8)

# LLM Model Configuration for Local Hugging Face Model
LOCAL_LLM_MODEL_ID = 'google/gemma-2b-it'
USE_QUANTIZATION_FOR_LOCAL_LLM = True # Recommended for 2b model on Colab

# --- Helper Function Definitions (tokenize_sentence, llm_compress_text, text_to_binary_matrix, etc. remain the same) ---
# ... (Keep your existing helper functions for BDM, text processing) ...

def llm_extract_motifs_local_hf(text_to_analyze, hf_pipeline, hf_tokenizer):
    """Extracts motifs from text using a local Hugging Face pipeline."""
    prompt_template = [ # Gemma instruction format
        {"role": "user", "content": f"""
From the following text, which comprises several responses to a single question,
extract a concise list of up to 5 key recurring themes or motifs.
List them separated by commas, with no introductory text.
If no clear themes or the text is too generic, output "NO_THEMES_FOUND".

Text (analyse for recurring themes):
\"\"\"{text_to_analyze}\"\"\"

Recurring Themes/Motifs (comma-separated, max 5):
"""}
    ]

    # Apply chat template
    prompt_formatted = hf_tokenizer.apply_chat_template(prompt_template, tokenize=False, add_generation_prompt=True)

    MAX_TEXT_FOR_LLM_MOTIF_EXTRACTION = 7000 # Gemma 2b context is ~8k tokens, prompt takes some.
                                           # Adjust based on typical token/char ratio of your text.
                                           # For characters, this might be around 20k-25k chars.
                                           # Let's use characters for direct comparison with previous truncation.
    original_len = len(text_to_analyze)

    # Truncate the *content* part of the prompt if necessary
    # This is a bit more complex as the full prompt_formatted includes instructions
    # A simpler way for now is to truncate text_to_analyze *before* formatting the prompt.
    if len(text_to_analyze) > MAX_TEXT_FOR_LLM_MOTIF_EXTRACTION:
        # print(f"    Warning: Text for motif extraction is long ({len(text_to_analyze)} chars), truncating to {MAX_TEXT_FOR_LLM_MOTIF_EXTRACTION} for local LLM.")
        text_to_analyze_for_prompt = text_to_analyze[:MAX_TEXT_FOR_LLM_MOTIF_EXTRACTION]

        # Re-create prompt with truncated text
        prompt_template_truncated = [
            {"role": "user", "content": f"""
From the following text, which comprises several responses to a single question,
extract a concise list of up to 5 key recurring themes or motifs.
List them separated by commas, with no introductory text.
If no clear themes or the text is too generic, output "NO_THEMES_FOUND".

Text (analyse for recurring themes):
\"\"\"{text_to_analyze_for_prompt}\"\"\"

Recurring Themes/Motifs (comma-separated, max 5):
"""}
        ]
        prompt_formatted = hf_tokenizer.apply_chat_template(prompt_template_truncated, tokenize=False, add_generation_prompt=True)
    else:
        text_to_analyze_for_prompt = text_to_analyze


    generation_args = {
        "max_new_tokens": 100, # Motifs should be short
        "do_sample": False,    # For deterministic output
        "pad_token_id": hf_tokenizer.eos_token_id # Gemma uses eos_token for padding
    }

    try:
        outputs = hf_pipeline(prompt_formatted, **generation_args)
        if not outputs or not isinstance(outputs, list) or not outputs[0].get('generated_text'):
            print(f"    Error: Local LLM pipeline returned unexpected/empty output for text (len {original_len}).")
            return []

        generated_text_full = outputs[0]['generated_text']

        # Strip the prompt part from the generated text
        # This is crucial for local models as they often include the prompt in the output.
        if generated_text_full.startswith(prompt_formatted):
            motifs_text = generated_text_full[len(prompt_formatted):].strip()
        else:
            # A common alternative is that the model just appends to the last turn.
            # For Gemma, it often just gives the assistant's response.
            # We might need to find the start of the assistant's actual response if the template is complex.
            # For this specific template, it's usually clean.
            motifs_text = generated_text_full # Assume for now it's just the response
            # More robust stripping for chat templates:
            # Find the last assistant marker if one was added by add_generation_prompt=True
            # or if the model generates it. For Gemma, it might be simple.
            # If using apply_chat_template with add_generation_prompt=False, then you'd add the marker manually.

            # A simple check: if "Recurring Themes/Motifs" is in the prompt and output, strip from there.
            last_prompt_line = "Recurring Themes/Motifs (comma-separated, max 5):"
            if last_prompt_line in motifs_text: # If prompt is included
                 motifs_text = motifs_text.split(last_prompt_line, 1)[-1].strip()


        if "NO_THEMES_FOUND" in motifs_text.upper() or not motifs_text:
            return []
        motifs = [m.strip() for m in motifs_text.split(",") if m.strip() and len(m.strip()) > 3]
        return motifs[:5]
    except Exception as e:
        print(f"    Error in llm_extract_motifs_local_hf (text len {original_len}, truncated to {len(text_to_analyze_for_prompt)}): {e}")
        # print(traceback.format_exc()) # For detailed debugging
        return []

# --- Main Execution Logic ---
def main():
    print("--- MDL Prototype: Analyzing Per-QID Aggregated Text (Local LLM) ---")

    # --- Initialize Local Hugging Face LLM ---
    local_llm_pipeline = None
    local_llm_tokenizer = None
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    try:
        print(f"Loading tokenizer for {LOCAL_LLM_MODEL_ID}...")
        local_llm_tokenizer = AutoTokenizer.from_pretrained(LOCAL_LLM_MODEL_ID)

        bnb_config = None
        quant_active = False
        if USE_QUANTIZATION_FOR_LOCAL_LLM and torch.cuda.is_available():
            try:
                compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
                bnb_config = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_quant_type="nf4",
                    bnb_4bit_compute_dtype=compute_dtype,
                    bnb_4bit_use_double_quant=True,
                )
                quant_active = True
                print(f"BNB config created for {LOCAL_LLM_MODEL_ID}, compute_dtype: {compute_dtype}.")
            except Exception as e_bnb:
                print(f"WARN: Failed to create BitsAndBytesConfig (is bitsandbytes installed?): {e_bnb}. Quantization disabled.")
                quant_active = False

        print(f"Loading local model {LOCAL_LLM_MODEL_ID} (Quantization: {quant_active})...")
        model_kwargs = {"device_map": "auto", "trust_remote_code": True} # Gemma needs trust_remote_code
        if quant_active:
            model_kwargs["quantization_config"] = bnb_config
        else: # If not quantizing, explicitly set dtype for GPU or let CPU use default
            if device.type == 'cuda':
                 model_kwargs["torch_dtype"] = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16


        local_llm_model = AutoModelForCausalLM.from_pretrained(LOCAL_LLM_MODEL_ID, **model_kwargs)

        local_llm_pipeline = pipeline(
            "text-generation", # For Gemma chat/instruct, "text-generation" is appropriate
            model=local_llm_model,
            tokenizer=local_llm_tokenizer,
            # device=0 if device.type == 'cuda' else -1 # pipeline device argument
        )
        print(f"Local LLM pipeline for {LOCAL_LLM_MODEL_ID} initialized successfully.")
    except Exception as e:
        print(f"CRITICAL: Failed to initialize local LLM pipeline: {e}")
        # print(traceback.format_exc())
        return

    # --- Initialize BDM ---
    try:
        bdm_instance = BDM(ndim=2)
        print("BDM instance initialized successfully.")
    except Exception as e_bdm_init:
        print(f"CRITICAL: Failed to initialize BDM instance: {e_bdm_init}")
        return

    # --- Load and Process Data from Phase 2 Output ---
    # (This part of main remains largely the same, just calls the new LLM function)
    if not os.path.exists(P2_COLLATED_FILE):
        print(f"ERROR: Phase 2 output file not found: {P2_COLLATED_FILE}")
        return

    print(f"Loading Phase 2 output from: {P2_COLLATED_FILE}...")
    phase2_data_content = None
    try:
        with open(P2_COLLATED_FILE, 'r', encoding='utf-8') as f:
            phase2_data_content = json.load(f)
    except Exception as e:
        print(f"Error loading or parsing {P2_COLLATED_FILE}: {e}")
        return

    all_qid_results = []
    qids_to_process_limit = 3

    print(f"\nProcessing up to {qids_to_process_limit} QIDs for MDL analysis using LOCAL LLM...\n")
    qids_processed_count = 0

    if phase2_data_content is None:
        aggregated_content_by_qid = {}
    else:
        aggregated_content_by_qid = phase2_data_content.get("aggregated_pdf_content_by_qid", {})

    if not aggregated_content_by_qid:
        print(f"No 'aggregated_pdf_content_by_qid' key found or data loaded from {P2_COLLATED_FILE}.")
        return

    for qid, text_items_list in aggregated_content_by_qid.items():
        if qids_processed_count >= qids_to_process_limit: break
        if not text_items_list or not isinstance(text_items_list, list): continue

        print(f"--- Analyzing Aggregated Text for QID: {qid} ---")
        all_texts_for_qid = [
            item.get("text", "") for item in text_items_list
            if isinstance(item, dict) and isinstance(item.get("text"), str) and item.get("text","").strip()
        ]
        corpus_for_qid = "\n\n<RSP_SEP>\n\n".join(all_texts_for_qid)

        if len(corpus_for_qid.strip()) < 100:
            print(f"  Skipping QID {qid}: combined text too short ({len(corpus_for_qid)} chars).")
            continue

        num_responses_for_qid = len(all_texts_for_qid)
        print(f"  Combined corpus for QID {qid} has {len(corpus_for_qid)} chars from {num_responses_for_qid} responses.")

        baseline_l_d_h = compute_bdm_for_text(corpus_for_qid, bdm_instance, MATRIX_SIZE_GLOBAL)
        if baseline_l_d_h < 0:
            print(f"  Error computing baseline BDM for QID {qid}. Skipping.")
            continue
        baseline_total_mdl = baseline_l_d_h
        print(f"  Baseline MDL for QID {qid}: {baseline_total_mdl:.4f}")

        # Call the new local LLM function
        extracted_motifs = llm_extract_motifs_local_hf(corpus_for_qid, local_llm_pipeline, local_llm_tokenizer)
        print(f"  Local LLM Extracted Motifs for QID {qid}: {extracted_motifs}")

        if not extracted_motifs:
            print("  No valid motifs extracted by Local LLM for this QID.")
            # Store result even if no motifs
            all_qid_results.append({
                "qid": qid, "corpus_len_for_qid": len(corpus_for_qid), "num_responses": num_responses_for_qid,
                "baseline_mdl": baseline_total_mdl, "motifs": [],
                "l_h_motifs": 0, "l_d_h_motifs": baseline_total_mdl, "total_mdl_motifs": baseline_total_mdl,
                "compression_achieved": 0.0
            })
            qids_processed_count += 1
            print("-" * 40)
            continue

        l_h, l_d_h, total_mdl_with_motifs = compute_mdl_cost_for_text_block(
            corpus_for_qid, extracted_motifs, bdm_instance, MATRIX_SIZE_GLOBAL
        )

        if total_mdl_with_motifs < 0:
            print(f"  Error computing MDL cost with motifs for QID {qid}. Skipping.")
            all_qid_results.append({
                "qid": qid, "corpus_len_for_qid": len(corpus_for_qid), "num_responses": num_responses_for_qid,
                "baseline_mdl": baseline_total_mdl, "motifs": extracted_motifs,
                "l_h_motifs": -1.0, "l_d_h_motifs": -1.0, "total_mdl_motifs": -1.0,
                "compression_achieved": "BDM_ERROR"
            })
            qids_processed_count += 1
            print("-" * 40)
            continue

        print(f"  L(H) motif complexity for QID {qid}: {l_h:.4f}")
        print(f"  L(D|H) compressed corpus complexity for QID {qid}: {l_d_h:.4f}")
        print(f"  Total MDL cost with motifs for QID {qid}: {total_mdl_with_motifs:.4f}")

        compression = baseline_total_mdl - total_mdl_with_motifs
        result_status = ""
        if compression > 0.0001:
            result_status = f"SUCCESS: Compression achieved: {compression:.4f}"
        else:
            result_status = f"NOTE: No significant compression (or cost increased). Diff: {compression:.4f}"
        print(f"  {result_status}")

        all_qid_results.append({
            "qid": qid, "corpus_len_for_qid": len(corpus_for_qid), "num_responses": num_responses_for_qid,
            "baseline_mdl": baseline_total_mdl, "motifs": extracted_motifs,
            "l_h_motifs": l_h, "l_d_h_motifs": l_d_h, "total_mdl_motifs": total_mdl_with_motifs,
            "compression_achieved": compression
        })
        qids_processed_count += 1
        print("-" * 40)
        # Optional: Add a small delay if GPU is very busy, though less of an API rate limit issue
        # time.sleep(1)


    # (Summary printing and saving results - same as before)
    print("\n--- Overall QID-based MDL Analysis Summary (Local LLM) ---")
    if not all_qid_results:
        print("No QIDs were processed or no valid results generated.")
    else:
        valid_results_for_stats = [r for r in all_qid_results if not isinstance(r['compression_achieved'], str) and r['l_h_motifs'] >= 0]
        num_compressed_qids = sum(1 for r in valid_results_for_stats if r['compression_achieved'] > 0.0001)
        successful_compressions = [r['compression_achieved'] for r in valid_results_for_stats if r['compression_achieved'] > 0.0001]
        avg_compression = np.mean(successful_compressions) if successful_compressions else 0
        max_compression = np.max(successful_compressions) if successful_compressions else 0
        print(f"Total QIDs attempted: {qids_processed_count} (out of {len(aggregated_content_by_qid)} with content)")
        print(f"Total QID results logged: {len(all_qid_results)}")
        print(f"Number of QIDs where compression was achieved (from valid results): {num_compressed_qids}")
        if num_compressed_qids > 0:
            print(f"  Average compression (for successful cases): {avg_compression:.4f}")
            print(f"  Maximum compression achieved across QIDs: {max_compression:.4f}")
        output_filename_qids = "mdl_analysis_per_qid_local_llm.json"
        try:
            with open(output_filename_qids, "w", encoding="utf-8") as f_out:
                json.dump(all_qid_results, f_out, indent=2)
            print(f"Detailed QID-based results saved to {output_filename_qids}")
        except Exception as e_save:
            print(f"Error saving QID-based results to {output_filename_qids}: {e_save}")


if __name__ == "__main__":
    main()

--- MDL Prototype: Analyzing Per-QID Aggregated Text (Local LLM) ---
Using device: cuda
Loading tokenizer for google/gemma-2b-it...


tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

BNB config created for google/gemma-2b-it, compute_dtype: torch.bfloat16.
Loading local model google/gemma-2b-it (Quantization: True)...


config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Device set to use cuda:0


Local LLM pipeline for google/gemma-2b-it initialized successfully.
CRITICAL: Failed to initialize BDM instance: _Partition.__init__() got an unexpected keyword argument 'use_ctm'


In [None]:
# @title Per QID processing
# --- Imports (ensure all necessary imports are at the top) ---
import json
import os
import hashlib
import numpy as np
from pybdm import BDM
import re
import time

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
# from google import genai # No longer needed for LLM calls if using local
# from google.genai import types # No longer needed for LLM calls

# --- Configuration (Define paths and constants) ---
# These would typically be in your "Cell 2: Global Project Configuration"
# For this standalone cell, ensure they are defined or adjust paths as needed.
BASE_PROJECT_DIR = '/content/drive/MyDrive/Colab Notebooks/Legal/' # !!! EXAMPLE - UPDATE THIS PATH !!!
PHASE2_OUTPUT_DIR = os.path.join(BASE_PROJECT_DIR, 'Phase2_PDF_Collated_Texts/')
P2_COLLATED_FILE = os.path.join(PHASE2_OUTPUT_DIR, 'phase2_collated_pdf_texts.json')

# Parameter from your Cell 2, to control which QIDs are processed
# Set to None or empty list to process all (up to qids_to_process_limit)
# Set to e.g., ["Q4"] to process only Q4
# P3_QIDS_TO_PROCESS_THEMATICALLY = None
P3_QIDS_TO_PROCESS_THEMATICALLY = ["Q4"] # EXAMPLE: Process only Q4

# --- BDM and LLM Model Configuration ---
MATRIX_SIZE_GLOBAL = (8, 8)
LOCAL_LLM_MODEL_ID = 'google/gemma-2b-it'
USE_QUANTIZATION_FOR_LOCAL_LLM = True

# --- Helper Function Definitions (llm_extract_motifs_local_hf, llm_compress_text, etc. - keep as before) ---
def llm_extract_motifs_local_hf(text_to_analyze, hf_pipeline, hf_tokenizer):
    """Extracts motifs from text using a local Hugging Face pipeline."""
    # Using a simplified prompt structure that Gemma instruct models usually handle well
    # by just appending their response.
    prompt_content = f"""
From the following text, which comprises several responses to a single question,
extract a concise list of up to 5 key recurring themes or motifs.
List them separated by commas, with no introductory text.
If no clear themes or the text is too generic, output "NO_THEMES_FOUND".

Text (analyse for recurring themes):
\"\"\"{text_to_analyze}\"\"\"

Recurring Themes/Motifs (comma-separated, max 5):
"""
    # For Gemma instruct models, often just the user prompt is enough,
    # and the model appends its response.
    # The apply_chat_template is good, but we need to be careful about stripping.
    # Let's try a simpler prompt construction if direct text-generation is used.

    # Constructing the prompt for Gemma instruct/chat models
    # The key is to get only the assistant's part of the response.
    messages_for_template = [
        {"role": "user", "content": prompt_content}
    ]
    # This adds the necessary tokens for the model to know it should generate a response.
    prompt_formatted_for_llm = hf_tokenizer.apply_chat_template(
        messages_for_template,
        tokenize=False,
        add_generation_prompt=True # Crucial for instruct models
    )

    MAX_TEXT_IN_PROMPT_FOR_LLM = 7000 # Max characters of *user content* (text_to_analyze)
    original_len = len(text_to_analyze)

    # Truncate text_to_analyze *before* putting it into the prompt_content
    if original_len > MAX_TEXT_IN_PROMPT_FOR_LLM:
        text_to_analyze_for_prompt = text_to_analyze[:MAX_TEXT_IN_PROMPT_FOR_LLM]
        # Re-create prompt_content and prompt_formatted_for_llm with truncated text
        prompt_content = f"""
From the following text, which comprises several responses to a single question,
extract a concise list of up to 5 key recurring themes or motifs.
List them separated by commas, with no introductory text.
If no clear themes or the text is too generic, output "NO_THEMES_FOUND".

Text (analyse for recurring themes):
\"\"\"{text_to_analyze_for_prompt}\"\"\"

Recurring Themes/Motifs (comma-separated, max 5):
"""
        messages_for_template = [{"role": "user", "content": prompt_content}]
        prompt_formatted_for_llm = hf_tokenizer.apply_chat_template(
            messages_for_template, tokenize=False, add_generation_prompt=True
        )
    else:
        text_to_analyze_for_prompt = text_to_analyze # Used for logging length

    generation_args = {
        "max_new_tokens": 150, # Increased slightly to allow for a preamble if LLM insists
        "do_sample": False,
        "pad_token_id": hf_tokenizer.eos_token_id
    }

    try:
        outputs = hf_pipeline(prompt_formatted_for_llm, **generation_args)
        if not outputs or not isinstance(outputs, list) or not outputs[0].get('generated_text'):
            print(f"    Error: Local LLM pipeline returned unexpected/empty output for text (len {original_len}).")
            return []

        generated_text_full = outputs[0]['generated_text']

        # --- More Robust Stripping of Prompt and Model Preamble ---
        # 1. Strip the input prompt from the beginning of the output
        if generated_text_full.startswith(prompt_formatted_for_llm):
            actual_response_text = generated_text_full[len(prompt_formatted_for_llm):].strip()
        else:
            # If prompt not found at start, it might be that the pipeline only returned the new tokens.
            # Or, the model added its own chat turn markers.
            # For Gemma with add_generation_prompt=True, the output usually starts right after the prompt.
            # Let's try to find common start-of-response markers if the above fails.
            # A simpler approach if the LLM is good: it just appends its answer.
            # The `apply_chat_template` with `add_generation_prompt=True` should mean the `generated_text_full`
            # contains the prompt AND the model's response.
            # The challenge is that `prompt_formatted_for_llm` itself might end with a model turn token.
            # A more reliable way can be to look for text *after* the last user message in the prompt.

            # Try splitting based on a known part of the prompt that precedes the answer:
            key_phrase_before_answer = "Recurring Themes/Motifs (comma-separated, max 5):"
            if key_phrase_before_answer in generated_text_full:
                # Take text after the last occurrence of this key phrase
                actual_response_text = generated_text_full.split(key_phrase_before_answer)[-1].strip()
            else:
                # Fallback: assume the pipeline might have stripped the prompt for us,
                # or the model just gave the answer without special markers.
                # This part is tricky and model-dependent.
                actual_response_text = generated_text_full # Could be risky, might need more cleanup
                # print(f"    WARN: Could not reliably strip prompt. Full output: {actual_response_text[:200]}")


        # 2. Clean up common LLM preambles from the *actual_response_text*
        preambles_to_remove = [
            "Sure, here's a concise summary of the recurring themes and motifs in the text:",
            "Sure, here is a concise list of the recurring themes and motifs in the text:",
            "Here's a concise summary of the recurring themes and motifs in the text:",
            "Here are the recurring themes and motifs from the text:",
            "Okay, here are some key themes:",
            "Sure, here are 5 key themes or motifs from the text:",
            "Here are 5 key themes or motifs from the text:",
            "- " # If it starts listing with hyphens immediately
        ]
        cleaned_response_text = actual_response_text
        for preamble in preambles_to_remove:
            if cleaned_response_text.lower().startswith(preamble.lower()):
                cleaned_response_text = cleaned_response_text[len(preamble):].strip()
                break # Remove first matching preamble

        # Also remove potential bullet points or leading hyphens from each line if motifs are listed that way
        motifs_lines = cleaned_response_text.split('\n')
        processed_motifs_text = []
        for line in motifs_lines:
            line = line.strip()
            if line.startswith("- "):
                line = line[2:].strip()
            elif line.startswith("* "):
                line = line[2:].strip()
            if line: # Add line if it's not empty after stripping
                processed_motifs_text.append(line)

        # Join back if motifs were on separate lines, then split by comma
        # Or, if each line is a motif, just use them.
        # The prompt asks for comma-separated.
        final_motifs_string = ", ".join(processed_motifs_text) # If motifs were on new lines, this joins them with comma.
                                                              # If they were already comma sep on one line, it might add extra commas.
                                                              # Better to assume LLM tries for comma separation.

        if not final_motifs_string.strip() or "NO_THEMES_FOUND" in final_motifs_string.upper():
            # print(f"    LLM indicated no themes or response was empty after cleaning. Cleaned: '{final_motifs_string}'")
            return []

        # Now split the cleaned string by comma
        motifs = [m.strip() for m in final_motifs_string.split(',') if m.strip() and len(m.strip()) > 3]

        # print(f"    DEBUG: Raw LLM output after prompt strip: '{actual_response_text[:200]}'")
        # print(f"    DEBUG: Cleaned response text for motif splitting: '{final_motifs_string}'")
        # print(f"    DEBUG: Final parsed motifs: {motifs[:5]}")

        return motifs[:5]
    except Exception as e:
        print(f"    Error in llm_extract_motifs_local_hf (text len {original_len}, truncated to {len(text_to_analyze_for_prompt)}): {e}")
        # import traceback
        # print(traceback.format_exc()) # For detailed debugging
        return []

def llm_compress_text(text_to_compress, motifs_list):
    if not isinstance(text_to_compress, str): return ""
    compressed = text_to_compress.lower()
    if not motifs_list: return compressed
    for motif in motifs_list:
        if motif and isinstance(motif, str) and motif.strip():
            safe_placeholder_name = re.sub(r'\W+', '_', motif).upper()[:20]
            placeholder = f"<MOTIF_{safe_placeholder_name}>"
            try:
                compressed = re.sub(re.escape(motif.lower()), placeholder, compressed, flags=re.IGNORECASE)
            except re.error as re_e:
                print(f"    Regex error during compression for motif '{motif}': {re_e}. Skipping.")
                continue
    return compressed

def text_to_binary_matrix(text_input, size=MATRIX_SIZE_GLOBAL):
    if not text_input or not isinstance(text_input, str):
        return np.zeros(size, dtype=int)
    hash_digest = hashlib.sha256(text_input.encode('utf-8', 'ignore')).hexdigest()
    required_bits = size[0] * size[1]
    binary_string = bin(int(hash_digest, 16))[2:].zfill(required_bits)
    binary_string_padded = binary_string.ljust(required_bits, '0')
    bits = [int(b) for b in binary_string_padded[:required_bits]]
    return np.array(bits).reshape(size)

def compute_bdm_for_text(text_input, bdm_instance, matrix_s=MATRIX_SIZE_GLOBAL):
    if not text_input or not isinstance(text_input, str) : return 0.0
    if not text_input.strip(): return 0.0
    MAX_TEXT_FOR_BDM_HASH = 2000
    text_for_hash = text_input if len(text_input) <= MAX_TEXT_FOR_BDM_HASH else text_input[:MAX_TEXT_FOR_BDM_HASH]
    matrix = text_to_binary_matrix(text_for_hash, size=matrix_s)
    try:
        return bdm_instance.bdm(matrix)
    except Exception as e_bdm:
        print(f"      Error during BDM calculation for text (len {len(text_input)}, hashed part len {len(text_for_hash)}): {e_bdm}")
        return -1.0

def compute_mdl_cost_for_text_block(text_block_str, motifs_list, bdm_instance, matrix_s=MATRIX_SIZE_GLOBAL):
    if not isinstance(text_block_str, str) : text_block_str = ""
    l_h = 0.0
    valid_motifs_for_lh = []
    if motifs_list:
        valid_motifs_for_lh = [m for m in motifs_list if isinstance(m, str) and m.strip()]
        if valid_motifs_for_lh:
            l_h = sum(compute_bdm_for_text(m, bdm_instance, matrix_s) for m in valid_motifs_for_lh)
    compressed_text_block = llm_compress_text(text_block_str, motifs_list)
    l_d_h = compute_bdm_for_text(compressed_text_block, bdm_instance, matrix_s)
    if l_h < 0 or l_d_h < 0: return -1.0, -1.0, -1.0
    return l_h, l_d_h, l_h + l_d_h

# --- Main Execution Logic ---
def main():
    print("--- MDL Prototype: Analyzing Per-QID Aggregated Text (Local LLM) ---")

    local_llm_pipeline = None
    local_llm_tokenizer = None
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    try:
        print(f"Loading tokenizer for {LOCAL_LLM_MODEL_ID}...")
        local_llm_tokenizer = AutoTokenizer.from_pretrained(LOCAL_LLM_MODEL_ID)
        bnb_config = None
        quant_active = False
        if USE_QUANTIZATION_FOR_LOCAL_LLM and torch.cuda.is_available():
            try:
                compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
                bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=compute_dtype, bnb_4bit_use_double_quant=True)
                quant_active = True
                print(f"BNB config created for {LOCAL_LLM_MODEL_ID}, compute_dtype: {compute_dtype}.")
            except Exception as e_bnb:
                print(f"WARN: Failed to create BitsAndBytesConfig: {e_bnb}. Quantization disabled.")
        print(f"Loading local model {LOCAL_LLM_MODEL_ID} (Quantization: {quant_active})...")
        model_kwargs = {"device_map": "auto", "trust_remote_code": True}
        if quant_active: model_kwargs["quantization_config"] = bnb_config
        else:
            if device.type == 'cuda': model_kwargs["torch_dtype"] = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
        local_llm_model = AutoModelForCausalLM.from_pretrained(LOCAL_LLM_MODEL_ID, **model_kwargs)
        local_llm_pipeline = pipeline("text-generation", model=local_llm_model, tokenizer=local_llm_tokenizer)
        print(f"Local LLM pipeline for {LOCAL_LLM_MODEL_ID} initialized successfully.")
    except Exception as e:
        print(f"CRITICAL: Failed to initialize local LLM pipeline: {e}")
        return

    try:
        bdm_instance = BDM(ndim=2)
        print("BDM instance initialized successfully.")
    except Exception as e_bdm_init:
        print(f"CRITICAL: Failed to initialize BDM instance: {e_bdm_init}")
        return

    if not os.path.exists(P2_COLLATED_FILE):
        print(f"ERROR: Phase 2 output file not found: {P2_COLLATED_FILE}")
        return
    print(f"Loading Phase 2 output from: {P2_COLLATED_FILE}...")
    phase2_data_content = None
    try:
        with open(P2_COLLATED_FILE, 'r', encoding='utf-8') as f:
            phase2_data_content = json.load(f)
    except Exception as e:
        print(f"Error loading or parsing {P2_COLLATED_FILE}: {e}")
        return

    all_qid_results = []

    # Use P3_QIDS_TO_PROCESS_THEMATICALLY if defined and not empty,
    # otherwise, qids_to_process_limit applies to all QIDs.
    qids_to_target = []
    aggregated_content_by_qid = {}

    if phase2_data_content:
        aggregated_content_by_qid = phase2_data_content.get("aggregated_pdf_content_by_qid", {})

    if not aggregated_content_by_qid:
        print(f"No 'aggregated_pdf_content_by_qid' key found or data loaded from {P2_COLLATED_FILE}.")
        return

    if P3_QIDS_TO_PROCESS_THEMATICALLY and isinstance(P3_QIDS_TO_PROCESS_THEMATICALLY, list) and len(P3_QIDS_TO_PROCESS_THEMATICALLY) > 0:
        qids_to_target = [qid for qid in P3_QIDS_TO_PROCESS_THEMATICALLY if qid in aggregated_content_by_qid]
        print(f"Targeting specific QIDs based on P3_QIDS_TO_PROCESS_THEMATICALLY: {qids_to_target}")
        if not qids_to_target:
            print(f"Warning: None of the QIDs specified in P3_QIDS_TO_PROCESS_THEMATICALLY ({P3_QIDS_TO_PROCESS_THEMATICALLY}) were found in the loaded data.")
            return # Or process all if preferred fallback
    else:
        qids_to_process_limit = 1 # Example: Process only the first QID if no specific list
        print(f"P3_QIDS_TO_PROCESS_THEMATICALLY not set or empty. Processing up to {qids_to_process_limit} QID(s) from the data.")
        # Iterate through available QIDs up to the limit
        temp_qids_to_target = []
        for q_idx, qid_key in enumerate(aggregated_content_by_qid.keys()):
            if q_idx < qids_to_process_limit:
                temp_qids_to_target.append(qid_key)
            else:
                break
        qids_to_target = temp_qids_to_target
        if not qids_to_target:
            print("No QIDs found to process based on the limit.")
            return

    print(f"\nMDL analysis will run for these QIDs: {qids_to_target}\n")

    for qid in qids_to_target:
        text_items_list = aggregated_content_by_qid.get(qid) # Get items for the target QID

        if not text_items_list or not isinstance(text_items_list, list):
            print(f"Skipping QID {qid}: no text items or not a list.")
            continue

        print(f"--- Analyzing Aggregated Text for QID: {qid} ---")
        all_texts_for_qid = [
            item.get("text", "") for item in text_items_list
            if isinstance(item, dict) and isinstance(item.get("text"), str) and item.get("text","").strip()
        ]
        corpus_for_qid = "\n\n<RSP_SEP>\n\n".join(all_texts_for_qid)

        if len(corpus_for_qid.strip()) < 100:
            print(f"  Skipping QID {qid}: combined text too short ({len(corpus_for_qid)} chars).")
            continue

        num_responses_for_qid = len(all_texts_for_qid)
        print(f"  Combined corpus for QID {qid} has {len(corpus_for_qid)} chars from {num_responses_for_qid} responses.")

        baseline_l_d_h = compute_bdm_for_text(corpus_for_qid, bdm_instance, MATRIX_SIZE_GLOBAL)
        if baseline_l_d_h < 0:
            print(f"  Error computing baseline BDM for QID {qid}. Skipping.")
            continue
        baseline_total_mdl = baseline_l_d_h
        print(f"  Baseline MDL for QID {qid} (L(D_orig_corpus_for_qid)): {baseline_total_mdl:.4f}")

        extracted_motifs = llm_extract_motifs_local_hf(corpus_for_qid, local_llm_pipeline, local_llm_tokenizer)
        print(f"  Local LLM Extracted Motifs for QID {qid}: {extracted_motifs}")

        if not extracted_motifs:
            print("  No valid motifs extracted by Local LLM for this QID.")
            all_qid_results.append({
                "qid": qid, "corpus_len_for_qid": len(corpus_for_qid), "num_responses": num_responses_for_qid,
                "baseline_mdl": baseline_total_mdl, "motifs": [],
                "l_h_motifs": 0, "l_d_h_motifs": baseline_total_mdl, "total_mdl_motifs": baseline_total_mdl,
                "compression_achieved": 0.0
            })
            print("-" * 40)
            continue

        l_h, l_d_h, total_mdl_with_motifs = compute_mdl_cost_for_text_block(
            corpus_for_qid, extracted_motifs, bdm_instance, MATRIX_SIZE_GLOBAL
        )

        if total_mdl_with_motifs < 0:
            print(f"  Error computing MDL cost with motifs for QID {qid}. Skipping this result.")
            all_qid_results.append({
                "qid": qid, "corpus_len_for_qid": len(corpus_for_qid), "num_responses": num_responses_for_qid,
                "baseline_mdl": baseline_total_mdl, "motifs": extracted_motifs,
                "l_h_motifs": -1.0, "l_d_h_motifs": -1.0, "total_mdl_motifs": -1.0,
                "compression_achieved": "BDM_ERROR"
            })
            print("-" * 40)
            continue

        print(f"  L(H) motif complexity for QID {qid}: {l_h:.4f}")
        print(f"  L(D|H) compressed corpus complexity for QID {qid}: {l_d_h:.4f}")
        print(f"  Total MDL cost with motifs for QID {qid}: {total_mdl_with_motifs:.4f}")

        compression = baseline_total_mdl - total_mdl_with_motifs
        result_status = ""
        if compression > 0.0001: result_status = f"SUCCESS: Compression achieved: {compression:.4f}"
        else: result_status = f"NOTE: No significant compression (or cost increased). Diff: {compression:.4f}"
        print(f"  {result_status}")

        all_qid_results.append({
            "qid": qid, "corpus_len_for_qid": len(corpus_for_qid), "num_responses": num_responses_for_qid,
            "baseline_mdl": baseline_total_mdl, "motifs": extracted_motifs,
            "l_h_motifs": l_h, "l_d_h_motifs": l_d_h, "total_mdl_motifs": total_mdl_with_motifs,
            "compression_achieved": compression
        })
        print("-" * 40)

    # (Summary printing and saving results - same as before)
    print("\n--- Overall QID-based MDL Analysis Summary (Local LLM) ---")
    if not all_qid_results:
        print("No QIDs were processed or no valid results generated.")
    else:
        valid_results_for_stats = [r for r in all_qid_results if not isinstance(r['compression_achieved'], str) and r['l_h_motifs'] >= 0]
        num_compressed_qids = sum(1 for r in valid_results_for_stats if r['compression_achieved'] > 0.0001)
        successful_compressions = [r['compression_achieved'] for r in valid_results_for_stats if r['compression_achieved'] > 0.0001]
        avg_compression = np.mean(successful_compressions) if successful_compressions else 0
        max_compression = np.max(successful_compressions) if successful_compressions else 0
        # Note: qids_processed_count was removed as qids_to_target now defines how many are attempted.
        print(f"Total QIDs targeted for analysis: {len(qids_to_target)}")
        print(f"Total QID results logged: {len(all_qid_results)}")
        print(f"Number of QIDs where compression was achieved (from valid results): {num_compressed_qids}")
        if num_compressed_qids > 0:
            print(f"  Average compression (for successful cases): {avg_compression:.4f}")
            print(f"  Maximum compression achieved across QIDs: {max_compression:.4f}")
        output_filename_qids = "mdl_analysis_per_qid_local_llm_v2.json" # Changed filename slightly
        try:
            with open(output_filename_qids, "w", encoding="utf-8") as f_out:
                json.dump(all_qid_results, f_out, indent=2)
            print(f"Detailed QID-based results saved to {output_filename_qids}")
        except Exception as e_save:
            print(f"Error saving QID-based results to {output_filename_qids}: {e_save}")

if __name__ == "__main__":
    main()

--- MDL Prototype: Analyzing Per-QID Aggregated Text (Local LLM) ---
Using device: cuda
Loading tokenizer for google/gemma-2b-it...
BNB config created for google/gemma-2b-it, compute_dtype: torch.bfloat16.
Loading local model google/gemma-2b-it (Quantization: True)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Local LLM pipeline for google/gemma-2b-it initialized successfully.
BDM instance initialized successfully.
Loading Phase 2 output from: /content/drive/MyDrive/Colab Notebooks/Legal/Phase2_PDF_Collated_Texts/phase2_collated_pdf_texts.json...
Targeting specific QIDs based on P3_QIDS_TO_PROCESS_THEMATICALLY: ['Q4']

MDL analysis will run for these QIDs: ['Q4']

--- Analyzing Aggregated Text for QID: Q4 ---
  Combined corpus for QID Q4 has 129501 chars from 209 responses.
  Baseline MDL for QID Q4 (L(D_orig_corpus_for_qid)): 121.3693
  Local LLM Extracted Motifs for QID Q4: ['Exceptions to individual rights', 'Balancing individual rights and competing public interests', 'Robust privacy protections in the digital economy', 'Stricter data rights for employees', 'Balancing employee rights with the need for data security']
  L(H) motif complexity for QID Q4: 609.4512
  L(D|H) compressed corpus complexity for QID Q4: 118.2032
  Total MDL cost with motifs for QID Q4: 727.6544
  NOTE: No signific

# 🧪 Minimal Working Prototype: Synthetic Data via Embedding Translation

This notebook demonstrates how to:
- Embed a few real (or mock) samples
- Learn a transformation into a new space (simulating vec2vec)
- Generate synthetic text from translated embeddings
- Evaluate semantic similarity and privacy risk

In [None]:
# Step 1: Install dependencies
!pip install -q sentence-transformers scikit-learn numpy transformers

In [None]:
# Step 2: Embed original and simulated target samples
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.preprocessing import normalize

model = SentenceTransformer('all-MiniLM-L6-v2')

# Simulated 'real' text (de-identified)
real_texts = [
    'I am concerned about data sharing with third parties.',
    'The reform must ensure algorithmic transparency.',
    'Consent should be more meaningful and not just a checkbox.'
]

# Simulated 'synthetic corpus' as new target space
synthetic_texts = [
    'Ensure user control over personal data.',
    'Explain how machine learning models make decisions.',
    'Design opt-in mechanisms that reflect informed choice.'
]

real_embeddings = normalize(model.encode(real_texts))
target_embeddings = normalize(model.encode(synthetic_texts))

In [None]:
# Step 3: Learn and apply translation (Procrustes)
from scipy.linalg import orthogonal_procrustes

W, _ = orthogonal_procrustes(real_embeddings, target_embeddings)
translated_embeddings = real_embeddings @ W

In [None]:
# Step 4: Use nearest neighbors to recover semantically-aligned synthetic prompts
from sklearn.metrics.pairwise import cosine_similarity

sim_matrix = cosine_similarity(translated_embeddings, target_embeddings)
for i, row in enumerate(sim_matrix):
    match_idx = row.argmax()
    print(f"Original: {real_texts[i]}\n → Synthetic: {synthetic_texts[match_idx]}\n")

## ✅ Next Steps
- Use translated embeddings to prompt an LLM for full text generation
- Use this embedding alignment method to create synthetic corpus in bulk
- Validate outputs via cosine similarity + human review

To integrate with `synthetic-data-kit` or `unsloth`, use these outputs to generate QA pairs or summaries.