<a href="https://colab.research.google.com/github/baker-jr-john/automated-summary-evaluation-llm/blob/main/02_Calibration_Tools.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# =============================================================================
# Calibration Subset Selection
# =============================================================================

# =============================================================================
# SETUP: Mount Google Drive
# =============================================================================
# Colab-only: mount Google Drive at /content/drive so the MyDrive paths
# configured below can be read and written.
from google.colab import drive
drive.mount('/content/drive')

# =============================================================================
# CONFIGURATION: Set your file paths
# =============================================================================

# UPDATE THESE PATHS to match your Google Drive structure:
# VALIDATION_DATASET_PATH is the CSV that is read; OUTPUT_DIR receives every
# file this cell writes.
VALIDATION_DATASET_PATH = '/content/drive/MyDrive/Courses/2025/3_Fall/EDUC_6192_Large_Language_Model_Applications_in_Education/Project/Phase_1/data/dataset/validation_set_combined_60.csv'
OUTPUT_DIR = '/content/drive/MyDrive/Courses/2025/3_Fall/EDUC_6192_Large_Language_Model_Applications_in_Education/Project/Phase_2/Phase2_Calibration/'

# The script will create these files in OUTPUT_DIR:
# - calibration_subset.csv
# - calibration_practice_summaries.txt
# - calibration_practice_ids.txt

# Banner echoing the configured paths so a wrong path is visible immediately.
print("="*80)
print("CALIBRATION SUBSET SELECTION")
print("="*80)
print(f"\nReading from: {VALIDATION_DATASET_PATH}")
print(f"Saving to: {OUTPUT_DIR}")

# =============================================================================
# STEP 1: Load and analyze dataset
# =============================================================================
import pandas as pd
import numpy as np
import os

print("\n[STEP 1] Loading Dataset...")
print("-" * 80)

# Full validation set; the later steps read `df` to pick calibration rows.
df = pd.read_csv(VALIDATION_DATASET_PATH)

print(f"✓ Loaded {len(df)} summaries")
print(f"\nColumns: {list(df.columns)}")

# Element-wise comparison (not `is`) so the counts work on a pandas Series.
print("\n\nDataset Distribution:")
print(f"  Authentic (ASAP 2.0): {(df['synthetic_flag'] == False).sum()}")
print(f"  Synthetic (GPT-4o-Mini): {(df['synthetic_flag'] == True).sum()}")

# Per-score counts in ascending score order, with share of the whole set.
print("\nScore distribution:")
score_dist = df['score'].value_counts().sort_index()
for score, count in score_dist.items():
    pct = (count / len(df)) * 100
    print(f"  Score {score}: {count:2d} summaries ({pct:4.1f}%)")

# =============================================================================
# STEP 2: Define selection strategy
# =============================================================================
print("\n\n[STEP 2] Selection Strategy")
print("-" * 80)

# Quota per score level: how many summaries to pick in total ('target') and
# how that total splits between authentic and synthetic rows.
selection_plan = {
    1: {'target': 2, 'authentic': 1, 'synthetic': 1},
    2: {'target': 3, 'authentic': 2, 'synthetic': 1},
    3: {'target': 3, 'authentic': 2, 'synthetic': 1},
    4: {'target': 2, 'authentic': 1, 'synthetic': 1},
    5: {'target': 1, 'authentic': 1, 'synthetic': 0},
    6: {'target': 1, 'authentic': 1, 'synthetic': 0},
}

print("\nWill select:")
plan_lines = [
    f"  Score {level}: {quota['target']} summaries ({quota['authentic']} auth, {quota['synthetic']} synth)"
    for level, quota in selection_plan.items()
]
print("\n".join(plan_lines))

# Size of the calibration set implied by the plan.
total_target = sum(quota['target'] for quota in selection_plan.values())
print(f"\nTotal: {total_target} summaries for calibration")

# =============================================================================
# STEP 3: Execute selection
# =============================================================================
print("\n\n[STEP 3] Selecting Summaries")
print("-" * 80)

# Selected rows, in order: ascending score, authentic before synthetic within
# a score. Step 4 turns this list into the calibration DataFrame.
calibration_subset = []

# Drive the selection from `selection_plan` so the plan printed in step 2 and
# the rows actually picked cannot drift apart.
for score, plan in selection_plan.items():
    score_rows = df[df['score'] == score]
    id_groups = []

    for plan_key, flag_value, label in (
        ('authentic', False, 'auth'),
        ('synthetic', True, 'synth'),
    ):
        needed = plan[plan_key]
        if needed == 0:
            continue

        pool = score_rows[score_rows['synthetic_flag'] == flag_value]
        if len(pool) < needed:
            # Fail with the score and source named, instead of a bare
            # "positional indexer is out-of-bounds" from .iloc.
            raise ValueError(
                f"Score {score}: need {needed} {plan_key} summaries, "
                f"found {len(pool)}"
            )

        # Take the first `needed` rows in file order.
        picked = [pool.iloc[i] for i in range(needed)]
        calibration_subset.extend(picked)
        picked_ids = ', '.join(str(row['essay_id']) for row in picked)
        id_groups.append(f"{picked_ids} ({label})")

    print(f"Score {score}: Selected {', '.join(id_groups)}")

# =============================================================================
# STEP 4: Create calibration DataFrame
# =============================================================================
print("\n\n[STEP 4] Creating Calibration DataFrame")
print("-" * 80)

# One row per selected summary; renumber from 0 so that row position + 1 can
# serve as the practice number in later steps.
cal_df = pd.DataFrame(calibration_subset)
cal_df = cal_df.reset_index(drop=True)

print(f"\n✓ Created DataFrame with {len(cal_df)} summaries")
print("\nScore distribution in calibration set:")
print(cal_df['score'].value_counts().sort_index())
print(f"\nAuthentic: {(cal_df['synthetic_flag'] == False).sum()}")
print(f"Synthetic: {(cal_df['synthetic_flag'] == True).sum()}")

# =============================================================================
# STEP 5: Display details
# =============================================================================
print("\n\n[STEP 5] Calibration Subset Details")
print("-" * 80)

# The Source column is 9 wide because both "Authentic" and "Synthetic" are
# 9 characters; a narrower width pushes the later columns off the header.
table_header = f"{'#':<3} {'Essay ID':<25} {'Score':<6} {'Source':<9} {'Words':<6} {'Error Pattern':<45}"
print(f"\n{table_header}")
print("-" * len(table_header))

for idx, row in cal_df.iterrows():
    source = "Synthetic" if row['synthetic_flag'] else "Authentic"
    # str() keeps the slice below valid even if the pattern is not text.
    error = str(row['target_error_pattern']) if pd.notna(row['target_error_pattern']) else "N/A"
    print(f"{idx+1:<3} {row['essay_id']:<25} {row['score']:<6} {source:<9} {row['word_count']:<6} {error[:45]:<45}")

# =============================================================================
# STEP 6: Save CSV to Google Drive
# =============================================================================
print("\n\n[STEP 6] Saving Calibration Subset CSV")
print("-" * 80)

# Create output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

# index=False: the positional index carries no information worth persisting.
cal_csv_path = os.path.join(OUTPUT_DIR, 'calibration_subset.csv')
cal_df.to_csv(cal_csv_path, index=False)
print(f"✓ Saved: {cal_csv_path}")

# =============================================================================
# STEP 7: Create practice summaries document
# =============================================================================
print("\n\n[STEP 7] Creating Practice Summaries Document")
print("-" * 80)

# Take the count from the data so the title and instructions stay correct if
# the selection plan changes.
n_practice = len(cal_df)

output_lines = [
    "=" * 80,
    f"CALIBRATION PRACTICE SET - {n_practice} SUMMARIES",
    "=" * 80,
    "\nInstructions:",
    "1. Score each summary across all 4 dimensions WITHOUT looking at benchmark scores",
    "2. Use your rubric and document your reasoning",
    f"3. After scoring all {n_practice}, compare with the benchmark scores",
    "4. Analyze discrepancies to refine your rubric interpretation",
    "\n" + "=" * 80 + "\n",
]

# One section per summary. The benchmark score is deliberately not written
# here; only the ID, source, word count and (if present) error pattern.
for idx, row in cal_df.iterrows():
    practice_num = idx + 1

    output_lines.append(f"\n{'='*80}")
    output_lines.append(f"PRACTICE_{practice_num:02d}: {row['essay_id']}")
    output_lines.append(f"{'='*80}")
    output_lines.append(f"Source: {'Synthetic' if row['synthetic_flag'] else 'Authentic'}")
    output_lines.append(f"Word Count: {row['word_count']}")
    if pd.notna(row['target_error_pattern']):
        output_lines.append(f"Error Pattern: {row['target_error_pattern']}")

    output_lines.append(f"\n{'-'*80}")
    output_lines.append("SUMMARY TEXT:")
    output_lines.append(f"{'-'*80}\n")
    # A missing text would be a float NaN and break the join below.
    output_lines.append("" if pd.isna(row['full_text']) else str(row['full_text']))
    output_lines.append("\n" + "="*80 + "\n")

# Save to text file
practice_txt_path = os.path.join(OUTPUT_DIR, 'calibration_practice_summaries.txt')
with open(practice_txt_path, 'w', encoding='utf-8') as f:
    f.write('\n'.join(output_lines))

print(f"✓ Saved: {practice_txt_path}")

# =============================================================================
# STEP 8: Create Practice IDs reference
# =============================================================================
print("\n\n[STEP 8] Practice IDs for Calibration Tracker")
print("-" * 80)
print("\nCopy these IDs into your Calibration_Tracker.xlsx:")
print()

# Format the ID lines once so the console listing and the saved reference
# file are guaranteed to match.
practice_id_lines = [
    f"PRACTICE_R1_{i:02d}: {essay_id}"
    for i, essay_id in enumerate(cal_df['essay_id'], 1)
]
for line in practice_id_lines:
    print(line)

# Save to a separate reference file
practice_ids_path = os.path.join(OUTPUT_DIR, 'calibration_practice_ids.txt')
with open(practice_ids_path, 'w', encoding='utf-8') as f:
    f.write("Practice IDs for Calibration Tracker\n")
    f.write("="*80 + "\n\n")
    f.write("Use these in your Calibration_Tracker.xlsx:\n\n")
    f.writelines(f"{line}\n" for line in practice_id_lines)

print(f"\n✓ Saved: {practice_ids_path}")

# =============================================================================
# COMPLETION
# =============================================================================
# Closing recap: which files were written, what was selected, what to do next.
print("\n\n" + "="*80)
print("CALIBRATION SUBSET SELECTION COMPLETE")
print("="*80)

print(f"\n✓ Files saved to: {OUTPUT_DIR}")
print("  • calibration_subset.csv (metadata)")
print("  • calibration_practice_summaries.txt (full texts)")
print("  • calibration_practice_ids.txt (IDs for tracker)")

print(f"\n✓ Selected {len(cal_df)} summaries:")
print("  • All 6 score levels covered")
print(f"  • {(cal_df['synthetic_flag'] == False).sum()} authentic, {(cal_df['synthetic_flag'] == True).sum()} synthetic")
print(f"  • Word count range: {cal_df['word_count'].min()}-{cal_df['word_count'].max()}")

print("\n📋 Next steps:")
print("  1. Download the three files from your Google Drive")
print("  2. Review the practice summaries document")
print("  3. Begin Phase 1 of calibration (Rubric Study)")
print("  4. Use the practice IDs to update your Calibration_Tracker.xlsx")

print("\n" + "="*80)

Mounted at /content/drive
CALIBRATION SUBSET SELECTION

Reading from: /content/drive/MyDrive/Courses/2025/3_Fall/EDUC_6192_Large_Language_Model_Applications_in_Education/Project/Phase_1/data/dataset/validation_set_combined_60.csv
Saving to: /content/drive/MyDrive/Courses/2025/3_Fall/EDUC_6192_Large_Language_Model_Applications_in_Education/Project/Phase_2/Phase2_Calibration/

[STEP 1] Loading Dataset...
--------------------------------------------------------------------------------
✓ Loaded 60 summaries

Columns: ['essay_id', 'score', 'full_text', 'assignment', 'prompt_name', 'economically_disadvantaged', 'student_disability_status', 'ell_status', 'race_ethnicity', 'gender', 'source_text_1', 'source_text_2', 'source_text_3', 'source_text_4', 'synthetic_flag', 'target_error_pattern', 'generation_date', 'generation_model', 'word_count']


Dataset Distribution:
  Authentic (ASAP 2.0): 37
  Synthetic (GPT-4o-Mini): 23

Score distribution:
  Score 1:  8 summaries (13.3%)
  Score 2: 18 sum

In [None]:
"""
Generate Calibration Benchmark Scores Answer Key
Reads calibration_subset.csv and creates formatted answer key file
"""

import pandas as pd
from pathlib import Path

# Configuration
# CALIBRATION_SUBSET_PATH: CSV read by create_benchmark_scores_file().
# OUTPUT_PATH: answer-key text file that the function writes.
CALIBRATION_SUBSET_PATH = "/content/drive/MyDrive/Courses/2025/3_Fall/EDUC_6192_Large_Language_Model_Applications_in_Education/Project/Phase_2/Phase2_Calibration/calibration_subset.csv"  # Adjust path as needed
OUTPUT_PATH = "/content/drive/MyDrive/Courses/2025/3_Fall/EDUC_6192_Large_Language_Model_Applications_in_Education/Project/Phase_2/Phase2_Calibration/CALIBRATION_BENCHMARK_SCORES.txt"

def _benchmark_entry_lines(df, practice_id, note=None):
    """Return the answer-key lines for one practice summary.

    With *note* the entry ends in a ``Note:`` line (exemplar layout);
    without it the entry shows the word count and, for synthetic rows that
    have one, the error pattern (blind-round layout).
    """
    matches = df[df['practice_id'] == practice_id]
    if matches.empty:
        # Same exception type .iloc[0] would raise, but naming the missing ID.
        raise IndexError(
            f"{practice_id} not found in calibration subset ({len(df)} rows)"
        )
    row = matches.iloc[0]

    lines = [
        f"{practice_id}: {row['essay_id']}",
        f"Benchmark Score: {row['score']}",
        f"Source: {'Authentic' if row['synthetic_flag'] == 0 else 'Synthetic'}",
    ]
    if note is not None:
        lines.append(f"Note: {note}")
    else:
        lines.append(f"Word Count: {row['word_count']}")
        if row['synthetic_flag'] == 1 and pd.notna(row['target_error_pattern']):
            lines.append(f"Error Pattern: {row['target_error_pattern']}")
    lines.append("")
    return lines


def create_benchmark_scores_file(input_csv, output_txt):
    """
    Create formatted benchmark scores file from calibration subset CSV.

    Rows are labelled PRACTICE_01, PRACTICE_02, ... by row order unless the
    CSV already has a ``practice_id`` column. The answer key lists the
    exemplars, the two blind practice rounds, the score distribution and the
    agreement targets, then a short summary is printed.

    Parameters:
    -----------
    input_csv : str
        Path to calibration_subset.csv
    output_txt : str
        Path for output benchmark scores file

    Raises:
    -------
    IndexError
        If an exemplar or round practice ID has no matching row.
    """
    rule = "=" * 80

    # Read calibration subset
    df = pd.read_csv(input_csv)

    # Define practice IDs and their roles
    exemplars = {
        'PRACTICE_01': 'Not used for blind practice - reference only',
        'PRACTICE_02': 'EXEMPLAR - analyzed in detail in EXEMPLAR_ANALYSIS_GUIDE.md',
        'PRACTICE_07': 'EXEMPLAR - analyzed in detail in EXEMPLAR_ANALYSIS_GUIDE.md',
        'PRACTICE_11': 'EXEMPLAR - analyzed in detail in EXEMPLAR_ANALYSIS_GUIDE.md'
    }

    round_1 = ['PRACTICE_03', 'PRACTICE_04', 'PRACTICE_05', 'PRACTICE_06']
    round_2 = ['PRACTICE_08', 'PRACTICE_09', 'PRACTICE_10', 'PRACTICE_12']

    # Every count quoted in the instructions comes from the round lists, so
    # the text cannot disagree with the entries actually listed below.
    n_blind = len(round_1) + len(round_2)
    min_exact = (60 * n_blind + 99) // 100    # smallest k with k/n >= 60%
    min_adjacent = (85 * n_blind) // 100 + 1  # smallest k with k/n > 85%

    # Create practice_id column if it doesn't exist
    if 'practice_id' not in df.columns:
        # Create practice IDs based on row order
        df['practice_id'] = [f'PRACTICE_{i:02d}' for i in range(1, len(df) + 1)]

    # Header
    content = [
        rule,
        "CALIBRATION PRACTICE SUMMARIES - BENCHMARK SCORES (ANSWER KEY)",
        rule,
        "",
        f"DO NOT LOOK AT THIS FILE UNTIL YOU HAVE SCORED ALL {n_blind} PRACTICE SUMMARIES BLIND!",
        "",
        "Instructions for Use:",
        f"1. Score all {n_blind} blind practice summaries WITHOUT looking at this file:",
        f"   Round 1: {', '.join(round_1)}",
        f"   Round 2: {', '.join(round_2)}",
        "2. Record your scores in Calibration_Tracker.xlsx",
        f"3. AFTER completing all {n_blind}, open this file to compare your scores",
        "4. Calculate agreement metrics and analyze discrepancies",
        "",
        rule,
        "",
    ]

    # Exemplar summaries section
    content += ["EXEMPLAR SUMMARIES (Study These First - Scores Already Known)", rule, ""]
    for practice_id, note in exemplars.items():
        content += _benchmark_entry_lines(df, practice_id, note)
    content += [rule, ""]

    # Practice Round 1
    content += ["PRACTICE ROUND 1 - BLIND SCORING (Complete First)", rule, ""]
    for practice_id in round_1:
        content += _benchmark_entry_lines(df, practice_id)
    content += [rule, ""]

    # Practice Round 2
    content += ["PRACTICE ROUND 2 - BLIND SCORING (Complete Second)", rule, ""]
    for practice_id in round_2:
        content += _benchmark_entry_lines(df, practice_id)
    content += [rule, ""]

    # Score distribution
    content += ["SCORE DISTRIBUTION IN PRACTICE SET", rule, ""]

    score_counts = df['score'].value_counts().sort_index()
    for score, count in score_counts.items():
        practice_ids = df[df['score'] == score]['practice_id'].tolist()
        ids_str = ', '.join(practice_ids)

        # Identify exemplars
        exemplar_tags = [f"{pid} - exemplar" for pid in practice_ids if pid in exemplars]
        if exemplar_tags:
            ids_str += f" ({', '.join(exemplar_tags)})"

        noun = 'summary' if count == 1 else 'summaries'
        content.append(f"Score {score}: {count} {noun} ({ids_str})")

    content += ["", rule, ""]

    # Agreement metrics section
    content += [
        "AGREEMENT METRICS TO CALCULATE",
        rule,
        "",
        "After comparing your scores to these benchmarks:",
        "",
        "1. EXACT AGREEMENT: How many summaries did you score exactly the same?",
        f"   Target: ≥ 60% (at least {min_exact} out of {n_blind})",
        "",
        "2. ADJACENT AGREEMENT: How many were within ±1 point?",
        f"   Target: > 85% (at least {min_adjacent} out of {n_blind})",
        "",
        "3. MEAN ABSOLUTE ERROR (MAE): Average distance from benchmark",
        "   Target: < 0.5 points per dimension",
        "   ",
        "   Formula: Sum of |your score - benchmark| ÷ number of summaries",
        "",
        "4. PATTERNS IN DISCREPANCIES:",
        "   - Do you tend to score higher or lower than benchmarks?",
        "   - Are discrepancies concentrated in specific dimensions?",
        "   - Are errors larger for certain score levels?",
        "",
        rule,
        "",
    ]

    # Next steps
    content += [
        "NEXT STEPS AFTER COMPARISON",
        rule,
        "",
        "1. Calculate your agreement metrics",
        "2. Identify patterns in discrepancies",
        "3. Create decision rules for borderline cases",
        "4. Review rubric areas where you struggled",
        "5. Proceed to Practice Round 2 with refined approach",
        "6. After Round 2, assess readiness for full validation scoring",
        "",
        rule,
    ]

    # Write to file
    with open(output_txt, 'w', encoding='utf-8') as f:
        f.write('\n'.join(content))

    print(f"✓ Benchmark scores file created: {output_txt}")
    print(f"  Total summaries: {len(df)}")
    print(f"  Exemplars: {len(exemplars)}")
    print(f"  Practice Round 1: {len(round_1)}")
    print(f"  Practice Round 2: {len(round_2)}")
    print("\nScore distribution:")
    for score, count in score_counts.items():
        print(f"  Score {score}: {count}")


if __name__ == "__main__":
    # Create the benchmark scores file
    create_benchmark_scores_file(CALIBRATION_SUBSET_PATH, OUTPUT_PATH)

    print("\n✓ Generation complete!")
    print(f"\nReminder: DO NOT open {OUTPUT_PATH} until after blind scoring!")

✓ Benchmark scores file created: /content/drive/MyDrive/Courses/2025/3_Fall/EDUC_6192_Large_Language_Model_Applications_in_Education/Project/Phase_2/Phase2_Calibration/CALIBRATION_BENCHMARK_SCORES.txt
  Total summaries: 12
  Exemplars: 4
  Practice Round 1: 4
  Practice Round 2: 4

Score distribution:
  Score 1: 2
  Score 2: 3
  Score 3: 3
  Score 4: 2
  Score 5: 1
  Score 6: 1

✓ Generation complete!

Reminder: DO NOT open /content/drive/MyDrive/Courses/2025/3_Fall/EDUC_6192_Large_Language_Model_Applications_in_Education/Project/Phase_2/Phase2_Calibration/CALIBRATION_BENCHMARK_SCORES.txt until after blind scoring!


In [None]:
"""
25-Summary Validation Subset Selector
Stratified sampling from 60-summary validation set for accelerated timeline
"""

import pandas as pd
import numpy as np
import os

# Mount Google Drive (Colab-only) so the /content/drive/MyDrive paths in the
# configuration below resolve.
from google.colab import drive
drive.mount('/content/drive')

# ============================================================================
# CONFIGURATION - Adjust these paths for your Google Drive setup
# ============================================================================

# Path to your validation_set_combined_60.csv in Google Drive
VALIDATION_DATASET_PATH = "/content/drive/MyDrive/Courses/2025/3_Fall/EDUC_6192_Large_Language_Model_Applications_in_Education/Project/Phase_1/data/dataset/validation_set_combined_60.csv"

# Output directory in Google Drive
OUTPUT_DIR = "/content/drive/MyDrive/Courses/2025/3_Fall/EDUC_6192_Large_Language_Model_Applications_in_Education/Project/Phase_2/data"

# Random seed for reproducibility
RANDOM_SEED = 42

# Target distribution for 25 summaries (proportional to original 60)
# Keys are score levels; values are how many summaries to sample at that level.
TARGET_DISTRIBUTION = {
    1: 3,   # From 8 available
    2: 8,   # From 18 available
    3: 8,   # From 20 available
    4: 4,   # From 10 available
    5: 1,   # From 3 available
    6: 1    # From 1 available
}
# Total = 25 summaries

# ============================================================================

def _source_counts(frame):
    """Return (authentic, synthetic) row counts; synthetic_flag 0 means authentic."""
    flags = frame['synthetic_flag']
    return int((flags == 0).sum()), int((flags == 1).sum())


def select_validation_subset(input_csv, target_dist, random_seed=42):
    """
    Select a score-stratified subset from the validation set.

    For each score level in *target_dist* the requested number of rows is
    sampled without replacement. If fewer rows exist, a warning is printed
    and all of them are taken. The combined rows are shuffled and labelled
    VAL_01, VAL_02, ... in a new ``validation_id`` column. Progress and the
    resulting distribution are printed.

    Parameters:
    -----------
    input_csv : str
        Path to validation_set_combined_60.csv
    target_dist : dict
        Target number of summaries per score level
    random_seed : int
        Random seed for reproducibility

    Returns:
    --------
    pd.DataFrame
        Selected subset with an added validation_id column
    """

    print("=" * 80)
    print("FAST-TRACK VALIDATION SUBSET SELECTION")
    print("=" * 80)
    print()

    # Read full validation dataset
    print("Reading validation dataset...")
    print(f"  Path: {input_csv}")
    df = pd.read_csv(input_csv)
    print(f"✓ Loaded {len(df)} summaries")
    print()

    # Seed NumPy's global generator as the original did. The sampling calls
    # below take random_state explicitly and do not depend on this.
    np.random.seed(random_seed)

    # Display current distribution (size taken from the data, not assumed)
    print(f"Current score distribution ({len(df)} summaries):")
    score_dist = df['score'].value_counts().sort_index()
    for score, count in score_dist.items():
        auth_count, synth_count = _source_counts(df[df['score'] == score])
        print(f"  Score {score}: {count} total ({auth_count} authentic, {synth_count} synthetic)")
    print()

    # Stratified sampling by score
    print(f"Target distribution ({sum(target_dist.values())} summaries):")
    for score, target in target_dist.items():
        print(f"  Score {score}: {target} summaries")
    print()

    print("Selecting summaries...")
    selected_dfs = []

    for score, target_count in target_dist.items():
        # Get all summaries with this score
        score_df = df[df['score'] == score]

        if len(score_df) < target_count:
            # Not enough rows at this level: keep everything there is.
            print(f"  ⚠ Warning: Only {len(score_df)} summaries available for score {score} (need {target_count})")
            selected = score_df
        else:
            # Randomly sample target_count summaries
            selected = score_df.sample(n=target_count, random_state=random_seed)

        selected_dfs.append(selected)

        auth_selected, synth_selected = _source_counts(selected)
        print(f"  ✓ Score {score}: Selected {len(selected)} ({auth_selected} authentic, {synth_selected} synthetic)")

    # Combine all selected summaries
    subset_df = pd.concat(selected_dfs, ignore_index=True)

    # Shuffle the final subset so scores are not grouped in scoring order
    subset_df = subset_df.sample(frac=1, random_state=random_seed).reset_index(drop=True)

    # Add validation_id for tracking
    subset_df['validation_id'] = [f'VAL_{i:02d}' for i in range(1, len(subset_df) + 1)]

    print()
    print("=" * 80)
    print("SELECTION COMPLETE")
    print("=" * 80)
    print(f"Total selected: {len(subset_df)} summaries")
    print()

    # Final distribution summary
    print("Final subset distribution:")
    for score in sorted(subset_df['score'].unique()):
        score_rows = subset_df[subset_df['score'] == score]
        auth_count, synth_count = _source_counts(score_rows)
        print(f"  Score {score}: {len(score_rows)} ({auth_count} authentic, {synth_count} synthetic)")

    total_auth, total_synth = _source_counts(subset_df)
    total = len(subset_df)
    if total:  # percentages are undefined for an empty subset
        print()
        print(f"Overall: {total_auth} authentic ({total_auth/total*100:.1f}%), "
              f"{total_synth} synthetic ({total_synth/total*100:.1f}%)")

    return subset_df


def create_scoring_text_file(subset_df, output_txt):
    """
    Create formatted text file for manual scoring.

    Writes a header with instructions, one section per row of *subset_df*
    (ID, original score, source, word count, optional error pattern, the
    summary text and a blank score form) and a footer with next steps.

    Parameters:
    -----------
    subset_df : pd.DataFrame
        Selected validation subset
    output_txt : str
        Path for output text file
    """
    rule = "=" * 80
    thin_rule = "-" * 80
    # Count comes from the data so the header matches what is actually listed.
    n_summaries = len(subset_df)

    # Header
    content = [
        rule,
        f"FAST-TRACK VALIDATION SET - {n_summaries} SUMMARIES FOR SCORING",
        rule,
        "",
        "Instructions:",
        "1. Score each summary across all 4 dimensions using your calibrated approach",
        "2. Record scores in your scoring template spreadsheet",
        "3. Document brief rationale for borderline cases",
        "4. These scores will be your ground truth for LLM validation",
        "",
        # Monday is December 1; the printed schedule puts Tuesday on Dec 2.
        f"Timeline: Complete all {n_summaries} by end of day Monday, December 1",
        "Estimated time: 6-8 hours (15-20 min per summary)",
        "",
        rule,
        "",
    ]

    # Each summary
    for _, row in subset_df.iterrows():
        content += [
            "",
            rule,
            f"{row['validation_id']}: {row['essay_id']}",
            rule,
            f"Original Score: {row['score']}",
            f"Source: {'Authentic' if row['synthetic_flag'] == 0 else 'Synthetic'}",
            f"Word Count: {row['word_count']}",
        ]
        # .get(): the error-pattern column may be absent from the subset.
        error_pattern = row.get('target_error_pattern')
        if row['synthetic_flag'] == 1 and pd.notna(error_pattern):
            content.append(f"Error Pattern: {error_pattern}")
        # A missing text would be a float NaN and break the join below.
        summary_text = "" if pd.isna(row['full_text']) else str(row['full_text'])
        content += [
            "",
            thin_rule,
            "SUMMARY TEXT:",
            thin_rule,
            "",
            summary_text,
            "",
            thin_rule,
            "YOUR SCORES (Complete after reading):",
            thin_rule,
            "Completeness (1-5): _____",
            "Accuracy (1-5): _____",
            "Coherence (1-5): _____",
            "Conciseness (1-5): _____",
            "",
            "Brief rationale/notes:",
            "",
            "",
            rule,
            "",
        ]

    # Footer
    content += [
        "",
        rule,
        "END OF VALIDATION SET",
        rule,
        "",
        "Next steps after scoring:",
        "1. Transfer scores to spreadsheet",
        "2. Begin LLM prompt design (Tuesday)",
        "3. Test initial prompt on 5 summaries (Tuesday)",
        "4. Prepare progress update presentation (Wednesday)",
    ]

    # Write to file
    with open(output_txt, 'w', encoding='utf-8') as f:
        f.write('\n'.join(content))

    print("✓ Scoring text file created")


def main():
    """Select the validation subset, save the CSV and scoring file, print the schedule."""

    # Create output directory if it doesn't exist
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print(f"Output directory: {OUTPUT_DIR}")
    print()

    # Define output paths
    output_csv = os.path.join(OUTPUT_DIR, "validation_subset_25.csv")
    output_txt = os.path.join(OUTPUT_DIR, "validation_subset_25_for_scoring.txt")

    # Select subset
    subset_df = select_validation_subset(
        VALIDATION_DATASET_PATH,
        TARGET_DISTRIBUTION,
        RANDOM_SEED
    )

    # Save CSV
    print("\nSaving subset CSV...")
    subset_df.to_csv(output_csv, index=False)
    print(f"✓ CSV saved: {output_csv}")
    print(f"  {len(subset_df)} summaries")

    # Create scoring text file
    print("\nCreating scoring text file...")
    create_scoring_text_file(subset_df, output_txt)
    print(f"✓ Text file saved: {output_txt}")

    # Summary statistics
    print()
    print("=" * 80)
    print("FILES CREATED IN GOOGLE DRIVE")
    print("=" * 80)
    print("1. validation_subset_25.csv - Subset data for analysis")
    print("2. validation_subset_25_for_scoring.txt - Formatted for manual scoring")
    print()
    print(f"Location: {OUTPUT_DIR}")
    print()
    print("=" * 80)
    print("NEXT STEPS - FAST-TRACK SCHEDULE")
    print("=" * 80)
    print()
    print("📅 MONDAY DEC 1 (Tomorrow):")
    print("   • Score all 25 summaries (6-8 hours)")
    print("   • Use your calibrated approach from practice rounds")
    print("   • Document scores in spreadsheet as you go")
    print()
    print("📅 TUESDAY DEC 2:")
    print("   • Design base LLM evaluation prompt")
    print("   • Set up Llama 3.1 8B in Colab")
    print("   • Test on 5 summaries")
    print("   • Calculate initial agreement metrics")
    print()
    print("📅 WEDNESDAY DEC 3:")
    print("   • Prepare progress update presentation")
    print("   • Iterate on prompt based on results")
    print("   • DELIVERABLE: Progress update")
    print()
    print("🎯 You're on track for the December 10 demo!")
    print("=" * 80)


# Run the selection pipeline only when this code is executed directly,
# not when it is imported.
if __name__ == "__main__":
    main()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Output directory: /content/drive/MyDrive/Courses/2025/3_Fall/EDUC_6192_Large_Language_Model_Applications_in_Education/Project/Phase_2/data

FAST-TRACK VALIDATION SUBSET SELECTION

Reading validation dataset...
  Path: /content/drive/MyDrive/Courses/2025/3_Fall/EDUC_6192_Large_Language_Model_Applications_in_Education/Project/Phase_1/data/dataset/validation_set_combined_60.csv
✓ Loaded 60 summaries

Current score distribution (60 summaries):
  Score 1: 8 total (5 authentic, 3 synthetic)
  Score 2: 18 total (11 authentic, 7 synthetic)
  Score 3: 20 total (12 authentic, 8 synthetic)
  Score 4: 10 total (6 authentic, 4 synthetic)
  Score 5: 3 total (2 authentic, 1 synthetic)
  Score 6: 1 total (1 authentic, 0 synthetic)

Target distribution (25 summaries):
  Score 1: 3 summaries
  Score 2: 8 summaries
  Score 3: 8 summaries
  Score 4: 4 summaries
  Score 5: 1 su