In [None]:
# Install Hugging Face libraries
!pip install transformers[torch] -U
!pip install datasets
!pip install accelerate
!pip install evaluate
!pip install PyPDF2
!pip install pdfplumber
import pandas as pd
import pdfplumber
import re
import os
from datasets import Dataset
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline
from evaluate import load
import numpy as np
import json
import torch
import torch.nn.functional as F
import random
from collections import Counter, defaultdict
from sklearn.metrics import classification_report, precision_recall_fscore_support, accuracy_score

Collecting transformers[torch]
  Downloading transformers-4.47.0-py3-none-any.whl.metadata (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.22,>=0.21 (from transformers[torch])
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m36.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading transformers-4.47.0-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m98.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    F

In [None]:
# Disable Weights & Biases so training does not require a W&B login.
# NOTE(review): the training output in this notebook reports that the
# `WANDB_DISABLED` environment variable is deprecated and points to the
# `report_to` option instead; consider migrating.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Mount Google Drive (for data storage).
# NOTE: the mount code below is wrapped in a string literal, so it is NOT
# executed. Unwrap it when running on Colab with the data stored on Drive.
"""from google.colab import drive
drive.mount('/content/drive')"""

Mounted at /content/drive


In [None]:
def parse_json_with_multiple_grades(json_file):
    """
    Extract (statement, grade) pairs from a curriculum-standards JSON file.

    The JSON root may be a list of items or a dictionary that stores the items
    under the "CFItems" key. For each item the grade levels are read from
    "educationLevel" (falling back to "gradeLevels") and the text from
    "fullStatement" (falling back to "statement").

    Args:
        json_file: Path to the JSON file to read.

    Returns:
        A list of {"text": <statement>, "label": <grade>} dictionaries, one per
        grade attached to a statement. Grades are returned exactly as stored in
        the file; no numeric mapping is applied here. Items without a grade or
        without a non-blank statement are skipped.
    """
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Handle case where the JSON root is a list or a dictionary
    if isinstance(data, list):
        items = data
    elif isinstance(data, dict):
        items = data.get("CFItems") or []  # tolerate an explicit null
    else:
        items = []

    curriculum_data = []

    for item in items:
        if not isinstance(item, dict):
            continue  # Ignore malformed entries instead of crashing on .get

        # Try both 'educationLevel' and 'gradeLevels'
        grade_levels = item.get("educationLevel") or item.get("gradeLevels") or []
        if not isinstance(grade_levels, (list, tuple)):
            # A bare value such as "KG" must count as ONE grade; iterating a
            # string would otherwise yield one bogus grade per character.
            grade_levels = [grade_levels]

        # First non-blank string among the two statement fields (null-safe)
        statement = ""
        for key in ("fullStatement", "statement"):
            value = item.get(key)
            if isinstance(value, str) and value.strip():
                statement = value.strip()
                break

        if grade_levels and statement:
            # Create a separate entry for each grade
            for level in grade_levels:
                curriculum_data.append({
                    "text": statement,
                    "label": level  # Keep grade as stored in the file for now
                })

    return curriculum_data


# Standards files to pool into one training corpus (one JSON export per source).
json_file_paths = [
    "NLP_Final_Project/State_JSON/Alabama_Standards.json",
    "NLP_Final_Project/State_JSON/Chicago_Standards.json",
    "NLP_Final_Project/State_JSON/Columbia_County_Standards.json",
    "NLP_Final_Project/State_JSON/Florida_Standards.json",
    "NLP_Final_Project/State_JSON/Georgia_Standards.json",
    "NLP_Final_Project/State_JSON/Indiana_Standards.json",
    "NLP_Final_Project/State_JSON/Maine_Standards.json",
    "NLP_Final_Project/State_JSON/Nebraska_Standards.json",
    "NLP_Final_Project/State_JSON/North_Dakota_Standards.json",
    "NLP_Final_Project/State_JSON/Oklahoma_Standards.json",
    "NLP_Final_Project/State_JSON/Oregon_Standards.json",
    "NLP_Final_Project/State_JSON/Rhode_Island_Standards.json",
    "NLP_Final_Project/State_JSON/South_Carolina_Standards.json",
    "NLP_Final_Project/State_JSON/Texas_Standards.json",
    "NLP_Final_Project/State_JSON/Virginia_Standards.json",
    "NLP_Final_Project/State_JSON/Wisconsin_Standards.json",
    "NLP_Final_Project/State_JSON/literacy-0.8.0.json",
    "NLP_Final_Project/State_JSON/BEST_Standards.json",
    "NLP_Final_Project/State_JSON/Board_Education_Standards.json",
    "NLP_Final_Project/State_JSON/Gwinnett_County_Standards.json",
    "NLP_Final_Project/State_JSON/Idaho_Standards.json",
    "NLP_Final_Project/State_JSON/Pennsylvania_Standards.json",
    "NLP_Final_Project/State_JSON/Washington_DC_Standards.json"
]

# Parse each file in turn and accumulate its {"text", "label"} entries.
all_data = []
for file_path in json_file_paths:
    all_data += parse_json_with_multiple_grades(file_path)

# Quick look at the corpus size and the first few raw entries.
print(f"Total samples: {len(all_data)}")
print("Sample entry:", all_data[:3])

# Grade codes as they appear in the standards files -> grade index
# ("KG" is kindergarten = 0, "01".."12" are grades 1..12).
grade_mapping = {"KG": 0}
grade_mapping.update({f"{grade:02d}": grade for grade in range(1, 13)})

# Grade index -> cluster id used as the classification label:
#   cluster 0 = KG through 4th grade
#   cluster 1 = 5th through 8th grade
#   cluster 2 = 9th through 12th grade
grade_cluster_mapping = {
    grade: (0 if grade <= 4 else 1 if grade <= 8 else 2)
    for grade in range(13)
}


# Replace each entry's grade code with its cluster id in one pass.
# Unknown grade codes (and therefore unknown clusters) become -1.
for entry in all_data:
    grade_index = grade_mapping.get(entry["label"], -1)
    entry["label"] = grade_cluster_mapping.get(grade_index, -1)

# Drop the entries that could not be mapped to a cluster.
all_data = [entry for entry in all_data if entry["label"] != -1]
print(f"Filtered data samples: {len(all_data)}")

Total samples: 41797
Sample entry: [{'text': 'Kindergarten', 'label': 'KG'}, {'text': 'Recurring Standards', 'label': 'KG'}, {'text': 'Utilize active listening skills during discussion and conversation in pairs, small groups, or whole-class settings, following agreed-upon rules for participation.', 'label': 'KG'}]
Filtered data samples: 41710


In [None]:
# Pivot the list of entries into the column-oriented layout expected by
# Dataset.from_dict ({"text": [...], "label": [...]}).
all_data_dict = {
    column: [entry[column] for entry in all_data]
    for column in ("text", "label")
}
dataset = Dataset.from_dict(all_data_dict)

# Hold out 20% as the test set, then carve 10% of the remainder off as validation.
# NOTE(review): parse_json_with_multiple_grades emits one entry per grade for the
# same statement, so identical texts may land in different splits; confirm
# this is acceptable for the reported test accuracy.
full_split = dataset.train_test_split(test_size=0.2, seed=42)
train_val_split = full_split["train"].train_test_split(test_size=0.1, seed=42)

train_dataset, val_dataset = train_val_split["train"], train_val_split["test"]
test_dataset = full_split["test"]

print(f"Train: {len(train_dataset)}, Validation: {len(val_dataset)}, Test: {len(test_dataset)}")

tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# Tokenization function
def tokenize(batch):
    """
    Tokenize a batch of examples with the module-level `tokenizer`.

    Args:
        batch: Mapping with a "text" entry holding the texts to tokenize
            (as passed by `Dataset.map(..., batched=True)`).

    Returns:
        The tokenizer output for the batch. Texts longer than 512 tokens are
        truncated.

    No padding is applied here. Padding every short statement to 512 tokens
    makes each training/evaluation step far more expensive than necessary.
    The Trainer in this notebook is constructed with the tokenizer, in which
    case it pads each batch to its longest sequence at collation time
    (see the Hugging Face `Trainer` documentation for `data_collator`).
    """
    return tokenizer(batch["text"], truncation=True, max_length=512)

# Report texts that exceed the 512-token limit; `tokenize` truncates them.
# At this point entry["label"] already holds the cluster id.
for entry in all_data:
    token_count = len(tokenizer.tokenize(entry["text"]))
    if token_count <= 512:
        continue
    print(f"Text for grade {entry['label']} requires chunking. Token count: {token_count}")

# Tokenize the three splits.
train_dataset, val_dataset, test_dataset = [
    split.map(tokenize, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
]

# Expose only the model inputs and the label, as PyTorch tensors.
for split in (train_dataset, val_dataset, test_dataset):
    split.set_format("torch", columns=["input_ids", "attention_mask", "label"])

Train: 30031, Validation: 3337, Test: 8342


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (563 > 512). Running this sequence through the model will result in indexing errors


Text for grade 2 requires chunking. Token count: 563
Text for grade 2 requires chunking. Token count: 563
Text for grade 2 requires chunking. Token count: 563
Text for grade 2 requires chunking. Token count: 563


Map:   0%|          | 0/30031 [00:00<?, ? examples/s]

Map:   0%|          | 0/3337 [00:00<?, ? examples/s]

Map:   0%|          | 0/8342 [00:00<?, ? examples/s]

In [None]:
print(f"Total dataset entries: {len(all_data)}")

# NOTE: "label" holds the cluster id by now, so these are per-cluster counts.
grade_counts = Counter(entry["label"] for entry in all_data)
print("Dataset entries per grade:", grade_counts)

# Check unique labels in each split.
# The splits are in "torch" format, so every label comes back as a tensor.
# A set of tensors is not de-duplicated by value (the previous output listed
# the same tensor values many times), so convert to plain ints first.
train_labels = {int(label) for label in train_dataset["label"]}
val_labels = {int(label) for label in val_dataset["label"]}
test_labels = {int(label) for label in test_dataset["label"]}

print(f"Train labels: {train_labels}")
print(f"Validation labels: {val_labels}")
print(f"Test labels: {test_labels}")

# Expected output: {0, 1, 2} for every split (three grade clusters)

Total dataset entries: 41710
Dataset entries per grade: Counter({2: 16101, 0: 12953, 1: 12656})
Train labels: {tensor(2), tensor(1), tensor(0), tensor(1), tensor(0), tensor(0), tensor(1), tensor(0), tensor(2), tensor(1), tensor(0), tensor(2), tensor(2), tensor(2), tensor(0), tensor(2), tensor(2), tensor(0), tensor(0), tensor(1), tensor(0), tensor(2), tensor(0), tensor(2), tensor(2), tensor(1), tensor(1), tensor(0), tensor(0), tensor(1), tensor(0), tensor(2), tensor(0), tensor(2), tensor(1), tensor(1), tensor(2), tensor(0), tensor(1), tensor(0), tensor(1), tensor(1), tensor(1), tensor(0), tensor(0), tensor(2), tensor(2), tensor(0), tensor(2), tensor(2), tensor(2), tensor(2), tensor(2), tensor(0), tensor(0), tensor(0), tensor(1), tensor(2), tensor(2), tensor(0), tensor(1), tensor(1), tensor(2), tensor(0), tensor(2), tensor(2), tensor(2), tensor(1), tensor(0), tensor(2), tensor(0), tensor(0), tensor(2), tensor(1), tensor(1), tensor(0), tensor(2), tensor(0), tensor(2), tensor(1), tensor(2)

In [None]:
# RoBERTa with a sequence-classification head.
# num_labels=3 matches the three cluster ids (0, 1, 2) produced by
# grade_cluster_mapping, not the 13 individual grade levels.
model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-base", num_labels=3  # Must equal the number of distinct labels (grades or clusters) in the data
)

# Training arguments
training_args = TrainingArguments(
    output_dir="NLP_Final_Project/Model_Results_Roberta",
    eval_strategy="epoch",  # same cadence as save_strategy below
    logging_steps=50,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=12,
    weight_decay=0.01,
    logging_dir="NLP_Final_Project/Model_logs_Roberta",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # the key returned by compute_metrics
    save_strategy="epoch",
    save_total_limit=2,
)

metric = load("accuracy")

def compute_metrics(eval_pred):
    """
    Score predictions with the module-level accuracy `metric`.

    Args:
        eval_pred: A (logits, labels) pair as handed over by the Trainer.

    Returns:
        The result of `metric.compute`, comparing the highest-scoring class
        of each row of logits against the reference labels.
    """
    logits, references = eval_pred
    return metric.compute(
        predictions=np.argmax(logits, axis=-1),
        references=references,
    )

# Trainer: ties together the model, the training arguments, the train and
# validation splits, the tokenizer and the accuracy metric.
# NOTE(review): the training output shows a warning pointing at this
# `Trainer(` call; it presumably concerns the `tokenizer=` argument — confirm
# against the installed transformers version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate on the validation set once more, after training has finished.
# training_args sets load_best_model_at_end=True, so this presumably scores
# the best checkpoint by validation accuracy rather than the last epoch —
# verify.
val_results = trainer.evaluate(val_dataset)
print("Validation Results:", val_results)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6378,0.632386,0.702128
2,0.5616,0.570276,0.712616
3,0.5277,0.574412,0.728199
4,0.5218,0.571157,0.732394
5,0.4463,0.569249,0.724903
6,0.45,0.554237,0.733293


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6378,0.632386,0.702128
2,0.5616,0.570276,0.712616
3,0.5277,0.574412,0.728199
4,0.5218,0.571157,0.732394
5,0.4463,0.569249,0.724903
6,0.45,0.554237,0.733293
7,0.4573,0.59265,0.748277
8,0.459,0.563828,0.740785
9,0.4281,0.579887,0.752173
10,0.412,0.594519,0.741085


Validation Results: {'eval_loss': 0.5798866152763367, 'eval_accuracy': 0.7521726101288583, 'eval_runtime': 21.8113, 'eval_samples_per_second': 152.994, 'eval_steps_per_second': 9.582, 'epoch': 12.0}


In [None]:
# Score the held-out test split with the trained model.
test_results = trainer.evaluate(eval_dataset=test_dataset)
print("Test Results:", test_results)

# Persist the fine-tuned model.
trainer.save_model("NLP_Final_Project/Fine_Tuned_Model_Roberta")

# Persist the test metrics as JSON in the working directory.
with open("test_results.json", "w") as results_file:
    results_file.write(json.dumps(test_results))

Test Results: {'eval_loss': 0.5926963686943054, 'eval_accuracy': 0.7439462958523136, 'eval_runtime': 54.1242, 'eval_samples_per_second': 154.127, 'eval_steps_per_second': 9.644, 'epoch': 12.0}


In [None]:
# Run the model on the test split and keep the predicted class per example.
test_output = trainer.predict(test_dataset)
labels = test_output.label_ids
predictions = np.argmax(test_output.predictions, axis=1)

# Per-cluster precision / recall / F1 (clusters are named "0", "1", "2").
cluster_names = [str(cluster) for cluster in range(3)]
print(classification_report(labels, predictions, target_names=cluster_names))

NameError: name 'trainer' is not defined

In [None]:
def chunk_text(text, max_tokens, tokenizer):
    """
    Break a text into consecutive pieces of at most `max_tokens` tokens.

    Args:
        text: The text to split.
        max_tokens: Maximum number of tokens per piece.
        tokenizer: Object providing `tokenize(text)` and
            `convert_tokens_to_string(tokens)`.

    Returns:
        A list of strings, one per piece, in original order. A text that
        yields no tokens produces an empty list.
    """
    pieces = tokenizer.tokenize(text)

    chunks = []
    for start in range(0, len(pieces), max_tokens):
        window = pieces[start : start + max_tokens]
        chunks.append(tokenizer.convert_tokens_to_string(window))
    return chunks


In [None]:
# Extract true grades and chunked texts from a standards file
def extract_and_chunk_json(json_file, grade_mapping, tokenizer, max_tokens=128, tag_chunks=True):
    """
    Extract true grades and chunked texts from the JSON file.

    Args:
        json_file: Path to the standards JSON file.
        grade_mapping: Mapping from grade code (as stored in the file) to grade
            index; entries whose grade code is not in the mapping are skipped.
        tokenizer: Tokenizer handed to `chunk_text`.
        max_tokens: Maximum number of tokens per chunk (must be >= 1).
        tag_chunks: When True (default, the historical behaviour) every chunk
            is prefixed with "Grade_<grade>_<n>: ". Pass False to get the bare
            chunk text. Be aware that the tag writes the TRUE grade into the
            text itself, which matters if the chunks are fed to a classifier.

    Returns:
        A defaultdict mapping each grade index to its list of chunk strings.

    Raises:
        ValueError: If `max_tokens` is smaller than 1.
    """
    if max_tokens < 1:
        raise ValueError(f"max_tokens must be >= 1, got {max_tokens}")

    chunked_texts_by_grade = defaultdict(list)

    for entry in parse_json_with_multiple_grades(json_file):
        true_grade = grade_mapping.get(entry["label"], -1)
        if true_grade == -1:
            continue  # Skip invalid grades

        # Chunk text
        chunks = chunk_text(entry["text"], max_tokens, tokenizer)

        # Store each chunk, optionally prefixed with its grade and 1-based position
        for position, chunk in enumerate(chunks, start=1):
            if tag_chunks:
                chunk = f"Grade_{true_grade}_{position}: {chunk}"
            chunked_texts_by_grade[true_grade].append(chunk)

    return chunked_texts_by_grade

In [None]:
def predict_chunks(trainer, dataset, verbose=True):
    """
    Predict a cluster per grade by averaging the chunk-level probabilities.

    Args:
        trainer: Object whose `predict(dataset)` result exposes the logits as
            `.predictions` (one row per chunk).
        dataset: Dataset passed to `trainer.predict`; `dataset["grade"]` must
            give the grade of each chunk, in the same order as the logits.
        verbose: When True (default) print the per-chunk and averaged
            probabilities of every grade. Pass False to avoid flooding the
            notebook output when there are many chunks.

    Returns:
        A dict keyed by grade (int). Each value is a dict with:
            "predicted_cluster": index of the highest averaged probability
                (a plain int),
            "chunk_probabilities": list of per-chunk probability vectors,
            "average_probabilities": mean probability vector over the chunks.
    """
    # Predict logits for each chunk and turn them into probabilities
    logits = trainer.predict(dataset).predictions
    probabilities = torch.softmax(torch.tensor(logits), dim=-1).numpy()

    # Group the chunk probabilities by grade
    aggregated_results = defaultdict(list)
    for grade, chunk_probs in zip(dataset["grade"], probabilities):
        aggregated_results[int(grade)].append(chunk_probs)  # Ensure grade is int

    # Average within each grade and pick the most likely cluster
    final_results = {}
    for grade, grade_probs in aggregated_results.items():
        avg_probs = np.mean(grade_probs, axis=0)

        final_results[grade] = {
            # int() keeps the result a plain Python value (e.g. JSON-serialisable)
            "predicted_cluster": int(np.argmax(avg_probs)),
            "chunk_probabilities": grade_probs,
            "average_probabilities": avg_probs,
        }

        if verbose:
            print(f"Probabilities for grade {grade}: {grade_probs}")
            print(f"Average probabilities for grade {grade}: {avg_probs}")

    return final_results

In [None]:
def analyze_results_with_chunks(evaluation_results, cluster_mapping=None):
    """
    Compute metrics for binary classification and cluster correction for chunked data.

    Args:
        evaluation_results: Dict keyed by true grade; each value is a dict with
            at least "predicted_cluster" and "average_probabilities" (the
            structure returned by `predict_chunks`).
        cluster_mapping: Optional grade -> cluster mapping used to derive the
            true cluster. Defaults to the module-level `grade_cluster_mapping`
            as it is bound at call time, which is the historical behaviour.

    Returns:
        A dict with two entries, "binary_classification" and
        "cluster_correction", each holding "accuracy", "precision", "recall"
        and "f1_score".

    Notes:
        * Binary metrics use an all-ones reference (every grade counts as
          "appropriate"); the prediction is 1 when the predicted cluster
          equals the true cluster.
        * Cluster metrics are macro-averaged over the clusters present.
        * `zero_division=0` yields the same values as scikit-learn's default
          for undefined precision/recall, without the warning.
    """
    if cluster_mapping is None:
        cluster_mapping = grade_cluster_mapping

    binary_metrics = defaultdict(list)
    cluster_metrics = defaultdict(list)

    for true_grade, result in evaluation_results.items():
        # Convert true_grade to int if not already
        true_grade = int(true_grade)

        # Get true cluster and predicted cluster
        true_cluster = cluster_mapping.get(true_grade, -1)
        predicted_cluster = result.get("predicted_cluster")

        if true_cluster == -1 or predicted_cluster is None:
            print(f"Skipping invalid grade {true_grade}: true_cluster={true_cluster}, predicted_cluster={predicted_cluster}")
            continue

        # Binary classification metrics
        binary_label = 1 if true_cluster == predicted_cluster else 0
        binary_metrics["true"].append(1)  # All grades are inherently "Appropriate"
        binary_metrics["predicted"].append(binary_label)

        # Cluster correction metrics
        cluster_metrics["true"].append(true_cluster)
        cluster_metrics["predicted"].append(predicted_cluster)

        # Debug: Print probabilities for each grade
        print(f"True Grade: {true_grade}, Predicted Cluster: {predicted_cluster}")
        print(f"Average Probabilities: {result['average_probabilities']}")

    # Compute final metrics
    binary_accuracy = accuracy_score(binary_metrics["true"], binary_metrics["predicted"])
    binary_precision, binary_recall, binary_f1, _ = precision_recall_fscore_support(
        binary_metrics["true"], binary_metrics["predicted"], average="binary", zero_division=0
    )
    cluster_accuracy = accuracy_score(cluster_metrics["true"], cluster_metrics["predicted"])
    cluster_precision, cluster_recall, cluster_f1, _ = precision_recall_fscore_support(
        cluster_metrics["true"], cluster_metrics["predicted"], average="macro", zero_division=0
    )

    print(f"Binary Metrics True Labels: {binary_metrics['true']}")
    print(f"Binary Metrics Predicted Labels: {binary_metrics['predicted']}")

    return {
        "binary_classification": {
            "accuracy": binary_accuracy,
            "precision": binary_precision,
            "recall": binary_recall,
            "f1_score": binary_f1,
        },
        "cluster_correction": {
            "accuracy": cluster_accuracy,
            "precision": cluster_precision,
            "recall": cluster_recall,
            "f1_score": cluster_f1,
        },
    }

In [None]:
# Chunk the held-out New York standards, grouped by true grade.
chunked_texts = extract_and_chunk_json(
    "NLP_Final_Project/State_JSON/New_York_Standards.json",
    grade_mapping,
    tokenizer,
    max_tokens=128,
)

# Flatten into two parallel lists: every chunk and the grade it belongs to.
texts = [chunk for chunks in chunked_texts.values() for chunk in chunks]
grades = [grade for grade, chunks in chunked_texts.items() for _ in chunks]

# Build and tokenize the evaluation dataset.
# NOTE: this rebinds `dataset`, which earlier held the full training corpus.
dataset = Dataset.from_dict({"text": texts, "grade": grades}).map(tokenize, batched=True)
dataset.set_format("torch", columns=["input_ids", "attention_mask", "grade"])

Map:   0%|          | 0/2690 [00:00<?, ? examples/s]

'print("Binary Classification Metrics:")\nfor metric, value in metrics["binary_classification"].items():\n    print(f"{metric.capitalize()}: {value:.4f}")\n\nprint("\nCluster Correction Metrics:")\nfor metric, value in metrics["cluster_correction"].items():\n    print(f"{metric.capitalize()}: {value:.4f}")'

In [None]:
# Candidate grade -> cluster groupings to score the chunk predictions against.

# Baseline: Curriculum-based
grade_cluster_mapping_baseline = {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 1, 6: 1, 7: 1, 8: 1, 9: 2, 10: 2, 11: 2, 12: 2}

# Practical grouping
grade_cluster_mapping_practical = {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 1, 7: 1, 8: 1, 9: 2, 10: 2, 11: 2, 12: 2}

# Teaching pairs
grade_cluster_mapping_pairs = {
    0: 0, 1: 0, 2: 1, 3: 1, 4: 2, 5: 2, 6: 3, 7: 3, 8: 4, 9: 4, 10: 5, 11: 5, 12: 5
}

# Teaching triple
grade_cluster_mapping_triple = {
    0: 0, 1: 0, 2: 0, 3: 1, 4: 1, 5: 1, 6: 2, 7: 2, 8: 2, 9: 3, 10: 3, 11: 3, 12: 3
}

# Individual grades
grade_cluster_mapping_individual = {i: i for i in range(13)}

cluster_schemes = {
    "baseline": grade_cluster_mapping_baseline,
    "practical": grade_cluster_mapping_practical,
    "pairs": grade_cluster_mapping_pairs,
    "triple": grade_cluster_mapping_triple,
    "individual": grade_cluster_mapping_individual,
}

# The chunk predictions depend only on the model and the dataset, not on the
# grouping scheme, so run inference once instead of once per scheme.
evaluation_results = predict_chunks(trainer, dataset)

# NOTE(review): the model is created with num_labels=3, so predicted clusters
# are always 0, 1 or 2, and cluster ids are compared purely by number. Schemes
# whose ids go beyond 2 or carry a different meaning ("pairs", "triple",
# "individual") are therefore not scored in the model's own label space —
# confirm this comparison is intended.

# analyze_results_with_chunks reads the module-level `grade_cluster_mapping`,
# so each scheme is swapped in for its evaluation. The original mapping is
# restored afterwards so later cells do not silently see the last scheme.
results = {}
original_grade_cluster_mapping = grade_cluster_mapping
try:
    for scheme_name, mapping in cluster_schemes.items():
        grade_cluster_mapping = mapping
        metrics = analyze_results_with_chunks(evaluation_results)
        results[scheme_name] = metrics
        print(f"Results for {scheme_name}:")
        print(metrics)
finally:
    grade_cluster_mapping = original_grade_cluster_mapping

Probabilities for grade 0: [array([0.9034232 , 0.07256478, 0.02401198], dtype=float32), array([0.5569984, 0.2915005, 0.1515011], dtype=float32), array([0.95011586, 0.04716082, 0.00272326], dtype=float32), array([9.9823111e-01, 1.6138388e-03, 1.5504447e-04], dtype=float32), array([0.7888306 , 0.19754978, 0.01361973], dtype=float32), array([0.42669067, 0.34066722, 0.23264214], dtype=float32), array([0.40732375, 0.36501467, 0.22766149], dtype=float32), array([0.90123886, 0.08097551, 0.01778564], dtype=float32), array([0.9597185 , 0.03723876, 0.00304272], dtype=float32), array([0.4341742 , 0.30619895, 0.25962675], dtype=float32), array([0.6623326, 0.2950619, 0.0426055], dtype=float32), array([0.53935945, 0.32838047, 0.13226007], dtype=float32), array([0.19344789, 0.4418818 , 0.3646703 ], dtype=float32), array([0.89983004, 0.09251562, 0.00765433], dtype=float32), array([0.51768297, 0.43583763, 0.04647945], dtype=float32), array([0.4602189 , 0.30465496, 0.23512612], dtype=float32), array([0.

Probabilities for grade 0: [array([0.9034232 , 0.07256478, 0.02401198], dtype=float32), array([0.5569984, 0.2915005, 0.1515011], dtype=float32), array([0.95011586, 0.04716082, 0.00272326], dtype=float32), array([9.9823111e-01, 1.6138388e-03, 1.5504447e-04], dtype=float32), array([0.7888306 , 0.19754978, 0.01361973], dtype=float32), array([0.42669067, 0.34066722, 0.23264214], dtype=float32), array([0.40732375, 0.36501467, 0.22766149], dtype=float32), array([0.90123886, 0.08097551, 0.01778564], dtype=float32), array([0.9597185 , 0.03723876, 0.00304272], dtype=float32), array([0.4341742 , 0.30619895, 0.25962675], dtype=float32), array([0.6623326, 0.2950619, 0.0426055], dtype=float32), array([0.53935945, 0.32838047, 0.13226007], dtype=float32), array([0.19344789, 0.4418818 , 0.3646703 ], dtype=float32), array([0.89983004, 0.09251562, 0.00765433], dtype=float32), array([0.51768297, 0.43583763, 0.04647945], dtype=float32), array([0.4602189 , 0.30465496, 0.23512612], dtype=float32), array([0.

Probabilities for grade 0: [array([0.9034232 , 0.07256478, 0.02401198], dtype=float32), array([0.5569984, 0.2915005, 0.1515011], dtype=float32), array([0.95011586, 0.04716082, 0.00272326], dtype=float32), array([9.9823111e-01, 1.6138388e-03, 1.5504447e-04], dtype=float32), array([0.7888306 , 0.19754978, 0.01361973], dtype=float32), array([0.42669067, 0.34066722, 0.23264214], dtype=float32), array([0.40732375, 0.36501467, 0.22766149], dtype=float32), array([0.90123886, 0.08097551, 0.01778564], dtype=float32), array([0.9597185 , 0.03723876, 0.00304272], dtype=float32), array([0.4341742 , 0.30619895, 0.25962675], dtype=float32), array([0.6623326, 0.2950619, 0.0426055], dtype=float32), array([0.53935945, 0.32838047, 0.13226007], dtype=float32), array([0.19344789, 0.4418818 , 0.3646703 ], dtype=float32), array([0.89983004, 0.09251562, 0.00765433], dtype=float32), array([0.51768297, 0.43583763, 0.04647945], dtype=float32), array([0.4602189 , 0.30465496, 0.23512612], dtype=float32), array([0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Probabilities for grade 0: [array([0.9034232 , 0.07256478, 0.02401198], dtype=float32), array([0.5569984, 0.2915005, 0.1515011], dtype=float32), array([0.95011586, 0.04716082, 0.00272326], dtype=float32), array([9.9823111e-01, 1.6138388e-03, 1.5504447e-04], dtype=float32), array([0.7888306 , 0.19754978, 0.01361973], dtype=float32), array([0.42669067, 0.34066722, 0.23264214], dtype=float32), array([0.40732375, 0.36501467, 0.22766149], dtype=float32), array([0.90123886, 0.08097551, 0.01778564], dtype=float32), array([0.9597185 , 0.03723876, 0.00304272], dtype=float32), array([0.4341742 , 0.30619895, 0.25962675], dtype=float32), array([0.6623326, 0.2950619, 0.0426055], dtype=float32), array([0.53935945, 0.32838047, 0.13226007], dtype=float32), array([0.19344789, 0.4418818 , 0.3646703 ], dtype=float32), array([0.89983004, 0.09251562, 0.00765433], dtype=float32), array([0.51768297, 0.43583763, 0.04647945], dtype=float32), array([0.4602189 , 0.30465496, 0.23512612], dtype=float32), array([0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Probabilities for grade 0: [array([0.9034232 , 0.07256478, 0.02401198], dtype=float32), array([0.5569984, 0.2915005, 0.1515011], dtype=float32), array([0.95011586, 0.04716082, 0.00272326], dtype=float32), array([9.9823111e-01, 1.6138388e-03, 1.5504447e-04], dtype=float32), array([0.7888306 , 0.19754978, 0.01361973], dtype=float32), array([0.42669067, 0.34066722, 0.23264214], dtype=float32), array([0.40732375, 0.36501467, 0.22766149], dtype=float32), array([0.90123886, 0.08097551, 0.01778564], dtype=float32), array([0.9597185 , 0.03723876, 0.00304272], dtype=float32), array([0.4341742 , 0.30619895, 0.25962675], dtype=float32), array([0.6623326, 0.2950619, 0.0426055], dtype=float32), array([0.53935945, 0.32838047, 0.13226007], dtype=float32), array([0.19344789, 0.4418818 , 0.3646703 ], dtype=float32), array([0.89983004, 0.09251562, 0.00765433], dtype=float32), array([0.51768297, 0.43583763, 0.04647945], dtype=float32), array([0.4602189 , 0.30465496, 0.23512612], dtype=float32), array([0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Metrics of the last scheme evaluated above.
print(metrics)

# Which grades were evaluated, plus one example (grade, result) pair.
result_keys = list(evaluation_results)
sample_result = next(iter(evaluation_results.items()), 'No data')
print(f"Evaluation Results Keys: {result_keys}")
print(f"Sample Evaluation Result: {sample_result}")


In [None]:
# Dump the aggregated prediction details of every grade (debugging aid).
for grade, result in evaluation_results.items():
    print(
        f"True Grade: {grade}",
        f"Predicted Cluster: {result['predicted_cluster']}",
        f"Chunk Probabilities: {result['chunk_probabilities']}",
        f"Average Probabilities: {result['average_probabilities']}",
        sep="\n",
    )


In [None]:
# Check 1: every evaluated grade must have a cluster in the active mapping.
unmapped_grades = [grade for grade in evaluation_results if grade not in grade_cluster_mapping]
for grade in unmapped_grades:
    print(f"Warning: Grade {grade} not found in grade_cluster_mapping.")

# Check 2: the stored predicted cluster must be the arg-max of the averaged
# probabilities it was derived from.
for grade, result in evaluation_results.items():
    predicted_cluster = result['predicted_cluster']
    max_prob_cluster = np.argmax(result['average_probabilities'])
    if predicted_cluster == max_prob_cluster:
        continue
    print(f"Inconsistent Prediction for Grade {grade}: Predicted={predicted_cluster}, Max Prob={max_prob_cluster}")