# Academic Specific Code

## Install Libraries

In [None]:
!pip install transformers torch
!pip install transformers torch scikit-learn
!pip install ipywidgets --upgrade
!pip install datasets --upgrade
!pip install pyarrow --upgrade
!pip install huggingface_hub
!pip install torch
!pip install transformers
!pip install scikit-learn
!pip install datasets
!pip install datasketch
!pip install transformers[torch] accelerate
!pip install ipywidgets
!pip install ipywidgets
!jupyter labextension install @jupyter-widgets/jupyterlab-manager
!pip install requests
!pip install tiktoken
!pip install sentencepiece
!pip install --upgrade notebook ipywidgets
!pip install openai

## Import Libraries

In [None]:
import ipywidgets as widgets
widgets.IntSlider()

import json
from sklearn.metrics import f1_score
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments, AutoModelForSequenceClassification
import torch
from transformers import T5Tokenizer, AutoModelForSeq2SeqLM
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sentencepiece import SentencePieceProcessor

print("SentencePiece is installed and ready to use.")

device = torch.device("cpu")
print(f"Using device: {device}")


## Configuration

In [None]:
config = {
    "model_name": "UBC-NLP/araT5-base",  # Load AraT5 locally
    "fine_tune_model": "aubmindlab/bert-base-arabertv02",  # Model to fine-tune
    "threshold": 3,  # Minimum score for high-quality content
    "annotation_samples": 100,  # Total number of samples annotated with AraT5.
    "validation_samples": 30,  # Subset of annotation_samples reserved for validation
    "max_samples_to_fine_tune": 70,  # Maximum annotated samples used for fine-tuning
    "epochs": 5,  # Fine-tuning epochs
    "batch_size": 4,  # Lower the batch size to reduce memory usage
}

## Arabic Rubric Prompt

In [None]:
arabic_prompt = """
فيما يلي مقتطف من صفحة ويب. قم بتقييم ما إذا كانت الصفحة ذات قيمة تعليمية عالية ويمكن أن تكون مفيدة في بيئة تعليمية لتدريس المستويات من المرحلة الابتدائية إلى المرحلة الإعدادية باستخدام نظام تقييم مكون من 5 نقاط تراكمية وفقًا للمعايير التالية:
أضف نقطة واحدة إذا كان المقتطف يقدم بعض المعلومات الأساسية ذات الصلة بالموضوعات التعليمية، حتى لو تضمن محتوى غير ذي صلة أو غير أكاديمي مثل الإعلانات والمواد الترويجية.
•	أضف نقطة أخرى إذا كان المقتطف يتناول بعض العناصر ذات الصلة بالتعليم ولكنه لا يتماشى بشكل وثيق مع المعايير التعليمية. قد يخلط بين المحتوى التعليمي وغير التعليمي، ويقدم نظرة عامة سطحية عن موضوعات قد تكون مفيدة، أو يعرض المعلومات بطريقة غير منظمة وأسلوب كتابة غير واضح.
•	امنح نقطة ثالثة إذا كان المقتطف مناسبًا للاستخدام التعليمي ويقدم مفاهيم رئيسية ذات صلة بالمناهج المدرسية. يكون المحتوى واضحًا ولكنه قد لا يكون شاملاً، أو قد يتضمن بعض المعلومات الزائدة. قد يشبه القسم التمهيدي لكتاب مدرسي أو درس تعليمي بسيط مناسب للتعلم ولكنه يحتوي على بعض القيود مثل معالجة مفاهيم معقدة جدًا لطلاب المرحلة الإعدادية.
•	امنح نقطة رابعة إذا كان المقتطف ذا صلة كبيرة ومفيدًا للأغراض التعليمية لمستوى لا يتجاوز المرحلة الإعدادية، مع أسلوب كتابة واضح ومتسق. يمكن أن يشبه فصلًا من كتاب مدرسي أو درسًا تعليميًا، حيث يقدم محتوى تعليميًا غنيًا، بما في ذلك التمارين والحلول، مع الحد الأدنى من المعلومات غير ذات الصلة، والمفاهيم ليست معقدة للغاية لطلاب هذه المرحلة. يكون المحتوى منظمًا ومركّزًا وقيمًا للتعلم المنهجي.
•	امنح نقطة خامسة إذا كان المقتطف ممتازًا في قيمته التعليمية ومناسبًا تمامًا للتدريس في المرحلة الابتدائية أو الإعدادية. يتبع المقتطف منطقًا تفصيليًا، وأسلوب الكتابة سهل الفهم، ويقدم رؤى عميقة وشاملة حول الموضوع دون أي محتوى غير تعليمي أو معقد.
المقتطف: <EXAMPLE>. بعد فحص المقتطف:
•	برر بإيجاز مجموع النقاط، بحد أقصى 100 كلمة.
•	اختتم بالنقاط الإجمالية بالتنسيق التالي: "التقييم التعليمي: <مجموع النقاط>".
"""

## Custom Dataset Class
standardizes the process of preparing text data for machine learning models by tokenizing text, truncating or padding sequences to a fixed length, and formatting inputs and labels into PyTorch tensors. This enables efficient batching and compatibility with PyTorch's DataLoader for training and evaluation.

In [None]:
from torch.utils.data import Dataset
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item["labels"] = torch.tensor(label, dtype=torch.long)
        return item

## Step 1: Load Dataset

In [None]:
def load_dataset(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)
    return [{"text": item["text"], "metadata": item["metadata"]} for item in data]


## Step 2: Annotate Data Locally with AraT5

In [None]:
def annotate_samples(samples, model_name):
    tokenizer = T5Tokenizer.from_pretrained(model_name, legacy=False)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    annotated_data = []

    for sample in samples:
        text = sample["text"]
        prompt = arabic_prompt.format(text=text)

        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
        outputs = model.generate(**inputs, max_length=100)
        result = tokenizer.decode(outputs[0], skip_special_tokens=True)

        scores = extract_scores(result)
        annotated_data.append({"text": text, "scores": scores, "metadata": sample["metadata"]})

    return annotated_data

## Extract Scores

In [None]:
def extract_scores(output):
    lines = output.split("\n")
    scores = {}
    for line in lines:
        if "ملاءمة" in line:
            scores["relevance"] = int(line.split(":")[-1].strip())
        elif "وضوح" in line:
            scores["clarity"] = int(line.split(":")[-1].strip())
        elif "عمق" in line:
            scores["depth"] = int(line.split(":")[-1].strip())
    total = sum(scores.values())
    scores["total"] = total
    return scores

## Step 3: Fine-Tune AraBERT

In [None]:
def fine_tune_arabert(train_data, tokenizer, model):
    texts = [item["text"] for item in train_data]
    labels = [item["scores"]["total"] for item in train_data]

    dataset = CustomDataset(texts, labels, tokenizer)
    model.to(device)

    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=config["epochs"],
        per_device_train_batch_size=config["batch_size"],
        save_steps=10_000,
        save_total_limit=2,
        no_cuda=True,
        logging_dir="./logs",
        logging_steps=100,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
    )

    print("Starting fine-tuning...")
    trainer.train()
    print("Fine-tuning complete.")

## Step 4: Predict with Fine-Tuned AraBERT

In [None]:
def predict_with_arabert(unlabeled_data, model, tokenizer):
    model.eval()
    model.to(device)
    predictions = []

    for sample in unlabeled_data:
        text = sample["text"]
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        inputs = {key: val.to(device) for key, val in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs.logits
        predicted_label = logits.argmax(dim=-1).item()

        predictions.append({"text": text, "predicted_score": predicted_label, "metadata": sample["metadata"]})

    return predictions

## Step 5: Validate Model

In [None]:
def validate_model(validation_data, model, tokenizer):
    model.to(device)
    true_labels = []
    predicted_labels = []

    for item in validation_data:
        text = item["text"]
        true_labels.append(item["scores"]["total"])
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        inputs = {key: val.to(device) for key, val in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs.logits
        predicted_label = logits.argmax(dim=-1).item()
        predicted_labels.append(predicted_label)

    f1 = f1_score(true_labels, predicted_labels, average="macro")
    accuracy = accuracy_score(true_labels, predicted_labels)
    precision = precision_score(true_labels, predicted_labels, average="macro", zero_division=0)
    recall = recall_score(true_labels, predicted_labels, average="macro", zero_division=0)
    conf_matrix = confusion_matrix(true_labels, predicted_labels)

    print(f"Validation F1 Score: {f1:.2f}")
    print(f"Validation Accuracy: {accuracy:.2f}")
    print(f"Validation Precision: {precision:.2f}")
    print(f"Validation Recall: {recall:.2f}")
    print("Confusion Matrix:")
    print(conf_matrix)

## Step 6: Filter Dataset

In [None]:
def filter_dataset(annotated_data, threshold):
    return [
        doc for doc in annotated_data
        if doc["predicted_score"] >= threshold
    ]

## Main Pipeline

In [None]:
def main_pipeline():
    # Step 1: Load the dataset from a specified JSON file
    dataset = load_dataset("/Users/ameeraattiah/Desktop/arabicweb24/jeje.json")
    print(f"Loaded {len(dataset)} samples.")

    # Step 2: Select a subset of the dataset for annotation
    sample_data = dataset[:config["annotation_samples"]]
    annotated_data = annotate_samples(sample_data, config["model_name"])
    print(f"Annotated {len(annotated_data)} samples.")

    # Step 3: Load the tokenizer and model for fine-tuning
    tokenizer = AutoTokenizer.from_pretrained(config["fine_tune_model"])
    model = AutoModelForSequenceClassification.from_pretrained(config["fine_tune_model"], num_labels=6)

    # Step 4: Fine-tune the model using the annotated data
    fine_tune_arabert(annotated_data, tokenizer, model)

    # Step 5: Use the fine-tuned model to predict the remaining dataset
    remaining_data = dataset[config["annotation_samples"]:]
    predictions = predict_with_arabert(remaining_data, model, tokenizer)
    print(f"Predicted {len(predictions)} samples with fine-tuned AraBERT.")

    # Step 6: Validate the fine-tuned model on a subset of the annotated data
    validation_data = annotated_data[:config["validation_samples"]]
    validate_model(validation_data, model, tokenizer)

    # Step 7: Filter the predictions to include only high-quality samples
    filtered_data = filter_dataset(predictions, config["threshold"])
    print(f"Filtered dataset contains {len(filtered_data)} high-quality samples.")

    # Step 8: Save the filtered data to a JSON file for future use
    with open("/Users/ameeraattiah/Desktop/arabicweb24/jeje-edu.json", "w", encoding="utf-8") as file:
        json.dump(filtered_data, file, ensure_ascii=False, indent=4)
    print("Filtered data saved.")

main_pipeline()


## Original Code

In [1]:
!pip install transformers torch
!pip install transformers torch scikit-learn
!pip install ipywidgets --upgrade
!pip install datasets --upgrade
!pip install pyarrow --upgrade
!pip install huggingface_hub
!pip install torch
!pip install transformers
!pip install scikit-learn
!pip install datasets
!pip install datasketch
!pip install transformers[torch] accelerate
!pip install ipywidgets
!pip install ipywidgets
!jupyter labextension install @jupyter-widgets/jupyterlab-manager
!pip install requests
!pip install tiktoken
!pip install sentencepiece
!pip install --upgrade notebook ipywidgets
!pip install openai

zsh:1: no matches found: transformers[torch]
[33m(Deprecated) Installing extensions with the jupyter labextension install command is now deprecated and will be removed in a future major version of JupyterLab.

Users should manage prebuilt extensions with package managers like pip and conda, and extension authors are encouraged to distribute their extensions as prebuilt packages [0m


In [13]:
import ipywidgets as widgets
widgets.IntSlider()

import json
from sklearn.metrics import f1_score
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments, AutoModelForSequenceClassification
import torch
from transformers import T5Tokenizer, AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

from sentencepiece import SentencePieceProcessor
print("SentencePiece is installed and ready to use.")

# Device Selection
# device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")
print(f"Using device: {device}")


# Configuration
config = {
    "model_name": "UBC-NLP/araT5-base",  # Load AraT5 locally
    "fine_tune_model": "aubmindlab/bert-base-arabertv02",  # Model to fine-tune
    "threshold": 1,  # Minimum score for high-quality content
    
    "annotation_samples": 100,  # Total number of samples annotated with AraT5.
    "validation_samples": 30,  # Subset of annotation_samples reserved for validation
    "max_samples_to_fine_tune": 70,  # Maximum annotated samples used for fine-tuning
    
    "epochs": 5,  # Fine-tuning epochs
    "batch_size": 4,  # Lower the batch size to reduce memory usage
}


# Arabic Rubric Prompt for Additive Scoring
# arabic_prompt = """
# النص التالي مقتبس من محتوى تعليمي. يرجى تقييم جودة النص بناءً على المعايير التالية:
# 1. مدى ملاءمة النص لموضوعات المناهج الدراسية (0-2).
# 2. وضوح النص وسهولة فهمه بالنسبة للطلاب (0-2).
# 3. عمق المحتوى التعليمي المقدم (0-1).

# امنح نقاطًا لكل معيار على حدة، ثم احسب المجموع النهائي (0-5).

# النص: "{text}"
# """

arabic_prompt = """
فيما يلي مقتطف من صفحة ويب. قم بتقييم ما إذا كانت الصفحة ذات قيمة تعليمية عالية ويمكن أن تكون مفيدة في بيئة تعليمية لتدريس المستويات من المرحلة الابتدائية إلى المرحلة الإعدادية باستخدام نظام تقييم مكون من 5 نقاط تراكمية وفقًا للمعايير التالية:
أضف نقطة واحدة إذا كان المقتطف يقدم بعض المعلومات الأساسية ذات الصلة بالموضوعات التعليمية، حتى لو تضمن محتوى غير ذي صلة أو غير أكاديمي مثل الإعلانات والمواد الترويجية.
•	أضف نقطة أخرى إذا كان المقتطف يتناول بعض العناصر ذات الصلة بالتعليم ولكنه لا يتماشى بشكل وثيق مع المعايير التعليمية. قد يخلط بين المحتوى التعليمي وغير التعليمي، ويقدم نظرة عامة سطحية عن موضوعات قد تكون مفيدة، أو يعرض المعلومات بطريقة غير منظمة وأسلوب كتابة غير واضح.
•	امنح نقطة ثالثة إذا كان المقتطف مناسبًا للاستخدام التعليمي ويقدم مفاهيم رئيسية ذات صلة بالمناهج المدرسية. يكون المحتوى واضحًا ولكنه قد لا يكون شاملاً، أو قد يتضمن بعض المعلومات الزائدة. قد يشبه القسم التمهيدي لكتاب مدرسي أو درس تعليمي بسيط مناسب للتعلم ولكنه يحتوي على بعض القيود مثل معالجة مفاهيم معقدة جدًا لطلاب المرحلة الإعدادية.
•	امنح نقطة رابعة إذا كان المقتطف ذا صلة كبيرة ومفيدًا للأغراض التعليمية لمستوى لا يتجاوز المرحلة الإعدادية، مع أسلوب كتابة واضح ومتسق. يمكن أن يشبه فصلًا من كتاب مدرسي أو درسًا تعليميًا، حيث يقدم محتوى تعليميًا غنيًا، بما في ذلك التمارين والحلول، مع الحد الأدنى من المعلومات غير ذات الصلة، والمفاهيم ليست معقدة للغاية لطلاب هذه المرحلة. يكون المحتوى منظمًا ومركّزًا وقيمًا للتعلم المنهجي.
•	امنح نقطة خامسة إذا كان المقتطف ممتازًا في قيمته التعليمية ومناسبًا تمامًا للتدريس في المرحلة الابتدائية أو الإعدادية. يتبع المقتطف منطقًا تفصيليًا، وأسلوب الكتابة سهل الفهم، ويقدم رؤى عميقة وشاملة حول الموضوع دون أي محتوى غير تعليمي أو معقد.
المقتطف: <EXAMPLE>. بعد فحص المقتطف:
•	برر بإيجاز مجموع النقاط، بحد أقصى 100 كلمة.
•	اختتم بالنقاط الإجمالية بالتنسيق التالي: "التقييم التعليمي: <مجموع النقاط>".
"""

# Modify the dataset to use a Dataset class that provides a dictionary-like format. 
# Replace the torch.utils.data.TensorDataset with a Dataset object.
from torch.utils.data import Dataset
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Tokenize the text
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # Flatten tensors and include labels
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item["labels"] = torch.tensor(label, dtype=torch.long)
        return item


# Step 1: Load Dataset
def load_dataset(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)
    # Adjust to handle `text` and `metadata`
    return [{"text": item["text"], "metadata": item["metadata"]} for item in data]


# Step 2: Annotate Data Locally with AraT5
def annotate_samples(samples, model_name):
    tokenizer = T5Tokenizer.from_pretrained(model_name, legacy=False)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    annotated_data = []

    for sample in samples:
        text = sample["text"]
        prompt = arabic_prompt.format(text=text)

        # Encode input text and generate output
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
        outputs = model.generate(**inputs, max_length=100)
        result = tokenizer.decode(outputs[0], skip_special_tokens=True)

        scores = extract_scores(result)
        annotated_data.append({"text": text, "scores": scores, "metadata": sample["metadata"]})

    return annotated_data

# Helper Function to Extract Scores
def extract_scores(output):
    lines = output.split("\n")
    scores = {}
    for line in lines:
        if "ملاءمة" in line:
            scores["relevance"] = int(line.split(":")[-1].strip())
        elif "وضوح" in line:
            scores["clarity"] = int(line.split(":")[-1].strip())
        elif "عمق" in line:
            scores["depth"] = int(line.split(":")[-1].strip())
    total = sum(scores.values())
    scores["total"] = total
    return scores

# Step 3: Fine-Tune AraBERT
def fine_tune_arabert(train_data, tokenizer, model):
    texts = [item["text"] for item in train_data]
    labels = [item["scores"]["total"] for item in train_data]

    # Use CustomDataset
    dataset = CustomDataset(texts, labels, tokenizer)

    # Move model to device
    model.to(device)

    training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=config["epochs"],
    per_device_train_batch_size=config["batch_size"],
    save_steps=10_000,
    save_total_limit=2,
    no_cuda=True,
    logging_dir="./logs",
    logging_steps=100  # Log progress every 100 steps
    )


    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
    )

    print("Starting fine-tuning...")
    trainer.train()
    print("Fine-tuning complete.")


# Step 4: Validate and Benchmark
def validate_model(validation_data, model, tokenizer):
    # Ensure the model is on the correct device
    model.to(device)

    true_labels = []
    predicted_labels = []

    for item in validation_data:
        text = item["text"]
        true_labels.append(item["scores"]["total"])  # Actual labels

        # Prepare inputs and move to the same device as the model
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        inputs = {key: val.to(device) for key, val in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs.logits
        predicted_label = logits.argmax(dim=-1).item()  # Predicted label
        predicted_labels.append(predicted_label)

    # Compute metrics
    f1 = f1_score(true_labels, predicted_labels, average="macro")
    accuracy = accuracy_score(true_labels, predicted_labels)
    precision = precision_score(true_labels, predicted_labels, average="macro", zero_division=0)
    recall = recall_score(true_labels, predicted_labels, average="macro", zero_division=0)
    conf_matrix = confusion_matrix(true_labels, predicted_labels)

    # Print metrics directly here
    print(f"Validation F1 Score: {f1:.2f}")
    print(f"Validation Accuracy: {accuracy:.2f}")
    print(f"Validation Precision: {precision:.2f}")
    print(f"Validation Recall: {recall:.2f}")
    print("Confusion Matrix:")
    print(conf_matrix)

    # Return all metrics as a dictionary for further use if needed
    return {
        "f1": f1,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "confusion_matrix": conf_matrix,
    }

def predict_with_arabert(unlabeled_data, model, tokenizer):
    """
    Use the fine-tuned AraBERT model to predict labels for the rest of the dataset.
    Args:
        unlabeled_data: List of unannotated samples.
        model: Fine-tuned AraBERT model.
        tokenizer: Tokenizer for AraBERT.
    Returns:
        List of samples with predicted scores.
    """
    model.eval()  # Set the model to evaluation mode
    model.to(device)

    predictions = []

    for sample in unlabeled_data:
        text = sample["text"]
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        inputs = {key: val.to(device) for key, val in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs.logits
        predicted_label = logits.argmax(dim=-1).item()

        predictions.append({"text": text, "predicted_score": predicted_label, "metadata": sample["metadata"]})

    return predictions


def filter_dataset(annotated_data, threshold):
    """
    Filters the dataset to retain high-quality samples.
    Args:
        annotated_data: List of annotated samples or predictions.
        threshold: Minimum score for filtering.
    Returns:
        Filtered dataset.
    """
    return [
        doc for doc in annotated_data
        if doc["predicted_score"] >= threshold  # Removed 'language_score' filter
    ]

# Main Pipeline
def main_pipeline():
    # Step 1: Load the dataset
    dataset = load_dataset("/Users/ameeraattiah/Desktop/arabicweb24/jeje.json")
    print(f"Loaded {len(dataset)} samples.")

    # Step 2: Annotate a sample of the dataset with AraT5
    sample_data = dataset[:config["annotation_samples"]]
    annotated_data = annotate_samples(sample_data, config["model_name"])
    print(f"Annotated {len(annotated_data)} samples.")

    # Step 3: Fine-tune AraBERT on the synthetic annotations
    tokenizer = AutoTokenizer.from_pretrained(config["fine_tune_model"])
    model = AutoModelForSequenceClassification.from_pretrained(config["fine_tune_model"], num_labels=6)
    fine_tune_arabert(annotated_data, tokenizer, model)

    # Step 4: Predict the rest of the dataset with fine-tuned AraBERT
    remaining_data = dataset[config["annotation_samples"]:]
    predictions = predict_with_arabert(remaining_data, model, tokenizer)
    print(f"Predicted {len(predictions)} samples with fine-tuned AraBERT.")

    # Step 5: Validate fine-tuned AraBERT
    validation_data = annotated_data[:config["validation_samples"]]
    validation_results = validate_model(validation_data, model, tokenizer)
    
    # Print evaluation metrics
    print(f"Validation Results of AraBert: {validation_results}")


    # Step 6: Filter high-quality samples
    filtered_data = filter_dataset(predictions, config["threshold"])
    print(f"Filtered dataset contains {len(filtered_data)} high-quality samples.")

    # Step7: Save annotated and filtered dataset
    with open("/Users/ameeraattiah/Desktop/arabicweb24/jeje-edu.json", "w", encoding="utf-8") as file:
        json.dump(filtered_data, file, ensure_ascii=False, indent=4)
    print("Filtered data saved.")

# Run the pipeline
main_pipeline()


SentencePiece is installed and ready to use.
Using device: cpu
Loaded 173 samples.
Annotated 100 samples.


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting fine-tuning...


Step,Training Loss
100,0.0949


Fine-tuning complete.
Predicted 73 samples with fine-tuned AraBERT.
Validation F1 Score: 1.00
Validation Accuracy: 1.00
Validation Precision: 1.00
Validation Recall: 1.00
Confusion Matrix:
[[30]]
Validation Results of AraBert: {'f1': 1.0, 'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'confusion_matrix': array([[30]])}
Filtered dataset contains 0 high-quality samples.
Filtered data saved.




In [None]:
# import ipywidgets as widgets
# widgets.IntSlider()

# import json
# import requests
# from sklearn.metrics import f1_score
# from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
# import torch

# # Configuration
# config = {
#     "api_url": "https://api.jais.ai/v1/generate",  # Replace with the actual Jais API endpoint
#     "api_key": "your_api_key_here",  # Replace with your Jais API key
#     "fine_tune_model": "aubmindlab/bert-base-arabertv02",  # Model to fine-tune
#     "threshold": 3,  # Minimum score for high-quality content
#     "annotation_samples": 500,  # Number of samples to annotate for testing
#     "validation_samples": 100,  # Number of samples for validation
#     "max_samples_to_fine_tune": 1000,  # Max samples for fine-tuning
#     "epochs": 20,  # Fine-tuning epochs
#     "batch_size": 8,  # Batch size for fine-tuning
# }

# # Arabic Rubric Prompt for Additive Scoring
# arabic_prompt = """
# النص التالي مقتبس من محتوى تعليمي. يرجى تقييم جودة النص بناءً على المعايير التالية:
# 1. مدى ملاءمة النص لموضوعات المناهج الدراسية (0-2).
# 2. وضوح النص وسهولة فهمه بالنسبة للطلاب (0-2).
# 3. عمق المحتوى التعليمي المقدم (0-1).

# امنح نقاطًا لكل معيار على حدة، ثم احسب المجموع النهائي (0-5).

# النص: "{text}"
# """

# # Step 1: Load Dataset
# def load_dataset(file_path):
#     with open(file_path, "r", encoding="utf-8") as file:
#         data = json.load(file)
#     return [{"text": item["text"], "metadata": item["metadata"]} for item in data]

# # Step 2: Annotate Data with API
# def annotate_samples(samples, api_url, api_key):
#     headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
#     annotated_data = []

#     for sample in samples:
#         text = sample["text"]
#         prompt = arabic_prompt.format(text=text)
#         payload = {
#             "prompt": prompt,
#             "max_tokens": 100,  # Adjust as needed
#             "temperature": 0.7,  # Optional: Controls randomness
#         }
#         response = requests.post(api_url, headers=headers, json=payload)

#         if response.status_code == 200:
#             result = response.json()["generated_text"]
#             scores = extract_scores(result)
#             annotated_data.append({"text": text, "scores": scores, "metadata": sample["metadata"]})
#         else:
#             print(f"Error: {response.status_code}, {response.text}")

#     return annotated_data

# # Helper Function to Extract Scores
# def extract_scores(output):
#     lines = output.split("\n")
#     scores = {}
#     for line in lines:
#         if "ملاءمة" in line:
#             scores["relevance"] = int(line.split(":")[-1].strip())
#         elif "وضوح" in line:
#             scores["clarity"] = int(line.split(":")[-1].strip())
#         elif "عمق" in line:
#             scores["depth"] = int(line.split(":")[-1].strip())
#     total = sum(scores.values())
#     scores["total"] = total
#     return scores

# # Step 3: Fine-Tune AraBERT
# def fine_tune_arabert(train_data, tokenizer, model):
#     texts = [item["text"] for item in train_data]
#     labels = [item["scores"]["total"] for item in train_data]

#     encodings = tokenizer(texts, truncation=True, padding=True, max_length=512, return_tensors="pt")
#     dataset = torch.utils.data.TensorDataset(encodings["input_ids"], encodings["attention_mask"], torch.tensor(labels))

#     training_args = TrainingArguments(
#         output_dir="./results",
#         num_train_epochs=config["epochs"],
#         per_device_train_batch_size=config["batch_size"],
#         save_steps=10_000,
#         save_total_limit=2,
#         logging_dir="./logs",
#     )

#     trainer = Trainer(
#         model=model,
#         args=training_args,
#         train_dataset=dataset,
#     )

#     print("Starting fine-tuning...")
#     trainer.train()
#     print("Fine-tuning complete.")

# # Step 4: Validate and Benchmark
# def validate_model(validation_data, model, tokenizer):
#     true_labels = []
#     predicted_labels = []
#     for item in validation_data:
#         text = item["text"]
#         true_labels.append(item["scores"]["total"])
#         inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
#         with torch.no_grad():
#             outputs = model(**inputs)
#         logits = outputs.logits
#         predicted_label = logits.argmax(dim=-1).item()
#         predicted_labels.append(predicted_label)

#     f1 = f1_score(true_labels, predicted_labels, average="macro")
#     print(f"Validation F1 Score: {f1:.2f}")
#     return f1

# # Main Pipeline
# def main_pipeline():
#     # Load dataset
#     dataset = load_dataset("/Users/ameeraattiah/Desktop/warc/meero_cleaned.json")
#     print(f"Loaded {len(dataset)} samples.")

#     # Annotate samples using the Jais API
#     annotation_samples = dataset[:config["annotation_samples"]]
#     annotated_data = annotate_samples(annotation_samples, config["api_url"], config["api_key"])
#     print(f"Annotated {len(annotated_data)} samples.")

#     # Fine-tune AraBERT
#     tokenizer = AutoTokenizer.from_pretrained(config["fine_tune_model"])
#     model = AutoModelForSequenceClassification.from_pretrained(config["fine_tune_model"], num_labels=6)
#     fine_tune_arabert(annotated_data[:config["max_samples_to_fine_tune"]], tokenizer, model)

#     # Validate the model
#     validation_data = annotated_data[:config["validation_samples"]]
#     f1 = validate_model(validation_data, model, tokenizer)

#     # Save annotated data
#     with open("/Users/ameeraattiah/Desktop/warc/annotated_meero.json", "w", encoding="utf-8") as file:
#         json.dump(annotated_data, file, ensure_ascii=False, indent=4)
#     print("Saved annotated dataset.")

# # Run the pipeline
# main_pipeline()


In [14]:
# import json
# import numpy as np
# from sklearn.linear_model import LinearRegression
# from sklearn.metrics import f1_score
# from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, Trainer, TrainingArguments
# import torch
# import sys
# import os
# from datatrove.data import Document  # Import the Document class



# # Configuration
# config = {
#     "annotator_model": "inceptionai/jais-13b",  # Correct Jais model name
#     "fine_tune_model": "aubmindlab/bert-base-arabertv02",  # Model to fine-tune
#     "threshold": 3,  # Minimum score for high-quality content
#     "annotation_samples": 500,  # Number of samples to annotate for testing
#     "validation_samples": 100,  # Number of samples for validation
#     "max_samples_to_fine_tune": 1000,  # Max samples for fine-tuning
#     "epochs": 20,  # Fine-tuning epochs
#     "batch_size": 8,  # Batch size for fine-tuning
#     "benchmark_tasks": ["QA", "Curriculum"],  # Placeholder for benchmarking
# }
# model_pipeline = pipeline(
#     "text-generation", 
#     model="inceptionai/jais-13b", 
#     trust_remote_code=True
# )


# from transformers import pipeline


# # Arabic Rubric Prompt for Additive Scoring
# arabic_prompt = """
# النص التالي مقتبس من محتوى تعليمي. يرجى تقييم جودة النص بناءً على المعايير التالية:
# 1. مدى ملاءمة النص لموضوعات المناهج الدراسية (0-2).
# 2. وضوح النص وسهولة فهمه بالنسبة للطلاب (0-2).
# 3. عمق المحتوى التعليمي المقدم (0-1).

# امنح نقاطًا لكل معيار على حدة، ثم احسب المجموع النهائي (0-5).

# النص: "{text}"
# """

# # Step 1: Load Dataset
# def load_dataset(file_path):
#     with open(file_path, "r", encoding="utf-8") as file:
#         data = json.load(file)
#     return [{"text": item["text"], "metadata": item["metadata"]} for item in data]

# # Step 2: Annotate Data with Additive Scoring
# def annotate_samples(samples, model_pipeline):
#     annotated_data = []
#     for sample in samples:
#         text = sample["text"]
#         prompt = arabic_prompt.format(text=text)
#         result = model_pipeline(prompt)[0]
#         scores = result["generated_text"].split()  # Assume the model generates scores per criterion
#         relevance, clarity, depth = map(int, scores[:3])  # Extract individual scores
#         total_score = relevance + clarity + depth
#         annotated_data.append({"text": text, "score": total_score, "metadata": sample["metadata"]})
#     return annotated_data

# # Step 3: Fine-Tune AraBERT
# def fine_tune_arabert(train_data, tokenizer, model):
#     # Prepare data
#     texts = [item["text"] for item in train_data]
#     labels = [item["score"] for item in train_data]

#     # Tokenize data
#     encodings = tokenizer(texts, truncation=True, padding=True, max_length=512, return_tensors="pt")
#     dataset = torch.utils.data.TensorDataset(encodings["input_ids"], encodings["attention_mask"], torch.tensor(labels))

#     # Define training arguments
#     training_args = TrainingArguments(
#         output_dir="./results",
#         num_train_epochs=config["epochs"],
#         per_device_train_batch_size=config["batch_size"],
#         save_steps=10_000,
#         save_total_limit=2,
#         logging_dir="./logs",
#     )

#     # Trainer setup
#     trainer = Trainer(
#         model=model,
#         args=training_args,
#         train_dataset=dataset,
#     )

#     print("Starting fine-tuning...")
#     trainer.train()
#     print("Fine-tuning complete.")

# # Step 4: Validate and Benchmark
# def validate_model(validation_data, model, tokenizer):
#     true_labels = []
#     predicted_labels = []
#     for item in validation_data:
#         text = item["text"]
#         true_labels.append(item["score"])
#         inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
#         with torch.no_grad():
#             outputs = model(**inputs)
#         logits = outputs.logits
#         predicted_label = logits.argmax(dim=-1).item()
#         predicted_labels.append(predicted_label)

#     # Calculate F1 Score
#     f1 = f1_score(true_labels, predicted_labels, average="macro")
#     print(f"Validation F1 Score: {f1:.2f}")
#     return f1

# # Step 5: Ablation Studies and Threshold Optimization
# def ablation_studies(filtered_data):
#     thresholds = range(1, 6)
#     best_threshold = None
#     best_f1 = 0

#     for threshold in thresholds:
#         current_data = [item for item in filtered_data if item["score"] >= threshold]
#         print(f"Threshold: {threshold}, Data Size: {len(current_data)}")
#         f1 = validate_model(current_data, model, tokenizer)
#         if f1 > best_f1:
#             best_f1 = f1
#             best_threshold = threshold

#     print(f"Optimal Threshold: {best_threshold}, Best F1: {best_f1:.2f}")
# # Main Pipeline
# def main_pipeline():
#     # Load dataset
#     dataset = load_dataset("/Users/ameeraattiah/Desktop/warc/meero_cleaned.json")
#     print(f"Loaded {len(dataset)} samples.")

#     # Annotate samples using the correct Jais model
#     annotation_samples = dataset[:config["annotation_samples"]]
#     model_pipeline = pipeline(
#         "text-generation", 
#         model=config["annotator_model"], 
#         trust_remote_code=True
#     )
#     annotated_data = annotate_samples(annotation_samples, model_pipeline)
#     print(f"Annotated {len(annotated_data)} samples.")

#     # Fine-tune AraBERT
#     tokenizer = AutoTokenizer.from_pretrained(config["fine_tune_model"])
#     model = AutoModelForSequenceClassification.from_pretrained(config["fine_tune_model"], num_labels=6)
#     fine_tune_arabert(annotated_data[:config["max_samples_to_fine_tune"]], tokenizer, model)

#     # Validate the model
#     validation_data = annotated_data[:config["validation_samples"]]
#     f1 = validate_model(validation_data, model, tokenizer)

#     # Apply ablation studies
#     ablation_studies(annotated_data)

#     # Save annotated data
#     with open("/Users/ameeraattiah/Desktop/warc/annotated_meero.json", "w", encoding="utf-8") as file:
#         json.dump(annotated_data, file, ensure_ascii=False, indent=4)
#     print("Saved annotated dataset.")

# # Run the pipeline
# main_pipeline()


The history saving thread hit an unexpected error (OperationalError('unable to open database file')).History will not be written to the database.


Downloading shards:   0%|          | 0/6 [00:00<?, ?it/s]



pytorch_model-00003-of-00006.bin:  96%|#########5| 9.53G/9.96G [00:00<?, ?B/s]

ValueError: Could not load model inceptionai/jais-13b with any of the following classes: (<class 'transformers.models.auto.modeling_auto.AutoModelForCausalLM'>,). See the original errors:

while loading with AutoModelForCausalLM, an error is thrown:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/site-packages/transformers/pipelines/base.py", line 289, in infer_framework_load_model
    model = model_class.from_pretrained(model, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/transformers/models/auto/auto_factory.py", line 559, in from_pretrained
    return model_class.from_pretrained(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/transformers/modeling_utils.py", line 3974, in from_pretrained
    resolved_archive_file, sharded_metadata = get_checkpoint_shard_files(
                                              ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/transformers/utils/hub.py", line 1098, in get_checkpoint_shard_files
    cached_filename = cached_file(
                      ^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/transformers/utils/hub.py", line 403, in cached_file
    resolved_file = hf_hub_download(
                    ^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/huggingface_hub/file_download.py", line 862, in hf_hub_download
    return _hf_hub_download_to_cache_dir(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/huggingface_hub/file_download.py", line 1011, in _hf_hub_download_to_cache_dir
    _download_to_tmp_and_move(
  File "/opt/anaconda3/lib/python3.12/site-packages/huggingface_hub/file_download.py", line 1545, in _download_to_tmp_and_move
    http_get(
  File "/opt/anaconda3/lib/python3.12/site-packages/huggingface_hub/file_download.py", line 457, in http_get
    temp_file.write(chunk)
OSError: [Errno 28] No space left on device


