In [None]:
import pandas as pd
import random
import json
from datasets import Dataset, DatasetDict

# Parse the raw records out of data.json
with open("data.json", mode="r", encoding="utf-8") as json_file:
    data = json.loads(json_file.read())

# Scores run from 0.0 to 4.0 in steps of 0.25, which gives 17 classes (ids 0..16).
# label_to_score maps class id -> score; score_to_label is the inverse lookup.
label_to_score = dict(enumerate(step * 0.25 for step in range(17)))
score_to_label = {score: label for label, score in label_to_score.items()}


# Preprocess the data
def preprocess_data(data, seed=None):
    """Shuffle the raw records and reduce them to ``label``/``text`` columns.

    Args:
        data: Sequence of record dicts. Each record needs the keys
            ``questionNumber``, ``score``, ``questionText`` and ``answerText``.
            The sequence itself is left untouched; a copy is shuffled.
        seed: Optional seed for a private random generator so the shuffle
            (and therefore any split derived from the row order) is
            reproducible. With the default ``None`` the module-level
            ``random`` state is used.

    Returns:
        A DataFrame with a fresh 0..n-1 index and two columns: ``label``
        (integer class id looked up in ``score_to_label``) and ``text``
        (question and answer joined by a single space). Records whose score
        has no entry in ``score_to_label`` are dropped.
    """
    rng = random if seed is None else random.Random(seed)
    records = list(data)  # shuffle a copy so the caller's list keeps its order
    rng.shuffle(records)

    df = pd.DataFrame(records)
    # Scores that are not in the mapping become NaN and are removed below.
    df["label"] = df["score"].map(score_to_label)
    df["text"] = df["questionText"] + " " + df["answerText"]

    return (
        df.dropna(subset=["label"])
        # A single NaN makes the whole column float; class ids must be integers.
        .astype({"label": "int64"})
        .drop(columns=["questionNumber", "score", "questionText", "answerText"])
        # Close the index gaps left by dropna so positions and labels agree.
        .reset_index(drop=True)
    )


# Split data (80% train, 10% val, 10% test)
data_df = preprocess_data(data)
train_size = int(0.8 * len(data_df))
val_size = int(0.1 * len(data_df))
val_end = train_size + val_size

# Positional slices; the test split absorbs whatever the int() rounding leaves over.
train_df = data_df.iloc[:train_size]
val_df = data_df.iloc[train_size:val_end]
test_df = data_df.iloc[val_end:]

# preserve_index=False keeps the pandas index out of the Dataset. Without it,
# a non-Range index (e.g. after rows were dropped) is stored as an extra
# "__index_level_0__" column.
datasets = DatasetDict(
    {
        "train": Dataset.from_pandas(train_df, preserve_index=False),
        "validation": Dataset.from_pandas(val_df, preserve_index=False),
        "test": Dataset.from_pandas(test_df, preserve_index=False),
    }
)

datasets


In [None]:
from transformers import AutoTokenizer

# Tokenizer that belongs to the DistilBERT checkpoint used in this notebook
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="distilbert/distilbert-base-uncased"
)


def preprocess_function(examples):
    """Tokenize the ``text`` entries of a batch, truncating over-long inputs."""
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded


# Tokenize every split, handing the function whole batches at a time
tokenized_datasets = datasets.map(
    function=preprocess_function,
    batched=True,
)

from transformers import DataCollatorWithPadding
import evaluate
import numpy as np

# Metric object used by compute_metrics below
accuracy = evaluate.load(path="accuracy")
# Collator built around the same tokenizer that produced the encodings
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


def compute_metrics(eval_pred):
    """Return the accuracy of the arg-max class against the reference labels.

    ``eval_pred`` unpacks into two items: per-example class scores (one row
    per example) and the reference labels.
    """
    class_scores, references = eval_pred
    # Index of the highest-scoring class in each row
    predicted_ids = np.argmax(class_scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer


# DistilBERT with a classification head that has one output per score class.
# The two mappings let the model config translate between class ids and scores.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    label2id=score_to_label,
    id2label=label_to_score,
    num_labels=len(score_to_label),
)

In [None]:
# Evaluate and checkpoint once per epoch; the best checkpoint is restored
# when training finishes (load_best_model_at_end=True).
training_args = TrainingArguments(
    output_dir="results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=6,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    # Checkpoint selection is driven by eval_dataset, so it must be the
    # validation split. Using the test split here would leak it into model
    # selection and make the later test-set numbers optimistic.
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
from transformers import pipeline

# "text-classification" is the canonical task name; "sentiment-analysis" is
# only an alias for the same pipeline.
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

tokenizer_kwargs = {"padding": True, "truncation": True, "max_length": 512}

# Meaning of the keys stored per sample:
#   true_label      - the reference score, looked up from the class id
#   predicted_label - the pipeline's "label", i.e. the id2label entry of the
#                     winning class; the model was built with
#                     id2label=label_to_score, so this is the predicted score
#   predicted_score - the pipeline's "score", i.e. the model's confidence in
#                     the winning class (NOT a grade)
results = []
for sample in datasets["test"]:
    prediction = classifier(sample["text"], **tokenizer_kwargs)[0]
    results.append(
        {
            "true_label": label_to_score[sample["label"]],
            "predicted_label": prediction["label"],
            "predicted_score": prediction["score"],
            "text": sample["text"],
        }
    )

# Display up to five distinct, randomly chosen results. random.sample draws
# without replacement, and min() keeps it valid for short or empty lists.
for result in random.sample(results, k=min(5, len(results))):
    print(result)

In [None]:
from sklearn.metrics import mean_absolute_error

# Compare scores with scores: "predicted_label" holds the score the model
# chose, whereas "predicted_score" is only the pipeline's confidence in that
# choice (a probability between 0 and 1) and must not be used as a grade.
true_scores = [r["true_label"] for r in results]
predicted_scores = [float(r["predicted_label"]) for r in results]

mae = mean_absolute_error(true_scores, predicted_scores)
print(f"Mean Absolute Error: {mae}")