In [None]:
!pip install evaluate
import torch
import pandas as pd
from bs4 import BeautifulSoup
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from datasets import Dataset
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
import evaluate
from sklearn.metrics import classification_report

# --- Run configuration -------------------------------------------------------
# CSV file that is loaded into the DataFrame used for training and evaluation.
data_path = "NFR_data.csv"
# Name of the CSV column holding the raw text that gets cleaned and tokenized.
text_column_name = "sentence"
# Name of the CSV column holding the class label that gets integer-encoded.
label_column_name = "class_name"
# Checkpoint name used to load both the tokenizer and the classification model.
model_name = "distilbert-base-uncased"
# Fraction of the rows passed to train_test_split as the held-out test split.
test_size = 0.2

class Cleaner:
    """Stateless text normaliser: unifies line endings and strips HTML markup."""

    def normalize_line_breaks(self, text):
        """Return ``str(text)`` with every CRLF and lone CR replaced by LF."""
        as_string = str(text)
        # CRLF must be handled first so it collapses to a single newline.
        without_crlf = as_string.replace('\r\n', '\n')
        return without_crlf.replace('\r', '\n')

    def remove_html_tags(self, text):
        """Return the text content of ``str(text)`` parsed as HTML with lxml."""
        soup = BeautifulSoup(str(text), "lxml")
        return soup.text

    def clean(self, text):
        """Normalise line breaks, then strip HTML tags, and return the result."""
        return self.remove_html_tags(self.normalize_line_breaks(text))

# Load the dataset into `df`. On any read failure, report the problem and abort
# so that none of the following steps run against a missing DataFrame.
try:
    df = pd.read_csv(data_path, encoding="latin1")
except FileNotFoundError as e:
    print(f"Error: The file {data_path} was not found. Please ensure it's in the correct path.")
    # Raise SystemExit directly instead of calling exit(): exit() is an
    # interactive convenience (injected by `site` or IPython) that reports a
    # success status and is not guaranteed to stop execution on the spot.
    raise SystemExit(1) from e
except Exception as e:
    # Top-level boundary: any other read/parse error is also fatal here.
    print(f"Error reading CSV: {e}")
    raise SystemExit(1) from e

# Discard rows that lack either the text or the label.
df.dropna(subset=[text_column_name, label_column_name], inplace=True)

# Add a cleaned copy of the text column (line endings unified, HTML stripped).
cleaner = Cleaner()
raw_text = df[text_column_name].astype(str)
df['text_cleaned'] = raw_text.apply(cleaner.clean)

# Encode the class names as integer ids in a new 'label' column.
le = preprocessing.LabelEncoder()
label_strings = df[label_column_name].astype(str).tolist()
df['label'] = le.fit_transform(label_strings)

num_labels_actual = len(le.classes_)
print(f"Found {num_labels_actual} unique labels: {le.classes_}")

# Hold out a test split, stratified on the encoded label, with a fixed seed.
df_train, df_test = train_test_split(
    df,
    test_size=test_size,
    random_state=42,
    stratify=df['label'],
)

# Wrap both pandas frames as Hugging Face datasets.
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (df_train, df_test)
)

# Tokenizer belonging to the configured checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    """Tokenize the "text_cleaned" entries of a batch.

    Args:
        examples: Batch mapping handed over by ``Dataset.map(..., batched=True)``;
            its "text_cleaned" entry holds the texts to tokenize.

    Returns:
        The tokenizer output for the batch, truncated but not padded.
    """
    # No padding here: the DataCollatorWithPadding given to the Trainer pads each
    # batch to its own longest sequence. Padding at map time would pad every
    # map batch to its longest text and carry that padding into every step.
    return tokenizer(examples["text_cleaned"], truncation=True)

# Columns that are no longer needed once the text has been tokenized; only the
# tokenizer outputs and the integer 'label' column are kept.
columns_to_drop = [text_column_name, label_column_name, "text_cleaned", "__index_level_0__"]

tokenized_train = (
    train_dataset
    .map(preprocess_function, batched=True)
    .remove_columns(columns_to_drop)
)
tokenized_test = (
    test_dataset
    .map(preprocess_function, batched=True)
    .remove_columns(columns_to_drop)
)

# Store the id <-> class-name mapping in the model config so that the saved
# model can name its predictions without the in-memory LabelEncoder.
id2label = {idx: str(name) for idx, name in enumerate(le.classes_)}
label2id = {name: idx for idx, name in id2label.items()}

# Classifier with one output per encoded label.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels_actual,
    id2label=id2label,
    label2id=label2id,
)

# Pads each batch at collation time using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Accuracy metric used by compute_metrics.
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy from a ``(logits, labels)`` pair.

    The predicted class of each row is the index of its largest logit along
    the last axis; the result is whatever ``accuracy_metric.compute`` returns.
    """
    logits, labels = eval_pred
    predicted_ids = np.asarray(logits).argmax(axis=-1)
    return accuracy_metric.compute(references=labels, predictions=predicted_ids)

# Optimisation, logging and checkpointing settings for the fine-tuning run.
training_config = dict(
    output_dir='./results',
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_steps=10,
    save_strategy="steps",
    save_steps=500,
)
training_args = TrainingArguments(**training_config)

# Trainer wired to the tokenized splits, the padding collator and the
# accuracy callback.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)

# Fine-tune the model.
print("Starting model training...")
trainer.train()
print("Training complete.")

# Write the trained model and its tokenizer into the same directory.
final_model_dir = 'nfr_model_final'
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)
print("Model saved to 'nfr_model_final'.")

print("\n--- Model Evaluation ---")
# Class names in encoder order, so position i is the name of label id i.
class_names_ordered = le.classes_.tolist()
# Every encoded label id, aligned with class_names_ordered. Passing it as
# `labels` keeps classification_report from failing when a class is absent
# from both the true and the predicted labels of a split.
all_label_ids = list(range(len(class_names_ordered)))

print("\nEvaluating on Training Set:")
train_predictions_output = trainer.predict(tokenized_train)
train_logits = train_predictions_output.predictions
# Predicted class = index of the largest logit in each row.
train_predicted_numeric_labels = np.argmax(train_logits, axis=1)
train_actual_numeric_labels = tokenized_train['label']
print("Classification Report for Training Set:")
print(classification_report(
    train_actual_numeric_labels,
    train_predicted_numeric_labels,
    labels=all_label_ids,
    target_names=class_names_ordered,
    zero_division=0,
))

print("\nEvaluating on Test Set:")
test_predictions_output = trainer.predict(tokenized_test)
test_logits = test_predictions_output.predictions
test_predicted_numeric_labels = np.argmax(test_logits, axis=1)
test_actual_numeric_labels = tokenized_test['label']
print("Classification Report for Test Set:")
print(classification_report(
    test_actual_numeric_labels,
    test_predicted_numeric_labels,
    labels=all_label_ids,
    target_names=class_names_ordered,
    zero_division=0,
))


In [None]:
print("\n--- Custom Text Prediction Example ---")

# Hand-written sentences to classify with the trained model.
custom_texts = [
    "Developers must be able to replace modules without affecting the rest of the system.",
    "The system should include unit tests for at least 80% of the code.",
    "The user interface needs to be intuitive for novice users.",
    "The Proposer shall test application enhancements, fixes, and upgrades and assure the integrity of the resulting data."
]

# The texts go in under the 'text_cleaned' key as written; they are not passed
# through Cleaner.
custom_data_dict = {'text_cleaned': custom_texts}
custom_hf_dataset = Dataset.from_dict(custom_data_dict)


def _tokenize_custom_batch(examples):
    """Tokenize the 'text_cleaned' entries of a batch with truncation and padding."""
    return tokenizer(examples["text_cleaned"], truncation=True, padding=True)


tokenized_custom_dataset = custom_hf_dataset.map(_tokenize_custom_batch, batched=True)
tokenized_custom_dataset = tokenized_custom_dataset.remove_columns(["text_cleaned"])

# Predicted label id for each text = index of its largest logit.
custom_predictions_output = trainer.predict(tokenized_custom_dataset)
custom_logits = custom_predictions_output.predictions
custom_predicted_numeric_labels = np.argmax(custom_logits, axis=1)

print("Predictions for custom texts:")
for text, label_id in zip(custom_texts, custom_predicted_numeric_labels):
    predicted_class_name = class_names_ordered[label_id]
    print(f"Text: \"{text}\" -> Predicted Class: {predicted_class_name}")