In [None]:
!pip install evaluate
import torch
import pandas as pd
from bs4 import BeautifulSoup
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from datasets import Dataset
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
from transformers import EarlyStoppingCallback, logging as transformers_logging
import evaluate
import transformers

# Restrict Hugging Face transformers logging to error-level messages.
transformers_logging.set_verbosity_error()

# Run configuration consumed by the statements below.
DATA_PATH = "NFR_five4.csv"  # CSV file loaded with pandas
TEXT_COL = "sentence"  # column holding the raw text to classify
LABEL_COL = "class_name"  # column holding the string class label
MODEL_NAME = "distilbert-base-uncased"  # checkpoint used for tokenizer and model
TEST_SIZE = 0.2  # fraction of rows passed to train_test_split as the test split
RANDOM_STATE = 42  # seed for the train/test split

class Cleaner:
    """Text normaliser applied to each raw sentence before tokenisation."""

    def normalize_line_breaks(self, text):
        """Return ``text`` as a string with CRLF and lone CR replaced by LF."""
        as_str = str(text)
        # Handle the two-character Windows ending first so it becomes a
        # single newline rather than two.
        without_crlf = as_str.replace('\r\n', '\n')
        return without_crlf.replace('\r', '\n')

    def remove_html_tags(self, text):
        """Parse ``text`` with BeautifulSoup (lxml parser) and return its text."""
        soup = BeautifulSoup(str(text), "lxml")
        return soup.text

    def clean(self, text):
        """Normalise line breaks, then strip HTML markup."""
        return self.remove_html_tags(self.normalize_line_breaks(text))

# Load the dataset; abort with a readable message if the CSV is missing.
try:
    df = pd.read_csv(DATA_PATH, encoding="latin1")
except FileNotFoundError:
    raise SystemExit(f"Error: The file {DATA_PATH} was not found.")

# Drop rows that lack either the text or the label, then add a cleaned copy of
# the text (line breaks normalised, HTML stripped) in a new 'text_cleaned' column.
df.dropna(subset=[TEXT_COL, LABEL_COL], inplace=True)
cleaner = Cleaner()
df['text_cleaned'] = df[TEXT_COL].astype(str).apply(cleaner.clean)

# Map the string class names to integer ids stored in a new 'label' column.
le = preprocessing.LabelEncoder()
df['label'] = le.fit_transform(df[LABEL_COL].astype(str).tolist())
class_names = le.classes_  # index i holds the class name for label id i
num_labels = len(class_names)
print(f"Found {num_labels} unique labels: {class_names}")

# 'balanced' class weights for the loss, and log class priors used to
# initialise the classifier bias (both are handed to CustomModel below).
# NOTE(review): these statistics are computed on the full dataframe, i.e.
# before the train/test split further down — confirm that is intended.
labels = df['label'].values
class_weights_np = compute_class_weight(class_weight='balanced', classes=np.arange(num_labels), y=labels)
class_weights = torch.tensor(class_weights_np, dtype=torch.float)
counts = np.bincount(labels, minlength=num_labels)  # per-class row counts
prior = counts / counts.sum()  # empirical class frequencies
class_bias = torch.log(torch.tensor(prior, dtype=torch.float))

# Stratified hold-out split so both partitions keep the label distribution.
df_train, df_test = train_test_split(df, test_size=TEST_SIZE, stratify=df['label'], random_state=RANDOM_STATE)

d_train = Dataset.from_pandas(df_train)
d_test = Dataset.from_pandas(df_test)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def preprocess_fn(examples):
    """Tokenize a batch of cleaned sentences, truncating over-long inputs.

    No padding is applied here: the DataCollatorWithPadding handed to the
    Trainer pads each mini-batch to its own longest sequence. Padding inside
    a batched ``map`` would instead pad every row to the longest sequence of
    its whole map batch, storing and processing needless pad tokens.
    """
    return tokenizer(examples['text_cleaned'], truncation=True)


tok_train = d_train.map(preprocess_fn, batched=True)
tok_test = d_test.map(preprocess_fn, batched=True)

# Remove the raw text/label columns and the pandas index column (when
# present); the integer 'label' column is kept for training.
drop_cols = [TEXT_COL, LABEL_COL, 'text_cleaned', '__index_level_0__']
tok_train = tok_train.remove_columns([c for c in drop_cols if c in tok_train.column_names])
tok_test = tok_test.remove_columns([c for c in drop_cols if c in tok_test.column_names])

class CustomModel(torch.nn.Module):
    """Sequence classifier with a class-weighted cross-entropy loss.

    Wraps ``AutoModelForSequenceClassification`` and, when ``labels`` are
    supplied to ``forward``, computes the loss itself using the per-class
    weights given at construction time.

    Args:
        model_name: Checkpoint name or path passed to ``from_pretrained``.
        num_labels: Number of output classes.
        class_weights: Per-class loss weights, one value per class.
        class_bias: Initial values for the classification head's bias,
            one value per class.

    Raises:
        ValueError: If ``class_bias`` does not have the same shape as the
            classification head's bias.
    """

    def __init__(self, model_name, num_labels, class_weights, class_bias):
        super().__init__()
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels
        )

        head_bias = self.model.classifier.bias
        prior_bias = torch.as_tensor(class_bias, dtype=head_bias.dtype)
        # Validate explicitly: a mismatched tensor would otherwise be
        # accepted silently and only fail (or misbehave) much later.
        if prior_bias.shape != head_bias.shape:
            raise ValueError(
                f"class_bias has shape {tuple(prior_bias.shape)}, expected "
                f"{tuple(head_bias.shape)}"
            )
        # In-place copy keeps the existing Parameter object (and its device).
        with torch.no_grad():
            head_bias.copy_(prior_bias)

        # A buffer follows the module across .to()/.cuda() calls, so the
        # weights need not be re-transferred on every forward pass.
        # persistent=False keeps the state_dict identical to the wrapped
        # model's, so saved checkpoints are unaffected.
        self.register_buffer(
            "class_weights",
            torch.as_tensor(class_weights, dtype=torch.float),
            persistent=False,
        )

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        """Run the wrapped model; add the weighted loss when labels are given.

        ``labels`` is deliberately not forwarded to the wrapped model so that
        it does not compute its own loss. Returns a
        ``SequenceClassifierOutput`` whose ``loss`` is ``None`` when no
        labels are provided.
        """
        # Discard this key (if present) so it is not forwarded to the wrapped model.
        kwargs.pop('num_items_in_batch', None)
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
        logits = outputs.logits
        loss = None
        if labels is not None:
            # .to() is a no-op when the buffer already lives with the logits.
            loss = torch.nn.functional.cross_entropy(
                logits,
                labels,
                weight=self.class_weights.to(logits.device),
            )
        return transformers.modeling_outputs.SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions
        )

# Build the weighted-loss classifier using the class statistics computed above.
model = CustomModel(MODEL_NAME, num_labels, class_weights, class_bias)

# Collator that pads each batch using the tokenizer; accuracy metric from `evaluate`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    """Score an evaluation pass with the module-level ``metric``.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class of
    each row is the argmax over the last axis of the logits.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_ids, references=references)

# Training configuration. Evaluation and checkpointing share the same
# 50-step cadence, and the best checkpoint (by accuracy) is restored at the end.
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,  # upper bound; an early-stopping callback is attached to the Trainer
    weight_decay=0.05,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy='steps',
    save_strategy='steps',
    save_steps=50,
    eval_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',  # presumably the key returned by compute_metrics — confirm
    greater_is_better=True
)

# Wire model, data, metric and early stopping into the Trainer.
# NOTE(review): eval_dataset is the held-out test split, so early stopping and
# best-checkpoint selection are driven by the same data that is later reported
# as the 'Test' set — consider a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tok_train,
    eval_dataset=tok_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

print("Starting training...")
trainer.train()
print("Training complete.")

# Persist the trained model and the tokenizer side by side in one directory.
trainer.save_model('nfr_model_final')
tokenizer.save_pretrained('nfr_model_final')
print("Model saved to 'nfr_model_final'.")

from sklearn.metrics import classification_report

def eval_and_report(dataset, name):
    """Print a per-class classification report for ``dataset``.

    Predictions come from the module-level ``trainer``; ``name`` is only
    used in the printed heading. The true labels are read from the
    dataset's 'label' column.
    """
    print(f"\nEvaluating on {name} set:")
    prediction_output = trainer.predict(dataset)
    predicted_ids = np.argmax(prediction_output.predictions, axis=1)
    report = classification_report(
        dataset['label'],
        predicted_ids,
        target_names=class_names,
        zero_division=0,
    )
    print(report)

# Report per-class metrics on both splits.
print("\n--- Model Evaluation ---")
eval_and_report(tok_train, 'Training')
eval_and_report(tok_test, 'Test')

# Classify a few hand-written requirement sentences with the trained model.
print("\n--- Custom Text Prediction Example ---")
custom_texts = [
    "No unauthorized access should be permitted to sensitive data.",
    "The system must respond to user queries within 3 seconds.",
    "The user interface needs to be intuitive for novice users.",
    "The Proposer shall test application enhancements, fixes, and upgrades and assure the integrity of the resulting data."
]
# Tokenize the sentences; the raw text column is dropped after tokenization.
custom_ds = Dataset.from_dict({'text_cleaned': custom_texts})
custom_tok = custom_ds.map(lambda ex: tokenizer(ex['text_cleaned'], truncation=True, padding=True), batched=True)
custom_tok = custom_tok.remove_columns(['text_cleaned'])

# Predicted label id per sentence = argmax over the class axis of the predictions.
pred_out = trainer.predict(custom_tok)
preds = np.argmax(pred_out.predictions, axis=1)

# Translate label ids back to class names via the LabelEncoder's classes.
print("Predictions for custom texts:")
for txt, idx in zip(custom_texts, preds):
    print(f"Text: '{txt}' -> Predicted Class: {class_names[idx]}")




In [None]:
# Classify a second set of hand-written requirement sentences with the
# trained model and print the predicted class name for each.
print("\n--- Custom Text Prediction Example ---")

custom_texts = [
    "No unauthorized access should be permitted to sensitive data.",
    "The system must respond to user queries within 3 seconds.",
    "All aspects of the website shall be accessed by a web browser over the Internet.",
    "The Proposer shall test application enhancements, fixes, and upgrades and assure the integrity of the resulting data.",
    "All details about cardmembers must be retrieved from the Cardmember Information Database."
]

custom_data_dict = {'text_cleaned': custom_texts}
custom_hf_dataset = Dataset.from_dict(custom_data_dict)

# Tokenize, then drop the raw text column.
tokenized_custom_dataset = custom_hf_dataset.map(lambda examples: tokenizer(examples["text_cleaned"], truncation=True, padding=True), batched=True)
tokenized_custom_dataset = tokenized_custom_dataset.remove_columns(["text_cleaned"])

custom_predictions_output = trainer.predict(tokenized_custom_dataset)
custom_logits = custom_predictions_output.predictions
custom_predicted_numeric_labels = np.argmax(custom_logits, axis=1)

print("Predictions for custom texts:")
# `class_names` (the LabelEncoder's classes) maps label ids back to names;
# the previously referenced `class_names_ordered` was never defined and
# raised NameError here.
for text, label_id in zip(custom_texts, custom_predicted_numeric_labels):
    predicted_class_name = class_names[label_id]
    print(f"Text: \"{text}\" -> Predicted Class: {predicted_class_name}")
