In [12]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
import numpy as np
import time

# Define custom dataset class
class TextClassificationDataset(Dataset):
    """Map-style dataset that tokenizes raw texts for sequence classification.

    Each item is a dict with the flattened ``input_ids`` and
    ``attention_mask`` tensors returned by the tokenizer, the integer class id
    under ``labels`` and the sample position under ``index``.

    Args:
        texts: Indexable collection of raw texts; every entry is passed
            through ``str`` before tokenization.
        labels: Indexable collection of label names, aligned with ``texts``.
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer) that
            accepts the keyword arguments used in ``__getitem__`` and returns
            a mapping containing ``input_ids`` and ``attention_mask`` tensors.
        max_len: Length every sequence is padded / truncated to.
        label_map: Optional mapping from label name to class id. Defaults to
            ``{'normal': 0, 'fraud': 1}``.
    """

    DEFAULT_LABEL_MAP = {'normal': 0, 'fraud': 1}

    def __init__(self, texts, labels, tokenizer, max_len=128, label_map=None):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Copy so that neither the caller's dict nor the class default can be
        # mutated through this instance.
        self.label_map = dict(
            self.DEFAULT_LABEL_MAP if label_map is None else label_map
        )
        # Derived from label_map so the two mappings can never drift apart.
        self.reverse_label_map = {
            class_id: name for name, class_id in self.label_map.items()
        }

    def __len__(self):
        """Return the number of samples (taken from ``texts``)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its model-input dict.

        Raises:
            KeyError: If the sample's label is not a key of ``label_map``.
        """
        text = str(self.texts[idx])
        raw_label = self.labels[idx]
        try:
            label = self.label_map[raw_label]
        except KeyError:
            raise KeyError(
                f"Unknown label {raw_label!r} at index {idx}; "
                f"expected one of {list(self.label_map)}"
            ) from None

        # Calling the tokenizer directly replaces the deprecated
        # ``encode_plus`` while taking the same arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # The tokenizer is asked for 'pt' tensors with a leading batch
            # dimension; flatten() removes it for a single sample.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long),
            'index': idx,  # Lets callers trace predictions back to samples
        }

# Load test dataset
def load_test_data(file_path, encodings=('cp1252', 'latin1')):
    """Read the test CSV and return its ``text`` and ``label`` columns.

    The file is decoded with each entry of ``encodings`` in turn; the first
    encoding that does not raise ``UnicodeDecodeError`` wins.

    Args:
        file_path: Path of the CSV file, passed to ``pandas.read_csv``.
        encodings: Encoding name, or sequence of encoding names, to try in
            order. Defaults to Windows-1252 with a latin1 fallback.

    Returns:
        Tuple ``(texts, labels)`` holding the ``.values`` of the ``text`` and
        ``label`` columns.

    Raises:
        ValueError: If ``encodings`` is empty.
        UnicodeDecodeError: If the file cannot be decoded with any encoding.
        KeyError: If the ``text`` or ``label`` column is missing.
    """
    if isinstance(encodings, str):
        encodings = (encodings,)
    encodings = tuple(encodings)
    if not encodings:
        raise ValueError("encodings must contain at least one encoding name")

    last_position = len(encodings) - 1
    for position, encoding in enumerate(encodings):
        try:
            df = pd.read_csv(file_path, encoding=encoding)
        except UnicodeDecodeError:
            if position == last_position:
                # Nothing left to fall back to; surface the decode error.
                raise
            print(
                f"Error: Unable to decode file with {encoding} encoding. "
                f"Trying {encodings[position + 1]} encoding..."
            )
        else:
            break

    missing = [column for column in ('text', 'label') if column not in df.columns]
    if missing:
        raise KeyError(
            f"{file_path!r} is missing required column(s): {missing}"
        )

    texts = df['text'].values
    labels = df['label'].values
    return texts, labels

# Evaluate model
def evaluate_model(model, dataloader, device):
    """Run the model over ``dataloader`` and compute classification metrics.

    Args:
        model: Model whose output exposes ``.logits``; it is switched to eval
            mode and called as ``model(input_ids, attention_mask=...)``.
        dataloader: Iterable of batches providing ``input_ids``,
            ``attention_mask``, ``labels`` and ``index`` tensors.
        device: Device the input tensors are moved to before the forward pass.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, report, response_time,
        sample_indices, true_labels, predictions)``. ``precision``, ``recall``
        and ``f1`` use ``average='binary'``; ``report`` is the text
        classification report; ``response_time`` is the wall-clock duration of
        the whole loop in seconds (batch loading included); the last three
        entries are per-sample lists in iteration order.
    """
    model.eval()
    predictions = []
    true_labels = []
    sample_indices = []

    # perf_counter is monotonic, so the measured interval cannot be skewed by
    # system clock adjustments.
    start_time = time.perf_counter()

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()

            predictions.extend(preds)
            # Labels and indices are only used on the host, so they are not
            # copied to the device and back.
            true_labels.extend(batch['labels'].cpu().numpy())
            sample_indices.extend(batch['index'].numpy())

    response_time = time.perf_counter() - start_time

    # Compute metrics. zero_division=0 keeps the degenerate "no positive
    # predictions / no positive samples" case at 0 without warnings.
    accuracy = accuracy_score(true_labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, predictions, average='binary', zero_division=0
    )
    # Explicit labels keep the report aligned with target_names even when one
    # class never appears in the data; without them sklearn raises ValueError.
    report = classification_report(
        true_labels,
        predictions,
        labels=[0, 1],
        target_names=['normal', 'fraud'],
        zero_division=0,
    )

    return accuracy, precision, recall, f1, report, response_time, sample_indices, true_labels, predictions

def main(
    model_path=r"C:\Users\HAN4COB\.conda\envs\test\AI_Enabled_Scam_Call_Detection\bert-spam-model2",  # Update with your model directory path
    test_file='Test_Dataset.csv',
    max_len=128,
    batch_size=16,
):
    """Evaluate a fine-tuned BERT classifier on a CSV test set and print results.

    Loads the tokenizer and model from ``model_path``, reads ``test_file``,
    runs the evaluation and prints the aggregate metrics, the classification
    report and a per-sample table of correct vs. predicted labels.

    Args:
        model_path: Directory holding the saved tokenizer and model.
        test_file: CSV file read through ``load_test_data``.
        max_len: Token length each text is padded / truncated to.
        batch_size: Number of samples per evaluation batch.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load tokenizer and model. ``use_safetensors`` only concerns weight
    # loading, so it is passed to the model and not to the tokenizer.
    tokenizer = BertTokenizer.from_pretrained(model_path)
    model = BertForSequenceClassification.from_pretrained(model_path, use_safetensors=True)
    model.to(device)

    # Load test data
    texts, labels = load_test_data(test_file)

    # Create test dataset and dataloader; shuffle stays off so the printed
    # sample indices follow file order.
    test_dataset = TextClassificationDataset(texts, labels, tokenizer, max_len)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Evaluate
    (
        accuracy,
        precision,
        recall,
        f1,
        report,
        response_time,
        sample_indices,
        true_labels,
        predictions,
    ) = evaluate_model(model, test_dataloader, device)

    # Convert numeric labels back to strings using the dataset's own mapping,
    # so encoding and decoding share a single source of truth.
    reverse_label_map = test_dataset.reverse_label_map
    true_labels_str = [reverse_label_map[label] for label in true_labels]
    predictions_str = [reverse_label_map[pred] for pred in predictions]

    # Print results
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"Response Time: {response_time:.2f} seconds")
    print("\nClassification Report:")
    print(report)

    # Print correct and predicted labels for each sample
    print("\nSample Predictions:")
    print("-" * 50)
    print(f"{'Sample Index':<15} {'Correct Label':<15} {'Predicted Label':<15}")
    print("-" * 50)
    for idx, true_label, pred_label in zip(sample_indices, true_labels_str, predictions_str):
        print(f"{idx:<15} {true_label:<15} {pred_label:<15}")

# Run the evaluation only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()

Accuracy: 0.9100
Precision: 1.0000
Recall: 0.8393
F1-Score: 0.9126
Response Time: 3.97 seconds

Classification Report:
              precision    recall  f1-score   support

      normal       0.83      1.00      0.91        44
       fraud       1.00      0.84      0.91        56

    accuracy                           0.91       100
   macro avg       0.92      0.92      0.91       100
weighted avg       0.93      0.91      0.91       100


Sample Predictions:
--------------------------------------------------
Sample Index    Correct Label   Predicted Label
--------------------------------------------------
0               fraud           normal         
1               fraud           normal         
2               fraud           normal         
3               fraud           normal         
4               fraud           fraud          
5               fraud           fraud          
6               fraud           normal         
7               fraud           fraud          