<a href="https://colab.research.google.com/github/alex-nuclearboy/goit-python-ds-project/blob/main/toxic_albert_model_test_2_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Test of the ALBERT model trained on the `train_claud_2_0` dataset

In [1]:
!pip install torch
!pip install transformers



In [2]:
import pandas as pd
import numpy as np
import torch
from transformers import AlbertTokenizer, AlbertForSequenceClassification
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [3]:
# Mount Google Drive so files stored there are readable under /content/drive
from google.colab import drive

drive.mount('/content/drive')

# Location of the cleaned test split on the mounted Drive
test_dataset_path = '/content/drive/My Drive/toxic_comments_data/test_cleaned.csv'

# Read the CSV into a DataFrame
df_test = pd.read_csv(filepath_or_buffer=test_dataset_path)

Mounted at /content/drive


In [4]:
# Preview the first five rows of the test set
df_test.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0001ea8717f6de06,thank you for understanding. i think very high...,0,0,0,0,0,0
1,000247e83dcc1211,dear god this site is horrible.,0,0,0,0,0,0
2,0002f87b16116a7f,somebody will invariably try to add religion r...,0,0,0,0,0,0
3,0003e1cccfd5a40a,it says it right there that it is a type. the ...,0,0,0,0,0,0
4,00059ace3e3e9a53,"before adding a new product to the list, make ...",0,0,0,0,0,0


In [5]:
# Label columns: everything from the third column onwards (after id and comment_text)
test_labels = df_test.iloc[:, 2:]

# Raw comment texts that will be fed to the tokenizer
test_data = df_test.loc[:, 'comment_text']

In [6]:
# Sanity-check that texts and labels have the same number of rows
print(
    f'Test data shape: {test_data.shape}',
    f'Test labels shape: {test_labels.shape}',
    sep='\n',
)

Test data shape: (63978,)
Test labels shape: (63978, 6)


In [7]:
# Report how many comment texts are missing before filtering
print("NaN values in test_data:")
print(test_data.isna().sum())

# Keep only rows with a comment text; the same boolean mask is applied to the
# labels so both stay aligned on the original index
has_text = test_data.notna()
test_data_cleaned = test_data[has_text]
test_labels_cleaned = test_labels[has_text]

# Confirm that nothing missing is left in either object
print("NaN values in test_data_cleaned:")
print(test_data_cleaned.isna().sum())
print("NaN values in test_labels_cleaned:")
print(test_labels_cleaned.isna().sum())

NaN values in test_data:
501
NaN values in test_data_cleaned:
0
NaN values in test_labels_cleaned:
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64


In [8]:
# Tokenize-and-encode helper
def tokenize_and_encode(tokenizer, comments, labels, max_length=512):
    """Tokenize comments and convert them, together with their labels, to tensors.

    Each comment is encoded on its own (rather than as one large batch) so that
    only fixed-size tensors accumulate in memory.

    Args:
        tokenizer: Callable tokenizer (e.g. the ALBERT tokenizer) that accepts a
            string plus the keyword arguments used below and returns a mapping
            with 'input_ids' and 'attention_mask' tensors of shape
            (1, max_length).
        comments: Iterable of comment strings.
        labels: Array-like of labels, one row per comment.
        max_length: Length every sequence is truncated or padded to.

    Returns:
        Tuple ``(input_ids, attention_masks, labels)``. The first two are the
        per-comment tensors concatenated along dim 0; ``labels`` is a float32
        tensor. For empty ``comments`` the first two are empty tensors of shape
        (0, max_length).
    """
    input_ids = []
    attention_masks = []

    for comment in comments:
        # Calling the tokenizer directly replaces the deprecated `encode_plus`
        encoded = tokenizer(
            comment,
            # Add the model's special tokens
            add_special_tokens=True,
            # Truncate or pad every comment to exactly `max_length` tokens
            max_length=max_length,
            padding='max_length',
            truncation=True,
            # The attention mask marks which positions are padding
            return_attention_mask=True,
            # Return PyTorch tensors
            return_tensors='pt',
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    if input_ids:
        input_ids = torch.cat(input_ids, dim=0)
        attention_masks = torch.cat(attention_masks, dim=0)
    else:
        # torch.cat rejects an empty list, so build empty tensors explicitly
        width = max_length or 0
        input_ids = torch.empty((0, width), dtype=torch.long)
        attention_masks = torch.empty((0, width), dtype=torch.long)

    # Labels are stored as float32
    labels = torch.tensor(labels, dtype=torch.float32)

    return input_ids, attention_masks, labels

In [9]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [10]:
# Saved directory on Drive that the tokenizer and model are loaded from
model_name = "/content/drive/My Drive/model_output_2_0"

# Tokenizer files are read from the same directory as the model
tokenizer = AlbertTokenizer.from_pretrained(model_name)

# Load the classifier and move it to the selected device in one step
model = AlbertForSequenceClassification.from_pretrained(model_name).to(device)

In [11]:
# Encode the cleaned test comments and convert their labels to tensors.
# NOTE: this rebinds `test_labels` from a DataFrame to a tensor, so the earlier
# cells that use `test_labels` as a DataFrame cannot be re-run after this one.
test_input_ids, test_attention_masks, test_labels = tokenize_and_encode(
    tokenizer=tokenizer,
    comments=test_data_cleaned,
    labels=test_labels_cleaned.to_numpy(),
)

In [12]:
from torch.utils.data import DataLoader, TensorDataset

# Number of comments evaluated per forward pass
batch_size = 8

# Bundle inputs, masks and labels so each batch yields all three together
test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)

# Iterate the test set in batches without shuffling
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [13]:
# Evaluate the model
def evaluate_model(model, test_loader, device, label_names, threshold=0.5):
    """Evaluate a multi-label classifier on a test set and print its metrics.

    Args:
        model: Model whose forward call accepts ``(input_ids, attention_mask=...)``
            and returns an object with a ``logits`` attribute.
        test_loader: Iterable yielding ``(input_ids, attention_mask, labels)``
            tensor batches.
        device: Device the batch tensors are moved to before the forward pass.
        label_names: Names of the labels, in the same order as the label columns.
        threshold: A label is predicted positive when its sigmoid probability
            is strictly greater than this value.

    Raises:
        ValueError: If ``test_loader`` yields no batches.

    Prints overall accuracy plus micro-averaged precision, recall and F1,
    followed by precision, recall and F1 for every label. Note that
    scikit-learn's ``accuracy_score`` on multi-label input is subset accuracy:
    a sample counts as correct only when all of its labels match.
    """
    model.eval()  # Set the model to evaluation mode

    true_batches = []  # True labels, one array per batch
    prob_batches = []  # Predicted probabilities, one array per batch

    # Disable gradient computation for evaluation
    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, labels = [t.to(device) for t in batch]

            outputs = model(input_ids, attention_mask=attention_mask)

            # Sigmoid (not softmax): each label is an independent yes/no decision
            prob_batches.append(torch.sigmoid(outputs.logits).cpu().numpy())
            true_batches.append(labels.cpu().numpy())

    if not true_batches:
        # np.concatenate would raise a less helpful ValueError on an empty list
        raise ValueError("test_loader yielded no batches; nothing to evaluate")

    # Combine the per-batch arrays for evaluation
    true_labels = np.concatenate(true_batches, axis=0)
    predicted_probs = np.concatenate(prob_batches, axis=0)

    # Turn probabilities into binary decisions
    predicted_labels = (predicted_probs > threshold).astype(int)

    # zero_division=0 keeps the value scikit-learn already reports for an
    # undefined metric (0.0) without emitting an UndefinedMetricWarning
    prf_kwargs = {'zero_division': 0}

    # Overall performance (micro-averaged across all labels)
    accuracy = accuracy_score(true_labels, predicted_labels)
    precision = precision_score(true_labels, predicted_labels, average='micro', **prf_kwargs)
    recall = recall_score(true_labels, predicted_labels, average='micro', **prf_kwargs)
    f1 = f1_score(true_labels, predicted_labels, average='micro', **prf_kwargs)

    # Per-label performance (average=None returns one score per label column)
    precision_per_label = precision_score(true_labels, predicted_labels, average=None, **prf_kwargs)
    recall_per_label = recall_score(true_labels, predicted_labels, average=None, **prf_kwargs)
    f1_per_label = f1_score(true_labels, predicted_labels, average=None, **prf_kwargs)

    # Print the overall evaluation metrics
    print(f'Overall Accuracy: {accuracy:.4f}')
    print(f'Overall Precision: {precision:.4f}')
    print(f'Overall Recall: {recall:.4f}')
    print(f'Overall F1 Score: {f1:.4f}')

    # Print evaluation metrics for each label
    print("\nEvaluation per label:")
    for i, label in enumerate(label_names):
        print(f'{label}:')
        print(f'  Precision: {precision_per_label[i]:.4f}')
        print(f'  Recall: {recall_per_label[i]:.4f}')
        print(f'  F1 Score: {f1_per_label[i]:.4f}')

In [14]:
# Label names, in the same order as the label columns of the test set
label_names = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Evaluate the loaded model on the test data and print the metrics
evaluate_model(
    model=model,
    test_loader=test_loader,
    device=device,
    label_names=label_names,
)

Overall Accuracy: 0.8769
Overall Precision: 0.6083
Overall Recall: 0.7426
Overall F1 Score: 0.6688

Evaluation per label:
toxic:
  Precision: 0.5614
  Recall: 0.8710
  F1 Score: 0.6827
severe_toxic:
  Precision: 0.4028
  Recall: 0.3994
  F1 Score: 0.4011
obscene:
  Precision: 0.6715
  Recall: 0.7372
  F1 Score: 0.7028
threat:
  Precision: 0.0000
  Recall: 0.0000
  F1 Score: 0.0000
insult:
  Precision: 0.6705
  Recall: 0.6780
  F1 Score: 0.6743
identity_hate:
  Precision: 0.7348
  Recall: 0.3768
  F1 Score: 0.4981


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
