<a href="https://colab.research.google.com/github/alex-nuclearboy/goit-python-ds-project/blob/main/toxic_albert_model_and_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers datasets torch

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AlbertTokenizer, AlbertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch
import numpy as np

In [3]:
# Mount Google Drive inside the Colab VM so files stored there can be read
from google.colab import drive
drive.mount('/content/drive')

# Location of the cleaned training CSV on the mounted drive
dataset_path = '/content/drive/My Drive/toxic_comments_data/train_data_cleaned.csv'

# Read the CSV into a DataFrame; later cells refer to it as `df_train`
df_train = pd.read_csv(dataset_path)

Mounted at /content/drive


In [4]:
# List the column names of the training frame to see which ones hold the labels
print(df_train.columns)

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate', 'token_count'],
      dtype='object')


In [5]:
# Split data into training and validation sets.
# The six target columns are selected by name rather than by position, so the
# split does not silently pick up the wrong columns if the CSV gains, loses or
# reorders a helper column (the training file also carries `token_count`).
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

train_data, val_data, train_labels, val_labels = train_test_split(
    df_train['comment_text'],
    df_train[label_columns],
    test_size=0.20,   # hold out 20% of the rows for validation
    random_state=42   # fixed seed so the split is reproducible
)

In [6]:
# Report the dimensions of every piece produced by the train/validation split
for split_name, split in (
    ('Training data', train_data),
    ('Training labels', train_labels),
    ('Validation data', val_data),
    ('Validation labels', val_labels),
):
    print(f'{split_name} shape: {split.shape}')

Training data shape: (127656,)
Training labels shape: (127656, 6)
Validation data shape: (31915,)
Validation labels shape: (31915, 6)


In [7]:
# Token and Encode Function
def tokenize_and_encode(tokenizer, comments, labels, max_length=512, batch_size=1024):
    """Tokenize comments and pack them, together with their labels, into tensors.

    The tokenizer is invoked through its ``__call__`` interface on chunks of
    ``batch_size`` comments (the per-sample ``encode_plus`` API is deprecated).
    Every sequence is truncated or padded to exactly ``max_length`` tokens, so
    the result does not depend on how the comments are chunked.

    Args:
        tokenizer: Hugging Face tokenizer (any callable accepting the same
            keyword arguments and returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors).
        comments: Iterable of comment strings.
        labels: Array-like with one row of label values per comment.
        max_length: Fixed length every encoded sequence is brought to.
        batch_size: Number of comments handed to the tokenizer per call.

    Returns:
        Tuple ``(input_ids, attention_masks, labels)``. The first two have one
        row of ``max_length`` entries per comment; ``labels`` is a float32
        tensor built from the ``labels`` argument.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 or the number of
            comments differs from the number of label rows.
    """
    if batch_size < 1:
        raise ValueError(f'batch_size must be >= 1, got {batch_size}')

    # Materialize once so that Series, lists and generators all behave alike
    texts = list(comments)

    # Convert the labels to a PyTorch tensor with the data type float32
    labels = torch.tensor(labels, dtype=torch.float32)

    # A mismatch here would otherwise surface much later (or pair comments
    # with the wrong labels), so fail early with a clear message
    if len(labels) != len(texts):
        raise ValueError(
            f'Got {len(texts)} comments but {len(labels)} label rows'
        )

    # torch.cat cannot concatenate an empty list, so handle "no comments"
    # explicitly and return correctly shaped empty tensors
    if not texts:
        empty = torch.empty((0, max_length), dtype=torch.long)
        return empty, empty.clone(), labels

    id_chunks = []
    mask_chunks = []

    for start in range(0, len(texts), batch_size):
        encoded = tokenizer(
            texts[start:start + batch_size],
            # Add special tokens like [CLS] and [SEP]
            add_special_tokens=True,
            # Truncate or pad every comment to 'max_length'
            max_length=max_length,
            padding='max_length',
            truncation=True,
            # Return attention mask to mask padded tokens
            return_attention_mask=True,
            # Return PyTorch tensors
            return_tensors='pt',
        )
        id_chunks.append(encoded['input_ids'])
        mask_chunks.append(encoded['attention_mask'])

    # Stitch the per-chunk tensors back together along the sample dimension
    input_ids = torch.cat(id_chunks, dim=0)
    attention_masks = torch.cat(mask_chunks, dim=0)

    return input_ids, attention_masks, labels

In [8]:
# Tokenizer initialization
# Load the tokenizer published under the 'albert-base-v2' checkpoint name,
# the same checkpoint the classification model is loaded from.
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]



In [9]:
# Model Initialization
# The task is multi-label (six independent yes/no targets per comment).
# `problem_type` is stated explicitly so the loss used when `labels` are passed
# to the model does not depend on the library inferring the task from the
# dtype of the label tensor.
model = AlbertForSequenceClassification.from_pretrained(
    'albert-base-v2',
    num_labels=6,
    problem_type='multi_label_classification'
)

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Report how many training comments are missing
print("NaN values in train_data:")
print(train_data.isna().sum())

# Keep only the rows that actually contain a comment
train_data_cleaned = train_data[train_data.notna()]

# Pick the label rows that share an index with the surviving comments
train_labels_cleaned = train_labels.loc[train_data_cleaned.index]

# Confirm that neither the comments nor their labels contain NaN any more
print("NaN values in train_data_cleaned:")
print(train_data_cleaned.isna().sum())
print("NaN values in train_labels_cleaned:")
print(train_labels_cleaned.isna().sum())

NaN values in train_data:
7
NaN values in train_data_cleaned:
0
NaN values in train_labels_cleaned:
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64


In [11]:
# Report how many validation comments are missing
print("NaN values in val_data:")
print(val_data.isna().sum())

# Keep only the rows that actually contain a comment
val_data_cleaned = val_data[val_data.notna()]

# Pick the label rows that share an index with the surviving comments
val_labels_cleaned = val_labels.loc[val_data_cleaned.index]

# Confirm that neither the comments nor their labels contain NaN any more
print("NaN values in val_data_cleaned:")
print(val_data_cleaned.isna().sum())
print("NaN values in val_labels_cleaned:")
print(val_labels_cleaned.isna().sum())

NaN values in val_data:
2
NaN values in val_data_cleaned:
0
NaN values in val_labels_cleaned:
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64


In [12]:
# Prefer CUDA when PyTorch can see a GPU; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Move the model's parameters onto the chosen device
model = model.to(device)

In [13]:
# Encode the cleaned training comments and convert their labels to a tensor.
# The three results feed the training TensorDataset further down.
input_ids, attention_masks, labels = tokenize_and_encode(
    tokenizer=tokenizer,
    comments=train_data_cleaned,
    labels=train_labels_cleaned.values,
)

In [14]:
# Encode the cleaned validation comments and convert their labels to a tensor.
# NOTE: `val_labels` held the label DataFrame from the split until now; from
# this cell on the name refers to the returned tensor, so the NaN-cleaning cell
# for the validation set cannot be re-run after this one without redoing the split.
val_input_ids, val_attention_masks, val_labels = tokenize_and_encode(
    tokenizer=tokenizer,
    comments=val_data_cleaned,
    labels=val_labels_cleaned.values,
)

In [15]:
# Show that comments, token ids, attention masks and labels all line up row-wise
for caption, shape in (
    ('Training Comments :', train_data_cleaned.shape),
    ('Input Ids         :', input_ids.shape),
    ('Attention Mask    :', attention_masks.shape),
    ('Labels            :', labels.shape),
):
    print(caption, shape)

Training Comments : (127649,)
Input Ids         : torch.Size([127649, 512])
Attention Mask    : torch.Size([127649, 512])
Labels            : torch.Size([127649, 6])


In [16]:
# Check an encoded text with the corresponding text and labels (e.g., comment #25)
# `i` is a position (not an index label): it addresses the same row in the
# cleaned comments and in the tensors built from them.
i = 25
print('Example comment:',train_data_cleaned.values[i])
print('\nInput Ids:\n',input_ids[i])
# Decoding the ids back to text shows the special and padding tokens that were added
print('\nDecoded Ids:\n',tokenizer.decode(input_ids[i]))
print('\nAttention Mask:\n',attention_masks[i])
print('\nLabels:',labels[i])

Example comment: you, and no one else, have explained why this one word is vandalism. it cetainly appiles to him, and you are now saying that people have to decide wheter it applies by some sort of vote that is certainly not neutrality. on what basis is this word vanalism

Input Ids:
 tensor([    2,    42,    15,    17,    90,    53,   962,    15,    57,  2897,
          483,    48,    53,   833,    25, 29359,     9,    32,  4000,  5851,
          102,  4865,  3599,    18,    20,    61,    15,    17,    42,    50,
          130,  1148,    30,   148,    57,    20,  4073, 11153, 12382,    32,
        13169,    34,   109,  2058,    16,  2018,    30,    25,  3850,    52,
        23079,     9,    27,    98,  2239,    25,    48,   833,  1019,   192,
          756,     3,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            

In [17]:
from torch.utils.data import DataLoader, TensorDataset

# Number of examples per mini-batch, shared by every loader in the notebook
batch_size = 8

# Wrap the encoded tensors so each item is (input_ids, attention_mask, labels)
train_dataset = TensorDataset(input_ids, attention_masks, labels)
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)

# Training batches are reshuffled on every pass; validation keeps a fixed order
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)

In [18]:
# Check the train_loader data
print('Batch Size :',train_loader.batch_size)
# Pull a single batch; because the loader shuffles, the batch shown differs
# from run to run unless the random seed is fixed.
Batch = next(iter(train_loader))
# Batch[0] = input ids, Batch[1] = attention masks, Batch[2] = labels
# (the order in which the tensors were passed to TensorDataset)
print('Each Input ids shape :', Batch[0].shape)
print('Input ids :\n', Batch[0][0])
print('Corresponding Decoded text:\n', tokenizer.decode(Batch[0][0]))
print('Corresponding Attention Mask :\n', Batch[1][0])
print('Corresponding Label:', Batch[2][0])

Batch Size : 8
Each Input ids shape : torch.Size([8, 512])
Input ids :
 tensor([    2,  2680,  9317,  3141,   380,   127,    89, 11232,   312,  3376,
         1049,  3794,    56,  6843,   747,    35,    17,  5908,  8997,    86,
         1793,   154,  9392,    18,    20,    14,   354, 14026,  2002,    50,
        19429,    43,  3141,   380,    65,    95,    50,  5787,    20,   781,
           88,    25,    14,   312,  1145,    52,   113,  2967,   747,    35,
           56,  6843,    86,  1077,   154,  9192,  9065,     3,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,

In [19]:
# `transformers.AdamW` is deprecated and no longer shipped in recent releases,
# so the PyTorch implementation is used instead.
from torch.optim import AdamW

# Optimizer setup
# `eps` and `weight_decay` are pinned to the defaults of the former
# transformers class (1e-6 and 0.0); PyTorch's own defaults differ
# (1e-8 and 0.01) and would change the update rule.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)



In [20]:
# Function to train the model
def train_model(model, train_loader, optimizer, device, num_epochs, val_loader=None):
    """Fine-tune ``model`` and print the mean loss after every epoch.

    Each batch must unpack into ``(input_ids, attention_mask, labels)``. The
    model is called with ``labels`` so that its output carries ``.loss``.

    Args:
        model: Model whose forward output exposes ``.loss``.
        train_loader: Sized iterable of training batches.
        optimizer: Optimizer already bound to ``model``'s parameters.
        device: Device every batch tensor is moved to before the forward pass.
        num_epochs: Number of full passes over ``train_loader``.
        val_loader: Optional sized iterable of validation batches. When it is
            omitted, a notebook-level variable named ``val_loader`` is used if
            one exists (earlier versions of this function read that global
            unconditionally); if there is none, validation is skipped.

    Returns:
        None. One summary line is printed per epoch.
    """
    if val_loader is None:
        # Backward compatibility: existing calls do not pass a validation
        # loader and relied on the global of the same name.
        val_loader = globals().get('val_loader')

    def _mean(total, count):
        # An empty loader yields NaN instead of raising ZeroDivisionError
        return total / count if count else float('nan')

    # Loop through the specified number of epochs
    for epoch in range(num_epochs):
        # Set the model to training mode
        model.train()
        # Running sum of the batch losses for the current epoch
        total_loss = 0.0

        # Loop through the batches in the training data
        for batch in train_loader:
            input_ids, attention_mask, labels = [t.to(device) for t in batch]

            # Clear the gradients from the previous step to prevent accumulation
            optimizer.zero_grad()

            outputs = model(
                input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            optimizer.step()

        summary = f'Epoch {epoch+1}, Training Loss: {_mean(total_loss, len(train_loader))}'

        if val_loader is not None:
            model.eval()  # Set the model to evaluation mode
            val_loss = 0.0

            # Disable gradient computation during validation
            with torch.no_grad():
                for batch in val_loader:
                    input_ids, attention_mask, labels = [
                        t.to(device) for t in batch]

                    outputs = model(
                        input_ids, attention_mask=attention_mask, labels=labels)
                    val_loss += outputs.loss.item()

            summary += f',Validation loss:{_mean(val_loss, len(val_loader))}'

        # Print the average loss(es) for the current epoch
        print(summary)

In [21]:
# Fine-tune the model for three passes over the training data
train_model(
    model=model,
    train_loader=train_loader,
    optimizer=optimizer,
    device=device,
    num_epochs=3,
)

Epoch 1, Training Loss: 0.056317608065662814,Validation loss:0.049704621682692796
Epoch 2, Training Loss: 0.04687964002530933,Validation loss:0.046434779040909896
Epoch 3, Training Loss: 0.04187554124527808,Validation loss:0.04448216918815496


In [22]:
# Location of the cleaned test CSV on the mounted Google Drive
test_dataset_path = '/content/drive/My Drive/toxic_comments_data/test_cleaned.csv'

# Read the CSV into a DataFrame; later cells refer to it as `df_test`
df_test = pd.read_csv(test_dataset_path)

In [23]:
# Preview the first rows of the test frame to check its columns
df_test.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0001ea8717f6de06,thank you for understanding. i think very high...,0,0,0,0,0,0
1,000247e83dcc1211,dear god this site is horrible.,0,0,0,0,0,0
2,0002f87b16116a7f,somebody will invariably try to add religion r...,0,0,0,0,0,0
3,0003e1cccfd5a40a,it says it right there that it is a type. the ...,0,0,0,0,0,0
4,00059ace3e3e9a53,"before adding a new product to the list, make ...",0,0,0,0,0,0


In [24]:
# Extract the comment texts and labels from the test DataFrame.
# The labels are selected by column name rather than "everything from the
# third column on", so an extra or reordered column in the CSV cannot end up
# being treated as a label.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

test_data = df_test['comment_text']
test_labels = df_test[label_columns]

In [25]:
# Report the dimensions of the test comments and their label table
for part_name, part in (('Test data', test_data), ('Test labels', test_labels)):
    print(f'{part_name} shape: {part.shape}')

Test data shape: (63978,)
Test labels shape: (63978, 6)


In [26]:
# Report how many test comments are missing
print("NaN values in test_data:")
print(test_data.isna().sum())

# Keep only the rows that actually contain a comment
test_data_cleaned = test_data[test_data.notna()]

# Pick the label rows that share an index with the surviving comments
test_labels_cleaned = test_labels.loc[test_data_cleaned.index]

# Confirm that neither the comments nor their labels contain NaN any more
print("NaN values in test_data_cleaned:")
print(test_data_cleaned.isna().sum())
print("NaN values in test_labels_cleaned:")
print(test_labels_cleaned.isna().sum())

NaN values in test_data:
501
NaN values in test_data_cleaned:
0
NaN values in test_labels_cleaned:
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64


In [27]:
# Encode the cleaned test comments and convert their labels to a tensor.
# NOTE: `test_labels` held the label DataFrame until now; from this cell on the
# name refers to the returned tensor, so the NaN-cleaning cell for the test set
# cannot be re-run after this one without re-extracting the labels.
test_input_ids, test_attention_masks, test_labels = tokenize_and_encode(
    tokenizer=tokenizer,
    comments=test_data_cleaned,
    labels=test_labels_cleaned.values,
)

In [28]:
# Wrap the encoded test tensors and serve them in their original order,
# reusing the batch size chosen for the training and validation loaders
test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=batch_size)

In [29]:
# Evaluate the model
def evaluate_model(model, test_loader, device, threshold=0.5):
    """Run ``model`` over ``test_loader`` and print multi-label metrics.

    Each batch must unpack into ``(input_ids, attention_mask, labels)``.
    Logits are passed through a sigmoid and a label counts as predicted when
    its probability is strictly greater than ``threshold``.

    Args:
        model: Model whose forward output exposes ``.logits``.
        test_loader: Iterable of evaluation batches.
        device: Device every batch tensor is moved to before the forward pass.
        threshold: Probability cut-off applied to every label (default 0.5).

    Returns:
        None. Accuracy, micro-averaged precision, recall and F1 are printed.
        For 2-D label arrays scikit-learn's ``accuracy_score`` counts a sample
        as correct only when all of its labels match.

    Raises:
        ValueError: If ``test_loader`` yields no batches.
    """
    model.eval()  # Set the model to evaluation mode

    true_labels = []  # Per-batch arrays of true labels
    predicted_probs = []  # Per-batch arrays of predicted probabilities

    # Disable gradient computation for evaluation
    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, labels = [t.to(device) for t in batch]

            # Labels are deliberately not passed: only the logits are needed
            outputs = model(input_ids, attention_mask=attention_mask)

            # Independent sigmoid per label (multi-label), moved back to the CPU
            predicted_probs.append(torch.sigmoid(outputs.logits).cpu().numpy())
            true_labels.append(labels.cpu().numpy())

    # np.concatenate cannot handle an empty list; give a clearer message
    if not true_labels:
        raise ValueError('test_loader yielded no batches; nothing to evaluate')

    # Combine predictions and labels for evaluation
    true_labels = np.concatenate(true_labels, axis=0)
    predicted_probs = np.concatenate(predicted_probs, axis=0)

    # Turn probabilities into 0/1 decisions
    predicted_labels = (predicted_probs > threshold).astype(int)

    # Calculate evaluation metrics. zero_division=0 keeps the reported value
    # (0) for the "no positives at all" case without emitting a warning.
    accuracy = accuracy_score(true_labels, predicted_labels)
    precision = precision_score(
        true_labels, predicted_labels, average='micro', zero_division=0)
    recall = recall_score(
        true_labels, predicted_labels, average='micro', zero_division=0)
    f1 = f1_score(
        true_labels, predicted_labels, average='micro', zero_division=0)

    # Print the evaluation metrics
    print(f'Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1 Score: {f1:.4f}')

In [30]:
# Score the fine-tuned model on the held-out test batches
evaluate_model(model=model, test_loader=test_loader, device=device)

Accuracy: 0.8831
Precision: 0.6193
Recall: 0.6876
F1 Score: 0.6517


In [31]:
import os

# Define the output directory for the model and tokenizer
output_dir = "/content/drive/My Drive/toxic_albert_model"

# Create the directory (and any missing parents). `exist_ok=True` replaces the
# separate existence check, so there is no window between checking and
# creating, and the cell can be re-run safely.
os.makedirs(output_dir, exist_ok=True)

# Save the model's weights and configuration
model.save_pretrained(output_dir)

# Save the tokenizer's configuration and vocabulary
tokenizer.save_pretrained(output_dir)

# Additionally save the raw state dict for loading with plain PyTorch
torch.save(model.state_dict(), os.path.join(output_dir, 'toxic_albert_model_weights.pth'))

In [32]:
from safetensors.torch import save_file

# Save the model in safetensors format.
# NOTE(review): this path is the same file name `save_pretrained` uses for its
# weights file, so this call replaces that file. The "format" metadata entry is
# what transformers checks when it opens a safetensors checkpoint; it is
# written here so the directory stays loadable with `from_pretrained`.
save_file(
    model.state_dict(),
    os.path.join(output_dir, 'model.safetensors'),
    metadata={'format': 'pt'}
)

In [33]:
import pickle

# Serialize the whole model object next to the other saved artifacts.
# A pickle can only be loaded where the model's class is importable, and
# unpickling executes code, so only load files from a trusted source.
pickle_path = os.path.join(output_dir, 'toxic_albert_model.pkl')
with open(pickle_path, 'wb') as fh:
    pickle.dump(model, fh)