# Install & Import Dependencies

In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.4/7.4 MB 61.3 MB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 268.8/268.8 kB 34.1 MB/s eta 0:00:00
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.8/7.8 MB 117.6 MB/s eta 0:00:00
Collecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.3/1.3 MB 84.6 MB/s eta 0:00:

In [3]:
import pandas as pd
import numpy as np

import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer,  BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split

# Load Data

In [5]:
# Read the training CSV from the working directory, then preview its first five rows.
df_train = pd.read_csv(filepath_or_buffer='train.csv')
df_train.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [6]:
# Features: the raw comment strings as a 1-D array.
X = df_train['comment_text'].to_numpy()
# Targets: every column from position 2 onward, as a 2-D array.
y = df_train.iloc[:, 2:].to_numpy()

In [1]:
# Separate the data into train and test sets: 20% is held out,
# and the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

NameError: name 'train_test_split' is not defined

In [8]:
# Shapes of the four split arrays, in (train X, train y, test X, test y) order.
tuple(split.shape for split in (X_train, y_train, X_test, y_test))

((127656,), (127656, 6), (31915,), (31915, 6))

# Define Tokenizer & Model

In [9]:
# Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint,
# asking it to lower-case the input text.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [10]:
# Each comment carries six independent 0/1 labels, i.e. this is multi-label
# classification. Declaring problem_type explicitly selects the per-label
# binary cross-entropy loss up front instead of leaving it to be inferred
# from the dtype of the labels on the first forward pass.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=6,
    problem_type='multi_label_classification',
)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
# Both branches are plain device strings, so torch.device() is built exactly once.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move the model onto the device selected above and rebind the name to the result.
model = model.to(device)

In [12]:
# Tokenize-and-encode helper
def tokenize_and_encode(tokenizer, comments, labels, max_length=128):
    """Turn raw comments into fixed-length model inputs plus a float label tensor.

    Args:
        tokenizer: Callable tokenizer (e.g. a Hugging Face ``BertTokenizer``),
            invoked as ``tokenizer(text, ...)`` and expected to return a mapping
            with ``'input_ids'`` and ``'attention_mask'`` tensors of shape
            ``(1, max_length)``.
        comments: Iterable of comment strings.
        labels: Array-like of label rows, one row per comment.
        max_length: Number of token positions per comment. Longer comments are
            truncated and shorter ones are padded to exactly this length.

    Returns:
        A tuple ``(input_ids, attention_masks, labels)``. The first two are the
        per-comment encodings concatenated along dim 0; ``labels`` is a
        ``torch.float32`` tensor built from the ``labels`` argument.
    """
    input_ids = []
    attention_masks = []

    for comment in comments:
        encoded = tokenizer(
            comment,
            # Add special tokens like [CLS] and [SEP]
            add_special_tokens=True,
            max_length=max_length,
            # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`
            padding='max_length',
            # Truncate explicitly; relying on the implicit default triggers a warning
            truncation=True,
            # Return attention mask to mask padded tokens
            return_attention_mask=True,
            # Return PyTorch tensors
            return_tensors='pt',
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    if input_ids:
        # Stack the (1, max_length) rows into (num_comments, max_length) tensors
        input_ids = torch.cat(input_ids, dim=0)
        attention_masks = torch.cat(attention_masks, dim=0)
    else:
        # torch.cat rejects an empty list; return correctly shaped empty tensors instead
        input_ids = torch.empty((0, max_length), dtype=torch.long)
        attention_masks = torch.empty((0, max_length), dtype=torch.long)

    # Convert the labels to a PyTorch tensor with the data type float32
    labels = torch.tensor(labels, dtype=torch.float32)

    return input_ids, attention_masks, labels

In [13]:
# Encode the training split, leaving max_length at the function's default.
input_ids, attention_masks, labels = tokenize_and_encode(tokenizer, X_train, y_train)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [18]:
# Encode the held-out split the same way as the training split.
test_input_ids, test_attention_masks, test_labels = tokenize_and_encode(tokenizer, X_test, y_test)



In [20]:
# Report the number of training comments and the shape of each encoded tensor.
for caption, shape in (
    ('Training Comments: ', X_train.shape),
    ('Input Ids: ', input_ids.shape),
    ('Attention Mask: ', attention_masks.shape),
    ('Labels: ', labels.shape),
):
    print(caption, shape)

Training Comments:  (127656,)
Input Ids:  torch.Size([127656, 128])
Attention Mask:  torch.Size([127656, 128])
Labels:  torch.Size([127656, 6])


In [26]:
# Inspect one training example end to end: raw text, token ids,
# the ids decoded back to text, the attention mask and the labels.
sample_idx = 53
sample_ids = input_ids[sample_idx]

print("Comment -> ", X_train[sample_idx])
print("Input ids ->", sample_ids)
print("Decode -> ", tokenizer.decode(sample_ids))
print("Attention Mask ->", attention_masks[sample_idx])
print("Labels ->", labels[sample_idx])

Comment ->  This is not everyones fault. 

This IP address belongs to our school, therfore what happens if the schools IP is blocked because of a few naughty students?

You could notify the school and request suspension of those students in particular.
Input ids -> tensor([  101,  2023,  2003,  2025,  3071,  2015,  6346,  1012,  2023, 12997,
         4769,  7460,  2000,  2256,  2082,  1010,  1996, 12881,  5686,  2054,
         6433,  2065,  1996,  2816, 12997,  2003,  8534,  2138,  1997,  1037,
         2261, 20355,  2493,  1029,  2017,  2071,  2025,  8757,  1996,  2082,
         1998,  5227,  8636,  1997,  2216,  2493,  1999,  3327,  1012,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,    

# Create PyTorch Data Loaders

In [29]:
# Bundle the encoded tensors into datasets (ids, masks, labels per example).
train_data = TensorDataset(input_ids, attention_masks, labels)
test_data = TensorDataset(test_input_ids, test_attention_masks, test_labels)

# Batches of 64; only the training loader shuffles its examples.
train_loader = DataLoader(dataset=train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=64, shuffle=False)

In [44]:
# Pull a single batch from the training loader (handy for a quick sanity check).
batch = next(iter(train_loader))

# Use PyTorch's own AdamW: the AdamW class exported by transformers is deprecated.
# eps and weight_decay are set to the defaults of the transformers implementation
# being replaced, so the optimisation settings stay the same.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=2e-5,
    eps=1e-6,
    weight_decay=0.0,
)



# Train the Model

In [48]:
# Fine-tune for three epochs and report the mean training loss of each epoch.
# The per-batch tensors get their own names so the notebook-level `input_ids`,
# `attention_masks` and `labels` (the full encoded training set) are not
# overwritten; cells that read them still work after training.
for epoch in range(3):
    model.train()
    total_loss = 0.0

    for batch in train_loader:
        # Each batch is (input ids, attention masks, labels); move all three to the device.
        batch_input_ids, batch_attention_masks, batch_labels = (
            t.to(device) for t in batch
        )

        optimizer.zero_grad()

        # Passing labels makes the model compute the loss as part of the forward pass.
        output = model(
            batch_input_ids,
            attention_mask=batch_attention_masks,
            labels=batch_labels,
        )

        loss = output.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    print(f'epoch {epoch + 1}, training loss {total_loss/len(train_loader)}')

epoch 1, training loss 0.041556053206950126
epoch 2, training loss 0.032552883567820705
epoch 3, training loss 0.026121297561887996
