# BERT on its own (BertForSequenceClassification)

In [1]:
# List the compute devices TensorFlow can see (presumably to confirm that a GPU is attached to the runtime).
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 14689491314097640236
 xla_global_id: -1,
 name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 15290466304
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 13011122073392264125
 physical_device_desc: "device: 0, name: Tesla V100-SXM2-16GB, pci bus id: 0000:00:04.0, compute capability: 7.0"
 xla_global_id: 416903419]

In [2]:
!pip install transformers



In [3]:
import pandas as pd

In [4]:
# Source CSV for the data frame used for fine-tuning below (hosted in the project's GitHub repository)
file_path = 'https://raw.githubusercontent.com/vaamps/cyberbullying-detection/main/datasets/output_sentiment.csv'

# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)
# Keep a handle on the 'Text' column
text_column = df['Text']

In [5]:
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK resources: the stopword lists (used below to filter the vocabulary)
# and the 'punkt' tokenizer data.
nltk.download('stopwords')
nltk.download('punkt')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [6]:
# Build a lowercase vocabulary of negative words with English stopwords removed.
import requests

# URL of the vocab file
vocab_file_path = 'https://raw.githubusercontent.com/vaamps/cyberbullying-detection/main/datasets/negative_words.txt'

# Fetch the word list. Fail loudly on a bad response instead of continuing with
# `vocab_words` undefined (which previously surfaced as a NameError further down).
response = requests.get(vocab_file_path, timeout=30)
response.raise_for_status()

# str.split() with no arguments already strips whitespace and never yields empty tokens.
vocab_words = set(response.text.split())
# Report the size rather than dumping the whole set into the cell output.
print(f"Loaded {len(vocab_words)} vocabulary words")

# Build the stopword set once: stopwords.words() returns a list, so the original
# comprehension rebuilt and linearly scanned it for every vocabulary word.
english_stopwords = set(stopwords.words('english'))

# Remove stopwords and create a new vocabulary
filtered_vocabulary = {word.lower() for word in vocab_words} - english_stopwords



In [7]:
import numpy as np

In [8]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset, random_split
import torch
import numpy as np
from sklearn.metrics import accuracy_score

In [9]:
print(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

cuda


In [10]:
# Cast the label column to int; it is converted to a tensor of class ids for training below.
df['oh_label'] = df['oh_label'].astype(int)

In [11]:


# Fine-tuning setup for the binary classifier: tokenize df['Text'], pair it with
# df['oh_label'], split 80/20 and build the model, optimizer and LR schedule.

# Seed torch's global RNG so the DataLoader shuffle, the freshly initialised
# classifier head and dropout are repeatable after a kernel restart.
SEED = 42
torch.manual_seed(SEED)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the text (padded to the longest sequence, truncated at 512 tokens)
encodings = tokenizer(df['Text'].tolist(), truncation=True, padding=True, max_length=512)

# Prepare the dataset
inputs = torch.tensor(encodings['input_ids'])
masks = torch.tensor(encodings['attention_mask'])
labels = torch.tensor(df['oh_label'].values)

# Create the dataset
dataset = TensorDataset(inputs, masks, labels)

# Split the data into training and validation sets.
# A dedicated seeded generator keeps the split identical from run to run.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
split_generator = torch.Generator().manual_seed(SEED)
train_dataset, val_dataset = random_split(dataset, [train_size, val_size], generator=split_generator)

# Create DataLoaders
batch_size = 16
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
validation_dataloader = DataLoader(val_dataset, batch_size=batch_size)

# Initialize the model (two output classes)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer and scheduler: linear decay over every optimizer step, no warm-up
optimizer = AdamW(model.parameters(), lr=2e-5)
epochs = 4
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Batch accuracy helper
def flat_accuracy(preds, labels):
    """Return the accuracy of the arg-max predictions in `preds` against `labels`.

    The predicted class of each row is the arg-max of `preds` along axis 1;
    both the predictions and `labels` are flattened before being compared.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    return accuracy_score(labels.flatten(), predicted_classes)

# Training / validation loop.
# Accuracy is accumulated per sample (batch accuracy weighted by batch size), so a
# shorter final batch no longer skews the epoch average the way a plain mean of
# per-batch accuracies does.

# Initialize accumulators for overall average calculation
total_train_accuracy = 0
total_val_accuracy = 0

for epoch_i in range(epochs):
    # ---- Training ----
    model.train()
    train_correct = 0.0
    train_seen = 0

    for batch in train_dataloader:
        b_input_ids, b_attention_mask, b_labels = (t.to(device) for t in batch)

        model.zero_grad()

        # Passing labels makes the model return the classification loss with the logits.
        outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_labels)
        loss = outputs.loss

        logits = outputs.logits.detach().cpu().numpy()
        label_ids = b_labels.cpu().numpy()
        train_correct += flat_accuracy(logits, label_ids) * len(label_ids)
        train_seen += len(label_ids)

        loss.backward()
        # Clip the gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    # max(..., 1) avoids a ZeroDivisionError if the loader is empty.
    avg_epoch_train_accuracy = train_correct / max(train_seen, 1)
    total_train_accuracy += avg_epoch_train_accuracy
    print(f"Epoch {epoch_i + 1}")
    print(f"  Training Accuracy: {avg_epoch_train_accuracy}")

    # ---- Validation ----
    model.eval()
    val_correct = 0.0
    val_seen = 0
    for batch in validation_dataloader:
        b_input_ids, b_attention_mask, b_labels = batch
        b_input_ids = b_input_ids.to(device)
        b_attention_mask = b_attention_mask.to(device)

        # Only the logits are needed here, so labels are not passed and no loss is computed.
        with torch.no_grad():
            outputs = model(b_input_ids, attention_mask=b_attention_mask)

        logits = outputs.logits.detach().cpu().numpy()
        label_ids = b_labels.cpu().numpy()
        val_correct += flat_accuracy(logits, label_ids) * len(label_ids)
        val_seen += len(label_ids)

    avg_epoch_val_accuracy = val_correct / max(val_seen, 1)
    total_val_accuracy += avg_epoch_val_accuracy
    print(f"  Validation Accuracy: {avg_epoch_val_accuracy}")

# Calculate overall average accuracy across all epochs
overall_avg_train_accuracy = total_train_accuracy / epochs
overall_avg_val_accuracy = total_val_accuracy / epochs
print(f"Overall Average Training Accuracy: {overall_avg_train_accuracy}")
print(f"Overall Average Validation Accuracy: {overall_avg_val_accuracy}")

print("Training complete!")


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1
  Training Accuracy: 0.9242
  Validation Accuracy: 0.9335
Epoch 2
  Training Accuracy: 0.9540125
  Validation Accuracy: 0.93655
Epoch 3
  Training Accuracy: 0.9766875
  Validation Accuracy: 0.93235
Epoch 4
  Training Accuracy: 0.988775
  Validation Accuracy: 0.933
Overall Average Training Accuracy: 0.96091875
Overall Average Validation Accuracy: 0.9338500000000001
Training complete!


In [12]:
df['oh_label'].unique()

array([0, 1])

In [13]:
# Persist only the fine-tuned weights (state_dict) of the current model.
torch.save(model.state_dict(), 'bert_sequence_classification_model.pth')


In [14]:
# Write the tokenizer files to the relative directory 'bert_tokenizer'.
tokenizer.save_pretrained('bert_tokenizer')


('bert_tokenizer/tokenizer_config.json',
 'bert_tokenizer/special_tokens_map.json',
 'bert_tokenizer/vocab.txt',
 'bert_tokenizer/added_tokens.json')

In [15]:
import shutil
# Zip the saved tokenizer directory. NOTE(review): the absolute /content paths only match the
# relative 'bert_tokenizer' save location if the working directory is /content — confirm.
shutil.make_archive('/content/bert_tokenizer', 'zip', '/content/bert_tokenizer')

'/content/bert_tokenizer.zip'

Load the model back:

In [16]:
# Rebuild the 2-class architecture, then restore the fine-tuned weights saved earlier.
# The "newly initialized" classifier warning printed by from_pretrained is expected:
# those weights are overwritten by load_state_dict below.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
# map_location remaps the stored tensors onto `device`, so a checkpoint saved from a
# GPU session also loads on a CPU-only runtime.
model.load_state_dict(torch.load('bert_sequence_classification_model.pth', map_location=device))
model.to(device)  # Make sure to use the same device as before
# Using V100 GPU no High RAM

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [17]:
# Reload the tokenizer from the local 'bert_tokenizer' directory written earlier.
tokenizer = BertTokenizer.from_pretrained('bert_tokenizer')


In [18]:
# Colab only: mount Google Drive at /content/drive so artefacts can be copied there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [19]:
# Copy the saved binary-classifier weights to the DMFPBert folder on the mounted Drive.
!rsync -r --progress "/content/bert_sequence_classification_model.pth" "/content/drive/MyDrive/DMFPBert"

sending incremental file list
bert_sequence_classification_model.pth
    438,022,335 100%  348.96MB/s    0:00:01 (xfr#1, to-chk=0/1)


# BERT Annotations training

In [20]:
# Encode the string labels in df['Annotation'] as integer class ids.
label_mapping = {'racism': 0, 'sexism': 1, 'toxicity': 2, 'none': 3}

# Series.map turns every value that is missing from the mapping into NaN, which makes
# unexpected or missing labels detectable. The previous .replace() silently left such
# values in place, and the problem only surfaced later as an obscure tensor/dtype error.
numeric_labels = df['Annotation'].map(label_mapping)
unmapped_mask = numeric_labels.isna()
if unmapped_mask.any():
    unexpected = df.loc[unmapped_mask, 'Annotation'].unique().tolist()
    raise ValueError(f"Annotation values without a class id: {unexpected}")

# Cast explicitly so the tensor built below holds int64 class ids.
df['numeric_labels'] = numeric_labels.astype('int64')
nlabels = torch.tensor(df['numeric_labels'].values)


In [24]:


# Fine-tuning setup for the 4-class annotation classifier: tokenize df['Text'], pair it
# with the integer annotation labels in `nlabels`, split 80/20 and build the model,
# optimizer and LR schedule.

# Seed torch's global RNG so the DataLoader shuffle, the freshly initialised
# classifier head and dropout are repeatable after a kernel restart.
SEED = 42
torch.manual_seed(SEED)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the text (padded to the longest sequence, truncated at 512 tokens)
encodings = tokenizer(df['Text'].tolist(), truncation=True, padding=True, max_length=512)

# Prepare the dataset
inputs = torch.tensor(encodings['input_ids'])
masks = torch.tensor(encodings['attention_mask'])

# Create the dataset (labels are the annotation class ids, not 'oh_label')
dataset = TensorDataset(inputs, masks, nlabels)

# Split the data into training and validation sets.
# A dedicated seeded generator keeps the split identical from run to run.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
split_generator = torch.Generator().manual_seed(SEED)
train_dataset, val_dataset = random_split(dataset, [train_size, val_size], generator=split_generator)

# Create DataLoaders
batch_size = 16
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
validation_dataloader = DataLoader(val_dataset, batch_size=batch_size)

# Initialize the model (four output classes)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer and scheduler: linear decay over every optimizer step, no warm-up
optimizer = AdamW(model.parameters(), lr=2e-5)
epochs = 3
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Batch accuracy helper
def flat_accuracy(preds, nlabels):
    """Return the accuracy of the arg-max predictions in `preds` against `nlabels`.

    The predicted class of each row is the arg-max of `preds` along axis 1;
    both the predictions and `nlabels` are flattened before being compared.
    Inside this function the `nlabels` parameter shadows the notebook-level
    variable of the same name.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    return accuracy_score(nlabels.flatten(), predicted_classes)

# Training / validation loop.
# Accuracy is accumulated per sample (batch accuracy weighted by batch size), so a
# shorter final batch no longer skews the epoch average the way a plain mean of
# per-batch accuracies does.

# Initialize accumulators for overall average calculation
total_train_accuracy = 0
total_val_accuracy = 0

for epoch_i in range(epochs):
    # ---- Training ----
    model.train()
    train_correct = 0.0
    train_seen = 0

    for batch in train_dataloader:
        b_input_ids, b_attention_mask, b_labels = (t.to(device) for t in batch)

        model.zero_grad()

        # Passing labels makes the model return the classification loss with the logits.
        outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_labels)
        loss = outputs.loss

        logits = outputs.logits.detach().cpu().numpy()
        label_ids = b_labels.cpu().numpy()
        train_correct += flat_accuracy(logits, label_ids) * len(label_ids)
        train_seen += len(label_ids)

        loss.backward()
        # Clip the gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    # max(..., 1) avoids a ZeroDivisionError if the loader is empty.
    avg_epoch_train_accuracy = train_correct / max(train_seen, 1)
    total_train_accuracy += avg_epoch_train_accuracy
    print(f"Epoch {epoch_i + 1}")
    print(f"  Training Accuracy: {avg_epoch_train_accuracy}")

    # ---- Validation ----
    model.eval()
    val_correct = 0.0
    val_seen = 0
    for batch in validation_dataloader:
        b_input_ids, b_attention_mask, b_labels = batch
        b_input_ids = b_input_ids.to(device)
        b_attention_mask = b_attention_mask.to(device)

        # Only the logits are needed here, so labels are not passed and no loss is computed.
        with torch.no_grad():
            outputs = model(b_input_ids, attention_mask=b_attention_mask)

        logits = outputs.logits.detach().cpu().numpy()
        label_ids = b_labels.cpu().numpy()
        val_correct += flat_accuracy(logits, label_ids) * len(label_ids)
        val_seen += len(label_ids)

    avg_epoch_val_accuracy = val_correct / max(val_seen, 1)
    total_val_accuracy += avg_epoch_val_accuracy
    print(f"  Validation Accuracy: {avg_epoch_val_accuracy}")

# Calculate overall average accuracy across all epochs
overall_avg_train_accuracy = total_train_accuracy / epochs
overall_avg_val_accuracy = total_val_accuracy / epochs
print(f"Overall Average Training Accuracy: {overall_avg_train_accuracy}")
print(f"Overall Average Validation Accuracy: {overall_avg_val_accuracy}")

print("Training complete!")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1
  Training Accuracy: 0.9215875
  Validation Accuracy: 0.9287
Epoch 2
  Training Accuracy: 0.9547125
  Validation Accuracy: 0.93155
Epoch 3
  Training Accuracy: 0.9772
  Validation Accuracy: 0.93275
Overall Average Training Accuracy: 0.9511666666666666
Overall Average Validation Accuracy: 0.931
Training complete!


In [25]:
# Persist only the fine-tuned weights (state_dict) of the 4-class annotation model.
torch.save(model.state_dict(), 'bert_model_annotation.pth')


In [26]:
# Save the tokenizer files to 'annotate_tokenizer' and zip that directory.
# NOTE(review): the absolute /content paths only match the relative save location if the
# working directory is /content — confirm. `shutil` must already be imported by an earlier cell.
tokenizer.save_pretrained('annotate_tokenizer')
shutil.make_archive('/content/annotate_tokenizer', 'zip', '/content/annotate_tokenizer')

'/content/annotate_tokenizer.zip'

In [27]:
# Copy the saved annotation-model weights to the DMFPBert folder on the mounted Drive.
!rsync -r --progress "/content/bert_model_annotation.pth" "/content/drive/MyDrive/DMFPBert"

sending incremental file list
bert_model_annotation.pth
    438,025,750 100%  401.64MB/s    0:00:01 (xfr#1, to-chk=0/1)


# Twitter and Kaggle data testing

In [51]:
# Source CSV for the held-out test data (hosted in the project's GitHub repository)
file_path = 'https://raw.githubusercontent.com/vaamps/cyberbullying-detection/main/datasets/final_test.csv'

# Read the CSV file into a DataFrame
testdf = pd.read_csv(file_path)
# Keep a handle on the 'Text' column
test_text = testdf['Text']

In [56]:
testdf.head()

Unnamed: 0,id,oh_label,Text,source,Sentiment_Score
0,0,1,you are a complete jackass to blame the mayor ...,kaggle,-6.0
1,1,1,thank u soooo much d dad im a cow moooooo im a...,youtube,-25.0
2,2,1,pfuck you media matters troll,kaggle,-1.0
3,3,1,youre an egoistic dimwitted cunt thats why you...,kaggle,-8.0
4,4,0,you are correct one must purchase the mlbnpack...,kaggle,-2.0


In [53]:
# Drop the leftover index/auxiliary columns.
# errors='ignore' keeps the cell re-runnable: the in-place version raised KeyError on a
# second execution, once the columns were already gone.
testdf = testdf.drop(columns=['Unnamed: 0.1', 'U_Text'], errors='ignore')

In [55]:
# Give the unnamed leading column a meaningful name (mutates testdf in place).
testdf.rename(columns={'Unnamed: 0': 'id'}, inplace=True)

In [31]:
# Rebuild the 4-class architecture, then restore the fine-tuned annotation weights.
# The "newly initialized" classifier warning printed by from_pretrained is expected:
# those weights are overwritten by load_state_dict below.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)
# map_location remaps the stored tensors onto `device`, so a checkpoint saved from a
# GPU session also loads on a CPU-only runtime.
model.load_state_dict(torch.load('/content/bert_model_annotation.pth', map_location=device))
model.to(device)  # Make sure to use the same device as before
# Using V100 GPU no High RAM

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [32]:
# Reload the tokenizer from the /content/annotate_tokenizer directory.
tokenizer = BertTokenizer.from_pretrained('/content/annotate_tokenizer')

In [57]:
testdf['Text'].isnull().sum()

0

In [58]:
def predict_with_bert(dataframe, model, batch_size=16, tokenizer=None, device=None, text_column='Text'):
    """Classify the text column of `dataframe` in batches and return predicted class ids.

    Args:
        dataframe: DataFrame holding the texts to classify.
        model: Model called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``.
        batch_size: Number of rows tokenized and scored per forward pass (>= 1).
        tokenizer: Tokenizer to apply to each batch of texts. Defaults to the
            notebook-level ``tokenizer`` variable, which is what this function
            used before the parameter existed.
        device: Device the inputs are moved to. Defaults to the device of the
            model's first parameter, so inputs land where the model lives.
        text_column: Name of the column containing the texts.

    Returns:
        list[int]: One predicted class index per row, in row order. Empty when
        `dataframe` has no rows.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make range() empty and silently return no predictions.
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    if tokenizer is None:
        # Backward compatibility: fall back to the notebook-level tokenizer.
        tokenizer = globals()['tokenizer']

    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()  # Set the model to evaluation mode
    predictions = []

    for start in range(0, len(dataframe), batch_size):
        texts = dataframe.iloc[start:start + batch_size][text_column].tolist()
        encoded = tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        )

        input_ids = encoded['input_ids'].to(device)
        attention_mask = encoded['attention_mask'].to(device)

        with torch.no_grad():
            logits = model(input_ids, attention_mask=attention_mask).logits

        # .tolist() yields plain Python ints rather than numpy scalars.
        predictions.extend(torch.argmax(logits, dim=1).cpu().tolist())

    return predictions

# Run batched inference over the test set
predictions = predict_with_bert(testdf, model, batch_size=16)

# Class id -> annotation name lookup used to decode the predicted ids
label_mapping = {0: 'racism', 1: 'sexism', 2: 'toxicity', 3: 'none'}
mapped_predictions = [label_mapping[class_id] for class_id in predictions]

# Attach the decoded annotation to each row of the test DataFrame
testdf['Annotation'] = mapped_predictions


In [59]:
testdf.columns


Index(['id', 'oh_label', 'Text', 'source', 'Sentiment_Score', 'Annotation'], dtype='object')

In [60]:
testdf.head()


Unnamed: 0,id,oh_label,Text,source,Sentiment_Score,Annotation
0,0,1,you are a complete jackass to blame the mayor ...,kaggle,-6.0,toxicity
1,1,1,thank u soooo much d dad im a cow moooooo im a...,youtube,-25.0,toxicity
2,2,1,pfuck you media matters troll,kaggle,-1.0,toxicity
3,3,1,youre an egoistic dimwitted cunt thats why you...,kaggle,-8.0,toxicity
4,4,0,you are correct one must purchase the mlbnpack...,kaggle,-2.0,none


In [61]:
testdf.tail(10)

Unnamed: 0,id,oh_label,Text,source,Sentiment_Score,Annotation
12243,12253,1,odd you post this you critize me every day v...,kaggle,-1.0,none
12244,12254,0,you are correct,kaggle,0.0,none
12245,12255,1,u talk like u hard but rappin no shit u better...,youtube,-2.0,toxicity
12246,12256,1,your all talk if you saw a black man in the st...,kaggle,-14.0,toxicity
12247,12257,1,youre an obama dicsuck er slurp burp yep thats...,kaggle,0.0,toxicity
12248,12258,0,ids is one of a dying breed politicians who re...,kaggle,-16.0,toxicity
12249,12259,1,at this point it is safe to say your ignorance...,kaggle,-3.0,toxicity
12250,12260,0,we are very proud of you ladies tt thanks yg y...,youtube,15.0,none
12251,12261,0,bambino he clearly just won the ball har jag s...,youtube,6.0,none
12252,12262,0,ja chyba te w takim razie dpolskie pieni patri...,youtube,34.0,none


In [63]:
# Write the annotated test set to CSV without the DataFrame index.
testdf.to_csv("test_annotated.csv", index= False)