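# BERT: cyberbullying category prediction

Loads a `bert-base-uncased` sequence classifier (5 labels) with weights from `bert_for_cyberbullying.pth` and runs it on sample texts.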

In [1]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch

In [2]:
# Tokenizer for the same 'bert-base-uncased' checkpoint the model is built from.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Use the MPS backend when PyTorch reports it as available; otherwise the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

In [3]:
# Build a 5-label sequence classifier on top of bert-base-uncased, then load
# the saved state dict into it. The "newly initialized" warning printed by
# from_pretrained is expected here: the classifier weights it refers to are
# replaced by the checkpoint loaded below.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)

model_save_path = "bert_for_cyberbullying.pth"
# map_location="cpu" lets a checkpoint that was saved from another device
# (e.g. CUDA or MPS) load on any machine; the model is moved to `device` after.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source (consider weights_only=True if the installed torch supports it).
model.load_state_dict(torch.load(model_save_path, map_location="cpu"))
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [4]:
def predict(text, model, tokenizer, device):
    """Return the predicted class index for a single piece of text.

    Args:
        text: The string to classify.
        model: Classification model. It is called as
            ``model(input_ids, attention_mask=...)`` and its result must expose
            a ``logits`` attribute indexed as (batch, label).
        tokenizer: Hugging Face-style tokenizer; it is called directly and must
            return a mapping with ``input_ids`` and ``attention_mask`` tensors.
        device: Device the encoded tensors are moved to before the forward
            pass (the caller is responsible for the model being on it too).

    Returns:
        int: Index of the largest logit for the first (only) batch entry.
    """
    # Inference mode for the module (e.g. turns dropout off).
    model.eval()

    # Calling the tokenizer directly replaces the deprecated `encode_plus`;
    # the encoding options are unchanged.
    encoded = tokenizer(
        text,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)

    # No gradients are needed for prediction.
    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask).logits

    # .item() yields a plain Python int and handles the device-to-host copy.
    return torch.argmax(logits, dim=1)[0].item()

def invert_label(c_type):
    """Translate a numeric class id into its category name.

    Ids 0 through 4 map, in order, to "not_cyberbullying", "age", "gender",
    "ethnicity" and "religion". A value equal to none of those ids is returned
    unchanged.
    """
    category_names = (
        "not_cyberbullying",
        "age",
        "gender",
        "ethnicity",
        "religion",
    )
    # Compare against each id in ascending order and stop at the first match.
    for class_id, category in enumerate(category_names):
        if c_type == class_id:
            return category
    return c_type
    
# Try the prediction helpers on a neutral example sentence.
# The checkpoint is not re-read here: the cell that creates `model` already
# loads `model_save_path` into it and moves it to `device`.
# NOTE(review): the recorded output for this neutral sentence is "gender",
# which looks like a false positive -- worth confirming.
sample_text = "Example of a new tweet that could be cyberbullying."
predicted_category = invert_label(
    predict(sample_text, model, tokenizer, device)
)

print(f"Predicted Cyberbullying Category: {predicted_category}")

Predicted Cyberbullying Category: gender


In [5]:
# Classify a short abusive sample and print the predicted category name.
sample_text = "fuck you black"
predicted_category = invert_label(
    predict(sample_text, model, tokenizer, device)
)
print(f"Predicted Cyberbullying Category: {predicted_category}")

Predicted Cyberbullying Category: ethnicity


In [6]:
# Classify a threatening sample and print the predicted category name.
sample_text = "i will rape you"
predicted_category = invert_label(
    predict(sample_text, model, tokenizer, device)
)
print(f"Predicted Cyberbullying Category: {predicted_category}")

Predicted Cyberbullying Category: gender


In [7]:
# Classify a two-word insult and print the predicted category name.
sample_text = "muslim idiot"
predicted_category = invert_label(
    predict(sample_text, model, tokenizer, device)
)
print(f"Predicted Cyberbullying Category: {predicted_category}")

Predicted Cyberbullying Category: religion


In [8]:
# Classify a benign greeting; the misspelling is part of the test input.
sample_text = "hello how are yu"
predicted_category = invert_label(
    predict(sample_text, model, tokenizer, device)
)
print(f"Predicted Cyberbullying Category: {predicted_category}")

Predicted Cyberbullying Category: not_cyberbullying


In [9]:
# Classify a longer sentence containing a slur and print the predicted category name.
sample_text = "Can anyone else said to this nigger that the dress is blue?"
predicted_category = invert_label(
    predict(sample_text, model, tokenizer, device)
)
print(f"Predicted Cyberbullying Category: {predicted_category}")

Predicted Cyberbullying Category: ethnicity


In [10]:
# Classify a friendly sentence and print the predicted category name.
sample_text = "I'm really happy for your birthday"
predicted_category = invert_label(
    predict(sample_text, model, tokenizer, device)
)
print(f"Predicted Cyberbullying Category: {predicted_category}")

Predicted Cyberbullying Category: not_cyberbullying


In [11]:
# Classify an opinion statement about religion and print the predicted category name.
sample_text = "In my opinion Allah is not a real god"
predicted_category = invert_label(
    predict(sample_text, model, tokenizer, device)
)
print(f"Predicted Cyberbullying Category: {predicted_category}")

Predicted Cyberbullying Category: religion


In [12]:
# Classify a positive sentence that mentions a religious term.
# NOTE(review): the recorded output for this sentence is "religion", which
# looks like a false positive -- worth confirming against the training data.
sample_text = "I like Allah"
predicted_category = invert_label(
    predict(sample_text, model, tokenizer, device)
)
print(f"Predicted Cyberbullying Category: {predicted_category}")

Predicted Cyberbullying Category: religion


In [13]:
# Same sentence shape as the previous sample, with a different verb.
sample_text = "I love Allah"
predicted_category = invert_label(
    predict(sample_text, model, tokenizer, device)
)
print(f"Predicted Cyberbullying Category: {predicted_category}")

Predicted Cyberbullying Category: not_cyberbullying


In [17]:
# Classify a gender-comparison statement and print the predicted category name.
sample_text = "Men are better than women"
predicted_category = invert_label(
    predict(sample_text, model, tokenizer, device)
)
print(f"Predicted Cyberbullying Category: {predicted_category}")

Predicted Cyberbullying Category: not_cyberbullying
