# BERT

In [1]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch

In [2]:
import tarfile

# Path to the tar.gz archive
tar_file_path = "bert_for_cyberbullying.tar.gz"
# Name of the archive member to extract
file_to_extract = "bert_for_cyberbullying.pth"

# Extract only the checkpoint file into the current working directory.
with tarfile.open(tar_file_path, "r:gz") as tar:
    if hasattr(tarfile, "data_filter"):
        # Extraction filters are available: use the "data" filter so that unsafe
        # members (absolute paths, links escaping the destination, device files)
        # are rejected and no DeprecationWarning is raised on Python 3.12/3.13.
        tar.extract(file_to_extract, filter="data")
    else:
        # Older Python without extraction filters: fall back to plain extraction.
        tar.extract(file_to_extract)


In [3]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Path of the fine-tuned checkpoint (state dict) to load
model_save_path = "bert_for_cyberbullying.pth"

# Build the 5-label classifier on top of bert-base-uncased; its weights are
# replaced by the fine-tuned checkpoint loaded below.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the selected device. predict() sends its inputs to `device`,
# so leaving the model on the CPU would fail with a device mismatch whenever
# MPS or CUDA is available.
model.to(device)

# Load the fine-tuned weights.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a
# trusted source (consider weights_only=True if the installed torch supports it).
model.load_state_dict(torch.load(model_save_path, map_location=device))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

In [4]:
def invert_label(c_type):
    """Translate a numeric class id into its category name.

    Ids 0-4 map, in order, to "not_cyberbullying", "age", "gender",
    "ethnicity" and "religion". Any other value is returned unchanged.
    """
    category_names = (
        "not_cyberbullying",
        "age",
        "gender",
        "ethnicity",
        "religion",
    )
    # Compare with == in ascending id order and stop at the first match.
    for class_id, category in enumerate(category_names):
        if c_type == class_id:
            return category
    # No known id matched: hand the input back as-is.
    return c_type

def predict(text, model, tokenizer, device, max_length=128):
    """Classify a single text and return its predicted category.

    Args:
        text: The input string to classify.
        model: Sequence-classification model. It is called with the input ids
            and attention mask and must return an object exposing `.logits`.
            The model is expected to already be on `device`; this function
            only moves the encoded inputs there.
        tokenizer: Hugging Face tokenizer, invoked through its `__call__`.
        device: torch device the encoded inputs are moved to.
        max_length: Length every input is padded or truncated to (default 128).

    Returns:
        The value produced by `invert_label` for the highest-scoring class of
        the first sequence in the batch.
    """
    # Switch the model to evaluation mode before running inference.
    model.eval()

    # Calling the tokenizer directly replaces the deprecated `encode_plus`.
    encoded = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)

    # No gradients are needed for prediction.
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    # Index of the largest logit for the first sequence, as a plain Python int.
    prediction = torch.argmax(outputs.logits, dim=1)[0].item()

    return invert_label(prediction)

    
def print_category(sentence):
    """Classify `sentence` and print the text, its predicted category and a separator.

    Relies on the notebook-level `model`, `tokenizer` and `device` variables.
    Returns None.
    """
    predicted_category = predict(sentence, model, tokenizer, device)
    summary = f"Text: {sentence} \nPredicted Cyberbullying Category: {predicted_category}"
    separator = "----------------"
    print(summary)
    print(separator)

In [5]:
# Sample inputs for the classifier, evaluated in the order listed
# (the repeated entry is kept so the printed output stays the same).
sample_sentences = [
    "Example of a new tweet that could be cyberbullying.",
    "fuck you black",
    "i will rape you",
    "muslim idiot",
    "muslim idiot",
    "hello how are you",
    "Can anyone else said to this nigger that the dress is blue?",
    "I'm really happy for your birthday",
    "In my opinion Allah is not a real god",
    "I fucking hate Allah",
    "I appreciate Allah",
    "Men are better than women",
    "Bro, you are a Nigga!!!",
    "You are a shit!!!",
    "dickhead!",
]

# Print the predicted category for each sample.
for sample_sentence in sample_sentences:
    print_category(sample_sentence)

Text: Example of a new tweet that could be cyberbullying. 
Predicted Cyberbullying Category: gender
----------------
Text: fuck you black 
Predicted Cyberbullying Category: ethnicity
----------------
Text: i will rape you 
Predicted Cyberbullying Category: gender
----------------
Text: muslim idiot 
Predicted Cyberbullying Category: religion
----------------
Text: muslim idiot 
Predicted Cyberbullying Category: religion
----------------
Text: hello how are you 
Predicted Cyberbullying Category: not_cyberbullying
----------------
Text: Can anyone else said to this nigger that the dress is blue? 
Predicted Cyberbullying Category: ethnicity
----------------
Text: I'm really happy for your birthday 
Predicted Cyberbullying Category: not_cyberbullying
----------------
Text: In my opinion Allah is not a real god 
Predicted Cyberbullying Category: religion
----------------
Text: I fucking hate Allah 
Predicted Cyberbullying Category: religion
----------------
Text: I appreciate Allah 
Predict