In [None]:
pip install transformers[torch]

Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m302.6/302.6 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-

In [None]:
# Mount the user's Google Drive into the Colab filesystem at /content/drive
# so that files stored there (e.g. the dataset CSV) can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset


# Data processing


***Reading the dataset***

In [None]:
# Read the CSV from the mounted Drive.
# `header=0` marks the file's first line as a header row that `names` replaces;
# without it pandas treats that line as data, which produces a bogus
# 'cyberbullying_type' label in the label column.
df = pd.read_csv(
    '/content/drive/MyDrive/Collab/cyberbullying_tweets.csv',
    names=['text', 'cyberbullying_type'],
    header=0,
)

# Report how many missing cells the raw frame contains (all columns combined).
total_nulls = df.isnull().sum().sum()

print("Null data:", total_nulls)

# Make sure the label column is a string (just in case).
df['cyberbullying_type'] = df['cyberbullying_type'].astype(str)

# Select up to 200 texts for each label.
# Groups are sampled one by one and concatenated; this avoids
# `groupby(...).apply(...)` acting on the grouping column, which recent pandas
# versions deprecate, while drawing the same rows per label.
df_sampled = pd.concat(
    [
        group.sample(n=min(len(group), 200), random_state=42)
        for _, group in df.groupby('cyberbullying_type')
    ],
    ignore_index=True,
)

# Drop the categories we don't plan to use. 'cyberbullying_type' is kept in the
# list as a guard against a stray header row. `reset_index` returns a fresh,
# independent frame so later column assignments do not hit
# SettingWithCopyWarning.
df_sampled = (
    df_sampled[~df_sampled['cyberbullying_type'].isin(['cyberbullying_type', 'other_cyberbullying'])]
    .reset_index(drop=True)
)

Null data: 0


***Getting the labels***

In [None]:
# Convert the label column to a categorical dtype and derive an integer code
# per category. `assign` builds a new frame instead of writing into
# `df_sampled` in place, so this cell is safe to re-run and does not raise
# SettingWithCopyWarning when `df_sampled` is a filtered slice.
label_categories = df_sampled['cyberbullying_type'].astype('category')
df_sampled = df_sampled.assign(
    cyberbullying_type=label_categories,
    label=label_categories.cat.codes,
)

# Build the mapping from numeric code to category name.
category_mapping = dict(enumerate(label_categories.cat.categories))

print("Ciberbullying numerical labels:")
for code, category in category_mapping.items():
    print(f"{code}: {category}")


Ciberbullying numerical labels:
0: age
1: ethnicity
2: gender
3: not_cyberbullying
4: religion


***Splitting the data into train and test sets***

In [None]:
# Split the data 80/20 into train and test sets.
# `stratify` keeps the label proportions equal in both splits; without it the
# per-class test counts drift apart even though the sample is balanced, which
# distorts the per-class metrics.
train_df, test_df = train_test_split(
    df_sampled,
    test_size=0.2,
    random_state=42,
    stratify=df_sampled['label'],
)

***Tokenization using GPT-2***

In [None]:
# Load the pretrained GPT-2 tokenizer and register '[PAD]' as its padding
# token; the fixed-length padding used by the dataset class needs one.
pad_token = '[PAD]'
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.add_special_tokens({'pad_token': pad_token})
tokenizer.pad_token = pad_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

***Preparing the data in order to train and validate***

In [None]:
class CyberbullyingDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Sequence of raw text strings (list, tuple, pandas Series, ...).
        labels: Sequence of integer class labels, aligned with ``texts``.
        tokenizer: Callable tokenizer accepting ``truncation``, ``padding``,
            ``max_length`` and ``return_tensors`` keyword arguments and
            returning a mapping with ``input_ids`` and ``attention_mask``.
        max_length: Length every encoded sample is truncated/padded to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Materialize as lists so that `self.texts[idx]` is always positional.
        # A pandas Series with a shuffled index (e.g. straight out of
        # train_test_split) would otherwise be looked up by index label.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return the model inputs with its label.

        Returns:
            dict with 1-D tensors ``input_ids`` and ``attention_mask`` and a
            scalar ``labels`` tensor of dtype ``torch.long``.
        """
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        # The tokenizer returns a batch of one; drop the leading batch axis so
        # the DataLoader can stack samples itself.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }


***Calling the class we made***

In [None]:
# Wrap each split in a CyberbullyingDataset. Both splits are built the same
# way: plain Python lists of texts and labels plus the shared tokenizer.
train_dataset, test_dataset = (
    CyberbullyingDataset(
        texts=split['text'].tolist(),
        labels=split['label'].tolist(),
        tokenizer=tokenizer
    )
    for split in (train_df, test_df)
)

***Instantiating the model***

In [None]:
# Define the GPT-2 sequence-classification model.
# The code -> name mapping is stored in the model config (id2label / label2id)
# so that the saved checkpoint can decode its own predictions without relying
# on notebook state.
id2label = dict(category_mapping)
label2id = {name: code for code, name in id2label.items()}
model = GPT2ForSequenceClassification.from_pretrained(
    'gpt2',
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

# Grow the embedding matrix to cover the special token(s) added to the tokenizer.
model.resize_token_embeddings(len(tokenizer))

# Set the padding token in the model configuration.
model.config.pad_token_id = tokenizer.pad_token_id

# Move the model to the GPU when one is available, otherwise stay on the CPU.
# Assigning the result keeps the cell from printing the full module repr.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

***Defining and training the model***

In [None]:
# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases; pick whichever keyword the installed version defines.
eval_strategy_arg = (
    'eval_strategy'
    if 'eval_strategy' in TrainingArguments.__dataclass_fields__
    else 'evaluation_strategy'
)

# Define the training arguments.
training_args = TrainingArguments(
    output_dir='./results',          # Output directory for checkpoints and logs
    num_train_epochs=3,              # Total number of training epochs
    per_device_train_batch_size=8,   # Training batch size per device
    per_device_eval_batch_size=8,    # Evaluation batch size per device
    # Warm up over the first 10% of the optimizer steps. A fixed 500 steps is
    # more than this run performs in total, so the learning rate would never
    # leave the warmup phase.
    warmup_ratio=0.1,
    weight_decay=0.01,               # Weight-decay factor
    logging_dir='./logs',            # Log directory
    # Log once per epoch; the default step-based interval is longer than the
    # whole run, so the training loss would show up as "No log".
    logging_strategy='epoch',
    **{eval_strategy_arg: 'epoch'},  # Evaluate the model at the end of each epoch
)


# Create the Trainer object.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Train the model.
trainer.train()

***Evaluating and saving the trained model***

In [None]:
# Score the held-out split: the class with the highest value along axis 1 of
# the prediction array is taken as the predicted label.
predictions = trainer.predict(test_dataset)
true_classes = test_df['label'].values
predicted_classes = np.argmax(predictions.predictions, axis=1)

# Show the classification metrics.
report = classification_report(true_classes, predicted_classes)
print(report)

# Save the trained model and the tokenizer.
model.save_pretrained('./gpt2_cyberbullying_model')
tokenizer.save_pretrained('./gpt2_cyberbullying_tokenizer')

Mapeo de c√≥digos num√©ricos a categor√≠as de ciberacoso:
0: age
1: ethnicity
2: gender
3: not_cyberbullying
4: religion


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,2.297518
2,No log,1.557155
3,No log,1.237377


              precision    recall  f1-score   support

           0       0.61      0.96      0.75        28
           1       0.50      0.79      0.61        14
           2       0.17      0.20      0.18        10
           3       0.44      0.29      0.35        24
           4       1.00      0.25      0.40        24

    accuracy                           0.53       100
   macro avg       0.54      0.50      0.46       100
weighted avg       0.60      0.53      0.49       100



('./gpt2_cyberbullying_tokenizer/tokenizer_config.json',
 './gpt2_cyberbullying_tokenizer/special_tokens_map.json',
 './gpt2_cyberbullying_tokenizer/vocab.json',
 './gpt2_cyberbullying_tokenizer/merges.txt',
 './gpt2_cyberbullying_tokenizer/added_tokens.json')

***Using the model for inference***

In [None]:
# Hacer inferencias con el modelo entrenado
text_to_predict = "man fuck lebron and Eli y'all just mad our owner a better gm then that dumb nigger y'all have in office"
# Tokenize the sample text into PyTorch tensors.
encoded_text = tokenizer(text_to_predict, truncation=True, padding=True, return_tensors="pt")

# Move the tensors to the same device as the model.
encoded_text = {k: v.to(device) for k, v in encoded_text.items()}

# Inference only: switch off dropout and skip building the autograd graph.
model.eval()
with torch.no_grad():
    logits = model(**encoded_text).logits
predicted_class = torch.argmax(logits, dim=1).item()

# Show the predicted class together with its description.
predicted_category = category_mapping[predicted_class]
print("Clase predicha:", predicted_class, "-", predicted_category)

Clase predicha: 0 - age
