In [1]:
import torch
device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Using device: cuda:2


In [4]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

In [5]:
import os
os.environ["WANDB_DISABLED"] = "true"
seed = 25
# random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

In [6]:
train_data = pd.read_csv('datasets/subtask_1/es/train.tsv',sep='\t')
train_data = train_data.reset_index(drop=True)
print(train_data.head())

      id                                               text      label
0   5464  Entrada en vigor. La presente Directiva entrar...      human
1  30129  Preguntas: 1. ¿Cuáles son los principales argu...  generated
2  19553  ¿Desea algo? Póngame una caja de madera. ¿Qué ...  generated
3  13005  @victor28088 1665 Tweets no originales, que as...      human
4  16919  De pequeño Dios me dio a elegir entre tener un...      human


In [7]:
from sklearn.model_selection import train_test_split
train_data_texts = train_data['text'].to_list()
train_data_labels = train_data['label'].to_list()
train_texts, test_texts, train_labels, test_labels = train_test_split(train_data_texts, train_data_labels, test_size=0.1, random_state=25)
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.1, random_state=25)
print('train data size: ', len(train_texts))
print('validation data size: ', len(val_texts))
print('test data size: ', len(test_texts))

train data size:  25969
validation data size:  2886
test data size:  3207


In [8]:
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
tokenizer = AutoTokenizer.from_pretrained("DeepESP/gpt2-spanish")
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForSequenceClassification.from_pretrained("DeepESP/gpt2-spanish").to(device)
model.config.pad_token_id = model.config.eos_token_id
print("Model Configurations")
print()
print(model.config)

Downloading:   0%|          | 0.00/115 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/914 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/840k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/499k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/262 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/261M [00:00<?, ?B/s]

Some weights of the model checkpoint at DeepESP/gpt2-spanish were not used when initializing GPT2ForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at DeepESP/gpt2-spanish and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Configurations

GPT2Config {
  "_name_or_path": "DeepESP/gpt2-spanish",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "pad_token_id": 50256,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.24.0",
  "use_cache": true,
  "vocab_size": 50257
}



In [9]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=256)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=256)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=256)

class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels[idx] == 'human':
            item['labels'] = torch.tensor(0)
        else:
            item['labels'] = torch.tensor(1)
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = CustomDataset(train_encodings, train_labels)
val_dataset = CustomDataset(val_encodings, val_labels)
test_dataset = CustomDataset(test_encodings, test_labels)

print("Sample Data Point")
print()
print(train_dataset[0])

Sample Data Point

{'input_ids': tensor([ 4130, 21291,  2218,    21,  2406,   299,   873,   363,  6520,   576,
        37061, 18612,    40,   299,  4248,    21,   366,   396,  2005,   644,
           21,  2871, 34834, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50

In [10]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='macro')
    acc = accuracy_score(labels, predictions)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


In [11]:
training_args = TrainingArguments(
    output_dir='./models/spanish_gpt2_task1',       
    num_train_epochs=4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=1,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)
trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
***** Running training *****
  Num examples = 25969
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 3248
  Number of trainable parameters = 124441344


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3142,0.30735,0.869716,0.868565,0.873321,0.867207
2,0.1288,0.328884,0.889813,0.888625,0.89561,0.886869
3,0.063,0.419005,0.911642,0.911245,0.912451,0.910573
4,0.0009,0.509878,0.912682,0.912226,0.914008,0.911355


***** Running Evaluation *****
  Num examples = 2886
  Batch size = 32
Saving model checkpoint to ./models/spanish_gpt2_task1/checkpoint-812
Configuration saved in ./models/spanish_gpt2_task1/checkpoint-812/config.json
Model weights saved in ./models/spanish_gpt2_task1/checkpoint-812/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2886
  Batch size = 32
Saving model checkpoint to ./models/spanish_gpt2_task1/checkpoint-1624
Configuration saved in ./models/spanish_gpt2_task1/checkpoint-1624/config.json
Model weights saved in ./models/spanish_gpt2_task1/checkpoint-1624/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2886
  Batch size = 32
Saving model checkpoint to ./models/spanish_gpt2_task1/checkpoint-2436
Configuration saved in ./models/spanish_gpt2_task1/checkpoint-2436/config.json
Model weights saved in ./models/spanish_gpt2_task1/checkpoint-2436/pytorch_model.bin
Deleting older checkpoint [models/spanish_gpt2_task1/checkpoint-812] due to args.save

TrainOutput(global_step=3248, training_loss=0.15916052786038393, metrics={'train_runtime': 1419.0228, 'train_samples_per_second': 73.202, 'train_steps_per_second': 2.289, 'total_flos': 9383233679327232.0, 'train_loss': 0.15916052786038393, 'epoch': 4.0})

In [12]:
trainer.evaluate(test_dataset)
predictions, labels, _ = trainer.predict(test_dataset)
predictions = np.argmax(predictions, axis=1)

from sklearn.metrics import classification_report
print(classification_report(labels, predictions))

***** Running Evaluation *****
  Num examples = 3207
  Batch size = 32


***** Running Prediction *****
  Num examples = 3207
  Batch size = 32


              precision    recall  f1-score   support

           0       0.95      0.89      0.92      1579
           1       0.90      0.96      0.93      1628

    accuracy                           0.92      3207
   macro avg       0.93      0.92      0.92      3207
weighted avg       0.93      0.92      0.92      3207



In [13]:
trainer.save_model('./models/spanish_gpt2_task1/trained_model')

Saving model checkpoint to ./models/spanish_gpt2_task1/trained_model
Configuration saved in ./models/spanish_gpt2_task1/trained_model/config.json
Model weights saved in ./models/spanish_gpt2_task1/trained_model/pytorch_model.bin
