# Ejemplo GPT Finetunning

In [25]:
#!pip install transformers==4.55.0 datasets peft trl accelerate

In [2]:
import os
import torch
import pandas as pd
from datasets import Dataset, load_dataset
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)

# Twitter

In [4]:
df = pd.read_csv("datasets/twitter_reddit/sarcasm_2_twitter.csv")
df.head()

Unnamed: 0,label,response,context,source
0,1,USER USER USER i do not get this .. obviously ...,['a minor child deserves privacy and should be...,twitter
1,1,USER USER trying to protest about . talking ab...,['USER USER why is he a loser ? he is just a p...,twitter
2,1,USER USER USER he makes an insane about of mon...,['donald j . trump is guilty as charged . the ...,twitter
3,1,USER USER meanwhile trump will not even releas...,['jamie raskin tanked doug collins . collins l...,twitter
4,1,USER USER pretty sure the anti lincoln crowd c...,['man ... y all gone both sides the apocalypse...,twitter


In [27]:
df.iloc[0]['response']

'USER USER USER i do not get this .. obviously you do care or you would have moved right along .. instead you decided to care and troll her ..'

In [28]:
df.iloc[0]['context']

"['a minor child deserves privacy and should be kept out of politics . pamela karlan , you should be ashamed of your very angry and obviously biased public pandering , and using a child to do it .'\n 'USER if your child is not named barron ... bebest melania could not care less . fact .']"

Dado que está todo en diferentes formatos, lo primero es hacer un poco de limpieza y conseguir un formato obicuo entre todos los datasets:
- text: el texto sarcástico
- is_sarcastic: booleano
- degree_of_sarcasm: un entero del 0 al 10 (puede ser nulo)
- paraphrase: el texto parafraseado sin sarcasmo si es que corresponde
- context: el contexto del que proviene si es que es una respuesta a un hilo
- type: one of "sarcasm", "satire", "rhetorical question", etc
- source: la fuente, de que dataset proviene
- task: la tarea para la cual se va a usar

In [62]:
def process_text(text, is_assistant=False):
    count = text.count('USER')    
    to_replace = ' '.join(['USER'] * count)

    tag = ""
    if is_assistant:
        tag = f"<|assistant|>"
    else:
        tag = f"<|user: {count}|>"

    if count == 0:
        return ' '.join([tag, text])
    else:
        return text.replace(to_replace, tag)

def process_context(context):
    arr = context.replace('[', '').replace(']', '').replace("\'", '').split('\n')
    
    comments_processed = []
    for comment in arr:
        processed = process_text(comment)
        comments_processed.append(processed)

    return "\n".join(comments_processed)

def new_row(df, index, source=''):
    return {
        'text': process_text(df.loc[index]['response'], is_assistant=True),
        'is_sarcastic': df.loc[index]['label'],
        'degree_of_sarcasm': None,
        'paraphrase': None,
        'context': process_context(df.loc[index]['context']),
        'type': 'unknown',
        'source': source,
        'task': 'conversational'
    }

In [67]:
processed = []
for index, row in df.iterrows():
    processed.append(new_row(df, index, source='twitter_reddit/sarcasm_2_twitter'))

print("Dataset de twitter formateado")

Dataset de twitter formateado


# Reddit

In [68]:
df = pd.read_csv("datasets/twitter_reddit/sarcasm_2_reddit.csv")
df.head()

Unnamed: 0,label,response,context,source
0,1,"yeah i mean there is only one gender anyways, ...",['lpt if you are worried about hurting someone...,reddit
1,1,"sounds like you do not like science, you theis...",['promotional images for some guy s facebook p...,reddit
2,1,"of course play them in try mode, blizzard were...",['my friends will not play dota2 i will not pl...,reddit
3,1,"i do not understand, reddit told me that hilla...",['poll convention boosts clinton to 11 point l...,reddit
4,1,"yeh, they are the reigning triple premiers, wh...",['wayne ludbey jordan lewis has the ultimate c...,reddit


In [79]:
# Solamente redefinimos process_context

def process_context(context):
    arr = context.replace('[', '').replace(']', '').replace("\'", '').split('\n')
    
    comments_processed = []
    for i, comment in enumerate(arr):
        comment = ' '.join(['USER']*i) + comment
        processed = process_text(comment)
        comments_processed.append(processed)

    return "\n".join(comments_processed)

<|user: 0|> poll convention boosts clinton to 11 point lead over trump in pa.
<|user: 1|> 11 in pa 3 in az 15 in nh 9 in mi 1 in mo 1 in nv
<|assistant|> i do not understand, reddit told me that hillary got a negative convention bump and that trump had nowhere to go but up.


In [84]:
processed = []
for index, row in df.iterrows():
    processed.append(new_row(df, index, source='twitter_reddit/sarcasm_2_reddit'))

print("Dataset de reddit formateado")

Dataset de reddit formateado


## Guardamos los resultados

In [85]:
import os

df_processed = pd.DataFrame(processed)
os.makedirs("processed", exist_ok=True)
df_processed.to_csv("processed/gon.csv", index=False)

In [6]:
# Crear Dataset Hugging Face
dataset = Dataset.from_pandas(df[['text']])
dataset = dataset.train_test_split(test_size=0.1, seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', '__index_level_0__'],
        num_rows: 1980
    })
    test: Dataset({
        features: ['text', '__index_level_0__'],
        num_rows: 220
    })
})

In [7]:
# Cargar modelo preentrenado
model_name = "gpt2"  # o "gpt2-medium", "gpt2-large", "gpt2-xl"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

tokenizer.pad_token = tokenizer.eos_token
special_tokens = {"additional_special_tokens": ["<|commenter|>", "<|assistant|>"]}
tokenizer.add_special_tokens(special_tokens)

model = GPT2LMHeadModel.from_pretrained(model_name)

In [8]:
# Verificamos el token especial
tokenizer.convert_tokens_to_ids("<|assistant|>")

50258

In [9]:
def mask_user_part(example):
    text = example["text"]
    max_len = 1024 

    device = torch.device("mps")
    
    # Tokenizar
    tokens = tokenizer(
        text, 
        truncation=True, 
        padding="max_length", 
        max_length=max_len,
        return_tensors=None  # Asegurar que devuelve listas, no tensores
    ) 
    
    input_ids = tokens["input_ids"]
    labels = input_ids.copy()
    assistant_token_id = tokenizer.convert_tokens_to_ids("<|assistant|>")

    if assistant_token_id in input_ids:
        start_idx = input_ids.index(assistant_token_id) + 1
        labels[:start_idx] = [-100] * start_idx
    else:
        labels = [-100] * len(input_ids)

    tokens["labels"] = labels
    return tokens

tokenized_datasets = dataset.map(mask_user_part)

Map:   0%|          | 0/1980 [00:00<?, ? examples/s]

Map:   0%|          | 0/220 [00:00<?, ? examples/s]

In [10]:
# Verificar un ejemplo del dataset
print("Forma del dataset:")
print(f"Longitud de input_ids: {len(tokenized_datasets['train'][0]['input_ids'])}")
print(f"Longitud de labels: {len(tokenized_datasets['train'][0]['labels'])}")

# Verificar si hay -100 en los labels (partes ignoradas)
labels_sample = tokenized_datasets['train'][0]['labels']
print(f"Número de elementos -100 en labels: {sum(1 for x in labels_sample if x == -100)}")

Forma del dataset:
Longitud de input_ids: 1024
Longitud de labels: 1024
Número de elementos -100 en labels: 36


In [11]:
torch.mps.empty_cache()
os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'

model_output_dir = './results/gpt2-gon'
tokenizer = GPT2Tokenizer.from_pretrained(model_output_dir)

model = GPT2LMHeadModel.from_pretrained(model_output_dir)
model.resize_token_embeddings(len(tokenizer))

device = torch.device("mps")
model.to(device)

training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',
    num_train_epochs=4,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs'
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer
)

# Train the model
trainer.train()

model.save_pretrained(model_output_dir)
tokenizer.save_pretrained(model_output_dir)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
  trainer = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,0.0624,0.073019
2,0.054,0.071992
3,0.0447,0.075763
4,0.0376,0.0796




('./results/gpt2-gon/tokenizer_config.json',
 './results/gpt2-gon/special_tokens_map.json',
 './results/gpt2-gon/vocab.json',
 './results/gpt2-gon/merges.txt',
 './results/gpt2-gon/added_tokens.json')

In [17]:
input_text = "<|commenter|> Terrific, that's a well poured beer \n <|assistant|>"

def get_model_parameters(model):
    total_params = sum(p.numel() for p in model.parameters())
    return total_params

model_path = './results/gpt2-gon'
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
print(tokenizer.additional_special_tokens)

model = GPT2LMHeadModel.from_pretrained(model_path)
model.resize_token_embeddings(len(tokenizer))

# Calculate the Number of Parameters in the model being used for inference
total_params = get_model_parameters(model)
print(f"Total number of paramerers: {total_params}")

# Prepare the input text you want to generate predictions for
inputs = tokenizer(input_text, return_tensors='pt')

# Generate Text
outputs = model.generate(**inputs, max_length=50, num_return_sequences=1)

# Decode the generated text
generated_text = tokenizer.decode(outputs[0])

print(generated_text)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['<|commenter|>', '<|assistant|>']
Total number of paramerers: 124441344
<|commenter|>  Terrific, that's a well poured beer 
  <|assistant|>  but i drink it once a week, so it must be good<|endoftext|>


In [None]:
df = pd.read_json("datasets/sarc/sarc.json")
df.head()