In [1]:
import os
import pandas as pd
from transformers import (
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling, 
    AutoTokenizer
)
import torch
from datasets import Dataset

In [2]:
df = pd.read_table("Dataset/chat.txt", header = None, on_bad_lines = "warn", names = ["text"])
df

Skipping line 8396: expected 1 fields, saw 2
Skipping line 8397: expected 1 fields, saw 2
Skipping line 8398: expected 1 fields, saw 2
Skipping line 8399: expected 1 fields, saw 2
Skipping line 31600: expected 1 fields, saw 2
Skipping line 74442: expected 1 fields, saw 2
Skipping line 74443: expected 1 fields, saw 2
Skipping line 74444: expected 1 fields, saw 2
Skipping line 74445: expected 1 fields, saw 2
Skipping line 74446: expected 1 fields, saw 2
Skipping line 74447: expected 1 fields, saw 2
Skipping line 74448: expected 1 fields, saw 2
Skipping line 74449: expected 1 fields, saw 2
Skipping line 74450: expected 1 fields, saw 2
Skipping line 74451: expected 1 fields, saw 2
Skipping line 74452: expected 1 fields, saw 2
Skipping line 74453: expected 1 fields, saw 2
Skipping line 74454: expected 1 fields, saw 2
Skipping line 74455: expected 1 fields, saw 2
Skipping line 74456: expected 1 fields, saw 2
Skipping line 89163: expected 1 fields, saw 2
Skipping line 89164: expected 1 fields

Unnamed: 0,text
0,"6/26/17, 5:19 PM - Linda Roldán: Hoola"
1,"6/26/17, 5:19 PM - Linda Roldán: Bebe"
2,"6/26/17, 5:46 PM - Alejandro Castellanos: Hola..."
3,"6/26/17, 5:46 PM - Alejandro Castellanos: Y es..."
4,"6/26/17, 5:54 PM - Linda Roldán: Este es mi nu..."
...,...
231352,"7/22/25, 3:46 PM - Alejandro Castellanos: pens..."
231353,"7/22/25, 3:46 PM - Alejandro Castellanos: y to..."
231354,"7/22/25, 3:51 PM - Linda Roldán: <Media omitted>"
231355,"7/22/25, 4:07 PM - Alejandro Castellanos: y tú..."


In [3]:
df.shape

(231357, 1)

In [4]:
# # Deleted Messages
df = df[~df["text"].str.contains("This message was deleted")]

# Delete all rows with "Media omitted" Message
df = df[~df["text"].str.contains("Media omitted")]

# Delete row with line breaks or without time data, string patter "M - " such as "PM - " or "AM - "
df = df[df["text"].str.contains("M - ")]

df.head()

Unnamed: 0,text
0,"6/26/17, 5:19 PM - Linda Roldán: Hoola"
1,"6/26/17, 5:19 PM - Linda Roldán: Bebe"
2,"6/26/17, 5:46 PM - Alejandro Castellanos: Hola..."
3,"6/26/17, 5:46 PM - Alejandro Castellanos: Y es..."
4,"6/26/17, 5:54 PM - Linda Roldán: Este es mi nu..."


In [5]:
df.shape

(203079, 1)

In [6]:
# 1) Trim leading/trailing whitespace
df['text'] = df['text'].str.strip()

# 2) Split into “date_str” and “rest” on the first dash (hyphen or en-dash)
parts = df['text'].str.split(r'\s*[-–]\s*', n=1, expand=True)
df['date_str'], df['rest'] = parts[0], parts[1]

# 3) Parse the date string (invalid formats become NaT)
df['date'] = pd.to_datetime(
    df['date_str'],
    format='%m/%d/%y, %I:%M %p',
    errors='coerce'
)

# 4) Split “rest” into “user” and “message” on the first colon
usr_msg = df['rest'].str.split(r':\s*', n=1, expand=True)
df['user'], df['message'] = usr_msg[0], usr_msg[1]

# 5) Drop the helper columns
df = df.drop(columns=['date_str', 'rest'])

#6) Drop nan
df = df.dropna(subset=["message"])

In [7]:
df.head()

Unnamed: 0,text,date,user,message
0,"6/26/17, 5:19 PM - Linda Roldán: Hoola",2017-06-26 17:19:00,Linda Roldán,Hoola
1,"6/26/17, 5:19 PM - Linda Roldán: Bebe",2017-06-26 17:19:00,Linda Roldán,Bebe
2,"6/26/17, 5:46 PM - Alejandro Castellanos: Hola...",2017-06-26 17:46:00,Alejandro Castellanos,Hola chiquita
3,"6/26/17, 5:46 PM - Alejandro Castellanos: Y es...",2017-06-26 17:46:00,Alejandro Castellanos,Y este número¿
4,"6/26/17, 5:54 PM - Linda Roldán: Este es mi nu...",2017-06-26 17:54:00,Linda Roldán,Este es mi numero anterior


In [8]:
# Split data
other = df[df["user"] != "Alejandro Castellanos"].reset_index(drop = True)
mine = df[df["user"] == "Alejandro Castellanos"].reset_index(drop = True)

In [9]:
other.shape

(116957, 4)

In [10]:
mine.shape

(86117, 4)

In [11]:
pairs_list = []
context_msgs = []

for _, row in df.iterrows():
    if row["user"] != "Alejandro Castellanos":
        # acumula mensaje de Linda
        context_msgs.append(row["message"])
    else:
        # aparece tu respuesta → creamos un par si hubo contexto
        if context_msgs:
            context = " ".join(context_msgs)
            reply   = row["message"]
            pairs_list.append({
                "context": context,
                "reply":   reply
            })
            context_msgs = []  # reiniciamos para el siguiente turno

# Convertimos a DataFrame
pairs = pd.DataFrame(pairs_list)

# Vemos los primeros ejemplos
pairs.head(10)

Unnamed: 0,context,reply
0,Hoola Bebe,Hola chiquita
1,Este es mi numero anterior,"A síii, y que cel tienes?"
2,El mio le mande a arreglar,A síii???
3,Siiii,Muy bien
4,30.000 pero lo estoy pagando a plazos,Y quedó bien?
5,No se,Por qué no sabes?
6,No tiene corrector de ortografia,A eso es lo de menos :p
7,Nooo En mi situacion es lo de mas,Pero eso es de configuración
8,Asiii Y como hagoo para configurarlo,Espérate busco
9,Bueno,"Entra a configuración, busca Teclado, seleccio..."


In [12]:
hf_ds = Dataset.from_pandas(pairs)

In [13]:
# 5.2 Cargamos el tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
# Aseguramos que el token de padding sea el mismo que eos
tokenizer.pad_token = tokenizer.eos_token

# 5.3 Definimos función de tokenización
def tokenize_fn(example):
    # Construimos el prompt: ponemos 'User:' antes del contexto y 'Bot:' antes de la respuesta
    prompt = f"User: {example['context']}\nBot:"
    # Tokenizamos prompt + reply juntos
    enc = tokenizer(
        prompt + " " + example["reply"],
        truncation=True,
        max_length=64,
        padding="max_length"
    )
    # En causal LM las etiquetas son los propios input_ids
    enc["labels"] = enc["input_ids"].copy()
    return enc

# 5.4 Aplicamos el mapeo (esto puede tardar unos minutos)
tokenized = hf_ds.map(
    tokenize_fn,
    batched=False,            # True si quieres tokenizar en lotes; con CPU pequeño mejor False
    remove_columns=hf_ds.column_names
)

# Comprobamos un ejemplo tokenizado
print(tokenized[0])

Map:   0%|          | 0/50726 [00:00<?, ? examples/s]

{'input_ids': [12982, 25, 367, 10513, 1355, 1350, 198, 20630, 25, 367, 5708, 442, 1557, 5350, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [12982, 25, 367, 10513, 1355, 1350, 198, 20630, 25, 367, 5708, 442, 1557, 5350, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 5

In [14]:
len(tokenized[0]["labels"])

64

In [15]:
# 1) Carga el modelo en PyTorch
model = AutoModelForCausalLM.from_pretrained("distilgpt2")


# 2) Collator para causal LM (no hacemos MLM)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# 3) Parámetros de entrenamiento
training_args = TrainingArguments(
    output_dir="./chat_style_model",
    per_device_train_batch_size=1,
    num_train_epochs=2,
    logging_steps=100,
    dataloader_num_workers=4,
    save_steps=500,
    no_cuda=True       # fuerza CPU
)

# 4) Creamos el Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,      # tu HF Dataset ya tokenizado
    data_collator=data_collator,
)

# 5) Entrenamos
trainer.train()

# 6) Guardamos modelo y tokenizer
trainer.save_model("./chat_style_model")
tokenizer.save_pretrained("./chat_style_model")


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


KeyboardInterrupt: 