In [None]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving movie_lines.txt to movie_lines.txt
Saving movie_conversations.txt to movie_conversations.txt
User uploaded file "movie_lines.txt" with length 34641919 bytes
User uploaded file "movie_conversations.txt" with length 6760930 bytes


In [None]:
!unzip "drive-download-20250730T074752Z-1-001.zip"

unzip:  cannot find or open drive-download-20250730T074752Z-1-001.zip, drive-download-20250730T074752Z-1-001.zip.zip or drive-download-20250730T074752Z-1-001.zip.ZIP.


In [None]:
# Step 1: Parse movie_lines.txt
id2line = {}
with open("movie_lines.txt", encoding="ISO-8859-1") as f:  # You can try changing this and see what changes in the response you get :)
    for line in f:
        parts = line.strip().split(" +++$+++ ")
        if len(parts) == 5:
            line_id = parts[0]
            text = parts[4]
            id2line[line_id] = text

# Step 2: Parse movie_conversations.txt into a list of conversations
conversations = []
with open("movie_conversations.txt", encoding="ISO-8859-1") as f:  # Make sure to keet encodings consistent
    for line in f:
        parts = line.strip().split(" +++$+++ ")
        if len(parts) == 4:
            try:
                utterance_ids = eval(parts[3])  # Converts string list to actual list
                conversations.append(utterance_ids)
            except Exception as e:
                print(f"Skipping line due to eval error: {e}")

# Step 3: Build input-output pairs (prompt-response)
pairs = []
for conv in conversations:
    for i in range(len(conv) - 1):
        if conv[i] in id2line and conv[i+1] in id2line:
            input_line = id2line[conv[i]].strip()
            target_line = id2line[conv[i+1]].strip()
            if input_line and target_line:  # skip empty lines
                pairs.append((input_line, target_line))


print(f"Loaded {len(pairs)} dialog pairs.")


Loaded 221282 dialog pairs.


In [None]:
import random

# Parameters
SAMPLE_SIZE = 5_000      # how many pairs you want, you can change it
RANDOM_SEED = 42          # set this if you need deterministic sampling

# Draw the sample
random.seed(RANDOM_SEED)          # comment this out for a fresh shuffle each run
sample_pairs = random.sample(pairs, SAMPLE_SIZE)

print(f"Sampled {len(sample_pairs)} pairs.")

Sampled 5000 pairs.


In [None]:
from datasets import Dataset

# Create a Hugging Face Dataset from your list of (input, output) pairs
data = [{"input": q, "output": a} for q, a in sample_pairs]
hf_dataset = Dataset.from_list(data)

print(hf_dataset[0])  # sanity check


{'input': "You didn't come here to destroy Wintermute. You can to save a man you love. A man who isn't even capable of returning that love. Such a waste...", 'output': "My man's coming to get my ass out of here. That's good enough for me."}


In [None]:
# If not installed, uncomment the code and install
# !pip install transformers


In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
tokenizer.pad_token = tokenizer.eos_token  # Fix the pad token issue


def tokenize(example):
    #input_text = example["input"] + tokenizer.eos_token
    #output_text = example["output"] + tokenizer.eos_token
    #full_text = input_text + output_text
    input_text = "human: " + example["input"] + "\n"
    output_text = "bot: " + example["output"] + tokenizer.eos_token
    full_text = input_text + output_text
    tokens = tokenizer(full_text, truncation=True, padding="max_length", max_length=128)
    tokens["labels"] = tokens["input_ids"].copy()  # Causal language modeling
    return tokens

tokenized_dataset = hf_dataset.map(tokenize, batched=False)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [None]:
# If error persists, uncomment this code, execute it and run below snippet again
# !pip install -U transformers


In [None]:
import os
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
)
# Need to import get_last_checkpoint from the trainer_utils module
from transformers.trainer_utils import get_last_checkpoint


# 1. Detect an existing checkpoint (if any)
# Removed checkpoint detection to force training from scratch


# 2. Load model (fresh or from checkpoint)

# Always load model from scratch
model_name_or_path = "microsoft/DialoGPT-medium"
model = AutoModelForCausalLM.from_pretrained(model_name_or_path)

# (Optional but tidy) – make sure pad token is set

tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
tokenizer.pad_token = tokenizer.eos_token


training_args = TrainingArguments(
    output_dir           = "./dialogpt-finetuned",
    per_device_train_batch_size = 4,
    num_train_epochs     = 3,
    dataloader_num_workers = 2,

    # logging & checkpointing
    logging_strategy     = "steps",
    logging_steps        = 200,
    save_strategy        = "steps",
    save_steps           = 500,
    save_total_limit     = 2,

    # misc
    fp16                 = True,     # comment out if GPU doesn’t support fp16
    report_to            = "none",   # no WandB/HF Hub logging
)


# 4. Trainer
trainer = Trainer(
    model         = model,
    args          = training_args,
    train_dataset = tokenized_dataset,
    tokenizer     = tokenizer,  # keeps pad/eos alignment neat
)


# 5. Train – resume if we have a checkpoint
# Removed checkpoint resumption to force training from scratch
trainer.train()

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/863M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/863M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

  trainer = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
200,1.2028
400,0.8439
600,0.8717
800,0.8236


Step,Training Loss
200,1.2028
400,0.8439
600,0.8717
800,0.8236
1000,0.8144
1200,0.8741
1400,0.7508
1600,0.723
1800,0.7179
2000,0.6968


TrainOutput(global_step=3750, training_loss=0.7454117899576823, metrics={'train_runtime': 1411.3146, 'train_samples_per_second': 10.628, 'train_steps_per_second': 2.657, 'total_flos': 3482627604480000.0, 'train_loss': 0.7454117899576823, 'epoch': 3.0})

In [None]:
# Saving the freshly trained moel and its tokeniser
trainer.save_model("./dialogpt-finetuned/final")
tokenizer.save_pretrained("./dialogpt-finetuned/final")


('./dialogpt-finetuned/final/tokenizer_config.json',
 './dialogpt-finetuned/final/special_tokens_map.json',
 './dialogpt-finetuned/final/chat_template.jinja',
 './dialogpt-finetuned/final/vocab.json',
 './dialogpt-finetuned/final/merges.txt',
 './dialogpt-finetuned/final/added_tokens.json',
 './dialogpt-finetuned/final/tokenizer.json')

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load the fine-tuned model
model_path = "./dialogpt-finetuned/final"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model     = AutoModelForCausalLM.from_pretrained(model_path)
model.eval()

# Chat prompt
chat_history = []
while True:
    user_input = input("You: ")
    if user_input.lower() in ["exit", "quit"]: break

    new_input_ids = tokenizer.encode(user_input + tokenizer.eos_token, return_tensors='pt')
    bot_input_ids = torch.cat(chat_history + [new_input_ids], dim=-1) if chat_history else new_input_ids
    #change 1
    #print(f"Bot input ids: {bot_input_ids}")

    output_ids = model.generate(
        bot_input_ids,
        max_length=1000,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.9,
    )
    #print(f"Bot output ids: {output_ids}")


    response = tokenizer.decode(output_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)
    print(f"Bot: {response}")
    chat_history.append(new_input_ids)

You: hello


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Bot: Hello.  I see you've been here before. Is that a problem?
You: nope
Bot: I'm sorry I'm so late... but I've had so much fun this year already. It's a wonderful thing to have so many people come over to see you. And it's gonna be such a great year for you, too. I'm so glad to have you here.
You: how are you
Bot: good. your daughter went to hs. shes great. shes the reason we moved to tampa. she went to hs. and we wanted to live there. we wanted to be there with her and her girl. we wanted to be there. we wanted to have her... and we didn't.
You: right
Bot: right... so how are you doing?
You: im okay
Bot: how was your day?
You: not bad honestly
Bot: not bad in my eyes
You: okay
Bot: well thank you very much for your concern.
You: exit
