<a href="https://colab.research.google.com/github/banno-0720/Deep-Learning-Projects/blob/main/Belief_State_Transformers_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup Environment

In [1]:
!pip install transformers datasets gradio huggingface_hub

Collecting gradio
  Downloading gradio-5.30.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.1 (from gradio)
  Downloading gradio_client-1.10.1-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.10-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.

In [2]:
import torch
from torch import nn
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, GPT2Model
from datasets import load_dataset
from huggingface_hub import HfApi, notebook_login
import gradio as gr

# Loading Movie Dialogue Dataset

In [5]:
# 1) Download and unzip the raw Cornell dataset
!wget -q http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip -O cornell.zip
!unzip -q cornell.zip -d cornell_data

# 2) Read raw lines and conversations
import os
import re
from itertools import islice
from datasets import Dataset

# Load the lines into a dict: lineID → text
lines_path = "cornell_data/cornell movie-dialogs corpus/movie_lines.txt"
conv_path  = "cornell_data/cornell movie-dialogs corpus/movie_conversations.txt"

id2line = {}
with open(lines_path, encoding="latin-1") as f:
    for line in f:
        # Format: lineID +++$+++ characterID +++$+++ movieID +++$+++ character name +++$+++ text
        parts = line.strip().split(" +++$+++ ")
        if len(parts) == 5:
            id2line[parts[0]] = parts[4]

# 3) Build (context, response) pairs from the first 1,000 conversations
examples = []
with open(conv_path, encoding="latin-1") as f:
    for conv in islice(f, 1000):
        parts = conv.strip().split(" +++$+++ ")
        # parts[-1] is a string like "['L1045','L1044',...]"
        ids = re.findall(r"L[0-9]+", parts[-1])
        # For each adjacent pair in the conversation, get context/response
        for i in range(len(ids) - 1):
            c_id, r_id = ids[i], ids[i+1]
            if c_id in id2line and r_id in id2line:
                examples.append({
                    "context":  id2line[c_id],
                    "response": id2line[r_id]
                })

# 4) Take a small slice and create a HuggingFace Dataset
small = examples[:3000]   # 3K pairs for Colab‐friendly speed
data  = Dataset.from_list(small)
split = data.train_test_split(test_size=0.2, shuffle=True)
train_ds, val_ds = split["train"], split["test"]

print(f"Loaded {len(examples)} pairs, using {len(train_ds)} train and {len(val_ds)} val examples.")

Loaded 2487 pairs, using 1989 train and 498 val examples.


# Data Preprocessing and Tokenization

In [7]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # so pad_id == eos_id
MAX_LEN = 128

def tokenize_function(example):
    # 1) tokenize separately, no special tokens
    ctx_ids = tokenizer.encode(example['context'], add_special_tokens=False)
    resp_ids = tokenizer.encode(example['response'], add_special_tokens=False)

    # 2) build the full sequence with EOS separators
    eos = tokenizer.eos_token_id
    input_ids = ctx_ids + [eos] + resp_ids + [eos]

    # 3) truncate or pad
    input_ids = input_ids[:MAX_LEN]
    padding_length = MAX_LEN - len(input_ids)
    input_ids = input_ids + [tokenizer.pad_token_id] * padding_length

    # 4) build labels: mask all context + the first EOS, keep only response tokens
    labels = [-100] * MAX_LEN
    # response starts at index len(ctx_ids) + 1
    start = len(ctx_ids) + 1
    end   = start + len(resp_ids)
    end   = min(end, MAX_LEN)  # in case of truncation

    labels[start:end] = input_ids[start:end]

    return {
        'input_ids': input_ids,
        'attention_mask': [1 if i < end else 0 for i in range(MAX_LEN)],
        'labels': labels
    }

# Then remap:
train_tokenized = train_ds.map(tokenize_function, batched=False)
val_tokenized   = val_ds.map(tokenize_function, batched=False)

Map:   0%|          | 0/1989 [00:00<?, ? examples/s]

Map:   0%|          | 0/498 [00:00<?, ? examples/s]

# Baseline Transformer: GPT-2 Chatbot

In [8]:
# Load GPT-2 small model
config = GPT2Config.from_pretrained('gpt2')
config.pad_token_id = tokenizer.pad_token_id
model_gpt = GPT2LMHeadModel.from_pretrained('gpt2', config=config)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_gpt.to(device)

# Simple training loop for a few epochs (scaled down for demo)
optimizer = torch.optim.AdamW(model_gpt.parameters(), lr=5e-5)
model_gpt.train()
for epoch in range(1):
    total_loss = 0
    for batch in train_tokenized.shuffle().select(range(1000)):  # only 1000 samples for speed
        input_ids = torch.tensor([batch['input_ids']]).to(device)
        labels = torch.tensor([batch['labels']]).to(device)
        outputs = model_gpt(input_ids, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} - Loss: {total_loss/1000:.4f}")

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch 1 - Loss: nan


# Belief State Transformer Implementation

In [9]:
class BeliefStateTransformer(nn.Module):
    def __init__(self, hidden_size=768, vocab_size=None):
        super().__init__()
        # Use GPT-2 Transformer encoders for forward and backward contexts
        self.forward_encoder  = GPT2Model(config)  # encodes prefix
        self.backward_encoder = GPT2Model(config)  # encodes suffix (we will input reversed suffix)
        # Linear output heads that take concatenated forward+back states and produce vocab logits for 2 predictions
        # We output 2 vocab-size logits for (next token, prev token) jointly
        self.text_head = nn.Sequential(
            nn.Linear(hidden_size*2, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, vocab_size*2)  # 2 * vocab for two token predictions
        )

    def forward(self, input_ids, suffix_ids):
        # input_ids: [batch, T] prefix tokens
        # suffix_ids: [batch, T] suffix tokens to encode (we will reverse them)
        # Forward encoding of prefix (last hidden state)
        forward_outputs = self.forward_encoder(input_ids)[0]  # [batch, T, hidden]
        forward_state = forward_outputs[:, -1, :]  # encoding of last token

        # Backward encoding: reverse suffix sequence for GPT2-style encoding
        reversed_suffix = torch.flip(suffix_ids, dims=[1])
        backward_outputs = self.backward_encoder(reversed_suffix)[0]
        backward_outputs = torch.flip(backward_outputs, dims=[1])  # flip back
        backward_state = backward_outputs[:, 0, :]  # encoding of first token of suffix

        # Concatenate forward and backward representations
        combined = torch.cat([forward_state, backward_state], dim=-1)  # [batch, hidden*2]
        logits = self.text_head(combined)  # [batch, 2*vocab]
        # Split logits into next-token and prev-token parts
        next_logits, prev_logits = torch.chunk(logits, 2, dim=-1)  # each [batch, vocab]
        return next_logits, prev_logits

In [12]:
from torch.nn.utils.rnn import pad_sequence

bst_model = BeliefStateTransformer(hidden_size=768, vocab_size=len(tokenizer))
bst_model.to(device)
optimizer_bst = torch.optim.AdamW(bst_model.parameters(), lr=5e-5)
bst_model.train()

# Prepare simple training: split each response into prefix/suffix halves
for epoch in range(1):
    total_loss = 0.0
    # Use only a small number of examples for demonstration
    for i in range(0, 1000, 2):
        batch_inputs      = []
        batch_suffix      = []
        batch_next_labels = []
        batch_prev_labels = []

        # Build a batch of 2 samples
        for j in range(2):
            text = train_ds[i+j]['response']
            tokens = tokenizer.encode(text, max_length=50, truncation=True)
            if len(tokens) < 4:
                continue
            # Split tokens into prefix and suffix parts
            mid = len(tokens) // 2
            prefix, suffix = tokens[:mid], tokens[mid:]

            # Prepare input and suffix tensors
            inp_ids    = torch.tensor(prefix + [tokenizer.eos_token_id]).squeeze(0)
            suff_ids   = torch.tensor([tokenizer.eos_token_id] + suffix).squeeze(0)
            next_label = torch.tensor([suffix[0]])   # next-token ground truth
            prev_label = torch.tensor([prefix[-1]])  # prev-token ground truth

            batch_inputs.append(inp_ids)
            batch_suffix.append(suff_ids)
            batch_next_labels.append(next_label)
            batch_prev_labels.append(prev_label)

        if not batch_inputs:
            continue

        # Pad all sequences in the batch to the same length
        pad_id = tokenizer.eos_token_id
        input_ids  = pad_sequence(batch_inputs, batch_first=True, padding_value=pad_id).to(device)
        suffix_ids = pad_sequence(batch_suffix, batch_first=True, padding_value=pad_id).to(device)

        # Stack label tensors
        next_labels = torch.cat(batch_next_labels).to(device)
        prev_labels = torch.cat(batch_prev_labels).to(device)

        # Forward pass and loss
        next_logits, prev_logits = bst_model(input_ids, suffix_ids)
        loss_next = nn.CrossEntropyLoss()(next_logits, next_labels)
        loss_prev = nn.CrossEntropyLoss()(prev_logits, prev_labels)
        loss = loss_next + loss_prev

        # Backward & optimize
        loss.backward()
        optimizer_bst.step()
        optimizer_bst.zero_grad()

        total_loss += loss.item()

    print(f"BST Epoch {epoch+1} - Loss: {total_loss:.4f}")

BST Epoch 1 - Loss: 8142.5363


# Model Comparison

In [15]:
model_gpt.eval()
bst_model.eval()

def reply_baseline(user_input):
    # Tokenize with padding/truncation so we get an attention_mask
    encoding = tokenizer(
        user_input,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=128
    ).to(device)

    # Generate up to 50 new tokens, block repeated trigrams
    gen_ids = model_gpt.generate(
        input_ids=encoding["input_ids"],
        attention_mask=encoding["attention_mask"],
        max_new_tokens=50,
        no_repeat_ngram_size=3,
        pad_token_id=tokenizer.eos_token_id
    )
    # Strip off the prompt tokens
    reply = tokenizer.decode(
        gen_ids[0, encoding["input_ids"].size(1):],
        skip_special_tokens=True
    )
    return reply

def reply_bst(user_input):
    # 1) Tokenize the prefix with mask
    encoding = tokenizer(
        user_input,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=128
    ).to(device)

    input_ids     = encoding["input_ids"]
    attn_mask     = encoding["attention_mask"]
    # 2) Build an “empty” suffix (just EOS) with mask
    suffix_ids    = torch.tensor([[tokenizer.eos_token_id]], device=device)
    suffix_mask   = torch.ones_like(suffix_ids, device=device)

    with torch.no_grad():
        # Pass both through BST
        next_logits, prev_logits = bst_model(input_ids, suffix_ids)

        # Extract belief‐state vector
        fwd = bst_model.forward_encoder(input_ids, attention_mask=attn_mask)[0][:, -1, :]
        bwd = bst_model.backward_encoder(torch.flip(suffix_ids, [1]))[0]
        bwd = torch.flip(bwd, [1])[:, 0, :]
        belief_state = (fwd + bwd).cpu().numpy().flatten()[:10]  # first 10 dims

        # For reply we again use GPT-2 but with proper mask
        gen_ids = model_gpt.generate(
            input_ids=input_ids,
            attention_mask=attn_mask,
            max_new_tokens=30,
            no_repeat_ngram_size=3,
            pad_token_id=tokenizer.eos_token_id
        )
        bst_reply = tokenizer.decode(
            gen_ids[0, input_ids.size(1):],
            skip_special_tokens=True
        )

    # Format belief as string
    belief_str = ", ".join(f"{x:.3f}" for x in belief_state)
    return bst_reply, belief_str

# Example turn:
context = "Hi, how are you?"
print("User:", context)
print("GPT-2 Baseline Reply:", reply_baseline(context))
bst_resp, bst_belief = reply_bst(context)
print("Belief-State Model Reply:", bst_resp)
print("Belief State (first 10 dims):", bst_belief)

User: Hi, how are you?
GPT-2 Baseline Reply:   I'm fine.  I know what you did.  You're a good guy.  And you're a great guy. I'm sorry.  What?  You know what I'm saying.  It's not a good thing.
Belief-State Model Reply:   I'm fine.  I know what you did.  You're a good guy.  And you're a great guy. I'm sorry
Belief State (first 10 dims): -0.878, -0.247, -1.241, 1.932, -0.945, 1.102, 0.552, -2.854, -0.904, 0.830


# Interactive Chat Demo with Gradio

In [17]:
# Make sure your reply functions return exactly what we expect:

def reply_baseline(user_input):
    encoding = tokenizer(
        user_input,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=128
    ).to(device)
    gen_ids = model_gpt.generate(
        input_ids=encoding["input_ids"],
        attention_mask=encoding["attention_mask"],
        max_new_tokens=50,
        no_repeat_ngram_size=3,
        pad_token_id=tokenizer.eos_token_id
    )
    return tokenizer.decode(
        gen_ids[0, encoding["input_ids"].size(1):],
        skip_special_tokens=True
    )

def reply_bst(user_input):
    encoding = tokenizer(
        user_input,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=128
    ).to(device)
    input_ids   = encoding["input_ids"]
    attn_mask   = encoding["attention_mask"]
    suffix_ids  = torch.tensor([[tokenizer.eos_token_id]], device=device)

    with torch.no_grad():
        # belief‐state vector
        fwd = bst_model.forward_encoder(input_ids, attention_mask=attn_mask)[0][:, -1, :]
        bwd = bst_model.backward_encoder(torch.flip(suffix_ids, [1]))[0]
        bwd = torch.flip(bwd, [1])[:, 0, :]
        belief_vec = (fwd + bwd).cpu().numpy().flatten()[:10]

        # reply text via GPT-2
        gen_ids = model_gpt.generate(
            input_ids=input_ids,
            attention_mask=attn_mask,
            max_new_tokens=30,
            no_repeat_ngram_size=3,
            pad_token_id=tokenizer.eos_token_id
        )
        bst_reply = tokenizer.decode(
            gen_ids[0, input_ids.size(1):],
            skip_special_tokens=True
        )

    # format belief vector as comma-separated string
    belief_str = ", ".join(f"{x:.3f}" for x in belief_vec)
    return bst_reply, belief_str

# Corrected chat_models:
def chat_models(user_input):
    baseline_resp = reply_baseline(user_input)
    bst_resp, belief_str = reply_bst(user_input)
    return baseline_resp, bst_resp, belief_str

# Rebuild the interface:
import gradio as gr

iface = gr.Interface(
    fn=chat_models,
    inputs=gr.Textbox(lines=2, placeholder="Type your message...", label="Your Message"),
    outputs=[
        gr.Textbox(label="Baseline (GPT-2) Reply"),
        gr.Textbox(label="Belief-State Model Reply"),
        gr.Textbox(label="Belief State (first 10 dims)")
    ],
    title="Movie Dialogue Belief-State Chatbot",
    description="Enter a movie-style dialogue line; see both models' replies and the BST's hidden state."
)

# Launch (you can add share=True if you need a public URL)
iface.launch()

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://eecfdb116be04f2eed.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




# Saving and Pushing Models to Hugging Face Hub

In [19]:
from huggingface_hub import HfApi
import os
import torch
import json

# 1) Make sure you’ve already created the repo on HF Hub:
api = HfApi()
repo_bst = "HimanshuGoyal2004/movie-dialog-beliefstate"
api.create_repo(repo_bst, exist_ok=True)

# 2) Save your model’s state_dict + a minimal “config.json”
#    and the tokenizer to a local folder
bst_dir = "bst_model/"
os.makedirs(bst_dir, exist_ok=True)

# Save weights
torch.save(bst_model.state_dict(), os.path.join(bst_dir, "pytorch_model.bin"))

# Save a simple config so users know how to rebuild:
config = {
    "hidden_size": 768,
    "vocab_size": len(tokenizer),
    # (add any other hyperparams your __init__ needs)
}
with open(os.path.join(bst_dir, "config.json"), "w") as f:
    json.dump(config, f)

# Save the tokenizer files too
tokenizer.save_pretrained(bst_dir)

# 3) Upload the entire folder to the Hub
api.upload_folder(
    folder_path=bst_dir,
    repo_id=repo_bst,
    repo_type="model",
    commit_message="Add Belief State Transformer model files"
)

pytorch_model.bin:   0%|          | 0.00/1.31G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/HimanshuGoyal2004/movie-dialog-beliefstate/commit/2cf7bfbd947cde8ffb3a67a5c5e31475084c2a75', commit_message='Add Belief State Transformer model files', commit_description='', oid='2cf7bfbd947cde8ffb3a67a5c5e31475084c2a75', pr_url=None, repo_url=RepoUrl('https://huggingface.co/HimanshuGoyal2004/movie-dialog-beliefstate', endpoint='https://huggingface.co', repo_type='model', repo_id='HimanshuGoyal2004/movie-dialog-beliefstate'), pr_revision=None, pr_num=None)

# Deploying to Hugging Face Spaces

In [22]:
# (In a local or Colab terminal)
!huggingface-cli login
!huggingface-cli repo create HimanshuGoyal2004/movie-dialog-belief-demo --space-sdk gradio --type space


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: write

In [24]:
from huggingface_hub import HfApi

api = HfApi()
# Replace with your username and desired space name
space_id = "HimanshuGoyal2004/movie-dialog-belief-demo"

# repo_type="space" and space_sdk="gradio" tell HF this is a Gradio Space
api.create_repo(
    repo_id=space_id,
    repo_type="space",
    space_sdk="gradio",
    exist_ok=True   # won't fail if it already exists
)

print(f"✅ Created space: https://huggingface.co/spaces/{space_id}")

✅ Created space: https://huggingface.co/spaces/HimanshuGoyal2004/movie-dialog-belief-demo


In [25]:
from IPython.display import HTML


# You can get embeddable HTML code for your demo by clicking the "Embed" button on the demo page
HTML(data='''
<iframe
	src="https://himanshugoyal2004-movie-dialog-belief-demo.hf.space"
	frameborder="0"
	width="850"
	height="450"
></iframe>
''')