## Importing libraries

In [None]:
!pip install datasets



In [None]:
import pandas as pd
import torch
import os
import re
import gc
import glob
import json
import pickle
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import Counter
from google.colab import drive
from difflib import get_close_matches
from datasets import load_dataset

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
# Mount Google Drive at /content/drive (Colab only; relies on `google.colab.drive`).
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Root folder on the mounted Drive. Later cells read and write the
# dataset/, config/, checkpoints/ and model_output/ sub-folders beneath it.
target_directory = '/content/drive/My Drive/chatbot_data'

## Find Parquet files

In [None]:
# Find Parquet files
def get_parquet_files(directory):
    """Recursively collect the training Parquet files found under ``directory``.

    Only files named ``daily_dialog.parquet`` or ``dataset<N>.parquet`` are
    returned; everything else is ignored.

    Args:
        directory: Folder to walk (sub-folders included).

    Returns:
        List of full paths ordered as: ``daily_dialog.parquet`` first, then
        ``dataset<N>.parquet`` by ascending numeric ``N``. Ties (the same file
        name in several sub-folders) are broken by path so the order is
        deterministic. An empty list is returned when nothing matches.
    """
    pattern = re.compile(r'dataset\d+\.parquet$|daily_dialog\.parquet$')
    file_list = [
        os.path.join(root, file)
        for root, _dirs, files in os.walk(directory)
        for file in files
        if pattern.match(file)
    ]

    def sort_key(path):
        # Look at the file name only: a parent folder whose name happens to
        # contain "daily_dialog" or "dataset<N>.parquet" must not affect order.
        name = os.path.basename(path)
        if 'daily_dialog' in name.lower():
            return (-1, path)  # Place daily_dialog first
        match = re.search(r'dataset(\d+)\.parquet', name, re.IGNORECASE)
        # Sort by number, or place at end if no match
        return (int(match.group(1)) if match else float('inf'), path)

    file_list.sort(key=sort_key)
    return file_list

# Find and process the actual Parquet files
parquet_files = get_parquet_files(f'{target_directory}/dataset/')
print(f"Found {len(parquet_files)} files:", parquet_files)

# Fail with an actionable message instead of a bare IndexError when the
# dataset folder is empty (e.g. Drive not mounted or files not uploaded yet).
if not parquet_files:
    raise FileNotFoundError(
        f"No dataset<N>.parquet or daily_dialog.parquet files found under {target_directory}/dataset/"
    )

# Verify one file by loading a few rows
df_sample = pd.read_parquet(parquet_files[0])
print("Sample data from first file (first 2 rows):")
print(df_sample.head(2))
print("Number of rows in first file:", len(df_sample))

Found 6 files: ['/content/drive/My Drive/chatbot_data/dataset/daily_dialog.parquet', '/content/drive/My Drive/chatbot_data/dataset/dataset0.parquet', '/content/drive/My Drive/chatbot_data/dataset/dataset1.parquet', '/content/drive/My Drive/chatbot_data/dataset/dataset2.parquet', '/content/drive/My Drive/chatbot_data/dataset/dataset3.parquet', '/content/drive/My Drive/chatbot_data/dataset/dataset4.parquet']
Sample data from first file (first 2 rows):
                                                text
0  Say , Jim , how about going for a few beers af...
1   You know that is tempting but is really not g...
Number of rows in first file: 87170


## Load DailyDialog dataset

In [None]:
# Pull the DailyDialog training split; each entry of 'dialog' is a list of utterances.
dataset = load_dataset('daily_dialog', split='train', trust_remote_code=True)
dialogs = dataset['dialog']

# Flatten every dialogue into individual utterances and wrap them in a
# single-column frame ('text'), matching the layout of the Cosmopedia files.
texts = [utterance for dialog in dialogs for utterance in dialog]
dialog_df = pd.DataFrame({'text': texts})

# Persist the frame to Google Drive as Parquet.
dialog_parquet_path = f'{target_directory}/dataset/daily_dialog.parquet'
dialog_df.to_parquet(dialog_parquet_path)

# Re-scan the dataset folder so the file list includes the new file.
parquet_files = get_parquet_files(f'{target_directory}/dataset/')
print(f"Updated parquet files: {parquet_files}")

# Read the file back as a sanity check.
df_dialog_sample = pd.read_parquet(dialog_parquet_path)
print("Sample data from DailyDialog file (first 2 rows):")
print(df_dialog_sample.head(2))
print("Number of rows in DailyDialog file:", len(df_dialog_sample))

Updated parquet files: ['/content/drive/My Drive/chatbot_data/dataset/daily_dialog.parquet', '/content/drive/My Drive/chatbot_data/dataset/dataset0.parquet', '/content/drive/My Drive/chatbot_data/dataset/dataset1.parquet', '/content/drive/My Drive/chatbot_data/dataset/dataset2.parquet', '/content/drive/My Drive/chatbot_data/dataset/dataset3.parquet', '/content/drive/My Drive/chatbot_data/dataset/dataset4.parquet']
Sample data from DailyDialog file (first 2 rows):
                                                text
0  Say , Jim , how about going for a few beers af...
1   You know that is tempting but is really not g...
Number of rows in DailyDialog file: 87170


## Build Vocab

### Save and load vocab functions

In [None]:
def save_vocab(vocab, word_to_idx, idx_to_word, path=f'{target_directory}/config/'):
    """Write the three vocabulary structures to JSON files inside ``path``.

    Creates ``path`` if needed, then writes ``vocab.json``,
    ``word_to_idx.json`` and ``idx_to_word.json``. Note that JSON object keys
    are always strings, so integer keys of ``idx_to_word`` are stored as text.
    """
    os.makedirs(path, exist_ok=True)

    artifacts = (
        ('vocab.json', vocab),
        ('word_to_idx.json', word_to_idx),
        ('idx_to_word.json', idx_to_word),
    )
    for filename, payload in artifacts:
        with open(os.path.join(path, filename), 'w') as handle:
            json.dump(payload, handle)

    print(f"✅ Vocabulary saved to {path}")

def load_vocab(path=f'{target_directory}/config/'):
    """Read the vocabulary JSON files from ``path`` and refresh ``Config.vocab_size``.

    Returns:
        ``(vocab, word_to_idx, idx_to_word)`` exactly as decoded from
        ``vocab.json``, ``word_to_idx.json`` and ``idx_to_word.json``. Because
        the data comes from JSON, the keys of ``idx_to_word`` are strings.
    """
    def _read(filename):
        with open(os.path.join(path, filename), 'r') as handle:
            return json.load(handle)

    vocab = _read('vocab.json')
    word_to_idx = _read('word_to_idx.json')
    idx_to_word = _read('idx_to_word.json')

    # Keep the global config in sync with the vocabulary that was just loaded.
    Config.vocab_size = len(vocab)
    print(f"✅ Vocabulary loaded from {path} (size: {Config.vocab_size})")
    return vocab, word_to_idx, idx_to_word

### Save and load config

In [None]:
def save_config(path=f'{target_directory}/config/'):
    """Snapshot the serialisable ``Config`` attributes into ``config.json`` under ``path``.

    The tokenizer is a function and is intentionally not part of the snapshot.
    """
    field_names = (
        'seq_length',
        'batch_size',
        'learning_rate',
        'device',
        'vocab_size',
        'pad_token',
        'unk_token',
    )
    os.makedirs(path, exist_ok=True)
    config_dict = {name: getattr(Config, name) for name in field_names}
    with open(os.path.join(path, 'config.json'), 'w') as handle:
        json.dump(config_dict, handle)
    print(f"✅ Config saved to {path}")

def load_config(path=f'{target_directory}/config/'):
    """Restore ``Config`` attributes from ``config.json`` under ``path``.

    Every expected key must be present in the file; a missing key raises
    ``KeyError``.
    """
    with open(os.path.join(path, 'config.json'), 'r') as handle:
        config_dict = json.load(handle)

    # Same fields, same order, as written by save_config().
    for name in ('seq_length', 'batch_size', 'learning_rate', 'device',
                 'vocab_size', 'pad_token', 'unk_token'):
        setattr(Config, name, config_dict[name])

    print(f"✅ Config loaded from {path}")

### Build Vocab

In [None]:
# Config
class Config:
    """Notebook-wide settings, stored as class attributes and read as ``Config.<name>``."""
    # Tokenizer callable (NLTK word_tokenize); used as Config.tokenizer(text).
    tokenizer = word_tokenize
    # Size of the model's positional table; dataset samples are cut/padded to seq_length - 1 ids.
    seq_length = 128
    # Batch size handed to the DataLoader.
    batch_size = 16
    # Learning rate handed to the Adam optimizer.
    learning_rate = 0.001
    # 'cuda' when torch reports an available GPU, otherwise 'cpu'.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Filled in by build_vocab() / load_vocab() / load_config().
    vocab_size = None
    # Special tokens placed at the front of the vocabulary by build_vocab().
    pad_token = '<pad>'
    unk_token = '<unk>'

# Build vocabulary
def build_vocab(parquet_files, min_freq=100):
    """Build the word vocabulary from the ``text`` column of the given Parquet files.

    Tokens are lowercased before counting because ``ChatDataset`` lowercases
    tokens before looking them up; counting case-sensitively would split the
    counts of e.g. "The"/"the" and produce capitalised entries that can never
    be hit at lookup time.

    Args:
        parquet_files: Iterable of Parquet file paths, each with a ``text`` column.
        min_freq: Minimum number of occurrences for a word to be kept.

    Returns:
        ``(vocab, word_to_idx, idx_to_word)`` where ``vocab`` starts with the
        pad and unk tokens followed by the kept words in sorted order (so the
        index assignment is reproducible between runs). Also sets
        ``Config.vocab_size``.

    Note:
        Rebuilding the vocabulary changes token indices, so checkpoints trained
        against an earlier vocabulary are not compatible with a new one.
    """
    word_counts = Counter()
    # Count file by file so only one file's texts are in memory at a time.
    for file in parquet_files:
        df = pd.read_parquet(file)
        for text in df['text'].astype(str):
            word_counts.update(token.lower() for token in Config.tokenizer(text))
        del df
        gc.collect()

    # isalpha() drops punctuation/numbers (and can never match '<pad>'/'<unk>').
    kept_words = sorted(
        word for word, count in word_counts.items()
        if count >= min_freq and word.isalpha()
    )
    vocab = [Config.pad_token, Config.unk_token] + kept_words
    word_to_idx = {word: i for i, word in enumerate(vocab)}
    idx_to_word = {i: word for word, i in word_to_idx.items()}
    Config.vocab_size = len(vocab)
    return vocab, word_to_idx, idx_to_word

# Build the vocabulary and print results
# (reads every file in `parquet_files`; words seen fewer than 100 times are dropped)
vocab, word_to_idx, idx_to_word = build_vocab(parquet_files, min_freq=100)
print(f"Vocabulary size: {Config.vocab_size}")
# The first two entries are always the pad and unk tokens.
print("Sample vocab words:", vocab[:10])

Vocabulary size: 48293
Sample vocab words: ['<pad>', '<unk>', 'swings', 'Aesthetic', 'Knicks', 'Development', 'cyclical', 'corresponds', 'drawback', 'avail']
✅ Vocabulary saved to /content/drive/My Drive/chatbot_data/config/


### Save config and vocab

In [None]:
# Persist the vocabulary and the current Config values under the default
# `<target_directory>/config/` folder so later cells can reload them.
save_vocab(vocab, word_to_idx, idx_to_word)
save_config()

✅ Vocabulary saved to /content/drive/My Drive/chatbot_data/config/
✅ Config saved to /content/drive/My Drive/chatbot_data/config/


## Create dataset class

In [None]:
# Config
class Config:
    """Placeholder settings; the ``None`` fields are filled in by ``load_config()`` below."""
    # Tokenizer callable (NLTK word_tokenize); not stored in config.json.
    tokenizer = word_tokenize
    seq_length = None
    batch_size = None
    learning_rate = None
    # Overwritten by load_config() with the value stored in config.json.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    vocab_size = None
    pad_token = None
    unk_token = None

# Restore the saved settings and vocabulary from the default config folder.
load_config()
# NOTE: idx_to_word comes back from JSON, so its keys are strings.
vocab, word_to_idx, idx_to_word = load_vocab()

# Dataset
class ChatDataset(Dataset):
    """Next-token-prediction dataset over the ``text`` column of one Parquet file.

    Each item is a dict with two ``LongTensor``s of length
    ``Config.seq_length - 1``:

    * ``input_ids``: token ids of the text, truncated / right-padded.
    * ``labels``: the same sequence shifted left by one position.
    """

    def __init__(self, file_path, word_to_idx):
        """Load all texts of ``file_path`` into memory.

        Args:
            file_path: Parquet file with a ``text`` column.
            word_to_idx: Mapping from token to integer id; must contain the
                pad and unk tokens named in ``Config``.
        """
        frame = pd.read_parquet(file_path)
        self.texts = frame['text'].astype(str).tolist()
        self.word_to_idx = word_to_idx
        # Only the list of strings is needed; release the DataFrame early.
        del frame
        gc.collect()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        pad_id = self.word_to_idx[Config.pad_token]
        unk_id = self.word_to_idx[Config.unk_token]
        window = Config.seq_length

        # Encode up to `window` tokens (one more than the input length) so the
        # last input position of a truncated text still has its real next
        # token as label instead of a pad.
        token_ids = [
            self.word_to_idx.get(token.lower(), unk_id)
            for token in Config.tokenizer(self.texts[idx])
        ][:window]
        token_ids += [pad_id] * (window - len(token_ids))

        input_ids = token_ids[:-1]
        labels = token_ids[1:]
        return {
            'input_ids': torch.tensor(input_ids).long(),
            'labels': torch.tensor(labels).long()
        }

# Test the dataset with the first file
# (parquet_files[0] is daily_dialog.parquet when present, given the sort order)
dataset = ChatDataset(parquet_files[0], word_to_idx)
sample = dataset[0]
# labels should read as input_ids shifted left by one position
print("Sample input IDs:", sample['input_ids'][:10].tolist())
print("Sample labels:", sample['labels'][:10].tolist())
print("Dataset size:", len(dataset))

✅ Config loaded from /content/drive/My Drive/chatbot_data/config/
✅ Vocabulary loaded from /content/drive/My Drive/chatbot_data/config/ (size: 48293)
Sample input IDs: [38657, 1, 1, 1, 32783, 838, 8901, 40401, 29945, 46638]
Sample labels: [1, 1, 1, 32783, 838, 8901, 40401, 29945, 46638, 1700]
Dataset size: 87170


## Transformer model

In [None]:
# Transformer Model
class MiniTransformer(nn.Module):
    """Small decoder-only language model built on ``nn.TransformerDecoder``.

    Args:
        vocab_size: Number of rows in the embedding table / output logits.
        d_model: Embedding and hidden width.
        n_heads: Attention heads per layer.
        num_layers: Number of decoder layers.
        max_seq_length: Number of learned positions. Defaults to
            ``Config.seq_length`` when omitted.

    Parameter names are unchanged (``embed``, ``pos_embed``, ``transformer``,
    ``fc``), so existing state dicts still load. Weights trained before the
    attention layout fix below should be retrained to benefit from it.
    """

    def __init__(self, vocab_size, d_model=256, n_heads=4, num_layers=2, max_seq_length=None):
        super().__init__()
        if max_seq_length is None:
            max_seq_length = Config.seq_length
        self.embed = nn.Embedding(vocab_size, d_model)
        self.pos_embed = nn.Parameter(torch.zeros(1, max_seq_length, d_model))
        # batch_first=True: inputs are (batch, seq, d_model). With the default
        # (False) the layers would treat the batch axis as the sequence axis
        # and attend across unrelated samples instead of across tokens.
        self.transformer = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model, n_heads, batch_first=True), num_layers
        )
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, input_ids):
        """Map ``input_ids`` of shape (batch, seq) to logits of shape (batch, seq, vocab_size)."""
        seq_len = input_ids.size(1)
        x = self.embed(input_ids) + self.pos_embed[:, :seq_len, :]
        # True above the diagonal = "may not attend": position i only sees
        # positions <= i. Applied to self-attention and to the cross-attention
        # over `x` (used as memory) so no future token leaks into a prediction.
        causal_mask = torch.triu(
            torch.ones(seq_len, seq_len, dtype=torch.bool, device=input_ids.device),
            diagonal=1,
        )
        x = self.transformer(x, x, tgt_mask=causal_mask, memory_mask=causal_mask)
        return self.fc(x)

# Build the model on the configured device and report its size.
model = MiniTransformer(Config.vocab_size).to(Config.device)
param_count = sum(p.numel() for p in model.parameters())
print("Model initialized with vocab size:", Config.vocab_size)
print("Model parameters:", param_count)

# Smoke-test the forward pass on one dataset sample (batch dimension added).
sample_input = dataset[0]['input_ids'].unsqueeze(0).to(Config.device)
with torch.no_grad():
    output = model(sample_input)
print("Sample output shape:", output.shape)

Model initialized with vocab size: 48293
Model parameters: 27964581
Sample output shape: torch.Size([1, 127, 48293])


## Train transformer

In [None]:
# Training function with checkpoint resumption
def _latest_epoch_checkpoint(checkpoint_dir):
    """Return the path of the highest-numbered ``epoch_<N>.pt`` in ``checkpoint_dir``, or None."""
    numbered = []
    for path in glob.glob(os.path.join(checkpoint_dir, 'epoch_*.pt')):
        # Ignore files that match the glob but carry no epoch number
        # (e.g. "epoch_best.pt") instead of crashing on them.
        match = re.fullmatch(r'epoch_(\d+)\.pt', os.path.basename(path))
        if match:
            numbered.append((int(match.group(1)), path))
    return max(numbered)[1] if numbered else None


def train_model(file_list, num_epochs=5, checkpoint_dir=f'{target_directory}/checkpoints/'):
    """Train a fresh ``MiniTransformer`` on the given files, resuming from checkpoints.

    If ``checkpoint_dir`` already holds ``epoch_<N>.pt`` files, model and
    optimizer state are restored from the highest ``N`` and training continues
    at epoch ``N + 1``. When that is already past ``num_epochs`` no training
    happens and the restored model is returned.

    Args:
        file_list: Parquet files to iterate over in every epoch (one
            ``ChatDataset`` at a time).
        num_epochs: Last epoch number to train (epochs are 1-based).
        checkpoint_dir: Folder for the per-epoch checkpoints.

    Returns:
        The trained (or restored) model.
    """
    os.makedirs(checkpoint_dir, exist_ok=True)

    model = MiniTransformer(Config.vocab_size).to(Config.device)
    optimizer = optim.Adam(model.parameters(), lr=Config.learning_rate)
    # Padded label positions do not contribute to the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=word_to_idx[Config.pad_token])

    # Check for existing checkpoints
    start_epoch = 1
    latest_checkpoint = _latest_epoch_checkpoint(checkpoint_dir)
    if latest_checkpoint is not None:
        checkpoint = torch.load(latest_checkpoint, map_location=Config.device)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch'] + 1
        print(f"Resuming from checkpoint: {latest_checkpoint} (starting at epoch {start_epoch})")

    # Make the training mode explicit (enables dropout) rather than relying on
    # the default state of a freshly constructed module.
    model.train()

    # Train from start_epoch to num_epochs
    for epoch in range(start_epoch, num_epochs + 1):
        print(f"\nEpoch {epoch}/{num_epochs}")
        for file_idx, file_path in enumerate(file_list):
            print(f"Processing file {file_idx+1}/{len(file_list)}: {os.path.basename(file_path)}")
            dataset = ChatDataset(file_path, word_to_idx)
            dataloader = DataLoader(dataset, batch_size=Config.batch_size, shuffle=True)

            for batch_idx, batch in enumerate(dataloader):
                inputs = batch['input_ids'].to(Config.device)
                labels = batch['labels'].to(Config.device)

                optimizer.zero_grad()
                outputs = model(inputs)
                # Flatten (batch, seq, vocab) / (batch, seq) for token-level cross-entropy.
                loss = criterion(outputs.view(-1, Config.vocab_size), labels.view(-1))
                loss.backward()
                optimizer.step()

                if batch_idx % 50 == 0:
                    print(f"Batch {batch_idx}, Loss: {loss.item():.4f}")
                    torch.cuda.empty_cache()

            # Free the file's texts before loading the next file.
            del dataset, dataloader
            gc.collect()

        # Save checkpoint
        checkpoint_path = os.path.join(checkpoint_dir, f"epoch_{epoch}.pt")
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict()
        }, checkpoint_path)
        print(f"Checkpoint saved to {checkpoint_path}")

    return model

# Train on Cosmopedia files (exclude daily_dialog.parquet for now)
# daily_dialog.parquet is used by the separate fine-tuning cell further down.
cosmopedia_files = [f for f in parquet_files if 'daily_dialog' not in f]
print(f"Training on Cosmopedia files: {cosmopedia_files}")

# Resumes from the latest epoch_<N>.pt checkpoint if one exists.
model = train_model(cosmopedia_files, num_epochs=5)

def save_trained_model(model, save_dir=f'{target_directory}/model_output/'):
    """Write the model's state dict to ``chatbot_model.pt`` inside ``save_dir`` (created if missing)."""
    os.makedirs(save_dir, exist_ok=True)
    destination = os.path.join(save_dir, 'chatbot_model.pt')
    weights = model.state_dict()
    torch.save(weights, destination)
    print(f"✅ Final trained model saved to {destination}")

# Store the pre-trained weights in the default output folder.
save_trained_model(model)

Training on Cosmopedia files: ['/content/drive/My Drive/chatbot_data/dataset/dataset0.parquet', '/content/drive/My Drive/chatbot_data/dataset/dataset1.parquet', '/content/drive/My Drive/chatbot_data/dataset/dataset2.parquet', '/content/drive/My Drive/chatbot_data/dataset/dataset3.parquet', '/content/drive/My Drive/chatbot_data/dataset/dataset4.parquet']
Resuming from checkpoint: /content/drive/My Drive/chatbot_data/checkpoints/epoch_5.pt (starting at epoch 6)
✅ Final trained model saved to /content/drive/My Drive/chatbot_data/model_output/chatbot_model.pt


In [None]:
# Pick the pre-training checkpoint with the highest epoch number and load its
# weights into a freshly constructed model.
checkpoint_dir = f'{target_directory}/checkpoints/'
checkpoint_files = glob.glob(os.path.join(checkpoint_dir, 'epoch_*.pt'))
latest_checkpoint = max(
    checkpoint_files,
    key=lambda path: int(re.search(r'epoch_(\d+)\.pt', path).group(1)),
)
checkpoint = torch.load(latest_checkpoint, map_location=Config.device)
model = MiniTransformer(Config.vocab_size).to(Config.device)
model.load_state_dict(checkpoint['model_state_dict'])
print(f"✅ Loaded checkpoint: {latest_checkpoint}")

# Fine-tuning function
def fine_tune_model(file_path, num_epochs=1, checkpoint_dir=f'{target_directory}/checkpoints/'):
    """Fine-tune the notebook-level ``model`` on a single Parquet file.

    Uses a new Adam optimizer and a pad-ignoring cross-entropy loss. After
    every epoch a ``finetune_epoch_<N>.pt`` checkpoint is written to
    ``checkpoint_dir``.

    Args:
        file_path: Parquet file with a ``text`` column.
        num_epochs: Number of passes over the file.
        checkpoint_dir: Folder for the fine-tuning checkpoints.

    Returns:
        The (global) model after fine-tuning.
    """
    os.makedirs(checkpoint_dir, exist_ok=True)
    optimizer = optim.Adam(model.parameters(), lr=Config.learning_rate)
    criterion = nn.CrossEntropyLoss(ignore_index=word_to_idx[Config.pad_token])

    dataset = ChatDataset(file_path, word_to_idx)
    dataloader = DataLoader(dataset, batch_size=Config.batch_size, shuffle=True)

    def run_step(batch):
        """One optimisation step on ``batch``; returns the loss tensor."""
        inputs = batch['input_ids'].to(Config.device)
        labels = batch['labels'].to(Config.device)

        optimizer.zero_grad()
        outputs = model(inputs)
        step_loss = criterion(outputs.view(-1, Config.vocab_size), labels.view(-1))
        step_loss.backward()
        optimizer.step()
        return step_loss

    model.train()
    for epoch in range(1, num_epochs + 1):
        print(f"\nFine-tuning Epoch {epoch}/{num_epochs}")
        for batch_idx, batch in enumerate(dataloader):
            loss = run_step(batch)

            # Report (and release cached GPU memory) every 50th batch.
            if batch_idx % 50 == 0:
                print(f"Batch {batch_idx}, Loss: {loss.item():.4f}")
                torch.cuda.empty_cache()

        # Snapshot model and optimizer state at the end of each epoch.
        checkpoint_path = os.path.join(checkpoint_dir, f"finetune_epoch_{epoch}.pt")
        snapshot = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict()
        }
        torch.save(snapshot, checkpoint_path)
        print(f"Fine-tuning checkpoint saved to {checkpoint_path}")

    del dataset, dataloader
    gc.collect()
    return model

# Fine-tune on daily_dialog.parquet
# ([0] raises IndexError if no daily_dialog file is present in parquet_files)
daily_dialog_file = [f for f in parquet_files if 'daily_dialog' in f][0]
print(f"Fine-tuning on: {daily_dialog_file}")
# fine_tune_model updates the notebook-level `model` and returns it.
model = fine_tune_model(daily_dialog_file, num_epochs=1)

# Save fine-tuned model
def save_finetuned_model(model, save_dir=f'{target_directory}/model_output/'):
    """Write the model's state dict to ``chatbot_finetuned.pt`` inside ``save_dir`` (created if missing)."""
    os.makedirs(save_dir, exist_ok=True)
    destination = os.path.join(save_dir, 'chatbot_finetuned.pt')
    weights = model.state_dict()
    torch.save(weights, destination)
    print(f"✅ Fine-tuned model weights saved to {destination}")

# Store the fine-tuned weights in the default output folder.
save_finetuned_model(model)

✅ Loaded checkpoint: /content/drive/My Drive/chatbot_data/checkpoints/epoch_5.pt
Fine-tuning on: /content/drive/My Drive/chatbot_data/dataset/daily_dialog.parquet

Fine-tuning Epoch 1/1
Batch 0, Loss: 14.1991
Batch 50, Loss: 6.0999
Batch 100, Loss: 6.0511
Batch 150, Loss: 5.9441
Batch 200, Loss: 5.7168
Batch 250, Loss: 5.7921
Batch 300, Loss: 5.9514
Batch 350, Loss: 5.3910
Batch 400, Loss: 5.2049
Batch 450, Loss: 5.2378
Batch 500, Loss: 5.5569
Batch 550, Loss: 5.0712
Batch 600, Loss: 5.4929
Batch 650, Loss: 5.4384
Batch 700, Loss: 5.1645
Batch 750, Loss: 5.1523
Batch 800, Loss: 5.5741
Batch 850, Loss: 5.2248
Batch 900, Loss: 5.0548
Batch 950, Loss: 4.8425
Batch 1000, Loss: 4.5918
Batch 1050, Loss: 4.5528
Batch 1100, Loss: 4.9230
Batch 1150, Loss: 5.3549
Batch 1200, Loss: 4.8522
Batch 1250, Loss: 5.0114
Batch 1300, Loss: 4.9219
Batch 1350, Loss: 5.0446
Batch 1400, Loss: 4.6027
Batch 1450, Loss: 5.1551
Batch 1500, Loss: 4.6188
Batch 1550, Loss: 4.6313
Batch 1600, Loss: 5.0622
Batch 1650,

In [None]:
def generate_response(prompt, max_length=20, temperature=1.0, verbose=True):
    """Sample a continuation of ``prompt`` from the notebook-level ``model``.

    Args:
        prompt: User text. Tokens are lowercased; tokens missing from the
            vocabulary are dropped.
        max_length: Maximum number of tokens to generate.
        temperature: Divisor applied to the logits before the softmax.
        verbose: When True (default), print the five most likely tokens at
            every step.

    Returns:
        The generated words joined by single spaces. Generation stops early
        when the pad token is sampled; the pad itself is not part of the text.
    """
    model.eval()
    pad_id = word_to_idx[Config.pad_token]
    unk_id = word_to_idx[Config.unk_token]
    # Same input length the model sees during training.
    window = Config.seq_length - 1

    def lookup(token_id, default):
        # idx_to_word has string keys after a JSON round-trip and integer keys
        # when it comes straight from build_vocab(); accept both.
        return idx_to_word.get(str(token_id), idx_to_word.get(token_id, default))

    tokens = [token.lower() for token in Config.tokenizer(prompt)]
    input_ids = [word_to_idx[token] for token in tokens if token in word_to_idx]
    if not input_ids:
        # Nothing usable in the prompt: seed with 'hello', or <unk> if even
        # that word is not in the vocabulary.
        input_ids = [word_to_idx.get('hello', unk_id)]
    input_ids = input_ids[-window:]

    generated_ids = []
    with torch.no_grad():
        for _ in range(max_length):
            # No right-padding: the logits at position -1 must belong to the
            # last real token, not to a pad position.
            input_tensor = torch.tensor([input_ids], dtype=torch.long).to(Config.device)
            logits = model(input_tensor)[0, -1, :]
            logits[unk_id] = -float('inf')  # Suppress <unk>
            logits = logits / temperature
            probs = torch.softmax(logits, dim=-1)

            if verbose:
                # Print top 5 predicted tokens
                top_probs, top_ids = torch.topk(probs, 5)
                print("\nTop 5 predicted tokens:")
                for prob, token_id in zip(top_probs.tolist(), top_ids.tolist()):
                    print(f"  {lookup(token_id, '<not found>')}: {prob:.4f}")

            next_token_id = torch.multinomial(probs, num_samples=1).item()
            if next_token_id == pad_id:
                break
            generated_ids.append(next_token_id)
            # Slide the context window over prompt + generated tokens.
            input_ids = (input_ids + [next_token_id])[-window:]

    return ' '.join(lookup(token_id, Config.unk_token) for token_id in generated_ids)

# Test fine-tuned model
# Sampling is random (torch.multinomial), so responses differ between runs.
test_prompts = [
    "Hello, how are you today?",
    "What is the internet?",
    "Tell me about yourself."
]
for prompt in test_prompts:
    print(f"\nPrompt: {prompt}")
    response = generate_response(prompt, max_length=20, temperature=1.0)
    print(f"Response: {response}")


Prompt: Hello, how are you today?

Top 5 predicted tokens:
  you: 0.0531
  it: 0.0514
  the: 0.0438
  a: 0.0420
  t: 0.0241

Top 5 predicted tokens:
  much: 0.2926
  to: 0.0627
  about: 0.0562
  many: 0.0411
  long: 0.0399

Top 5 predicted tokens:
  to: 0.0679
  a: 0.0583
  you: 0.0526
  have: 0.0298
  the: 0.0276

Top 5 predicted tokens:
  a: 0.0788
  the: 0.0355
  have: 0.0250
  to: 0.0233
  very: 0.0215

Top 5 predicted tokens:
  you: 0.0815
  i: 0.0680
  a: 0.0514
  the: 0.0379
  it: 0.0341

Top 5 predicted tokens:
  morning: 0.0463
  way: 0.0303
  afternoon: 0.0199
  i: 0.0172
  to: 0.0171

Top 5 predicted tokens:
  to: 0.0418
  time: 0.0301
  and: 0.0269
  year: 0.0214
  of: 0.0196

Top 5 predicted tokens:
  the: 0.0383
  to: 0.0222
  and: 0.0195
  you: 0.0144
  do: 0.0123

Top 5 predicted tokens:
  i: 0.2499
  we: 0.0401
  the: 0.0253
  you: 0.0242
  it: 0.0168

Top 5 predicted tokens:
  i: 0.1430
  you: 0.0490
  s: 0.0483
  sorry: 0.0259
  it: 0.0247

Top 5 predicted tokens:
 

## Load the model and generate responses

In [None]:
# Define Transformer Model
class MiniTransformer(nn.Module):
    """Small decoder-only language model built on ``nn.TransformerDecoder``.

    Args:
        vocab_size: Number of rows in the embedding table / output logits.
        d_model: Embedding and hidden width.
        n_heads: Attention heads per layer.
        num_layers: Number of decoder layers.
        max_seq_length: Number of learned positions (default 128, the value
            previously hard-coded here).

    Parameter names are unchanged (``embed``, ``pos_embed``, ``transformer``,
    ``fc``), so existing state dicts still load. Weights trained before the
    attention layout fix below should be retrained to benefit from it.
    """

    def __init__(self, vocab_size, d_model=256, n_heads=4, num_layers=2, max_seq_length=128):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model)
        self.pos_embed = nn.Parameter(torch.zeros(1, max_seq_length, d_model))
        # batch_first=True: inputs are (batch, seq, d_model). With the default
        # (False) the layers would treat the batch axis as the sequence axis
        # and attend across unrelated samples instead of across tokens.
        self.transformer = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model, n_heads, batch_first=True), num_layers
        )
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, input_ids):
        """Map ``input_ids`` of shape (batch, seq) to logits of shape (batch, seq, vocab_size)."""
        seq_len = input_ids.size(1)
        x = self.embed(input_ids) + self.pos_embed[:, :seq_len, :]
        # True above the diagonal = "may not attend": position i only sees
        # positions <= i. Applied to self-attention and to the cross-attention
        # over `x` (used as memory) so no future token leaks into a prediction.
        causal_mask = torch.triu(
            torch.ones(seq_len, seq_len, dtype=torch.bool, device=input_ids.device),
            diagonal=1,
        )
        x = self.transformer(x, x, tgt_mask=causal_mask, memory_mask=causal_mask)
        return self.fc(x)

def _read_json(json_path):
    """Decode one JSON file and return its content."""
    with open(json_path, 'r') as handle:
        return json.load(handle)

# Saved settings and vocabulary mappings from the config folder.
# (idx_to_word comes from JSON, so its keys are strings.)
config_dict = _read_json(f'{target_directory}/config/config.json')
word_to_idx = _read_json(f'{target_directory}/config/word_to_idx.json')
idx_to_word = _read_json(f'{target_directory}/config/idx_to_word.json')

# Rebuild the architecture, then restore the fine-tuned weights into it.
model = MiniTransformer(config_dict['vocab_size']).to(config_dict['device'])
state_dict = torch.load(
    f'{target_directory}/model_output/chatbot_finetuned.pt',
    map_location=config_dict['device'],
)
model.load_state_dict(state_dict)
model.eval()

print("✅ Model loaded")
print("Vocab size from config:", config_dict['vocab_size'])
print("Actual size of idx_to_word:", len(idx_to_word))

# --- Set Config class dynamically ---
class Config:
    """Settings rebuilt from the ``config_dict`` loaded out of config.json."""
    # Tokenizer callable (NLTK word_tokenize); not stored in config.json.
    tokenizer = word_tokenize
    seq_length = config_dict['seq_length']
    batch_size = config_dict['batch_size']
    learning_rate = config_dict['learning_rate']
    # Device string as it was saved at training time.
    device = config_dict['device']
    vocab_size = config_dict['vocab_size']
    # Fall back to the default special tokens when the file lacks them.
    pad_token = config_dict.get('pad_token', '<pad>')
    unk_token = config_dict.get('unk_token', '<unk>')

✅ Model loaded
Vocab size from config: 48293
Actual size of idx_to_word: 48293


In [None]:
# Text generation function
def generate_response(model, prompt, max_length=50, top_p=0.9, temperature=0.7, repetition_penalty=2.0):
    """Generate text that continues ``prompt`` using nucleus (top-p) sampling.

    Args:
        model: Language model mapping (batch, seq) ids to (batch, seq, vocab) logits.
        prompt: User text. Tokens are lowercased (as in training); unknown
            tokens map to the unk token.
        max_length: Maximum number of tokens to generate.
        top_p: Probability mass of the nucleus to sample from.
        temperature: Divisor applied to the logits; values <= 0 are clamped
            to a tiny positive number (near-greedy decoding).
        repetition_penalty: Factor (> 1 discourages) applied to the logits of
            every token already present in the prompt or the output.

    Returns:
        The prompt tokens followed by the generated tokens, joined by spaces,
        with pad tokens removed. Generation stops when the pad token is sampled.
    """
    model.eval()
    pad_id = int(word_to_idx[Config.pad_token])
    unk_id = int(word_to_idx[Config.unk_token])
    # Same input length the model sees during training.
    window = Config.seq_length - 1
    temperature = max(float(temperature), 1e-6)

    # Lowercase to match how ChatDataset encodes the training texts.
    generated = [
        int(word_to_idx.get(token.lower(), unk_id))
        for token in Config.tokenizer(prompt)
    ][-window:]
    if not generated:
        # Empty prompt: seed with a pad so the model has one position to read;
        # it is filtered out of the returned text below.
        generated = [pad_id]

    with torch.no_grad():
        for _ in range(max_length):
            # No right-padding: the logits at position -1 must belong to the
            # last real token, not to a pad position.
            input_ids = torch.tensor([generated[-window:]], dtype=torch.long, device=Config.device)
            logits = model(input_ids)[0, -1, :]

            # Repetition penalty. Dividing a negative logit would make the
            # token MORE likely, so negative logits are multiplied instead.
            seen = torch.tensor(sorted(set(generated)), dtype=torch.long, device=logits.device)
            seen_logits = logits[seen]
            logits[seen] = torch.where(
                seen_logits > 0,
                seen_logits / repetition_penalty,
                seen_logits * repetition_penalty,
            )

            # Temperature scaling
            logits = logits / temperature

            # Top-p (nucleus) sampling: keep the smallest prefix of the sorted
            # distribution whose mass reaches top_p, i.e. every token whose
            # preceding mass is still below top_p (the top token always stays).
            sorted_logits, sorted_indices = torch.sort(logits, descending=True)
            sorted_probs = torch.softmax(sorted_logits, dim=-1)
            mass_before = torch.cumsum(sorted_probs, dim=-1) - sorted_probs
            keep = mass_before < top_p
            keep[0] = True
            nucleus_probs = torch.softmax(sorted_logits[keep], dim=-1)

            choice = torch.multinomial(nucleus_probs, num_samples=1).item()
            next_token = sorted_indices[keep][choice].item()

            if next_token == pad_id:
                break
            generated.append(next_token)

    return " ".join(
        idx_to_word.get(str(idx), Config.unk_token)
        for idx in generated
        if idx != pad_id
    )

# Interactive chat: read a line, answer it, repeat until the user types "exit"
# (case-insensitive, surrounding whitespace ignored).
print("\n🟢 Chatbot ready! Type 'exit' to stop.\n")
while True:
    prompt = input("You: ")
    if prompt.strip().lower() != "exit":
        response = generate_response(model, prompt)
        print(f"Bot: {response}\n")
        continue
    print("👋 Exiting chat.")
    break


🟢 Chatbot ready! Type 'exit' to stop.

You: hi
Bot: hi the customer <unk> please very good <unk> like a lot of work than your job with me about any movies you can say i don too much were doing profit to go in that afraid it is anything but we have no kitchen month <unk> sorry s be so many

You: exit
👋 Exiting chat.
