# Importing Libraries

In [1]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from tqdm import tqdm
from typing import Optional, Tuple
import math 
from torch.utils.data import DataLoader

# Set up GPT config

In [2]:
class GPTConfig:
    def __init__(
        self,
        vocab_size: int = 13025,        
        d_model: int = 768,
        n_layers: int = 12,
        n_heads: int = 12,
        d_ff: Optional[int] = None,
        max_seq_len: int = 64,           
        dropout: float = 0.1,
        tie_word_embeddings: bool = True,
        use_bias_in_proj: bool = True,
    ):
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.d_ff = d_ff or 4 * d_model
        self.max_seq_len = max_seq_len
        self.dropout = dropout
        self.tie_word_embeddings = tie_word_embeddings
        self.use_bias_in_proj = use_bias_in_proj

# Building Transformer from scratch

In [3]:
def causal_mask(sz: int, device: torch.device) -> torch.Tensor:
    # Returns (1, 1, sz, sz) mask with -inf where future positions are
    mask = torch.triu(torch.ones((sz, sz), device=device), diagonal=1).bool()
    return mask  # shape (sz, sz) boolean mask

In [4]:
class MultiheadAttn(nn.Module):
  def __init__(self,cfg):
        super(MultiheadAttn,self).__init__()
        assert cfg.d_model % cfg.n_heads == 0, "d_model must be divisible by n_heads"
        self.n_heads = cfg.n_heads
        self.head_dim = cfg.d_model // cfg.n_heads
        self.scale = 1.0 / math.sqrt(self.head_dim)
        

        self.qkv_proj = nn.Linear(in_features=cfg.d_model,out_features=3 * cfg.d_model,bias=cfg.use_bias_in_proj)
        self.out_proj = nn.Linear(in_features=cfg.d_model,out_features=cfg.d_model)
        self.attn_dropout = nn.Dropout(cfg.dropout)
        self.proj_dropout = nn.Dropout(cfg.dropout)
  def forward(self,x : torch.Tensor, attn_mask:Optional[torch.Tensor] =None) -> torch.Tensor:
      batch_size, max_seq, d_model = x.size()
      qkv = self.qkv_proj(x)
      qkv = qkv.reshape(batch_size,max_seq,3,self.n_heads,self.head_dim)
      q,k,v = qkv.unbind(dim =2)
      
      q = q.permute(0,2,1,3)
      k = k.permute(0,2,1,3) 
      v = v.permute(0,2,1,3)
      # scaled dot-product 
      attn_scores = torch.matmul(q,k.transpose(-2,-1)) * self.scale
      if attn_mask is None:
          mask = causal_mask(max_seq,device=x.device)
          attn_scores = attn_scores.masked_fill(mask.unsqueeze(0).unsqueeze(0), float("-inf"))
      else :
          attn_scores = attn_scores.masked_fill(attn_mask, float("-inf"))
      attn_probs = F.softmax(attn_scores, dim=-1)
      attn_probs = self.attn_dropout(attn_probs)
      
      out = torch.matmul(attn_probs,v)
      out = out.permute(0, 2, 1, 3).contiguous().view(batch_size, max_seq, -1)
      out = self.out_proj(out)
      out = self.proj_dropout(out)
      return out 

In [5]:
cfg = GPTConfig()
model = MultiheadAttn(cfg)
x = torch.randn(2, 5, cfg.d_model)

print("Input shape:", x.shape)
out = model(x)
print("Output shape:", out.shape)

Input shape: torch.Size([2, 5, 768])
Output shape: torch.Size([2, 5, 768])


# **Building Multi-Layer Perceptron**

In [6]:
class Multi_Layer_Perceptron(nn.Module):
  def __init__(self,cfg):
    super(Multi_Layer_Perceptron,self).__init__()
    self.d_model = cfg.d_model
    self.d_ff = cfg.d_ff
    self.net = nn.Sequential(
        nn.Linear(self.d_model,self.d_ff),
        nn.GELU(),
        nn.Linear(self.d_ff,self.d_model)
    )
  def forward(self,x:torch.Tensor) -> torch.Tensor:
    return self.net(x)

In [7]:
cfg = GPTConfig()
model = Multi_Layer_Perceptron(cfg)
x = torch.randn(2, 5, cfg.d_model)

print("Input shape:", x.shape)
out = model(x)
print("Output shape:", out.shape)

Input shape: torch.Size([2, 5, 768])
Output shape: torch.Size([2, 5, 768])


# **Building Layer Normalization**

In [8]:
class LayerNormalization(nn.Module):
  def __init__(self,cfg,eps = 1e-5):
    super().__init__()
    self.eps = eps
    self.gamma = nn.Parameter(torch.ones(cfg.d_model))
    self.beta = nn.Parameter(torch.zeros(cfg.d_model))
  def forward(self,x:torch.Tensor):
    mean = x.mean(dim=-1,keepdim=True)
    var = x.var(dim=-1,keepdim=True,unbiased=False)
    x_norm = (x-mean) / torch.sqrt(var + self.eps)
    return self.gamma * x_norm + self.beta


In [9]:
cfg = GPTConfig()
model = LayerNormalization(cfg)
x = torch.randn(2, 5, cfg.d_model)

print("Input shape:", x.shape)
out = model(x)
print("Output shape:", out.shape)

Input shape: torch.Size([2, 5, 768])
Output shape: torch.Size([2, 5, 768])


# **Building Transformer Decoder Block**

In [10]:
class TransformerDecoderBlock(nn.Module):
    def __init__(self, cfg: GPTConfig):
        super().__init__()
        self.norm1 = LayerNormalization(cfg)
        self.MultiheadAttn = MultiheadAttn(cfg)
        self.norm2 = LayerNormalization(cfg)
        self.ff = Multi_Layer_Perceptron(cfg)
        self.dropout1 = nn.Dropout(cfg.dropout)
        self.dropout2 = nn.Dropout(cfg.dropout)

    def forward(self, x, attn_mask=None): 
        norm1 = self.norm1(x)
        attn = self.MultiheadAttn(norm1, attn_mask=attn_mask)
        attn = self.dropout1(attn)
        x = self.norm2(attn + x)
        ff = self.ff(x)
        ff = self.dropout2(ff)
        return ff + x

In [11]:
cfg = GPTConfig()
model = TransformerDecoderBlock(cfg)
x = torch.randn(2, 5, cfg.d_model)

print("Input shape:", x.shape)
out = model(x)
print("Output shape:", out.shape)

Input shape: torch.Size([2, 5, 768])
Output shape: torch.Size([2, 5, 768])


# **Building GPT Model**

- Iteratively predict and sample the next token: Runs the model on the current sequence, takes the last-step logits, applies temperature scaling and optional top-k filtering, then samples one token from the probability distribution.
- Append the new token and repeat until done: Adds the token to the sequence, truncates if too long, and stops early if the EOS token is generated or the maximum new token count is reached.


In [12]:
class GPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()  # Fixed: Use modern super() syntax
        self.cfg = cfg  # Store config for later use

        # Token and positional embeddings
        self.token_emb = nn.Embedding(cfg.vocab_size, cfg.d_model)
        self.pos_emb = nn.Parameter(torch.zeros(1, cfg.max_seq_len, cfg.d_model))

        # Transformer blocks
        self.blocks = nn.ModuleList([TransformerDecoderBlock(cfg) for _ in range(cfg.n_layers)])

        # Final layer norm and head
        self.ln_f = LayerNormalization(cfg)  # Fixed: Renamed from norm1 to ln_f for clarity
        self.head = nn.Linear(cfg.d_model, cfg.vocab_size, bias=False)

        # Weight tying (optional)
        if cfg.tie_word_embeddings:
            self.head.weight = self.token_emb.weight

        self._init_weights()

    def _init_weights(self):
        # Initialize weights
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
        # Special initialization for positional embeddings
        nn.init.normal_(self.pos_emb, mean=0.0, std=0.02)

    def forward(
        self,
        input_ids: torch.LongTensor,
        attention_mask: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        batch_size, seq_len = input_ids.size()

        # Embeddings
        tok_emb = self.token_emb(input_ids)  # (B, T, d_model)
        pos_emb = self.pos_emb[:, :seq_len, :]  # (1, T, d_model)  # Fixed: Use seq_len instead of undefined T
        x = tok_emb + pos_emb  # (B, T, d_model)

        # Transformer blocks
        for block in self.blocks:
            x = block(x, attn_mask=attention_mask)

        # Final layer norm and projection
        x = self.ln_f(x)
        logits = self.head(x)  # (B, T, vocab_size)
        return logits

    @torch.no_grad()
    def generate(
        self,
        input_ids: torch.LongTensor,
        max_new_tokens: int = 50,
        temperature: float = 1.0,
        top_k: Optional[int] = None,
        eos_token_id: Optional[int] = None,
    ) -> torch.LongTensor:
        """
        Autoregressive text generation with optional:
        - Temperature scaling
        - Top-k filtering
        - Early stopping via EOS token
        """
        self.eval()
        generated = input_ids.clone()

        for _ in range(max_new_tokens):
            # Truncate if sequence grows too long
            if generated.size(1) > self.cfg.max_seq_len:
                generated = generated[:, -self.cfg.max_seq_len:]

            # Forward pass
            logits = self(generated)  # (B, T, V)
            next_logits = logits[:, -1, :] / max(temperature, 1e-8)  # (B, V)

            # Top-k filtering
            if top_k is not None:
                topk_vals, topk_idx = torch.topk(next_logits, min(top_k, next_logits.size(-1)), dim=-1)
                next_logits[next_logits < topk_vals[:, -1].unsqueeze(-1)] = float('-inf')

            # Sample next token
            probs = F.softmax(next_logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)  # (B, 1)
            generated = torch.cat([generated, next_token], dim=1)

            # Early stopping
            if eos_token_id is not None and (next_token == eos_token_id).all():
                break

        return generated

In [13]:
cfg = GPTConfig()
model = GPTModel(cfg)

input_ids = torch.randint(0, cfg.vocab_size, (2, 5))
print("Input IDs:", input_ids)

logits = model(input_ids)
print("Logits shape:", logits.shape)

generated = model.generate(input_ids, max_new_tokens=5, top_k=10)
print("Generated IDs:", generated)


Input IDs: tensor([[12458,  3286,  1744,  8664,  3337],
        [ 3875,  7280,  7907,  9068,  1165]])
Logits shape: torch.Size([2, 5, 13025])
Generated IDs: tensor([[12458,  3286,  1744,  8664,  3337, 10896,  8384,  1035,  8384,  1035],
        [ 3875,  7280,  7907,  9068,  1165,  7481, 12288,  7717,  1204,  6904]])


# Data Preprocesing

In [14]:
# import kagglehub

# # Download latest version
# path = kagglehub.dataset_download("noahpersaud/89k-chatgpt-conversations")

# print("Path to dataset files:", path)

Structuring Dataset into X and y, when X is "Hello, Good morning", y is Good Morning Chatgpt", and so on. Thus, making a model to predict and generate text.

In [15]:
# import json
# import pandas as pd
# import os

# INPUT_FILE = "chatlogs.jsonl"  
# SEQUENCE_LENGTH = 5               
# OUTPUT_FILE = "word_level_dataset.csv"
# MAX_PAIRS = 100000  # limit to 100k pairs

# def load_json_file(filename):
#     with open(filename, "r", encoding="utf-8") as f:
#         try:
#             data = json.load(f)  # try load as single JSON object/array
#             if isinstance(data, dict):
#                 data = [data]
#         except json.JSONDecodeError:
#             # fallback: try JSON Lines format
#             f.seek(0)
#             data = [json.loads(line) for line in f if line.strip()]
#     return data

# def extract_all_texts(data):
#     texts = []
#     for item in data:
#         for turn in item.get("conversation", []):
#             msg = turn.get("message", "").strip()
#             if msg:
#                 texts.append(msg)
#     return texts

# def create_word_pairs(words, seq_length):
#     X, Y = [], []
#     for i in range(len(words) - seq_length):
#         x_seq = words[i : i + seq_length - 1]  # first n-1 words
#         y_seq = words[i + 1 : i + seq_length]  # shifted sequence
#         X.append(" ".join(x_seq))
#         Y.append(" ".join(y_seq))
#     return X, Y

In [16]:
# def main():
#     if not os.path.exists(INPUT_FILE):
#         print(f"Error: {INPUT_FILE} not found.")
#         return

#     print(f"Loading dataset from {INPUT_FILE}...")
#     data = load_json_file(INPUT_FILE)

#     print("Extracting messages...")
#     all_texts = extract_all_texts(data)
#     print(f"Total messages: {len(all_texts)}")

#     print("Tokenizing...")
#     words = " ".join(all_texts).split()

#     print(f"Creating (X, Y) pairs with sequence length {SEQUENCE_LENGTH}...")
#     X, Y = create_word_pairs(words, SEQUENCE_LENGTH)

#     # Limit to MAX_PAIRS for easier training
#     if len(X) > MAX_PAIRS:
#         X = X[:MAX_PAIRS]
#         Y = Y[:MAX_PAIRS]

#     print(f"Total training pairs after truncation: {len(X)}")
#     df = pd.DataFrame({"input": X, "target": Y})
#     df.to_csv(OUTPUT_FILE, index=False)

#     print(f"Saved dataset to {OUTPUT_FILE}")

# if __name__ == "__main__":
#     main()


# Making vocabulary encoder and decoder  & Creating Dataset

In [17]:
import json
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from collections import Counter
from sklearn.model_selection import train_test_split

# ==== CONFIG ====
INPUT_CSV = "/kaggle/input/processing-dataset/word_level_dataset.csv"      # Your raw CSV input file
VOCAB_PATH = "/kaggle/working/vocab.json"                  # Where to save vocab
INV_VOCAB_PATH = "/kaggle/working/inv_vocab.json"          # Where to save inverse vocab
ENCODED_CSV = "/kaggle/working/encoded_dataset.csv"        # Where to save encoded dataset
TRAIN_CSV = "/kaggle/working/train_encoded.csv"            # Train split CSV
VAL_CSV = "/kaggle/working/val_encoded.csv"                # Validation split CSV

SPECIAL_TOKENS = ["<PAD>", "<BOS>", "<EOS>", "<UNK>"]
MAX_SEQ_LEN = 5                            # Max sequence length for dataset padding
BATCH_SIZE = 200                            # Batch size for DataLoader
VAL_RATIO = 0.1                           # 10% validation split
# =================

def build_vocab(csv_path):
    df = pd.read_csv(csv_path)
    all_words = []
    for col in ["input", "target"]:
        for seq in df[col]:
            all_words.extend(seq.split())

    vocab = {token: idx for idx, token in enumerate(SPECIAL_TOKENS)}
    for word in set(all_words):
        if word not in vocab:
            vocab[word] = len(vocab)
    return vocab

def save_vocab(vocab, vocab_path, inv_vocab_path):
    with open(vocab_path, "w", encoding="utf-8") as f:
        json.dump(vocab, f, ensure_ascii=False, indent=2)
    inv_vocab = {str(idx): word for word, idx in vocab.items()}
    with open(inv_vocab_path, "w", encoding="utf-8") as f:
        json.dump(inv_vocab, f, ensure_ascii=False, indent=2)

def encode_sequence(seq, vocab):
    return [vocab["<BOS>"]] + [vocab.get(word, vocab["<UNK>"]) for word in seq.split()] + [vocab["<EOS>"]]

def encode_dataset(input_csv, output_csv, vocab):
    df = pd.read_csv(input_csv)
    df["input_ids"] = df["input"].apply(lambda x: encode_sequence(x, vocab))
    df["target_ids"] = df["target"].apply(lambda x: encode_sequence(x, vocab))
    df.to_csv(output_csv, index=False)

def split_dataset(encoded_csv_path, train_csv_path, val_csv_path, val_ratio=0.1, random_state=42):
    df = pd.read_csv(encoded_csv_path)
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)  # shuffle

    train_df, val_df = train_test_split(df, test_size=val_ratio, random_state=random_state)
    train_df.to_csv(train_csv_path, index=False)
    val_df.to_csv(val_csv_path, index=False)

    print(f"Train size: {len(train_df)}, Validation size: {len(val_df)}")

class GPTDataset(Dataset):
    def __init__(self, csv_file, vocab, max_seq_len):
        self.vocab = vocab
        self.max_seq_len = max_seq_len
        self.df = pd.read_csv(csv_file)
        self.inputs = self.df["input_ids"].apply(self._parse_ids).tolist()
        self.targets = self.df["target_ids"].apply(self._parse_ids).tolist()
        self.inputs = [self._pad_or_truncate(seq) for seq in self.inputs]
        self.targets = [self._pad_or_truncate(seq) for seq in self.targets]

    def _parse_ids(self, s):
        if isinstance(s, str) and s.startswith("["):
            return json.loads(s.replace("'", '"'))
        return []

    def _pad_or_truncate(self, seq):
        if len(seq) > self.max_seq_len:
            return seq[:self.max_seq_len]
        else:
            pad_len = self.max_seq_len - len(seq)
            return seq + [self.vocab["<PAD>"]] * pad_len

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        input_ids = torch.tensor(self.inputs[idx], dtype=torch.long)
        target_ids = torch.tensor(self.targets[idx], dtype=torch.long)
        return input_ids, target_ids

# Training Dataset

Using data pararel -> to speed up our training process , becuase we have to t4 *2 gpu

In [18]:
import torch
import torch.nn.functional as F
from tqdm import tqdm
from torch.optim import AdamW

def decode_batch(batch_ids, inv_vocab):
    decoded = []
    for seq in batch_ids.cpu().numpy():
        words = []
        for idx in seq:
            word = inv_vocab.get(str(idx), "<UNK>")
            if word == "<EOS>":
                break
            if word not in ["<PAD>", "<BOS>"]:
                words.append(word)
        # Add this to handle empty outputs
        if not words:
            words = ["[EMPTY]"]
        decoded.append(" ".join(words))
    return decoded

def train_epoch(model, dataloader, optimizer, device, pad_token_id):
    model.train()
    total_loss = 0
    for input_ids, target_ids in tqdm(dataloader):
        input_ids = input_ids.to(device)
        target_ids = target_ids.to(device)

        optimizer.zero_grad()
        logits = model(input_ids)  # (B, seq_len, vocab_size)

        logits_flat = logits.view(-1, logits.size(-1))       # (B*seq_len, vocab_size)
        targets_flat = target_ids.view(-1)                   # (B*seq_len)

        loss = F.cross_entropy(logits_flat, targets_flat, ignore_index=pad_token_id)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate(model, dataloader, device, pad_token_id):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for input_ids, target_ids in dataloader:
            input_ids = input_ids.to(device)
            target_ids = target_ids.to(device)

            logits = model(input_ids)
            logits_flat = logits.view(-1, logits.size(-1))
            targets_flat = target_ids.view(-1)

            loss = F.cross_entropy(logits_flat, targets_flat, ignore_index=pad_token_id)
            total_loss += loss.item()
    return total_loss / len(dataloader)

def train(model, train_loader, val_loader, inv_vocab, epochs=10, lr=4e-4, save_path="gpt_model.pth", pad_token_id=0, print_samples=3):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Wrap model with DataParallel if multiple GPUs available
    if torch.cuda.device_count() > 1:
        print(f"Using {torch.cuda.device_count()} GPUs for training")
        model = torch.nn.DataParallel(model)
    
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=lr)

    for epoch in range(epochs):
        print(f"Epoch {epoch+1}/{epochs}")
        train_loss = train_epoch(model, train_loader, optimizer, device, pad_token_id)
        val_loss = evaluate(model, val_loader, device, pad_token_id)
        print(f"Train loss: {train_loss:.4f}, Val loss: {val_loss:.4f}")

        # Print sample input, expected, and predicted outputs
        model.eval()
        with torch.no_grad():
            for input_ids, target_ids in val_loader:
                input_ids = input_ids.to(device)
                logits = model(input_ids)  # (B, seq_len, vocab_size)
                preds = logits.argmax(dim=-1)  # (B, seq_len)

                inputs_text = decode_batch(input_ids, inv_vocab)
                targets_text = decode_batch(target_ids, inv_vocab)
                preds_text = decode_batch(preds, inv_vocab)
                

                print("\nSamples:")
                for i in range(min(print_samples, len(inputs_text))):
                    print(f"Input    : {inputs_text[i]}")
                    print(f"Expected : {targets_text[i]}")
                    print(f"Predicted: {preds_text[i]}")
                    print("---")
                break  # only print from first batch

        # Save model checkpoint (note DataParallel model.state_dict() contains "module." prefix)
        torch.save(model.state_dict(), "Model.pth")
        print(f"Model saved to {save_path}_epoch{epoch+1}.pth\n")


In [19]:
print("Building vocabulary...")
vocab = build_vocab(INPUT_CSV)
save_vocab(vocab, VOCAB_PATH, INV_VOCAB_PATH)
print(f"Vocab saved. Size: {len(vocab)}")

print("Encoding dataset...")
encode_dataset(INPUT_CSV, ENCODED_CSV, vocab)
print(f"Encoded dataset saved to {ENCODED_CSV}")

print("Splitting dataset into train and validation...")
split_dataset(ENCODED_CSV, TRAIN_CSV, VAL_CSV, val_ratio=VAL_RATIO)

print("Loading train dataset...")
train_dataset = GPTDataset(TRAIN_CSV, vocab, MAX_SEQ_LEN)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

print("Loading validation dataset...")
val_dataset = GPTDataset(VAL_CSV, vocab, MAX_SEQ_LEN)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)



Building vocabulary...
Vocab saved. Size: 13025
Encoding dataset...
Encoded dataset saved to /kaggle/working/encoded_dataset.csv
Splitting dataset into train and validation...
Train size: 90000, Validation size: 10000
Loading train dataset...
Loading validation dataset...


In [20]:
cfg = GPTConfig(
    vocab_size=len(vocab),  # Make sure this matches your actual vocab size
    d_model=512,            # Reduced from 768 for faster training
    n_layers=6,             # Reduced from 12
    n_heads=8,              # Reduced from 12
    max_seq_len=MAX_SEQ_LEN, # Now using 64
    dropout=0.1
)
model = GPTModel(cfg)
dataset = GPTDataset(ENCODED_CSV, vocab, MAX_SEQ_LEN)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

vocab = build_vocab(INPUT_CSV)
inv_vocab = {str(idx): word for word, idx in vocab.items()}
train(
    model,
    train_loader,
    val_loader,
    inv_vocab,
    epochs=100,              # Start with fewer epochs
    lr=3e-4,               # Slightly lower learning rate
    print_samples=3,       # Print more samples
    pad_token_id=vocab["<PAD>"]
)

Using 2 GPUs for training
Epoch 1/100


100%|██████████| 450/450 [00:28<00:00, 15.71it/s]


Train loss: 5.3783, Val loss: 4.6436

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: and can be the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the the to and
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: and the of and
---
Model saved to gpt_model.pth_epoch1.pth

Epoch 2/100


100%|██████████| 450/450 [00:27<00:00, 16.29it/s]


Train loss: 4.3100, Val loss: 3.9940

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: I can be the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the the the with
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with the statement: I
---
Model saved to gpt_model.pth_epoch2.pth

Epoch 3/100


100%|██████████| 450/450 [00:27<00:00, 16.37it/s]


Train loss: 3.7587, Val loss: 3.5635

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: if can use the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: create the human-like the
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with the aspects I
---
Model saved to gpt_model.pth_epoch3.pth

Epoch 4/100


100%|██████████| 450/450 [00:27<00:00, 16.19it/s]


Train loss: 3.3689, Val loss: 3.2406

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: if can use the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the the than which
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with the machines I
---
Model saved to gpt_model.pth_epoch4.pth

Epoch 5/100


100%|██████████| 450/450 [00:27<00:00, 16.31it/s]


Train loss: 3.0441, Val loss: 2.9512

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: even can use the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the the investments you
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with the years, I
---
Model saved to gpt_model.pth_epoch5.pth

Epoch 6/100


100%|██████████| 450/450 [00:27<00:00, 16.36it/s]


Train loss: 2.7583, Val loss: 2.7269

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: even can use the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the the of it
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with low techniques. I
---
Model saved to gpt_model.pth_epoch6.pth

Epoch 7/100


100%|██████████| 450/450 [00:27<00:00, 16.21it/s]


Train loss: 2.5164, Val loss: 2.5205

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: a can use the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the the of and
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with the techniques, The
---
Model saved to gpt_model.pth_epoch7.pth

Epoch 8/100


100%|██████████| 450/450 [00:27<00:00, 16.40it/s]


Train loss: 2.3117, Val loss: 2.3658

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: you can use the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the Salina of the
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with any techniques, Once
---
Model saved to gpt_model.pth_epoch8.pth

Epoch 9/100


100%|██████████| 450/450 [00:27<00:00, 16.38it/s]


Train loss: 2.1301, Val loss: 2.2203

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: even can use the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all tables which
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques, I
---
Model saved to gpt_model.pth_epoch9.pth

Epoch 10/100


100%|██████████| 450/450 [00:27<00:00, 16.32it/s]


Train loss: 1.9876, Val loss: 2.1162

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: = can use the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all answers which
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques, I
---
Model saved to gpt_model.pth_epoch10.pth

Epoch 11/100


100%|██████████| 450/450 [00:27<00:00, 16.20it/s]


Train loss: 1.8672, Val loss: 2.0277

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: even can use the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all the which
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques. I
---
Model saved to gpt_model.pth_epoch11.pth

Epoch 12/100


100%|██████████| 450/450 [00:27<00:00, 16.31it/s]


Train loss: 1.7717, Val loss: 1.9564

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: = can use the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the Supreem's the including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new sounds I
---
Model saved to gpt_model.pth_epoch12.pth

Epoch 13/100


100%|██████████| 450/450 [00:27<00:00, 16.27it/s]


Train loss: 1.6847, Val loss: 1.8877

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: even can use the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all ads which
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new technologies I
---
Model saved to gpt_model.pth_epoch13.pth

Epoch 14/100


100%|██████████| 450/450 [00:27<00:00, 16.10it/s]


Train loss: 1.6182, Val loss: 1.8380

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: even can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: be all the as
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques. I
---
Model saved to gpt_model.pth_epoch14.pth

Epoch 15/100


100%|██████████| 450/450 [00:27<00:00, 16.15it/s]


Train loss: 1.5572, Val loss: 1.7939

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: make can easily the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the inactive the which
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques. I
---
Model saved to gpt_model.pth_epoch15.pth

Epoch 16/100


100%|██████████| 450/450 [00:27<00:00, 16.31it/s]


Train loss: 1.5084, Val loss: 1.7574

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: = can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all the or
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques. I
---
Model saved to gpt_model.pth_epoch16.pth

Epoch 17/100


100%|██████████| 450/450 [00:27<00:00, 16.24it/s]


Train loss: 1.4656, Val loss: 1.7233

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: even can use the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: be all ads including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new sounds I
---
Model saved to gpt_model.pth_epoch17.pth

Epoch 18/100


100%|██████████| 450/450 [00:27<00:00, 16.30it/s]


Train loss: 1.4295, Val loss: 1.6984

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: = can use the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all the including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new sounds I
---
Model saved to gpt_model.pth_epoch18.pth

Epoch 19/100


100%|██████████| 450/450 [00:27<00:00, 16.31it/s]


Train loss: 1.3951, Val loss: 1.6742

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: = can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all audio you
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques. I
---
Model saved to gpt_model.pth_epoch19.pth

Epoch 20/100


100%|██████████| 450/450 [00:27<00:00, 16.38it/s]


Train loss: 1.3652, Val loss: 1.6531

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: if can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all audio including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques. I
---
Model saved to gpt_model.pth_epoch20.pth

Epoch 21/100


100%|██████████| 450/450 [00:27<00:00, 16.23it/s]


Train loss: 1.3407, Val loss: 1.6389

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: make can use the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all answers including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques. I
---
Model saved to gpt_model.pth_epoch21.pth

Epoch 22/100


100%|██████████| 450/450 [00:27<00:00, 16.32it/s]


Train loss: 1.3152, Val loss: 1.6221

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: even can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all answers you
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques. I
---
Model saved to gpt_model.pth_epoch22.pth

Epoch 23/100


100%|██████████| 450/450 [00:28<00:00, 16.05it/s]


Train loss: 1.2934, Val loss: 1.5947

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: if can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new sounds I
---
Model saved to gpt_model.pth_epoch23.pth

Epoch 24/100


100%|██████████| 450/450 [00:27<00:00, 16.18it/s]


Train loss: 1.2746, Val loss: 1.5852

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: = can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all ads including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques. I
---
Model saved to gpt_model.pth_epoch24.pth

Epoch 25/100


100%|██████████| 450/450 [00:27<00:00, 16.36it/s]


Train loss: 1.2550, Val loss: 1.5761

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: even can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the Supreem's people, you
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques. I
---
Model saved to gpt_model.pth_epoch25.pth

Epoch 26/100


100%|██████████| 450/450 [00:27<00:00, 16.34it/s]


Train loss: 1.2372, Val loss: 1.5625

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: e can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the Supreem's answers including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques. I
---
Model saved to gpt_model.pth_epoch26.pth

Epoch 27/100


100%|██████████| 450/450 [00:27<00:00, 16.33it/s]


Train loss: 1.2260, Val loss: 1.5554

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: = can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all answers including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques, I
---
Model saved to gpt_model.pth_epoch27.pth

Epoch 28/100


100%|██████████| 450/450 [00:27<00:00, 16.25it/s]


Train loss: 1.2119, Val loss: 1.5468

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: even can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all ads including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques. I
---
Model saved to gpt_model.pth_epoch28.pth

Epoch 29/100


100%|██████████| 450/450 [00:27<00:00, 16.32it/s]


Train loss: 1.1996, Val loss: 1.5410

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: even can use the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the Supreem's people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques. I
---
Model saved to gpt_model.pth_epoch29.pth

Epoch 30/100


100%|██████████| 450/450 [00:27<00:00, 16.38it/s]


Train loss: 1.1884, Val loss: 1.5378

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: even can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all answers including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques. I
---
Model saved to gpt_model.pth_epoch30.pth

Epoch 31/100


100%|██████████| 450/450 [00:27<00:00, 16.29it/s]


Train loss: 1.1766, Val loss: 1.5345

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: even can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques, I
---
Model saved to gpt_model.pth_epoch31.pth

Epoch 32/100


100%|██████████| 450/450 [00:27<00:00, 16.31it/s]


Train loss: 1.1694, Val loss: 1.5292

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: even can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new sounds I
---
Model saved to gpt_model.pth_epoch32.pth

Epoch 33/100


100%|██████████| 450/450 [00:27<00:00, 16.28it/s]


Train loss: 1.1592, Val loss: 1.5273

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: = can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, you
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new sounds I
---
Model saved to gpt_model.pth_epoch33.pth

Epoch 34/100


100%|██████████| 450/450 [00:27<00:00, 16.09it/s]


Train loss: 1.1505, Val loss: 1.5229

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: = can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques. I
---
Model saved to gpt_model.pth_epoch34.pth

Epoch 35/100


100%|██████████| 450/450 [00:27<00:00, 16.18it/s]


Train loss: 1.1429, Val loss: 1.5110

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: a can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques. I
---
Model saved to gpt_model.pth_epoch35.pth

Epoch 36/100


100%|██████████| 450/450 [00:27<00:00, 16.26it/s]


Train loss: 1.1365, Val loss: 1.5132

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: make can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques. I
---
Model saved to gpt_model.pth_epoch36.pth

Epoch 37/100


100%|██████████| 450/450 [00:27<00:00, 16.19it/s]


Train loss: 1.1306, Val loss: 1.5056

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: = can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques, I
---
Model saved to gpt_model.pth_epoch37.pth

Epoch 38/100


100%|██████████| 450/450 [00:27<00:00, 16.30it/s]


Train loss: 1.1241, Val loss: 1.5077

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: you can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques. I
---
Model saved to gpt_model.pth_epoch38.pth

Epoch 39/100


100%|██████████| 450/450 [00:27<00:00, 16.27it/s]


Train loss: 1.1172, Val loss: 1.5110

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: even can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all answers including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques, I
---
Model saved to gpt_model.pth_epoch39.pth

Epoch 40/100


100%|██████████| 450/450 [00:27<00:00, 16.16it/s]


Train loss: 1.1102, Val loss: 1.5157

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: with can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques. I
---
Model saved to gpt_model.pth_epoch40.pth

Epoch 41/100


100%|██████████| 450/450 [00:27<00:00, 16.25it/s]


Train loss: 1.1063, Val loss: 1.5086

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: even can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new sounds I
---
Model saved to gpt_model.pth_epoch41.pth

Epoch 42/100


100%|██████████| 450/450 [00:27<00:00, 16.35it/s]


Train loss: 1.1007, Val loss: 1.5106

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: even can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new sounds I
---
Model saved to gpt_model.pth_epoch42.pth

Epoch 43/100


100%|██████████| 450/450 [00:27<00:00, 16.17it/s]


Train loss: 1.0931, Val loss: 1.5090

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: = can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques. I
---
Model saved to gpt_model.pth_epoch43.pth

Epoch 44/100


100%|██████████| 450/450 [00:27<00:00, 16.27it/s]


Train loss: 1.0907, Val loss: 1.5099

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: even can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all answers including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques, I
---
Model saved to gpt_model.pth_epoch44.pth

Epoch 45/100


100%|██████████| 450/450 [00:27<00:00, 16.28it/s]


Train loss: 1.0853, Val loss: 1.5070

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: = can use the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques. I
---
Model saved to gpt_model.pth_epoch45.pth

Epoch 46/100


100%|██████████| 450/450 [00:27<00:00, 16.26it/s]


Train loss: 1.0819, Val loss: 1.5122

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: even can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all answers including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques. I
---
Model saved to gpt_model.pth_epoch46.pth

Epoch 47/100


100%|██████████| 450/450 [00:27<00:00, 16.12it/s]


Train loss: 1.0799, Val loss: 1.4996

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: = can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all ads including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques, I
---
Model saved to gpt_model.pth_epoch47.pth

Epoch 48/100


100%|██████████| 450/450 [00:27<00:00, 16.31it/s]


Train loss: 1.0736, Val loss: 1.5019

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: = can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques. I
---
Model saved to gpt_model.pth_epoch48.pth

Epoch 49/100


100%|██████████| 450/450 [00:27<00:00, 16.28it/s]


Train loss: 1.0697, Val loss: 1.5032

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: = can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques, I
---
Model saved to gpt_model.pth_epoch49.pth

Epoch 50/100


100%|██████████| 450/450 [00:27<00:00, 16.17it/s]


Train loss: 1.0661, Val loss: 1.5034

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: you can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all ads including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques, I
---
Model saved to gpt_model.pth_epoch50.pth

Epoch 51/100


100%|██████████| 450/450 [00:27<00:00, 16.23it/s]


Train loss: 1.0644, Val loss: 1.4885

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: even can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques, I
---
Model saved to gpt_model.pth_epoch51.pth

Epoch 52/100


100%|██████████| 450/450 [00:27<00:00, 16.26it/s]


Train loss: 1.0603, Val loss: 1.4979

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: even can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new sounds I
---
Model saved to gpt_model.pth_epoch52.pth

Epoch 53/100


100%|██████████| 450/450 [00:27<00:00, 16.36it/s]


Train loss: 1.0564, Val loss: 1.4988

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: even can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new sounds I
---
Model saved to gpt_model.pth_epoch53.pth

Epoch 54/100


100%|██████████| 450/450 [00:27<00:00, 16.25it/s]


Train loss: 1.0543, Val loss: 1.4939

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: at can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new sounds I
---
Model saved to gpt_model.pth_epoch54.pth

Epoch 55/100


100%|██████████| 450/450 [00:27<00:00, 16.26it/s]


Train loss: 1.0511, Val loss: 1.4925

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: = can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the Supreem's people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques, I
---
Model saved to gpt_model.pth_epoch55.pth

Epoch 56/100


100%|██████████| 450/450 [00:27<00:00, 16.28it/s]


Train loss: 1.0486, Val loss: 1.5007

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: = can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques. I
---
Model saved to gpt_model.pth_epoch56.pth

Epoch 57/100


100%|██████████| 450/450 [00:27<00:00, 16.15it/s]


Train loss: 1.0458, Val loss: 1.4950

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: even can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all ads including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques, I
---
Model saved to gpt_model.pth_epoch57.pth

Epoch 58/100


100%|██████████| 450/450 [00:27<00:00, 16.29it/s]


Train loss: 1.0424, Val loss: 1.4948

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: even can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the Supreem's ads including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new sounds I
---
Model saved to gpt_model.pth_epoch58.pth

Epoch 59/100


100%|██████████| 450/450 [00:27<00:00, 16.29it/s]


Train loss: 1.0398, Val loss: 1.4981

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: even can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all answers including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new sounds I
---
Model saved to gpt_model.pth_epoch59.pth

Epoch 60/100


100%|██████████| 450/450 [00:27<00:00, 16.18it/s]


Train loss: 1.0371, Val loss: 1.4966

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: even can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the or people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques. I
---
Model saved to gpt_model.pth_epoch60.pth

Epoch 61/100


100%|██████████| 450/450 [00:27<00:00, 16.31it/s]


Train loss: 1.0348, Val loss: 1.5057

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: you can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the or ads including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques, I
---
Model saved to gpt_model.pth_epoch61.pth

Epoch 62/100


100%|██████████| 450/450 [00:27<00:00, 16.34it/s]


Train loss: 1.0337, Val loss: 1.4927

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: even can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all wells including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques. I
---
Model saved to gpt_model.pth_epoch62.pth

Epoch 63/100


100%|██████████| 450/450 [00:27<00:00, 16.21it/s]


Train loss: 1.0298, Val loss: 1.4948

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: = can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques. I
---
Model saved to gpt_model.pth_epoch63.pth

Epoch 64/100


100%|██████████| 450/450 [00:27<00:00, 16.26it/s]


Train loss: 1.0284, Val loss: 1.4972

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: you can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new sounds I
---
Model saved to gpt_model.pth_epoch64.pth

Epoch 65/100


100%|██████████| 450/450 [00:27<00:00, 16.41it/s]


Train loss: 1.0269, Val loss: 1.4831

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: = can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new sounds I
---
Model saved to gpt_model.pth_epoch65.pth

Epoch 66/100


100%|██████████| 450/450 [00:27<00:00, 16.33it/s]


Train loss: 1.0246, Val loss: 1.4907

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: even can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques. I
---
Model saved to gpt_model.pth_epoch66.pth

Epoch 67/100


100%|██████████| 450/450 [00:27<00:00, 16.22it/s]


Train loss: 1.0228, Val loss: 1.4967

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: even can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all answers including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques, I
---
Model saved to gpt_model.pth_epoch67.pth

Epoch 68/100


100%|██████████| 450/450 [00:27<00:00, 16.35it/s]


Train loss: 1.0200, Val loss: 1.5063

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: = can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques. I
---
Model saved to gpt_model.pth_epoch68.pth

Epoch 69/100


100%|██████████| 450/450 [00:27<00:00, 16.34it/s]


Train loss: 1.0182, Val loss: 1.4994

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: = can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new sounds I
---
Model saved to gpt_model.pth_epoch69.pth

Epoch 70/100


100%|██████████| 450/450 [00:27<00:00, 16.22it/s]


Train loss: 1.0163, Val loss: 1.4930

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: in can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all answers including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques. I
---
Model saved to gpt_model.pth_epoch70.pth

Epoch 71/100


100%|██████████| 450/450 [00:27<00:00, 16.32it/s]


Train loss: 1.0150, Val loss: 1.4958

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: = can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the ad's people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new sounds I
---
Model saved to gpt_model.pth_epoch71.pth

Epoch 72/100


100%|██████████| 450/450 [00:27<00:00, 16.33it/s]


Train loss: 1.0130, Val loss: 1.4973

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: even can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the or people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques, I
---
Model saved to gpt_model.pth_epoch72.pth

Epoch 73/100


100%|██████████| 450/450 [00:28<00:00, 15.94it/s]


Train loss: 1.0147, Val loss: 1.4938

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: = can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the or people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques. I
---
Model saved to gpt_model.pth_epoch73.pth

Epoch 74/100


100%|██████████| 450/450 [00:28<00:00, 15.71it/s]


Train loss: 1.0092, Val loss: 1.5077

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: = can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques. I
---
Model saved to gpt_model.pth_epoch74.pth

Epoch 75/100


100%|██████████| 450/450 [00:27<00:00, 16.24it/s]


Train loss: 1.0087, Val loss: 1.4964

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: even can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques, I
---
Model saved to gpt_model.pth_epoch75.pth

Epoch 76/100


100%|██████████| 450/450 [00:27<00:00, 16.39it/s]


Train loss: 1.0058, Val loss: 1.5026

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: = can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all ads including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques, I
---
Model saved to gpt_model.pth_epoch76.pth

Epoch 77/100


100%|██████████| 450/450 [00:27<00:00, 16.23it/s]


Train loss: 1.0061, Val loss: 1.5066

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: = can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques, I
---
Model saved to gpt_model.pth_epoch77.pth

Epoch 78/100


100%|██████████| 450/450 [00:27<00:00, 16.34it/s]


Train loss: 1.0017, Val loss: 1.5023

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: at can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the or people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques, I
---
Model saved to gpt_model.pth_epoch78.pth

Epoch 79/100


100%|██████████| 450/450 [00:27<00:00, 16.35it/s]


Train loss: 1.0020, Val loss: 1.4961

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: = can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new sounds I
---
Model saved to gpt_model.pth_epoch79.pth

Epoch 80/100


100%|██████████| 450/450 [00:27<00:00, 16.22it/s]


Train loss: 1.0013, Val loss: 1.4951

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: even can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques, I
---
Model saved to gpt_model.pth_epoch80.pth

Epoch 81/100


100%|██████████| 450/450 [00:27<00:00, 16.34it/s]


Train loss: 0.9977, Val loss: 1.4979

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: = can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the or people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques. I
---
Model saved to gpt_model.pth_epoch81.pth

Epoch 82/100


100%|██████████| 450/450 [00:27<00:00, 16.35it/s]


Train loss: 0.9964, Val loss: 1.4981

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: at can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques, I
---
Model saved to gpt_model.pth_epoch82.pth

Epoch 83/100


100%|██████████| 450/450 [00:27<00:00, 16.34it/s]


Train loss: 0.9964, Val loss: 1.4972

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: even can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques. I
---
Model saved to gpt_model.pth_epoch83.pth

Epoch 84/100


100%|██████████| 450/450 [00:27<00:00, 16.21it/s]


Train loss: 0.9939, Val loss: 1.4999

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: even can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all answers including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques. I
---
Model saved to gpt_model.pth_epoch84.pth

Epoch 85/100


100%|██████████| 450/450 [00:27<00:00, 16.33it/s]


Train loss: 0.9929, Val loss: 1.5023

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: = can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the or people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques, I
---
Model saved to gpt_model.pth_epoch85.pth

Epoch 86/100


100%|██████████| 450/450 [00:27<00:00, 16.35it/s]


Train loss: 0.9930, Val loss: 1.5012

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: even can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all answers including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques. I
---
Model saved to gpt_model.pth_epoch86.pth

Epoch 87/100


100%|██████████| 450/450 [00:27<00:00, 16.28it/s]


Train loss: 0.9903, Val loss: 1.5014

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: even can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques, I
---
Model saved to gpt_model.pth_epoch87.pth

Epoch 88/100


100%|██████████| 450/450 [00:27<00:00, 16.36it/s]


Train loss: 0.9890, Val loss: 1.4998

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: even can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques, I
---
Model saved to gpt_model.pth_epoch88.pth

Epoch 89/100


100%|██████████| 450/450 [00:27<00:00, 16.37it/s]


Train loss: 0.9895, Val loss: 1.5031

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: = can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the or people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques, I
---
Model saved to gpt_model.pth_epoch89.pth

Epoch 90/100


100%|██████████| 450/450 [00:27<00:00, 16.20it/s]


Train loss: 0.9866, Val loss: 1.5042

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: = can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new sounds I
---
Model saved to gpt_model.pth_epoch90.pth

Epoch 91/100


100%|██████████| 450/450 [00:27<00:00, 16.34it/s]


Train loss: 0.9846, Val loss: 1.5130

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: = can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new sounds I
---
Model saved to gpt_model.pth_epoch91.pth

Epoch 92/100


100%|██████████| 450/450 [00:27<00:00, 16.31it/s]


Train loss: 0.9866, Val loss: 1.4997

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: even can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new sounds I
---
Model saved to gpt_model.pth_epoch92.pth

Epoch 93/100


100%|██████████| 450/450 [00:27<00:00, 16.23it/s]


Train loss: 0.9839, Val loss: 1.4985

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: even can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques. I
---
Model saved to gpt_model.pth_epoch93.pth

Epoch 94/100


100%|██████████| 450/450 [00:27<00:00, 16.34it/s]


Train loss: 0.9825, Val loss: 1.5102

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: = can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all answers including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new sounds I
---
Model saved to gpt_model.pth_epoch94.pth

Epoch 95/100


100%|██████████| 450/450 [00:27<00:00, 16.33it/s]


Train loss: 0.9811, Val loss: 1.5059

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: even can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques, I
---
Model saved to gpt_model.pth_epoch95.pth

Epoch 96/100


100%|██████████| 450/450 [00:27<00:00, 16.36it/s]


Train loss: 0.9807, Val loss: 1.4970

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: = can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the or people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques. I
---
Model saved to gpt_model.pth_epoch96.pth

Epoch 97/100


100%|██████████| 450/450 [00:27<00:00, 16.20it/s]


Train loss: 0.9796, Val loss: 1.5095

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: you can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all answers including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new sounds I
---
Model saved to gpt_model.pth_epoch97.pth

Epoch 98/100


100%|██████████| 450/450 [00:27<00:00, 16.43it/s]


Train loss: 0.9787, Val loss: 1.5066

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: even can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques, I
---
Model saved to gpt_model.pth_epoch98.pth

Epoch 99/100


100%|██████████| 450/450 [00:27<00:00, 16.37it/s]


Train loss: 0.9770, Val loss: 1.5083

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: even can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques, I
---
Model saved to gpt_model.pth_epoch99.pth

Epoch 100/100


100%|██████████| 450/450 [00:27<00:00, 16.23it/s]


Train loss: 0.9775, Val loss: 1.4947

Samples:
Input    : breast-fed, you can exponentiate
Expected : you can exponentiate the
Predicted: even can exponentiate the
---
Input    : to treat all people,
Expected : treat all people, including
Predicted: the all people, including
---
Input    : experimenting with new techniques.
Expected : with new techniques. I
Predicted: with new techniques, I
---
Model saved to gpt_model.pth_epoch100.pth



# Inference

In [21]:
import json
import torch
import torch.nn.functional as F

# Assume GPTConfig and GPTModel classes are already defined or imported here

def encode_input(text, vocab, max_seq_len):
    tokens = [vocab["<BOS>"]] + [vocab.get(w, vocab["<UNK>"]) for w in text.split()] + [vocab["<EOS>"]]
    if len(tokens) > max_seq_len:
        tokens = tokens[:max_seq_len]
    else:
        tokens += [vocab["<PAD>"]] * (max_seq_len - len(tokens))
    return torch.tensor([tokens], dtype=torch.long)

def decode_output(token_ids, inv_vocab):
    words = []
    for idx in token_ids:
        word = inv_vocab.get(str(idx.item()), "<UNK>")
        if word == "<EOS>":
            break
        if word not in ("<PAD>", "<BOS>"):
            words.append(word)
    return " ".join(words)

def infer(model, input_text, vocab, inv_vocab, max_seq_len, max_new_tokens=20, temperature=1.0, top_k=10, device=None):
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    model.to(device)

    input_ids = encode_input(input_text, vocab, max_seq_len).to(device)
    eos_token_id = vocab.get("<EOS>")

    with torch.no_grad():
        generated_ids = model.generate(
            input_ids,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_k=top_k,
            eos_token_id=eos_token_id,
        )

    output_ids = generated_ids[0].cpu()
    return decode_output(output_ids, inv_vocab)

# --- Usage Example ---
VOCAB_PATH = "/kaggle/working/vocab.json"
INV_VOCAB_PATH = "/kaggle/working/inv_vocab.json"
MODEL_CHECKPOINT = "/kaggle/working/Model.pth"  # update path

# Load vocabularies
with open(VOCAB_PATH, "r", encoding="utf-8") as f:
    vocab = json.load(f)
with open(INV_VOCAB_PATH, "r", encoding="utf-8") as f:
    inv_vocab = json.load(f)

# Create config matching your trained model
cfg = GPTConfig(
    vocab_size=len(vocab),
    d_model=512,
    n_layers=6,
    n_heads=8,
    max_seq_len=5,
    dropout=0.1,
)

model = GPTModel(cfg)
state_dict = torch.load(MODEL_CHECKPOINT, map_location="cpu")

# If trained with DataParallel, remove "module." prefix
from collections import OrderedDict
new_state_dict = OrderedDict()
for k, v in state_dict.items():
    name = k[7:] if k.startswith("module.") else k
    new_state_dict[name] = v
model.load_state_dict(new_state_dict)

# Example inference
input_text = "Hello, how are you ?"
output_text = infer(model, input_text, vocab, inv_vocab, cfg.max_seq_len)
print("Input:", input_text)
print("Output:", output_text)


Input: Hello, how are you ?
Output: would be happy to help, but
