In [None]:
!pip install torch tqdm streamlit



In [None]:
######################################
# Pseudo2Code.py
######################################
import os
import streamlit as st
import torch
import torch.nn as nn
import torch.optim as optim
import math
import re
from tqdm import tqdm
from typing import List, Tuple
import random
import requests
from torch.utils.data import DataLoader, TensorDataset

In [None]:
# ----------------------------
# 1. Hyperparameters
# ----------------------------
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# --- Sequence and model geometry ---
MAX_LEN = 128      # every sequence is truncated/padded to this many token ids
EMBED_DIM = 256    # width of token embeddings (the Transformer's d_model)
FF_DIM = 512       # hidden width of the position-wise feed-forward sublayer
NHEAD = 4          # attention heads per multi-head attention module
NUM_ENCODER_LAYERS = NUM_DECODER_LAYERS = 2

# --- Optimisation ---
BATCH_SIZE = 64
EPOCHS = 10        # raise this for a real training run
LEARNING_RATE = 1e-4

# --- Special vocabulary entries (padding, start, end, out-of-vocabulary) ---
PAD_TOKEN, SOS_TOKEN, EOS_TOKEN, UNK_TOKEN = "<pad>", "<sos>", "<eos>", "<unk>"

In [None]:
# ----------------------------
# 2. Data Loading & Preprocessing
# ----------------------------

def load_spoc_data(file_path: str, timeout: float = 30.0) -> List[Tuple[str, str]]:
    """
    Load (pseudo_code, cpp_code) pairs from a tab-separated file or URL.

    The first column of each row is taken as the pseudocode and the second
    as the C++ code; any further columns are ignored.

    Args:
        file_path: Local path, or an ``http://`` / ``https://`` URL that is
            fetched with ``requests``.
        timeout: Seconds to wait for the HTTP server before giving up
            (only used for URLs).

    Returns:
        A list of ``(pseudo, cpp)`` tuples with surrounding whitespace
        stripped from each field. Rows with fewer than two columns, or with
        an empty pseudocode or code column, are skipped.

    Raises:
        requests.HTTPError: If the URL responds with an error status.
        OSError: If the local file cannot be opened.
    """
    if file_path.startswith(("http://", "https://")):
        # A timeout keeps a stalled connection from hanging the notebook forever.
        response = requests.get(file_path, timeout=timeout)
        response.raise_for_status()
        lines = response.text.split("\n")
    else:
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()

    pairs = []
    for line in lines:
        # Strip only the line terminator. Stripping all whitespace would eat a
        # leading tab and shift every column left when the first field is
        # empty, pairing the code with whatever column follows it.
        cols = line.rstrip("\r\n").split('\t')
        if len(cols) < 2:
            continue
        pseudo = cols[0].strip()
        cpp = cols[1].strip()
        if not pseudo or not cpp:
            continue
        pairs.append((pseudo, cpp))

    return pairs

def create_dataloader(pairs, src_stoi, tgt_stoi, batch_size):
    """
    Encode every (pseudo, cpp) pair and wrap the result in a shuffling DataLoader.

    Each side is numericalized with its own vocabulary and passed through
    pad_sequence with MAX_LEN, so all rows have the same length. The returned
    loader yields (source_ids, target_ids) batches of dtype long; tensors are
    created on the default device and pin_memory is enabled.
    """
    encoded = [
        (
            pad_sequence(numericalize(pseudo, src_stoi), MAX_LEN, src_stoi[PAD_TOKEN]),
            pad_sequence(numericalize(cpp, tgt_stoi), MAX_LEN, tgt_stoi[PAD_TOKEN]),
        )
        for pseudo, cpp in pairs
    ]
    src_rows = [src_ids for src_ids, _ in encoded]
    tgt_rows = [tgt_ids for _, tgt_ids in encoded]

    dataset = TensorDataset(
        torch.tensor(src_rows, dtype=torch.long),
        torch.tensor(tgt_rows, dtype=torch.long),
    )
    return DataLoader(dataset, batch_size=batch_size, shuffle=True, pin_memory=True)

def tokenize_line(text: str) -> List[str]:
    """
    Split a line of pseudocode or C++ into tokens.

    Three kinds of token are produced, in order of appearance:
      * runs of word characters (identifiers, keywords, numbers),
      * a single operator character from ``= + - * / % < > ! & | ^ ~``,
      * a single punctuation character from ``: ; { } , ( ) [ ] .``

    Operator characters are always emitted one at a time, so ``==`` becomes
    ``['=', '=']`` and ``::`` becomes ``[':', ':']``. Any other character
    (whitespace, quotes, ``#`` ...) is dropped.
    """
    token_pattern = (
        r'\b\w+\b'               # word-like run
        r'|[-+*/%=<>!&|^~]'      # exactly one operator character
        r'|[:;{},()\[\]\.]'      # exactly one punctuation character
    )
    return re.findall(token_pattern, text)

def build_vocab(pairs: List[Tuple[str, str]]) -> Tuple[dict, dict, dict, dict]:
    """
    Derive the source (pseudocode) and target (C++) vocabularies from pairs.

    Every distinct token produced by tokenize_line is collected per side.
    Each vocabulary starts with the four special tokens in the fixed order
    PAD, SOS, EOS, UNK (ids 0-3), followed by the collected tokens in sorted
    order.

    Returns:
      src_stoi, src_itos, tgt_stoi, tgt_itos
      (``stoi`` maps token -> id, ``itos`` maps id -> token)
    """
    src_words, tgt_words = set(), set()
    for pseudo, cpp in pairs:
        src_words.update(tokenize_line(pseudo))
        tgt_words.update(tokenize_line(cpp))

    specials = [PAD_TOKEN, SOS_TOKEN, EOS_TOKEN, UNK_TOKEN]

    # id -> token first; token -> id is its inverse, built in the same order.
    src_itos = dict(enumerate(specials + sorted(src_words)))
    tgt_itos = dict(enumerate(specials + sorted(tgt_words)))
    src_stoi = {word: idx for idx, word in src_itos.items()}
    tgt_stoi = {word: idx for idx, word in tgt_itos.items()}

    return src_stoi, src_itos, tgt_stoi, tgt_itos

def numericalize(text: str, stoi: dict) -> List[int]:
    """
    Tokenize ``text`` and map each token to its id in ``stoi``.

    Tokens missing from ``stoi`` are replaced by the id of UNK_TOKEN.
    """
    return [
        stoi[token] if token in stoi else stoi[UNK_TOKEN]
        for token in tokenize_line(text)
    ]

def pad_sequence(seq: List[int], max_len: int, pad_id: int,
                 sos_id: int = 1, eos_id: int = 2) -> List[int]:
    """
    Frame ``seq`` with SOS/EOS ids and pad or truncate it to ``max_len``.

    Args:
        seq: Token ids without control tokens.
        max_len: Exact length of the returned list; must be at least 2 so the
            SOS and EOS ids fit.
        pad_id: Id appended after EOS until the list reaches ``max_len``.
        sos_id: Id placed at the front. The default of 1 matches a vocabulary
            whose special tokens are laid out as PAD=0, SOS=1, EOS=2, UNK=3.
        eos_id: Id placed after the (possibly truncated) content. The default
            of 2 matches the same layout.

    Returns:
        ``[sos_id] + seq[:max_len - 2] + [eos_id]`` followed by padding.

    Raises:
        ValueError: If ``max_len`` is smaller than 2.
    """
    if max_len < 2:
        raise ValueError(f"max_len must be >= 2 to hold SOS and EOS, got {max_len}")

    # The control ids are parameters rather than lookups in a module-level
    # vocabulary, so the function works for either side and before any
    # vocabulary exists as a global.
    content = list(seq[:max_len - 2])  # leave room for SOS/EOS
    framed = [sos_id] + content + [eos_id]
    return framed + [pad_id] * (max_len - len(framed))

def create_batches(pairs, src_stoi, tgt_stoi, batch_size):
    """
    Yield shuffled batches of (source_ids, target_ids) tensors on DEVICE.

    Each pair is numericalized with the matching vocabulary and brought to
    MAX_LEN with pad_sequence. The last batch may hold fewer than
    ``batch_size`` rows.

    Args:
        pairs: Sequence of (pseudo, cpp) string pairs. It is not modified.
        src_stoi: Token -> id mapping for the pseudocode side.
        tgt_stoi: Token -> id mapping for the C++ side.
        batch_size: Maximum number of pairs per yielded batch.

    Yields:
        Tuples ``(src_batch, tgt_batch)`` of long tensors with one row per pair.
    """
    # Shuffle a private copy: random.shuffle works in place and would otherwise
    # reorder the list owned by the caller on every pass.
    shuffled = list(pairs)
    random.shuffle(shuffled)

    for start in range(0, len(shuffled), batch_size):
        src_batch = []
        tgt_batch = []
        for pseudo, cpp in shuffled[start:start + batch_size]:
            src_ids = numericalize(pseudo, src_stoi)
            tgt_ids = numericalize(cpp, tgt_stoi)

            # Pad/truncate so every row in the batch has the same length.
            src_batch.append(pad_sequence(src_ids, MAX_LEN, src_stoi[PAD_TOKEN]))
            tgt_batch.append(pad_sequence(tgt_ids, MAX_LEN, tgt_stoi[PAD_TOKEN]))

        src_batch = torch.tensor(src_batch, dtype=torch.long, device=DEVICE)
        tgt_batch = torch.tensor(tgt_batch, dtype=torch.long, device=DEVICE)
        yield src_batch, tgt_batch

In [None]:
# ----------------------------
# 3. Transformer Model Implementation (from scratch)
# ----------------------------

class PositionalEncoding(nn.Module):
    """
    Adds fixed sinusoidal position information to a batch of embeddings.

    A table of shape (1, max_len, d_model) is precomputed and stored as the
    non-trainable buffer ``pe``. Even feature columns hold sines and odd
    columns hold cosines of the position scaled by a geometric frequency
    schedule.

    Args:
        d_model: Feature width of the embeddings (even or odd).
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer odd column than even column,
        # so the frequency vector is trimmed to match; for even d_model the
        # slice covers the whole vector.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        pe = pe.unsqueeze(0)  # shape (1, max_len, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        ``x`` is indexed as (batch_size, seq_len, d_model).
        """
        seq_len = x.size(1)
        x = x + self.pe[:, :seq_len, :]
        return x

class MultiHeadAttention(nn.Module):
    """
    Scaled dot-product attention computed in ``n_heads`` parallel heads.

    ``d_model`` must be divisible by ``n_heads``; each head works on a slice
    of width ``d_model // n_heads``. Query, key and value are projected by
    separate linear layers, and the concatenated head outputs pass through a
    final linear layer.
    """

    def __init__(self, d_model, n_heads):
        super().__init__()
        assert d_model % n_heads == 0
        self.d_model = d_model
        self.n_heads = n_heads
        self.head_dim = d_model // n_heads

        self.query_linear = nn.Linear(d_model, d_model)
        self.key_linear = nn.Linear(d_model, d_model)
        self.value_linear = nn.Linear(d_model, d_model)
        self.out_linear = nn.Linear(d_model, d_model)

    def _split_heads(self, projected, batch, length):
        """Reshape (batch, length, d_model) into (batch, n_heads, length, head_dim)."""
        return projected.view(batch, length, self.n_heads, self.head_dim).transpose(1, 2)

    def forward(self, query, key, value, mask=None):
        """
        Attend from ``query`` positions over ``key``/``value`` positions.

        Inputs are 3-D tensors indexed (batch, seq_len, d_model). When ``mask``
        is given, score positions where ``mask == 0`` are set to -inf before
        the softmax. Returns a tensor shaped (batch, query_len, d_model).
        """
        _, q_len, _ = query.size()
        _, k_len, _ = key.size()
        batch, v_len, _ = value.size()

        # Project, then expose the head dimension.
        q_heads = self._split_heads(self.query_linear(query), batch, q_len)
        k_heads = self._split_heads(self.key_linear(key), batch, k_len)
        v_heads = self._split_heads(self.value_linear(value), batch, v_len)

        # (batch, n_heads, q_len, k_len) similarity scores.
        scores = torch.matmul(q_heads, k_heads.transpose(-2, -1)) / math.sqrt(self.head_dim)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))
        weights = torch.softmax(scores, dim=-1)

        # Weighted sum of values, then merge the heads back into d_model.
        per_head = torch.matmul(weights, v_heads)
        merged = per_head.transpose(1, 2).contiguous().view(batch, q_len, self.d_model)
        return self.out_linear(merged)

class FeedForward(nn.Module):
    """Position-wise two-layer MLP: Linear -> ReLU -> Linear, back to d_model."""

    def __init__(self, d_model, dim_feedforward):
        super().__init__()
        self.fc1 = nn.Linear(d_model, dim_feedforward)
        self.fc2 = nn.Linear(dim_feedforward, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Expand ``x`` to ``dim_feedforward``, apply ReLU, project back."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)

class EncoderLayer(nn.Module):
    """
    One encoder block: self-attention and a feed-forward network, each
    followed by dropout (p=0.1), a residual connection and LayerNorm.
    """

    def __init__(self, d_model, n_heads, dim_feedforward):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, n_heads)
        self.ff = FeedForward(d_model, dim_feedforward)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(0.1)

    def forward(self, src, src_mask=None):
        """Run ``src`` through the block; ``src_mask`` is forwarded to self-attention."""
        # Sublayer 1: self-attention with residual + norm.
        attended = self.self_attn(src, src, src, mask=src_mask)
        hidden = self.norm1(src + self.dropout(attended))
        # Sublayer 2: feed-forward with residual + norm.
        transformed = self.ff(hidden)
        return self.norm2(hidden + self.dropout(transformed))

class DecoderLayer(nn.Module):
    """
    One decoder block with three sublayers, each wrapped in dropout (p=0.1),
    a residual connection and LayerNorm:

      1. self-attention over the target sequence,
      2. cross-attention from the target over the encoder output,
      3. a position-wise feed-forward network.
    """

    def __init__(self, d_model, n_heads, dim_feedforward):
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, n_heads)
        self.cross_attn = MultiHeadAttention(d_model, n_heads)
        self.ff = FeedForward(d_model, dim_feedforward)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(0.1)

    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None):
        """
        Args:
            tgt: Target-side activations entering the block.
            memory: Encoder output used as key/value in cross-attention.
            tgt_mask: Optional mask for self-attention (e.g. to hide future tokens).
            memory_mask: Optional mask for cross-attention.

        Returns:
            Target-side activations after all three sublayers.
        """
        # Self-attention (mask future tokens)
        tgt = self.norm1(tgt + self.dropout(self.self_attn(tgt, tgt, tgt, mask=tgt_mask)))
        # Cross-attention: queries come from the target, keys/values from memory
        tgt = self.norm2(tgt + self.dropout(self.cross_attn(tgt, memory, memory, mask=memory_mask)))
        # Feed Forward
        ff_out = self.ff(tgt)
        tgt = self.norm3(tgt + self.dropout(ff_out))
        return tgt

class Encoder(nn.Module):
    """
    Token embedding + positional encoding followed by a stack of
    ``num_layers`` EncoderLayer blocks.
    """

    def __init__(self, vocab_size, d_model, n_heads, num_layers, dim_feedforward):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model)
        self.layers = nn.ModuleList(
            EncoderLayer(d_model, n_heads, dim_feedforward) for _ in range(num_layers)
        )

    def forward(self, src, src_mask=None):
        """
        Encode ``src`` token ids (batch_size, seq_len) into activations of
        width d_model; ``src_mask`` is handed to every layer.
        """
        hidden = self.pos_encoding(self.embedding(src))
        for block in self.layers:
            hidden = block(hidden, src_mask)
        return hidden

class Decoder(nn.Module):
    """
    Token embedding + positional encoding, a stack of ``num_layers``
    DecoderLayer blocks, and a final linear projection to vocabulary logits.
    """

    def __init__(self, vocab_size, d_model, n_heads, num_layers, dim_feedforward):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model)
        self.layers = nn.ModuleList(
            DecoderLayer(d_model, n_heads, dim_feedforward) for _ in range(num_layers)
        )
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None):
        """
        Decode ``tgt`` token ids against the encoder output ``memory``.

        Both masks are passed unchanged to every layer. Returns logits with a
        trailing dimension of ``vocab_size``.
        """
        hidden = self.pos_encoding(self.embedding(tgt))
        for block in self.layers:
            hidden = block(hidden, memory, tgt_mask, memory_mask)
        return self.fc_out(hidden)

class TransformerSeq2Seq(nn.Module):
    """
    Encoder-decoder Transformer that maps source token ids to target-vocabulary
    logits. The encoder and decoder are exposed as ``self.encoder`` and
    ``self.decoder`` so they can also be called separately.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, n_heads, num_encoder_layers,
                 num_decoder_layers, dim_feedforward):
        super(TransformerSeq2Seq, self).__init__()
        self.encoder = Encoder(src_vocab_size, d_model, n_heads, num_encoder_layers, dim_feedforward)
        self.decoder = Decoder(tgt_vocab_size, d_model, n_heads, num_decoder_layers, dim_feedforward)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None, memory_mask=None):
        """
        Args:
            src: Source token ids, (batch_size, src_seq_len).
            tgt: Target token ids fed to the decoder, (batch_size, tgt_seq_len).
            src_mask: Optional mask for encoder self-attention.
            tgt_mask: Optional mask for decoder self-attention.
            memory_mask: Optional mask for decoder cross-attention over the
                encoder output. Defaults to None (no masking), which is what
                happens when the argument is omitted.

        Returns:
            Logits of shape (batch_size, tgt_seq_len, vocab_size).
        """
        memory = self.encoder(src, src_mask)  # (batch_size, src_seq_len, d_model)
        outputs = self.decoder(tgt, memory, tgt_mask, memory_mask)
        return outputs

In [None]:
# ----------------------------
# 4. Training Setup
# ----------------------------
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from typing import List, Tuple
import random
def generate_subsequent_mask(size):
    """
    Build a (size, size) boolean mask for causal decoding.

    Entry [i, j] is True when position i may attend to position j (j <= i)
    and False for future positions (j > i).
    """
    future = torch.triu(torch.ones(size, size), diagonal=1)  # 1.0 strictly above the diagonal
    return future == 0

def train_one_epoch(model, optimizer, criterion, train_data, src_stoi, tgt_stoi):
    """
    Train ``model`` for one pass over ``train_data`` with teacher forcing.

    Args:
        model: Seq2seq model called as ``model(src, tgt_inp, src_mask, tgt_mask)``.
        optimizer: Optimizer stepping the model's parameters.
        criterion: Loss taking (N, vocab) logits and (N,) target ids.
        train_data: List of (pseudo, cpp) string pairs to train on.
        src_stoi: Token -> id mapping for the source side.
        tgt_stoi: Token -> id mapping for the target side.

    Returns:
        Mean loss over the batches of this epoch, or NaN when ``train_data``
        produced no batches.
    """
    model.train()
    total_loss = 0
    steps = 0

    # Build the loader from the data that was passed in, not from a
    # module-level variable that merely happens to exist in the script.
    data_loader = create_dataloader(train_data, src_stoi, tgt_stoi, BATCH_SIZE)
    for src_batch, tgt_batch in data_loader:
        src_batch = src_batch.to(DEVICE)
        tgt_batch = tgt_batch.to(DEVICE)

        # Teacher forcing: the decoder sees tokens [0, n-1) and must predict [1, n).
        tgt_inp = tgt_batch[:, :-1]
        tgt_out = tgt_batch[:, 1:]

        # Causal mask so a position cannot look at later target tokens.
        tgt_seq_len = tgt_inp.size(1)
        tgt_mask = generate_subsequent_mask(tgt_seq_len).to(DEVICE)

        optimizer.zero_grad()
        logits = model(src_batch, tgt_inp, None, tgt_mask)  # (B, seq_len, vocab_size)

        # reshape (not view) because the shifted target slice is non-contiguous.
        loss = criterion(logits.reshape(-1, logits.size(-1)), tgt_out.reshape(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        steps += 1

    if steps == 0:
        # No batches: report "no measurement" instead of dividing by zero.
        return float("nan")
    return total_loss / steps

def evaluate(model, criterion, eval_data, src_stoi, tgt_stoi):
    """
    Compute the mean teacher-forced loss of ``model`` on ``eval_data``.

    The model is switched to eval mode and gradients are disabled. Batches
    come from create_batches with BATCH_SIZE.

    Args:
        model: Seq2seq model called as ``model(src, tgt_inp, src_mask, tgt_mask)``.
        criterion: Loss taking (N, vocab) logits and (N,) target ids.
        eval_data: List of (pseudo, cpp) string pairs.
        src_stoi: Token -> id mapping for the source side.
        tgt_stoi: Token -> id mapping for the target side.

    Returns:
        Mean loss over all batches, or NaN when ``eval_data`` yields no batches.
    """
    model.eval()
    total_loss = 0
    steps = 0
    with torch.no_grad():
        for src_batch, tgt_batch in create_batches(eval_data, src_stoi, tgt_stoi, BATCH_SIZE):
            # Decoder input drops the last token; the expected output drops the first.
            tgt_inp = tgt_batch[:, :-1]
            tgt_out = tgt_batch[:, 1:]
            tgt_seq_len = tgt_inp.size(1)
            tgt_mask = generate_subsequent_mask(tgt_seq_len).to(DEVICE)

            logits = model(src_batch, tgt_inp, None, tgt_mask)
            # reshape (not view) because the shifted target slice is non-contiguous.
            loss = criterion(logits.reshape(-1, logits.size(-1)), tgt_out.reshape(-1))

            total_loss += loss.item()
            steps += 1

    if steps == 0:
        # Empty evaluation set: report "no measurement" instead of dividing by zero.
        return float("nan")
    return total_loss / steps

def greedy_decode(model, src, src_stoi, tgt_stoi, tgt_itos, max_len=MAX_LEN):
    """
    Generate a target string for one source sequence using greedy search.

    Args:
        model: Model exposing ``encoder(src)`` and ``decoder(ys, memory, tgt_mask)``.
        src: 1D list of source token ids.
        src_stoi: Source vocabulary; accepted for interface compatibility and
            not used here.
        tgt_stoi: Target token -> id mapping (must contain SOS and EOS).
        tgt_itos: Target id -> token mapping used to render the output.
        max_len: Upper bound on the decoded length, counting the initial SOS.

    Returns:
        The generated tokens joined by single spaces, without SOS and without
        EOS or anything after it.
    """
    model.eval()
    sos_id = tgt_stoi[SOS_TOKEN]
    eos_id = tgt_stoi[EOS_TOKEN]

    # Inference only: without no_grad every decoding step would keep an
    # autograd graph alive and memory use would grow with the output length.
    with torch.no_grad():
        src = torch.tensor(src, dtype=torch.long, device=DEVICE).unsqueeze(0)  # (1, seq_len)
        memory = model.encoder(src)  # (1, seq_len, d_model)

        ys = torch.tensor([[sos_id]], dtype=torch.long, device=DEVICE)  # (1, 1)
        for _ in range(max_len - 1):
            tgt_mask = generate_subsequent_mask(ys.size(1)).to(DEVICE)
            out = model.decoder(ys, memory, tgt_mask)  # (1, cur_len, vocab_size)
            # Pick the highest-scoring token at the last time step.
            next_token = torch.argmax(out[:, -1, :], dim=1).item()
            ys = torch.cat([ys, torch.tensor([[next_token]], device=DEVICE)], dim=1)
            if next_token == eos_id:
                break

    # Drop the leading SOS, then cut at the first EOS if one was produced.
    out_tokens = ys.squeeze(0).tolist()[1:]
    if eos_id in out_tokens:
        out_tokens = out_tokens[:out_tokens.index(eos_id)]

    return " ".join(tgt_itos[t] for t in out_tokens)

In [None]:
# ----------------------------
# 5. Main: Train the Model
# ----------------------------
# Training script: download the data, build vocabularies, train for EPOCHS
# epochs and save the weights together with the vocabularies.
# The `if` below does not create a scope, so every name assigned inside it
# (train_pairs, eval_pairs, src_stoi, tgt_stoi, model, ...) is a module-level
# global once this cell has run.
if __name__ == "__main__":
    # Hardcode the file paths from your GitHub repo (raw URLs):
    train_path = "https://raw.githubusercontent.com/asadsandhu/Pseudo2Code/main/spoc/train/spoc-train.tsv"
    eval_path  = "https://raw.githubusercontent.com/asadsandhu/Pseudo2Code/main/spoc/train/split/spoc-train-eval.tsv"

    print(f"Loading training data from {train_path} ...")
    train_pairs = load_spoc_data(train_path)
    print(f"Loaded {len(train_pairs)} training pairs.")

    print(f"Loading eval data from {eval_path} ...")
    eval_pairs = load_spoc_data(eval_path)
    print(f"Loaded {len(eval_pairs)} eval pairs.")

    # Vocabularies come from the training pairs only; tokens that appear
    # solely in the eval data are mapped to <unk> by numericalize.
    print("Building vocab...")
    src_stoi, src_itos, tgt_stoi, tgt_itos = build_vocab(train_pairs)
    # `global` at module scope has no effect; the assignment below creates a
    # module-level name either way.
    global stoi_eos
    # NOTE(review): nothing in the visible source reads stoi_eos (pad_sequence
    # does not reference it) - it looks like a leftover; confirm before removing.
    stoi_eos = tgt_stoi[EOS_TOKEN]

    print("Creating model...")
    model = TransformerSeq2Seq(
        src_vocab_size=len(src_stoi),
        tgt_vocab_size=len(tgt_stoi),
        d_model=EMBED_DIM,
        n_heads=NHEAD,
        num_encoder_layers=NUM_ENCODER_LAYERS,
        num_decoder_layers=NUM_DECODER_LAYERS,
        dim_feedforward=FF_DIM
    ).to(DEVICE)

    # Padding positions in the target do not contribute to the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=tgt_stoi[PAD_TOKEN])
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

    print("Starting training...")
    for epoch in range(1, EPOCHS+1):
        train_loss = train_one_epoch(model, optimizer, criterion, train_pairs, src_stoi, tgt_stoi)
        eval_loss  = evaluate(model, criterion, eval_pairs, src_stoi, tgt_stoi)
        print(f"Epoch [{epoch}/{EPOCHS}] - Train Loss: {train_loss:.4f}, Eval Loss: {eval_loss:.4f}")

    # Save model & vocab in a single checkpoint so inference can rebuild the
    # id <-> token mappings that match the trained embeddings.
    torch.save({
        'model_state_dict': model.state_dict(),
        'src_stoi': src_stoi,
        'src_itos': src_itos,
        'tgt_stoi': tgt_stoi,
        'tgt_itos': tgt_itos
    }, "model.pth")

    print("Model and vocab saved to model.pth")

Loading training data from https://raw.githubusercontent.com/asadsandhu/Pseudocode2Cpp/main/spoc/train/spoc-train.tsv ...
Loaded 293855 training pairs.
Loading eval data from https://raw.githubusercontent.com/asadsandhu/Pseudocode2Cpp/main/spoc/train/split/spoc-train-eval.tsv ...
Loaded 27289 eval pairs.
Building vocab...
Creating model...
Starting training...
Epoch [1/10] - Train Loss: 0.9915, Eval Loss: 0.4901
Epoch [2/10] - Train Loss: 0.4401, Eval Loss: 0.3597
Epoch [3/10] - Train Loss: 0.3326, Eval Loss: 0.2897
Epoch [4/10] - Train Loss: 0.2752, Eval Loss: 0.2735
Epoch [5/10] - Train Loss: 0.2401, Eval Loss: 0.2281
Epoch [6/10] - Train Loss: 0.2166, Eval Loss: 0.2111
Epoch [7/10] - Train Loss: 0.2002, Eval Loss: 0.2015
Epoch [8/10] - Train Loss: 0.1883, Eval Loss: 0.1919
Epoch [9/10] - Train Loss: 0.1793, Eval Loss: 0.1848
Epoch [10/10] - Train Loss: 0.1724, Eval Loss: 0.1819
Model and vocab saved to transformer_spoc.pth
