In [5]:
!pip install pandas nltk



In [6]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
import math
import argparse
import os
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [8]:
class Vocabulary:
    def __init__(self, freq_threshold=1):
        # Special tokens
        self.freq_threshold = freq_threshold
        self.itos = {0: "<pad>", 1: "<sos>", 2: "<eos>", 3: "<unk>"}
        self.stoi = {"<pad>": 0, "<sos>": 1, "<eos>": 2, "<unk>": 3}
        self.freqs = {}
        self.idx = 4

    def build_vocabulary(self, sentence_list):
        for sentence in sentence_list:
            for token in str(sentence).strip().split():
                self.freqs[token] = self.freqs.get(token, 0) + 1
        for token, freq in self.freqs.items():
            if freq >= self.freq_threshold:
                self.stoi[token] = self.idx
                self.itos[self.idx] = token
                self.idx += 1

    def numericalize(self, text):
        if text is None or (isinstance(text, float) and pd.isna(text)):
            return []
        text = str(text)
        tokens = text.strip().split()
        return [self.stoi.get(token, self.stoi["<unk>"]) for token in tokens]

In [9]:
class PseudocodeDataset(Dataset):
    def __init__(self, filepath, src_vocab, trg_vocab):
        self.data = pd.read_csv(filepath, sep='\t')
        self.src_vocab = src_vocab
        self.trg_vocab = trg_vocab

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        src = self.data.iloc[idx]["text"]
        trg = self.data.iloc[idx]["code"]
        src_indices = [self.src_vocab.stoi["<sos>"]] + self.src_vocab.numericalize(src) + [self.src_vocab.stoi["<eos>"]]
        trg_indices = [self.trg_vocab.stoi["<sos>"]] + self.trg_vocab.numericalize(trg) + [self.trg_vocab.stoi["<eos>"]]
        return torch.tensor(src_indices), torch.tensor(trg_indices)

In [10]:
def collate_fn(batch):
    src_batch, trg_batch = zip(*batch)
    src_pad = nn.utils.rnn.pad_sequence(src_batch, batch_first=True, padding_value=src_vocab.stoi["<pad>"])
    trg_pad = nn.utils.rnn.pad_sequence(trg_batch, batch_first=True, padding_value=trg_vocab.stoi["<pad>"])
    return src_pad, trg_pad

In [11]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.3, max_len=5000): 
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model) 
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) 
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0) 
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)


In [12]:
class TransformerSeq2Seq(nn.Module):
    def __init__(self, src_vocab_size, trg_vocab_size, d_model=512, nhead=8,
                 num_encoder_layers=3, num_decoder_layers=3, dim_feedforward=512, dropout=0.3):  # Increased dropout
        super().__init__()
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.trg_embedding = nn.Embedding(trg_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, dropout)
        self.transformer = nn.Transformer(d_model=d_model, nhead=nhead,
                                          num_encoder_layers=num_encoder_layers,
                                          num_decoder_layers=num_decoder_layers,
                                          dim_feedforward=dim_feedforward, dropout=dropout)
        self.fc_out = nn.Linear(d_model, trg_vocab_size)
        self.d_model = d_model

    def forward(self, src, trg):
        src_mask = self.generate_square_subsequent_mask(src.size(1)).to(src.device)
        trg_mask = self.generate_square_subsequent_mask(trg.size(1)).to(trg.device)
        src_emb = self.positional_encoding(self.src_embedding(src) * math.sqrt(self.d_model))
        trg_emb = self.positional_encoding(self.trg_embedding(trg) * math.sqrt(self.d_model))
        src_emb = src_emb.transpose(0, 1)
        trg_emb = trg_emb.transpose(0, 1)
        output = self.transformer(src_emb, trg_emb, src_mask=src_mask, tgt_mask=trg_mask)
        output = output.transpose(0, 1)  
        return self.fc_out(output)

    def generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask


In [13]:
def train_epoch(model, dataloader, optimizer, criterion, device):
    model.train()
    epoch_loss = 0
    for src, trg in dataloader:
        src, trg = src.to(device), trg.to(device)
        optimizer.zero_grad()
        output = model(src, trg[:, :-1])
        output_dim = output.shape[-1]
        output = output.contiguous().view(-1, output_dim)
        trg_target = trg[:, 1:].contiguous().view(-1)
        loss = criterion(output, trg_target)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(dataloader)

In [14]:
def evaluate(model, dataloader, criterion, device):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for src, trg in dataloader:
            src, trg = src.to(device), trg.to(device)
            output = model(src, trg[:, :-1])
            output_dim = output.shape[-1]
            output = output.contiguous().view(-1, output_dim)
            trg = trg[:, 1:].contiguous().view(-1)
            loss = criterion(output, trg)
            epoch_loss += loss.item()
    return epoch_loss / len(dataloader)

In [15]:
def translate_sentence(model, sentence, src_vocab, trg_vocab, device, max_len=50):
    model.eval()
    tokens = [src_vocab.stoi["<sos>"]] + src_vocab.numericalize(sentence) + [src_vocab.stoi["<eos>"]]
    src_tensor = torch.LongTensor(tokens).unsqueeze(0).to(device)
    with torch.no_grad():
        src_mask = model.generate_square_subsequent_mask(src_tensor.size(1)).to(device)
        src_emb = model.positional_encoding(model.src_embedding(src_tensor) * math.sqrt(model.d_model))
        src_emb = src_emb.transpose(0, 1)
        memory = model.transformer.encoder(src_emb, src_mask)
    trg_indices = [trg_vocab.stoi["<sos>"]]
    for i in range(max_len):
        trg_tensor = torch.LongTensor(trg_indices).unsqueeze(0).to(device)
        trg_mask = model.generate_square_subsequent_mask(trg_tensor.size(1)).to(device)
        trg_emb = model.positional_encoding(model.trg_embedding(trg_tensor) * math.sqrt(model.d_model))
        trg_emb = trg_emb.transpose(0, 1)
        with torch.no_grad():
            output = model.transformer.decoder(trg_emb, memory, tgt_mask=trg_mask)
            output = output.transpose(0, 1)
            pred_token = output[:, -1, :].argmax(1).item()
        trg_indices.append(pred_token)
        if pred_token == trg_vocab.stoi["<eos>"]:
            break
    trg_tokens = [trg_vocab.itos[idx] for idx in trg_indices]
    return trg_tokens


In [16]:
if __name__ == '__main__':
    train_file = "/kaggle/input/psu-to-code/train.csv"
    eval_file = "/kaggle/input/psu-to-code/eval.csv"
    epochs = 30
    batch_size = 256
    save_path = "transformer_model.pth"

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device:", device)

    train_df = pd.read_csv(train_file,sep='\t')
    train_df.dropna(inplace=True)
    train_df.info()
    src_sentences = train_df['text'].fillna("").astype(str).tolist()
    trg_sentences = train_df['code'].fillna("").astype(str).tolist()

    global src_vocab, trg_vocab
    src_vocab = Vocabulary(freq_threshold=1)
    src_vocab.build_vocabulary(src_sentences)
    trg_vocab = Vocabulary(freq_threshold=1)
    trg_vocab.build_vocabulary(trg_sentences)

    train_dataset = PseudocodeDataset(train_file, src_vocab, trg_vocab)
    eval_dataset = PseudocodeDataset(eval_file, src_vocab, trg_vocab)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    eval_loader = DataLoader(eval_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

    model = TransformerSeq2Seq(len(src_vocab.stoi), len(trg_vocab.stoi)).to(device)
    criterion = nn.CrossEntropyLoss(ignore_index=src_vocab.stoi["<pad>"])
    optimizer = optim.Adam(model.parameters(), lr=0.0005)

    for epoch in range(epochs):
        train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
        eval_loss = evaluate(model, eval_loader, criterion, device)
        print(f'Epoch: {epoch+1}, Train Loss: {train_loss:.3f}, Eval Loss: {eval_loss:.3f}')

    torch.save({
        'model_state_dict': model.state_dict(),
        'src_vocab': src_vocab,
        'trg_vocab': trg_vocab
    }, save_path)
    print("Model saved to", save_path)


Using device: cuda
<class 'pandas.core.frame.DataFrame'>
Index: 181862 entries, 1 to 246083
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    181862 non-null  object
 1   code    181862 non-null  object
dtypes: object(2)
memory usage: 4.2+ MB




Epoch: 1, Train Loss: 2.186, Eval Loss: 1.443
Epoch: 2, Train Loss: 1.407, Eval Loss: 1.236
Epoch: 3, Train Loss: 1.162, Eval Loss: 1.175
Epoch: 4, Train Loss: 1.002, Eval Loss: 1.134
Epoch: 5, Train Loss: 0.878, Eval Loss: 1.114
Epoch: 6, Train Loss: 0.781, Eval Loss: 1.109
Epoch: 7, Train Loss: 0.713, Eval Loss: 1.115
Epoch: 8, Train Loss: 0.645, Eval Loss: 1.123
Epoch: 9, Train Loss: 0.594, Eval Loss: 1.146
Epoch: 10, Train Loss: 0.551, Eval Loss: 1.140
Epoch: 12, Train Loss: 0.484, Eval Loss: 1.139
Epoch: 13, Train Loss: 0.457, Eval Loss: 1.162
Epoch: 14, Train Loss: 0.434, Eval Loss: 1.145
Epoch: 15, Train Loss: 0.418, Eval Loss: 1.137
Epoch: 16, Train Loss: 0.399, Eval Loss: 1.152
Epoch: 17, Train Loss: 0.387, Eval Loss: 1.140
Epoch: 18, Train Loss: 0.372, Eval Loss: 1.140
Epoch: 19, Train Loss: 0.362, Eval Loss: 1.169
Epoch: 20, Train Loss: 0.353, Eval Loss: 1.145
Epoch: 21, Train Loss: 0.344, Eval Loss: 1.111
Epoch: 22, Train Loss: 0.335, Eval Loss: 1.149
Epoch: 23, Train Loss:

In [17]:
newSavePath="PsuToCode.pth"
torch.save(model.state_dict(), newSavePath)
print("Model saved to", newSavePath)


Model saved to PsuToCode.pth


In [18]:
import pickle
with open("src_vocab.pkl", "wb") as f:
    pickle.dump(src_vocab, f)
with open("trg_vocab.pkl", "wb") as f:
    pickle.dump(trg_vocab, f)

In [24]:
import torch
import pandas as pd
import nltk
import pickle
from torch.utils.data import DataLoader
import torch.nn.utils.rnn as rnn_utils

def load_vocab(filepath):
    with open(filepath, "rb") as f:
        vocab = pickle.load(f)
    return vocab

def eval_collate_fn(batch):
    # Each batch element is assumed to be a tuple: (src, trg)
    srcs, trgs = zip(*batch)
    # Pad the source sequences (adjust padding_value as needed)
    padded_srcs = rnn_utils.pad_sequence(srcs, batch_first=True, 
                                         padding_value=src_vocab.stoi.get("<pad>", 0))
    return padded_srcs, trgs

def translate_batch(model, src_batch, src_vocab, trg_vocab, device, max_len=50):
    """
    Batch decoding using greedy search.
    Assumes the model.forward(src, trg) returns logits for the target sequence.
    """
    batch_size = src_batch.size(0)
    src_batch = src_batch.to(device)
    
    # Initialize target sequences with <sos> token.
    trg_init = torch.LongTensor([trg_vocab.stoi["<sos>"]] * batch_size).unsqueeze(1).to(device)
    preds = trg_init

    for _ in range(max_len):
        # Forward pass: output shape assumed to be (batch_size, seq_len, vocab_size)
        output = model(src_batch, preds)
        # Get logits for the last time step and perform greedy selection.
        next_token_logits = output[:, -1, :]
        next_tokens = next_token_logits.argmax(dim=-1, keepdim=True)
        preds = torch.cat([preds, next_tokens], dim=1)
        # Stop if all sequences have generated an <eos> token.
        if (next_tokens == trg_vocab.stoi["<eos>"]).all():
            break

    # Convert token indices to strings, ignoring <sos> and truncating at <eos>.
    batch_tokens = []
    for seq in preds:
        tokens = []
        for token in seq:
            token = token.item()
            if token == trg_vocab.stoi["<sos>"]:
                continue
            if token == trg_vocab.stoi["<eos>"]:
                break
            tokens.append(trg_vocab.itos[token])
        batch_tokens.append(tokens)
    return batch_tokens

def calculate_bleu(model, dataloader, src_vocab, trg_vocab, device):
    references = []
    hypotheses = []
    with torch.no_grad():
        for src_batch, trgs in dataloader:
            batch_pred_tokens = translate_batch(model, src_batch, src_vocab, trg_vocab, device)
            for i, trg in enumerate(trgs):
                # Convert target token indices to strings, skipping special tokens.
                trg_tokens = [trg_vocab.itos[token.item()] for token in trg 
                              if token.item() not in {trg_vocab.stoi["<sos>"], trg_vocab.stoi["<eos>"]}]
                references.append([trg_tokens])
                pred_tokens = [tok for tok in batch_pred_tokens[i] if tok not in {"<sos>", "<eos>"}]
                hypotheses.append(pred_tokens)
    bleu = nltk.translate.bleu_score.corpus_bleu(references, hypotheses)
    return bleu

if __name__ == '__main__':
    eval_file = "/kaggle/input/psu-to-code/eval.csv"     
    model_path = "/kaggle/working/PsuToCode.pth"  
    src_vocab_path = "/kaggle/working/src_vocab.pkl"
    trg_vocab_path = "/kaggle/working/trg_vocab.pkl"
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device:", device)
    
    src_vocab = load_vocab(src_vocab_path)
    trg_vocab = load_vocab(trg_vocab_path)
    
    # Initialize your model (ensure TransformerSeq2Seq is defined/imported)
    model = TransformerSeq2Seq(len(src_vocab.stoi), len(trg_vocab.stoi)).to(device)
    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict)
    model.eval()
    
    eval_dataset = PseudocodeDataset(eval_file, src_vocab, trg_vocab)
    eval_dataloader = DataLoader(eval_dataset, batch_size=32, collate_fn=eval_collate_fn, pin_memory=True)
    
    bleu_score = calculate_bleu(model, eval_dataloader, src_vocab, trg_vocab, device)
    print(f"BLEU Score on Evaluation Set: {bleu_score:.4f}")


Using device: cuda


  state_dict = torch.load(model_path, map_location=device)


BLEU Score on Evaluation Set: 0.5691


In [22]:
import torch
import pickle
from torch.utils.data import DataLoader
import torch.nn.utils.rnn as rnn_utils

def load_vocab(filepath):
    with open(filepath, "rb") as f:
        vocab = pickle.load(f)
    return vocab

def collate_fn(batch):
    # Each batch element is assumed to be a tuple: (src, _)
    srcs, _ = zip(*batch)
    # Pad the source sequences; adjust padding_value as needed.
    padded_srcs = rnn_utils.pad_sequence(srcs, batch_first=True, 
                                         padding_value=src_vocab.stoi.get("<pad>", 0))
    return padded_srcs

def translate_batch(model, src_batch, src_vocab, trg_vocab, device, max_len=50):
    """
    Batch version of translation using the model's forward method.
    The model is assumed to take (src, trg) as input, where `trg` is the
    currently generated sequence. Greedy decoding is used.
    """
    batch_size = src_batch.size(0)
    src_batch = src_batch.to(device)
    
    # Initialize target sequences with <sos> token.
    trg_init = torch.LongTensor([trg_vocab.stoi["<sos>"]] * batch_size).unsqueeze(1).to(device)
    preds = trg_init

    for _ in range(max_len):
        # Call the model's forward method with the source and current target sequence.
        # The output should be of shape (batch_size, seq_len, vocab_size).
        output = model(src_batch, preds)
        
        # Get the logits for the last time step.
        next_token_logits = output[:, -1, :]
        # Greedy decoding: select the token with the highest probability.
        next_tokens = next_token_logits.argmax(dim=-1, keepdim=True)
        preds = torch.cat([preds, next_tokens], dim=1)
        
        # Check if every sequence in the batch generated an <eos> token.
        if (next_tokens == trg_vocab.stoi["<eos>"]).all():
            break

    # Convert predicted token indices to tokens, ignoring <sos> and stopping at <eos>.
    batch_tokens = []
    for seq in preds:
        tokens = []
        for token in seq:
            token = token.item()
            if token == trg_vocab.stoi["<sos>"]:
                continue
            if token == trg_vocab.stoi["<eos>"]:
                break
            tokens.append(trg_vocab.itos[token])
        batch_tokens.append(tokens)
    return batch_tokens

if __name__ == '__main__':
    test_file = "/kaggle/input/psu-to-code/test.csv"             
    model_path = "/kaggle/working/PsuToCode.pth"
    src_vocab_path = "/kaggle/working/src_vocab.pkl"
    trg_vocab_path = "/kaggle/working/trg_vocab.pkl"
    output_file = "test_predictions.txt"
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device:", device)
    
    src_vocab = load_vocab(src_vocab_path)
    trg_vocab = load_vocab(trg_vocab_path)
    
    # Initialize your model and load state dict.
    model = TransformerSeq2Seq(len(src_vocab.stoi), len(trg_vocab.stoi)).to(device)
    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict)
    model.eval()
    
    test_dataset = PseudocodeDataset(test_file, src_vocab, trg_vocab)
    test_dataloader = DataLoader(test_dataset, batch_size=32, 
                                 collate_fn=collate_fn, pin_memory=True)
    predictions = []
    
    with torch.no_grad():
        for src_batch in test_dataloader:
            # Perform batch translation on GPU using the updated translate_batch function.
            batch_pred_tokens = translate_batch(model, src_batch, src_vocab, trg_vocab, device)
            for tokens in batch_pred_tokens:
                predictions.append(" ".join(tokens))
    
    with open(output_file, "w") as f:
        for pred in predictions:
            f.write(pred + "\n")
    print("Predictions saved to", output_file)


Using device: cuda


  state_dict = torch.load(model_path, map_location=device)


Predictions saved to test_predictions.txt
