In [1]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers
from datasets import load_dataset
import json

# Load dataset from JSON file
def load_json_dataset(json_file):
    """Read a UTF-8 encoded JSON file and return its parsed content.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        Whatever object the JSON document decodes to (the rest of this
        notebook iterates over it as a list of records).
    """
    with open(json_file, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

data = load_json_dataset("dataset.json")

# Extract pseudocode and C++ code
pseudocode_texts = [item["pseudocode"] for item in data]
code_texts = [item["code"] for item in data]

# Initialize BPE tokenizer.
# The unknown token has to be declared on the model itself: listing "<unk>"
# among the trainer's special tokens only reserves an id for it. Without
# `unk_token` the model has no fallback for symbols missing from the
# vocabulary when encoding later text.
tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Train one shared vocabulary on both pseudocode and code.
# "<pad>" is listed first on purpose: the tokenization cell pads with
# pad_id=0 and the training cell ignores index 0 in the loss.
trainer = trainers.BpeTrainer(
    vocab_size=8000,
    min_frequency=5,
    special_tokens=["<pad>", "<sos>", "<eos>", "<unk>"],
)
tokenizer.train_from_iterator(pseudocode_texts + code_texts, trainer)

# Save the tokenizer
tokenizer.save("bpe_tokenizer.json")

print("Tokenizer trained and saved as 'bpe_tokenizer.json'")


  from .autonotebook import tqdm as notebook_tqdm


Tokenizer trained and saved as 'bpe_tokenizer.json'


In [2]:
from tokenizers import Tokenizer
import json

# Restore the BPE tokenizer written to disk by the training cell
tokenizer = Tokenizer.from_file("bpe_tokenizer.json")

# Register "<pad>" (id 0) as the padding token and cap every encoding at
# 256 tokens. No fixed padding length is requested here.
tokenizer.enable_padding(pad_id=0, pad_token="<pad>")
tokenizer.enable_truncation(max_length=256)

# Read the raw pseudocode/code records
with open("dataset.json", "r", encoding="utf-8") as source_file:
    dataset = json.load(source_file)

# Encode each record: pseudocode ids become "input", code ids become "output"
tokenized_dataset = [
    {
        "input": tokenizer.encode(record["pseudocode"]).ids,
        "output": tokenizer.encode(record["code"]).ids,
    }
    for record in dataset
]

# Persist the id sequences so later cells do not need the tokenizer
with open("tokenized_dataset.json", "w", encoding="utf-8") as target_file:
    json.dump(tokenized_dataset, target_file)

print("Tokenized dataset saved successfully!")


Tokenized dataset saved successfully!


In [3]:
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, TensorDataset

# Fixed sequence length shared by inputs and targets. It matches the
# truncation limit (max_length=256) applied when the dataset was tokenized.
MAX_LEN = 256
# Id of "<pad>": the tokenization cell pads with pad_id=0.
PAD_ID = 0


def pad_to_length(sequences, length=MAX_LEN, pad_value=PAD_ID):
    """Stack 1-D id tensors into one (num_sequences, length) tensor.

    `pad_sequence` alone only pads up to the longest sequence in the list,
    so inputs and targets padded separately can end up with different
    widths. Right-padding (or cutting) to an explicit `length` guarantees
    both tensors line up position by position for the loss.

    Args:
        sequences: Non-empty list of 1-D tensors of token ids.
        length: Width of the returned tensor.
        pad_value: Id used to fill the padded positions.

    Returns:
        Tensor of shape (len(sequences), length).
    """
    padded = pad_sequence(sequences, batch_first=True, padding_value=pad_value)
    padded = padded[:, :length]
    missing = length - padded.size(1)
    if missing > 0:
        padded = torch.nn.functional.pad(padded, (0, missing), value=pad_value)
    return padded


# Load tokenized dataset
with open("tokenized_dataset.json", "r", encoding="utf-8") as f:
    tokenized_dataset = json.load(f)

if not tokenized_dataset:
    raise ValueError("tokenized_dataset.json contains no examples")

# Convert lists to tensors. The dtype is explicit because torch.tensor([])
# would otherwise produce a float tensor for an empty token list.
input_ids = [torch.tensor(d["input"], dtype=torch.long) for d in tokenized_dataset]
output_ids = [torch.tensor(d["output"], dtype=torch.long) for d in tokenized_dataset]

# Pad inputs and targets to the same fixed length (MAX_LEN)
padded_inputs = pad_to_length(input_ids)
padded_outputs = pad_to_length(output_ids)

# Create dataset
dataset = TensorDataset(padded_inputs, padded_outputs)

# Pinned host memory only helps host-to-GPU copies, so request it only
# when CUDA is actually available.
dataloader = DataLoader(
    dataset,
    batch_size=4,
    shuffle=True,
    pin_memory=torch.cuda.is_available(),
)

print("Padded dataset successfully created!")


Padded dataset successfully created!


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.cuda.amp import autocast, GradScaler

# Define Transformer Model
class CustomTransformer(nn.Module):
    """Encoder-only transformer that maps token ids to per-position vocabulary logits.

    Args:
        vocab_size: Number of token ids; also the size of the output logits.
        embed_dim: Embedding / model dimension.
        hidden_dim: Width of the feed-forward sublayer in each encoder layer.
        num_heads: Number of attention heads.
        num_layers: Number of stacked encoder layers.
        pad_idx: Token id used for padding. Its embedding is fixed to zero and
            positions holding it are hidden from attention.

    NOTE(review): no positional encoding is applied, so the encoder has no
    notion of token order — consider adding one.
    """

    def __init__(self, vocab_size=8000, embed_dim=128, hidden_dim=256, num_heads=4, num_layers=2, pad_idx=0):
        super().__init__()
        self.pad_idx = pad_idx
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        # batch_first=True: batches arrive as (batch, seq). With the default
        # (seq, batch) layout attention would mix different samples of the
        # batch instead of the tokens of one sequence.
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=hidden_dim,
            batch_first=True,
        )
        # Nested tensors are disabled so the output always keeps the input's
        # sequence length, even when a padding mask is supplied.
        self.encoder = nn.TransformerEncoder(
            self.encoder_layer,
            num_layers=num_layers,
            enable_nested_tensor=False,
        )
        self.decoder = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        """Compute logits for a batch of token ids.

        Args:
            x: Integer tensor of shape (batch, seq) holding token ids.

        Returns:
            Tensor of shape (batch, seq, vocab_size) with unnormalized logits.
        """
        # True marks padded positions that attention must ignore.
        pad_mask = x.eq(self.pad_idx)
        # Keep the first position visible so a row made only of padding
        # still has one key to attend to (a fully masked row yields NaN).
        pad_mask[:, :1] = False
        x = self.embedding(x)
        x = self.encoder(x, src_key_padding_mask=pad_mask)
        return self.decoder(x)


# Initialize Model on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CustomTransformer().to(device)

# Loss and Optimizer.
# ignore_index=0 excludes padded target positions (pad id 0) from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Gradient scaling is only meaningful for CUDA mixed precision; on CPU the
# scaler is created disabled so it acts as a plain pass-through.
scaler = GradScaler(enabled=device.type == "cuda")

def train(model, dataloader, epochs=5):
    """Train `model` on `dataloader` and report the mean batch loss per epoch.

    Uses the module-level `device`, `optimizer`, `criterion` and `scaler`.

    Args:
        model: Module returning logits of shape (batch, seq, vocab).
        dataloader: Iterable yielding (inputs, targets) id tensors of shape
            (batch, seq).
        epochs: Number of passes over `dataloader`.

    Returns:
        List with the mean batch loss of every epoch.

    Raises:
        ValueError: If the logits and targets disagree on (batch, seq).
    """
    # Mixed precision is only used on CUDA; on CPU autocast is a no-op here.
    use_amp = device.type == "cuda"
    model.train()
    epoch_losses = []
    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()

            with autocast(enabled=use_amp):  # Mixed Precision
                outputs = model(inputs)
                if outputs.shape[:-1] != targets.shape:
                    raise ValueError(
                        f"Model output shape {tuple(outputs.shape[:-1])} does not match "
                        f"target shape {tuple(targets.shape)}; inputs and targets must be "
                        "padded to the same sequence length."
                    )
                # Flatten to (batch*seq, vocab) vs (batch*seq,). The vocabulary
                # size is read from the logits instead of being hard-coded.
                loss = criterion(outputs.reshape(-1, outputs.size(-1)), targets.reshape(-1))

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            total_loss += loss.item()
            num_batches += 1
        # Report the mean so the number does not scale with the dataset size.
        mean_loss = total_loss / max(num_batches, 1)
        epoch_losses.append(mean_loss)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {mean_loss:.4f}")
    return epoch_losses

# Run the training loop with its default number of epochs
train(model, dataloader)

# Persist only the learned parameters (the state dict), not the module object
trained_weights = model.state_dict()
torch.save(trained_weights, "transformer_model.pth")
print("Model saved successfully!")
