# Claude attempts

In [67]:
import os
import json
import torch
import torch.nn as nn
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import Dataset, DataLoader
import sentencepiece as spm


# Define the dataset class
class NLC2CMDDataset(Dataset):
    def __init__(self, data_path, tokenizer):
        self.data = self.load_data(data_path)
        self.tokenizer = tokenizer

    def load_data(self, data_path):
        with open(data_path, 'r') as file:
            data = json.load(file)
        return data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        item = self.data[index]
        nl = item['invocation']
        cmd = item['cmd']
        nl_tokens = self.tokenizer.encode(nl)
        cmd_tokens = self.tokenizer.encode(cmd)
        return torch.tensor(nl_tokens), torch.tensor(cmd_tokens)

# Define the Transformer model
class TransformerModel(pl.LightningModule):
    def __init__(self, vocab_size, d_model, nhead, num_layers, dropout):
        super().__init__()
        self.encoder_layer = TransformerEncoderLayer(d_model, nhead, dropout=dropout)
        self.transformer_encoder = TransformerEncoder(self.encoder_layer, num_layers)
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, src):
        src = self.embedding(src)
        output = self.transformer_encoder(src)
        output = self.fc(output)
        return output

    def training_step(self, batch, batch_idx):
        src, tgt = batch
        src = src.transpose(0, 1)
        tgt = tgt.transpose(0, 1)
        output = self(src)
        loss = nn.CrossEntropyLoss()(output.view(-1, output.size(-1)), tgt.contiguous().view(-1))
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        src, tgt = batch
        src = src.transpose(0, 1)
        tgt = tgt.transpose(0, 1)
        output = self(src)
        loss = nn.CrossEntropyLoss()(output.view(-1, output.size(-1)), tgt.contiguous().view(-1))
        self.log('val_loss', loss)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-4)

# Train the SentencePiece tokenizer
def train_tokenizer(data_path, model_prefix):
    with open(data_path, 'r') as file:
        data = json.load(file)
    text_data = [item['invocation'] + '\n' + item['cmd'] for item in data.values()]
    with open('temp_data.txt', 'w') as file:
        file.write('\n'.join(text_data))
    spm.SentencePieceTrainer.Train(
        f'--input=temp_data.txt --model_prefix={model_prefix} --vocab_size=-1 --character_coverage=1.0'
    )
    os.remove('temp_data.txt')

# Generate text from the trained model
def generate_text(model, tokenizer, input_text, max_length=100):
    model.eval()
    input_tokens = tokenizer.encode(input_text)
    input_tensor = torch.tensor(input_tokens).unsqueeze(0).transpose(0, 1)
    output = model(input_tensor)
    output_tokens = output.argmax(dim=-1).squeeze().tolist()
    generated_text = tokenizer.decode(output_tokens)
    return generated_text

# Main function
def main():
    # Set up the dataset and tokenizer
    data_path = 'data/nl2bash-data.json'
    tokenizer_model_prefix = 'nlc2cmd_tokenizer'
    train_tokenizer(data_path, tokenizer_model_prefix)
    tokenizer = spm.SentencePieceProcessor(model_file=f'{tokenizer_model_prefix}.model')
    dataset = NLC2CMDDataset(data_path, tokenizer)
    train_loader = DataLoader(dataset, batch_size=32, shuffle=True)

    # Set up the model and trainer
    d_model = 256
    nhead = 8
    num_layers = 4
    dropout = 0.1
    model = TransformerModel(vocab_size, d_model, nhead, num_layers, dropout)
    checkpoint_callback = ModelCheckpoint(dirpath='checkpoints', save_top_k=1, monitor='val_loss')
    trainer = pl.Trainer(max_epochs=10, callbacks=[checkpoint_callback])

    # Train the model
    trainer.fit(model, train_loader)

    # Generate text from the trained model
    input_text = "list all files in the current directory"
    generated_text = generate_text(model, tokenizer, input_text)
    print(f"Input: {input_text}")
    print(f"Generated: {generated_text}")

if __name__ == '__main__':
    main()

sentencepiece_trainer.cc(178) LOG(INFO) Running command: --input=temp_data.txt --model_prefix=nlc2cmd_tokenizer --vocab_size=-1 --character_coverage=1.0
sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: temp_data.txt
  input_format: 
  model_prefix: nlc2cmd_tokenizer
  model_type: UNIGRAM
  vocab_size: -1
  self_test_sample_size: 0
  character_coverage: 1
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
 

RuntimeError: Internal: /Users/runner/work/sentencepiece/sentencepiece/src/trainer_interface.cc(53) [(trainer_spec.vocab_size()) > (0)] 

# vanilla copiloat attempts

In [46]:
import torch
from torch import nn

class CharTransformer(nn.Module):
    def __init__(self, n_chars, d_model=512, nhead=8, num_layers=6):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(n_chars, d_model, device='mps')
        self.pos_encoder = PositionalEncoding(d_model)
        self.transformer = nn.Transformer(d_model, nhead, num_layers)
        self.fc = nn.Linear(d_model, n_chars)

    def forward(self, x, tgt):
        x = self.embedding(x.long()) * torch.sqrt(torch.tensor(self.d_model).to(x.device))
        x = self.pos_encoder(x)
        x = self.transformer(x, tgt)
        x = self.fc(x)
        return x

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(torch.log(torch.tensor(10000.0)) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return x
    
    
from numpy import dtype
import sentencepiece as spm
from torch.nn.utils.rnn import pad_sequence

# Load the trained tokenizer
sp = spm.SentencePieceProcessor()
sp.load('m.model')

# Load the dataset
with open('data/nl2bash-data.json') as f:
    data = json.load(f)

# Tokenize the 'invocation' and 'cmd' fields and convert them to integer sequences
inputs = [torch.Tensor(sp.encode_as_ids(item['invocation'])) for item in data.values()]
targets = [torch.Tensor(sp.encode_as_ids(item['cmd'])) for item in data.values()]

# Convert the lists of integer sequences to tensors
# Pad or truncate the sequences to a fixed length of 31
inputs = pad_sequence(inputs, batch_first=True, padding_value=0)
targets = pad_sequence(targets, batch_first=True, padding_value=0)


from curses import nl
from numpy import char
import pytorch_lightning as pl
from torch.nn import functional as F
import json
from torch.utils.data import TensorDataset, DataLoader

class CharTransformerModule(pl.LightningModule):
    def __init__(self, char_transformer, lr=1e-3):
        super().__init__()
        self.char_transformer = char_transformer
        self.lr = lr

    def forward(self, x):
        return self.char_transformer(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat.view(-1, y_hat.size(-1)), y.view(-1))
        self.log('train_loss', loss)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.lr)

class NLC2CMDDataModule(pl.LightningDataModule):
    def __init__(self, inputs, targets, batch_size=64):
        super().__init__()
        self.inputs = inputs
        self.targets = targets
        self.batch_size = batch_size

    def setup(self, stage=None):
        self.train_dataset = TensorDataset(self.inputs, self.targets)

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)


    
data_module = NLC2CMDDataModule(inputs, targets)

# Initialize the model
char_transformer = CharTransformer(n_chars=sp.get_piece_size())
model = CharTransformerModule(char_transformer)

# Initialize the trainer
trainer = pl.Trainer(max_epochs=10, accelerator='mps')

# Train the model
trainer.fit(model, data_module)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name             | Type            | Params
-----------------------------------------------------
0 | char_transformer | CharTransformer | 46.2 M
-----------------------------------------------------
46.2 M    Trainable params
0         Non-trainable params
46.2 M    Total params
184.762   Total estimated model params size (MB)
/opt/homebrew/anaconda3/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

TypeError: CharTransformer.forward() missing 1 required positional argument: 'tgt'

# working attempt

In [1]:
import json
import sentencepiece as spm

# # Load the dataset
# with open('data/nl2bash-data.json') as f:
#     data = json.load(f)

# # Extract the text and write it to a new file
# with open('data/nl2bash.txt', 'w') as f:
#     for item in data.values():
#         f.write(f"{item['invocation']} \t {item['cmd']}\n")

# # Train the tokenizer
# spm.SentencePieceTrainer.train('--input=data/nl2bash.txt --model_prefix=m --vocab_size=2000')

# # Load the trained tokenizer
# sp = spm.SentencePieceProcessor()
# sp.load('m.model')

In [5]:
import json
import math
from numpy import pad
import torch
from torch import nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader
import sentencepiece as spm
import pytorch_lightning as pl
from lightning.pytorch import loggers as pl_loggers

tb_logger = pl_loggers.TensorBoardLogger(save_dir="logs/")
# Load the dataset
with open('data/nl2bash-data.json') as f:
    data = json.load(f)

# # Extract the text and write it to a new file
# with open('text.txt', 'w') as f:
#     for item in data:
#         f.write(item['invocation'] + '\n')
#         f.write(item['cmd'] + '\n')

# # Train the tokenizer
# spm.SentencePieceTrainer.train('--input=text.txt --model_prefix=m --vocab_size=2000')

# Load the trained tokenizer
sp = spm.SentencePieceProcessor()
sp.load('m.model')

# Tokenize the 'invocation' and 'cmd' fields and convert them to integer sequences
inputs = [torch.tensor([sp.bos_id()]+sp.encode_as_ids(item['invocation'])+ [sp.eos_id()], dtype=torch.long) for item in data.values()]
targets = [torch.tensor([sp.bos_id()]+sp.encode_as_ids(item['cmd'])+ [sp.eos_id()], dtype=torch.long) for item in data.values()]

# Pad the sequences to a fixed length
max_length = max(max(len(x) for x in inputs), max(len(x) for x in targets))
inputs = pad_sequence([torch.cat([x, torch.zeros(max_length - len(x), dtype=torch.long)]) for x in inputs], padding_value=0, batch_first=True)
targets = pad_sequence([torch.cat([x, torch.zeros(max_length - len(x), dtype=torch.long)]) for x in targets], padding_value=0, batch_first=True)


# Create a PyTorch Dataset and DataLoader
dataset = TensorDataset(inputs, targets)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

class PositionalEncoding(nn.Module):
    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)
        
    def forward(self, token_embedding: torch.Tensor):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

# Define the Transformer model
class TransformerModel(pl.LightningModule):
    def __init__(self, n_chars, d_model=128, nhead=8, num_layers=6):
        super().__init__()
        self.d_model = d_model  # Store d_model as an attribute
        self.embedding = nn.Embedding(n_chars, d_model)
        self.transformer = nn.Transformer(d_model, nhead, num_layers)
        self.generator = nn.Linear(d_model, n_chars)
        self.pos_encoder = PositionalEncoding(d_model, 0.1)
    def forward(self, src, tgt=None):
        src_embedding = self.embedding(src) * math.sqrt(self.d_model)
        src_embedding = self.pos_encoder(src_embedding)
        trg_embedding = self.embedding(tgt) * math.sqrt(self.d_model)
        trg_embedding = self.pos_encoder(trg_embedding)
        output = self.transformer(src_embedding, trg_embedding)
        output = self.generator(output)
        return output
        


    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x, y)
        loss = nn.CrossEntropyLoss()(y_hat.view(-1, y_hat.size(-1)), y.view(-1))
        self.log('train_loss', loss)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-5)

# Initialize the model
model = TransformerModel(n_chars=sp.get_piece_size())

# Initialize the trainer
trainer = pl.Trainer(max_epochs=10, log_every_n_steps=10)

# Train the model
trainer.fit(model, dataloader)

AttributeError: module 'torch.nn' has no attribute 'Tensor'

In [24]:
sp = spm.SentencePieceProcessor()
sp.load('m.model')
sp.bos_id()


1

In [2]:
def generate_command(model, sp, text, max_length=15):
    # Tokenize the input text
    input_ids = torch.tensor([sp.encode_as_ids(text)], dtype=torch.long)

    # Move the input to the same device as the model
    input_ids = input_ids.to(next(model.parameters()).device)

    # Initialize the output sequence with the input
    output_ids = input_ids.clone()

    # Generate the output sequence one token at a time
    for i in range(max_length):
        # Get the model's predictions for the next token
        with torch.no_grad():
            predictions = model(output_ids)

        # Get the index of the predicted token
        predicted_id = torch.argmax(predictions[0, -1]).unsqueeze(0)

        # Append the predicted token to the output sequence
        output_ids = torch.cat([output_ids, predicted_id.unsqueeze(0)], dim=-1)

        # Stop generating if the end-of-sequence token is predicted
        if predicted_id == sp.eos_id():
            break

    # Decode the output sequence into a string
    command = sp.decode_ids(output_ids[0].tolist())

    return command

In [3]:
text = "list all files in the current directory"
command = generate_command(model, sp, text)
print(command)

list all files in the current directory ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇ 
