In [None]:
import sys
# Put the CLAI utils directory on sys.path so that the `bashlint` and `metric`
# imports in the next cell can be resolved.
PATH_TO_CLAI_UTILS = 'clai/utils/' ## YOUR CODE HERE ##
sys.path.append(PATH_TO_CLAI_UTILS)

In [None]:
import re
from collections import Counter
from functools import partial

import numpy as np
import pandas as pd
import sentencepiece as spm
import torch
import torch.nn as nn
from torch.functional import F
from torch.utils.data import Dataset, DataLoader

from bashlint.data_tools import bash_parser, pretty_print, cmd2template
from metric.metric_utils import compute_metric

Setting bashlex grammar using file: clai/utils/bashlint/grammar/grammar100.txt
Bashlint grammar set up (148 utilities)



In [None]:
# Load the training pairs; the preview shows a natural-language 'invocation'
# column and the matching bash 'cmd' column.
train_data = pd.read_csv('data/train.csv')
train_data.head()

Unnamed: 0,invocation,cmd
0,"copy loadable kernel module ""mymodule.ko"" to t...",sudo cp mymodule.ko /lib/modules/$(uname -r)/k...
1,"display all lines containing ""ip_mroute"" in th...",cat /boot/config-`uname -r` | grep IP_MROUTE
2,display current running kernel's compile-time ...,cat /boot/config-`uname -r`
3,"find all loadable modules for current kernel, ...",find /lib/modules/`uname -r` -regex .*perf.*
4,"look for any instance of ""highmem"" in the curr...",grep “HIGHMEM” /boot/config-`uname -r`


In [None]:
# Load the test pairs; compared with the training file there is an extra
# 'origin' column, which is used later to select the evaluation subset.
test_data = pd.read_csv('data/test.csv')
test_data.head()

Unnamed: 0,invocation,cmd,origin
0,create ssh connection to specified ip from spe...,ssh user123@176.0.13.154,handcrafted
1,"search for commands containing string ""zeppeli...",history | grep zeppelin,handcrafted
2,search for location of specified file or appli...,whereis python3,handcrafted
3,grant all rights to root folder,sudo chmod 777 -R /,handcrafted
4,search in running processes for specified name,ps -aux | grep zepp,handcrafted


In [None]:
def clean_text(text):
    """Normalize a natural-language invocation before tokenization.

    The steps are applied in this order:

    1. lower-case the text;
    2. drop the apostrophe of a possessive ``'s`` (``kernel's`` -> ``kernels``);
    3. turn single quotes and backticks into double quotes;
    4. strip leading and trailing periods.

    Parameters
    ----------
    text : str
        Raw invocation.

    Returns
    -------
    str
        The normalized invocation.
    """
    text = text.lower()
    # Treat "'s" as a possessive only when it directly follows a word character
    # and ends the word. A blanket str.replace would also swallow the opening
    # quote of quoted tokens that start with "s" ('sample' -> sample").
    text = re.sub(r"(?<=\w)'s\b", "s", text)
    text = text.replace("'", '"').replace("`", '"')  # all quotes to double "
    text = text.strip('.')
    return text

In [None]:
# Spot-check the cleaner on one randomly drawn invocation. No random_state is
# passed to .sample(), so a different row is shown on every run.
text = train_data['invocation'].sample(1).iloc[0]
print(text)
print(clean_text(text))

find all the files in the current directory which end with orig
find all the files in the current directory which end with orig


In [None]:
# Store the normalized invocations in a new 'text_cleaned' column; the raw
# 'invocation' column is left untouched.
train_data['text_cleaned'] = train_data['invocation'].apply(clean_text)
test_data['text_cleaned'] = test_data['invocation'].apply(clean_text)

In [None]:
# Convert every raw command with cmd2template (loose_constraints=True) and keep
# the result in 'cmd_cleaned'. Judging by the sample output later in the
# notebook, concrete arguments become placeholders such as Path / Regex.
train_data['cmd_cleaned'] = train_data['cmd'].apply(partial(cmd2template, loose_constraints=True))
test_data['cmd_cleaned'] = test_data['cmd'].apply(partial(cmd2template, loose_constraints=True))

In [None]:
# Hold out the last 100 rows for validation and train on everything before them.
# NOTE(review): train_data is rebound to its own slice, so re-running this cell
# removes another 100 rows each time -- run it exactly once per kernel session.
valid_data = train_data.iloc[-100:]
train_data = train_data.iloc[:-100]

In [None]:
## YOUR CODE HERE ###
import io
import sentencepiece as spm

In [None]:
# Preview the cleaned invocations that the text tokenizer is trained on below.
train_data['text_cleaned'].head()

0    copy loadable kernel module "mymodule.ko" to t...
1    display all lines containing "ip_mroute" in th...
2    display current running kernels compile-time c...
3    find all loadable modules for current kernel, ...
4    look for any instance of "highmem" in the curr...
Name: text_cleaned, dtype: object

In [None]:
def train_sentencepiece(series, name, vocab_size):
    """Fit a SentencePiece model, save it to disk and return a loaded processor.

    Parameters
    ----------
    series : pandas.Series
        Training sentences; consumed through ``series.values.tolist()``.
    name : str
        Basename of the output file; the model is written to ``'<name>.model'``
        in the current working directory.
    vocab_size : int
        Vocabulary size handed to the SentencePiece trainer.

    Returns
    -------
    sentencepiece.SentencePieceProcessor
        Processor built from the freshly trained model, with the special pieces
        fixed to <PAD>=0, <BOS>=1, <EOS>=2 and <UNK>=3.
    """
    buffer = io.BytesIO()
    spm.SentencePieceTrainer.train(
        sentence_iterator=iter(series.values.tolist()),
        model_writer=buffer,
        vocab_size=vocab_size,
        pad_id=0, pad_piece='<PAD>',
        bos_id=1, bos_piece='<BOS>',
        eos_id=2, eos_piece='<EOS>',
        unk_id=3, unk_piece='<UNK>',
    )
    serialized_model = buffer.getvalue()

    # Keep a copy on disk so the tokenizer can be reloaded without retraining.
    with open(f'{name}.model', 'wb') as model_file:
        model_file.write(serialized_model)

    return spm.SentencePieceProcessor(model_proto=serialized_model)

In [None]:
# One tokenizer per side: 2500 pieces for the invocations, 128 for the command
# templates. Each call also writes '<name>.model' to the working directory.
text_tokenizer = train_sentencepiece(train_data['text_cleaned'], 'text_tokenizer', 2500)
cmd_tokenizer = train_sentencepiece(train_data['cmd_cleaned'], 'cmd_tokenizer', 128)

In [None]:
# Special-token ids; these mirror the pad_id / bos_id / eos_id values given to
# the SentencePiece trainer in train_sentencepiece.
PAD_ID = 0
BOS_ID = 1
EOS_ID = 2


# Truncation limits, in tokens, applied by TextToBashDataset. The command limit
# includes the BOS and EOS markers that the dataset adds.
MAX_TEXT_LENGTH = 256
MAX_CODE_LENGTH = 40

# Batch size used by both the training and the validation DataLoader.
BATCH_SIZE = 64

In [None]:
from torch.nn.utils.rnn import pad_sequence

class TextToBashDataset(Dataset):
    """Dataset of (invocation token ids, command token ids) tensor pairs.

    Parameters
    ----------
    text, cmd : pandas.Series
        Aligned source texts and target commands; rows are read with ``.iloc``.
    text_tokenizer, cmd_tokenizer
        Objects exposing ``encode(str) -> list[int]`` for the respective side.
    """

    def __init__(self, text, cmd, text_tokenizer, cmd_tokenizer):
        self.text, self.cmd = text, cmd
        self.text_tokenizer, self.cmd_tokenizer = text_tokenizer, cmd_tokenizer

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        """Return the ``idx``-th pair as two 1-D ``torch.long`` tensors.

        The source is truncated to MAX_TEXT_LENGTH tokens. The target is
        truncated so that, once wrapped in BOS ... EOS, it holds at most
        MAX_CODE_LENGTH tokens.
        """
        source_ids = self.text_tokenizer.encode(self.text.iloc[idx])[:MAX_TEXT_LENGTH]
        # Two slots are reserved for the BOS / EOS markers.
        target_body = self.cmd_tokenizer.encode(self.cmd.iloc[idx])[:MAX_CODE_LENGTH - 2]
        target_ids = [BOS_ID, *target_body, EOS_ID]
        return (
            torch.tensor(source_ids, dtype=torch.long),
            torch.tensor(target_ids, dtype=torch.long),
        )

class PaddingCollator:
    """DataLoader ``collate_fn`` that pads (text_ids, cmd_ids) pairs with PAD_ID.

    Parameters
    ----------
    batch_first : bool
        Forwarded to ``pad_sequence``; selects whether the batch dimension
        comes first in the returned tensors.
    """

    def __init__(self, batch_first=True):
        self.batch_first = batch_first

    def __call__(self, batch):
        """Pad every text and every command to the longest one in the batch.

        Args:
            batch: list of ``(text_ids, cmd_ids)`` tuples of 1-D torch.tensors

        Returns:
            padded_texts: torch.tensor
            padded_cmds: torch.tensor
                The two sides are padded independently of each other, so
                their sequence lengths generally differ.
        """
        texts, cmds = [], []
        for sample in batch:
            texts.append(sample[0])
            cmds.append(sample[1])

        def _pad(sequences):
            return pad_sequence(sequences, batch_first=self.batch_first, padding_value=PAD_ID)

        return _pad(texts), _pad(cmds)

In [None]:
## YOUR CODE HERE ###
# Training and validation datasets share the same pair of tokenizers; only the
# source DataFrame differs.
train_ds = TextToBashDataset(
    text=train_data['text_cleaned'],
    cmd=train_data['cmd_cleaned'],
    text_tokenizer=text_tokenizer,
    cmd_tokenizer=cmd_tokenizer,
)
valid_ds = TextToBashDataset(
    text=valid_data['text_cleaned'],
    cmd=valid_data['cmd_cleaned'],
    text_tokenizer=text_tokenizer,
    cmd_tokenizer=cmd_tokenizer,
)

In [None]:
# DataLoaders keyed by split. Only the training loader shuffles; both pad each
# batch through PaddingCollator.
loaders = {
    'train': DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=PaddingCollator()),
    'valid': DataLoader(valid_ds, batch_size=BATCH_SIZE, collate_fn=PaddingCollator()),
}

In [None]:
from transformers import BertConfig, BertModel, EncoderDecoderConfig, EncoderDecoderModel

In [None]:
# Both dicts are passed to BertConfig.from_dict, so every key has to be an
# actual BertConfig field name. The vocabulary size is 'vocab_size' and the
# padding id is 'pad_token_id'. Other spellings do not set those fields, and
# the model would then be built with BERT's default 30522-entry vocabulary
# instead of the tokenizer's.

# Encoder side: reads the natural-language invocation.
text_model_config = {
    'vocab_size': text_tokenizer.vocab_size(),
    'hidden_size': 256,
    'num_hidden_layers': 2,
    'num_attention_heads': 8,
    'intermediate_size': 256 * 4,
    'hidden_dropout_prob': 0.1,
    'pad_token_id': PAD_ID,
}

# Decoder side: generates the bash command template.
cmd_model_config = {
    'vocab_size': cmd_tokenizer.vocab_size(),
    'hidden_size': 256,
    'num_hidden_layers': 2,
    'num_attention_heads': 8,
    'intermediate_size': 256 * 4,
    'hidden_dropout_prob': 0.1,
    'pad_token_id': PAD_ID,
}

In [None]:
class TextToBashModel(nn.Module):
    """BERT encoder-decoder that maps an invocation to a bash command template.

    Parameters
    ----------
    text_model_config : dict
        Settings for the encoder, passed to ``BertConfig.from_dict``.
    cmd_model_config : dict
        Settings for the decoder, passed to ``BertConfig.from_dict``.
    """

    def __init__(self, text_model_config, cmd_model_config):
        super(TextToBashModel, self).__init__()
        encoder_config = BertConfig.from_dict(text_model_config)
        decoder_config = BertConfig.from_dict(cmd_model_config)
        self.config = EncoderDecoderConfig.from_encoder_decoder_configs(encoder_config, decoder_config)
        self.model = EncoderDecoderModel(self.config)

    def forward(self, input_ids, decoder_input_ids):
        """Run a teacher-forced pass and return ``(loss, logits)``.

        Parameters
        ----------
        input_ids : torch.Tensor
            Padded source token ids.
        decoder_input_ids : torch.Tensor
            Padded target token ids; also used to build the labels.

        Returns
        -------
        tuple
            ``(out.loss, out.logits)`` as produced by the wrapped model.
        """
        # Padding positions are masked out on both the encoder and decoder side.
        attention_mask = (input_ids != PAD_ID).int()
        decoder_attention_mask = (decoder_input_ids != PAD_ID).int()

        # -100 is the ignore index of the cross-entropy loss, so padded target
        # positions do not contribute to it.
        # NOTE(review): the labels are the unshifted decoder inputs, which
        # relies on the installed transformers version shifting them inside the
        # decoder LM head -- confirm before upgrading transformers.
        labels = decoder_input_ids.detach().clone()
        labels[labels == PAD_ID] = -100

        out = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            labels=labels,
        )
        return out.loss, out.logits

In [None]:
from tqdm.auto import tqdm
from torch.utils.tensorboard import SummaryWriter

In [None]:
## YOUR CODE HERE ##
# Build the encoder-decoder and optimise it with Adam; no hyper-parameters are
# passed, so the optimizer runs with PyTorch's defaults.
model = TextToBashModel(text_model_config, cmd_model_config)
optimizer = torch.optim.Adam(model.parameters())

# TensorBoard writer created with default arguments.
# NOTE(review): presumably logs under ./runs, which the %tensorboard cell reads -- confirm.
writer = SummaryWriter()

In [None]:
NUM_EPOCHS = 8  # chosen empirically: with more epochs the validation loss stops decreasing
LOG_EVERY = 10    # log the training loss every N optimisation steps
EVAL_EVERY = 100  # run a full validation pass every N optimisation steps
device = 'cuda' if torch.cuda.is_available() else 'cpu'
global_step = 0


model.to(device)
model.train()
for epoch in range(NUM_EPOCHS):
    train_iter = tqdm(loaders['train'])
    for input_ids, decoder_input_ids in train_iter:
        input_ids, decoder_input_ids = input_ids.to(device), decoder_input_ids.to(device)

        model.zero_grad()
        loss, _ = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
        loss.backward()
        optimizer.step()

        if (global_step + 1) % LOG_EVERY == 0:
            train_iter.set_description('loss = {}'.format(loss.item()))
            writer.add_scalar('loss', loss.item(), global_step=global_step)

        if (global_step + 1) % EVAL_EVERY == 0:
            model.eval()
            with torch.no_grad():
                # Validation uses its own variable names so the training batch
                # and training loss above are not overwritten.
                val_loss_sum = 0.0
                for val_input_ids, val_decoder_input_ids in tqdm(loaders['valid'], desc='validation', leave=False):
                    val_input_ids = val_input_ids.to(device)
                    val_decoder_input_ids = val_decoder_input_ids.to(device)
                    val_loss, _ = model(input_ids=val_input_ids, decoder_input_ids=val_decoder_input_ids)
                    val_loss_sum += val_loss.item()
                # Mean of the per-batch mean losses.
                writer.add_scalar('val_loss', val_loss_sum / len(loaders['valid']), global_step=global_step)
            # Re-enable dropout; otherwise every step after the first
            # validation pass would train in eval mode.
            model.train()

        global_step += 1

# Training is finished: switch to inference mode for the generation and scoring
# cells below, and make sure all logged scalars reach disk.
model.eval()
writer.flush()


HBox(children=(FloatProgress(value=0.0, max=154.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='validation', max=2.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, max=154.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='validation', max=2.0, style=ProgressStyle(description_wid…

HBox(children=(FloatProgress(value=0.0, description='validation', max=2.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, max=154.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='validation', max=2.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, max=154.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='validation', max=2.0, style=ProgressStyle(description_wid…

HBox(children=(FloatProgress(value=0.0, description='validation', max=2.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, max=154.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='validation', max=2.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, max=154.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='validation', max=2.0, style=ProgressStyle(description_wid…

HBox(children=(FloatProgress(value=0.0, description='validation', max=2.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, max=154.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='validation', max=2.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, max=154.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='validation', max=2.0, style=ProgressStyle(description_wid…

HBox(children=(FloatProgress(value=0.0, description='validation', max=2.0, style=ProgressStyle(description_wid…




In [None]:
# Show the logged training / validation curves inline from the 'runs' directory.
%load_ext tensorboard
%tensorboard --logdir runs

In [None]:
class BeamSearchGenerator:
    """Beam-search decoder for a ``TextToBashModel``."""

    def __init__(
            self, pad_id, eos_id, bos_id,
            max_length=20, beam_width=5, temperature=1,
            device='cuda',
    ):
        """
        Parameters
        ----------
        pad_id : int
        eos_id : int
        bos_id : int
        max_length : int
            Maximum number of decoding steps (tokens generated after BOS)
        beam_width : int
            Width of the beam
        temperature : float
            Softmax temperature; the logits are divided by it before the
            log-softmax
        device : torch.device
            Your model device
        """
        self.pad_id = pad_id
        self.eos_id = eos_id
        self.bos_id = bos_id

        self.max_length = max_length
        self.beam_width = beam_width
        self.temperature = temperature

        self.device = device

    @torch.no_grad()
    def get_result(self, model, input_text_tokens):
        """Decode one input with beam search.

        Parameters
        ----------
        model : TextToBashModel
        input_text_tokens : torch.tensor
            One object input tensor (1-D token ids of a single invocation)

        Returns
        -------
        list
            ``[token_ids, score]`` pairs sorted by decreasing score.
            ``token_ids`` is a numpy array that starts with BOS and does not
            contain EOS. ``score`` is the accumulated log-probability divided
            by ``len(token_ids)``. Hypotheses that ended with EOS come first in
            the unsorted list, followed by the beams still open after
            ``max_length`` steps.
        """
        beam_width = self.beam_width
        device = self.device

        # Every beam starts as a lone BOS token with log-probability 0.
        beam_scores = torch.zeros([beam_width], device=device)
        beam_tokens = torch.full([beam_width, 1], self.bos_id, dtype=torch.long, device=device)
        decoded_sequences = []

        # Encode the source once, replicated per beam; the returned kwargs
        # carry the encoder outputs that every decoder call reuses.
        model_kwargs = {'output_hidden_states': True}
        model_kwargs = model.model._prepare_encoder_decoder_kwargs_for_generation(
            input_ids=input_text_tokens.repeat([beam_width, 1]).to(device),
            model_kwargs=model_kwargs
        )  # output_hidden_states, encoder_outputs

        for step in range(self.max_length):
            outputs = model.model.forward(decoder_input_ids=beam_tokens, **model_kwargs)
            # Normalise over the whole vocabulary so that scores of different
            # beams are comparable log-probabilities.
            log_probs = F.log_softmax(outputs.logits[:, -1, :] / self.temperature, dim=-1)

            prev_scores = beam_scores
            if step == 0:
                # All beams are identical at the start; expanding only the
                # first one avoids duplicated hypotheses.
                log_probs, prev_scores = log_probs[:1], beam_scores[:1]

            # beam_width + 1 continuations per beam: at most one of them is
            # EOS, so enough open candidates always remain to refill the beam.
            num_candidates = min(beam_width + 1, log_probs.size(-1))
            cand_log_probs, cand_tokens = torch.topk(log_probs, num_candidates, dim=-1)
            cand_scores = (prev_scores[:, None] + cand_log_probs).reshape(-1)
            cand_tokens = cand_tokens.reshape(-1)
            # Index of the beam each flattened candidate extends.
            cand_parents = torch.arange(log_probs.size(0), device=device).repeat_interleave(num_candidates)

            cand_scores, order = torch.sort(cand_scores, descending=True)
            cand_tokens, cand_parents = cand_tokens[order], cand_parents[order]

            is_eos = cand_tokens == self.eos_id
            # Number of open (non-EOS) candidates ranked at or above each position.
            live_rank = (~is_eos).long().cumsum(dim=0)
            # The best `beam_width` open candidates become the next beams.
            is_live = ~is_eos & (live_rank <= beam_width)
            # An EOS candidate finishes a hypothesis if it outranks the last
            # surviving beam.
            is_finished = is_eos & (live_rank < beam_width)

            # An EOS directly after BOS would be an empty command, so
            # hypotheses are only recorded from the second step on.
            if step > 0:
                finished_sequences = beam_tokens[cand_parents[is_finished]]
                finished_scores = cand_scores[is_finished]
                for seq, score in zip(finished_sequences, finished_scores):
                    decoded_sequences.append([seq.cpu().numpy(), score.item() / len(seq)])

            if int(is_live.sum()) < beam_width:
                # Not enough continuations to keep the beam full (only possible
                # with a vocabulary smaller than the beam); stop expanding.
                break

            # Reorder the prefixes by parent beam before appending new tokens,
            # so that each hypothesis stays consistent with its score.
            beam_scores = cand_scores[is_live]
            beam_tokens = torch.cat(
                [beam_tokens[cand_parents[is_live]], cand_tokens[is_live][:, None]], dim=1)

        # Beams still open after the last step are returned as well.
        for seq, score in zip(beam_tokens, beam_scores):
            decoded_sequences.append([seq.cpu().numpy(), score.item() / len(seq)])

        decoded_sequences = sorted(decoded_sequences, key=lambda x: -x[1])
        return decoded_sequences

In [None]:
# Use the same device the model was moved to in the training cell, so the
# notebook also runs on machines without a GPU.
beam_search_enginge = BeamSearchGenerator(
    pad_id=PAD_ID, eos_id=EOS_ID, bos_id=BOS_ID,
    max_length=MAX_CODE_LENGTH, beam_width=5,
    temperature=1, device=device,
)

In [None]:
# Qualitative check on the first five validation rows: print the reference,
# the five best hypotheses with their beam scores, and the best metric value.
with torch.no_grad():
    for sample_idx in range(5):
        reference_cmd = valid_data.cmd.iloc[sample_idx]

        print()
        print('text:', valid_data.invocation.iloc[sample_idx])
        print('true:', reference_cmd)
        print('true cleaned:', valid_data.cmd_cleaned.iloc[sample_idx])

        source_ids = valid_ds[sample_idx][0]
        hypotheses = beam_search_enginge.get_result(model, source_ids)

        hypothesis_scores = []
        for token_ids, beam_score in hypotheses[:5]:
            decoded_cmd = cmd_tokenizer.decode([int(token) for token in token_ids])
            hypothesis_scores.append(compute_metric(decoded_cmd, 1, reference_cmd))
            print(decoded_cmd, beam_score)
        print(max(hypothesis_scores))


text: searches through the root filesystem ("/") for the file named chapter1, and prints the location
true: find / -name Chapter1 -type f -print
true cleaned: find Path -name Regex -type f -print
find Path -name Regex -type f -0.07968467932481033
find Path -name Regex -0.10887665748596191
find Path -name Regex -type f -print -0.1808021068572998
find Path -name Regex -type f -print0Fileus Regex -0.26481080055236816
find Path -name Regex -type f -print0 -0.28794509172439575
1.0

text: searches through the root filesystem ("/") for the file named chapter1.
true: find / -name Chapter1 -type f
true cleaned: find Path -name Regex -type f
find Path -name Regex -type f -0.09823213173792912
find Path -name Regex -type f -print -0.11151231129964193
find Path -name Regex -0.15206730365753174
find Path -name Regex -type f -print | xargs -I {} wcegex {} -0.2382312704015661
sunci find PathfiRegex -0.26563410758972167
1.0

text: searches through the root filesystem ("/") for the file named chapter1.

In [None]:
def compute_all_scores(model, df, beam_engine):
    """Score every row of `df` by the best of its top-5 beam-search hypotheses.

    Parameters
    ----------
    model : TextToBashModel
        Trained model; inputs are placed on ``model.model.device``.
    df : pandas.DataFrame
        Must provide the columns ``text_cleaned`` (model input) and ``cmd``
        (reference command passed to the metric).
    beam_engine : BeamSearchGenerator
        Generator used to decode each input.

    Returns
    -------
    list
        One value per row: the maximum ``compute_metric`` result over the five
        highest-ranked hypotheses.
    """
    all_scores = []
    model_device = model.model.device

    text_target_pairs = zip(df.text_cleaned.values, df.cmd.values)
    for text, target_cmd in tqdm(text_target_pairs, total=len(df)):
        input_tokens = torch.tensor(text_tokenizer.encode(text), dtype=torch.long).to(model_device)
        # Decode with the engine supplied by the caller, not a notebook global.
        predictions = beam_engine.get_result(model, input_tokens)

        # get only 5 top results
        predictions = predictions[:5]
        object_scores = []
        for output_tokens, _ in predictions:
            output_cmd = cmd_tokenizer.decode(list(map(int, output_tokens)))
            score = compute_metric(output_cmd, 1, target_cmd)
            object_scores.append(score)

        all_scores.append(max(object_scores))
    return all_scores

In [None]:
# Best-of-5 metric for every held-out validation row.
val_scores = compute_all_scores(model, valid_data, beam_search_enginge)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




In [None]:
# Average validation score.
np.mean(val_scores)

0.02030555555555555

In [None]:
# Best-of-5 metric on the test rows whose 'origin' is 'handcrafted' only.
test_scores = compute_all_scores(model, test_data[test_data['origin'] == 'handcrafted'], beam_search_enginge)

HBox(children=(FloatProgress(value=0.0, max=129.0), HTML(value='')))




In [None]:
# Average score on the handcrafted test subset.
np.mean(test_scores)

0.007988802756244626