In [2]:
import torch
import torch.nn as nn
import numpy as np
import pickle
from fastai.text.all import *

In [3]:
if torch.cuda.is_available():
  device = "cuda:0"
else:
  device = "cpu"
device

'cpu'

In [6]:
def scaled_dot_product_attention(query, key, value, mask=None):
    dim_k = query.size(-1)
    scores = torch.bmm(query, key.transpose(1, 2)) / np.sqrt(dim_k)
    if mask is not None:
        if scores.shape[1] == mask.shape[1]:
            scores = scores.masked_fill(mask == 0, float("-inf"))
        else:
            mask = torch.tril(torch.ones(scores.shape[1], scores.shape[1])).unsqueeze(0).to(device)
            scores = scores.masked_fill(mask == 0, float("-inf"))
    weights = F.softmax(scores, dim=-1)
    return weights.bmm(value)

class AttentionHead(nn.Module):
    def __init__(self, embed_dim, head_dim, vocab_size):
        super().__init__()
        self.q = nn.Linear(embed_dim, head_dim)
        self.k = nn.Linear(embed_dim, head_dim)
        self.v = nn.Linear(embed_dim, head_dim)
        self.mask = torch.tril(torch.ones(vocab_size, vocab_size)).unsqueeze(0).to(device)

    def forward(self, hidden_state):
        attn_outputs = scaled_dot_product_attention(
            self.q(hidden_state), self.k(hidden_state), self.v(hidden_state), self.mask)
        return attn_outputs

class MultiHeadAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        embed_dim = config.hidden_size
        num_heads = config.num_attention_heads
        head_dim = embed_dim // num_heads
        self.heads = nn.ModuleList(
            [AttentionHead(embed_dim, head_dim, 72) for _ in range(num_heads)]
        )
        self.output_linear = nn.Linear(embed_dim, embed_dim)

    def forward(self, hidden_state):
        x = torch.cat([h(hidden_state) for h in self.heads], dim=-1)
        x = self.output_linear(x)
        return x

class FeedForward(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.linear_1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.linear_2 = nn.Linear(config.intermediate_size, config.hidden_size)
        self.gelu = nn.GELU()
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        
    def forward(self, x):
        x = self.linear_1(x)
        x = self.gelu(x)
        x = self.linear_2(x)
        x = self.dropout(x)
        return x

class TransformerEncoderLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.layer_norm_1 = nn.LayerNorm(config.hidden_size)
        self.layer_norm_2 = nn.LayerNorm(config.hidden_size)
        self.attention = MultiHeadAttention(config)
        self.feed_forward = FeedForward(config)

    def forward(self, x):
        # Apply layer normalization and then copy input into query, key, value
        hidden_state = self.layer_norm_1(x)
        # Apply attention with a skip connection
        x = x + self.attention(hidden_state)
        # Apply feed-forward layer with a skip connection
        x = x + self.feed_forward(self.layer_norm_2(x))
        return x

class Embeddings(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.token_embeddings = nn.Embedding(config.vocab_size, 
                                             config.hidden_size)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings,
                                                config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=1e-12)
        self.dropout = nn.Dropout()

    def forward(self, input_ids):
        # Create position IDs for input sequence
        seq_length = input_ids.size(1)
        position_ids = torch.arange(seq_length, dtype=torch.long).unsqueeze(0).to(device)
        # Create token and position embeddings
        token_embeddings = self.token_embeddings(input_ids)
        
        position_embeddings = self.position_embeddings(position_ids)
        # Combine token and position embeddings
        embeddings = token_embeddings #+ position_embeddings
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

class TransformerDecoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.embeddings = Embeddings(config)
        self.layers = nn.ModuleList([TransformerEncoderLayer(config) 
                                     for _ in range(config.num_hidden_layers)])

    def forward(self, x):
        x = self.embeddings(x)
        for layer in self.layers:
            x = layer(x)
        return x

class ShellTransformer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.encoder = TransformerDecoder(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        
    def forward(self, x):
        #print(x)
        x = self.encoder(x)#[:, 0, :] # select hidden state of [CLS] token
        #print(x)
        x = self.dropout(x)
        x = self.classifier(x)
        #print(x)
        return x

In [22]:
with open('/home/chris/University/gnn_project/dataset', 'rb') as fp:
    _ = pickle.load(fp)

In [23]:
_[0:10]

['nmap\n',
 'nmap -v 10.1.26.4\n',
 'nmap -v 10.1.26.9\n',
 'ssh --help\n',
 'ssh 10.1.26.9\n',
 'ssh 10.1.26.9 admin/123456\n',
 'ssh --help\n',
 'ssh 10.1.26.9\n',
 'ssh -l admin 10.1.26.9\n',
 'ssh admin@admin 10.1.26.9\n']

In [11]:
import os

# Given array of text elements
texts = _

# Create a folder to store the text files
folder_path = '/home/chris/University/gnn_project/data/'
os.makedirs(folder_path, exist_ok=True)

# Write each non-empty text element to a separate file
for i, text in enumerate(texts):
    # Remove trailing newline character
    text = text.rstrip('\n')
    
    # Check if text is not empty after stripping newline
    if text.strip():
        file_path = os.path.join(folder_path, f'text_{i}.txt')
        with open(file_path, 'w') as file:
            file.write(text)


In [24]:
txts = L(_)

In [25]:
len(txts)

203101

# Tokenizer 

### IMDB Tokenizer

In [None]:
class MyTokenizer(Transform):
    def setups(self, items):
        path = untar_data(URLs.IMDB)
        self.tok =  Tokenizer.from_folder(path)
        self.tok.setup(items)
        
    def encodes(self, txts):
        with open(txts, 'r') as file:
            content = file.read()
        return self.tok(content)
    
    def decodes(self, encoded):
        return self.tok.decode(encoded)
            
class MyNumerizer(Transform):
    def setups(self, items):
        self.num = Numericalize()
        self.num.setup(items)
        self.vocab = self.num.vocab

    def encodes(self, toks):
        return self.num(toks)
    
    def decodes(self, encoded):
        return self.num.decode(encoded)
    
limit = 10000
path_test = '/home/chris/University/gnn_project/'
tfms = [[MyTokenizer(),MyNumerizer()]]
files = get_text_files(path_test, folders = ['data'])
dsets = Datasets(files[:limit], tfms)
dls = dsets.dataloaders(dl_type=LMDataLoader, bs=4)

dls.show_batch(max_n=10)

Unnamed: 0,text,text_
0,"xxbos cmake build xxbos trabalho xxbos make xxbos watch -n 1 "" syscoind getinfo & & tail -n 20 debug.log "" xxbos pwd xxbos java xxmaj xxunk xxbos sudo systemctl stop network - manager ; sleep xxunk systemctl restart network - manager xxbos set xxup xxunk xxbos chmod + x xxunk xxbos grep xxunk xxunk xxunk | sed -i 's / xxup xxunk / xxup xxunk / xxup g ' xxbos msfconsole","cmake build xxbos trabalho xxbos make xxbos watch -n 1 "" syscoind getinfo & & tail -n 20 debug.log "" xxbos pwd xxbos java xxmaj xxunk xxbos sudo systemctl stop network - manager ; sleep xxunk systemctl restart network - manager xxbos set xxup xxunk xxbos chmod + x xxunk xxbos grep xxunk xxunk xxunk | sed -i 's / xxup xxunk / xxup xxunk / xxup g ' xxbos msfconsole xxbos"
1,"read xxbos sudo reboot xxbos rvm -v xxbos ls xxbos ls xxbos python3 xxunk xxbos "" xxunk - xxunk "" : "" xxunk "" , xxbos gwsh xxunk xxbos vim xxunk xxbos bash : [ goalador@gatanda : command not found xxbos ls xxbos bg xxunk xxbos rm bin / info_pc1_cesar xxbos make xxbos wget xxbos [ goalador@gatanda hhneuauf.de]$ [ goalador@gatanda hhneuauf.de]$ [ goalador@gatanda hhneuauf.de]$ [ chg ] xxmaj controller xxup c8 :","xxbos sudo reboot xxbos rvm -v xxbos ls xxbos ls xxbos python3 xxunk xxbos "" xxunk - xxunk "" : "" xxunk "" , xxbos gwsh xxunk xxbos vim xxunk xxbos bash : [ goalador@gatanda : command not found xxbos ls xxbos bg xxunk xxbos rm bin / info_pc1_cesar xxbos make xxbos wget xxbos [ goalador@gatanda hhneuauf.de]$ [ goalador@gatanda hhneuauf.de]$ [ goalador@gatanda hhneuauf.de]$ [ chg ] xxmaj controller xxup c8 : f7:33:33"
2,"xxunk admin xxbos vim xxunk xxbos cat xxunk xxbos ls -a xxbos git clone ssh : / / xxunk / xxunk / git / xxunk xxbos sudo vi xxunk xxbos clear xxbos ls xxbos mysql -h alas -u ml12087 -p xxbos git xxunk xxbos [ goalador@gatanda hhneuauf.de]$ bash : bash : : command not found xxbos sudo xxbos git log xxbos test "" $ user "" = "" xxunk "" & &","admin xxbos vim xxunk xxbos cat xxunk xxbos ls -a xxbos git clone ssh : / / xxunk / xxunk / git / xxunk xxbos sudo vi xxunk xxbos clear xxbos ls xxbos mysql -h alas -u ml12087 -p xxbos git xxunk xxbos [ goalador@gatanda hhneuauf.de]$ bash : bash : : command not found xxbos sudo xxbos git log xxbos test "" $ user "" = "" xxunk "" & & echo"
3,"xxrep 3 x xxrep 3 c xxrep 3 v xxunk xxrep 4 n xxrep 3 m xxrep 4 , * / / xxrep 6 1 xxrep 6 ' xxup xxunk ! xxbos make xxbos bash : bash : : command not found xxbos reboot xxbos ls -la xxbos httpd xxbos mkdir .ssh xxbos mplayer xxunk xxbos node app xxbos run xxbos vi send.py xxbos ls xxbos python da.py xxbos dunst xxbos clear","3 x xxrep 3 c xxrep 3 v xxunk xxrep 4 n xxrep 3 m xxrep 4 , * / / xxrep 6 1 xxrep 6 ' xxup xxunk ! xxbos make xxbos bash : bash : : command not found xxbos reboot xxbos ls -la xxbos httpd xxbos mkdir .ssh xxbos mplayer xxunk xxbos node app xxbos run xxbos vi send.py xxbos ls xxbos python da.py xxbos dunst xxbos clear xxbos"


In [None]:
print("Length:",len(dls.vocab),dls.vocab)

Length: 1400 ['xxunk', 'xxpad', 'xxbos', 'xxeos', 'xxfld', 'xxrep', 'xxwrep', 'xxup', 'xxmaj', '/', 'ls', '-', ':', 'sudo', '"', 'git', '.', 'vim', '[', 'bash', 'goalador@gatanda', "'", '\\', 'hhneuauf.de]$', '#', 'command', 'not', 'vi', 'found', 'cat', 'make', 'nano', 'rm', 'clear', 'python', 'install', '|', 'll', 'apt', '3', '*', '>', 'ssh', 'etc', '-a', '0', 'add', '$', 'exit', '1', ';', '-p', 'bin', 'cd', 'grep', 'home', '=', 'status', '-l', '..', '-i', 'echo', '~', '&', '4', 'get', '-f', 'docker', 'pwd', '-u', 'push', 'run', 'nmap', '-v', 'mv', ',', '-r', 'commit', '172.18.1.5', '-h', 'set', 'master', '-m', ')', '{', 'ps', 'usr', '-d', 'origin', 'find', '}', 'mkdir', 'config', 'app', '(', 'ifconfig', 'node', 'cp', 'su', 'chmod', '-ltr', '`', 'less', '-t', 'log', '-s', '2', '-o', 'server', '-la', ']', 'dev', 'man', 'lib', 'python3', 'scp', 'a', 'test', 'php', 'w', 'root', 'update', 'service', 'tar', 'x', 'systemctl', '-rf', 'sh', '.invoices2019.zip', 'remote', 'pull', 'gcc', 'explo

### SubwordTokenizer

In [None]:
class MyTokenizer(Transform):
    def setups(self, items):
        self.tok = SubwordTokenizer(vocab_sz=200)
        self.tok.setup(items)
        
    def encodes(self, txts):
        with open(txts, 'r') as file:
            content = file.read()
        flattened_list = [item for sublist in list(self.tok(content)) for item in sublist]
        return flattened_list
    
    def decodes(self, encoded):
        decoded_values = TitledStr(''.join(encoded))
        return  decoded_values
            
class MyNumerizer(Transform):
    def setups(self, items):
        self.num = Numericalize()
        self.num.setup(items)
        self.vocab = self.num.vocab
        

    def encodes(self, toks):
        return self.num(toks)
    
    def decodes(self, encoded):
        return self.num.decode(encoded)  

limit = 1000
path_test = '/home/chris/University/gnn_project/'
tfms = [[MyTokenizer(),MyNumerizer()]]
files = get_text_files(path_test, folders = ['data'])
dsets = Datasets(files[:limit], tfms)
dls = dsets.dataloaders(dl_type=LMDataLoader, bs=64)

dls.show_batch(max_n=10)

Unnamed: 0,text,text_
0,▁n▁o▁d▁e▁b▁r▁e▁w▁a▁l▁i▁a▁s▁d▁e▁f▁a▁u▁l▁t▁v▁0▁.▁8▁.▁1▁5▁l▁s▁s▁h▁o▁w▁o▁p▁t,n▁o▁d▁e▁b▁r▁e▁w▁a▁l▁i▁a▁s▁d▁e▁f▁a▁u▁l▁t▁v▁0▁.▁8▁.▁1▁5▁l▁s▁s▁h▁o▁w▁o▁p▁t▁
1,▁3▁2▁5▁3▁0▁4▁2▁m▁a▁k▁e▁s▁u▁d▁o▁s▁e▁r▁v▁i▁c▁e▁d▁o▁c▁k▁e▁r▁h▁e▁l▁p▁R▁U▁N▁A,3▁2▁5▁3▁0▁4▁2▁m▁a▁k▁e▁s▁u▁d▁o▁s▁e▁r▁v▁i▁c▁e▁d▁o▁c▁k▁e▁r▁h▁e▁l▁p▁R▁U▁N▁A▁
2,▁a▁r▁e▁m▁e▁t▁a▁l▁n▁o▁d▁e▁l▁i▁s▁t▁l▁e▁s▁s▁c▁u▁t▁o▁f▁f▁.▁p▁y▁m▁a▁n▁t▁p▁u▁t,a▁r▁e▁m▁e▁t▁a▁l▁n▁o▁d▁e▁l▁i▁s▁t▁l▁e▁s▁s▁c▁u▁t▁o▁f▁f▁.▁p▁y▁m▁a▁n▁t▁p▁u▁t▁
3,▁e▁x▁i▁t▁m▁k▁d▁i▁r▁r▁u▁n▁l▁s▁t▁r▁i▁n▁i▁t▁y▁_▁t▁e▁s▁t▁/▁v▁i▁c▁o▁n▁c▁a▁t▁A,e▁x▁i▁t▁m▁k▁d▁i▁r▁r▁u▁n▁l▁s▁t▁r▁i▁n▁i▁t▁y▁_▁t▁e▁s▁t▁/▁v▁i▁c▁o▁n▁c▁a▁t▁A▁
4,"▁a▁l▁l▁r▁m▁-▁r▁f▁e▁x▁p▁.▁t▁g▁z▁g▁i▁t▁c▁o▁m▁m▁i▁t▁-▁m▁""▁u▁p▁d▁a▁t▁e▁s▁""▁e","a▁l▁l▁r▁m▁-▁r▁f▁e▁x▁p▁.▁t▁g▁z▁g▁i▁t▁c▁o▁m▁m▁i▁t▁-▁m▁""▁u▁p▁d▁a▁t▁e▁s▁""▁e▁"
5,▁i▁t▁]▁$▁>▁>▁[▁g▁o▁a▁l▁a▁d▁o▁r▁@▁g▁a▁t▁a▁n▁d▁a▁c▁o▁r▁p▁w▁e▁b▁s▁i▁t▁e▁-▁s,i▁t▁]▁$▁>▁>▁[▁g▁o▁a▁l▁a▁d▁o▁r▁@▁g▁a▁t▁a▁n▁d▁a▁c▁o▁r▁p▁w▁e▁b▁s▁i▁t▁e▁-▁s▁
6,▁.▁/▁s▁c▁a▁n▁2▁1▁6▁.▁8▁9▁;▁l▁a▁d▁o▁r▁@▁g▁a▁t▁a▁n▁d▁a▁h▁h▁n▁e▁u▁a▁u▁f▁.▁b,.▁/▁s▁c▁a▁n▁2▁1▁6▁.▁8▁9▁;▁l▁a▁d▁o▁r▁@▁g▁a▁t▁a▁n▁d▁a▁h▁h▁n▁e▁u▁a▁u▁f▁.▁b▁
7,▁m▁e▁s▁o▁s▁p▁h▁e▁r▁e▁/▁m▁a▁r▁a▁t▁h▁o▁n▁/▁a▁p▁i▁/▁S▁y▁s▁t▁e▁m▁R▁e▁s▁o▁u▁r,m▁e▁s▁o▁s▁p▁h▁e▁r▁e▁/▁m▁a▁r▁a▁t▁h▁o▁n▁/▁a▁p▁i▁/▁S▁y▁s▁t▁e▁m▁R▁e▁s▁o▁u▁r▁
8,▁4▁-▁0▁6▁-▁4▁4▁_▁e▁d▁i▁t▁.▁m▁k▁v▁.▁m▁k▁v▁s▁u▁d▁o▁n▁e▁t▁s▁t▁a▁t▁-▁p▁l▁a▁n,4▁-▁0▁6▁-▁4▁4▁_▁e▁d▁i▁t▁.▁m▁k▁v▁.▁m▁k▁v▁s▁u▁d▁o▁n▁e▁t▁s▁t▁a▁t▁-▁p▁l▁a▁n▁
9,▁r▁t▁i▁e▁s▁-▁c▁o▁m▁m▁o▁n▁:▁q▁a▁d▁d▁u▁s▁e▁r▁e▁c▁o▁l▁l▁e▁c▁t▁l▁s▁v▁i▁m▁/▁u,r▁t▁i▁e▁s▁-▁c▁o▁m▁m▁o▁n▁:▁q▁a▁d▁d▁u▁s▁e▁r▁e▁c▁o▁l▁l▁e▁c▁t▁l▁s▁v▁i▁m▁/▁u▁


In [None]:
print("Length:",len(dls.vocab),dls.vocab)

Length: 80 ['xxunk', 'xxpad', 'xxbos', 'xxeos', 'xxfld', 'xxrep', 'xxwrep', 'xxup', 'xxmaj', '▁', 'a', 't', 'o', 's', 'e', 'i', 'n', 'r', 'l', 'c', '-', 'p', 'd', 'm', '.', 'g', '1', 'h', '2', 'f', 'u', '0', 'b', 'v', '/', '8', '_', '3', 'k', ':', 'y', '4', '7', '5', 'w', 'R', 'D', '9', '6', 'V', 'T', 'S', 'B', 'H', 'E', '#', '"', 'A', 'I', 'x', 'N', 'z', '@', 'P', ';', 'O', 'q', '`', 'X', 'G', 'Z', '|', 'xxfake', 'xxfake', 'xxfake', 'xxfake', 'xxfake', 'xxfake', 'xxfake', 'xxfake']


### BaseTokenizer

In [None]:
class MyTokenizer(Transform):
    def setups(self, items):
        self.tok = BaseTokenizer()
        #self.tok.setup(items)
        
    def encodes(self, txts):
        with open(txts, 'r') as file:
            content = file.read()
        flattened_list = [item for sublist in list(self.tok(content)) for item in sublist]
        return flattened_list
    
    def decodes(self, encoded):
        decoded_values = TitledStr(''.join(encoded))
        return  decoded_values
            
class MyNumerizer(Transform):
    def setups(self, items):
        self.num = Numericalize()
        self.num.setup(items)
        self.vocab = self.num.vocab
        

    def encodes(self, toks):
        return self.num(toks)
    
    def decodes(self, encoded):
        return self.num.decode(encoded)  

limit = 100
path_test = '/home/chris/University/gnn_project/'
tfms = [[MyTokenizer(),MyNumerizer()]]
files = get_text_files(path_test, folders = ['data'])
dsets = Datasets(files[:limit], tfms)
dls = dsets.dataloaders(dl_type=LMDataLoader, bs=64)

dls.show_batch(max_n=10)

Unnamed: 0,text,text_
0,gitconfig--global--add,itconfig--global--add
1,web.browser/opt/firefox/fire,web.browser/opt/firefox/firef
2,foxclearwgetpilotu.110mb.com/,oxclearwgetpilotu.110mb.com/R
3,RaZvaNBv.tgz;tarxvfRaZvaNBv,aZvaNBv.tgz;tarxvfRaZvaNBv.
4,.tgz;rm-rfRaZvaNBv.tgz;cd,tgz;rm-rfRaZvaNBv.tgz;cd.
5,.tmp;./startprintubutignome-t,tmp;./startprintubutignome-te
6,"erminal--role""gnome""sudop","rminal--role""gnome""sudopy"
7,"ythongitcommit-m""firstc","thongitcommit-m""firstco"
8,"ommit""ifconfig#1357789494ls","mmit""ifconfig#1357789494ls-"
9,-lpython3xxunkombined.pycdmysql,lpython3xxunkombined.pycdmysql


In [None]:
print("Length:",len(dls.vocab),dls.vocab)

Length: 80 ['xxunk', 'xxpad', 'xxbos', 'xxeos', 'xxfld', 'xxrep', 'xxwrep', 'xxup', 'xxmaj', '', 'a', 't', 'o', 's', 'e', 'i', 'n', 'r', 'l', 'c', '-', 'p', 'd', 'm', '.', 'g', '1', 'h', '2', 'f', 'u', '0', 'b', 'v', '/', '8', '_', '3', 'k', ':', 'y', '4', '7', '5', 'w', 'R', 'D', '9', '6', 'V', 'T', 'S', 'B', 'H', 'E', '#', '"', 'A', 'I', 'x', 'N', 'z', '@', 'P', ';', 'O', 'q', '`', 'X', 'G', 'Z', '|', 'xxfake', 'xxfake', 'xxfake', 'xxfake', 'xxfake', 'xxfake', 'xxfake', 'xxfake']


### SpacyTokenizer

In [None]:
class MyTokenizer(Transform):
    def setups(self, items):
        self.tok = SpacyTokenizer()
        #self.tok.setup(items)
        
    def encodes(self, txts):
        with open(txts, 'r') as file:
            content = file.read()
        flattened_list = [item for sublist in list(self.tok(content)) for item in sublist]
        return flattened_list
    
    def decodes(self, encoded):
        decoded_values = TitledStr(''.join(encoded))
        return  decoded_values
            
class MyNumerizer(Transform):
    def setups(self, items):
        self.num = Numericalize()
        self.num.setup(items)
        self.vocab = self.num.vocab
        

    def encodes(self, toks):
        return self.num(toks)
    
    def decodes(self, encoded):
        return self.num.decode(encoded)  
    
limit = 100
path_test = '/home/chris/University/gnn_project/'
tfms = [[MyTokenizer(),MyNumerizer()]]
files = get_text_files(path_test, folders = ['data'])
dsets = Datasets(files[:limit], tfms)
dls = dsets.dataloaders(dl_type=LMDataLoader, bs=64)

dls.show_batch(max_n=10)

Unnamed: 0,text,text_
0,sudo raspi-configfind ./ -nam,udo raspi-configfind ./ -name
1,"e ""xxunk.fq.gz"" |zcat | grep xxunkHxxunkI","""xxunk.fq.gz"" |zcat | grep xxunkHxxunkIxxunk"
2,xxunkSIxxunklador@gatanda hhneuauf.ba,SIxxunklador@gatanda hhneuauf.bas
3,sh: bash:: command not founds,h: bash:: command not foundsu
4,udo ifdown eth9bash: xxunkgoalado,do ifdown eth9bash: xxunkgoalador
5,r@gatanda: command not found#,@gatanda: command not found#1
6,1517115588pwdwget pilotu.110m,517115588pwdwget pilotu.110mb
7,b.com/RaZvaNBv.tgz;tar xvf Ra,.com/RaZvaNBv.tgz;tar xvf RaZ
8,ZvaNBv.tgz;rm -rf RaZvaNBv.tg,vaNBv.tgz;rm -rf RaZvaNBv.tgz
9,z;cd .tmp;./start printubutic,;cd .tmp;./start printubutica


In [None]:
print("Length:",len(dls.vocab),dls.vocab)

Length: 80 ['xxunk', 'xxpad', 'xxbos', 'xxeos', 'xxfld', 'xxrep', 'xxwrep', 'xxup', 'xxmaj', ' ', 'a', 't', 'o', 's', 'e', 'i', 'n', 'r', 'l', 'c', '-', 'p', 'd', 'm', '.', 'g', '1', 'h', '2', 'f', 'u', '0', 'b', 'v', '/', '8', '_', '3', 'k', ':', 'y', '4', '7', '5', 'w', 'R', 'D', '9', '6', 'V', 'T', 'S', 'B', 'H', 'E', '#', '"', 'A', 'I', 'x', 'N', 'z', '@', 'P', ';', 'O', 'q', '`', 'X', 'G', 'Z', '|', 'xxfake', 'xxfake', 'xxfake', 'xxfake', 'xxfake', 'xxfake', 'xxfake', 'xxfake']


### WordTokenize

In [None]:
class MyTokenizer(Transform):
    def setups(self, items):
        self.tok = WordTokenizer()
        #self.tok.setup(items)
        
    def encodes(self, txts):
        with open(txts, 'r') as file:
            content = file.read()
        flattened_list = [item for sublist in list(self.tok(content)) for item in sublist]
        return flattened_list
    
    def decodes(self, encoded):
        decoded_values = TitledStr(''.join(encoded))
        return  decoded_values
            
class MyNumerizer(Transform):
    def setups(self, items):
        self.num = Numericalize()
        self.num.setup(items)
        self.vocab = self.num.vocab
        

    def encodes(self, toks):
        return self.num(toks)
    
    def decodes(self, encoded):
        return self.num.decode(encoded)  

limit = 10000
path_test = '/home/chris/University/gnn_project/'
tfms = [[MyTokenizer(),MyNumerizer()]]
files = get_text_files(path_test, folders = ['data'])
dsets = Datasets(files[:limit], tfms)
dls = dsets.dataloaders(dl_type=LMDataLoader, bs=64)

dls.show_batch(max_n=10)

Unnamed: 0,text,text_
0,php server.php#1473555325vim mysum.shhistoryservice redis-server stopfor,hp server.php#1473555325vim mysum.shhistoryservice redis-server stopfor
1,udo pacman -R docker docker-composevim /usr/share/dbus-1/services/ vim t,do pacman -R docker docker-composevim /usr/share/dbus-1/services/ vim tu
2,sername@10.1.26.255lsiptables -Lmakesudo yum install jenkinsvim pintosap,ername@10.1.26.255lsiptables -Lmakesudo yum install jenkinsvim pintosapr
3,cp profile2vim src/genetic/Main.java lscomandogit add *[goalador@gatanda,p profile2vim src/genetic/Main.java lscomandogit add *[goalador@gatanda
4,"squitto_pub -h 127.0.0.1 -t ""application/1/node/0000000000000000/tx"" -m","quitto_pub -h 127.0.0.1 -t ""application/1/node/0000000000000000/tx"" -m """
5,-lhlslsexitexport LC_ALL=en_US.utf8exploitsudo reflector --verbose --lat,lhlslsexitexport LC_ALL=en_US.utf8exploitsudo reflector --verbose --late
6,it pull --rebasevim zerador.shsu ./server -p 3490lsuname -r | cut -c 1mk,t pull --rebasevim zerador.shsu ./server -p 3490lsuname -r | cut -c 1mkd
7,sigc/EventLevel.pm lllssort testeps auxtrizen -Ss digikamls -ltr /isiseq,igc/EventLevel.pm lllssort testeps auxtrizen -Ss digikamls -ltr /isiseqr
8,d ..lssu -composer install --no-devps -fpython popel.py lsl temp/#150946,..lssu -composer install --no-devps -fpython popel.py lsl temp/#1509464
9,.akamai.com:8443/api/v1/se/deployments?environment=qa&isDeploy=true | jq,akamai.com:8443/api/v1/se/deployments?environment=qa&isDeploy=true | jq


In [7]:
class MyTokenizer(Transform):
    def setups(self, items):
        path = untar_data(URLs.IMDB)
        self.tok =  Tokenizer.from_folder(path)
        self.tok.setup(items)
        
    def encodes(self, txts):
        with open(txts, 'r') as file:
            content = file.read()
        return self.tok(content)
    
    def decodes(self, encoded):
        return self.tok.decode(encoded)
            
class MyNumerizer(Transform):
    def setups(self, items):
        self.num = Numericalize()
        self.num.setup(items)
        self.vocab = self.num.vocab

    def encodes(self, toks):
        return self.num(toks)
    
    def decodes(self, encoded):
        return self.num.decode(encoded)

limit = 1000
path = untar_data(URLs.IMDB)
tfms = [[MyTokenizer(), MyNumerizer()]]
files = get_text_files(path, folders = ['train', 'test'])
#splits = GrandparentSplitter(valid_name='test')(files)
dsets = Datasets(files[:limit], tfms)#, splits=splits)
dls = dsets.dataloaders(dl_type=LMDataLoader)

dls.show_batch(max_n=10)



Unnamed: 0,text,text_
0,"xxbos i can not stress how bad this movie is . xxmaj this director took every cheap little unintelligent shot at making these people look so "" xxunk "" . xxmaj why are their clothes so dirty ? xxmaj why on earth would you get the new clark kent to play a crack head ? xxmaj you should be banned from motion pictures for the rest of your life xxmaj buddy xxmaj","i can not stress how bad this movie is . xxmaj this director took every cheap little unintelligent shot at making these people look so "" xxunk "" . xxmaj why are their clothes so dirty ? xxmaj why on earth would you get the new clark kent to play a crack head ? xxmaj you should be banned from motion pictures for the rest of your life xxmaj buddy xxmaj xxunk"
1,"believable . sorry , to me , this movie has no entertainment value at all . xxbos xxmaj maybe here in xxmaj sydney we are all poop side down and as a result we get to lap up xxunk like this s - xxunk in xxunk theaters . xxmaj released here in 1980 this hilarious all - xxunk drama was xxunk with xxunk of delight at the session i xxunk . xxmaj",". sorry , to me , this movie has no entertainment value at all . xxbos xxmaj maybe here in xxmaj sydney we are all poop side down and as a result we get to lap up xxunk like this s - xxunk in xxunk theaters . xxmaj released here in 1980 this hilarious all - xxunk drama was xxunk with xxunk of delight at the session i xxunk . xxmaj in"
2,book is the xxmaj general is not a xxmaj gothic monster like the characters in xxmaj catherine 's books . xxmaj his xxunk is far more complicated in his xxunk of his children 's spirits and his treatment of xxmaj catherine based on money concerns alone . xxmaj he does not lock up his wife or kill her but he does send xxmaj miss xxmaj morland on a 70 mile trip alone,is the xxmaj general is not a xxmaj gothic monster like the characters in xxmaj catherine 's books . xxmaj his xxunk is far more complicated in his xxunk of his children 's spirits and his treatment of xxmaj catherine based on money concerns alone . xxmaj he does not lock up his wife or kill her but he does send xxmaj miss xxmaj morland on a 70 mile trip alone in
3,"xxmaj singer 's career also took a xxunk with this one . \n\n xxmaj special effects crew has some fun , and xxmaj jerry xxmaj xxunk provides a score superior to its subject matter . xxbos xxmaj when i think about xxup tv movies , i always think of this film , i have watched it a few times on xxmaj sky xxmaj movies , it was terrible . \n\n xxmaj its","singer 's career also took a xxunk with this one . \n\n xxmaj special effects crew has some fun , and xxmaj jerry xxmaj xxunk provides a score superior to its subject matter . xxbos xxmaj when i think about xxup tv movies , i always think of this film , i have watched it a few times on xxmaj sky xxmaj movies , it was terrible . \n\n xxmaj its been"
4,". \n\n xxmaj secondly , how many plot holes are in this movie ? xxmaj why introduce the phone call from xxmaj xxunk 's long lost xxmaj dad and never address it again ? xxmaj what was the point of his xxmaj mom hanging up on him - why even have her call to say he is xxunk her too much money - what was the point of that ? xxmaj the","\n\n xxmaj secondly , how many plot holes are in this movie ? xxmaj why introduce the phone call from xxmaj xxunk 's long lost xxmaj dad and never address it again ? xxmaj what was the point of his xxmaj mom hanging up on him - why even have her call to say he is xxunk her too much money - what was the point of that ? xxmaj the guy"
5,"xxmaj madsen . xxmaj do n't get me wrong , i can handle remakes , even obscure ones . xxmaj but this badly written and poorly filmed xxunk made me feel sorry for both xxmaj madsen and co - star xxmaj richard xxmaj thomas . xxmaj unlike the original , the dialogue here is xxunk , making me wonder , "" why did they bother to re - write it ? ""","madsen . xxmaj do n't get me wrong , i can handle remakes , even obscure ones . xxmaj but this badly written and poorly filmed xxunk made me feel sorry for both xxmaj madsen and co - star xxmaj richard xxmaj thomas . xxmaj unlike the original , the dialogue here is xxunk , making me wonder , "" why did they bother to re - write it ? "" xxmaj"
6,was xxmaj xxunk that did it . \n\n xxmaj their is no mention of xxmaj mark or his turning back so the writers of the script are forced to have xxmaj paul and xxmaj xxunk argue over xxmaj paul 's desire to xxunk in xxmaj rome as the basis of their xxunk . \n\n xxmaj no xxmaj xxunk on xxmaj paul 's xxmaj second and xxmaj third xxmaj missions ; xxmaj no,xxmaj xxunk that did it . \n\n xxmaj their is no mention of xxmaj mark or his turning back so the writers of the script are forced to have xxmaj paul and xxmaj xxunk argue over xxmaj paul 's desire to xxunk in xxmaj rome as the basis of their xxunk . \n\n xxmaj no xxmaj xxunk on xxmaj paul 's xxmaj second and xxmaj third xxmaj missions ; xxmaj no xxmaj
7,"xxbos xxmaj totally forgettable and almost unwatchable . xxmaj if you enjoy bad acting , thin plots and xxunk weak xxunk , pull up a chair . xxmaj of passing interest to see xxmaj bridget xxmaj fonda look - a - like xxmaj xxunk xxmaj xxunk . xxbos xxmaj this is truly , without exaggerating , one of the worst xxmaj slasher movies ever made . i know , it came out","xxmaj totally forgettable and almost unwatchable . xxmaj if you enjoy bad acting , thin plots and xxunk weak xxunk , pull up a chair . xxmaj of passing interest to see xxmaj bridget xxmaj fonda look - a - like xxmaj xxunk xxmaj xxunk . xxbos xxmaj this is truly , without exaggerating , one of the worst xxmaj slasher movies ever made . i know , it came out in"
8,"at the approaching guy on the horse . xxmaj for some reason , she does n't fire but yells several times for someone else . xxmaj then as xxmaj skeletor xxunk , she jumps out from behind the tree so that xxmaj skeletor can stick her with his spear . xxmaj then everybody starts shooting . xxmaj the bullets cause sparks to fly from the trees . xxmaj apparently the folks who","the approaching guy on the horse . xxmaj for some reason , she does n't fire but yells several times for someone else . xxmaj then as xxmaj skeletor xxunk , she jumps out from behind the tree so that xxmaj skeletor can stick her with his spear . xxmaj then everybody starts shooting . xxmaj the bullets cause sparks to fly from the trees . xxmaj apparently the folks who made"
9,"find xxmaj peter o'toole entertaining . xxmaj but that is no reason to rent it . xxmaj if you are curious about xxmaj roman history , there are much better movies available . xxbos i normally would n't waste my time criticizing a useless movie such as this . xxmaj however , xxmaj i 'm off of work this week , so i have plenty of time to xxunk in meaningless xxunk","xxmaj peter o'toole entertaining . xxmaj but that is no reason to rent it . xxmaj if you are curious about xxmaj roman history , there are much better movies available . xxbos i normally would n't waste my time criticizing a useless movie such as this . xxmaj however , xxmaj i 'm off of work this week , so i have plenty of time to xxunk in meaningless xxunk ."


In [5]:
print("Length:",len(dls.vocab),dls.vocab)



In [5]:
from transformers import AutoConfig
config = AutoConfig.from_pretrained('bert-base-uncased')
config.max_position_embeddings

512

In [102]:
from transformers import AutoConfig
config = AutoConfig.from_pretrained('bert-base-uncased')

print(type(config))

config.vocab_size = len(dls.vocab)
config.num_labels = len(dls.vocab)
#config.hidden_size = 132
config.hidden_size = 395
config.num_hidden_layers = 12
config.num_attention_heads = 5
config.max_position_embeddings = 512
transformer = ShellTransformer(config)



<class 'transformers.models.bert.configuration_bert.BertConfig'>


In [77]:
config.vocab_size

1400

In [103]:
model = transformer

model.to(device)

dls.to(device)

learn = Learner(
    dls, 
    model, 
    loss_func=CrossEntropyLossFlat(), 
    metrics=[accuracy]
)

learn.fit_one_cycle(1, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,3.633155,,,41:12


  warn("Your generator is empty.")


In [71]:
print(dls.train.n)

64


In [72]:
dls.vocab

['xxunk',
 'xxpad',
 'xxbos',
 'xxeos',
 'xxfld',
 'xxrep',
 'xxwrep',
 'xxup',
 'xxmaj',
 '/',
 'ls',
 '-',
 '\\',
 ':',
 'git',
 'sudo',
 'vim',
 '"',
 '[',
 '#',
 '.',
 'bash',
 'command',
 'not',
 'found',
 'zsh',
 'clear',
 '>',
 'bin',
 'goalador@gatanda',
 'pwd',
 'config',
 'python',
 'cat',
 ']',
 '-c',
 'python3',
 'wget',
 'add',
 'opt',
 'ifdown',
 '1',
 'vi',
 '2',
 'ssh',
 'apt',
 'get',
 '-p',
 'commit',
 '`',
 '--global',
 'echo',
 'xxfake',
 'xxfake',
 'xxfake',
 'xxfake']

In [105]:
vocab = dls.vocab
vocab

['xxunk',
 'xxpad',
 'xxbos',
 'xxeos',
 'xxfld',
 'xxrep',
 'xxwrep',
 'xxup',
 'xxmaj',
 '/',
 'ls',
 '-',
 ':',
 'sudo',
 'git',
 '"',
 '.',
 'vim',
 "'",
 'bash',
 '[',
 'goalador@gatanda',
 '\\',
 '#',
 'vi',
 'cat',
 'command',
 'not',
 'found',
 'hhneuauf.de]$',
 '*',
 '3',
 'make',
 'nano',
 'rm',
 'clear',
 '-a',
 'install',
 '0',
 'python',
 'll',
 'apt',
 '|',
 'etc',
 'exit',
 '$',
 '>',
 'cd',
 'add',
 '-p',
 'ssh',
 ';',
 '=',
 '1',
 'status',
 '..',
 '~',
 'get',
 '4',
 'bin',
 '-l',
 'grep',
 'home',
 ',',
 'echo',
 'nmap',
 '-r',
 'commit',
 'usr',
 'docker',
 '-f',
 'run',
 '-v',
 ')',
 '&',
 '-i',
 '…',
 '-u',
 '-m',
 'pwd',
 'push',
 'set',
 'mv',
 '172.18.1.5',
 'app',
 'man',
 'ps',
 'master',
 '-s',
 'origin',
 '-h',
 '-o',
 '(',
 'mkdir',
 'less',
 'find',
 '-d',
 'cp',
 'config',
 'chmod',
 'node',
 'dev',
 '!',
 '}',
 '{',
 'root',
 'page',
 'su',
 '-la',
 '-ltr',
 'w',
 '-rf',
 'parallel',
 '2',
 'server',
 '-t',
 '-path',
 'ifconfig',
 'update',
 'scp',
 'pu

In [92]:
# Define a function for text generation
def generate_text(model, token_ids, max_length=20):
    token_ids = token_ids[0][0]
    input_ids = torch.tensor(token_ids).unsqueeze(0).to(device)  
    with torch.no_grad():
        for _ in range(max_length):
            outputs = model(input_ids)
            logits = outputs[:, -1, :]  
            next_token_id = torch.argmax(logits, dim=-1)
            token_ids = torch.cat((token_ids, next_token_id),dim=0)
            input_ids = torch.cat([input_ids, next_token_id.unsqueeze(0).to(device)], dim=-1)
    return token_ids

def decode_tokens(numerized_tokens, vocab):
    generated_test = [vocab[token] for token in numerized_tokens]
    return ' '.join(generated_test)


# Generate text
files = get_text_files('', folders = ['test'])
#vocab = dls.vocab
start_text_ids = Datasets(files, tfms)

print(start_text_ids)

generated_ids = generate_text(learn.model, start_text_ids)

path= '/home/chris/Git_Repos/gnn_project/FastAI/test/text_generation.txt'


with open(path, 'r') as file:
     content = file.read()
     
print(content+decode_tokens(generated_ids,vocab))


(#1) [(TensorText([2, 8, 0, 0, 0, 0, 0]),)]


  input_ids = torch.tensor(token_ids).unsqueeze(0).to(device)


The movie was good becausexxbos xxmaj xxunk xxunk xxunk xxunk xxunk " / etc / etc / etc / etc / etc / etc / etc / etc / etc /


In [117]:
# Define a function for text generation
def generate_text(model, token_ids, max_length=5):
    token_ids = token_ids
    input_ids = torch.tensor(token_ids).unsqueeze(0).to(device)  
    with torch.no_grad():
        for _ in range(max_length):
            outputs = model(input_ids)
            logits = outputs[:, -1, :]  
            next_token_id = torch.argmax(logits, dim=-1)
            token_ids = torch.cat((token_ids, next_token_id),dim=0)
            input_ids = torch.cat([input_ids, next_token_id.unsqueeze(0).to(device)], dim=-1)
    return token_ids

def decode_tokens(numerized_tokens, vocab):
    generated_test = [vocab[token] for token in numerized_tokens]
    return ' '.join(generated_test)

# ['nmap\n', 'nmap -v 10.1.26.4\n', 'nmap -v 10.1.26.9\n', 'ssh --help\n', 'ssh 10.1.26.9\n', 'ssh 10.1.26.9 admin/123456\n', 'ssh --help\n', 'ssh 10.1.26.9\n', 'ssh -l admin 10.1.26.9\n', 'ssh admin@admin 10.1.26.9\n', 'ssh admin@10.1.26.9\n', 'ls\n', 'nfs --help\n', 'NFS --help\n', 'NFS\n', 'path\n', '-path\n', '--path\n', 'pwd\n', 'pdw\n']
# Generate text
test_sentence = 'ssh admin'
#test_sentence = get_text_files('', folders = ['test'])[0]
pipe = Pipeline([tokn,num])

start_text_ids = pipe(test_sentence)

generated_ids = generate_text(learn.model, start_text_ids)


vocab = dls.vocab
     
print(decode_tokens(generated_ids,vocab))


  input_ids = torch.tensor(token_ids).unsqueeze(0).to(device)


xxbos ssh admin / xxunk xxbos ls xxbos


In [113]:
print(txts[0:20])

['nmap\n', 'nmap -v 10.1.26.4\n', 'nmap -v 10.1.26.9\n', 'ssh --help\n', 'ssh 10.1.26.9\n', 'ssh 10.1.26.9 admin/123456\n', 'ssh --help\n', 'ssh 10.1.26.9\n', 'ssh -l admin 10.1.26.9\n', 'ssh admin@admin 10.1.26.9\n', 'ssh admin@10.1.26.9\n', 'ls\n', 'nfs --help\n', 'NFS --help\n', 'NFS\n', 'path\n', '-path\n', '--path\n', 'pwd\n', 'pdw\n']


In [19]:
test_sentence = get_text_files('', folders = ['test'])[0]
with open(test_sentence, 'r') as file:
            content = file.read()
            print(content)

The movie was good because


In [59]:
vocab

['xxunk',
 'xxpad',
 'xxbos',
 'xxeos',
 'xxfld',
 'xxrep',
 'xxwrep',
 'xxup',
 'xxmaj',
 'the',
 '.',
 ',',
 'a',
 'and',
 'of',
 'to',
 'is',
 'i',
 'it',
 'in',
 'this',
 '"',
 'that',
 "'s",
 'was',
 '-',
 'movie',
 '\n\n',
 'for',
 'with',
 'but',
 'as',
 "n't",
 'you',
 'film',
 'on',
 'not',
 'have',
 ')',
 '(',
 'are',
 'he',
 'be',
 'they',
 'at',
 'his',
 'one',
 '!',
 'all',
 '?',
 'so',
 'like',
 'by',
 'do',
 'an',
 'there',
 'just',
 'from',
 'if',
 'who',
 'or',
 'out',
 'some',
 'about',
 'what',
 'no',
 '…',
 'even',
 'bad',
 'would',
 'when',
 'has',
 "'",
 'good',
 'her',
 'only',
 'did',
 'time',
 'more',
 'up',
 'had',
 'can',
 'were',
 'really',
 'she',
 'which',
 'my',
 'me',
 'very',
 'could',
 'we',
 'get',
 'see',
 'does',
 'their',
 'story',
 'make',
 'then',
 'him',
 'how',
 'been',
 ':',
 'much',
 'into',
 'movies',
 'other',
 'than',
 'because',
 'people',
 'made',
 'first',
 'any',
 'why',
 'plot',
 'most',
 'well',
 'its',
 'too',
 'acting',
 'them',
 '

In [28]:
class MyTokenizer(Transform):
    def setups(self, items):
        path = untar_data(URLs.IMDB)
        self.tok =  Tokenizer.from_folder(path)
        self.tok.setup(items)
        
    def encodes(self, txts):
        return self.tok(txts)
    
    def decodes(self, encoded):
        return self.tok.decode(encoded)
            
class MyNumerizer(Transform):
    def setups(self, items):
        self.num = Numericalize()
        self.num.setup(items)
        self.vocab = self.num.vocab

    def encodes(self, toks):
        return self.num(toks)
    
    def decodes(self, encoded):
        return self.num.decode(encoded)


"limit = 1000\npath = untar_data(URLs.IMDB)\ntokn = MyTokenizer()\ntokn.setup()\ntfms = [[MyTokenizer(), MyNumerizer()]]\nfiles = get_text_files(path, folders = ['train', 'test'])\n#splits = GrandparentSplitter(valid_name='test')(files)\ndsets = Datasets(files[:limit], tfms)#, splits=splits)\ndls = dsets.dataloaders(dl_type=LMDataLoader)\n\ndls.show_batch(max_n=10)"

In [97]:
txt10000 = txts[:]
tokn = MyTokenizer()
tokn.setup(txt10000)

In [98]:
toks = txt10000.map(tokn)

In [100]:
num = MyNumerizer()
num.setup(toks)

In [66]:
limit = 10000
path = untar_data(URLs.IMDB)
#tokn = MyTokenizer()
#tokn.setup()
tfms = [[tokn, num]]
files = get_text_files(path, folders = ['train', 'test'])
#splits = GrandparentSplitter(valid_name='test')(files)
dsets = Datasets(files[:limit], tfms)#, splits=splits)
dls = dsets.dataloaders(dl_type=LMDataLoader)

dls.show_batch(max_n=10)

Unnamed: 0,text,text_
0,"xxbos xxmaj nothing will ever top xxup komodo with the lovely xxmaj jill xxmaj xxunk as a shrink ( ! ) , but xxunk ai n't quite as bad as i expected for a xxup syfy channel quickie . xxmaj just make sure to watch it while drunk or stoned , or while trying to go to sleep . xxmaj the unimaginative title basically says it all : a group of mostly","xxmaj nothing will ever top xxup komodo with the lovely xxmaj jill xxmaj xxunk as a shrink ( ! ) , but xxunk ai n't quite as bad as i expected for a xxup syfy channel quickie . xxmaj just make sure to watch it while drunk or stoned , or while trying to go to sleep . xxmaj the unimaginative title basically says it all : a group of mostly unknown"
1,"a spoiled brat get his way time and again ? xxmaj why would i want to watch that ? ! ? xxmaj no thanks , there are plenty of other shows that involve devious characters ( phil of the xxmaj future 's xxmaj xxunk for example ) where the evil one does n't win or if they do not in the way they though . \n\n xxmaj not to mention that i","spoiled brat get his way time and again ? xxmaj why would i want to watch that ? ! ? xxmaj no thanks , there are plenty of other shows that involve devious characters ( phil of the xxmaj future 's xxmaj xxunk for example ) where the evil one does n't win or if they do not in the way they though . \n\n xxmaj not to mention that i think"
2,"wants to show horrible scenes of violence and raw emotion but many of these scenes are so over the top they actually become laughable and the overall feeling is that of a made - for - tv movie that went off the rails . xxmaj if this rarely screened movie falls in your hands or comes to your town , spare yourself and give it a pass . xxbos xxmaj it was","to show horrible scenes of violence and raw emotion but many of these scenes are so over the top they actually become laughable and the overall feeling is that of a made - for - tv movie that went off the rails . xxmaj if this rarely screened movie falls in your hands or comes to your town , spare yourself and give it a pass . xxbos xxmaj it was so"
3,"xxmaj xxunk , who has made only one other movie , is pretty good as the sensitive young protagonist , as are xxmaj xxunk as her strict father and xxmaj xxunk as a woman that xxmaj xxunk is fascinated with . xxmaj the film aims to be fresh and charming but feels rather stale and tired . xxmaj director and co - writer xxmaj xxunk , who has worked mostly in xxup","xxunk , who has made only one other movie , is pretty good as the sensitive young protagonist , as are xxmaj xxunk as her strict father and xxmaj xxunk as a woman that xxmaj xxunk is fascinated with . xxmaj the film aims to be fresh and charming but feels rather stale and tired . xxmaj director and co - writer xxmaj xxunk , who has worked mostly in xxup tv"
4,"xxmaj development , but his enthusiasm can not save the sinking vessel , especially seeing as xxmaj carrell has all but placed his formidable improv skills on the back - burner . \n\n xxmaj in some respects , it 's slightly similar to the xxmaj passion of the xxmaj christ , but unlike xxmaj mel xxmaj gibson 's movie which encouraged everyone to believe in xxmaj god through blood letting and guilt","development , but his enthusiasm can not save the sinking vessel , especially seeing as xxmaj carrell has all but placed his formidable improv skills on the back - burner . \n\n xxmaj in some respects , it 's slightly similar to the xxmaj passion of the xxmaj christ , but unlike xxmaj mel xxmaj gibson 's movie which encouraged everyone to believe in xxmaj god through blood letting and guilt tripping"
5,"the traffic jam is going on you will be feeling probably more like making a xxunk sandwich than continuing watching it . xxmaj pieces of a supposed story , silly , stupid characters . xxmaj what message are we suppose to take from this ? xxmaj it offers nothing and serves no purpose . xxmaj the arrogance of the director in showcasing these puny , dull chain - smoking french people and","traffic jam is going on you will be feeling probably more like making a xxunk sandwich than continuing watching it . xxmaj pieces of a supposed story , silly , stupid characters . xxmaj what message are we suppose to take from this ? xxmaj it offers nothing and serves no purpose . xxmaj the arrogance of the director in showcasing these puny , dull chain - smoking french people and having"
6,"xxmaj amsterdam , the prime suspect is the leader al - saleem ( xxunk xxmaj xxunk ) . xxmaj the xxup cia agent xxmaj roger xxmaj ferris ( leonardo dicaprio ) that operates in the xxmaj middle xxmaj east is assigned by his superior at xxmaj langley xxmaj ed xxmaj hoffman ( russell xxmaj crowe ) to keep a "" safe house "" in xxmaj xxunk under surveillance , and he associates","amsterdam , the prime suspect is the leader al - saleem ( xxunk xxmaj xxunk ) . xxmaj the xxup cia agent xxmaj roger xxmaj ferris ( leonardo dicaprio ) that operates in the xxmaj middle xxmaj east is assigned by his superior at xxmaj langley xxmaj ed xxmaj hoffman ( russell xxmaj crowe ) to keep a "" safe house "" in xxmaj xxunk under surveillance , and he associates to"
7,") , and i bet you 'll find it in xxmaj wassup xxmaj rockers . xxmaj do you think that all xxmaj hispanic teenage boys are stupid , hairy , inarticulate , and dirty xxunk ? xxmaj are xxmaj hispanic girls sex - crazed , easy , ass - baring xxunk ? xxmaj do xxmaj black people all want to start fights and carry guns ? xxmaj do all gay people throw",", and i bet you 'll find it in xxmaj wassup xxmaj rockers . xxmaj do you think that all xxmaj hispanic teenage boys are stupid , hairy , inarticulate , and dirty xxunk ? xxmaj are xxmaj hispanic girls sex - crazed , easy , ass - baring xxunk ? xxmaj do xxmaj black people all want to start fights and carry guns ? xxmaj do all gay people throw themed"
8,". \n\n xxmaj vanessa xxmaj redgrave looks like a man with her short haircut and clothing . i never found her much to get excited about in almost any movie , anyway . xxmaj rod xxmaj steiger as a preacher ? xxmaj how insulting is that ? xxmaj unlikable characters , one after the other . xxmaj well , maybe that 's the book , too , and i am being unfair","\n\n xxmaj vanessa xxmaj redgrave looks like a man with her short haircut and clothing . i never found her much to get excited about in almost any movie , anyway . xxmaj rod xxmaj steiger as a preacher ? xxmaj how insulting is that ? xxmaj unlikable characters , one after the other . xxmaj well , maybe that 's the book , too , and i am being unfair to"
9,"for very long . xxmaj perhaps a true comeback would have worked had she returned home peacefully and waited a year or so after her often mocked ankle bracelet was removed . xxmaj instead , she frantically dove into overkill with 2 series at once , the other being her syndicated daytime series xxmaj martha , much like her old show , but more mainstream , with famous guests like xxmaj bette","very long . xxmaj perhaps a true comeback would have worked had she returned home peacefully and waited a year or so after her often mocked ankle bracelet was removed . xxmaj instead , she frantically dove into overkill with 2 series at once , the other being her syndicated daytime series xxmaj martha , much like her old show , but more mainstream , with famous guests like xxmaj bette xxmaj"


In [101]:
limit = 10000
path_test = '/home/chris/University/gnn_project/'
tfms = [[tokn, num]]
files = get_text_files(path_test, folders = ['data'])
dsets = Datasets(files[:], tfms)
dls = dsets.dataloaders(dl_type=LMDataLoader, bs=64)

dls.show_batch(max_n=10)


Unnamed: 0,text,text_
0,"xxbos ls xxbos run xxbos exit xxbos . / bin / info_pc1_cesar xxbos [ goalador@gatanda hhneuauf.de]$ bash : [ goalador@gatanda : command not found xxbos cat my-compose.yml.off xxbos set xxup rport 1 xxrep 4 0 xxbos su dns -c "" python dns.py "" xxbos cp / etc / xdg / subtle / subtle.rb .config / subtle / subtle.rb xxbos set xxup rport 1 xxrep 4 0 xxbos hwinfo xxunk xxbos ls xxbos","ls xxbos run xxbos exit xxbos . / bin / info_pc1_cesar xxbos [ goalador@gatanda hhneuauf.de]$ bash : [ goalador@gatanda : command not found xxbos cat my-compose.yml.off xxbos set xxup rport 1 xxrep 4 0 xxbos su dns -c "" python dns.py "" xxbos cp / etc / xdg / subtle / subtle.rb .config / subtle / subtle.rb xxbos set xxup rport 1 xxrep 4 0 xxbos hwinfo xxunk xxbos ls xxbos uname"
1,-nv --line - number|less xxbos chsh --version xxbos git clone https : / / github.com / vulnerscom / nmap - vulners / usr / share / nmap / scripts / vulners xxbos docker run -dit --name = syslog01 -v / dev / log : / dev / log -h syslog01 jurajbond / rsyslog xxbos tar zcvf xxunk -t xxunk xxbos / sbin / ifconfig xxbos be rails generate xxunk xxbos ls xxbos,--line - number|less xxbos chsh --version xxbos git clone https : / / github.com / vulnerscom / nmap - vulners / usr / share / nmap / scripts / vulners xxbos docker run -dit --name = syslog01 -v / dev / log : / dev / log -h syslog01 jurajbond / rsyslog xxbos tar zcvf xxunk -t xxunk xxbos / sbin / ifconfig xxbos be rails generate xxunk xxbos ls xxbos sudo
2,"api / atlas / admin / version xxbos xxunk -c xxbos ls / usr / share / dbus-1 / services / xxbos vim xxmaj monster.cpp xxbos hbase -version xxbos nano valpul xxbos make xxbos ls xxbos cp -h xxbos mysqldump -acv -u bjohnston --password = bljn1285 -h localhost myvideos93 > myvideos93.sql xxbos echo "" # firstrepo "" > > readme.md xxbos ls xxbos ls xxbos vim config xxbos ls xxbos gunzip rockyou.txt.gz","/ atlas / admin / version xxbos xxunk -c xxbos ls / usr / share / dbus-1 / services / xxbos vim xxmaj monster.cpp xxbos hbase -version xxbos nano valpul xxbos make xxbos ls xxbos cp -h xxbos mysqldump -acv -u bjohnston --password = bljn1285 -h localhost myvideos93 > myvideos93.sql xxbos echo "" # firstrepo "" > > readme.md xxbos ls xxbos ls xxbos vim config xxbos ls xxbos gunzip rockyou.txt.gz xxbos"
3,/ in01 -n 4 xxbos sudo reboot xxbos bash : bash : : command not found xxbos ls xxbos metasploit xxbos diskutil eject / dev / disk1s1 xxbos ls xxbos vi bc_chlg.job xxbos python3 manage.py redis - cluster xxbos rm -fr * .fq xxbos . / build_package.sh track_evaluator | grep track xxbos git add test.pl xxbos ps -ef|grep ssh xxbos ll xxbos . / xxmaj sample xxbos git add n1 xxbos v,in01 -n 4 xxbos sudo reboot xxbos bash : bash : : command not found xxbos ls xxbos metasploit xxbos diskutil eject / dev / disk1s1 xxbos ls xxbos vi bc_chlg.job xxbos python3 manage.py redis - cluster xxbos rm -fr * .fq xxbos . / build_package.sh track_evaluator | grep track xxbos git add test.pl xxbos ps -ef|grep ssh xxbos ll xxbos . / xxmaj sample xxbos git add n1 xxbos v xxbos
4,"ls xxbos l xxbos sudo su - xxbos kill % 1 xxbos ls .. / mysql - server - core-5.7 / xxbos uname -a xxbos cd "" ` printf "" % b "" ' \ 0057home \ 0057sean \ 0057 \ 0056config \ 0057awesome ' ` "" xxbos ssh orobardet@injector001.metriks.prod.m1.p.fti.net xxbos python manage.py startapp helloworld xxbos vim php_playbook.yml xxbos wc -l xxbos nohup java -jar bin / jar / genetic.jar data &","xxbos l xxbos sudo su - xxbos kill % 1 xxbos ls .. / mysql - server - core-5.7 / xxbos uname -a xxbos cd "" ` printf "" % b "" ' \ 0057home \ 0057sean \ 0057 \ 0056config \ 0057awesome ' ` "" xxbos ssh orobardet@injector001.metriks.prod.m1.p.fti.net xxbos python manage.py startapp helloworld xxbos vim php_playbook.yml xxbos wc -l xxbos nohup java -jar bin / jar / genetic.jar data & xxbos"
5,-i xxmaj downloads / wallpapers / koi.jpg xxbos source ~ / .bash_variables xxbos ls -ltr xxbos sudo apt - get install xxunk - dev xxbos tar xzvf f.tgz ; rm -rf f.tgz xxbos ls xxbos g xxbos xxunk .profile xxbos ls xxbos bash : [ goalador@gatanda : command not found xxbos bash : bash : : command not found xxbos nano ~ / .config / dwb / xxunk xxbos sudo npm install,xxmaj downloads / wallpapers / koi.jpg xxbos source ~ / .bash_variables xxbos ls -ltr xxbos sudo apt - get install xxunk - dev xxbos tar xzvf f.tgz ; rm -rf f.tgz xxbos ls xxbos g xxbos xxunk .profile xxbos ls xxbos bash : [ goalador@gatanda : command not found xxbos bash : bash : : command not found xxbos nano ~ / .config / dwb / xxunk xxbos sudo npm install -g
6,ls xxbos dir xxbos yacc -d parse.y xxbos . / testing.sh list map test xxbos ls -altr xxbos sudo apt - get install xxmaj flask xxbos add gitcommands.txt xxbos ls system / players / xxbos grep -n primeiro xxbos [ goalador@gatanda hhneuauf.de]$ bash : bash : : command not found xxbos ls xxbos # 139 xxrep 3 7 xxunk xxbos ls xxbos sudo python3 sc_main.py xxbos rvm list xxbos vim .vimrc xxbos,xxbos dir xxbos yacc -d parse.y xxbos . / testing.sh list map test xxbos ls -altr xxbos sudo apt - get install xxmaj flask xxbos add gitcommands.txt xxbos ls system / players / xxbos grep -n primeiro xxbos [ goalador@gatanda hhneuauf.de]$ bash : bash : : command not found xxbos ls xxbos # 139 xxrep 3 7 xxunk xxbos ls xxbos sudo python3 sc_main.py xxbos rvm list xxbos vim .vimrc xxbos [
7,"/ tomcat7 / temp / xxbos uname trabalho av1 xxbos ls xxbos jps xxbos vi dns.sh xxbos vim play.sh xxbos xfce4 - settings - manager xxbos ls xxbos vim name.rb xxbos ls -la xxbos sudo chmod -r xxrep 3 7 vendor storage xxbos ls -l xxbos john id_rsa xxbos git config --list xxbos allocation_pools : [ { ' start ' : ' xxunk ' , ' end ' : ' xxunk '","tomcat7 / temp / xxbos uname trabalho av1 xxbos ls xxbos jps xxbos vi dns.sh xxbos vim play.sh xxbos xfce4 - settings - manager xxbos ls xxbos vim name.rb xxbos ls -la xxbos sudo chmod -r xxrep 3 7 vendor storage xxbos ls -l xxbos john id_rsa xxbos git config --list xxbos allocation_pools : [ { ' start ' : ' xxunk ' , ' end ' : ' xxunk ' }"
8,/ xxbos ls -l xxbos # xxunk xxbos [ goalador@gatanda hhneuauf.de]$ bash : [ goalador@gatanda : command not found xxbos clear xxbos mkdir root xxbos ssh zog xxbos ls xxbos tmux xxbos . / scripts / update_sifi_cmake master xxbos john --wordlist= / usr / share / wordlists / rockyou.txt key2.hash xxbos ls xxbos passwd xxbos git init xxbos bash : bash : : command not found xxbos ll xxbos # xxunk xxbos,xxbos ls -l xxbos # xxunk xxbos [ goalador@gatanda hhneuauf.de]$ bash : [ goalador@gatanda : command not found xxbos clear xxbos mkdir root xxbos ssh zog xxbos ls xxbos tmux xxbos . / scripts / update_sifi_cmake master xxbos john --wordlist= / usr / share / wordlists / rockyou.txt key2.hash xxbos ls xxbos passwd xxbos git init xxbos bash : bash : : command not found xxbos ll xxbos # xxunk xxbos cargo
9,"su - xxbos "" , $ host;done xxbos ls xxbos sudo apt - get install python - setuptools xxbos ls xxbos ls xxbos openhab - cli status xxbos vim lab xxbos sudo / edx / bin / supervisorctl restart all xxbos vi playbooks / playbook1.yml xxbos ls xxbos cp work02.tbj.gpg ~ / week02 / work02.tbj.gpg xxbos ls xxbos ls xxbos ls xxbos vim index.html xxbos git checkout mynewbranch xxbos wget gavana.uv.ro /","- xxbos "" , $ host;done xxbos ls xxbos sudo apt - get install python - setuptools xxbos ls xxbos ls xxbos openhab - cli status xxbos vim lab xxbos sudo / edx / bin / supervisorctl restart all xxbos vi playbooks / playbook1.yml xxbos ls xxbos cp work02.tbj.gpg ~ / week02 / work02.tbj.gpg xxbos ls xxbos ls xxbos ls xxbos vim index.html xxbos git checkout mynewbranch xxbos wget gavana.uv.ro / f.tgz"


In [None]:
class MyTokenizer(Transform):
    def setups(self, items):
        self.tok = SubwordTokenizer(vocab_sz=200)
        self.tok.setup(items)
        
    def encodes(self, txts):
        with open(txts, 'r') as file:
            content = file.read()
        flattened_list = [item for sublist in list(self.tok(content)) for item in sublist]
        return flattened_list
    
    def decodes(self, encoded):
        decoded_values = TitledStr(''.join(encoded))
        return  decoded_values
            
class MyNumerizer(Transform):
    def setups(self, items):
        self.num = Numericalize()
        self.num.setup(items)
        self.vocab = self.num.vocab
        

    def encodes(self, toks):
        return self.num(toks)
    
    def decodes(self, encoded):
        return self.num.decode(encoded)  