In [1]:
import torch
import torch.nn as nn
import numpy as np
import pickle
from fastai.text.all import *

In [2]:
if torch.cuda.is_available():
  device = "cuda:0"
else:
  device = "cpu"
device

'cuda:0'

In [3]:
def scaled_dot_product_attention(query, key, value, mask=None):
    dim_k = query.size(-1)
    scores = torch.bmm(query, key.transpose(1, 2)) / np.sqrt(dim_k)
    if mask is not None:
        if scores.shape[1] == mask.shape[1]:
            scores = scores.masked_fill(mask == 0, float("-inf"))
        else:
            mask = torch.tril(torch.ones(scores.shape[1], scores.shape[1])).unsqueeze(0).to(device)
            scores = scores.masked_fill(mask == 0, float("-inf"))
    weights = F.softmax(scores, dim=-1)
    return weights.bmm(value)

class AttentionHead(nn.Module):
    def __init__(self, embed_dim, head_dim, vocab_size):
        super().__init__()
        self.q = nn.Linear(embed_dim, head_dim)
        self.k = nn.Linear(embed_dim, head_dim)
        self.v = nn.Linear(embed_dim, head_dim)
        self.mask = torch.tril(torch.ones(vocab_size, vocab_size)).unsqueeze(0).to(device)

    def forward(self, hidden_state):
        attn_outputs = scaled_dot_product_attention(
            self.q(hidden_state), self.k(hidden_state), self.v(hidden_state), self.mask)
        return attn_outputs

class MultiHeadAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        embed_dim = config.hidden_size
        num_heads = config.num_attention_heads
        head_dim = embed_dim // num_heads
        self.heads = nn.ModuleList(
            [AttentionHead(embed_dim, head_dim, 72) for _ in range(num_heads)]
        )
        self.output_linear = nn.Linear(embed_dim, embed_dim)

    def forward(self, hidden_state):
        x = torch.cat([h(hidden_state) for h in self.heads], dim=-1)
        x = self.output_linear(x)
        return x

class FeedForward(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.linear_1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.linear_2 = nn.Linear(config.intermediate_size, config.hidden_size)
        self.gelu = nn.GELU()
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        
    def forward(self, x):
        x = self.linear_1(x)
        x = self.gelu(x)
        x = self.linear_2(x)
        x = self.dropout(x)
        return x

class TransformerEncoderLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.layer_norm_1 = nn.LayerNorm(config.hidden_size)
        self.layer_norm_2 = nn.LayerNorm(config.hidden_size)
        self.attention = MultiHeadAttention(config)
        self.feed_forward = FeedForward(config)

    def forward(self, x):
        # Apply layer normalization and then copy input into query, key, value
        hidden_state = self.layer_norm_1(x)
        # Apply attention with a skip connection
        x = x + self.attention(hidden_state)
        # Apply feed-forward layer with a skip connection
        x = x + self.feed_forward(self.layer_norm_2(x))
        return x

class Embeddings(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.token_embeddings = nn.Embedding(config.vocab_size, 
                                             config.hidden_size)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings,
                                                config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=1e-12)
        self.dropout = nn.Dropout()

    def forward(self, input_ids):
        # Create position IDs for input sequence
        seq_length = input_ids.size(1)
        position_ids = torch.arange(seq_length, dtype=torch.long).unsqueeze(0).to(device)
        # Create token and position embeddings
        token_embeddings = self.token_embeddings(input_ids)
        
        position_embeddings = self.position_embeddings(position_ids)
        # Combine token and position embeddings
        embeddings = token_embeddings + position_embeddings
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

class TransformerEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.embeddings = Embeddings(config)
        self.layers = nn.ModuleList([TransformerEncoderLayer(config) 
                                     for _ in range(config.num_hidden_layers)])

    def forward(self, x):
        x = self.embeddings(x)
        for layer in self.layers:
            x = layer(x)
        return x

class ShellTransformer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.encoder = TransformerEncoder(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        
    def forward(self, x):
        #print(x)
        x = self.encoder(x)#[:, 0, :] # select hidden state of [CLS] token
        #print(x)
        x = self.dropout(x)
        x = self.classifier(x)
        #print(x)
        return x

In [9]:
with open('/home/chris/University/gnn_project/dataset', 'rb') as fp:
    _ = pickle.load(fp)

In [10]:
_[0:10]

['nmap\n',
 'nmap -v 10.1.26.4\n',
 'nmap -v 10.1.26.9\n',
 'ssh --help\n',
 'ssh 10.1.26.9\n',
 'ssh 10.1.26.9 admin/123456\n',
 'ssh --help\n',
 'ssh 10.1.26.9\n',
 'ssh -l admin 10.1.26.9\n',
 'ssh admin@admin 10.1.26.9\n']

In [11]:
import os

# Given array of text elements
texts = _

# Create a folder to store the text files
folder_path = '/home/chris/University/gnn_project/data/'
os.makedirs(folder_path, exist_ok=True)

# Write each non-empty text element to a separate file
for i, text in enumerate(texts):
    # Remove trailing newline character
    text = text.rstrip('\n')
    
    # Check if text is not empty after stripping newline
    if text.strip():
        file_path = os.path.join(folder_path, f'text_{i}.txt')
        with open(file_path, 'w') as file:
            file.write(text)


In [12]:
txts = L(_)

In [13]:
len(txts)

203101

# Tokenizer 

### IMDB Tokenizer

In [None]:
class MyTokenizer(Transform):
    def setups(self, items):
        path = untar_data(URLs.IMDB)
        self.tok =  Tokenizer.from_folder(path)
        self.tok.setup(items)
        
    def encodes(self, txts):
        with open(txts, 'r') as file:
            content = file.read()
        return self.tok(content)
    
    def decodes(self, encoded):
        return self.tok.decode(encoded)
            
class MyNumerizer(Transform):
    def setups(self, items):
        self.num = Numericalize()
        self.num.setup(items)
        self.vocab = self.num.vocab

    def encodes(self, toks):
        return self.num(toks)
    
    def decodes(self, encoded):
        return self.num.decode(encoded)
    
limit = 10000
path_test = '/home/chris/University/gnn_project/'
tfms = [[MyTokenizer(),MyNumerizer()]]
files = get_text_files(path_test, folders = ['data'])
dsets = Datasets(files[:limit], tfms)
dls = dsets.dataloaders(dl_type=LMDataLoader, bs=4)

dls.show_batch(max_n=10)

Unnamed: 0,text,text_
0,"xxbos cmake build xxbos trabalho xxbos make xxbos watch -n 1 "" syscoind getinfo & & tail -n 20 debug.log "" xxbos pwd xxbos java xxmaj xxunk xxbos sudo systemctl stop network - manager ; sleep xxunk systemctl restart network - manager xxbos set xxup xxunk xxbos chmod + x xxunk xxbos grep xxunk xxunk xxunk | sed -i 's / xxup xxunk / xxup xxunk / xxup g ' xxbos msfconsole","cmake build xxbos trabalho xxbos make xxbos watch -n 1 "" syscoind getinfo & & tail -n 20 debug.log "" xxbos pwd xxbos java xxmaj xxunk xxbos sudo systemctl stop network - manager ; sleep xxunk systemctl restart network - manager xxbos set xxup xxunk xxbos chmod + x xxunk xxbos grep xxunk xxunk xxunk | sed -i 's / xxup xxunk / xxup xxunk / xxup g ' xxbos msfconsole xxbos"
1,"read xxbos sudo reboot xxbos rvm -v xxbos ls xxbos ls xxbos python3 xxunk xxbos "" xxunk - xxunk "" : "" xxunk "" , xxbos gwsh xxunk xxbos vim xxunk xxbos bash : [ goalador@gatanda : command not found xxbos ls xxbos bg xxunk xxbos rm bin / info_pc1_cesar xxbos make xxbos wget xxbos [ goalador@gatanda hhneuauf.de]$ [ goalador@gatanda hhneuauf.de]$ [ goalador@gatanda hhneuauf.de]$ [ chg ] xxmaj controller xxup c8 :","xxbos sudo reboot xxbos rvm -v xxbos ls xxbos ls xxbos python3 xxunk xxbos "" xxunk - xxunk "" : "" xxunk "" , xxbos gwsh xxunk xxbos vim xxunk xxbos bash : [ goalador@gatanda : command not found xxbos ls xxbos bg xxunk xxbos rm bin / info_pc1_cesar xxbos make xxbos wget xxbos [ goalador@gatanda hhneuauf.de]$ [ goalador@gatanda hhneuauf.de]$ [ goalador@gatanda hhneuauf.de]$ [ chg ] xxmaj controller xxup c8 : f7:33:33"
2,"xxunk admin xxbos vim xxunk xxbos cat xxunk xxbos ls -a xxbos git clone ssh : / / xxunk / xxunk / git / xxunk xxbos sudo vi xxunk xxbos clear xxbos ls xxbos mysql -h alas -u ml12087 -p xxbos git xxunk xxbos [ goalador@gatanda hhneuauf.de]$ bash : bash : : command not found xxbos sudo xxbos git log xxbos test "" $ user "" = "" xxunk "" & &","admin xxbos vim xxunk xxbos cat xxunk xxbos ls -a xxbos git clone ssh : / / xxunk / xxunk / git / xxunk xxbos sudo vi xxunk xxbos clear xxbos ls xxbos mysql -h alas -u ml12087 -p xxbos git xxunk xxbos [ goalador@gatanda hhneuauf.de]$ bash : bash : : command not found xxbos sudo xxbos git log xxbos test "" $ user "" = "" xxunk "" & & echo"
3,"xxrep 3 x xxrep 3 c xxrep 3 v xxunk xxrep 4 n xxrep 3 m xxrep 4 , * / / xxrep 6 1 xxrep 6 ' xxup xxunk ! xxbos make xxbos bash : bash : : command not found xxbos reboot xxbos ls -la xxbos httpd xxbos mkdir .ssh xxbos mplayer xxunk xxbos node app xxbos run xxbos vi send.py xxbos ls xxbos python da.py xxbos dunst xxbos clear","3 x xxrep 3 c xxrep 3 v xxunk xxrep 4 n xxrep 3 m xxrep 4 , * / / xxrep 6 1 xxrep 6 ' xxup xxunk ! xxbos make xxbos bash : bash : : command not found xxbos reboot xxbos ls -la xxbos httpd xxbos mkdir .ssh xxbos mplayer xxunk xxbos node app xxbos run xxbos vi send.py xxbos ls xxbos python da.py xxbos dunst xxbos clear xxbos"


In [None]:
print("Length:",len(dls.vocab),dls.vocab)

Length: 1400 ['xxunk', 'xxpad', 'xxbos', 'xxeos', 'xxfld', 'xxrep', 'xxwrep', 'xxup', 'xxmaj', '/', 'ls', '-', ':', 'sudo', '"', 'git', '.', 'vim', '[', 'bash', 'goalador@gatanda', "'", '\\', 'hhneuauf.de]$', '#', 'command', 'not', 'vi', 'found', 'cat', 'make', 'nano', 'rm', 'clear', 'python', 'install', '|', 'll', 'apt', '3', '*', '>', 'ssh', 'etc', '-a', '0', 'add', '$', 'exit', '1', ';', '-p', 'bin', 'cd', 'grep', 'home', '=', 'status', '-l', '..', '-i', 'echo', '~', '&', '4', 'get', '-f', 'docker', 'pwd', '-u', 'push', 'run', 'nmap', '-v', 'mv', ',', '-r', 'commit', '172.18.1.5', '-h', 'set', 'master', '-m', ')', '{', 'ps', 'usr', '-d', 'origin', 'find', '}', 'mkdir', 'config', 'app', '(', 'ifconfig', 'node', 'cp', 'su', 'chmod', '-ltr', '`', 'less', '-t', 'log', '-s', '2', '-o', 'server', '-la', ']', 'dev', 'man', 'lib', 'python3', 'scp', 'a', 'test', 'php', 'w', 'root', 'update', 'service', 'tar', 'x', 'systemctl', '-rf', 'sh', '.invoices2019.zip', 'remote', 'pull', 'gcc', 'explo

### SubwordTokenizer

In [None]:
class MyTokenizer(Transform):
    def setups(self, items):
        self.tok = SubwordTokenizer(vocab_sz=200)
        self.tok.setup(items)
        
    def encodes(self, txts):
        with open(txts, 'r') as file:
            content = file.read()
        flattened_list = [item for sublist in list(self.tok(content)) for item in sublist]
        return flattened_list
    
    def decodes(self, encoded):
        decoded_values = TitledStr(''.join(encoded))
        return  decoded_values
            
class MyNumerizer(Transform):
    def setups(self, items):
        self.num = Numericalize()
        self.num.setup(items)
        self.vocab = self.num.vocab
        

    def encodes(self, toks):
        return self.num(toks)
    
    def decodes(self, encoded):
        return self.num.decode(encoded)  

limit = 1000
path_test = '/home/chris/University/gnn_project/'
tfms = [[MyTokenizer(),MyNumerizer()]]
files = get_text_files(path_test, folders = ['data'])
dsets = Datasets(files[:limit], tfms)
dls = dsets.dataloaders(dl_type=LMDataLoader, bs=64)

dls.show_batch(max_n=10)

Unnamed: 0,text,text_
0,▁n▁o▁d▁e▁b▁r▁e▁w▁a▁l▁i▁a▁s▁d▁e▁f▁a▁u▁l▁t▁v▁0▁.▁8▁.▁1▁5▁l▁s▁s▁h▁o▁w▁o▁p▁t,n▁o▁d▁e▁b▁r▁e▁w▁a▁l▁i▁a▁s▁d▁e▁f▁a▁u▁l▁t▁v▁0▁.▁8▁.▁1▁5▁l▁s▁s▁h▁o▁w▁o▁p▁t▁
1,▁3▁2▁5▁3▁0▁4▁2▁m▁a▁k▁e▁s▁u▁d▁o▁s▁e▁r▁v▁i▁c▁e▁d▁o▁c▁k▁e▁r▁h▁e▁l▁p▁R▁U▁N▁A,3▁2▁5▁3▁0▁4▁2▁m▁a▁k▁e▁s▁u▁d▁o▁s▁e▁r▁v▁i▁c▁e▁d▁o▁c▁k▁e▁r▁h▁e▁l▁p▁R▁U▁N▁A▁
2,▁a▁r▁e▁m▁e▁t▁a▁l▁n▁o▁d▁e▁l▁i▁s▁t▁l▁e▁s▁s▁c▁u▁t▁o▁f▁f▁.▁p▁y▁m▁a▁n▁t▁p▁u▁t,a▁r▁e▁m▁e▁t▁a▁l▁n▁o▁d▁e▁l▁i▁s▁t▁l▁e▁s▁s▁c▁u▁t▁o▁f▁f▁.▁p▁y▁m▁a▁n▁t▁p▁u▁t▁
3,▁e▁x▁i▁t▁m▁k▁d▁i▁r▁r▁u▁n▁l▁s▁t▁r▁i▁n▁i▁t▁y▁_▁t▁e▁s▁t▁/▁v▁i▁c▁o▁n▁c▁a▁t▁A,e▁x▁i▁t▁m▁k▁d▁i▁r▁r▁u▁n▁l▁s▁t▁r▁i▁n▁i▁t▁y▁_▁t▁e▁s▁t▁/▁v▁i▁c▁o▁n▁c▁a▁t▁A▁
4,"▁a▁l▁l▁r▁m▁-▁r▁f▁e▁x▁p▁.▁t▁g▁z▁g▁i▁t▁c▁o▁m▁m▁i▁t▁-▁m▁""▁u▁p▁d▁a▁t▁e▁s▁""▁e","a▁l▁l▁r▁m▁-▁r▁f▁e▁x▁p▁.▁t▁g▁z▁g▁i▁t▁c▁o▁m▁m▁i▁t▁-▁m▁""▁u▁p▁d▁a▁t▁e▁s▁""▁e▁"
5,▁i▁t▁]▁$▁>▁>▁[▁g▁o▁a▁l▁a▁d▁o▁r▁@▁g▁a▁t▁a▁n▁d▁a▁c▁o▁r▁p▁w▁e▁b▁s▁i▁t▁e▁-▁s,i▁t▁]▁$▁>▁>▁[▁g▁o▁a▁l▁a▁d▁o▁r▁@▁g▁a▁t▁a▁n▁d▁a▁c▁o▁r▁p▁w▁e▁b▁s▁i▁t▁e▁-▁s▁
6,▁.▁/▁s▁c▁a▁n▁2▁1▁6▁.▁8▁9▁;▁l▁a▁d▁o▁r▁@▁g▁a▁t▁a▁n▁d▁a▁h▁h▁n▁e▁u▁a▁u▁f▁.▁b,.▁/▁s▁c▁a▁n▁2▁1▁6▁.▁8▁9▁;▁l▁a▁d▁o▁r▁@▁g▁a▁t▁a▁n▁d▁a▁h▁h▁n▁e▁u▁a▁u▁f▁.▁b▁
7,▁m▁e▁s▁o▁s▁p▁h▁e▁r▁e▁/▁m▁a▁r▁a▁t▁h▁o▁n▁/▁a▁p▁i▁/▁S▁y▁s▁t▁e▁m▁R▁e▁s▁o▁u▁r,m▁e▁s▁o▁s▁p▁h▁e▁r▁e▁/▁m▁a▁r▁a▁t▁h▁o▁n▁/▁a▁p▁i▁/▁S▁y▁s▁t▁e▁m▁R▁e▁s▁o▁u▁r▁
8,▁4▁-▁0▁6▁-▁4▁4▁_▁e▁d▁i▁t▁.▁m▁k▁v▁.▁m▁k▁v▁s▁u▁d▁o▁n▁e▁t▁s▁t▁a▁t▁-▁p▁l▁a▁n,4▁-▁0▁6▁-▁4▁4▁_▁e▁d▁i▁t▁.▁m▁k▁v▁.▁m▁k▁v▁s▁u▁d▁o▁n▁e▁t▁s▁t▁a▁t▁-▁p▁l▁a▁n▁
9,▁r▁t▁i▁e▁s▁-▁c▁o▁m▁m▁o▁n▁:▁q▁a▁d▁d▁u▁s▁e▁r▁e▁c▁o▁l▁l▁e▁c▁t▁l▁s▁v▁i▁m▁/▁u,r▁t▁i▁e▁s▁-▁c▁o▁m▁m▁o▁n▁:▁q▁a▁d▁d▁u▁s▁e▁r▁e▁c▁o▁l▁l▁e▁c▁t▁l▁s▁v▁i▁m▁/▁u▁


In [None]:
print("Length:",len(dls.vocab),dls.vocab)

Length: 80 ['xxunk', 'xxpad', 'xxbos', 'xxeos', 'xxfld', 'xxrep', 'xxwrep', 'xxup', 'xxmaj', '▁', 'a', 't', 'o', 's', 'e', 'i', 'n', 'r', 'l', 'c', '-', 'p', 'd', 'm', '.', 'g', '1', 'h', '2', 'f', 'u', '0', 'b', 'v', '/', '8', '_', '3', 'k', ':', 'y', '4', '7', '5', 'w', 'R', 'D', '9', '6', 'V', 'T', 'S', 'B', 'H', 'E', '#', '"', 'A', 'I', 'x', 'N', 'z', '@', 'P', ';', 'O', 'q', '`', 'X', 'G', 'Z', '|', 'xxfake', 'xxfake', 'xxfake', 'xxfake', 'xxfake', 'xxfake', 'xxfake', 'xxfake']


### BaseTokenizer

In [None]:
class MyTokenizer(Transform):
    def setups(self, items):
        self.tok = BaseTokenizer()
        #self.tok.setup(items)
        
    def encodes(self, txts):
        with open(txts, 'r') as file:
            content = file.read()
        flattened_list = [item for sublist in list(self.tok(content)) for item in sublist]
        return flattened_list
    
    def decodes(self, encoded):
        decoded_values = TitledStr(''.join(encoded))
        return  decoded_values
            
class MyNumerizer(Transform):
    def setups(self, items):
        self.num = Numericalize()
        self.num.setup(items)
        self.vocab = self.num.vocab
        

    def encodes(self, toks):
        return self.num(toks)
    
    def decodes(self, encoded):
        return self.num.decode(encoded)  

limit = 100
path_test = '/home/chris/University/gnn_project/'
tfms = [[MyTokenizer(),MyNumerizer()]]
files = get_text_files(path_test, folders = ['data'])
dsets = Datasets(files[:limit], tfms)
dls = dsets.dataloaders(dl_type=LMDataLoader, bs=64)

dls.show_batch(max_n=10)

Unnamed: 0,text,text_
0,gitconfig--global--add,itconfig--global--add
1,web.browser/opt/firefox/fire,web.browser/opt/firefox/firef
2,foxclearwgetpilotu.110mb.com/,oxclearwgetpilotu.110mb.com/R
3,RaZvaNBv.tgz;tarxvfRaZvaNBv,aZvaNBv.tgz;tarxvfRaZvaNBv.
4,.tgz;rm-rfRaZvaNBv.tgz;cd,tgz;rm-rfRaZvaNBv.tgz;cd.
5,.tmp;./startprintubutignome-t,tmp;./startprintubutignome-te
6,"erminal--role""gnome""sudop","rminal--role""gnome""sudopy"
7,"ythongitcommit-m""firstc","thongitcommit-m""firstco"
8,"ommit""ifconfig#1357789494ls","mmit""ifconfig#1357789494ls-"
9,-lpython3xxunkombined.pycdmysql,lpython3xxunkombined.pycdmysql


In [None]:
print("Length:",len(dls.vocab),dls.vocab)

Length: 80 ['xxunk', 'xxpad', 'xxbos', 'xxeos', 'xxfld', 'xxrep', 'xxwrep', 'xxup', 'xxmaj', '', 'a', 't', 'o', 's', 'e', 'i', 'n', 'r', 'l', 'c', '-', 'p', 'd', 'm', '.', 'g', '1', 'h', '2', 'f', 'u', '0', 'b', 'v', '/', '8', '_', '3', 'k', ':', 'y', '4', '7', '5', 'w', 'R', 'D', '9', '6', 'V', 'T', 'S', 'B', 'H', 'E', '#', '"', 'A', 'I', 'x', 'N', 'z', '@', 'P', ';', 'O', 'q', '`', 'X', 'G', 'Z', '|', 'xxfake', 'xxfake', 'xxfake', 'xxfake', 'xxfake', 'xxfake', 'xxfake', 'xxfake']


### SpacyTokenizer

In [None]:
class MyTokenizer(Transform):
    def setups(self, items):
        self.tok = SpacyTokenizer()
        #self.tok.setup(items)
        
    def encodes(self, txts):
        with open(txts, 'r') as file:
            content = file.read()
        flattened_list = [item for sublist in list(self.tok(content)) for item in sublist]
        return flattened_list
    
    def decodes(self, encoded):
        decoded_values = TitledStr(''.join(encoded))
        return  decoded_values
            
class MyNumerizer(Transform):
    def setups(self, items):
        self.num = Numericalize()
        self.num.setup(items)
        self.vocab = self.num.vocab
        

    def encodes(self, toks):
        return self.num(toks)
    
    def decodes(self, encoded):
        return self.num.decode(encoded)  
    
limit = 100
path_test = '/home/chris/University/gnn_project/'
tfms = [[MyTokenizer(),MyNumerizer()]]
files = get_text_files(path_test, folders = ['data'])
dsets = Datasets(files[:limit], tfms)
dls = dsets.dataloaders(dl_type=LMDataLoader, bs=64)

dls.show_batch(max_n=10)

Unnamed: 0,text,text_
0,sudo raspi-configfind ./ -nam,udo raspi-configfind ./ -name
1,"e ""xxunk.fq.gz"" |zcat | grep xxunkHxxunkI","""xxunk.fq.gz"" |zcat | grep xxunkHxxunkIxxunk"
2,xxunkSIxxunklador@gatanda hhneuauf.ba,SIxxunklador@gatanda hhneuauf.bas
3,sh: bash:: command not founds,h: bash:: command not foundsu
4,udo ifdown eth9bash: xxunkgoalado,do ifdown eth9bash: xxunkgoalador
5,r@gatanda: command not found#,@gatanda: command not found#1
6,1517115588pwdwget pilotu.110m,517115588pwdwget pilotu.110mb
7,b.com/RaZvaNBv.tgz;tar xvf Ra,.com/RaZvaNBv.tgz;tar xvf RaZ
8,ZvaNBv.tgz;rm -rf RaZvaNBv.tg,vaNBv.tgz;rm -rf RaZvaNBv.tgz
9,z;cd .tmp;./start printubutic,;cd .tmp;./start printubutica


In [None]:
print("Length:",len(dls.vocab),dls.vocab)

Length: 80 ['xxunk', 'xxpad', 'xxbos', 'xxeos', 'xxfld', 'xxrep', 'xxwrep', 'xxup', 'xxmaj', ' ', 'a', 't', 'o', 's', 'e', 'i', 'n', 'r', 'l', 'c', '-', 'p', 'd', 'm', '.', 'g', '1', 'h', '2', 'f', 'u', '0', 'b', 'v', '/', '8', '_', '3', 'k', ':', 'y', '4', '7', '5', 'w', 'R', 'D', '9', '6', 'V', 'T', 'S', 'B', 'H', 'E', '#', '"', 'A', 'I', 'x', 'N', 'z', '@', 'P', ';', 'O', 'q', '`', 'X', 'G', 'Z', '|', 'xxfake', 'xxfake', 'xxfake', 'xxfake', 'xxfake', 'xxfake', 'xxfake', 'xxfake']


### WordTokenize

In [None]:
class MyTokenizer(Transform):
    def setups(self, items):
        self.tok = WordTokenizer()
        #self.tok.setup(items)
        
    def encodes(self, txts):
        with open(txts, 'r') as file:
            content = file.read()
        flattened_list = [item for sublist in list(self.tok(content)) for item in sublist]
        return flattened_list
    
    def decodes(self, encoded):
        decoded_values = TitledStr(''.join(encoded))
        return  decoded_values
            
class MyNumerizer(Transform):
    def setups(self, items):
        self.num = Numericalize()
        self.num.setup(items)
        self.vocab = self.num.vocab
        

    def encodes(self, toks):
        return self.num(toks)
    
    def decodes(self, encoded):
        return self.num.decode(encoded)  

limit = 10000
path_test = '/home/chris/University/gnn_project/'
tfms = [[MyTokenizer(),MyNumerizer()]]
files = get_text_files(path_test, folders = ['data'])
dsets = Datasets(files[:limit], tfms)
dls = dsets.dataloaders(dl_type=LMDataLoader, bs=64)

dls.show_batch(max_n=10)

Unnamed: 0,text,text_
0,php server.php#1473555325vim mysum.shhistoryservice redis-server stopfor,hp server.php#1473555325vim mysum.shhistoryservice redis-server stopfor
1,udo pacman -R docker docker-composevim /usr/share/dbus-1/services/ vim t,do pacman -R docker docker-composevim /usr/share/dbus-1/services/ vim tu
2,sername@10.1.26.255lsiptables -Lmakesudo yum install jenkinsvim pintosap,ername@10.1.26.255lsiptables -Lmakesudo yum install jenkinsvim pintosapr
3,cp profile2vim src/genetic/Main.java lscomandogit add *[goalador@gatanda,p profile2vim src/genetic/Main.java lscomandogit add *[goalador@gatanda
4,"squitto_pub -h 127.0.0.1 -t ""application/1/node/0000000000000000/tx"" -m","quitto_pub -h 127.0.0.1 -t ""application/1/node/0000000000000000/tx"" -m """
5,-lhlslsexitexport LC_ALL=en_US.utf8exploitsudo reflector --verbose --lat,lhlslsexitexport LC_ALL=en_US.utf8exploitsudo reflector --verbose --late
6,it pull --rebasevim zerador.shsu ./server -p 3490lsuname -r | cut -c 1mk,t pull --rebasevim zerador.shsu ./server -p 3490lsuname -r | cut -c 1mkd
7,sigc/EventLevel.pm lllssort testeps auxtrizen -Ss digikamls -ltr /isiseq,igc/EventLevel.pm lllssort testeps auxtrizen -Ss digikamls -ltr /isiseqr
8,d ..lssu -composer install --no-devps -fpython popel.py lsl temp/#150946,..lssu -composer install --no-devps -fpython popel.py lsl temp/#1509464
9,.akamai.com:8443/api/v1/se/deployments?environment=qa&isDeploy=true | jq,akamai.com:8443/api/v1/se/deployments?environment=qa&isDeploy=true | jq


In [22]:
class MyTokenizer(Transform):
    def setups(self, items):
        self.tok = WordTokenizer()
        #self.tok.setup(items)
        
    def encodes(self, txts):
        with open(txts, 'r') as file:
            content = file.read()
        flattened_list = [item for sublist in list(self.tok(content)) for item in sublist]
        return flattened_list
    
    def decodes(self, encoded):
        decoded_values = TitledStr(''.join(encoded))
        return  decoded_values
            
class MyNumerizer(Transform):
    def setups(self, items):
        self.num = Numericalize()
        self.num.setup(items)
        self.vocab = self.num.vocab
        

    def encodes(self, toks):
        return self.num(toks)
    
    def decodes(self, encoded):
        return self.num.decode(encoded)  

limit = -1
path = untar_data(URLs.IMDB)
tfms = [[Tokenizer.from_folder(path), Numericalize]]
files = get_text_files(path, folders = ['train', 'test'])
#splits = GrandparentSplitter(valid_name='test')(files)
dsets = Datasets(files[:], tfms)#, splits=splits)
dls = dsets.dataloaders(dl_type=LMDataLoader, before_batch=pad_input)

dls.show_batch(max_n=10)

Unnamed: 0,text,text_
0,"xxbos i really liked the xxmaj far xxmaj cry game , nice graphics , good level - design , interesting and clever enemies , above - average length and even a somewhat decent plot . i am not by default against movie spin - offs of games . i thought "" doom xxmaj the xxmaj movie "" was hilarious . xxmaj but what xxmaj uwe xxmaj boll has done here , is","i really liked the xxmaj far xxmaj cry game , nice graphics , good level - design , interesting and clever enemies , above - average length and even a somewhat decent plot . i am not by default against movie spin - offs of games . i thought "" doom xxmaj the xxmaj movie "" was hilarious . xxmaj but what xxmaj uwe xxmaj boll has done here , is to"
1,"in the rental store and someone asked me if the movie was worth the three bucks for the rental , xxmaj i 'd have to say no . xxmaj the plot was implausible . xxmaj i 've come to the conclusion that xxmaj keanu xxmaj reeves can not act . xxmaj he can , at times , be painful to watch ( though my wife thinks he 's cute ) . xxmaj","the rental store and someone asked me if the movie was worth the three bucks for the rental , xxmaj i 'd have to say no . xxmaj the plot was implausible . xxmaj i 've come to the conclusion that xxmaj keanu xxmaj reeves can not act . xxmaj he can , at times , be painful to watch ( though my wife thinks he 's cute ) . xxmaj dan"
2,"reading on the spot . xxmaj just lifeless ! xxmaj not only that , but he appears in desperate need of a blood transfusion or something . xxmaj he looks wan and sickly throughout and is several pounds smaller than most of his female costars . xxmaj robin xxmaj stone should be a hunk , not a hankie . \n\n xxmaj for anyone finding the film hard going ( it 's rather","on the spot . xxmaj just lifeless ! xxmaj not only that , but he appears in desperate need of a blood transfusion or something . xxmaj he looks wan and sickly throughout and is several pounds smaller than most of his female costars . xxmaj robin xxmaj stone should be a hunk , not a hankie . \n\n xxmaj for anyone finding the film hard going ( it 's rather slow"
3,"this movie and are a fan of xxmaj jackie xxmaj chan or action in general , give yourself a treat and watch this movie . xxmaj it is truly sensational . xxbos mcconaughey in a horror / thriller ? i had to see this . i was pleasantly surprised . \n\n xxmaj the plot is told in flashback mode , and it concerns an otherwise normal and happy family of three going","movie and are a fan of xxmaj jackie xxmaj chan or action in general , give yourself a treat and watch this movie . xxmaj it is truly sensational . xxbos mcconaughey in a horror / thriller ? i had to see this . i was pleasantly surprised . \n\n xxmaj the plot is told in flashback mode , and it concerns an otherwise normal and happy family of three going through"
4,"federal reserve ( i think ) . xxmaj the idea is to drive both trucks to a warehouse , stash the cash , then stage a hijack . xxmaj sure , the cops will suspect them , but if they stick together they 'll get through it . \n\n xxmaj trouble is , one of the six , played by xxmaj columbus xxmaj short , is a xxunk . xxmaj at first","reserve ( i think ) . xxmaj the idea is to drive both trucks to a warehouse , stash the cash , then stage a hijack . xxmaj sure , the cops will suspect them , but if they stick together they 'll get through it . \n\n xxmaj trouble is , one of the six , played by xxmaj columbus xxmaj short , is a xxunk . xxmaj at first ."
5,out of 10 and i would 've given it a 1 out of 10 if the story did'nt have anything to do with xxmaj bufford xxmaj pusser 's life but it did and that why i had given this movie a 3 out 10 . \n\n i strongly suggest that anyone who is planning on watching this cheese i suggest do n't and watch the first sequel instead . xxbos xxmaj this,of 10 and i would 've given it a 1 out of 10 if the story did'nt have anything to do with xxmaj bufford xxmaj pusser 's life but it did and that why i had given this movie a 3 out 10 . \n\n i strongly suggest that anyone who is planning on watching this cheese i suggest do n't and watch the first sequel instead . xxbos xxmaj this has
6,"? xxmaj these are not easy questions and most of us will feel uncomfortable with them . xxmaj as an artistic piece , this movie is really a forgotten and rough gem . xxmaj the script progresses with extreme simplicity , albeit some sappiness , but never pulling any punches to state its message , although by today standards , it is somewhat slow . xxmaj the photography is beautiful and it","xxmaj these are not easy questions and most of us will feel uncomfortable with them . xxmaj as an artistic piece , this movie is really a forgotten and rough gem . xxmaj the script progresses with extreme simplicity , albeit some sappiness , but never pulling any punches to state its message , although by today standards , it is somewhat slow . xxmaj the photography is beautiful and it has"
7,"sorority chicks , and xxmaj satan himself ( an ironically - cast xxmaj dave xxmaj grohl ) , in a climactic sequence that has to be seen to be believed ( and preferably played at high volume ) . xxmaj rarely do i see comedies in the theater , but "" pick "" is an extremely nice change of pace … it may not go up to 11 , but it hums","chicks , and xxmaj satan himself ( an ironically - cast xxmaj dave xxmaj grohl ) , in a climactic sequence that has to be seen to be believed ( and preferably played at high volume ) . xxmaj rarely do i see comedies in the theater , but "" pick "" is an extremely nice change of pace … it may not go up to 11 , but it hums a"
8,"xxmaj cage to the place where he was recognized in xxmaj hollywood for his talent . xxmaj from the music to the scenes at the opera to the kitchen table xxunk this is a very entertaining movie . xxbos xxmaj this movie is a disaster within a disaster film . xxmaj it is full of great action scenes , which are only meaningful if you throw away all sense of reality .","cage to the place where he was recognized in xxmaj hollywood for his talent . xxmaj from the music to the scenes at the opera to the kitchen table xxunk this is a very entertaining movie . xxbos xxmaj this movie is a disaster within a disaster film . xxmaj it is full of great action scenes , which are only meaningful if you throw away all sense of reality . xxmaj"
9,"lead actors closely resemble the two real - life killers . xxmaj robert xxmaj blake is more than convincing as xxmaj perry xxmaj smith , short and stocky with a bum leg , who dreams of finding xxmaj cortez ' buried treasure . xxmaj scott xxmaj wilson is almost as good as xxmaj dick xxmaj hickock , the smooth - talking con artist with an all - american smile . \n\n xxmaj","actors closely resemble the two real - life killers . xxmaj robert xxmaj blake is more than convincing as xxmaj perry xxmaj smith , short and stocky with a bum leg , who dreams of finding xxmaj cortez ' buried treasure . xxmaj scott xxmaj wilson is almost as good as xxmaj dick xxmaj hickock , the smooth - talking con artist with an all - american smile . \n\n xxmaj after"


In [23]:
len(dsets)

50000

In [5]:
print("Length:",len(dls.vocab),dls.vocab)



In [24]:
from transformers import AutoConfig
config = AutoConfig.from_pretrained('bert-base-uncased')

config.vocab_size = len(dls.vocab)
config.num_labels = len(dls.vocab)
config.hidden_size = 132
config.num_hidden_layers = 5
transformer = ShellTransformer(config)

In [25]:
config.vocab_size

60008

In [26]:
model = transformer

model.to(device)

dls.to(device)

learn = Learner(
    dls, 
    model, 
    loss_func=CrossEntropyLossFlat(), 
    metrics=[accuracy]
)

learn.fit_one_cycle(1, 1e-3)

epoch,train_loss,valid_loss,accuracy,time


  warn("Your generator is empty.")


In [27]:
vocab = dls.vocab

In [28]:
# Define a function for text generation
def generate_text(model, token_ids, max_length=10):
    token_ids = token_ids[0][0].to(device)
    input_ids = torch.tensor(token_ids).unsqueeze(0).to(device)  
    with torch.no_grad():
        for _ in range(max_length):
            outputs = model(input_ids)
            logits = outputs[:, -1, :]  
            next_token_id = torch.argmax(logits, dim=-1)
            token_ids = torch.cat((token_ids, next_token_id),dim=0)
            input_ids = torch.cat([input_ids, next_token_id.unsqueeze(0).to(device)], dim=-1)
    return token_ids

def decode_tokens(numerized_tokens, vocab):
    generated_test = [vocab[token] for token in numerized_tokens]
    return ' '.join(generated_test)


# Generate text
files = get_text_files('', folders = ['test'])
#vocab = dls.vocab
start_text_ids = Datasets(files, tfms)

generated_ids = generate_text(learn.model, start_text_ids)

path= '/home/paperspace/gnn_project/FastAI/test/text_generation.txt'

with open(path, 'r') as file:
     content = file.read()
     
print(content+decode_tokens(generated_ids,vocab))


  input_ids = torch.tensor(token_ids).unsqueeze(0).to(device)


The movie was good becausexxbos xxmaj the movie was good because i was n't sure that i was n't sure that


In [21]:
_[0:10]

['nmap\n',
 'nmap -v 10.1.26.4\n',
 'nmap -v 10.1.26.9\n',
 'ssh --help\n',
 'ssh 10.1.26.9\n',
 'ssh 10.1.26.9 admin/123456\n',
 'ssh --help\n',
 'ssh 10.1.26.9\n',
 'ssh -l admin 10.1.26.9\n',
 'ssh admin@admin 10.1.26.9\n']