In [None]:
import numpy as np
from collections import Counter
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords


In [None]:
# The pretrained word vectors `crawl-300d-2M.vec` can be downloaded from:
# https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip

In [None]:
# !cd '/content/drive/My Drive/Colab Notebooks/DLP_project' && cp 'crawl-300d-2M.vec.zip' /content/
# !unzip crawl-300d-2M.vec.zip

Archive:  crawl-300d-2M.vec.zip
  inflating: crawl-300d-2M.vec       


In [24]:
# Load the pretrained word vectors into a dict mapping word -> float32 vector.
EMBEDDING_DIM = 300
embedding_table = {}
# The .vec file is UTF-8 text; pin the encoding so parsing does not depend on
# the platform's default locale.
with open('crawl-300d-2M.vec', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i == 0: # header: vocabulary size and vector dimension
            print('words, dim =', line.split())
            continue

        # Each remaining line is "<word> <coef> <coef> ..."; split off the word
        # and parse the rest as space-separated floats.
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, 'f', sep=' ')
        embedding_table[word] = coefs

print('Found %s word vectors.' % len(embedding_table))

words, dim = ['1999995', '300']
Found 1999995 word vectors.


In [25]:
# Download the NLTK data packages this notebook relies on.
nltk.download('punkt')      # tokenizer models, needed by nltk.word_tokenize
nltk.download('stopwords')  # stop-word lists, needed by nltk.corpus.stopwords
nltk.download('wordnet')    # WordNet data, needed by WordNetLemmatizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [26]:
def _separate_words(text):
    # Replace every non-alphabetic character with a space and insert a space at
    # each lower->upper camelCase boundary,
    # e.g. 'dfs.PacketResponder' -> 'dfs Packet Responder'.
    pieces = []
    previous = ''
    for char in text:
        if not char.isalpha():
            pieces.append(' ')
        elif char.isupper() and previous.islower():
            pieces.append(' ' + char)
        else:
            pieces.append(char)
        previous = char
    return ''.join(pieces)


def template2tokens(templates):
    """Convert log templates into lists of normalized word tokens.

    Each template is cleaned (non-letters become spaces, camelCase words are
    split), lower-cased, tokenized with NLTK, stripped of English stop words,
    and lemmatized as verbs.

    templates: list of str
        e.g. ['PacketResponder <*> for block <*> terminating',
              'Received block <*> of size <*> from <*>']
    return: list of list of str, one token list per template
        e.g. [['packet', 'responder', 'block', 'terminate'],
              ['receive', 'block', 'size']]
    """
    # Build these once per call: the stop-word list is loaded by NLTK on every
    # `stopwords.words(...)` call, and a set makes each membership test O(1).
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    list_tokens = []
    for text in templates:
        # The first character is cleaned like any other, so a template that
        # starts with a wildcard such as '<*>' does not leak '<' as a token.
        tokens = nltk.word_tokenize(_separate_words(text).lower())
        tokens = [
            lemmatizer.lemmatize(token, 'v')
            for token in tokens
            if token not in stop_words
        ]
        list_tokens.append(tokens)

    return list_tokens

In [27]:
def calculate_freq(list_tokens, mode_idf=False, counter=None):
    """Count token occurrences over a collection of token lists.

    list_tokens: list of list of str
        e.g. [['packet', 'responder', 'block', 'terminate'],
              ['receive', 'block', 'size']]
    mode_idf: bool
        When true, each token is counted at most once per token list
        (document frequency); otherwise every occurrence is counted.
    counter: collections.Counter or None
        When omitted, a new counter is created. When given, that counter is
        updated in place (counts are aggregated into it) and returned.
    return: collections.Counter holding the accumulated counts
    """
    freq = Counter() if counter is None else counter

    for tokens in list_tokens:
        # A set collapses repeats so each list contributes one count per token.
        freq.update(set(tokens) if mode_idf else tokens)

    return freq

In [28]:
def template2vec(templates, embedding_table, counter_idf):
    """Embed each log template as a TF-IDF-weighted average of word vectors.

    templates: list of str
        e.g. ['PacketResponder <*> for block <*> terminating',
              'Received block <*> of size <*> from <*>']
    embedding_table: dict
        Maps words to vectors of dimension EMBEDDING_DIM.
    counter_idf: collections.Counter
        Per-token counts used for the idf term. The idf numerator is the sum
        of all counts in this counter.
    return: list of numpy arrays of length EMBEDDING_DIM, one per template.
        A template with no token found in `embedding_table` yields an
        all-zero vector.
    """
    # Loop-invariant: the idf numerator does not depend on the template.
    total_idf = sum(counter_idf.values())

    list_vectors = []

    for tokens in template2tokens(templates):
        vector_token = np.zeros(EMBEDDING_DIM)

        counter_tf = calculate_freq([tokens])
        total_tf = sum(counter_tf.values())
        num_valid_token = 0

        for token in tokens:
            # Out-of-vocabulary tokens contribute nothing and are not counted.
            if token not in embedding_table:
                continue

            num_valid_token += 1

            tf = counter_tf[token] / total_tf
            idf = np.log(total_idf / (1 + counter_idf.get(token, 0)))

            vector_token += embedding_table[token] * tf * idf

        # Average over the embedded tokens; skip the division when there are
        # none so the result stays a zero vector instead of becoming NaN.
        if num_valid_token:
            vector_token = vector_token / num_valid_token

        list_vectors.append(vector_token)

    return list_vectors

In [29]:
# -----test-----

In [30]:
# Example inputs: two log templates
inputs = [
    'PacketResponder <*> for block <*> terminating',
    'Received block <*> of size <*> from <*>',
]

In [31]:
# Tokenize the example templates so `list_tokens` is defined in this notebook
# before the document-frequency counter is built from it.
# Counts for additional token lists can be aggregated into the same counter by
# calling `calculate_freq(new_list_tokens, mode_idf=True, counter=counter_idf)`.
list_tokens = template2tokens(inputs)
counter_idf = calculate_freq(list_tokens, mode_idf=True)
counter_idf

Counter({'block': 2,
         'packet': 1,
         'receive': 1,
         'responder': 1,
         'size': 1,
         'terminate': 1})

In [32]:
# Example: embed the log templates and stack the per-template vectors into one array
x_train = np.array(template2vec(inputs, embedding_table, counter_idf))
x_train.shape

(2, 300)

In [33]:
# Display the raw embedded values (one row per input template)
x_train

array([[ 8.18546631e-02,  7.17818979e-02, -6.55269739e-02,
         6.30071624e-02,  6.53557193e-02, -3.77150381e-03,
        -3.38972183e-02, -3.00304815e-02, -3.69500928e-02,
        -1.37476367e-02,  3.68622376e-02,  1.08427417e-01,
        -1.28024204e-02, -3.51296179e-03, -5.40419435e-02,
         3.79978260e-03, -4.34649130e-03, -1.66587527e-02,
         1.59989407e-02, -2.28285402e-02,  4.64677005e-02,
         2.46743937e-02,  2.24453406e-02,  3.16291188e-02,
         5.18959006e-02,  6.60300464e-03, -2.45101281e-03,
        -4.69000496e-02, -4.93869270e-02, -5.21197002e-02,
        -4.24118503e-02, -1.44048575e-02, -7.38574315e-02,
        -7.71006616e-03,  5.84854707e-02, -2.53630407e-02,
        -1.70893276e-02, -2.69763998e-02, -1.32170189e-02,
        -6.62608421e-04,  2.30187880e-02,  3.93433969e-02,
        -1.45700665e-02,  3.15770211e-02, -5.49768799e-02,
        -5.35015180e-02, -2.11154576e-02, -5.00240391e-02,
        -6.20637867e-02,  2.36804723e-02, -2.26717924e-0