In [36]:
import pandas as pd
import numpy as np
from collections import Counter
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords


In [2]:
# pretrained word vector `crawl-300d-2M.vec' can be downloaded here
# https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip

In [3]:
!cd '/content/drive/My Drive/Colab Notebooks/DLP_project' && cp 'crawl-300d-2M.vec.zip' *.csv /content/
!unzip crawl-300d-2M.vec.zip

Archive:  crawl-300d-2M.vec.zip
  inflating: crawl-300d-2M.vec       


In [4]:
EMBEDDING_DIM = 300
embedding_table = {}
with open('crawl-300d-2M.vec') as f:
    for i, line in enumerate(f):
        if i == 0: # header
            print('words, dim =', line.split())
            continue
        
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, 'f', sep=' ')
        embedding_table[word] = coefs

print('Found %s word vectors.' % len(embedding_table))

words, dim = ['1999995', '300']
Found 1999995 word vectors.


In [5]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [18]:
def template2tokens(templates):
    # templates: list of str
    # e.g. templates = ['PacketResponder <*> for block <*> terminating', 'Received block <*> of size <*> from <*>']
    # return: list of list of str which has been processed (tokens)
    # e.g. [['packet', 'responder', 'block', 'terminate'], ['receive', 'block', 'size']]

    list_tokens = []

    for i, text in enumerate(templates):
        for j in reversed(range(len(text))):
            if j == 0: break
            if not text[j].isalpha():
                text = text[:j] + ' ' + text[j + 1:]
                continue
            if text[j].isupper() and text[j - 1].islower():
                text = text[:j] + ' ' + text[j:]
        
        tokens = nltk.word_tokenize(text.lower())
        tokens = [token for token in tokens if token not in stopwords.words('english')]
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token, 'v') for token in tokens]
        list_tokens.append(tokens)
    
    return list_tokens

In [19]:
def calculate_freq(list_tokens, mode_idf=False, counter=None):
    # list_tokens: list of list of str
    # e.g. [['packet', 'responder', 'block', 'terminate'], ['receive', 'block', 'size']]
    # mode_idf: bool
    # if `mode_idf' is set to True, count as idf
    # counter: collections.Counter object
    # if `counter' parameter is not assigned, generate a new one
    # otherwise, the counter object being assigned will be update (aggregate)

    if counter is None:
        counter = Counter()

    for tokens in list_tokens:
        if mode_idf:
            counter_tokens = Counter(set(tokens))
            counter_tokens['__num_lines__'] = 1
        else:
            counter_tokens = Counter(tokens)
        
        counter.update(counter_tokens)

    return counter

In [25]:
def template2vec(templates, embedding_table, counter_idf):
    # templates: list of str
    # e.g. templates = ['PacketResponder <*> for block <*> terminating', 'Received block <*> of size <*> from <*>']
    # embedding_table: dict
    # a dict mapping words to vectors of dimension EMBEDDING_DIM
    # counter_idf: collections.Counter object
    # `counter_idf' indicates the counter for calculating idf
    # return: list of numpy array

    list_vectors = []

    list_tokens = template2tokens(templates)
    for tokens in list_tokens:
        vector_token = np.zeros(EMBEDDING_DIM)
        
        counter_tf = calculate_freq([tokens])
        num_valid_token = 0

        for token in tokens:
            if token not in embedding_table: continue
            
            num_valid_token += 1

            tf = counter_tf[token] / sum(counter_tf.values())
            idf = np.log(counter_idf['__num_lines__'] / (1 + counter_idf.get(token, 0)))

            vector = embedding_table[token] * tf * idf
            vector_token += vector

        list_vectors.append(vector_token / num_valid_token)


    return list_vectors

In [26]:
# -----test-----

In [27]:
# example inputs (input 2 log lines)
inputs = ['PacketResponder <*> for block <*> terminating', 'Received block <*> of size <*> from <*>']

In [28]:
# idf of `new_list_tokens' can be aggregate by calling `calculate_freq(new_list_tokens, mode_idf=True, counter=counter_idf)' if needed
counter_idf = calculate_freq(template2tokens(inputs), mode_idf=True)
counter_idf

Counter({'__num_lines__': 2,
         'block': 2,
         'packet': 1,
         'receive': 1,
         'responder': 1,
         'size': 1,
         'terminate': 1})

In [34]:
# idf before log
for token in counter_idf:
    if token == '__num_lines__': continue
    # idf = np.log(counter_idf['__num_lines__'] / (1 + counter_idf.get(token, 0)))
    idf = counter_idf['__num_lines__'] / (1 + counter_idf.get(token, 0))
    print(token, idf)

responder 1.0
terminate 1.0
block 0.6666666666666666
packet 1.0
size 1.0
receive 1.0


In [29]:
# example to convert logs templates into vectors
x_train = template2vec(inputs, embedding_table, counter_idf)
x_train = np.array(x_train)
x_train.shape

(2, 300)

In [None]:
# generate csv file of log_templates that contains template vectors

In [105]:
df_template = pd.read_csv('HDFS_2k.log_templates.csv')
df_template

Unnamed: 0,EventId,EventTemplate,Occurrences
0,dc2c74b7,PacketResponder <*> for block <*> terminating,311
1,5d5de21c,BLOCK* NameSystem.addStoredBlock: blockMap upd...,314
2,e3df2680,Received block <*> of size <*> from <*>,292
3,09a53393,Receiving block <*> src: <*> dest: <*>,292
4,3d91fa85,BLOCK* NameSystem.allocateBlock: <*> <*>,115
5,32777b38,Verification succeeded for <*>,20
6,dba996ef,Deleting block <*> file <*>,263
7,626085d5,<*> Served block <*> to <*>,80
8,81cee340,<*>Got exception while serving <*> to <*>,80
9,d63ef163,BLOCK* NameSystem.delete: <*> is added to inva...,224


In [106]:
eventid_vectors = []
# get counter for idf
counter_idf = calculate_freq(template2tokens(df_template['EventTemplate']), mode_idf=True)
for event_id, template, occurrences in df_template.iloc:
    eventid_vectors.append(' '.join(map(str, template2vec([template], embedding_table, counter_idf)[0])))
df_template['vector'] = eventid_vectors
df_template

Unnamed: 0,EventId,EventTemplate,Occurrences,vector
0,dc2c74b7,PacketResponder <*> for block <*> terminating,311,0.10066175274550915 0.11497305858938489 -0.115...
1,5d5de21c,BLOCK* NameSystem.addStoredBlock: blockMap upd...,314,-0.019324294693598695 0.008473816761662338 0.0...
2,e3df2680,Received block <*> of size <*> from <*>,292,0.02841035866489013 0.08096243382897228 -0.033...
3,09a53393,Receiving block <*> src: <*> dest: <*>,292,-0.012383171560941264 0.06781194698123727 -0.0...
4,3d91fa85,BLOCK* NameSystem.allocateBlock: <*> <*>,115,-0.016556531935930253 0.012131390906870366 0.0...
5,32777b38,Verification succeeded for <*>,20,0.022925840690732002 0.1724896803498268 -0.326...
6,dba996ef,Deleting block <*> file <*>,263,-0.0012931609526276588 0.08127783429032813 0.0...
7,626085d5,<*> Served block <*> to <*>,80,0.01732614046583573 0.07184720974570762 -0.052...
8,81cee340,<*>Got exception while serving <*> to <*>,80,-0.07136904262006283 0.07215860148426145 -0.04...
9,d63ef163,BLOCK* NameSystem.delete: <*> is added to inva...,224,-0.007974783889949322 6.732780353299208e-05 -0...


In [107]:
df_template.to_csv('HDFS_2k.log_templates_with_vec.csv', index=False)

In [123]:
# try to read template vectors back
df=pd.read_csv('HDFS_2k.log_templates_with_vec.csv')
df

Unnamed: 0,EventId,EventTemplate,Occurrences,vector
0,dc2c74b7,PacketResponder <*> for block <*> terminating,311,0.10066175274550915 0.11497305858938489 -0.115...
1,5d5de21c,BLOCK* NameSystem.addStoredBlock: blockMap upd...,314,-0.019324294693598695 0.008473816761662338 0.0...
2,e3df2680,Received block <*> of size <*> from <*>,292,0.02841035866489013 0.08096243382897228 -0.033...
3,09a53393,Receiving block <*> src: <*> dest: <*>,292,-0.012383171560941264 0.06781194698123727 -0.0...
4,3d91fa85,BLOCK* NameSystem.allocateBlock: <*> <*>,115,-0.016556531935930253 0.012131390906870366 0.0...
5,32777b38,Verification succeeded for <*>,20,0.022925840690732002 0.1724896803498268 -0.326...
6,dba996ef,Deleting block <*> file <*>,263,-0.0012931609526276588 0.08127783429032813 0.0...
7,626085d5,<*> Served block <*> to <*>,80,0.01732614046583573 0.07184720974570762 -0.052...
8,81cee340,<*>Got exception while serving <*> to <*>,80,-0.07136904262006283 0.07215860148426145 -0.04...
9,d63ef163,BLOCK* NameSystem.delete: <*> is added to inva...,224,-0.007974783889949322 6.732780353299208e-05 -0...


In [124]:
eventid_vector_table = {}
for event_id, vector in df[['EventId', 'vector']].iloc:
    # eventid_vector_table[event_id] = np.fromstring(vector, dtype=np.float, sep=' ')
    eventid_vector_table[event_id] = vector # in str format for convenience

In [130]:
# generate csv file of log_structured that contains template vectors

In [132]:
df_structured = pd.read_csv('HDFS_2k.log_structured.csv', dtype=str) # prevents leading zeros of integer fields being removed
df_structured

Unnamed: 0,LineId,Date,Time,Pid,Level,Component,Content,EventId,EventTemplate,ParameterList
0,1,081109,203615,148,INFO,dfs.DataNode$PacketResponder,PacketResponder 1 for block blk_38865049064139...,dc2c74b7,PacketResponder <*> for block <*> terminating,"['1', 'blk_38865049064139660']"
1,2,081109,203807,222,INFO,dfs.DataNode$PacketResponder,PacketResponder 0 for block blk_-6952295868487...,dc2c74b7,PacketResponder <*> for block <*> terminating,"['0', 'blk_-6952295868487656571']"
2,3,081109,204005,35,INFO,dfs.FSNamesystem,BLOCK* NameSystem.addStoredBlock: blockMap upd...,5d5de21c,BLOCK* NameSystem.addStoredBlock: blockMap upd...,"['10.251.73.220:50010', 'blk_71283702376877284..."
3,4,081109,204015,308,INFO,dfs.DataNode$PacketResponder,PacketResponder 2 for block blk_82291938032499...,dc2c74b7,PacketResponder <*> for block <*> terminating,"['2', 'blk_8229193803249955061']"
4,5,081109,204106,329,INFO,dfs.DataNode$PacketResponder,PacketResponder 2 for block blk_-6670958622368...,dc2c74b7,PacketResponder <*> for block <*> terminating,"['2', 'blk_-6670958622368987959']"
...,...,...,...,...,...,...,...,...,...,...
1995,1996,081111,101621,24902,INFO,dfs.DataNode$DataXceiver,Receiving block blk_4198733391373026104 src: /...,09a53393,Receiving block <*> src: <*> dest: <*>,"['blk_4198733391373026104', '/10.251.106.10:46..."
1996,1997,081111,101735,26595,INFO,dfs.DataNode$PacketResponder,Received block blk_-5815145248455404269 of siz...,e3df2680,Received block <*> of size <*> from <*>,"['blk_-5815145248455404269', '67108864', '/10...."
1997,1998,081111,101804,26494,INFO,dfs.DataNode$DataXceiver,Receiving block blk_-295306975763175640 src: /...,09a53393,Receiving block <*> src: <*> dest: <*>,"['blk_-295306975763175640', '/10.250.9.207:532..."
1998,1999,081111,101954,26414,INFO,dfs.DataNode$PacketResponder,PacketResponder 0 for block blk_52257196770490...,dc2c74b7,PacketResponder <*> for block <*> terminating,"['0', 'blk_5225719677049010638']"


In [136]:
# check whether all EventId matches
all([event_id in eventid_vector_table for event_id in df_structured['EventId']])

True

In [134]:
vector_structured = []
for event_id in df_structured['EventId']:
    vector_structured.append(eventid_vector_table[event_id])
df_structured['vector'] = vector_structured
df_structured

Unnamed: 0,LineId,Date,Time,Pid,Level,Component,Content,EventId,EventTemplate,ParameterList,vector
0,1,081109,203615,148,INFO,dfs.DataNode$PacketResponder,PacketResponder 1 for block blk_38865049064139...,dc2c74b7,PacketResponder <*> for block <*> terminating,"['1', 'blk_38865049064139660']",0.10066175274550915 0.11497305858938489 -0.115...
1,2,081109,203807,222,INFO,dfs.DataNode$PacketResponder,PacketResponder 0 for block blk_-6952295868487...,dc2c74b7,PacketResponder <*> for block <*> terminating,"['0', 'blk_-6952295868487656571']",0.10066175274550915 0.11497305858938489 -0.115...
2,3,081109,204005,35,INFO,dfs.FSNamesystem,BLOCK* NameSystem.addStoredBlock: blockMap upd...,5d5de21c,BLOCK* NameSystem.addStoredBlock: blockMap upd...,"['10.251.73.220:50010', 'blk_71283702376877284...",-0.019324294693598695 0.008473816761662338 0.0...
3,4,081109,204015,308,INFO,dfs.DataNode$PacketResponder,PacketResponder 2 for block blk_82291938032499...,dc2c74b7,PacketResponder <*> for block <*> terminating,"['2', 'blk_8229193803249955061']",0.10066175274550915 0.11497305858938489 -0.115...
4,5,081109,204106,329,INFO,dfs.DataNode$PacketResponder,PacketResponder 2 for block blk_-6670958622368...,dc2c74b7,PacketResponder <*> for block <*> terminating,"['2', 'blk_-6670958622368987959']",0.10066175274550915 0.11497305858938489 -0.115...
...,...,...,...,...,...,...,...,...,...,...,...
1995,1996,081111,101621,24902,INFO,dfs.DataNode$DataXceiver,Receiving block blk_4198733391373026104 src: /...,09a53393,Receiving block <*> src: <*> dest: <*>,"['blk_4198733391373026104', '/10.251.106.10:46...",-0.012383171560941264 0.06781194698123727 -0.0...
1996,1997,081111,101735,26595,INFO,dfs.DataNode$PacketResponder,Received block blk_-5815145248455404269 of siz...,e3df2680,Received block <*> of size <*> from <*>,"['blk_-5815145248455404269', '67108864', '/10....",0.02841035866489013 0.08096243382897228 -0.033...
1997,1998,081111,101804,26494,INFO,dfs.DataNode$DataXceiver,Receiving block blk_-295306975763175640 src: /...,09a53393,Receiving block <*> src: <*> dest: <*>,"['blk_-295306975763175640', '/10.250.9.207:532...",-0.012383171560941264 0.06781194698123727 -0.0...
1998,1999,081111,101954,26414,INFO,dfs.DataNode$PacketResponder,PacketResponder 0 for block blk_52257196770490...,dc2c74b7,PacketResponder <*> for block <*> terminating,"['0', 'blk_5225719677049010638']",0.10066175274550915 0.11497305858938489 -0.115...


In [135]:
df_structured.to_csv('HDFS_2k.log_structured_with_vec.csv', index=False)

In [135]:
# use np.fromstring to read vector from DataFrame
# np.fromstring(vector_str, dtype=np.float, sep=' ')