In [1]:
import re,string
import pandas as pd
from textacy import preprocessing
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

# Reading log file

In [2]:
# Parse the raw BGL log into one row per line.
# Each line is split on single spaces: the first token is the alert flag,
# the next five tokens are fixed columns, and the remainder is the message.
split_list = []

# The context manager closes the file handle even if a malformed line raises.
with open('../data/BGL.log', 'r') as log_data:
    for line in log_data:
        fields = line.split(' ')
        # A leading '-' marks a normal entry; any other first token is
        # treated as an anomaly.
        label = 'Normal' if fields[0] == '-' else 'Anomaly'
        # Explicit indexing (rather than slicing) so that a line with too few
        # fields still fails loudly with IndexError instead of producing a
        # short row. Local names avoid shadowing the `id` builtin and `time`.
        log_id = fields[1]
        log_date = fields[2]
        unknown1 = fields[3]
        exact_time = fields[4]
        unknown2 = fields[5]
        # The message is kept as a list of tokens here; it is joined back
        # into a single string in the pre-processing step.
        info = fields[6:]
        split_list.append([log_id, log_date, unknown1, exact_time, unknown2, info, label])

df = pd.DataFrame(split_list, columns=['id', 'time', 'unknown1', 'exact_time', 'unknown2', 'info', 'label'])


# Data Pre-processing

In [3]:
# Convert info from a list of tokens to a single string, dropping the
# trailing newline and any '|' characters.
# NOTE: the '|' removal has to run on each string inside the lambda. Calling
# .replace('|', '') on the resulting Series would only replace cells whose
# entire value is exactly '|' and leave pipes inside messages untouched.
df['info'] = df['info'].apply(lambda x: ' '.join(x).replace('\n', '').replace('|', ''))

In [4]:
# Count how often each distinct message string occurs in the 'info' column.
df['info'].value_counts()

RAS KERNEL FATAL data TLB error interrupt                                                                                                                                                     152734
RAS KERNEL INFO 0 microseconds spent in the rbs signal handler during 0 calls. 0 microseconds was the maximum time for a single instance of a correctable ddr.                                135005
RAS KERNEL INFO instruction cache parity error corrected                                                                                                                                      105924
RAS MMCS ERROR idoproxydb hit ASSERT condition: ASSERT expression=0 Source file=idotransportmgr.cpp Source line=1043 Function=int IdoTransportMgr::SendPacket(IdoUdpMgr*, BglCtlPavTrace*)     84168
RAS KERNEL INFO 1146800 double-hummer alignment exceptions                                                                                                                                     74091
               

In [5]:
# Abbreviated weekday and month names plus the 'pdt' timezone tag, which show
# up in date strings embedded in the log text. They are filtered out together
# with the NLTK English stopwords.
_DATE_TIME_TOKENS = (
    'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun',
    'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
    'aug', 'sep', 'oct', 'nov', 'dec', 'pdt',
)

# Built lazily on the first process_data() call and reused afterwards, so the
# stopword corpus is not reloaded for every log line.
_STOP_WORDS = None


def _get_stop_words():
    """Return the cached set of NLTK English stopwords plus date/time tokens."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english')).union(_DATE_TIME_TOKENS)
    return _STOP_WORDS


def process_data(text):
    """Normalize one log message into a space-separated string of word tokens.

    Steps, in order: lowercase; drop 12-group colon-separated addresses;
    replace every character other than a-z and space with a space; remove
    single-letter words; collapse whitespace; tokenize and drop stopwords.

    Args:
        text: The raw message string.

    Returns:
        The cleaned message, with the remaining tokens joined by single spaces.
    """
    # convert to lowercase
    text = text.lower()
    # remove colon-separated addresses such as FF:F2:9F:15:7E:DF:00:0D:60:EA:81:20
    # (twelve two-character groups; the groups are not restricted to hex digits)
    text = re.sub(r'..\:..\:..\:..\:..\:..\:..\:..\:..\:..\:..\:..', '', text)
    # replace everything that is not a lowercase letter or a space
    # (digits, punctuation, other symbols) with a space
    text = re.sub(r"[^a-z ]", " ", text)
    # remove punctuation
    text = preprocessing.remove.punctuation(text)
    # remove single-letter words, e.g. fragments left over from addresses
    text = re.sub(r"\b[a-zA-Z]\b", "", text)
    # normalize whitespace (raw string avoids the invalid '\s' escape warning)
    text = re.sub(r'\s+', ' ', text)
    # remove stopwords such as 'to', 'on', 'of', 'for', and date/time tokens
    stop_words = _get_stop_words()
    kept_tokens = [w for w in word_tokenize(text) if w not in stop_words]
    return ' '.join(kept_tokens)

# Run every message through process_data and write the cleaned text back
# into the 'info' column.
cleaned_info = [process_data(message) for message in df['info']]
df['info'] = cleaned_info

In [7]:
# Preview the first rows of df.
df.head()

Unnamed: 0,id,time,unknown1,exact_time,unknown2,info,label
0,1117838570,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03-15.42.50.363779,R02-M1-N0-C:J12-U11,ras kernel info instruction cache parity error...,Normal
1,1117838570,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03-15.42.50.527847,R02-M1-N0-C:J12-U11,ras kernel info instruction cache parity error...,Normal
2,1117838570,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03-15.42.50.675872,R02-M1-N0-C:J12-U11,ras kernel info instruction cache parity error...,Normal
3,1117838570,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03-15.42.50.823719,R02-M1-N0-C:J12-U11,ras kernel info instruction cache parity error...,Normal
4,1117838570,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03-15.42.50.982731,R02-M1-N0-C:J12-U11,ras kernel info instruction cache parity error...,Normal


In [8]:
# Write the dataframe to a CSV file without the row index.
# index=False is the documented way to omit it; index=None only worked
# because None is falsy.
df.to_csv('../data/BGL_cleaned.csv', index=False)