## Import Libraries

In [1]:
from parse import compile
from parse import with_pattern
import datetime as dt
import pandas as pd
import json

## create_log_list
- description: function which creates a **Python list** from the original **text file** containing the logs.
- input: text file (.txt)
- output: list of string

In [2]:
def create_log_list(log_file, date_prefix="2014/Oct/24"):
    """Split a raw log file into a list of log entries.

    An entry starts on every line that begins with ``date_prefix``; the lines
    that follow, up to the next such line, are continuation lines of the same
    entry. Lines located before the first dated line are discarded.

    Parameters
    ----------
    log_file : str
        Path of the text file containing the logs.
    date_prefix : str, optional
        Text that marks the first line of an entry.

    Returns
    -------
    list of str
        One string per entry, with the original newlines kept.
    """
    log_list = []
    current = None  # stays None until the first dated line has been seen
    # The context manager closes the file even if reading fails.
    with open(log_file) as logs:
        for line in logs:
            if line.startswith(date_prefix):
                if current is not None:
                    log_list.append(current)
                current = line
            elif current is not None:
                current += line
    # Flush the final entry: no later dated line exists to trigger its append.
    if current is not None:
        log_list.append(current)
    return log_list

# Load the log entries from the raw log file.
log_list = create_log_list('WCG100140020.txt')

## portevent_filter
- description: function which filters a **log list** to extract only the logs of the type **PORTEVENT** (example : 2014/Oct/24 18:34:28.666455 mtc PORTEVENT WCG10014_0020.ttcn:53(testcase:WCG100140020) Port dnsInternalPort[0] was started.)
- input: list of string
- output: list of string

In [3]:
def portevent_filter(log_list, output_file='porteventlogs.txt'):
    """Extract the PORTEVENT entries from a list of log entries.

    Each selected entry is flattened to a single line (its newline characters
    are removed), returned in a list and also written to ``output_file``, one
    entry per line.

    Parameters
    ----------
    log_list : list of str
        Log entries.
    output_file : str, optional
        Path of the text file that receives the selected entries. The file is
        overwritten.

    Returns
    -------
    list of str
        The entries containing "PORTEVENT", without newline characters.
    """
    portevent_log_list = [
        log.replace('\n', '') for log in log_list if "PORTEVENT" in log
    ]
    # The context manager guarantees the file is flushed and closed.
    with open(output_file, 'w') as file:
        for log_line in portevent_log_list:
            # Terminate every entry so they do not run together on one line.
            file.write(log_line + '\n')
    return portevent_log_list

# Keep only the PORTEVENT entries of the log.
portevent_log_list  = portevent_filter(log_list)


## sending_message_filter
- description: function which filters a **log list** to extract only the logs in which a message is sent
- input: list of string
- output: list of string

In [4]:

def sending_message_filter(log_list):
    """Return the log entries that contain the text "Sent on".

    Parameters
    ----------
    log_list : list of str
        Log entries.

    Returns
    -------
    list of str
        The matching entries, unchanged and in their original order.
    """
    return [entry for entry in log_list if "Sent on" in entry]

# Keep only the entries that record a sent message (those containing "Sent on").
sending_log_list = sending_message_filter(log_list)

## Creation of log parsing functions:
- port_opening : parse the port opening log lines
- port_stopping : parse the port stopping log lines
- port_connexion_establishment : parse log lines which are used to connect two users
- port_connexion_waiting : parse waiting connexion log lines 
- port_connexion_acceptance : parse connexion acceptance log lines 
- port_mapping : parse port mapping log lines 
- message_sending : parse log lines where messages are sent
- message_enqueuing : parse message enqueuing log lines 
- message_reception : parse message reception log lines 
- message_extraction : parse message extraction log lines 

In [5]:
# The pattern mirrors the strptime format below: the month is a three-letter
# abbreviation (%b, e.g. "Oct"), not two digits, and the dot before the
# microseconds is escaped so that it only matches a literal ".".
@with_pattern(r"\d{4}/[A-Za-z]{3}/\d{2} \d{2}:\d{2}:\d{2}\.\d{6}")
def date_format(date_time_str):
    """Convert a log timestamp such as "2014/Oct/24 18:34:28.666455" to a datetime.

    Parameters
    ----------
    date_time_str : str
        Timestamp text in the ``%Y/%b/%d %H:%M:%S.%f`` format.

    Returns
    -------
    datetime.datetime
        The parsed timestamp.

    Raises
    ------
    ValueError
        If the text does not follow the expected format.
    """
    return dt.datetime.strptime(date_time_str, '%Y/%b/%d %H:%M:%S.%f')

# Compiled `parse` templates, one per kind of log line handled in this notebook.
# Every template starts with the same four fields: the timestamp (`date`), a name
# field (`user_name`, `message_sender` or `message_receiver`), the event type and
# a part whose meaning is not identified (`unknown_part`).
# `{date:27}`: a timestamp such as "2014/Oct/24 18:34:28.666455" is 27 characters
# long, space included -- presumably the width is what lets the field span that
# space; confirm against the parse format specification.
# NOTE(review): `date_format` is not passed to compile(), so `date` is presumably
# returned as text rather than as a datetime -- confirm.
# NOTE(review): the meaning of the `:0` spec on `message_content` should be checked
# against the parse format specification.

# Port lifecycle: start and stop.
port_opening = compile("{date:27} {user_name} {eventtype} {unknown_part} Port {portname} was started.")
port_stopping = compile("{date:27} {user_name} {eventtype} {unknown_part} Port {portname} was stopped.")
# Connection between two ports: waiting, establishment, acceptance, mapping.
port_connexion_waiting = compile("{date:27} {message_receiver} {eventtype} {unknown_part} Port {message_receiver_portname} is waiting for connection from {message_sender_portname} on {communication_type} pathname {pathname}.")
port_connexion_establishment = compile("{date:27} {message_sender} {eventtype} {unknown_part} Port {message_sender_portname} has established the connection with {message_receiver_portname} using transport type {communication_type}.")
port_connexion_acceptance = compile("{date:27} {message_receiver} {eventtype} {unknown_part} Port {message_receiver_portname} has accepted the connection from {message_sender_portname}.")
port_mapping = compile("{date:27} {message_sender} {eventtype} {unknown_part} Port {message_sender_portname} was mapped to {message_receiver_portname}.")
# Message flow: sending, enqueuing, reception, extraction from the queue.
message_sending = compile("{date:27} {message_sender} {eventtype} {unknown_part} Sent on {message_sender_portname} to {message_receiver} {message_type} : {message_content:0}")
message_enqueuing = compile("{date:27} {message_receiver} {eventtype} {unknown_part} Message enqueued on {message_receiver_portname} from {message_sender} {message_type} : {message_content:0} id {message_id_number}")
message_reception = compile("{date:27} {message_receiver} {eventtype} {unknown_part} Receive operation on port {message_receiver_portname} succeeded, message from {message_sender}: {message_type} : {message_content:0} id {message_id_number}")
message_extraction = compile("{date:27} {message_receiver} {eventtype} {unknown_part} Message with id {message_id_number} was extracted from the queue of {message_receiver_portname}.")


## create_log_sending_message_dataframe
- description: function which creates a pandas DataFrame containing the **date**, the message **sender** and **receiver**, the **eventtype** (which is always PORTEVENT), the **message sender port name**, the **message type** and the **content of the message**, using a Python list of logs in which messages are sent (provided by the **sending_message_filter** function)
- input: list of string
- output: Dataframe 

In [6]:
def create_log_sending_message_dataframe(sending_log_list):
    """Parse "message sent" log entries into a pandas DataFrame.

    Every entry is parsed with the ``message_sending`` template and each
    named field of the result is stored in the column of the same name.

    Parameters
    ----------
    sending_log_list : list of str
        Log entries in which a message is sent.

    Returns
    -------
    pandas.DataFrame
        One row per entry and one column per parsed field.
    """
    field_names = (
        'date', 'message_sender', 'eventtype', 'unknown_part',
        'message_sender_portname', 'message_receiver', 'message_type',
        'message_content',
    )
    # One growing list per column; the frame is built once at the end.
    columns = {name: [] for name in field_names}
    for entry in sending_log_list:
        fields = message_sending.parse(entry).named
        for name, value in fields.items():
            columns[name].append(value)
    return pd.DataFrame(columns)

# One row per "message sent" entry, one column per parsed field.
df = create_log_sending_message_dataframe(sending_log_list)

## log_format_to_json
- description: transforms a message content extracted from the logs into a JSON-formatted string so that it can be turned directly into a Python dictionary
- input: string
- output: string

In [7]:
def log_format_to_json(string):
    """Rewrite a log message body (``key := value`` notation) as JSON-like text.

    The conversion is purely textual and runs in three passes:

    1. Line by line, the token before each ``:=`` is wrapped in double quotes,
       and so is the value after it unless that value is ``{`` or already
       starts with a double quote.
    2. Over the whole text, a ``{`` whose next non-empty token contains ``{``,
       or is a quoted token not followed by ``:=``, is turned into ``[`` and
       the ``}`` found at the same depth is turned into ``]``.
    3. The tokens are concatenated and every ``=`` is removed, so that ``:=``
       becomes ``:``.

    Parameters
    ----------
    string : str
        Message content extracted from a log entry.

    Returns
    -------
    str
        The rewritten text. It is not validated here, so it may still fail to
        load as JSON.
    """
    lines = string.split('\n')
    edited_message = ''
    # ---- Pass 1: quote keys and scalar values, one line at a time ----
    for line in lines:
        # Splitting on a single space keeps runs of spaces as empty tokens.
        decompose_line = line.split(' ')
        edited_line = ''
        for i in range(len(decompose_line)):
            if decompose_line[i] != '':
                if decompose_line[i] == ':=':
                    # The token just before ':=' is the key.
                    decompose_line[i-1] = '"' + decompose_line[i-1] + '"'
                    # NOTE(review): decompose_line[i+1] raises IndexError when ':=' is
                    # the last token of the line -- confirm this cannot happen.
                    if decompose_line[i+1] != '{':
                        if '"' == decompose_line[i+1][0]:
                            # Value already starts with a quote: stop processing this line.
                            break
                        else:
                            if i + 1 == len(decompose_line) - 1:
                                # The value is a single token and ends the line.
                                if ',' == decompose_line[i+1][-1]:
                                    # NOTE(review): replace() removes every comma of the
                                    # token, not only the trailing one.
                                    decompose_line[i+1] = '"' + decompose_line[i+1].replace(',','') + '",'
                                else:
                                    decompose_line[i+1] = '"' + decompose_line[i+1] + '"'
                            else:
                                # The value spans several tokens: open the quote on the
                                # first one and close it on the last token of the line.
                                if ',' == decompose_line[-1][-1]:
                                    decompose_line[i+1] = '"' + decompose_line[i+1]
                                    decompose_line[-1] =  decompose_line[-1].replace(',','') + '",'
                                    break
                                else:
                                    decompose_line[i+1] = '"' + decompose_line[i+1]
                                    decompose_line[-1] =  decompose_line[-1] + '"'
                                    break
        # Rebuild the line: an empty token contributes one space, ':=' is padded
        # with spaces, and all other tokens are appended without a separator.
        for e in decompose_line:
            if e == '':
                edited_line += ' '
            if e == ':=':
                edited_line += ' ' + e + ' '
            else:
                edited_line += e
        edited_message += edited_line + '\n'    
    # ---- Pass 2: turn the braces that enclose lists into square brackets ----
    edited_message = edited_message.split(' ')
    # Current brace depth. NOTE(review): this local name shadows the built-in open().
    open = 0
    # Stack of the depths at which a '{' was rewritten as '['.
    ind = []
    for i in range(len(edited_message)):
        if '{' in edited_message[i]:
            # NOTE(review): this branch handles '{' but counts '}' -- confirm that
            # count('{') was not intended.
            if edited_message[i].count('}') > 1:
                for y in range(edited_message[i].count('}')):
                    open += 1
            else:
                open += 1
                # Non-empty tokens from the current one onwards; f[0] is the current token.
                f = list(filter(('').__ne__, edited_message[i:]))
                # NOTE(review): len(f) > 0 does not guarantee that f[1] and f[2] exist.
                if len(f) > 0:
                    if '{' in f[1]:
                        # A brace directly followed by another brace is treated as a list.
                        edited_message[i] = edited_message[i].replace('{', '[',-1)
                        ind.append(open)
                    if '"' == f[1][0] and f[2] != ':=':
                        # A brace followed by a quoted token that is not a key is
                        # treated as a list.
                        edited_message[i] = edited_message[i].replace('{', '[',-1)
                        ind.append(open)
        if '}' in edited_message[i]:
            if edited_message[i].count('}') > 1:
                # Several closing braces in one token: handle them one at a time.
                for y in range(edited_message[i].count('}')):
                    if len(ind) != 0 and open == ind[-1]:
                        # Rebuild the token with the y-th '}' replaced by ']'.
                        # NOTE(review): the token is re-split after every replacement,
                        # so index y may no longer designate the y-th original '}'.
                        s = ""
                        dico_i_split = edited_message[i].split('}')
                        for l in range(len(dico_i_split) - 1):
                            if l == y:
                                s += dico_i_split[l] + ']'
                            else:
                                s += dico_i_split[l] + '}'
                        s += dico_i_split[-1]
                        edited_message[i] = s
                        ind.pop(-1)
                    open -= 1
            else:
                if len(ind) != 0 and open == ind[-1]:
                    # This brace is at the depth of the most recent '[': close with ']'.
                    edited_message[i] = edited_message[i].replace('}',']',1)
                    ind.pop(-1)
                open -= 1

    # ---- Pass 3: reassemble the text ----
    # An empty token becomes a single space; other tokens are joined without a separator.
    for i in range(len(edited_message)):
        if edited_message[i] == '':
            edited_message[i] = ' '

    json_string = ""
    for i in range(len(edited_message)):
        json_string += edited_message[i]
    # Turns ':=' into ':'. NOTE(review): this also removes any '=' inside values.
    json_string = json_string.replace('=', '')
    
    return json_string

## json_string_to_dict
- description: transform a json string into a json dictionary
- input: string
- output: dictionary

In [8]:
def json_string_to_dict(string):
    """Decode a JSON string into the corresponding Python object.

    Parameters
    ----------
    string : str
        JSON text.

    Returns
    -------
    object
        The result of ``json.loads`` (a dictionary for a JSON object).
    """
    return json.loads(string)

## message_to_dict
- description: transform the message content string extracted from the logs into python dictionary to exploit the data in an easier way
- input: string
- output: dictionary

In [9]:
def message_to_dict(message):
    """Convert a message content extracted from the logs into a Python dictionary.

    The message is first rewritten as a JSON string by ``log_format_to_json``
    and that string is then decoded by ``json_string_to_dict``.

    Parameters
    ----------
    message : str
        Message content extracted from a log entry.

    Returns
    -------
    dict
        The decoded message.
    """
    json_text = log_format_to_json(message)
    return json_string_to_dict(json_text)

## parse_messages
- description: creates a list of Python dictionaries from a list of message contents extracted from the logs
- input: list of string
- output: list of dictionaries (parsed_message_list), list of string (failed_message_to_dict : the list of messages that could not be turned into dictionaries (there are only a few))

In [10]:
def parse_messages(message_list):
    """Convert message contents into dictionaries, collecting the failures.

    Parameters
    ----------
    message_list : iterable of str
        Message contents extracted from the logs.

    Returns
    -------
    parsed_message_list : list of dict
        The messages that were converted successfully.
    failed_message_to_dict : list of str
        For each message that could not be converted: its JSON-formatted text
        when only the JSON decoding failed, or the raw message when the
        formatting step itself failed.
    """
    parsed_message_list = []
    failed_message_to_dict = []
    for message in message_list:
        # Format once and keep the result, so that a failure of the formatting
        # step cannot be raised again from inside the error handler.
        try:
            json_string = log_format_to_json(message)
        except Exception:
            failed_message_to_dict.append(message)
            continue
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt and SystemExit still propagate.
        try:
            parsed_message_list.append(json_string_to_dict(json_string))
        except Exception:
            failed_message_to_dict.append(json_string)
    return parsed_message_list, failed_message_to_dict

# Convert every message content; the ones that cannot be converted are collected separately.
parsed_message_list, failed_message_to_dict = parse_messages(df.message_content)

265
20
