In [1]:
# import sys
# sys.path.append('log-preprocessor')
# from LogData import LogData
# from Parser import Parser


In [2]:

import os
import pandas as pd
from typing import Optional
import hashlib

from utils.utils import *
from utils.constants import POSSIBLE_TIMESTAMP_PATHS
from Parser import Parser

def hash_string(input_string):
    """Return the SHA-256 hex digest of a string.

    Args:
        input_string: Text to hash; it is encoded as UTF-8 before hashing.

    Returns:
        The digest as a lowercase hexadecimal string.
    """
    encoded = input_string.encode('utf-8')
    return hashlib.sha256(encoded).hexdigest()

class LogData:
    """Single representation of all log files found in one directory.

    On construction the directory is scanned once: the first line of every file is
    parsed to obtain the file's start timestamp, and the lines of every file are
    counted. The files are then ordered by start timestamp so that concatenating
    them in that order yields one log sorted by file start time.

    Attributes:
        data_dir: Directory containing the raw log files.
        parser_name: Name of the parser model handed to ``Parser``.
        parser: ``Parser`` instance used to parse single lines and whole files.
        default_timestamp_paths: Candidate paths used to locate the timestamp in a parsed line.
        tmp_save_path: File the concatenated logs are written to before parsing.
        parsed_data_dir: Directory holding cached parsed frames (feather files).
        input_filenames: Log file names, ordered by start timestamp.
        input_filepaths: ``input_filenames`` joined onto ``data_dir`` (same order).
        n_lines_per_file: Mapping of file name -> number of lines in that file.
        start_timestamps: Mapping of file name -> timestamp of its first line.
        n_lines: Total number of lines over all files.
    """

    def __init__(
        self,
        data_dir: str,
        parser_name: str,
        tmp_save_path: str = "tmp/current_data.log",
        parsed_data_dir: str = "tmp/data_parsed/",
    ):
        """Scan ``data_dir`` and prepare the ordered list of input files.

        Args:
            data_dir: Directory containing the log files to load.
            parser_name: Name of the parser model passed to ``Parser``.
            tmp_save_path: Where the concatenated log file is written.
            parsed_data_dir: Where parsed frames are cached as feather files.
        """
        self.data_dir = data_dir
        self.parser_name = parser_name
        self.parser = Parser(parser_name, POSSIBLE_TIMESTAMP_PATHS)
        self.default_timestamp_paths = POSSIBLE_TIMESTAMP_PATHS
        self.tmp_save_path = tmp_save_path
        self.parsed_data_dir = parsed_data_dir
        # os.listdir returns entries in arbitrary order; sorting makes the tie-breaking
        # between files with equal start timestamps (and thus the cache key) deterministic.
        self.input_filenames = sorted(os.listdir(self.data_dir))
        self.n_lines_per_file, self.start_timestamps = self._get_logfiles_info_from_dir()
        self.n_lines = sum(self.n_lines_per_file.values())
        # Must run after the scan above: the file order is derived from start_timestamps.
        self.input_filepaths = self._get_input_filepaths()

    def get_df(self, use_parsed_data=True) -> pd.DataFrame:
        """Get the data of all input files as a single dataframe.

        The input files are always concatenated into ``tmp_save_path`` first.

        Args:
            use_parsed_data: If True, reuse a previously parsed frame from
                ``parsed_data_dir`` when one exists, otherwise parse and store the
                result there for faster repeated use. If False, always parse and
                never touch the cache.

        Returns:
            The parsed log data.
        """
        concatenate_files(self.input_filepaths, self.tmp_save_path)
        if not use_parsed_data:
            return self._parse_tmp_file()

        os.makedirs(self.parsed_data_dir, exist_ok=True)
        parsed_data_path = self._get_parsed_data_path()
        if os.path.isfile(parsed_data_path):
            df = pd.read_feather(parsed_data_path)
            print("Retrieved previously parsed data.")
        else:
            df = self._parse_tmp_file()
            df.to_feather(parsed_data_path)
            print("Saving parsed data.")
        return df

    def split(self, split_idx: int):
        """Split the data at the given row position.

        NOTE(review): the original intent was to return two new ``LogData`` objects,
        but a ``LogData`` can only be built from a directory of log files, so the two
        halves are returned as dataframes instead.

        Args:
            split_idx: Positional row index; rows before it go to the first part,
                the remaining rows to the second part.

        Returns:
            Tuple ``(df_train, df_test)`` of dataframes.
        """
        df = self.get_df()
        return df.iloc[:split_idx], df.iloc[split_idx:]

    def _parse_tmp_file(self) -> pd.DataFrame:
        """Parse the concatenated log file stored at ``tmp_save_path``."""
        return self.parser.parse_file(self.tmp_save_path, None, True, "ts")

    def _get_parsed_data_path(self) -> str:
        """Return the cache file path for the current input files and parser.

        The full (absolute) file paths are hashed rather than only the base names,
        so that two directories containing identically named files do not share a
        cache entry.
        """
        filestorage_label = "-".join(os.path.abspath(p) for p in self.input_filepaths)
        filestorage_name = f"{hash_string(filestorage_label)}_{self.parser_name}.feather"
        return os.path.join(self.parsed_data_dir, filestorage_name)

    def _get_logfiles_info_from_dir(self):
        """Return the number of lines and the start timestamp of every log file.

        Entries of ``data_dir`` that are not regular files are skipped. Empty files
        get a line count of 0 and no start timestamp.

        Returns:
            Tuple ``(n_lines, start_timestamps)`` of dicts keyed by file name.
        """
        n_lines = {}
        start_timestamps = {}
        for file in self.input_filenames:
            path = os.path.join(self.data_dir, file)
            if not os.path.isfile(path):
                continue  # e.g. sub-directories, which cannot be opened as log files
            # Binary mode throughout: log lines may contain bytes that are not valid
            # in the default text encoding, which would make a text-mode count fail.
            with open(path, "rb") as f:
                first_line = f.readline()
                if first_line:
                    parsed_line = self.parser.parse_line(first_line, decode=True)
                    timestamp_string = get_timestamp_from_decoded_match_dict(
                        parsed_line, self.default_timestamp_paths
                    )
                    start_timestamps[file] = str_to_datetime(timestamp_string)
                # The first line was already consumed; count it plus the remaining ones.
                n_lines[file] = (1 if first_line else 0) + sum(1 for _ in f)
        return n_lines, start_timestamps

    def _get_input_filepaths(self):
        """Return the input file paths ordered by start timestamp.

        Also updates ``input_filenames`` and ``input_filepaths`` to that order. Only
        files with a known start timestamp are included.
        """
        # sorted() is stable, so files with equal timestamps keep their scan order.
        self.input_filenames = sorted(self.start_timestamps, key=self.start_timestamps.get)
        self.input_filepaths = [os.path.join(self.data_dir, file) for file in self.input_filenames]
        return self.input_filepaths

In [4]:

# Build a LogData over the Apache access-log directory of the intranet server.
# Construction already scans the directory: it parses the first line of every file
# for its start timestamp and counts the lines of every file.
data = LogData(
    data_dir="../data/russellmitchell/gather/intranet_server/logs/apache2/",
    parser_name="ApacheAccessParsingModel",
)

# get_df() defaults to use_parsed_data=True: a cached feather file is read when one
# exists, otherwise the concatenated logs are parsed and the result is cached.
df = data.get_df()
# Render the parsed frame as rich notebook output.
display(df)

Saving parsed data.


Unnamed: 0,/model,/model/client_ip/client_ip,/model/sp1,/model/client_id,/model/sp2,/model/user_id,/model/sp3,/model/time,/model/sp4,/model/fm/request,/model/sp7,/model/status_code,/model/sp8,/model/content_size,/model/combined,/model/combined/combined,/model/combined/combined/sp9,/model/combined/combined/referer,/model/combined/combined/sp10,/model/combined/combined/user_agent,/model/combined/combined/sp11,/model/fm/request/method,/model/fm/request/sp5,/model/fm/request/request,/model/fm/request/sp6,/model/fm/request/version,/model/client_ip/localhost,/model/fm/dash,ts
0,"10.143.3.65 - - [21/Jan/2022:05:55:22 +0000] ""...",10.143.3.65,,-,,-,[,21/Jan/2022:05:55:22 +0000,"] """,GET / HTTP/1.1,"""",200,,6202,"""-"" ""Mozilla/5.0 (X11; Ubuntu; Linux x86_64; ...","""-"" ""Mozilla/5.0 (X11; Ubuntu; Linux x86_64; ...","""",-,""" """,Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86....,"""",GET,,/,,HTTP/1.1,,,2022-01-21 05:55:22+00:00
1,"10.143.3.65 - - [21/Jan/2022:05:55:27 +0000] ""...",10.143.3.65,,-,,-,[,21/Jan/2022:05:55:27 +0000,"] """,GET /wp-includes/css/dist/block-library/style....,"""",200,,10846,"""http://intranet.smith.russellmitchell.com/"" ...","""http://intranet.smith.russellmitchell.com/"" ...","""",http://intranet.smith.russellmitchell.com/,""" """,Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86....,"""",GET,,/wp-includes/css/dist/block-library/style.min....,,HTTP/1.1,,,2022-01-21 05:55:27+00:00
2,"10.143.3.65 - - [21/Jan/2022:05:55:27 +0000] ""...",10.143.3.65,,-,,-,[,21/Jan/2022:05:55:27 +0000,"] """,GET /wp-includes/js/wp-embed.min.js?ver=5.8.3 ...,"""",200,,1099,"""http://intranet.smith.russellmitchell.com/"" ...","""http://intranet.smith.russellmitchell.com/"" ...","""",http://intranet.smith.russellmitchell.com/,""" """,Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86....,"""",GET,,/wp-includes/js/wp-embed.min.js?ver=5.8.3,,HTTP/1.1,,,2022-01-21 05:55:27+00:00
3,"10.143.3.65 - - [21/Jan/2022:05:55:27 +0000] ""...",10.143.3.65,,-,,-,[,21/Jan/2022:05:55:27 +0000,"] """,GET /wp-content/themes/go/dist/css/design-styl...,"""",200,,1412,"""http://intranet.smith.russellmitchell.com/"" ...","""http://intranet.smith.russellmitchell.com/"" ...","""",http://intranet.smith.russellmitchell.com/,""" """,Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86....,"""",GET,,/wp-content/themes/go/dist/css/design-styles/s...,,HTTP/1.1,,,2022-01-21 05:55:27+00:00
4,"10.143.3.65 - - [21/Jan/2022:05:55:27 +0000] ""...",10.143.3.65,,-,,-,[,21/Jan/2022:05:55:27 +0000,"] """,GET /wp-includes/js/wp-emoji-release.min.js?ve...,"""",200,,5265,"""http://intranet.smith.russellmitchell.com/"" ...","""http://intranet.smith.russellmitchell.com/"" ...","""",http://intranet.smith.russellmitchell.com/,""" """,Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86....,"""",GET,,/wp-includes/js/wp-emoji-release.min.js?ver=5.8.3,,HTTP/1.1,,,2022-01-21 05:55:27+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11179,"10.143.3.65 - - [24/Jan/2022:19:33:43 +0000] ""...",10.143.3.65,,-,,-,[,24/Jan/2022:19:33:43 +0000,"] """,POST /wp-admin/admin-ajax.php HTTP/1.1,"""",200,,549,"""https://intranet.smith.russellmitchell.com/?...","""https://intranet.smith.russellmitchell.com/?...","""",https://intranet.smith.russellmitchell.com/?p=5,""" """,Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86....,"""",POST,,/wp-admin/admin-ajax.php,,HTTP/1.1,,,2022-01-24 19:33:43+00:00
11180,"10.143.3.65 - - [24/Jan/2022:19:34:06 +0000] ""...",10.143.3.65,,-,,-,[,24/Jan/2022:19:34:06 +0000,"] """,GET / HTTP/1.1,"""",200,,6203,"""-"" ""Mozilla/5.0 (X11; Ubuntu; Linux x86_64; ...","""-"" ""Mozilla/5.0 (X11; Ubuntu; Linux x86_64; ...","""",-,""" """,Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86....,"""",GET,,/,,HTTP/1.1,,,2022-01-24 19:34:06+00:00
11181,"10.143.3.65 - - [24/Jan/2022:20:59:09 +0000] ""...",10.143.3.65,,-,,-,[,24/Jan/2022:20:59:09 +0000,"] """,GET / HTTP/1.1,"""",200,,6203,"""-"" ""Mozilla/5.0 (X11; Ubuntu; Linux x86_64; ...","""-"" ""Mozilla/5.0 (X11; Ubuntu; Linux x86_64; ...","""",-,""" """,Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86....,"""",GET,,/,,HTTP/1.1,,,2022-01-24 20:59:09+00:00
11182,"10.143.2.4 - - [24/Jan/2022:20:59:09 +0000] ""P...",10.143.2.4,,-,,-,[,24/Jan/2022:20:59:09 +0000,"] """,POST /wp-cron.php?doing_wp_cron=1643057949.933...,"""",200,,150,"""-"" ""WordPress/5.8.3; https://intranet.smith....","""-"" ""WordPress/5.8.3; https://intranet.smith....","""",-,""" """,WordPress/5.8.3; https://intranet.smith.russel...,"""",POST,,/wp-cron.php?doing_wp_cron=1643057949.93366289...,,HTTP/1.1,,,2022-01-24 20:59:09+00:00
