In [13]:
from collections.abc import Iterable
from dataclasses import dataclass, field
import abc
import re
import inspect
from typing import (
    Callable, 
    Dict, 
    List, 
    Union, 
    Generator, 
    Any,
)
import logging
import os
from pathlib import Path
import json

import rich

In [83]:
@dataclass
class Serializer:
    """A Callable interface to various serializers that can select a specific format to generate, based on the data format requested by the user"""

    methods: Dict[str, Callable] = field(default_factory=dict)
    
    def _to_json(self, data: Iterable) -> Dict[str, Any]:
        """Convert python data container into a json format for output into other functions"""
        yield json.dumps(data)

    def __call__(self, parsed_data: Iterable, output_format: str):
        """Call the correct serialization method, as requested by user"""

        #capture the desired method, especially for future debugging
        match output_format:
            case "json":
                yield selected_serializer(parsed_data, output_format)

In [87]:
class Parser(abc.ABC):
    """Base class to enforce methods and functionality for all parsers in the app's library"""

    serialize = Serializer
        
    @abc.abstractmethod
    def _parse(self, raw_data: Iterable) -> Generator[str, None, None]:
        pass

    # Define call method as a simple algorithm to parse the input into structured data and then serialize it to json as the default format
    def __call__(self, file_path, output_format: str = None) -> Generator[str, None, None]:

        self.file_path = file_path
        output_format: Iterable = self.output_format
        
        # pipes _parse output into the serializer
        # print(f"parsed_data: {parsed_data}")
        yield Parser.serialize(self, parsed_data=self._parse(self.file_path), output_format=output_format)

    def _check_generators(self):
        """Check to make sure that subclassed _parse and _serialize methods are generators"""
        if not inspect.isgeneratorfunction(self._parse):
            raise TypeError("The _parse method must be a generator function")

    # run some checks by introspecting the subclass
    def __init_subclass__(cls, **kwargs):
        super().__init_subclass__(**kwargs)
        cls._check_generators(cls)
    

In [88]:
class RegexParser(Parser):
    def __init__(self, pattern: str, flags: int = 0, output_format: str = "json"):
        self.flags = flags
        self.pattern = re.compile(pattern, self.flags)
        self.output_format = output_format
        # self.serializer = Parser._serializer

    # treat this class as a callable generator to be used inside a comprehension
    def _parse(self, file_path: Union[str, os.PathLike]) -> Generator[str, None, None]:
        """Create structured file entries based on one or more lines, according to the regex
        matching rules provided to the instance"""
        
        with open(target_file, "r") as tf:
            entry = []
            for line in tf:
                if self.pattern.match(line):
                    if entry:
                        yield "".join(entry)
                        entry.clear()
                entry.append(line)

In [89]:
pattern = r"(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})"
entry_parser = RegexParser(pattern=pattern, flags=re.MULTILINE)
target_file = Path("test_files/1d4c79af_c5c3_4b7c_9347_beb5eda819e8_job_10344_attempt_1_txt.txt")
entries = [result for result in entry_parser(target_file)]


with open("results.txt", "w") as r:
    r.write("Entries")
    
    for result in entries:
        r.write("".join(result))
        r.write("\n")
    
    r.write(f"total: {len(entries)}")

TypeError: Serializer.__init__() got an unexpected keyword argument 'parsed_data'

In [30]:
class LineParser(Parser):
    def __init__(self):
        pass

    def __call__(self, file_path: Union[str, os.PathLike]) -> Generator[str, None, None]:
        with open(target_file, "r") as tf:
            for line in tf:
                yield line

In [90]:
from pathlib import Path

unstructure_string = ""
target_file = Path("test_files/1d4c79af_c5c3_4b7c_9347_beb5eda819e8_job_10344_attempt_1_txt.txt")

line_parser = LineParser()
parsed_lines_from_file = [result for result in line_parser(file_path=target_file)]

# rich.print(unstructure_string)
len(parsed_lines_from_file)

3394

In [98]:
import rich

pattern = (
    # r"^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})|"
    # r"(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} .*?(?=\n\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}|$))"
    r"(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} .*?(?=\n\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}|$))|"
    r"(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} .+? \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} .*?(?=\n\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}|$))"
)
# pattern = ''.join(pattern)
print(pattern)
line_parser = LineParser()
entry_parser = RegexParser(pattern, re.MULTILINE|re.DOTALL)

entries = [result for result in entry_parser(unstructured_string)]
lines = [result for result in line_parser(unstructured_string)]

print("Entries")
for result in entries:
    rich.print(f"{result}\n")
print(f"total lines: {len(entries)}")

print("\n")
print("Lines:")
# for result in lines:
#     rich.print(f"{result}\n")
print(f"total lines: {len(lines)}")    

(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} .*?(?=\n\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}|$))|(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} .+? \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} .*?(?=\n\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}|$))
Entries


total lines: 43


Lines:
total lines: 3394
