In [1]:
from itertools import zip_longest
import re
from pathlib import Path
from os.path import abspath
import os

from dataclasses import dataclass, asdict
from typing import Optional

import pandas as pd

In [6]:
class LogParser:
    """Read log files from a folder and cut them into delimiter-bounded groups.

    A "delimiter" is any line matching the pattern given to
    ``group_positions`` (by default: an empty line). Delimiters are paired in
    the order they appear -- 1st with 2nd, 3rd with 4th, ... -- and the lines
    strictly between each pair form one group.
    """

    def __init__(self, file_folder = "files", *args, **kwargs):
        """Resolve the log folder and optionally load every file in it.

        Args:
            file_folder (str): folder containing the log files. It is joined
                onto the current working directory; an absolute path is used
                unchanged.
            *args: accepted for backward compatibility, ignored.
            **kwargs: ``process_all`` -- when truthy, the contents of every
                file in the folder are loaded into ``self.files``.
        """
        self.path_to_files = os.path.join(abspath(os.getcwd()), file_folder)
        # Always define the attribute so that reading `parser.files` cannot
        # raise AttributeError when `process_all` was not requested.
        self.files = self.get_all_files() if kwargs.get("process_all") else []

    def get_all_files(self):
        """Return the contents of every regular file in the folder.

        Entries are visited in sorted name order so the result is
        deterministic. Anything that is not a regular file (for example a
        sub-directory) is skipped instead of aborting the whole load.

        Return:
            list(str): one string per file
        """
        contents = []
        for entry in sorted(os.listdir(self.path_to_files)):
            if os.path.isfile(os.path.join(self.path_to_files, entry)):
                contents.append(self.get_file(filename = entry))
        return contents

    def get_file(self,filename):
        """Read one file from the log folder.

        Args:
            filename (str): name of the file, relative to the log folder

        Return:
            str: the whole file content

        Raises:
            FileNotFoundError: if the path is missing or is not a regular file
        """
        full_path_to_file = os.path.join(self.path_to_files, filename)
        if not Path(full_path_to_file).is_file():
            # FileNotFoundError is a subclass of Exception, so callers that
            # catch the generic Exception keep working.
            raise FileNotFoundError(f"Not a File: {full_path_to_file}")

        with open(full_path_to_file, "r") as f:
            return f.read()

    @staticmethod
    def grouper(iterable, n, fillvalue=None):
        """Collect data into fixed-length chunks, padding the last one.

        Returns a lazy iterator of ``n``-tuples, not a list.

        >>> list(LogParser.grouper('ABCDEFG', 3, 'x'))
        [('A', 'B', 'C'), ('D', 'E', 'F'), ('G', 'x', 'x')]

        Source:
            https://docs.python.org/3/library/itertools.html#itertools-recipes
        """
        # n references to the SAME iterator: each output tuple therefore
        # consumes n consecutive items.
        args = [iter(iterable)] * n
        return zip_longest(*args, fillvalue=fillvalue)

    def group_positions(self,sentences,pattern = r"^(?![\s\S])"):
        """Locate delimiter lines and pair them up as (start, end) indexes.

        Args:
            sentences (list(str)): list of strings split on new line
            pattern (str): regex identifying a delimiter line; the default
                matches only an empty string

        Return:
            groups (list(tuple)): consecutive delimiter indexes in pairs. With
                an odd number of delimiters the last pair is ``(start, None)``.

        NOTE(review): because delimiters are paired consecutively, the lines
        between the 2nd and 3rd delimiter (4th and 5th, ...) belong to no
        group. Confirm this matches the log layout.
        """
        delimiter = re.compile(pattern)
        positions = [
            idx for idx, sentence in enumerate(sentences)
            if delimiter.search(sentence)
        ]
        return list(self.grouper(positions,2))

    def get_groups(self,sentences,groups):
        """Slice out the lines enclosed by each delimiter pair.

        Args:
            sentences (list(str)): list of sentences
            groups (list(tuple(int,int))): pairs produced by ``group_positions``

        Return:
            list(tuple(str)): the lines strictly between each pair. A pair
                whose end is ``None`` (unpaired trailing delimiter) runs to
                the end of ``sentences``.
        """
        groupings = []
        for group in groups:
            start, end = group[0], group[-1]
            # end is None for an unpaired delimiter -> take everything left.
            stop = len(sentences) if end is None else end
            groupings.append(tuple(sentences[start+1:stop]))
        return groupings

    def extract_data(self,filename):
        """Read ``filename`` and return its delimiter-bounded groups.

        Args:
            filename (str): file name relative to the log folder

        Return:
            list(tuple(str)): one tuple of lines per group
        """
        corpus = self.get_file(filename = filename)
        # Deliberately split("\n") rather than splitlines(): a trailing newline
        # yields a final empty string, which counts as a delimiter.
        sentences = corpus.split("\n")
        groups = self.group_positions(sentences)
        return self.get_groups(sentences,groups)

@dataclass  
class Record:
    """Flat, all-string description of one job parsed from a log group.

    Every field defaults to the empty string, so ``Record()`` is a valid
    "nothing extracted" row. Instances are filled in by
    ``Transformer.create_record`` and converted with ``asdict`` when the
    results are exported.
    """
    # Text after the last colon on the first line of the group.
    job_name: Optional[str] = ""
    # Text after the last colon on the second line of the group.
    status: Optional[str] = ""
    # Time captured under the "started" label on the third line.
    start_time: Optional[str] = ""
    # First labelled date found on the third line.
    start_date: Optional[str] = ""
    # Time captured under the "last ran" label on the third line.
    last_ran_time: Optional[str] = ""
    # Second labelled date found on the third line.
    last_ran_date: Optional[str] = ""
    # Duration text matched by ``Transformer.elapsed`` on the third line.
    elapsed_time: Optional[str] = ""

class Transformer:
    """Convert a raw log group (a tuple of lines) into a ``Record``.

    The first line of a group carries the job name, the second its status and
    the third a run of ``label: value`` pairs holding times, dates and the
    elapsed duration. All extracted values are returned with surrounding
    whitespace removed.
    """

    # A label is one word, or two words separated by non-word characters,
    # followed by a colon.
    base_pattern = r"[a-zA-Z_]+\:|[a-zA-Z]+\W+[a-zA-Z_]+\:"
    # label + space + M/D/YYYY date
    on_date = r"(([a-zA-Z_]+\:)|([a-zA-Z]+\W+[a-zA-Z_]+\:)) ((0?[1-9]|1[012])\/(0?[1-9]|[12][0-9]|3[01])\/\d{4})"
    # label + space + 12-hour clock time with AM/PM
    started_and_ran = r"(([a-zA-Z_]+\:)|([a-zA-Z]+\W+[a-zA-Z_]+\:)) (([1-9]|0[1-9]|1[0-2]):[0-5][0-9] ([AaPp][Mm]))"
    # label + space + dd:dd:dd
    elapsed = r"(([a-zA-Z_]+\:)|([a-zA-Z]+\W+[a-zA-Z_]+\:)) (\d{2}:\d{2}:\d{2})"

    @staticmethod
    def _split_label(text):
        """Split ``label: value`` at the first colon; both halves are stripped."""
        label, _, value = text.partition(":")
        return label.strip(), value.strip()

    def get_job_or_status(self,sentence):
        """Return the text after the last colon of ``sentence``, stripped.

        A sentence without any colon is returned whole (stripped).
        """
        return sentence.strip().rpartition(":")[2].strip()

    def get_elapsed(self,sentence):
        """Return the first elapsed-time value in ``sentence``.

        Args:
            sentence (str): line to search

        Return:
            str: the matched duration, or "" when the line has none
        """
        match = re.search(self.elapsed,sentence)
        if not match:
            return ""
        _, value = self._split_label(match.group())
        return value

    def get_start_and_ran(self,sentence):
        """Collect every labelled clock time in ``sentence``.

        Args:
            sentence (str): line to search

        Return:
            dict(str, str): lower-cased label -> time text; empty when nothing
                matches. A repeated label keeps its last value.
        """
        # re.finditer returns an iterator, which is always truthy, so no
        # emptiness check is needed: zero matches simply yields an empty dict.
        output = {}
        for match in re.finditer(self.started_and_ran,sentence):
            label, value = self._split_label(match.group())
            output[label.lower()] = value
        return output

    def get_dates(self,sentence):
        """Collect every labelled date in ``sentence``, keyed by position.

        Args:
            sentence (str): line to search

        Return:
            dict(str, str): "0", "1", ... (order of appearance) -> date text;
                empty when nothing matches.

        NOTE(review): keys are positional, not label-based, so if the first
        date is absent the second one is reported under "0". Confirm the log
        always carries both dates in the same order.
        """
        output = {}
        for idx, match in enumerate(re.finditer(self.on_date,sentence)):
            _, value = self._split_label(match.group())
            output[str(idx)] = value
        return output

    def create_record(self,group):
        """Build a ``Record`` from one group of log lines.

        Args:
            group (tuple(str)): lines of one group; the first three are read
                as job name, status and run-time details

        Return:
            Record, or None when the group has fewer than three lines
        """
        if len(group) < 3:
            return None
        name,status,runtime_info = group[0], group[1], group[2]

        dates = self.get_dates(runtime_info)
        times = self.get_start_and_ran(runtime_info)

        return Record(
            job_name = self.get_job_or_status(sentence = name),
            status = self.get_job_or_status(sentence = status),
            start_time = times.get("started",""),
            start_date = dates.get("0",""),
            last_ran_time = times.get("last ran",""),
            last_ran_date = dates.get("1",""),
            elapsed_time = self.get_elapsed(sentence = runtime_info),
        )
            

In [9]:
# Parse one log file from the default "files" folder and export it as CSV.
# NOTE(review): "{}.txt" looks like an unformatted template -- as written, the
# parser is asked for a file literally named "{}.txt". Confirm the intended name.
filename = "{}.txt"
parser = LogParser()
transformer = Transformer()

# Delimiter-bounded groups of lines -> one Record (or None) per group.
groups = parser.extract_data(filename)
data = list(map(transformer.create_record, groups))

# Groups too short to parse come back as None and are left out of the export.
rows = [asdict(record) for record in data if record]
pd.DataFrame(rows).to_csv("logs.csv", index = False)