In [31]:
import re
import pandas as pd
import os

In [32]:
# 2020-23

def read_tbl(file_path):
    """Read a text file and split it into one chunk of text per incident.

    A new chunk begins at every tracking ID, i.e. four digits, a hyphen,
    three or four digits, a "-" or "." separator and a word-character suffix.

    Args:
        file_path: Path of the text file to read.

    Returns:
        list[str]: The chunks in file order. Any text in front of the first
        tracking ID comes back as its own chunk; chunks that are empty or
        whitespace-only are left out.
    """
    with open(file_path, "r") as file:
        content = file.read()
    # The separator is written as a character class, not a capturing group:
    # re.split() adds the text of every capturing group to its result, which
    # would put a stray "-" or "." item in front of each incident.
    chunks = re.split(r"(?=\d{4}-\d{3,4}[-.]\w+)", content)
    # Splitting at a tracking ID on the very first character yields a leading
    # empty string; it carries no incident, so it is not returned.
    return [chunk for chunk in chunks if chunk.strip()]

def extract_incident_info(incident):
    """Pull the tracking ID and the gist text out of one incident chunk.

    Args:
        incident: Text of a single incident.

    Returns:
        dict: ``{"tracking_id": ..., "gist": ...}``. ``tracking_id`` is the
        first tracking ID found in the text. ``gist`` is the stripped text
        after the first standalone "Gist" label (any case, colon optional),
        up to the next tracking ID or the end of the text. Either value is
        ``None`` when it is not found.
    """
    # Four digits, "-", three or four digits, "-" or ".", word suffix.
    tracking_id_pattern = r"\d{4}-\d{3,4}[-.]\w+"

    id_match = re.search(tracking_id_pattern, incident)

    # \b on both sides keeps the case-insensitive, colon-optional label from
    # matching inside longer words such as "register" or "GISTS".
    gist_match = re.search(
        r"(?i)\bGist\b:?\s*(.*?)(?=\s*" + tracking_id_pattern + r"|$)",
        incident,
        re.DOTALL,
    )

    return {
        "tracking_id": id_match.group(0) if id_match else None,
        "gist": gist_match.group(1).strip() if gist_match else None,
    }

input_directory = "../../ocr/data/output/"
output_directory = "../data/output/"

# NOTE(review): a later cell loops over the same input directory and writes
# CSVs with the same names into the same output directory, so its files
# replace the ones written here -- confirm both cells are meant to run.

# DataFrame.to_csv() does not create missing parent directories.
os.makedirs(output_directory, exist_ok=True)

# Convert every .txt file in the input directory into a CSV with one row per
# incident chunk.
for filename in os.listdir(input_directory):
    if not filename.endswith(".txt"):
        continue
    file_path = os.path.join(input_directory, filename)
    incidents = read_tbl(file_path)
    incident_data = [extract_incident_info(incident) for incident in incidents]
    df = pd.DataFrame(incident_data)
    # Swap only the trailing extension; str.replace() would also rewrite a
    # ".txt" that occurs earlier in the file name.
    output_path = os.path.join(output_directory, os.path.splitext(filename)[0] + ".csv")
    df.to_csv(output_path, index=False)

In [33]:

def read_tbl(file_path):
    """Read a text file and return one row per numbered item.

    An item starts at a line that begins with digits followed by a period
    (optionally followed by one space). Text in front of the first such line
    is discarded.

    Args:
        file_path: Path of the text file to read.

    Returns:
        pandas.DataFrame: A single column, "incident", holding each item's
        text (including its leading number) with surrounding whitespace
        stripped.
    """
    with open(file_path, "r") as file:
        content = file.read()
    # The capturing group makes re.split() return
    # [preamble, marker, body, marker, body, ...]; [1:] drops the preamble.
    # \A lets a marker on the very first line of the file count as well,
    # where there is no preceding newline to match.
    parts = re.split(r"((?:\A|\n)\d+\. ?)", content)[1:]
    incidents = [
        (marker + body).strip()
        for marker, body in zip(parts[0::2], parts[1::2])
    ]
    return pd.DataFrame({"incident": incidents})

def extract_tracking_id(incident):
    """Return the first tracking ID found in an incident's text.

    An ID is four digits, a hyphen (optionally surrounded by single spaces)
    and four or five digits. When a capital letter follows directly, or after
    a single "." or "-", it is included as the ID's suffix.

    Args:
        incident: Text of a single incident.

    Returns:
        str | None: The matched ID exactly as it appears in the text, or
        ``None`` when the text contains no ID.
    """
    # [A-Z] must be an unescaped character class: written as \[A-Z\] it only
    # matches the literal text "[A-Z]" and the suffix is never captured.
    # The suffix is optional within one pattern so the leftmost ID always
    # wins, whether or not it has a suffix.
    match = re.search(r"\d{4} ?- ?\d{4,5}(?:[.-]?[A-Z])?", incident)
    return match.group() if match else None

def extract_gist(incident):
    """Return the gist text of an incident with page furniture removed.

    The gist is everything after the first "Gist:" label (any case) up to the
    next line that starts with a number and a period, or the end of the text.
    "WEEKLY GISTS <date> THROUGH/- <date>" banners and "Page N of M" markers
    inside the gist are deleted.

    Args:
        incident: Text of a single incident.

    Returns:
        str | None: The cleaned, stripped gist, or ``None`` when there is no
        "Gist:" label followed by at least one character.
    """
    match = re.search(r"(?i)Gist:(.+?)(?=\n\d+\. |\n\d+\.|$)", incident, re.DOTALL)
    if match is None:
        return None

    gist = match.group(1).strip()

    # Day/month/year with optional "/", "-" or whitespace between the parts.
    date = r"\d{1,2}[/\-\s]?\d{1,2}[/\-\s]?\d{2,4}"
    # The two dates are joined by "THROUGH" or a dash. \u2013 / \u2014 are the
    # en and em dash, spelled as escapes so the pattern does not depend on how
    # this source file is encoded.
    banner = r"WEEKLY\s+GISTS\s+" + date + r"\s*(?:THROUGH|[-\u2013\u2014])\s*" + date
    gist = re.sub(banner, "", gist, flags=re.IGNORECASE).strip()

    gist = re.sub(r"Page\s+\d{1,3}\s+of\s+\d{1,3}", "", gist, flags=re.IGNORECASE).strip()
    return gist

input_directory = "../../ocr/data/output/"
output_directory = "../data/output/"

# NOTE(review): an earlier cell writes CSVs with these same names into this
# same output directory; the files written here replace them.

# DataFrame.to_csv() does not create missing parent directories.
os.makedirs(output_directory, exist_ok=True)

# Convert every .txt file in the input directory into a CSV holding the
# tracking ID and gist of each numbered item.
for filename in os.listdir(input_directory):
    if not filename.endswith(".txt"):
        continue
    file_path = os.path.join(input_directory, filename)
    df = read_tbl(file_path)
    df["tracking_id"] = df["incident"].apply(extract_tracking_id)
    df["gist"] = df["incident"].apply(extract_gist)
    # The raw item text is only needed to derive the two columns above.
    df = df.drop(columns=["incident"])
    # Swap only the trailing extension; str.replace() would also rewrite a
    # ".txt" that occurs earlier in the file name.
    output_path = os.path.join(output_directory, os.path.splitext(filename)[0] + ".csv")
    df.to_csv(output_path, index=False)