In [368]:
from wand.image import Image as WImage
from wand.color import Color
from wand.api import library
from PIL import Image as PI
import io
import os
import pyocr
import pyocr.builders
import pandas as pd
import re
import os
from datamatch import (
    JaroWinklerSimilarity,
    ThresholdMatcher,
    ColumnsIndex,
)

In [369]:
def ocr_pdfs():
    pyocr.tesseract.TESSERACT_CMD = r"C:/Program Files/Tesseract-OCR/tesseract.exe"
    dir_path = os.chdir("C:/Users/PC/Desktop/keepers/ok/data/CT/pdfs")
    for file in os.listdir(dir_path):
        if file.endswith(".pdf"):
            tools = pyocr.get_available_tools()
            if len(tools) == 0:
                raise Exception("No tool available")

            tool = tools[0]
            print("Will use tool '%s'" % (tool.get_name()))
            langs = tool.get_available_languages()
            print("Available languages: %s" % ", ".join(langs))
            lang = langs[0]  # For English
            print("Will use language '%s'" % (lang))

            req_image = []
            final_text = {}

            with WImage(filename=file, resolution=300) as image_pdf:
                image_jpeg = image_pdf.convert("pdf")

            try:
                for img in image_jpeg.sequence:
                    img_page = WImage(image=img)
                    req_image.append(img_page.make_blob("jpeg"))
            finally:
                image_jpeg.destroy()

            i = 0
            for img in req_image:
                txt = tool.image_to_string(
                    PI.open(io.BytesIO(img)),
                    lang=lang,
                    builder=pyocr.builders.TextBuilder(),
                )
                final_text[str(i)] = txt
                i += 1

            text = final_text.items()
            df = pd.DataFrame(text)
            file_name = file + ".txt"
            path = r"C:/Users/PC/Desktop/keepers/ok/data/CT/ocr/"
            df.to_csv(path + file_name)
    return df

In [370]:
write = ocr_pdfs()

In [371]:
def match_names_w_badge_nos(df):
    df = df.astype(str)
    df["names"] = " ".join(row for row in df["data"])
    names = df["names"][0]
    df = df.drop_duplicates(subset=["names"])
    
    NAME1 = r"[A-Z][a-z]+,?\s+"
    MIDDLE_I = r"(?:[A-Z][a-z]*\.?\s*)?"
    NAME2 = r"[A-Z][a-z]+\s+"
    BADGE = r"#?\(\w+\)\s+"

    result = re.findall(NAME1 + MIDDLE_I + NAME2 + BADGE, names)
    df = pd.DataFrame(result, columns=["officers"])
    return df

In [372]:
def match_names(df):
    df = df.astype(str)
    ID = r"[Oo]fficer\(?s?\)?:?.+"

    result = [re.findall(ID, row) for row in df["data"]]
    df = pd.DataFrame(result, columns=["officers"])
    df = df[~((df.officers.fillna("") == ""))]

    return df

In [373]:
def df1():
    df = pd.read_csv(
        ("../ocr/consolidated.txt"),
        sep="delimiter",
        header=None,
        engine="python",
        index_col=False,
    )
    df = (
        df\
        .rename(columns={df.columns[0]: "data"})
        .pipe(match_names)
    )
    return df


def df2():
    df = pd.read_csv(
        ("../ocr/consolidated.txt"),
        sep="delimiter",
        header=None,
        engine="python",
        index_col=False,
    )
    df = (
        df\
        .rename(columns={df.columns[0]: "data"})
        .pipe(match_names_w_badge_nos)
    )
    return df

In [374]:
dfa = df1()
dfb = df2()

In [375]:
def concat(dfa, dfb):
    dfs = [dfa, dfb]
    df = pd.concat(dfs, axis=0)
    return df

In [376]:
df = concat(dfa, dfb)

In [377]:
def split_rows_with_multiple_officers(df):
    df = (
        df.drop("officers", axis=1)
        .join(
            df["officers"]
            .str.split(",", expand=True)
            .stack()
            .reset_index(level=1, drop=True)
            .rename("officers"),
            how="outer",
        )
        .reset_index(drop=True)
    )
    return df

In [378]:
def split_names(df):
    names = df.officers.str.lower().str.strip()\
        .str.replace(r"officers?\,", "officer", regex=True)\
        .str.replace(r"^ (\w+)", r"\1", regex=True)\
        .str.replace(r"(\w+) $", r"\1", regex=True)\
        .str.replace(r"\.", "", regex=True)\
        .str.replace(r"\(s\):", "", regex=True).str.extract(r"(officer|det|lt|sgt)?\.? ?(\w+) ?(?:(\w+))? ?(.+)?")
    df.loc[:, "rank_desc"] = names[0]
    df.loc[:, "first_name"] = names[1]
    df.loc[:, "last_name"] = names[2]
    df.loc[:, "badge_no"] = names[3]
    return df

In [379]:
df = df.pipe(split_rows_with_multiple_officers).pipe(split_names)

df.loc[:, ["officers", "rank_desc", "first_name", "last_name", "badge_no"]].iloc[200:250]

Unnamed: 0,officers,rank_desc,first_name,last_name,badge_no
200,Officers detained the two individuals who refu...,officer,s,detained,the two individuals who refused to provide the...
201,officers. One of the subjects,officer,s,one,of the subjects
202,a female,,a,female,
203,eventually provided the officers with her,,eventually,provided,the officers with her
204,officers. Certified letters were sent to the i...,officer,s,certified,letters were sent to the individuals involved
205,Officers’ actions were lawful and appropriate.,officer,s,,’ actions were lawful and appropriate
206,Officer(s): Landucci,officer,landucci,,
207,Valente,,valente,,
208,Vernik,,vernik,,
209,Officers received false information from Latoy...,officer,s,received,false information from latoya stewart regarding


In [382]:
search = df[df.officers.str.contains("Vernik")]
search

Unnamed: 0,officers,rank_desc,first_name,last_name,badge_no
208,Vernik,,vernik,,
