In [1]:
import os
from PIL import Image
import pytesseract
import re
from pdf2image import convert_from_path

import logging
import sys

import pandas as pd

In [51]:
# Regex alternations listing the OCR spellings seen for each field label.
# Every variant is run through re.escape so it is matched literally: without
# it the "+" in "Arvd+" / "Clrd+" is a quantifier, the pattern matches only
# "Arvd" / "Clrd", and the "+" ends up glued to the extracted time.
arvd_strs = "|".join(map(re.escape, ['Arvd~', 'Arvd-', "Arv@-", "Arvd+"]))
clrd_strs = "|".join(map(re.escape, ['Clrd-', 'Clrd~', 'Cird-', 'Clird-', 'Clrd+', "Clr@-"]))
taker_strs = "|".join(map(re.escape, ["‘Call Taker", "Call Taker", "Cail Taker", "Cali Taker"]))
# Longer spellings come first so the bare "Location" is only a fallback.
loc_strs = "|".join(map(re.escape, ["Location/Address", "Locatiion/Address", "Locat ion/Address", "Location"]))
narr_strs = "|".join(map(re.escape, ["Narrative:", "Narrative"]))

def find_between(entry_text, left_text, right_text):
    """Return the text lying between two regex delimiters.

    Parameters
    ----------
    entry_text : str
        Text to search.
    left_text : str
        Regular-expression pattern for the left delimiter; its first match
        in ``entry_text`` is used.
    right_text : str
        Regular-expression pattern for the right delimiter. It is searched
        only in the text that follows the left delimiter.

    Returns
    -------
    str or None
        The whitespace-stripped text between the two delimiters, or None when
        the left delimiter is missing or no right delimiter follows it.
    """
    left_match = re.search(left_text, entry_text)
    if left_match is None:
        return None

    # Searching the whole string for the right delimiter would pick up an
    # occurrence that sits *before* the left one, giving a reversed slice and
    # therefore an empty string instead of the real field.
    remainder = entry_text[left_match.end():]
    right_match = re.search(right_text, remainder)
    if right_match is None:
        return None

    return remainder[:right_match.start()].strip()

def find_next_word(entry_text, left_text):
    """Return the first whitespace-delimited word that follows a label.

    Parameters
    ----------
    entry_text : str
        Text to search.
    left_text : str
        Regular-expression pattern for the label; its first match is used.

    Returns
    -------
    str or None
        The first run of non-whitespace characters after the label, or None
        when the label is absent or nothing but whitespace follows it.
    """
    label_match = re.search(left_text, entry_text)
    if label_match is None:
        return None

    # The label can be the last thing in the text (e.g. an entry cut at a page
    # break); then there is no word to return and the search yields None.
    word_match = re.search(r'\S+', entry_text[label_match.end():])
    if word_match is None:
        return None
    return word_match.group(0)
    

In [66]:
string_tol = 60


def parse_entry(entry_text):
    """Split the OCR text of one log entry into its individual fields.

    Parameters
    ----------
    entry_text : str
        Raw text of a single entry. The first space-separated word is taken
        as the call number and the first four characters of the second word
        as the call time.

    Returns
    -------
    list
        Always seven items, in this order: call_number, call_time,
        call_reason, call_taker, arvd_time, clrd_time, narrative_text.
        Fields that cannot be located are None; text with fewer than two
        words yields seven Nones.
    """
    entry_words = [word for word in entry_text.split(' ') if word]

    # Too short to hold a call number and a time. The all-None row must be as
    # wide as a parsed one, otherwise merging it into an existing row
    # truncates that row.
    if len(entry_words) < 2:
        return [None] * 7

    # call number is always the first word of the entry
    call_number = entry_words[0]

    # the next 'word' is always the time (slicing copes with short words)
    call_time = entry_words[1][:4]

    # Look for the time only after the call number, and escape it, so digits
    # shared with the call number or stray OCR punctuation can neither match
    # in the wrong place nor form an invalid pattern.
    number_end = entry_text.find(call_number) + len(call_number)
    entry_body = entry_text[number_end:]
    time_pattern = re.escape(call_time)
    time_match = re.search(time_pattern, entry_body)

    taker_match = re.search(taker_strs, entry_text)

    # call reason: the text between the time and the call-taker label
    call_reason = find_between(entry_body, time_pattern, taker_strs)
    if call_reason is None and time_match and taker_match is None:
        # no call-taker label at all, so the reason runs to the end of the text
        call_reason = entry_body[time_match.end():]

    # call taker is always the string between taker and location
    call_taker = find_between(entry_text, taker_strs, loc_strs)
    if call_taker is None and taker_match:
        chars_after_label = len(entry_text) - taker_match.end()
        if chars_after_label < string_tol:
            # the label is close to the end of the text, so the location label
            # was presumably cut off; take everything that is left
            call_taker = entry_text[taker_match.end():]
        else:
            # diagnostic output for entries that need a manual look
            print("End of string?", chars_after_label, entry_text)
            print(taker_match)

    arvd_time = find_next_word(entry_text, arvd_strs)
    clrd_time = find_next_word(entry_text, clrd_strs)
    if clrd_time and len(clrd_time) < 8:
        # shorter than HH:MM:SS - print it so truncated times are noticed
        print(clrd_time)

    # the narrative is everything from the first narrative label onwards
    narrative_match = re.search(narr_strs, entry_text)
    narrative_text = entry_text[narrative_match.start():] if narrative_match else None

    return [call_number, call_time, call_reason, call_taker, arvd_time, clrd_time, narrative_text]


def replace_none_with_value(left_list, right_list):
    """Fill the None slots of ``left_list`` from the same positions of ``right_list``.

    Parameters
    ----------
    left_list : iterable
        Values to keep; only items that are None are replaced.
    right_list : iterable
        Replacement values, matched to ``left_list`` by position.

    Returns
    -------
    list
        A new list with exactly as many items as ``left_list``. When
        ``right_list`` is shorter, the remaining items of ``left_list`` are
        kept unchanged instead of being dropped, so a row patched through
        slice assignment never loses columns.
    """
    fallback_values = list(right_list)
    merged = []
    for position, left_value in enumerate(left_list):
        if left_value is None and position < len(fallback_values):
            merged.append(fallback_values[position])
        else:
            merged.append(left_value)
    return merged


#entry_text = "19-95  0909  Initiated - BUILDING CHECK  BUILDING CHECKED/ SECURED"

#parse_entry(entry_text)

In [67]:
# Every entry starts with the two-digit year followed by a dash.
year_str = '19-'
# Date applied to entries until a page supplies its own "For Date: " header.
current_date = '01/01/2019'

Arvd_list = []
Clrd_list = []


i_entry = 0
parsed_pages = []

for ipage in range(1,20):
    with open('../../data/Logs2019/page_{}.txt'.format(ipage),'r') as infile:
        page_text = infile.read()

    # Start offset of every candidate entry, closed by an end-of-page
    # sentinel. len(page_text) is used rather than -1 because slicing up to
    # -1 drops the last character of the page.
    page_entries = [log_idx.start() for log_idx in re.finditer(year_str, page_text)] + [len(page_text)]

    # check for updated date - we assume only once ever on the page
    date_end = [date_idx.end() for date_idx in re.finditer('For Date: ', page_text)]
    if len(date_end) > 1:
        # A single current_date per page cannot represent this, so stop.
        raise ValueError(
            "Oh no, multiple days on the same page... this wont work. (page {})".format(ipage)
        )

    # we found only one date so update
    elif len(date_end) == 1:
        date_match = re.search(r'\S+', page_text[date_end[0]:])
        # Keep the previous date when nothing follows the header instead of
        # overwriting current_date with None.
        if date_match:
            current_date = date_match.group(0).replace('-', "").strip()

    # now we worry about entries that start from the previous page
    initial_text = page_text[0:page_entries[0]]
    if len(initial_text) > 0 and len(parsed_pages) > 0:
        str_entry = parse_entry(initial_text)

        # Fill only the fields the previous row is still missing. The tail
        # re-attached below keeps the row at its full width even if the merge
        # comes back shorter than the fields it was given.
        previous_fields = parsed_pages[-1][2:]
        merged_fields = replace_none_with_value(previous_fields, str_entry)
        parsed_pages[-1][2:] = merged_fields + previous_fields[len(merged_fields):]

    for entry_start, entry_end in zip(page_entries, page_entries[1:]):

        entry_text = page_text[entry_start:entry_end]

        str_entry = parse_entry(entry_text)

        # Rows with five or more empty fields are treated as false matches of
        # year_str and skipped.
        # TODO: append text to previous narrative
        if str_entry.count(None) < 5:
            parsed_pages.append([current_date, ipage] + str_entry)
            i_entry += 1
        
        
        

08:19
07:42
End of string? 249 19-94  0807  Other ~ ASSIST OTHER AGENCY - REPO  SERVICES RENDERED  Cali Taker  PATROL DAVID JENNINGS, D  Vehicle  WHI 2013 HYUN SE ELANTRA Reg: PC MA 2CS815 VIN: 5NPDH4AE5DH351247  Owner  LAFLECHE, JANELL MARIE @ 15 ARCH ST - PITTSFIELD, MA 01201-5423  Race:  W Sex: F  Narrative  Ma  PC 2CS815 repossessed from [i Adams Rd.  
<re.Match object; span=(68, 78), match='Cali Taker'>
09:


In [68]:
# Column labels: the date and page number prepended by the page loop,
# followed by the seven fields produced by parse_entry.
output_columns = [
    'current_date', 'page_num', 'call_number', 'call_time',
    'call_reason_action', 'call_taker', "arvd_time", "clrd_time", "narrative_text",
]
parsed_pages = pd.DataFrame(parsed_pages, columns=output_columns)

# cleanup call_takers: drop the colons, then trim the surrounding whitespace
call_takers_without_colons = parsed_pages['call_taker'].str.replace(":", "")
parsed_pages['call_taker'] = call_takers_without_colons.str.strip()

# Persist the table, overwriting any earlier export.
parsed_pages.to_csv('../../data/parsed_logs_2019.csv', mode='w', index=False, header=True)

parsed_pages


Unnamed: 0,current_date,page_num,call_number,call_time,call_reason_action,call_taker,arvd_time,clrd_time,narrative_text
0,01/01/2019,1,19-1,0341,Initiated - PARKING CHECK SERVICES RENDERED,PATROL CRAIG A EICHHAMMER,03:42:00,03:42:17,
1,01/01/2019,1,19-4,0834,Initiated - BUILDING CHECK BUILDING CHECKED/S...,"PATROL DAVID JENNINGS, D",08:34:38,08:35:33,Narrative: checked Narrative: done Narrati...
2,01/01/2019,1,19-5,0842,Initiated - BUILDING CHECK BUILDING CHECKED/S...,"PATROL DAVID JENNINGS, D",08:43:07,08:43:21,Narrative checked Narrative Checked 0208
3,01/01/2019,1,19-6,0846,Initiated - BUILDING CHECK BUILDING CHECKED/S...,"PATROL DAVID JENNINGS, D",08:46:51,08:47:07,Narrative: checked Narrative: Checked 0159
4,01/01/2019,1,19-7,0847,Initiated - BUILDING CHECK BUILDING CHECKED/S...,"PATROL DAVID JENNINGS, D",08:47:37,08:48:05,Narrative checked Narrative Checked 0201
...,...,...,...,...,...,...,...,...,...
90,01/03/2019,18,19-118,1729,Initiated - BUILDING CHECK BUILDING CHECKED/S...,SERGEANT DAVID R LEMIEUX,17:29:15,17:30:26,Narrative: done Narrative: 01/04/2019 0039 ...
91,01/03/2019,19,19-121,1839,Initiated - BUILDING CHECK BUILDING CHECKED/S...,SERGEANT DAVID R LEMIEUX,18:40:41,18:43:03,Narrative No activity Narrative 01/04/2019 ...
92,01/03/2019,19,19-125,1945,Initiated - SUSPICIOUS. MOTOR VEHICLE LOG ENT...,PATROL KEVIN P GARNER,19:48:30,19:49:03,Narrative: Subject parked off road just east ...
93,01/03/2019,19,19-126,2044,Initiated - SUSPICIOUS MOTOR VEHICLE LOG ENTR...,PATROL KEVIN P GARNER,20:48:57,20:49:39,Narrative Subject said her friend needed quie...


In [75]:
#cleanup the call_takers

from fuzzywuzzy.fuzz import ratio, partial_ratio
import itertools
import numpy as np

# One known officer name per line.
with open('williamston_known_officiers.txt', 'r') as infile:
    known_officers = [line.replace("\n", "") for line in infile]

# Similarity score of every pair of known names. `ratio` is called through the
# name imported above: the `from ... import` form never binds `fuzzywuzzy`
# itself, so `fuzzywuzzy.fuzz.ratio` raises NameError on a fresh kernel.
known_officer_ratio = np.array([ratio(o1, o2) for o1, o2 in itertools.combinations(known_officers, 2)])

# Matching threshold: 5 points above the score of the two most similar known names.
min_officer_ratio = known_officer_ratio.max() + 5

print(min_officer_ratio)

def standardize_officers(s):
    """Map a raw call-taker string onto a name from ``known_officers``.

    Returns the first entry of ``known_officers`` whose ``partial_ratio``
    score against ``s`` is strictly greater than ``min_officer_ratio``.
    Returns None when no entry qualifies or when ``s`` is not a string.
    """
    if not isinstance(s, str):
        return None

    qualifying_names = (
        officer_name
        for officer_name in known_officers
        if partial_ratio(s, officer_name) > min_officer_ratio
    )
    return next(qualifying_names, None)

# Add a column holding the matched known-officer name for every raw
# call-taker string (None where nothing matched).
raw_call_takers = parsed_pages['call_taker']
parsed_pages['cleaned_call_taker'] = raw_call_takers.apply(standardize_officers)

parsed_pages

68


Unnamed: 0,current_date,page_num,call_number,call_time,call_reason_action,call_taker,arvd_time,clrd_time,narrative_text,cleaned_call_taker
0,01/01/2019,1,19-1,0341,Initiated - PARKING CHECK SERVICES RENDERED,PATROL CRAIG A EICHHAMMER,03:42:00,03:42:17,,PATROL CRAIG A EICHHAMMER
1,01/01/2019,1,19-4,0834,Initiated - BUILDING CHECK BUILDING CHECKED/S...,"PATROL DAVID JENNINGS, D",08:34:38,08:35:33,Narrative: checked Narrative: done Narrati...,"PATROL DAVID JENNINGS, D"
2,01/01/2019,1,19-5,0842,Initiated - BUILDING CHECK BUILDING CHECKED/S...,"PATROL DAVID JENNINGS, D",08:43:07,08:43:21,Narrative checked Narrative Checked 0208,"PATROL DAVID JENNINGS, D"
3,01/01/2019,1,19-6,0846,Initiated - BUILDING CHECK BUILDING CHECKED/S...,"PATROL DAVID JENNINGS, D",08:46:51,08:47:07,Narrative: checked Narrative: Checked 0159,"PATROL DAVID JENNINGS, D"
4,01/01/2019,1,19-7,0847,Initiated - BUILDING CHECK BUILDING CHECKED/S...,"PATROL DAVID JENNINGS, D",08:47:37,08:48:05,Narrative checked Narrative Checked 0201,"PATROL DAVID JENNINGS, D"
...,...,...,...,...,...,...,...,...,...,...
90,01/03/2019,18,19-118,1729,Initiated - BUILDING CHECK BUILDING CHECKED/S...,SERGEANT DAVID R LEMIEUX,17:29:15,17:30:26,Narrative: done Narrative: 01/04/2019 0039 ...,SERGEANT DAVID R LEMIEUX
91,01/03/2019,19,19-121,1839,Initiated - BUILDING CHECK BUILDING CHECKED/S...,SERGEANT DAVID R LEMIEUX,18:40:41,18:43:03,Narrative No activity Narrative 01/04/2019 ...,SERGEANT DAVID R LEMIEUX
92,01/03/2019,19,19-125,1945,Initiated - SUSPICIOUS. MOTOR VEHICLE LOG ENT...,PATROL KEVIN P GARNER,19:48:30,19:49:03,Narrative: Subject parked off road just east ...,PATROL KEVIN P GARNER
93,01/03/2019,19,19-126,2044,Initiated - SUSPICIOUS MOTOR VEHICLE LOG ENTR...,PATROL KEVIN P GARNER,20:48:57,20:49:39,Narrative Subject said her friend needed quie...,PATROL KEVIN P GARNER


In [72]:
# now go after car text

# Sample entry used to develop the vehicle / owner / operator extraction.
entry_text = "19-94  0807  Other ~ ASSIST OTHER AGENCY - REPO  SERVICES RENDERED  Cali Taker  PATROL DAVID JENNINGS, D  Vehicle  WHI 2013 HYUN SE ELANTRA Reg: PC MA 2CS815 VIN: 5NPDH4AE5DH351247  Owner  LAFLECHE, JANELL MARIE @ 15 ARCH ST - PITTSFIELD, MA 01201-5423  Race:  W Sex: F  Narrative  Ma  PC 2CS815 repossessed from [i Adams Rd. "

# Label patterns; the variant with the colon is listed first so it wins when present.
vehicle_strs = "|".join(["Vehicle:", "Vehicle"])
owner_strs = "|".join(["Owner:", "Owner"])
operator_strs = "|".join(["Operator:", "Operator"])

vehicle_txt = re.search(vehicle_strs, entry_text)
if vehicle_txt:
    # Search for the operator label with operator_strs; reusing vehicle_strs
    # here would just return the vehicle match a second time.
    operator_txt = re.search(operator_strs, entry_text)
    