In [2]:
import os
from PIL import Image
import pytesseract
import re
from pdf2image import convert_from_path

import logging
import sys

import pandas as pd

In [194]:
arvd_strs = "|".join(['Arvd~', 'Arvd-', "Arv@-"])
clrd_strs = "|".join(['Clrd-', 'Cird-', 'Clird-', 'Clrd+'])
taker_strs = "|".join(["Call Taker", "Cail Taker", "Cali Taker"])
loc_strs = "|".join(["Location/Address", "Locatiion/Address", "Location"])

def find_between(entry_text, left_text, right_text):
    left_end = re.search(left_text, entry_text)
    right_start = re.search(right_text, entry_text)
    
    if left_end and right_start:
        return entry_text[left_end.end():right_start.start()].strip()
    else:
        return None

def find_next_word(entry_text, left_text):
    left_end = re.search(left_text, entry_text)
    
    if left_end:
        return re.search('([^\s]+)', entry_text[left_end.end():]).group(0)
    else:
        return None
    

In [195]:
string_tol = 30
        
def parse_entry(entry_text):
    entry_words = [w for w in entry_text.split(' ') if len(w) > 0]
    
    
    if len(entry_words) <2:
        return [None]*6
    else:    
        # call number is always the first word of the entry
        call_number = entry_words[0]

        # the next 'word' is always the time
        call_time = entry_words[1][:min(4, len(entry_words[1]))]
    
        # call reason
        call_reason =  find_between(entry_text, call_time, taker_strs)

        # call taker is always the string between taker and location
        call_taker =  find_between(entry_text, taker_strs, loc_strs)
        if call_taker is None and re.search(taker_strs, entry_text):
            # see if its actually near the end of the string so location doesnt appear
            if len(entry_text) - re.search(taker_strs, entry_text).end() < string_tol:
                call_taker = entry_text[re.search(taker_strs, entry_text).end():]
            else:
                print("End of string?", entry_text)
            

        arvd_time = find_next_word(entry_text, arvd_strs)
        clrd_time = find_next_word(entry_text, clrd_strs)
    
    return [call_number, call_time, call_reason, call_taker, arvd_time, clrd_time]


def replace_none_with_value(left_list, right_list):
    return [l if not l is None else r for l, r in zip(left_list, right_list)]


In [196]:
year_str = '19-'
current_date = '01/01/2019'

Arvd_list = []
Clrd_list = []


i_entry = 0
parsed_pages = []

for ipage in range(1,10):
    with open('../../data/Logs2019/page_{}.txt'.format(ipage),'r') as infile:
        page_text = infile.read()
    
    #print(ipage)
    #if ipage==2:
    #    print(page_text)
    
    
    page_entries = [log_idx.start() for log_idx in re.finditer(year_str, page_text)] + [-1]

    # check for updated date - we assume only once ever on the page
    date_end = [date_idx.end() for date_idx in re.finditer('For Date: ', page_text)]
    if len(date_end) > 1:
        print("Oh no, multiple days on the same page... this wont work.")
        adgasdgs
    
    # we found only one date so update
    elif len(date_end) == 1:
        current_date = re.search('([^\s]+)', page_text[date_end[0]:])
        if current_date:
            current_date = current_date.group(0).replace('-', "").strip()


    # now we worry about entries that start from the previous page
    initial_text = page_text[0:page_entries[0]]
    if len(initial_text) > 0 and len(parsed_pages) > 0:
        #print(initial_text)
        str_entry = parse_entry(initial_text)
        
        parsed_pages[-1][1:] = replace_none_with_value(parsed_pages[-1][1:], str_entry)
        
    
    for i_start in range(len(page_entries)-1):

        entry_text= page_text[page_entries[i_start]:page_entries[i_start + 1]]
        
        str_entry = parse_entry(entry_text)
        
        if str_entry.count(None) >= 5:
            # append text to previous narrative
            pass
        else:
            parsed_pages.append([current_date] + str_entry)
            i_entry += 1
            
        if str_entry[0] == '19-56':
            print(entry_text)
            print(str_entry)
        
        
        

End of string? 19-50  0854  Initiated - BUILDING CHECK  BUILDING CHECKED/ SECURED  Call Taker:  PATROL TANIA HERNANDEZ  FRENIER AVE  Tonetsle/Sage pes 
19-56  Call Taker:  PATROLMAN MICHAEL J ZIEMBA Jr  COLD SPRING RD  Location/Address:  Unit:  34  Arvd-10:00:30 Clrd-10:01:04  Narrative:  Checked $§-4  17:04  Narrative:  done  - Narrative:  Checked Area.  
['19-56', 'Call', '', ':  PATROLMAN MICHAEL J ZIEMBA Jr  COLD SPRING RD', '10:00:30', '10:01:04']


In [189]:
parsed_pages = pd.DataFrame(parsed_pages, columns = ['current_date', 'call_number', 'call_time', 'call_reason_action', 'call_taker',"arvd_time", "clrd_time"])                
parsed_pages['call_taker'] = parsed_pages['call_taker'].str.replace(":", "").str.strip()
parsed_pages


Unnamed: 0,current_date,call_number,call_time,call_reason_action,call_taker,arvd_time,clrd_time
0,01/01/2019,19-1,0341,Initiated - PARKING CHECK SERVICES RENDERED,PATROL CRAIG A EICHHAMMER,03:42:00,03:42:17
1,01/01/2019,19-4,0834,Initiated - BUILDING CHECK BUILDING CHECKED/S...,"PATROL DAVID JENNINGS, D",08:34:38,08:35:33
2,01/01/2019,19-5,0842,Initiated - BUILDING CHECK BUILDING CHECKED/S...,"PATROL DAVID JENNINGS, D",08:43:07,08:43:21
3,01/01/2019,19-6,0846,Initiated - BUILDING CHECK BUILDING CHECKED/S...,"PATROL DAVID JENNINGS, D",08:46:51,08:47:07
4,01/01/2019,19-7,0847,Initiated - BUILDING CHECK BUILDING CHECKED/S...,"PATROL DAVID JENNINGS, D",08:47:37,08:48:05
5,01/01/2019,19-8,0859,Initiated ~ BUILDING CHECK BUILDING CHECKED/S...,"PATROL DAVID JENNINGS, D",-08:59:22,09:00:14
6,01/01/2019,19-9,0903,Initiated - BUILDING CHECK BUILDING CHECKED/S...,"PATROL DAVID JENNINGS, D",09:03:18,09:03:42
7,01/01/2019,19-10,0906,Initiated - BUILDING CHECK BUILDING CHECKED/S...,"PATROL DAVID JENNINGS, D",09:06:22,09:07:57
8,01/01/2019,19-11,0941,Initiated ~ BUILDING CHECK BUILDING CHECKED/S...,"PATROL DAVID JENNINGS, D",09:41:46,09:41:58
9,01/01/2019,19-12,1004,Initiated - ROAD CONDITIONS SERVICES RENDERED,"PATROL DAVID JENNINGS, D SOUTHWORTH ST",10:05:59,10:07:54
