In [1]:
import os
from PIL import Image
import pytesseract
import re
from pdf2image import convert_from_path

import usaddress
import logging
import sys

import pandas as pd

In [51]:
arvd_strs = "|".join(['Arvd~', 'Arvd-', "Arv@-", "Arvd+"])
clrd_strs = "|".join(['Clrd-', 'Clrd~', 'Cird-', 'Clird-', 'Clrd+', "Clr@-"])
taker_strs = "|".join(["‘Call Taker", "Call Taker", "Cail Taker", "Cali Taker"])
loc_strs = "|".join(["Location/Address", "Locatiion/Address", "Locat ion/Address", "Location"])
narr_strs = "|".join(["Narrative:", "Narrative"])

def find_between(entry_text, left_text, right_text):
    left_end = re.search(left_text, entry_text)
    right_start = re.search(right_text, entry_text)
    
    if left_end and right_start:
        return entry_text[left_end.end():right_start.start()].strip()
    else:
        return None

def find_next_word(entry_text, left_text):
    left_end = re.search(left_text, entry_text)
    
    if left_end:
        return re.search('([^\s]+)', entry_text[left_end.end():]).group(0)
    else:
        return None
    

In [153]:
string_tol = 60
        
def parse_entry(entry_text):
    entry_words = [w for w in entry_text.split(' ') if len(w) > 0]
    
    
    if len(entry_words) <2:
        return [None]*6
    else:    
        # call number is always the first word of the entry
        call_number = entry_words[0]

        # the next 'word' is always the time
        call_time = entry_words[1][:min(4, len(entry_words[1]))]
        for c in ['(', ')', "[", "]", "{", "}"]:
            call_time = call_time.replace(c, "")
        
        # call reason
        call_reason =  find_between(entry_text, call_time, taker_strs)
        if call_reason is None:
            # see if its actually near the end of the string so call taker never appears
            if re.search(call_time, entry_text) and re.search(taker_strs, entry_text) is None:
                
                call_reason = entry_text[re.search(call_time, entry_text).end():]
        
        if False:
            try:      
                call_action = "__".join(call_reason.replace(" ","_").split("__")[1:])
                call_reason = call_reason.replace(" ","_").split("__")[0]

            except ValueError:
                print(call_reason)

        # call taker is always the string between taker and location
        call_taker =  find_between(entry_text, taker_strs, loc_strs)
        if call_taker is None and re.search(taker_strs, entry_text):
            # see if its actually near the end of the string so location doesnt appear
            if len(entry_text) - re.search(taker_strs, entry_text).end() < string_tol:
                call_taker = entry_text[re.search(taker_strs, entry_text).end():]
            else:
                print("End of string?", len(entry_text) - re.search(taker_strs, entry_text).end(), entry_text)
                print(re.search(taker_strs, entry_text) )
            

        #get address
        entry_text_ = entry_text.replace(" ","_").split("__")
        
        loc_series = pd.Series(entry_text_)[pd.Series(entry_text_).str.match(r'Location/Address*')]
        call_address = None
        if len(loc_series) > 0:
            loc_idx = loc_series.index[0]
            if loc_idx != len(entry_text_) -1:
                call_address = entry_text_[loc_idx +1]


        arvd_time = find_next_word(entry_text, arvd_strs)
        clrd_time = find_next_word(entry_text, clrd_strs)
        if clrd_time and len(clrd_time) < 8:
            print(clrd_time)
            
        narrative_text = re.search(narr_strs, entry_text)
        if narrative_text:
            narrative_text = entry_text[narrative_text.start():]
    
    return [call_number, call_time, call_reason, call_taker, call_address, arvd_time, clrd_time, narrative_text]


def replace_none_with_value(left_list, right_list):
    return [l if not l is None else r for l, r in zip(left_list, right_list)]


#entry_text = "19-95  0909  Initiated - BUILDING CHECK  BUILDING CHECKED/ SECURED"

#parse_entry(entry_text)

In [163]:
year_str = '19-'
current_date = '01/01/2019'

all_units = []


i_entry = 0
parsed_pages = []

for ipage in range(1,10):
    with open('../../data/Logs2019/page_{}.txt'.format(ipage),'r') as infile:
        page_text = infile.read()
    
    #print(ipage)
    #if ipage==2:
    #    print(page_text)
    
    page_entries = [log_idx.start() for log_idx in re.finditer(year_str, page_text)] + [-1]

    # check for updated date - we assume only once ever on the page
    date_end = [date_idx.end() for date_idx in re.finditer('For Date: ', page_text)]
    if len(date_end) > 1:
        print("Oh no, multiple days on the same page... this wont work.")
        adgasdgs
    
    # we found only one date so update
    elif len(date_end) == 1:
        current_date = re.search('([^\s]+)', page_text[date_end[0]:])
        if current_date:
            current_date = current_date.group(0).replace('-', "").strip()


    # now we worry about entries that start from the previous page
    initial_text = page_text[0:page_entries[0]]
    if len(initial_text) > 0 and len(parsed_pages) > 0:
        #print(initial_text)
        str_entry = parse_entry(initial_text)
        
        parsed_pages[-1][2:] = replace_none_with_value(parsed_pages[-1][2:], str_entry)
        call_number = parsed_pages[-1][2]
        process_units(entry_text, call_number, all_units)
        
    
    for i_start in range(len(page_entries)-1):

        entry_text= page_text[page_entries[i_start]:page_entries[i_start + 1]]
        
        str_entry = parse_entry(entry_text)
        
        if str_entry.count(None) >= 5:
            # append text to previous narrative
            pass
        else:
            parsed_pages.append([current_date, ipage] + str_entry)
            i_entry += 1
            
        if True and str_entry[0] == '19-22':
            print(entry_text)
            print(str_entry)
        
        
        

19-22  1527  911 - MOTOR VEHICLE ACCIDENT  BUILDING CHECKED/SECURED  Cail Taker  PATROL KALVIN DZIEDZIAK  Locatiion/Address  NORTH ST  Unit  32  Arvd-15:29:20  Disp-15:27:00  Clrd-15:44:26  Unit  38  Disp-15:27:00  Arvd-15:29:18  Clrd-15:46:09  Vehicle  RED 2011  SUZI SX4 Reg: PC MA  66BS20  VIN: JS2YBS5A38B6301415  Operator  FORREST,  ERICA @ 4 EDMUNDS ST  Apt. #A201 ~ ADAMS, MA 01220-2249  Race:  WwW  Sex: F  Refer To Citation  T1400608  Owner  FORREST,  ERICA @ 4 EDMUNDS ST  Apt. #A201 - ADAMS, MA 01220-2249  Race:  Ww  Sex: F  Vehicle  GRY 2012  SUBA ST OUTBAC Reg:  PC MA 452WB4 VIN: 4S4BRBCC8C3239617  Operator  LEBLANC,  NATHAN L @ 253 NORTH  ST - WILLIAMSTOWN, MA 01267-2004  Race: W  Sex: M  Owner  LEBLANC,  NATHAN L @ 253 NORTH  ST - WILLIAMSTOWN, MA 01267-2004  Race: W  Sex: M  Narrative  MVA located at the rotary.  1535- Williams Inn asked to hold  peter pan bus for three  additional passengers.  Refer To Accident:  19~1-AC  
['19-22', '1527', '911 - MOTOR VEHICLE ACCIDENT  BU

End of string? 143 19-1645  1340  911 -  S$-1-1 WRONG NUMBER  LOG ENTRY REQUEST  Call Taker:  CHIEF KYLE J JOHNSON  Narrative:  Emergency line. Neg. TTY. Generic answering machine on call  back. Google search lists it as spam.  1415  
<re.Match object; span=(62, 72), match='Call Taker'>
End of string? 90 19-1779  1238  Phone - BURN PERMIT  LOG ENTRY REQUEST  Call Taker:  DISPATCHER CHRISTINE LEMOINE  Narrative:  Henderson Rd  North Hoosac Rd  Henderson Rd  
<re.Match object; span=(55, 65), match='Call Taker'>
End of string? 67 19-1825  1248  Phone - BURN PERMIT  LOG ENTRY REQUEST  Call Taker:  CHIEF KYLE J JOHNSON  Narrative:  WM Luce rd; BBR N. voosac Rd  
<re.Match object; span=(55, 65), match='Call Taker'>
End of string? 76 19-1827  1343  911 - 9-1-1 WRONG NUMBER  LOG ENTRY REQUEST  Call Taker:  CHIEF KYLE J JOHNSON  Narrative:  Recorded sales call on emergency line  
<re.Match object; span=(60, 70), match='Call Taker'>
End of string? 308 19-1843  0152  Phone ~- WELL BEING CHECK  LO

End of string? 81 19-3155  1356  Phone ~ BURN PERMIT  LOG ENTRY REQUEST  Call Taker:  DISPATCHER CHRISTINE LEMOINE  Narrative:  261 Bee Hill Rd  530 White Oaks Rd  
<re.Match object; span=(55, 65), match='Call Taker'>
End of string? 92 19-3203  1253  Phone - BURN PERMIT  SERVICES RENDERED  Call Taker:  DISPATCHER CHRISTINE LEMOINE  Narrative:  White Oaks Rd  Petersburg Rd  ountain View St  
<re.Match object; span=(55, 65), match='Call Taker'>
End of string? 151 19-3312  0838  Initiated - BUILDING CHECK  BUILDING CHECKED/SECURED  Call Taker:  SERGEANT SCOTT E MCGOWAN  Lecation/Address:  LONGVIEW TER  Unit:  30  Arvd-08:38:17  Clrd-08:39:00  Narrative:  checked  Narrative:  Checked Area.  
<re.Match object; span=(69, 79), match='Call Taker'>
End of string? 126 19-3341  1039  Other - BURN PERMIT  LOG ENTRY REQUEST  Call Taker:  CHIEF KYLE J JOHNSON  Narrative:  White Oaks  Walnut  Mt. View; [BBB cold spring;  Henderson;  Ide &  a  f  cath  Oblong.  
<re.Match object; span=(55, 65), match=

End of string? 72 19-4340  1130  Walk-In - PUBLIC RECORDS REQUEST  SERVICES RENDERED  Call Taker:  CHIEF KYLE J JOHNSON  Narrative:  In house check. Negative results.  
<re.Match object; span=(68, 78), match='Call Taker'>
End of string? 74 19-4417  0918  Other - PUBLIC RECORDS REQUEST  SERVICES RENDERED  Call Taker:  CHIEF KYLE J JOHNSON  Narrative:  Requests in-house records / emailed  
<re.Match object; span=(66, 76), match='Call Taker'>
End of string? 63 19-4423  0929  Other - PUBLIC RECORDS REQUEST  SERVICES RENDERED  Call Taker:  CHIEF KYLE J JOHNSON  Narrative:  In house check. Emailed.  
<re.Match object; span=(66, 76), match='Call Taker'>
End of string? 129 19-4493  1249  Other - BURN PERMIT  SERVICES RENDERED  Cali Taker:  CHIEF KYLE J JOHNSON  Narrative:  lee  Williams College - Poker Plats will have a bonfire for a  special event this evening.  
<re.Match object; span=(55, 65), match='Cali Taker'>
<re.Match object; span=(36, 46), match='Call Taker'>
End of string? 63 19-4720

End of string? 119 19-6587  0031  911 - 9-1-1 OVERFLOW NORTH ADAMS  SERVICES RENDERED  ‘Call Taker:  PATROL ANTHONY M DUPRAT  Narrative:  911 Overflow call from North Adams,  Call was transfered to  North Adams 911.  
<re.Match object; span=(68, 79), match='‘Call Taker'>
End of string? 136 19-6609  1327  Initiated - BUILDING CHECK  BUILDING CHECKED/SECURED  Call Taker:  PATROLMAN MICHAEL J 2IEMBA Jr  dress:  TORREY WOODS RD  zonation [ne  Unit:  37K  Arvd-13:27:22  Clrd-13:32:12  Narrative:  Checked  
<re.Match object; span=(69, 79), match='Call Taker'>


In [165]:
parsed_pages = pd.DataFrame(parsed_pages, 
                            columns = ['current_date', 'page_num', 'call_number', 'call_time', 
                                       'call_reason_action', 'call_taker', "call_address", "arvd_time", "clrd_time", "narrative_text"])                

# cleanup call_takers
parsed_pages['call_taker'] = parsed_pages['call_taker'].str.replace(":", "").str.strip()

parsed_pages.to_csv('../../data/parsed_logs_2019.csv', mode='w', index=False, header=True)

parsed_pages


Unnamed: 0,current_date,page_num,call_number,call_time,call_reason_action,call_taker,call_address,arvd_time,clrd_time,narrative_text
0,01/01/2019,1,19-1,0341,Initiated - PARKING CHECK SERVICES RENDERED,PATROL CRAIG A EICHHAMMER,[WIA_249]_NORTH_ST,03:42:00,03:42:17,
1,01/01/2019,1,19-4,0834,Initiated - BUILDING CHECK BUILDING CHECKED/S...,"PATROL DAVID JENNINGS, D",COLD_SPRING_RD,08:34:38,08:35:33,Narrative: checked Narrative: done Narrati...
2,01/01/2019,1,19-5,0842,Initiated - BUILDING CHECK BUILDING CHECKED/S...,"PATROL DAVID JENNINGS, D",LONGVIEW_TER,08:43:07,08:43:21,Narrative checked Narrative Checked 0208
3,01/01/2019,1,19-6,0846,Initiated - BUILDING CHECK BUILDING CHECKED/S...,"PATROL DAVID JENNINGS, D",FRENTER_AVE,08:46:51,08:47:07,Narrative: checked Narrative: Checked 0159
4,01/01/2019,1,19-7,0847,Initiated - BUILDING CHECK BUILDING CHECKED/S...,"PATROL DAVID JENNINGS, D",HARWOOD_ST,08:47:37,08:48:05,Narrative checked Narrative Checked 0201
...,...,...,...,...,...,...,...,...,...,...
5080,05/26/2019,998,19-6643,1020,Initiated - BUILDING CHECK BUILDING CHECKED/S...,PATROL TANIA HERNANDEZ,MAIN_sT,10:20:55,10:21:07,Narrative Workers on scene Narrative Checke...
5081,05/26/2019,998,19-6644,1033,Initiated - BUILDING CHECK BUILDING CHECKED/S...,PATROL TANIA HERNANDEZ,COLD_SPRING_RD,10:33:41,10:35:13,Narrative Checked S-4 18:52 Narrative done ...
5082,05/26/2019,998,19-6648,1058,Initiated - MOTOR VEHICLE STOP CITATION - WAR...,PATROLMAN MICHAEL J ZIEMBA Jr,MAIN_ST,10:58:51,,
5083,05/26/2019,999,19-6649,1102,Initiated - BUILDING CHECK BUILDING CHECKED/S...,PATROL TANIA HERNANDEZ,TORREY_WOODS_RD,11:02:44,11:04:59,Narrative Checked 5S-4 18:58 Narrative done


In [166]:
parsed_pages['call_taker'].value_counts().iloc[:30]

PATROL DAVID JENNINGS, D                       612
PATROL TANIA HERNANDEZ                         443
SERGEANT SCOTT E MCGOWAN                       354
PATROLMAN MICHAEL J ZIEMBA Jr                  293
PATROL SHUAN N WILLIAM                         293
PATROL KALVIN DZIEDZIAK                        286
PATROL JOHN J MCCONNELL JR                     241
SERGEANT DAVID R LEMIEUX                       224
DISPATCHER LAURIE TUPER                        222
CHIEF KYLE J JOHNSON                           190
PATROL KEVIN P GARNER                          167
PATROLMAN SCOTT SKORUPSKI                      128
PATROL ANTHONY M DUPRAT                        114
PATROL BRAD SACCO                              109
DISPATCHER CHRISTINE LEMOINE                   106
DISPATCHER WILLIAM C JENNINGS JR                83
PATROL CRAIG A EICHHAMMER                       81
SERGEANT PAUL D THOMPSON                        65
                                                36
PATROLMAN MICHAEL J ZIEMBA dr  

In [239]:
#cleanup the call_takers

from fuzzywuzzy.fuzz import ratio, partial_ratio
import itertools
import numpy as np

parsed_pages['call_taker'].value_counts()

with open('williamston_known_officers.txt', 'r') as infile:
    known_officers = [line.replace("\n", "").strip() for line in infile]

known_officer_ratio = np.array([fuzzywuzzy.fuzz.ratio(o1,o2) for o1, o2 in itertools.combinations(known_officers, 2)])

min_officer_ratio = known_officer_ratio.max() + 5

print(min_officer_ratio)

def standardize_officers(s):
    if isinstance(s, str):
        for oname in known_officers:
            if partial_ratio(s, oname) > min_officer_ratio:
                return oname
    return None

parsed_pages['cleaned_call_taker'] = parsed_pages['call_taker'].apply(standardize_officers)

parsed_pages

68


Unnamed: 0,current_date,page_num,call_number,call_time,call_reason_action,call_taker,call_address,arvd_time,clrd_time,narrative_text,cleaned_call_taker,call_reasons,call_actions
0,01/01/2019,1,19-1,0341,Initiated - PARKING CHECK SERVICES RENDERED,PATROL CRAIG A EICHHAMMER,[WIA_249]_NORTH_ST,03:42:00,03:42:17,,PATROL CRAIG A EICHHAMMER,Initiated - PARKING CHECK,SERVICES RENDERED
1,01/01/2019,1,19-4,0834,Initiated - BUILDING CHECK BUILDING CHECKED/S...,"PATROL DAVID JENNINGS, D",COLD_SPRING_RD,08:34:38,08:35:33,Narrative: checked Narrative: done Narrati...,"PATROL DAVID JENNINGS, D",Initiated - BUILDING CHECK,BUILDING CHECKED/SECURED
2,01/01/2019,1,19-5,0842,Initiated - BUILDING CHECK BUILDING CHECKED/S...,"PATROL DAVID JENNINGS, D",LONGVIEW_TER,08:43:07,08:43:21,Narrative checked Narrative Checked 0208,"PATROL DAVID JENNINGS, D",Initiated - BUILDING CHECK,BUILDING CHECKED/SECURED
3,01/01/2019,1,19-6,0846,Initiated - BUILDING CHECK BUILDING CHECKED/S...,"PATROL DAVID JENNINGS, D",FRENTER_AVE,08:46:51,08:47:07,Narrative: checked Narrative: Checked 0159,"PATROL DAVID JENNINGS, D",Initiated - BUILDING CHECK,BUILDING CHECKED/SECURED
4,01/01/2019,1,19-7,0847,Initiated - BUILDING CHECK BUILDING CHECKED/S...,"PATROL DAVID JENNINGS, D",HARWOOD_ST,08:47:37,08:48:05,Narrative checked Narrative Checked 0201,"PATROL DAVID JENNINGS, D",Initiated - BUILDING CHECK,BUILDING CHECKED/SECURED
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10692,12/31/2019,2371,19-16060,1656,Initiated - MOTOR VEHICLE STOP CITATION ~ WAR...,PATROL KALVIN DZIBDSIAK HANCOCK RD,Unit:,16:56:09,17:03:51,,PATROL KALVIN DZIEDZIAK,Initiated - MOTOR VEHICLE STOP,CITATION
10693,12/31/2019,2371,19-16062,Call,,SERGEANT DAVID R LEMIEUX SOUTHWORTH ST,Unit:,17:04:50,17:06:35,Narrative: Checked 0151 Initiated - BUILDING...,SERGEANT DAVID R LEMIEUX,,
10694,12/31/2019,2371,19-16063,i7il,,SERGEANT DAVID R LEMIEUX NORTH ST,(WIA_243],17:11:31,17:12:47,Narrative checked SERVICES RENDERED 29-1606...,SERGEANT DAVID R LEMIEUX,,
10695,12/31/2019,2371,19-16065,1729,Phone - ASSIST OTHER AGENCY - wes PATROL DAVI...,,STETSON_RD,22:;23:;36,22:29:55,Narrative: he green shed at the little league...,,Phone - ASSIST OTHER AGENCY,


In [142]:
# now go after car text

entry_text = "19-22  1527  911 - MOTOR VEHICLE ACCIDENT  BUILDING CHECKED/SECURED  Cail Taker  PATROL KALVIN DZIEDZIAK  Locatiion/Address  NORTH ST  Unit  32  Arvd-15:29:20  Disp-15:27:00  Clrd-15:44:26  Unit  38  Disp-15:27:00  Arvd-15:29:18  Clrd-15:46:09  Vehicle  RED 2011  SUZI SX4 Reg: PC MA  66BS20  VIN: JS2YBS5A38B6301415  Operator  FORREST,  ERICA @ 4 EDMUNDS ST  Apt. #A201 ~ ADAMS, MA 01220-2249  Race:  WwW  Sex: F  Refer To Citation  T1400608  Owner  FORREST,  ERICA @ 4 EDMUNDS ST  Apt. #A201 - ADAMS, MA 01220-2249  Race:  Ww  Sex: F  Vehicle  GRY 2012  SUBA ST OUTBAC Reg:  PC MA 452WB4 VIN: 4S4BRBCC8C3239617  Operator  LEBLANC,  NATHAN L @ 253 NORTH  ST - WILLIAMSTOWN, MA 01267-2004  Race: W  Sex: M  Owner  LEBLANC,  NATHAN L @ 253 NORTH  ST - WILLIAMSTOWN, MA 01267-2004  Race: W  Sex: M  Narrative  MVA located at the rotary.  1535- Williams Inn asked to hold  peter pan bus for three  additional passengers.  Refer To Accident:  19~1-AC  "

call_number = "19-22"

vehicle_strs = "|".join(["Vehicle:", "Vehicle"])
owner_strs = "|".join(["Owner:", "Owner"])
operator_strs = "|".join(["Operator:", "Operator"])

all_vehicles = []
all_people = []

def process_vehicles(entry_text, call_number):

    vehicle_starts = [vloc.start() for vloc in re.finditer(vehicle_strs, entry_text)] + [-1]

    if len(vehicle_starts) > 1:

        for ivs in range(len(vehicle_starts) - 1):
            vehicle_entry = entry_text[vehicle_starts[ivs]:vehicle_starts[ivs+1]]
            #print(ivs, vehicle_entry)
            operator_txt = re.search(owner_strs, vehicle_entry)
            owner_txt = re.search(operator_strs, vehicle_entry)
            
            # now we have to worry about the ordering of operators / owners and if we have both info
            if operator_txt and owner_txt:
                
                # operator comes first
                if operator_txt.start() < owner_txt.start():
                    vehicle_txt = vehicle_entry[:operator_txt.start()]
                    operator_txt = vehicle_entry[operator_txt.end():owner_txt.start()]
                    owner_txt = vehicle_entry[owner_txt.end():]
                
                # owner comes first
                else:
                    vehicle_txt = vehicle_entry[:owner_txt.start()]
                    owner_txt = vehicle_entry[owner_txt.end():operator_txt.start()]
                    operator_txt = vehicle_entry[operator_txt.end():]
            
            # only operator
            elif operator_txt:
                vehicle_txt = vehicle_entry[:operator_txt.start()]
                operator_txt = vehicle_entry[operator_txt.end():]
            
            # only owner    
            elif owner_txt:
                vehicle_txt = vehicle_entry[:owner_txt.start()]
                owner_txt = vehicle_entry[owner_txt.end():]

            
            if vehicle_txt:
                vinfo = get_vehicle_info(vehicle_txt)
                all_vehicles.append([call_number] + get_vehicle_info(vehicle_txt))
            
            if operator_txt:
                all_people.append([call_number, 'operator', vinfo[3]] + get_person_info(operator_txt))
                
            if owner_txt:
                all_people.append([call_number, 'owner', vinfo[3]] + get_person_info(owner_txt))

        
def get_vehicle_info(vehicle_txt):
    vehicle_words = [w for w in vehicle_txt.split(' ') if len(w) > 0]
    
    vcolor = vehicle_words[1]
    vyear = vehicle_words[2]
    
    if 'Reg:' in vehicle_words:
        regidx = vehicle_words.index('Reg:')
        vmodel = " ".join(vehicle_words[3:regidx])
    elif 'Reg' in vehicle_words:
        regidx = vehicle_words.index('Reg')
        vmodel = " ".join(vehicle_words[3:regidx])
    else:
        vmodel = None
        
        
    if 'VIN:' in vehicle_words:
        vinidx = vehicle_words.index('VIN:')
        vin = vehicle_words[vinidx+1]
    elif 'VIN' in vehicle_words:
        vinidx = vehicle_words.index('VIN')
        vin = vehicle_words[vinidx+1]
    else:
        vin = None
        
    return [vcolor, vyear, vmodel, vin]

def get_person_info(person_txt):
    lastname_idx = re.search(',', person_txt)
    if lastname_idx:
        lastname = person_txt[:lastname_idx.start()].strip()
    else:
        lastname = None
        
    firstname_idx = re.search('@', person_txt)
    if firstname_idx and lastname_idx:
        firstname = person_txt[lastname_idx.end():firstname_idx.start()].strip()
    else:
        firstname = None
    
    address = usaddress.parse(person_txt)
    st = " ".join([e[0] for e in address if e[1] in ['AddressNumber', 'StreetName', 'StreetNamePostType', 'OccupancyType', 'OccupancyIdentifier']])
    city = " ".join([e[0] for e in address if e[1] in ['PlaceName']])
    state = " ".join([e[0] for e in address if e[1] in ['StateName']])
    zipcode = " ".join([e[0] for e in address if e[1] in ['ZipCode']])
    
    race = find_next_word(person_txt, 'Race:')
    sex = find_next_word(person_txt, 'Sex:')
        
    return [lastname, firstname, race, sex, st, city, state, zipcode]
    
    
process_vehicles(entry_text, call_number)

print(all_vehicles)

print(all_people)

[['19-22', 'RED', '2011', 'SUZI SX4', 'JS2YBS5A38B6301415'], ['19-22', 'GRY', '2012', 'SUBA ST OUTBAC', '4S4BRBCC8C3239617']]
[['19-22', 'operator', 'JS2YBS5A38B6301415', 'FORREST', 'ERICA', 'Ww', 'F', '4 EDMUNDS ST Apt. # A201', 'ADAMS,', 'MA', '01220-2249'], ['19-22', 'owner', 'JS2YBS5A38B6301415', 'FORREST', 'ERICA', 'WwW', 'F', '4 EDMUNDS ST Apt. # A201', 'ADAMS,', 'MA', '01220-2249'], ['19-22', 'operator', '4S4BRBCC8C3239617', 'LEBLANC', 'NATHAN L', 'W', 'M', '253 NORTH ST', 'WILLIAMSTOWN,', 'MA', '01267-2004'], ['19-22', 'owner', '4S4BRBCC8C3239617', 'LEBLANC', 'NATHAN L', 'W', 'M', '253 NORTH ST', 'WILLIAMSTOWN,', 'MA', '01267-2004']]


In [242]:
unit_strs = "|".join(["Unit:", "Unit"])
arvd_strs = "|".join(['Arvd~', 'Arvd-', "Arv@-", "Arvd+"])
clrd_strs = "|".join(['Clrd-', 'Clrd~', 'Cird-', 'Clird-', 'Clrd+', "Clr@-"])
disp_strs = "|".join(['Disp-', 'Disp~', 'Disp+', "Dis@-"])
enrt_strs = "|".join(['Enrt-', 'Enrt~', 'Enrt+', "Enr@-"])

def process_units(entry_text, call_number, all_units):

    unit_starts = [uloc.start() for uloc in re.finditer(unit_strs, entry_text)] + [-1]
    if len(unit_starts) > 1:
        for iunits in range(len(unit_starts) - 1):
            
            unit_text = entry_text[unit_starts[iunits]:unit_starts[iunits+1]]
            
            unitnum = find_next_word(unit_text, unit_strs)
            disp_time = find_next_word(unit_text, disp_strs)
            enrt_time = find_next_word(unit_text, enrt_strs)
            arvd_time = find_next_word(unit_text, arvd_strs)
            clrd_time = find_next_word(unit_text, clrd_strs)
            
            all_units.append([call_number, unitnum, disp_time, enrt_time, arvd_time, clrd_time])
            
            
    return all_units
    

In [237]:
parsed_pages = pd.read_csv('../../data/parsed_logs_2019.csv')

#parsed_pages.loc[parsed_pages['call_reason_action'].str.len() > 1, 'call_reason_action1'] = parsed_pages[parsed_pages['call_reason_action'].str.len() > 1]['call_reason_action'].str.split("  ").str.get(0)
#parsed_pages['call_reason_action2'] = parsed_pages['call_reason_action'].str.split("  ").str.get(1)


#parsed_pages['call_reason_action1'].value_counts().iloc[100:140]

In [238]:
def standardize_partial(s, known_list, min_ratio):
    if isinstance(s, str):
        s = s.strip()
        pmatch = np.array([partial_ratio(s, oname) for oname in known_list])
        argmatch = np.argmax(pmatch)
        if pmatch[argmatch] > min_ratio:
            return known_list[argmatch]
    return None 
    
def clean_call_actions(parsed_pages):

    with open('williamston_known_actions.txt', 'r') as infile:
        known_actions = [line.replace("\n", "").strip() for line in infile]

    known_actions_ratio = np.array([ratio(a1,a2) for a1, a2 in itertools.combinations(known_actions, 2)])
    min_actions_ratio = known_actions_ratio.max()

    with open('williamston_known_reasons.txt', 'r') as infile:
        known_reasons = [line.replace("\n", "").strip() for line in infile]

    known_reasons_ratio = np.array([ratio(a1,a2) for a1, a2 in itertools.combinations(known_reasons, 2)])
    min_reasons_ratio = known_reasons_ratio.max()

    parsed_pages['call_reasons'] = [standardize_partial(s, known_reasons, min_reasons_ratio) for s in parsed_pages['call_reason_action'].values]
    parsed_pages['call_actions'] = [standardize_partial(s, known_actions, min_actions_ratio) for s in parsed_pages['call_reason_action'].values]


    return parsed_pages

parsed_pages = clean_call_actions(parsed_pages)

In [224]:
parsed_pages[parsed_pages['call_reasons'].isnull()]['call_reason_action'].str.split("  ").str.get(0).value_counts().iloc[:50]

Phone -                                     77
911 -                                       38
Walk-In -                                   31
Other -                                     30
Initiated - MOTOR VEHICLE sToP              23
                                            18
$11 -                                       16
Initiated - PUBLIC SERVICE                  14
initiated ~ BUILDING CHECK                  13
Phone - ASSIST OTHER AGENCY                 11
Initiated -                                 10
Initiated - MOTOR VEHICLE sToOP              9
Initiated - BULTLDING CHECK                  8
DING CHECKED/SECURED                         8
ative:                                       8
Walk-In - MOTOR VEHICLE ACCIDENT             8
Sil -                                        7
initiated - BULLDING CHECK                   6
Initiated - ASSIST OTHER AGENCY              5
Initiated - SERVE RESTRAINING ORDER          5
Walk-In - TRAFFIC COMPLAINT                  5
Initiated - M

In [225]:
parsed_pages[parsed_pages['call_actions'].isnull()]['call_reason_action'].str.split("  ").str.get(1).value_counts().iloc[:50]

ALARM                                                           14
ANIMAL CONTROL                                                  12
MATLED                                                           7
ALARM - BURGLAR                                                  6
PATROL TANIA HERNANDEZ                                           5
SERGEANT SCOTT E MCGOWAN                                         4
TAKEN                                                            3
Phone -                                                          3
AGENCY                                                           3
Initiated - BUILDING CHECK                                       3
PATROL BRAD SACCO                                                2
UNWANTED GUEST                                                   2
call Taker:                                                      2
‘call Taker                                                      2
‘call Taker:                                                  

In [229]:
with open('williamston_known_reasons.txt', 'r') as infile:
        known_reasons = [line.replace("\n", "").strip() for line in infile]

for a1, a2 in itertools.combinations(known_reasons, 2):
    if(ratio(a1,a2)  > 90):
        print(a1, a2, ratio(a1,a2))


Initiated - ASSIST OTHER AGENCY - FIRE Initiated - ASSIST OTHER AGENCY - POLICE 92
Initiated - ASSIST OTHER AGENCY - FIRE Initiated - ASSIST OTHER AGENCY - DPW 91
Initiated - ASSIST OTHER AGENCY Initiated - ASSIST OTHER AGENCY - DPW 91
Initiated - ASSIST OTHER AGENCY - POLICE Initiated - ASSIST OTHER AGENCY - DPW 91
Phone - ASSIST OTHER AGENCY - FIRE Phone - ASSIST OTHER AGENCY - POLICE 91
Phone - ASSIST OTHER AGENCY - DPW Phone - ASSIST OTHER AGENCY - WCS 94


In [230]:
with open('williamston_known_actions.txt', 'r') as infile:
        known_reasons = [line.replace("\n", "").strip() for line in infile]

for a1, a2 in itertools.combinations(known_reasons, 2):
    if(ratio(a1,a2)  > 90):
        print(a1, a2, ratio(a1,a2))



In [235]:
parsed_pages[parsed_pages['cleaned_call_taker'].isnull()]['call_taker'].value_counts().iloc[:50]

dress  SPRING ST  h    1
COLD SPRING RD         1
;                      1
Name: call_taker, dtype: int64

In [247]:
parsed_pages

Unnamed: 0,current_date,page_num,call_number,call_time,call_reason_action,call_taker,call_address,arvd_time,clrd_time,narrative_text,cleaned_call_taker,call_reasons,call_actions
0,01/01/2019,1,19-1,0341,Initiated - PARKING CHECK SERVICES RENDERED,PATROL CRAIG A EICHHAMMER,[WIA_249]_NORTH_ST,03:42:00,03:42:17,,PATROL CRAIG A EICHHAMMER,Initiated - PARKING CHECK,SERVICES RENDERED
1,01/01/2019,1,19-4,0834,Initiated - BUILDING CHECK BUILDING CHECKED/S...,"PATROL DAVID JENNINGS, D",COLD_SPRING_RD,08:34:38,08:35:33,Narrative: checked Narrative: done Narrati...,"PATROL DAVID JENNINGS, D",Initiated - BUILDING CHECK,BUILDING CHECKED/SECURED
2,01/01/2019,1,19-5,0842,Initiated - BUILDING CHECK BUILDING CHECKED/S...,"PATROL DAVID JENNINGS, D",LONGVIEW_TER,08:43:07,08:43:21,Narrative checked Narrative Checked 0208,"PATROL DAVID JENNINGS, D",Initiated - BUILDING CHECK,BUILDING CHECKED/SECURED
3,01/01/2019,1,19-6,0846,Initiated - BUILDING CHECK BUILDING CHECKED/S...,"PATROL DAVID JENNINGS, D",FRENTER_AVE,08:46:51,08:47:07,Narrative: checked Narrative: Checked 0159,"PATROL DAVID JENNINGS, D",Initiated - BUILDING CHECK,BUILDING CHECKED/SECURED
4,01/01/2019,1,19-7,0847,Initiated - BUILDING CHECK BUILDING CHECKED/S...,"PATROL DAVID JENNINGS, D",HARWOOD_ST,08:47:37,08:48:05,Narrative checked Narrative Checked 0201,"PATROL DAVID JENNINGS, D",Initiated - BUILDING CHECK,BUILDING CHECKED/SECURED
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10692,12/31/2019,2371,19-16060,1656,Initiated - MOTOR VEHICLE STOP CITATION ~ WAR...,PATROL KALVIN DZIBDSIAK HANCOCK RD,Unit:,16:56:09,17:03:51,,PATROL KALVIN DZIEDZIAK,Initiated - MOTOR VEHICLE STOP,CITATION
10693,12/31/2019,2371,19-16062,Call,,SERGEANT DAVID R LEMIEUX SOUTHWORTH ST,Unit:,17:04:50,17:06:35,Narrative: Checked 0151 Initiated - BUILDING...,SERGEANT DAVID R LEMIEUX,,
10694,12/31/2019,2371,19-16063,i7il,,SERGEANT DAVID R LEMIEUX NORTH ST,(WIA_243],17:11:31,17:12:47,Narrative checked SERVICES RENDERED 29-1606...,SERGEANT DAVID R LEMIEUX,,
10695,12/31/2019,2371,19-16065,1729,Phone - ASSIST OTHER AGENCY - wes PATROL DAVI...,,STETSON_RD,22:;23:;36,22:29:55,Narrative: he green shed at the little league...,,Phone - ASSIST OTHER AGENCY,


In [240]:
parsed_pages.to_csv('../../data/parsed_logs_2019.csv', mode='w', index=False, header=True)

In [246]:
all2019 = pd.read_csv('../../data/parsed_logs_2019.csv')
all2019[all2019['call_number'] == '19-2157']



Unnamed: 0,current_date,page_num,call_number,call_time,call_reason_action,call_taker,call_address,arvd_time,clrd_time,narrative_text,cleaned_call_taker,call_reasons,call_actions
1551,02/18/2019,306,19-2157,1630,Initiated - MOTOR VEHICLE STOP CITATION - CIVIL,PATROL JOHN J MCCONNELL JR,SIMONDS_RD,16:30:58,16:40:52,,PATROL JOHN J MCCONNELL JR,Initiated - MOTOR VEHICLE STOP,CITATION - CIVIL


In [None]:
'PATROL CRAIG A EICHHAMMER'