### Import All Necessary Libraries

In [None]:
# ! pip install usaddress
# ! pip install spacy

import spacy
import re
import pandas as pd
from spacy import displacy
from spacy.attrs import LOWER 
from collections import Counter
from spacy.matcher import Matcher
import numpy as np
import usaddress
import requests
import pandas as pd

import nltk
from nltk.tokenize import RegexpTokenizer

# Load spaCy's small English model once. `nlp` parses each transcript and
# supplies the vocabulary for the street-name Matcher further down.
# (A preceding `spacy.load('en')` was dropped: its result was overwritten
# immediately and the 'en' shortcut fails when no shortcut link is installed.)
nlp = spacy.load('en_core_web_sm')

In [None]:
# Widen pandas' column display so long transcripts are not truncated
pd.set_option('display.max_colwidth', 1000)

### Import the Transcribed Audio

In [None]:
# Load the first transcript file and keep only the three shared columns
wanted_columns = ['file_name', 'confidence', 'transcript']
df1 = pd.read_csv(
    "./Datasets/transcribed_audio/Feed25818_May2020_01AM_to_03PM_transcript_Alex.csv"
).loc[:, wanted_columns]
df1.head(2)

In [None]:
# Load the second transcript file and discard the timing columns
df2 = pd.read_csv(
    "./Datasets/transcribed_audio/Feed25818_May2020_10AM_to_12AM_transcript_Zach.csv"
).drop(['audio_length', 'transcribe_time'], axis=1)
df2.head(2)

In [None]:
# Load the Watertown manhunt transcript and reshape it to the same three
# columns as the other transcript frames. It has no confidence score, so that
# column is filled with None, and every row gets a fixed file_name.
df3 = (
    pd.read_excel("./Datasets/transcribed_audio/watertown_manhunt_transcript.xlsx")
    .drop(columns=['Unnamed: 1'])
    .assign(confidence=None, file_name='watertown_manhunt')
    .loc[:, ['file_name', 'confidence', 'transcript']]
)

In [None]:
# Stack the three transcript frames and renumber the rows 0..n-1.
# `reset_index(drop=True)` discards the old per-file indices directly instead of
# materialising them as an 'index' column and dropping it afterwards.
df = pd.concat([df1, df3, df2]).reset_index(drop=True)
df

### Import List of Streets

In [None]:
# Read the street-name file; the names live in a column literally labelled '0'
# (presumably left over from a header-less DataFrame export -- verify).
street_list = pd.read_csv('./Datasets/Metro_West_Streets.csv')
streets_list = street_list.loc[:, '0'].tolist()
# Show the first ten names as a sanity check
streets_list[0:10]

### Tokenize the Transcribed Audio

In [None]:
def identify_tokens(row):
    """Return the alphabetic word tokens of a row's transcript.

    `row` is indexed with 'transcript'; the text is split with
    `nltk.word_tokenize` and any token that is not purely alphabetic
    (punctuation, numbers, mixed tokens) is discarded.
    """
    words_only = []
    for token in nltk.word_tokenize(row['transcript']):
        # keep words only -- str.isalpha() rejects punctuation and digits
        if token.isalpha():
            words_only.append(token)
    return words_only

# Tokenize every row's transcript and store the word lists in a new column
token_lists = df.apply(identify_tokens, axis=1)
df['tokens'] = token_lists

# adapted from Michael Allen (pythonhealthcare.org)

In [None]:
# Preview the first ten rows, including the new `tokens` column
df.head(10)

### Identify and Match Street Names from the Audio Transcripts

In [None]:
# Instantiate the spaCy Matcher on the loaded model's vocabulary; one pattern
# per street name is registered on it below.
matcher = Matcher(nlp.vocab)

# Callback passed to `matcher.add` together with every street pattern
def on_match(matcher, doc, id, matches):
    """Return `matches` unchanged.

    The other three arguments are accepted but not used; the function has
    no side effects.
    """
    return matches

# Build a Matcher pattern for a single road name
def build_pattern(road_name):
    """Return a token pattern matching `road_name` case-insensitively.

    The name is split on runs of whitespace, so doubled, leading or trailing
    spaces no longer produce empty-string tokens (which could never match a
    real token and silently disabled the whole pattern). Each word becomes
    one `{'LOWER': <lower-cased word>}` entry. A blank name yields `[]`.
    """
    # lower-case each word so capitalization does not affect matching
    return [{'LOWER': word.lower()} for word in road_name.split()]

# Register one pattern per street, keyed by the street's own name
for street_name in streets_list:
    street_pattern = build_pattern(street_name)
    matcher.add(street_name, on_match, street_pattern)
    
# Capitalize every space-separated word of a string
def capitalize_string(string_in):
    """Return `string_in` with each space-separated word capitalized.

    Words are split on single spaces, so runs of spaces are preserved.
    `str.capitalize` upper-cases the first character of a word and
    lower-cases the rest. An empty string is returned unchanged.
    """
    return ' '.join(word.capitalize() for word in string_in.split(' '))
    
# Look for known street names in a transcript, then extract them
def location_extraction_context(string_in):
    """Return the street names matched in `string_in`, or None if there are none.

    The text is parsed with `nlp` and scanned with the module-level `matcher`.
    For every match, the street name is rebuilt from the 'LOWER' values of the
    first pattern stored under the match's key, so the output uses the
    registered spelling rather than the transcript's. Names are joined with
    ' , ' and each word is capitalized, e.g. 'Main Street , Elm Street'.
    A street that is matched more than once appears more than once.
    """
    doc = nlp(string_in)
    matches = matcher(doc)
    if not matches:
        return None

    street_names = []
    for match in matches:
        # match[0] is the key the pattern was registered under; `[1][0]` picks
        # the first pattern stored for that key (presumably `get` returns
        # (callback, patterns) -- confirm when upgrading spaCy).
        first_pattern = matcher.get(match[0])[1][0]
        street_names.append(' '.join(token['LOWER'] for token in first_pattern))

    return capitalize_string(' , '.join(street_names))

# Add a column consisting of the extracted streets; rows whose transcript
# contains no known street get None
df['streets'] = df['transcript'].map(location_extraction_context)

Above Code Adapted from: Grant Wilson San Francisco Cohort

### Find Possible Street Numbers

In [None]:
# Collect, for every transcript, the tokens that usaddress labels as a street number
addresses = []

# Loop through all of the DataFrame's transcripts
for transcript in df['transcript']:

    # usaddress.parse yields (token, label) pairs; keep the tokens labelled
    # 'AddressNumber' as candidate street numbers
    numbers = [
        token
        for token, label in usaddress.parse(transcript)
        if label == 'AddressNumber'
    ]

    # One dict per row, so the list converts directly into a `numbers` column
    addresses.append({'numbers': numbers})

Above Code Adapted from: Grant Wilson San Francisco Cohort

### Add Street Number Column to the Dataframe

In [None]:
# Attach the `numbers` column side by side; both frames have one entry per
# transcript row, in the same order
df = pd.concat([df, pd.DataFrame(addresses)], axis=1)

In [None]:
# Drop rows in which no street was found (`streets` is None for those rows).
# The drop is restricted to the `streets` column: a bare dropna() would also
# discard every row with a missing value in an unrelated column, e.g. all
# Watertown rows, whose `confidence` is deliberately set to None.
df.dropna(subset=['streets'], inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
# Check how many rows and columns remain after the drop
df.shape

### Generate a List of All Possible Addresses for each Row in the Transcript

In [None]:
# Create list to house the possible addresses of every row
possibilities = []

# Walk the candidate numbers and the matched streets of each row together
for number_poss, streets_text in zip(df['numbers'], df['streets']):

    # `streets` holds the matched names as one comma-separated string
    street_poss = [name.strip() for name in streets_text.split(',')]

    # Pair every candidate number with every candidate street
    final_poss = [
        a_number + ' ' + a_street
        for a_number in number_poss
        for a_street in street_poss
    ]

    # De-duplicate; sorting makes the order reproducible between runs
    possibilities.append({'full_streets': sorted(set(final_poss))})

# Concatenate dataframes; naming the column explicitly keeps `full_streets`
# present even when there are no rows at all
df = pd.concat([df, pd.DataFrame(possibilities, columns=['full_streets'])], axis=1)

Above Code Adapted from: Grant Wilson San Francisco Cohort

In [None]:
# Preview the first two rows with the new `full_streets` column
df.head(2)

### Drop Blanks

In [None]:
# Mark rows that produced no number/street combination as missing, then drop
# exactly those rows. The drop is restricted to `full_streets` so that a
# missing value in an unrelated column (e.g. `confidence`) does not remove a
# row that does have candidate addresses.
df['full_streets'] = df['full_streets'].map(
    lambda combos: combos if len(combos) > 0 else np.nan
)
df.dropna(subset=['full_streets'], inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
# Preview the first two rows after removing rows without candidate addresses
df.head(2)

In [None]:
# Final row and column counts
df.shape