DSI BOS 11 (May 2020) Project 5

Alex Golden, Jungmoon Ham, Luke Podsiadlo, Zach Tretter

Workbook 5 - Natural Language Processing

---------

## Identify Addresses in Transcripts

This workbook leverages the work of [DSI-SF-9 (Grant Wilson, J. Hall, Gabriel Perez Prieto)](https://github.com/GWilson97/san_francisco_dispatch_audio_mapping/blob/master/code/04_get_street_name.ipynb)

### Workflow Steps

1. [Imports](#Imports)
2. [Read in Transcripts](#Read-in-Transcripts)
3. [Import List of Streets](#Import-List-of-Streets)
4. [Tokenize the Transcribed Audio](#Tokenize-the-Transcribed-Audio)
5. [Identify and Match Street Names in Transcripts](#Identify-and-Match-Street-Names-in-Transcripts)
6. [Find Possible Street Numbers](#Find-Possible-Street-Numbers)
7. [Add Street Numbers to Dataframe](#Add-Street-Numbers-to-Dataframe)
8. [Generate Potential Addresses](#Generate-Potential-Addresses)
9. [Drop Blanks and Export](#Drop-Blanks-and-Export)

## Imports

In [None]:
# ! pip install usaddress
# ! pip install spacy

import spacy
import re
import pandas as pd
from spacy import displacy
from spacy.attrs import LOWER 
from collections import Counter
from spacy.matcher import Matcher
import numpy as np
import usaddress
import requests
import pandas as pd
import os

import seaborn as sns

import nltk
from nltk.tokenize import RegexpTokenizer

# !python -m spacy download en_core_web_sm
import en_core_web_sm
nlp = en_core_web_sm.load()

In [None]:
# Widen pandas' per-cell display limit to 1000 characters so that long
# transcripts are shown in full instead of being truncated
pd.options.display.max_colwidth = 1000

### Read in Transcripts

* feed_25818_raw_transcript.csv
* feed_25818_enhanced_transcript.csv
* watertown_manhunt_transcript.xlsx

In [None]:
# Raw-audio transcripts: flag them as not Dolby-enhanced and keep only the
# four columns shared by every transcript source
df1 = (
    pd.read_csv("../DATASETS/transcripts/feed_25818_raw_transcript.csv")
    .assign(dolby=False)
    [['file_name', 'confidence', 'dolby', 'transcript']]
)

In [None]:
# Enhanced-audio transcripts: flag them as Dolby-enhanced and keep only the
# four columns shared by every transcript source
df_enhanced = (
    pd.read_csv("../DATASETS/transcripts/feed_25818_enhanced_transcript.csv")
    .assign(dolby=True)
    [['file_name', 'confidence', 'dolby', 'transcript']]
)

In [None]:
# Watertown manhunt transcript: drop the unused 'Unnamed: 1' column and fill in
# the metadata columns so the frame lines up with the other two sources
df_watertown = (
    pd.read_excel("../DATASETS/transcripts/watertown_manhunt_transcript.xlsx")
    .drop(columns=['Unnamed: 1'])
    .assign(
        confidence='manual',
        dolby=False,
        file_name='watertown_manhunt',
    )
    [['file_name', 'confidence', 'dolby', 'transcript']]
)

In [None]:
# Stack the three transcript sources and renumber the rows 0..n-1
transcript_frames = [df1, df_enhanced, df_watertown]
df = pd.concat(transcript_frames, ignore_index=True)
df

### Import List of Streets

In [None]:
# Load the reference street file; the column labelled '0' is turned into a
# plain Python list, which is later used to build one matcher pattern per entry
street_list = pd.read_csv('../DATASETS/ancillary_csv/Metro_West_Streets.csv')
streets_list = street_list['0'].tolist()

### Tokenize the Transcribed Audio

Adapted from [Michael Allen (pythonhealthcare.org)](https://pythonhealthcare.org/2018/12/14/101-pre-processing-data-tokenization-stemming-and-removal-of-stop-words/)

In [None]:
def identify_tokens(row):
    """Return the alphabetic word tokens of ``row['transcript']``.

    The transcript is split with ``nltk.word_tokenize``; any token for which
    ``str.isalpha`` is false (punctuation, numbers, mixed tokens) is dropped.
    """
    return [
        token
        for token in nltk.word_tokenize(row['transcript'])
        if token.isalpha()
    ]

# Tokenize every row's transcript (axis=1 passes one row at a time)
df['tokens'] = df.apply(identify_tokens,
                        axis=1)

### Identify and Match Street Names in Transcripts

Adapted from [DSI-SF-9 (Grant Wilson, J. Hall, Gabriel Perez Prieto)](https://github.com/GWilson97/san_francisco_dispatch_audio_mapping/blob/master/code/04_get_street_name.ipynb)

In [None]:
# Instantiate the spaCy Matcher on the loaded pipeline's vocabulary;
# street-name patterns are registered on it below
matcher = Matcher(nlp.vocab)

# Create a matching function
def on_match(matcher, doc, id, matches):
    """Match callback registered with every street pattern.

    Returns ``matches`` unchanged; ``matcher``, ``doc`` and ``id`` are
    accepted but not used.
    """
    return matches

# Build the token pattern for a single road name
def build_pattern(road_name):
    """Return one ``{'LOWER': word}`` dict per space-separated word.

    Every word is lower-cased so that capitalization in the transcript does
    not affect matching.
    """
    lowered_words = map(str.lower, road_name.split(' '))
    return [{'LOWER': lowered} for lowered in lowered_words]

# Register one pattern per road, keyed by the road name itself
# NOTE(review): the positional (key, on_match, pattern) call matches the
# spaCy v2 Matcher.add signature; spaCy v3 expects a list of patterns and
# on_match as a keyword -- confirm the installed version before upgrading
for road in streets_list:
    matcher.add(road, on_match, build_pattern(road))
    
# Capitalize each word of a string
def capitalize_string(string_in):
    """Return ``string_in`` with ``str.capitalize`` applied to every word.

    Words are the pieces between single spaces, so runs of spaces are kept
    exactly as they appear in the input.
    """
    return ' '.join(piece.capitalize() for piece in string_in.split(' '))
    
# Look for locations in the transcript, then extract them
def location_extraction_context(string_in):
    """Return the known street names found in a transcript.

    ``string_in`` is run through the spaCy pipeline (``nlp``) and the
    module-level ``matcher``. Each match is rendered from the words of the
    first pattern registered under the matched key, so every street comes
    out in the same format regardless of how it was spoken.

    Returns ``None`` when nothing matches; otherwise the capitalized street
    names joined by ``', '`` (for example ``'Main Street, Elm Street'``).
    """
    matches = matcher(nlp(string_in))
    if not matches:
        return None

    street_names = []
    for match in matches:
        # matcher.get(key) returns (callback, patterns); the first pattern
        # holds the lower-cased words of the street name
        pattern = matcher.get(match[0])[1][0]
        street_names.append(' '.join(token['LOWER'] for token in pattern))

    # Joining avoids the stray space before each comma that the previous
    # character-slicing approach left behind
    return capitalize_string(', '.join(street_names))

# Add a column consisting of the extracted streets
# (rows whose transcript matches no known street get None)
df['streets'] = df['transcript'].map(location_extraction_context)

### Find Possible Street Numbers

Adapted from [DSI-SF-9 (Grant Wilson, J. Hall, Gabriel Perez Prieto)](https://github.com/GWilson97/san_francisco_dispatch_audio_mapping/blob/master/code/04_get_street_name.ipynb)

In [None]:
# Create a list holding, for every transcript, its possible street numbers
addresses = []

for row in df['transcript']:
    # usaddress.parse labels each piece of the text; keep only the pieces
    # it tags as 'AddressNumber'
    numbers = [
        piece[0]
        for piece in usaddress.parse(row)
        if piece[1] == 'AddressNumber'
    ]

    # One dict per transcript so the list converts directly to a DataFrame
    addresses.append({'numbers': numbers})

### Add Street Numbers to Dataframe

In [None]:
# Attach the 'numbers' column side by side with the transcripts
# (axis=1 aligns the two frames on their row index)
df = pd.concat([df, 
                pd.DataFrame(addresses)], axis=1)

In [None]:
# Remove every row containing a missing value (this includes transcripts in
# which no street was found), then renumber the remaining rows from 0
df = df.dropna().reset_index(drop=True)

In [None]:
# Rows and columns remaining after dropping rows with missing values
df.shape

### Generate Potential Addresses

Adapted from [DSI-SF-9 (Grant Wilson, J. Hall, Gabriel Perez Prieto)](https://github.com/GWilson97/san_francisco_dispatch_audio_mapping/blob/master/code/04_get_street_name.ipynb)

In [None]:
# Create a list holding the possible addresses for every row
possibilities = []

# Walk the street numbers and matched streets of each row in lockstep
for numbers, streets in zip(df['numbers'], df['streets']):

    # 'streets' is a single comma-separated string of street names
    street_poss = [name.strip() for name in streets.split(',')]

    # Pair every candidate number with every candidate street
    final_poss = [
        f"{number} {street}"
        for number in numbers
        for street in street_poss
    ]

    # dict.fromkeys removes duplicates while keeping first-seen order, so
    # the exported file is identical from run to run (set() order is not)
    possibilities.append({'full_streets': list(dict.fromkeys(final_poss))})

# Concatenate dataframes; naming the column explicitly keeps 'full_streets'
# present even when there are no rows at all
df = pd.concat([df,
                pd.DataFrame(possibilities, columns=['full_streets'])],
               axis=1)

### Drop Blanks and Export

In [None]:
# Rows with no candidate address hold an empty list: mark them as missing,
# drop them, and renumber the remaining rows from 0
df['full_streets'] = df['full_streets'].map(
    lambda candidates: candidates if len(candidates) != 0 else np.nan
)
df = df.dropna().reset_index(drop=True)

In [None]:
# Rows and columns remaining once rows without a candidate address are gone
df.shape

In [None]:
# Display the final table, including the candidate addresses
df

In [None]:
# Export the final table; with pandas' defaults the row index is written as
# an extra, unnamed first column
df.to_csv("../DATASETS/dataframe_final.csv")

### Visualize "Confidence"

In [None]:
# Confidence values for every row that is neither the manual Watertown
# transcript nor a file whose name contains 'enhanced'
df_confid = df.loc[
    ~((df['file_name'] == 'watertown_manhunt')
      | (df['file_name'].str.contains('enhanced') == True)),
    'confidence',
]

In [None]:
# Average confidence of the selected transcripts
df_confid.mean()

In [None]:
# Histogram of the selected confidence values.
# Series.plot discards the `x` keyword, so the x-axis label is set on the
# returned Axes instead.
ax = df_confid.plot(
    kind='hist',
    title='Transcription Confidence',
    figsize=(10, 8))
ax.set_xlabel('Confidence')

In [None]:
# Same distribution drawn with seaborn: histogram only (kde=False), with an
# explicit title and axis labels
# NOTE(review): sns.distplot is deprecated in newer seaborn releases in favour
# of histplot/displot -- confirm the installed version before upgrading
ax = sns.distplot(df_confid, kde = False)
ax.set_title('Transcription Confidence')
ax.set_ylabel("Frequency")
ax.set_xlabel("Confidence")