# Scrape Data

In [62]:
# General
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# For Natural Language Processing
import regex as re
import unidecode
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

## Watertown transcript

In [63]:
# import text file, Watertown transcript
with open('watertown.txt', 'r') as file:
    data = file.read()

In [64]:
# Split the text by sentence
df = data.split('\n')
print(df[0])
print(df[2])

One 5'7", the second with darker skin, both suspects armed with firearms, driving a Black Mercedes SUV.  Carjacked at 816 memorial drive at the gas station in Cambridge, Suspects are two middle eastern males, on is 5'7", second one with



In [65]:
type(df[2])

str

In [66]:
len(df[2])

0

In [67]:
# Remove empty sentences
df = [string for string in df if len(string) > 0]

# Put it into the dataframe
df_transcript = pd.DataFrame({'transcript':df})

# Define Watertown text as 'is_crime'
df_transcript['type'] = 'is_crime'

In [136]:
df_transcript

Unnamed: 0,transcript,type
0,"One 5'7"", the second with darker skin, both suspects armed with firearms, driving a Black Mercedes SUV. Carjacked at 816 memorial drive at the gas station in Cambridge, Suspects are two middle eastern males, on is 5'7"", second one with",is_crime
1,"darker skin, no description on clothing yet, ahh both are armed with firearms""",is_crime
2,"508, inaudible...supposedly he is saying inaudible...fled towards Harvard Square. I believe he also said one of the suspects went into the Shell Station and paid cash for gas, put gas in the car, before they fled and that was when the victim was able to get out of the vehicle",is_crime
3,"Control: they went into that gas station, paid cash for gas and then fled towards Harvard Square?",is_crime
4,"Yea, I believe they have video in that Shell station, I am with the victim and believe they have video in the station",is_crime
...,...,...
163,"Control, the suspect vehicle is on Spruce - the suspect vehicle is on Spruce.",is_crime
164,"(officer at Spruce ) 19, ahh you have a math problem here, Do you have 1 or 2 SP's in custody? Do you know?",is_crime
165,"19, I don't know - I gotten word that there's one at Upland and Mt. Auburn in custody, I've gotten word there's another one at Dexter and Laurel - fallen down waiting for EMS - the second party is not out of his ....inaudible",is_crime
166,"521, All non Cambridge units we have two suspects in custody - one at Dexter and Laurel and one at Mt Auburn and Upland. Two suspects are in custody.",is_crime


In [139]:
# For street name
df_transcript.to_csv('watertown_street.csv', index=False)

## Football Commentary

In [69]:
# https://www.news18.com/fifa-world-cup-2018/commentary/21977/
# http://www.chesterfc.com/page/3/?s=LIVE+TEXT+COMMENTARY

In [70]:
# import text file, soccer commentary transcript
with open('football.txt', 'r') as file:
    data2 = file.read()

In [71]:
# Split the text by sentence
df_nonwatertown = data2.split('\n')

# Remove empty sentences
df_nonwatertown = [string for string in df_nonwatertown if len(string) > 0]

# Put it into the dataframe
df_non_transcript = pd.DataFrame({'transcript':df_nonwatertown})

# Define Watertown text as 'is_not_crime'
df_non_transcript['type'] = 'is_not_crime'

In [72]:
df_non_transcript

Unnamed: 0,transcript,type
0,we played nearly six minutes yer list’ning to five live from the bee bee cee Real Madrid nil Manchester united nil here in the bernabeu (1) Alonso (.) drives it in right footed (.) deep and its headed over the crossbar,is_not_crime
1,ROBBIE SAVAGE: (2) Yeh (.) I thought united (.) y’know the back four were bit too deep then for the initial free kick they started on the penalty area and drifted towards the six yard a /,is_not_crime
2,ALAN GREEN: / Sergio,is_not_crime
3,ROBBIE SAVAGE: /yep/,is_not_crime
4,ALAN GREEN: /Ramos who got the header in,is_not_crime
...,...,...
161,"Portugal Substitutions: Beto, Ruben Dias, Manuel Fernandes, Joao Mario, Anthony Lopes, Gelson Martins, Bruno Alves, Ricardo Pereira, Ricardo Quaresma, Mario Rui, Adrien Silva, Andre Silva",is_not_crime
162,"Portugal (4-2-3-1): Rui Patricio; Cedric, Pepe, Jose Fonte, Raphael Guerreiro; William Carvalho, Joao Moutinho; Bernardo Silva, Goncalo Guedes, Bruno Fernandes; Cristiano Ronaldo; Fernando Santos",is_not_crime
163,"Spain look to repeat their 2010 World Cup performance which they won, while trying to avoid their 2014 World Cup performance which they failed to make it out of the group stage. The big news coming out of the Spain camp is the firing of manager Julen Lopetegui on the eve of the start of the World Cup. Lopetegui did not lose a match while managing Spain. Spain players have rallied together, with Gerard Pique referencing the 1989 Michigan Wolverines basketball team, implying that they can win, even with the new manager.",is_not_crime
164,"The 2016 UEFA European Champions will look to make it out of the group stage after failing to do so in 2014. In what might be Cristiano Ronaldo's last World Cup, Ronaldo will be looking for glory. He will have help from an experienced defense, led by goalie Rui Patricio and stalwart center back Pepe, while Goncalo Guedes and Bernardo Silva will look to provide pace and support on the wings.",is_not_crime


In [73]:
# Combine into single dataframe
df = pd.concat([df_transcript, df_non_transcript])
df = df.reset_index(drop=True)
df['is_crime'] = df['type'].map(lambda t: 1 if t == 'is_crime' else 0)

In [74]:
df.head()

Unnamed: 0,transcript,type,is_crime
0,"One 5'7"", the second with darker skin, both suspects armed with firearms, driving a Black Mercedes SUV. Carjacked at 816 memorial drive at the gas station in Cambridge, Suspects are two middle eastern males, on is 5'7"", second one with",is_crime,1
1,"darker skin, no description on clothing yet, ahh both are armed with firearms""",is_crime,1
2,"508, inaudible...supposedly he is saying inaudible...fled towards Harvard Square. I believe he also said one of the suspects went into the Shell Station and paid cash for gas, put gas in the car, before they fled and that was when the victim was able to get out of the vehicle",is_crime,1
3,"Control: they went into that gas station, paid cash for gas and then fled towards Harvard Square?",is_crime,1
4,"Yea, I believe they have video in that Shell station, I am with the victim and believe they have video in the station",is_crime,1


# Cleaning

## Pre-processing

In [75]:
# Drop duplicates and check the base accuracy
df.drop_duplicates(inplace=True)
df['is_crime'].value_counts(normalize=True)

0    0.50303
1    0.49697
Name: is_crime, dtype: float64

In [76]:
# Preprocessing function
def commentaries_to_words(raw_commentary):
    
    # Get rid of accents
    unaccented = unidecode.unidecode(raw_commentary)
    
    # Get rid of punctuation
    letters_only = re.sub("[^a-zA-Z]", " ", unaccented)
    
    # Get all lowercase words
    words = letters_only.lower().split()
    
    # Instantiate and run Lemmatizer
    lemmatizer = WordNetLemmatizer()
    tokens_lem = [lemmatizer.lemmatize(i) for i in words]
    
    # Remove stop words
    stops = set(stopwords.words('english'))
    meaningful_words = [w for w in words if not w in stops]
    
    # Join into string and return the result.
    return(" ".join(meaningful_words))

In [77]:
# Clean all commentary 
total_commentaries = df.shape[0]
clean_commentaries = []

print("Cleaning the commentaries")

i = 0
for commentary in df['transcript']:
    clean_commentaries.append(commentaries_to_words(commentary))
    
    if (i+1) % 100 == 0:
        print(f'Commentary {i+1} of {total_commentaries}.')
        
        i += 1
        
        if i == total_commentaries:
            print('Done.')

Cleaning the commentaries


In [78]:
df = df.assign(clean_commentary = clean_commentaries)
df.head()

Unnamed: 0,transcript,type,is_crime,clean_commentary
0,"One 5'7"", the second with darker skin, both suspects armed with firearms, driving a Black Mercedes SUV. Carjacked at 816 memorial drive at the gas station in Cambridge, Suspects are two middle eastern males, on is 5'7"", second one with",is_crime,1,one second darker skin suspects armed firearms driving black mercedes suv carjacked memorial drive gas station cambridge suspects two middle eastern males second one
1,"darker skin, no description on clothing yet, ahh both are armed with firearms""",is_crime,1,darker skin description clothing yet ahh armed firearms
2,"508, inaudible...supposedly he is saying inaudible...fled towards Harvard Square. I believe he also said one of the suspects went into the Shell Station and paid cash for gas, put gas in the car, before they fled and that was when the victim was able to get out of the vehicle",is_crime,1,inaudible supposedly saying inaudible fled towards harvard square believe also said one suspects went shell station paid cash gas put gas car fled victim able get vehicle
3,"Control: they went into that gas station, paid cash for gas and then fled towards Harvard Square?",is_crime,1,control went gas station paid cash gas fled towards harvard square
4,"Yea, I believe they have video in that Shell station, I am with the victim and believe they have video in the station",is_crime,1,yea believe video shell station victim believe video station


In [79]:
df_clean = pd.DataFrame({'commentary': df['clean_commentary'], 'crime': df['is_crime']})
df_clean.head()

Unnamed: 0,commentary,crime
0,one second darker skin suspects armed firearms driving black mercedes suv carjacked memorial drive gas station cambridge suspects two middle eastern males second one,1
1,darker skin description clothing yet ahh armed firearms,1
2,inaudible supposedly saying inaudible fled towards harvard square believe also said one suspects went shell station paid cash gas put gas car fled victim able get vehicle,1
3,control went gas station paid cash gas fled towards harvard square,1
4,yea believe video shell station victim believe video station,1


In [80]:
df_clean.to_csv('cleaned_data.csv', index=False)

In [81]:
df_clean['commentary'].isna().sum()

0

# NLP

## Classification Models

In [82]:
# import libraries

# For classification modeling
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

# For evaluation
from sklearn.metrics import confusion_matrix

In [83]:
# import data
df = pd.read_csv('cleaned_data.csv')

In [84]:
# Few examples of crime
df.loc[df['crime']==1, 'commentary'].head()

0         one second darker skin suspects armed firearms driving black mercedes suv carjacked memorial drive gas station cambridge suspects two middle eastern males second one
1                                                                                                                       darker skin description clothing yet ahh armed firearms
2    inaudible supposedly saying inaudible fled towards harvard square believe also said one suspects went shell station paid cash gas put gas car fled victim able get vehicle
3                                                                                                            control went gas station paid cash gas fled towards harvard square
4                                                                                                                  yea believe video shell station victim believe video station
Name: commentary, dtype: object

In [85]:
# Few examples of not_crime
df.loc[df['crime']==0, 'commentary'].head()

164    played nearly six minutes yer list ning five live bee bee cee real madrid nil manchester united nil bernabeu alonso drives right footed deep headed crossbar
165                                        robbie savage yeh thought united know back four bit deep initial free kick started penalty area drifted towards six yard
166                                                                                                                                               alan green sergio
167                                                                                                                                               robbie savage yep
168                                                                                                                                     alan green ramos got header
Name: commentary, dtype: object

In [86]:
df = df.dropna()

### Get baseline accuracy score

In [87]:
df['crime'].value_counts(normalize=True)

0    0.506098
1    0.493902
Name: crime, dtype: float64

### Train/Test Split

In [88]:
# Set features and target
features = df['commentary']

X = features
y = df['crime']

In [89]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

### Bag of Words

In [90]:
# Instantiate CountVectorizer and add some stop words
cv = CountVectorizer(stop_words=['inaudible','okay','portugal','spain'])

# Fit the vectorizer
X_train_cv = cv.fit_transform(X_train)
X_train_cv = pd.DataFrame(X_train_cv.toarray(), columns=cv.get_feature_names())

# Transform the test set
X_test_cv = cv.transform(X_test)

### Logistic Regression

In [91]:
# Instantiate model
lr = LogisticRegression(solver='lbfgs')

# Fit model
lr.fit(X_train_cv, y_train)

# Get scores
print('CV score:', cross_val_score(lr, X_train_cv, y_train, cv=3).mean())
print('Training accuracy:', lr.score(X_train_cv, y_train))
print('Testing accuracy:', lr.score(X_test_cv, y_test))

CV score: 0.9512195121951219
Training accuracy: 1.0
Testing accuracy: 0.975609756097561


### Naive Bayes

In [92]:
# Instantiate model
nb = MultinomialNB(0.7)

# Fit model
nb.fit(X_train_cv, y_train)

# Get scores
print('CV score:', cross_val_score(nb, X_train_cv, y_train, cv=3).mean())
print('Training accuracy:', nb.score(X_train_cv, y_train))
print('Testing accuracy:', nb.score(X_test_cv, y_test))

CV score: 0.9430894308943089
Training accuracy: 0.9959349593495935
Testing accuracy: 0.9512195121951219


### Random Forest

In [93]:
# Instantiate model
rf = RandomForestClassifier()

In [94]:
rf.fit(X_train_cv, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [95]:
cross_val_score(rf,X_train_cv,y_train).mean()

0.8536326530612245

In [96]:
# Set model params 
rf_params = {
    'n_estimators': [15, 20, 25],
    'max_depth': [None, 1, 2, 3, 4, 5],
    'min_samples_split': [2,3,4]
}

rf_gs = GridSearchCV(rf, param_grid=rf_params)
rf_gs.fit(X_train_cv, y_train)
print(rf_gs.best_score_)
rf_gs.best_params_

0.8699591836734694


{'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 25}

In [97]:
# Store the best fit model as best_rf 
best_rf = rf_gs.best_estimator_

# Evaluate the best fit model on the train and test data
print('train score:', best_rf.score(X_train_cv, y_train))
print('test score:', best_rf.score(X_test_cv, y_test))

train score: 0.8617886178861789
test score: 0.8902439024390244


## Evaluation

In [98]:
from sklearn.metrics import confusion_matrix

In [99]:
# Use best scoring model to evaluate
predictions = lr.predict(X_test_cv)
cm = confusion_matrix(y_test, predictions)

In [100]:
# Convert confusion matrix to dataframe
cm_df = pd.DataFrame(cm,
                    columns = ['predicted neg', 'predicted pos'],
                    index = ['actual neg', 'actual pos'])
cm_df

Unnamed: 0,predicted neg,predicted pos
actual neg,41,1
actual pos,1,39


In [101]:
# Create dataframe to show true and predicted values
results = pd.DataFrame()
results['actual'] = y_test
results['predicted'] = predictions

# Create column to show confusion matrix values
results['confusion'] = ''

# Set true positives
results['confusion'] = np.where(((results['actual']==1) & (results['predicted']==1)), 'TP', results['confusion'])

# Set true negatives
results['confusion'] = np.where(((results['actual']==0) & (results['predicted']==0)), 'TN', results['confusion'])

# Set false positives
results['confusion'] = np.where(((results['actual']==0) & (results['predicted']==1)), 'FP', results['confusion'])

# Set false negatives
results['confusion'] = np.where(((results['actual']==1) & (results['predicted']==0)), 'FN', results['confusion'])

results.head(3)

Unnamed: 0,actual,predicted,confusion
109,1,1,TP
93,1,1,TP
52,1,1,TP


In [102]:
coefs = list(zip(cv.get_feature_names(), lr.coef_[0].T))
coefs = pd.DataFrame(coefs, columns = ['word','coef'])
coefs['e^coef'] = np.exp(coefs['coef'])

In [103]:
# Show words most associated with is_Crime
coefs.sort_values(by='e^coef', ascending=False).head(10)

Unnamed: 0,word,coef,e^coef
865,spruce,0.971849,2.642827
993,vehicle,0.880107,2.411157
652,officer,0.771334,2.162649
249,dexter,0.746303,2.109188
1014,watertown,0.731561,2.078322
621,mt,0.682675,1.979165
72,auburn,0.682675,1.979165
198,copy,0.58922,1.802582
520,laurel,0.576691,1.780138
189,confirm,0.561408,1.753138


In [104]:
coefs.sort_values(by='e^coef').head(10)

Unnamed: 0,word,coef,e^coef
84,ball,-0.844388,0.42982
387,get,-0.842814,0.430498
408,good,-0.835097,0.433833
946,time,-0.785553,0.455867
839,silva,-0.777572,0.45952
948,today,-0.697451,0.497852
127,bruno,-0.683084,0.505057
330,fernandes,-0.683084,0.505057
789,ronaldo,-0.680434,0.506397
212,cristiano,-0.680344,0.506443


# Street name 

In [105]:
pip install spacy

Note: you may need to restart the kernel to use updated packages.


In [106]:
pip install usaddress

Note: you may need to restart the kernel to use updated packages.


### Import all necessary libraries

In [107]:
import spacy
import re
import pandas as pd
from spacy import displacy
from spacy.attrs import LOWER 
from collections import Counter
from spacy.matcher import Matcher
import numpy as np
import usaddress
import requests
import pandas as pd

import nltk
from nltk.tokenize import RegexpTokenizer

In [111]:
pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz

Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz (12.0 MB)
Building wheels for collected packages: en-core-web-sm
  Building wheel for en-core-web-sm (setup.py) ... [?25ldone
[?25h  Created wheel for en-core-web-sm: filename=en_core_web_sm-2.2.0-py3-none-any.whl size=12019121 sha256=4090c07acc4b79a69b83b2681c46d6b598d26f751da4aa1ba365d1005741b84b
  Stored in directory: /Users/moon/Library/Caches/pip/wheels/64/69/41/6f820cf1d7488a0381a2059f66ec9f8f23116f7c67d18f3d8d
Successfully built en-core-web-sm
Note: you may need to restart the kernel to use updated packages.


In [116]:
import en_core_web_sm
nlp = en_core_web_sm.load()

In [117]:
# Set column width to be larger to display more content
pd.options.display.max_colwidth = 1000

### Import the transcribed audio

In [141]:
data = pd.read_csv('watertown_street.csv')

### Import List of streets

In [142]:
street_list = pd.read_csv('../Datasets/Metro_West_Streets.csv')

In [143]:
streets_list = street_list['0'].tolist()

### Tokenize the Transcribed Audio

In [144]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/moon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [145]:
def identify_tokens(row):
    tran = row['transcript']
    tokens = nltk.word_tokenize(tran)
    # taken only words (not punctuation)
    token_words = [w for w in tokens if w.isalpha()]
    return token_words

data['tokens'] = data.apply(identify_tokens, axis=1)

# adpapted from Michael Allen (pythonhealthcare.org)

In [146]:
data.head()

Unnamed: 0,transcript,type,tokens
0,"One 5'7"", the second with darker skin, both suspects armed with firearms, driving a Black Mercedes SUV. Carjacked at 816 memorial drive at the gas station in Cambridge, Suspects are two middle eastern males, on is 5'7"", second one with",is_crime,"[One, the, second, with, darker, skin, both, suspects, armed, with, firearms, driving, a, Black, Mercedes, SUV, Carjacked, at, memorial, drive, at, the, gas, station, in, Cambridge, Suspects, are, two, middle, eastern, males, on, is, second, one, with]"
1,"darker skin, no description on clothing yet, ahh both are armed with firearms""",is_crime,"[darker, skin, no, description, on, clothing, yet, ahh, both, are, armed, with, firearms]"
2,"508, inaudible...supposedly he is saying inaudible...fled towards Harvard Square. I believe he also said one of the suspects went into the Shell Station and paid cash for gas, put gas in the car, before they fled and that was when the victim was able to get out of the vehicle",is_crime,"[inaudible, supposedly, he, is, saying, inaudible, fled, towards, Harvard, Square, I, believe, he, also, said, one, of, the, suspects, went, into, the, Shell, Station, and, paid, cash, for, gas, put, gas, in, the, car, before, they, fled, and, that, was, when, the, victim, was, able, to, get, out, of, the, vehicle]"
3,"Control: they went into that gas station, paid cash for gas and then fled towards Harvard Square?",is_crime,"[Control, they, went, into, that, gas, station, paid, cash, for, gas, and, then, fled, towards, Harvard, Square]"
4,"Yea, I believe they have video in that Shell station, I am with the victim and believe they have video in the station",is_crime,"[Yea, I, believe, they, have, video, in, that, Shell, station, I, am, with, the, victim, and, believe, they, have, video, in, the, station]"


### Identify and Match Street Names from the Audio Transcripts

In [147]:
# Instantiate the Spacy Matcher Function
matcher = Matcher(nlp.vocab)

# Create a matching function
def on_match(matcher, doc, id, matches):
    return matches

# building patterns for every road name
def build_pattern(road_name):
    list_words = road_name.split(' ')
# ensure capitlization does not affect the model 
    pattern = [{'LOWER': word.lower()} for word in list_words]
    return pattern

# Get a pattern of every road
for road in streets_list:
    matcher.add(road, on_match, build_pattern(road))
    
# capitalize all the strings
def capitalize_string(string_in):
    words = string_in.split(' ')
    string_out = ''
    for i in words:
        string_out += i.capitalize() + ' '
    string_out = string_out[:-1]
    return string_out   
    
# Look for locations in the transcript, then extract them
def location_extraction_context(string_in):
    doc = nlp(string_in)
    string_out = ''
    list_words = string_in.split(' ')
    matches = matcher(doc)
    if len(matches) == 0:
        return None

    # loop through the matches and make sure they all follow the same format
    for match in matches:
        list_pattern = matcher.get(match[0])[1][0]
        for token in list_pattern:
            string_out += token['LOWER'] + ' '
        string_out += ', '
    string_out = string_out[:-3]
    string_out = capitalize_string(string_out)
    return string_out

# Add a column consisting of the extracted streets
data['streets'] = data['transcript'].map(location_extraction_context)

### Find possible street numbers

In [149]:
# Creat list to house data from all addresses
addresses = []

# Loop Through all DataFrame's rows
for row in data['transcript']:
    # Create dictionary to house data for each row of the DataFrame
    d = {}
    
    # Parse through rows and house results in a list
    list_tuples = usaddress.parse(row)
    
    # Create variable to house list of possible numbers
    numbers = []
    
    # Loop through each value in the list created
    for i, n in enumerate(list_tuples):
        
        # Get addresses' numbers
        if list_tuples[i][1] == 'AddressNumber':
            
            # Append numbers to list
            numbers.append(n[0])
    
    # Include keys and values into d
    d['numbers'] = numbers
    
    # Append d to addresses
    addresses.append(d)

In [150]:
data = pd.concat([data, pd.DataFrame(addresses)], axis=1)

In [151]:
# Drop NaNs
data.dropna(inplace=True)
data.reset_index(drop=True, inplace=True)

In [152]:
data.shape

(136, 5)

In [153]:
# Creat list to house data for possible addresses
possibilities = []

# Loop Through all DataFrame's rows
for i in range(0, data.shape[0]):
    
    # Create variables to temporarily house information
    final_poss = []
    d = {}
    number_poss = []
    
    # Loop through values in each row / numbers
    for row in data[i:i+1]['numbers']:
        for a_number in row:
            number_poss.append(a_number)
    
    # Loop through values in each row / streets
    street_poss = []
    for row2 in [x.split(',') for x in data[i:(i+1)]['streets']][0]:
        for j in row2.split(','):
            street_poss.append(j.strip())

    # Concatenate numbers and streets
    for i in number_poss:
        for j in street_poss:
            final_poss.append(i + ' ' + j)

    # Append all possibilities to list
    d['full_streets'] = list(set(final_poss))
    possibilities.append(d)

# Concatenate dataframes
data = pd.concat([data, pd.DataFrame(possibilities)], axis=1)

In [154]:
data.head()

Unnamed: 0,transcript,type,tokens,streets,numbers,full_streets
0,"One 5'7"", the second with darker skin, both suspects armed with firearms, driving a Black Mercedes SUV. Carjacked at 816 memorial drive at the gas station in Cambridge, Suspects are two middle eastern males, on is 5'7"", second one with",is_crime,"[One, the, second, with, darker, skin, both, suspects, armed, with, firearms, driving, a, Black, Mercedes, SUV, Carjacked, at, memorial, drive, at, the, gas, station, in, Cambridge, Suspects, are, two, middle, eastern, males, on, is, second, one, with]","The , Black , Memorial , Drive , The , Station , Cambridge , Middle , Eastern",[816],"[816 Cambridge, 816 Station, 816 Memorial, 816 Eastern, 816 Middle, 816 Black, 816 The, 816 Drive]"
1,"508, inaudible...supposedly he is saying inaudible...fled towards Harvard Square. I believe he also said one of the suspects went into the Shell Station and paid cash for gas, put gas in the car, before they fled and that was when the victim was able to get out of the vehicle",is_crime,"[inaudible, supposedly, he, is, saying, inaudible, fled, towards, Harvard, Square, I, believe, he, also, said, one, of, the, suspects, went, into, the, Shell, Station, and, paid, cash, for, gas, put, gas, in, the, car, before, they, fled, and, that, was, when, the, victim, was, able, to, get, out, of, the, vehicle]","Harvard , The , The , Station , The , The , The",[],[]
2,"Control: they went into that gas station, paid cash for gas and then fled towards Harvard Square?",is_crime,"[Control, they, went, into, that, gas, station, paid, cash, for, gas, and, then, fled, towards, Harvard, Square]","Station , Harvard",[],[]
3,"Yea, I believe they have video in that Shell station, I am with the victim and believe they have video in the station",is_crime,"[Yea, I, believe, they, have, video, in, that, Shell, station, I, am, with, the, victim, and, believe, they, have, video, in, the, station]","Station , The , The , Station",[],[]
4,"Tango inaudible....Does the victim report the operator, that was operating the vehicle was armed with a firearm but he was unsure about the passenger",is_crime,"[Tango, inaudible, the, victim, report, the, operator, that, was, operating, the, vehicle, was, armed, with, a, firearm, but, he, was, unsure, about, the, passenger]","The , The , The , The",[],[]


### Drop Blanks

In [155]:
data['full_streets'] = data['full_streets'].map(lambda x: np.nan if len(x) == 0 else x)
data.dropna(inplace=True)
data.reset_index(drop=True, inplace=True)

In [158]:
data

Unnamed: 0,transcript,type,tokens,streets,numbers,full_streets
0,"One 5'7"", the second with darker skin, both suspects armed with firearms, driving a Black Mercedes SUV. Carjacked at 816 memorial drive at the gas station in Cambridge, Suspects are two middle eastern males, on is 5'7"", second one with",is_crime,"[One, the, second, with, darker, skin, both, suspects, armed, with, firearms, driving, a, Black, Mercedes, SUV, Carjacked, at, memorial, drive, at, the, gas, station, in, Cambridge, Suspects, are, two, middle, eastern, males, on, is, second, one, with]","The , Black , Memorial , Drive , The , Station , Cambridge , Middle , Eastern",[816],"[816 Cambridge, 816 Station, 816 Memorial, 816 Eastern, 816 Middle, 816 Black, 816 The, 816 Drive]"
1,Is there an officer driving the 609 right now?,is_crime,"[Is, there, an, officer, driving, the, right, now]",The,[609],[609 The]
2,Shots Fired! Shots Fired! Officers pinned down 94 Spruce Street. I need backup. 94 Spruce Street I need long guns. I need long guns!,is_crime,"[Shots, Fired, Shots, Fired, Officers, pinned, down, Spruce, Street, I, need, backup, Spruce, Street, I, need, long, guns, I, need, long, guns]","Spruce , Spruce , Long , Long",[94],"[94 Spruce, 94 Long]"
3,111 there is an officer down at Hazel and Dexter.,is_crime,"[there, is, an, officer, down, at, Hazel, and, Dexter]","Hazel , Dexter",[111],"[111 Hazel, 111 Dexter]"
4,"94 Spruce Street, I need long guns. 94 Spruce.",is_crime,"[Spruce, Street, I, need, long, guns, Spruce]","Spruce , Long , Spruce",[94],"[94 Spruce, 94 Long]"
5,"982, Patch it up with probably Watertown, yea Watertown for now -there is a report of second officer down, there is definitely hand grenades and automatic gunfire.",is_crime,"[Patch, it, up, with, probably, Watertown, yea, Watertown, for, now, is, a, report, of, second, officer, down, there, is, definitely, hand, grenades, and, automatic, gunfire]","Watertown , Watertown","[982,]","[982, Watertown]"
6,1181 I am at Spruce and Lincoln,is_crime,"[I, am, at, Spruce, and, Lincoln]","Spruce , Lincoln",[1181],"[1181 Spruce, 1181 Lincoln]"
7,"13, CP I am on foot in the backyards between Dexter and Laurel.",is_crime,"[CP, I, am, on, foot, in, the, backyards, between, Dexter, and, Laurel]","The , Dexter , Laurel","[13,]","[13, Dexter, 13, The, 13, Laurel]"
8,"Okay once we complete those 2 blocks - expand it two more blocks from those areas. okay - we have plenty of police officers lets start using them - from 98 Spruce, first 2 blocks then 4 blocks.",is_crime,"[Okay, once, we, complete, those, blocks, expand, it, two, more, blocks, from, those, areas, okay, we, have, plenty, of, police, officers, lets, start, using, them, from, Spruce, first, blocks, then, blocks]",Spruce,[2],[2 Spruce]
9,inaudible.....we have a package on the ground at Mt Auburn - 543 Mt. Auburn.,is_crime,"[inaudible, have, a, package, on, the, ground, at, Mt, Auburn, Mt, Auburn]","The , Auburn , Auburn",[543],"[543 The, 543 Auburn]"


In [157]:
data.shape

(16, 6)