# Using an agreed upon corpus, extract neighborhoods and crimes for a given dataframe
The following code will search for and extract matches (exact and partial) of the neighborhood/crime corpi from a given dataframe. The philosophy is:
1. Look for exact corpus matches first using CountVectorizer
2. For records that had no exact matches, tokenize using ngrams (size 1-4)and try to find high score partial matches

In [1]:
import pandas as pd
import py_stringmatching as sm
import nltk
from nltk.util import ngrams

from sklearn.feature_extraction.text import CountVectorizer

## To use this code, specify the dataframe location and the column to be searched.

In [2]:
df = pd.read_csv('../data/cleaned_nd_final_data.csv')
df.reset_index(inplace = True)
col = 'post_text'

In [3]:
df['post_text_count'].sum()

1855181

Prep the search column by making it string.

In [4]:
df[col].fillna('', inplace = True)
df[col] = df[col].astype('str')

Import the agreed upon crime and neighborhood corpi

In [5]:
crime_corpus = pd.read_csv('../data/crime_corpus.csv')
crime_corpus['crime'].astype('str')
crime_corpus['crime'] = crime_corpus['crime'].apply(lambda x: x.strip())
crime_corpus.drop_duplicates(inplace = True)
crime_corpus.reset_index(drop = True, inplace = True)
crime_corpus.reset_index(inplace = True)

neighborhood_corpus = pd.read_csv('../data/neighborhood_corpus.csv')
neighborhood_corpus['neighborhood'].astype('str')
neighborhood_corpus['neighborhood'] = neighborhood_corpus['neighborhood'].apply(lambda x: x.strip())
neighborhood_corpus.reset_index(inplace = True)
neighborhood_corpus.drop_duplicates(inplace = True)

Initialize the vectorizer for each corpus and fit the vocab.

In [6]:
crime_vectorizer = CountVectorizer(vocabulary = crime_corpus['crime'])
neighborhood_vectorizer = CountVectorizer(vocabulary = neighborhood_corpus['neighborhood'])

In [7]:
print(crime_vectorizer.get_feature_names_out()[0:10])
print(neighborhood_vectorizer.get_feature_names_out()[0:10])

['reckless driving' 'stolen vehicle log' 'ambulance call overdose'
 'abandoned refrigerator' 'calling for help' 'adw cost recovery'
 'stayout of area no radio trans' 'receive sell stolen prop' 'adw'
 'officer needs help']
['clairemont mesa east' 'clairemont mesa west' 'bay ho' 'north clairemont'
 'university city' 'bay park' 'mission beach' 'pacific beach'
 'mission bay park' 'la jolla']


Find exact matching by using the transform method.

In [8]:
%%time
crime_vectorizer_matches = crime_vectorizer.transform(df[col])
neighborhood_vectorizer_matches = neighborhood_vectorizer.transform(df[col])

CPU times: total: 3.09 s
Wall time: 3.1 s


Check the shape of the matches matrix to make sure it has the right dimensions.

In [9]:
print(crime_vectorizer_matches.shape)
print(df.shape[0], len(crime_corpus['crime']))

print(neighborhood_vectorizer_matches.shape)
print(df.shape[0], len(neighborhood_corpus['neighborhood']))

(2811, 712)
2811 712
(2811, 402)
2811 402


Get the indices for the exact matches as a pandas dataframe

In [10]:
crime_exact_matches_df = pd.DataFrame({'dfindex': crime_vectorizer_matches.nonzero()[0], 
                                      'crimeindex': crime_vectorizer_matches.nonzero()[1]})
neighborhood_exact_matches_df = pd.DataFrame({'dfindex': neighborhood_vectorizer_matches.nonzero()[0], 
                                      'neighborhoodindex': neighborhood_vectorizer_matches.nonzero()[1]})
crime_exact_matches_df

Unnamed: 0,dfindex,crimeindex
0,0,171
1,0,297
2,0,322
3,0,350
4,0,368
...,...,...
27822,2807,339
27823,2809,218
27824,2809,488
27825,2810,105


In [11]:
neighborhood_exact_matches_df

Unnamed: 0,dfindex,neighborhoodindex
0,1,356
1,4,111
2,7,351
3,8,204
4,8,277
...,...,...
1084,2786,356
1085,2794,84
1086,2797,18
1087,2797,39


Merge the exact matches with the original corpi, then collapse into a list

In [12]:
crime_exact_matches_df = crime_exact_matches_df.merge(crime_corpus, how = 'inner', left_on = 'crimeindex', right_index = True)
crime_exact_matches_df = crime_exact_matches_df.groupby(by = 'dfindex', as_index = False).agg({'crime': lambda x: x.tolist()})
crime_exact_matches_df

Unnamed: 0,dfindex,crime
0,0,"[police, violation, investigative, news, killi..."
1,1,[money]
2,2,"[law, government, illegal, homeless, story, cr..."
3,3,"[news, domestic]"
4,4,"[police, news, laws, money, law, homeless, sta..."
...,...,...
2699,2805,"[police, news, law, illegal, bad, problems, ev..."
2700,2806,"[case, security, street, wrong, scam, attempt,..."
2701,2807,"[bad, business, scam]"
2702,2809,"[illegal, dangerous]"


In [13]:
neighborhood_exact_matches_df = neighborhood_exact_matches_df.merge(neighborhood_corpus, how = 'inner', 
                                                                    left_on = 'neighborhoodindex', 
                                                                    right_index = True)
neighborhood_exact_matches_df = neighborhood_exact_matches_df.groupby(by = 'dfindex', 
                                                                      as_index = False).agg({'neighborhood': lambda x: x.tolist()})
neighborhood_exact_matches_df

Unnamed: 0,dfindex,neighborhood
0,1,[downtown]
1,4,[castle]
2,7,[gateway]
3,8,"[hill, ridgeview]"
4,11,"[downtown, hillcrest]"
...,...,...
789,2784,[kensington]
790,2786,"[downtown, hill]"
791,2794,[hillcrest]
792,2797,"[miramar, skyline]"


Get records that did not have exact matches to be used for partial matching

In [14]:
crime_partial_matches_df = df[~df.index.isin(crime_exact_matches_df.index)].copy(deep = True)
print(crime_partial_matches_df.shape)
neighborhood_partial_matches_df = df[~df.index.isin(neighborhood_exact_matches_df.index)].copy(deep = True)
print(neighborhood_partial_matches_df.shape)

(107, 7)
(2017, 7)


Set up the n-gram tokenizer

In [15]:
al_tok = sm.AlphabeticTokenizer()

al_tok.tokenize('hello! world?this is: our2 project*')

['hello', 'world', 'this', 'is', 'our', 'project']

In [16]:
def ngram_tokenize(txt, min_len = 1, max_len = 4):
    alph_toks = al_tok.tokenize(txt)
    tokens = []
    for i in range(min_len, max_len + 1):
        tokens += [' '.join(gram) for gram in list(ngrams(alph_toks, i))]
    return tokens

In [17]:
ngram_tokenize('hello! world?this is: our2 project*')

['hello',
 'world',
 'this',
 'is',
 'our',
 'project',
 'hello world',
 'world this',
 'this is',
 'is our',
 'our project',
 'hello world this',
 'world this is',
 'this is our',
 'is our project',
 'hello world this is',
 'world this is our',
 'this is our project']

Create a tokenized column for records that did not have exact matches.

In [18]:
crime_partial_matches_df[col+'_tok'] = crime_partial_matches_df[col].apply(lambda x: ngram_tokenize(x))
neighborhood_partial_matches_df[col+'_tok'] = neighborhood_partial_matches_df[col].apply(lambda x: ngram_tokenize(x))

In [19]:
crime_partial_matches_df[col+'_tok'].head()

2704    [hey, everyone, i, just, need, some, help, fig...
2705    [as, long, as, we, buy, from, reputable, store...
2706    [more, self, inflicted, stupidity, from, our, ...
2707    [be, careful, of, this, guy, with, two, pitbul...
2708    [hi, does, anyone, know, what, s, going, on, i...
Name: post_text_tok, dtype: object

In [20]:
neighborhood_partial_matches_df[col+'_tok'].head()

794    [porch, pirate, right, before, pm, still, dayl...
795    [was, at, balboa, park, with, my, son, i, usua...
796    [last, night, someone, came, into, my, yard, t...
797    [dead, cats, i, ve, lost, cats, now, in, the, ...
798    [looking, for, doggie, foster, home, my, best,...
Name: post_text_tok, dtype: object

Estabilish similarity measures and matching function.

In [21]:
jaro = sm.Jaro()
print(jaro.get_raw_score('the', 'theft'))

lev = sm.Levenshtein()
print(lev.get_sim_score('MARTHA', 'MARHTA'))

0.8666666746139526
0.6666666666666667


In [38]:
def get_partial_matches(toks, corpus, min_score = 0.95):
    matches = []
    for token in toks:
        for corp in corpus:
            if (jaro.get_raw_score(token, corp) >= min_score):
                matches.append(corp)
    matches = list(set(matches))
    return matches

Get the partial matches.

In [93]:
%%time
crime_partial_matches_df['crime'] = crime_partial_matches_df[col+'_tok'].apply(lambda x: 
                                                                                            get_partial_matches(x, 
                                                                                                                crime_corpus['crime'].to_list()))

KeyboardInterrupt: 

In [47]:
'''
%%time
neighborhood_partial_matches_df['neighborhood'] = neighborhood_partial_matches_df[col+'_tok'].apply(lambda x: 
                                                                                            get_partial_matches(x, 
                                                                                                                neighborhood_corpus['neighborhood'].to_list()))
                                                                                                                '''

"\n%%time\nneighborhood_partial_matches_df['neighborhood'] = neighborhood_partial_matches_df[col+'_tok'].apply(lambda x: \n                                                                                            get_partial_matches(x, \n                                                                                                                neighborhood_corpus['neighborhood'].to_list()))\n                                                                                                                "

Concatenate the matches

In [None]:
crime_matches = pd.concat([crime_exact_matches_df[['index', 'crime']], crime_partial_matches_df[['index', 'crime']]

In [None]:
df_crime = df.merge(crime_matches, left_on = 'index', right_on = 'index', how = 'left')