# Create crime corpus

In [1]:
from pathlib import Path 

import pandas as pd

import py_stringmatching as sm

from cleaner_lib import remove_puncuations, clean_str_col

In [2]:
# Notebook switch: True rebuilds the crime corpus from the annotated call-type
# file; False reloads the previously written crime corpus instead.
create_new: bool = False

In [3]:
# Whitespace tokenizer from py_stringmatching, configured with return_set=True
# (per the library docs this requests unique tokens rather than a bag).
ws_tok = sm.WhitespaceTokenizer(
    return_set=True,
)

In [4]:
# Directory layout: every input and output lives under ./data/
data_p = Path('./data/')
cleaned_p = data_p.joinpath('cleaned')

# Corpus files written (and, for the neighborhood file, also read) by this notebook
crime_out_p = data_p.joinpath('crime_corpus.csv')
ethnicity_out_p = data_p.joinpath('ethnicity_corpus.csv')
neighborhood_p = data_p.joinpath('neighborhood_corpus.csv')

# Crime input: the annotated call types when rebuilding, otherwise the
# existing crime corpus file.
pd_crime_p = data_p.joinpath(
    'annotated_pd_call_types.csv' if create_new else 'crime_corpus.csv'
)

In [5]:
# Read the table the crime corpus is built from; which file this is depends
# on how pd_crime_p was chosen from the create_new flag.
crime_corpus_df = pd.read_csv(filepath_or_buffer=pd_crime_p)
crime_corpus_df

Unnamed: 0,crime
0,reckless driving
1,stolen vehicle log
2,ambulance call overdose
3,abandoned refrigerator
4,calling for help
...,...
708,beguilt
709,ernest klein
710,karl brugmann
711,libertarianism


In [6]:
# Build the crime corpus from the annotated call types. Runs only when
# create_new is set; otherwise the frame loaded earlier is left untouched.
if create_new:
    # Replace NaN with '' so the string operations below never see missing values.
    crime_corpus_df = crime_corpus_df.fillna('')
    # Drop call types whose crime_type annotation contains 'exclude'. The
    # .copy() makes the filtered frame independent so the column assignments
    # below do not trigger SettingWithCopyWarning.
    crime_corpus_df = crime_corpus_df.loc[
        ~crime_corpus_df.crime_type.str.contains('exclude')
    ].copy()

    # Normalise the description text: '/' and '-' become spaces, then the shared
    # cleaner runs, then digit runs are blanked and punctuation is removed.
    # The order of these steps is the same as before.
    crime_corpus_df["description"] = crime_corpus_df["description"].replace('/', ' ', regex=True)
    crime_corpus_df["description"] = crime_corpus_df["description"].replace('-', ' ', regex=True)
    crime_corpus_df = clean_str_col(crime_corpus_df, "description")
    crime_corpus_df["description"] = crime_corpus_df["description"].replace(r'\d+', ' ', regex=True)
    crime_corpus_df = remove_puncuations(crime_corpus_df, "description")

    # Extra crime keywords that are added on top of the call-type descriptions.
    # "abuse" and "embezzled" were previously misspelled ("absuse",
    # "embezzeled") and therefore could never match correctly spelled text.
    other_crimes = [
        "stolen", "stole", "steal", "scam", "scammed", "scamming",
        "murder", "murdered", "rape", "abuse", "domestic violence",
        "larceny", "embezzled", "robber",
    ]

    # De-duplicate and sort. A bare list(set(...)) yields a different row order
    # on every run (string hashing is randomised per process), which made the
    # written CSV non-reproducible. key=str keeps the sort safe should a
    # non-string value survive the cleaning steps.
    crime_corpus = sorted(
        set(crime_corpus_df.description.tolist() + other_crimes),
        key=str,
    )

    pd.Series(crime_corpus, name='crime').to_csv(crime_out_p, index=False)

# Create ethnicity corpus

In [7]:
# Hand-written list of ethnicity terms, saved as a one-column CSV whose
# column header is 'ethnicity'.
ethnicity_corpus = [
    "caucasian",
    "white",
    "indian",
    "african american",
    "black",
    "hispanic",
    "native american",
    "russian",
    "chinese",
    "asian",
    "canadian",
    "japanese",
]
pd.Series(data=ethnicity_corpus, name='ethnicity').to_csv(ethnicity_out_p, index=False)

# Create neighborhood corpus

In [8]:
# Load the neighborhood list. The file is read without a header row, so the
# single column arrives under the integer label 0 and is renamed here.
neighborhood_df = (
    pd.read_csv(neighborhood_p, header=None)
    .rename(columns={0: "neighborhood"})
)

# This notebook later writes the cleaned frame back to the same file with
# to_csv(..., index=False), which emits a "neighborhood" header line. Because
# the file is read with header=None, that line would come back as a data row
# on the next run. Drop it so Restart & Run All stays idempotent.
is_header_row = (
    neighborhood_df["neighborhood"].astype(str).str.strip().str.lower() == "neighborhood"
)
neighborhood_df = neighborhood_df.loc[~is_header_row].reset_index(drop=True)
neighborhood_df

Unnamed: 0,neighborhood
0,Clairemont Mesa East
1,Clairemont Mesa West
2,Bay Ho
3,North Clairemont
4,University City
...,...
564,University City
565,University Heights
566,Valencia Park
567,Via de la Valle


In [9]:
# Normalise neighborhood names: '/' and '-' become spaces, then the shared
# string cleaner is applied to the column.
neighborhood_df["neighborhood"] = (
    neighborhood_df["neighborhood"]
    .replace('/', ' ', regex=True)
    .replace('-', ' ', regex=True)
)
neighborhood_df = clean_str_col(neighborhood_df, "neighborhood")

# Remove repeated rows and overwrite the file the list was loaded from.
neighborhood_df = neighborhood_df.drop_duplicates()
neighborhood_df.to_csv(neighborhood_p, index=False)