In [1]:
import string

from pathlib import Path

import pandas as pd
import numpy as np

from nltk.tokenize import word_tokenize

from rake_nltk import Rake

import py_stringmatching as sm

import spacy

import sys

sys.path.insert(0, '../data_cleaning_and_integration')

from cleaner_lib import remove_puncuations, clean_str_col

In [15]:
# Root of the data tree; every other path in this cell is derived from it.
data_p = Path("../data")

# Input: the cleaned Nextdoor posts.
nd_processed_p = data_p.joinpath("processed_nextdoor_data")
nd_cleaned_p = nd_processed_p.joinpath("cleaned_nd_final_data.csv")

# Outputs written by this notebook (keywords, keyword/corpus similarity, NER).
keyword_out_p = nd_processed_p.joinpath("nd_keyword_extraction.csv")
keyword_sim_out_p = nd_processed_p.joinpath("nd_keyword_corpi_sim.csv")
ner_out_p = nd_processed_p.joinpath("nd_ner.csv")

# Reference corpora and the custom stop-word list.
corpi_p = data_p.joinpath('corpi')
pd_crime_p = corpi_p.joinpath('crime_corpus.csv')
ethnicity_p = corpi_p.joinpath('ethnicity_corpus.csv')
neighborhood_p = corpi_p.joinpath('neighborhood_corpus.csv')
stopword_p = corpi_p.joinpath('stopwords.csv')

In [3]:
# Read the cleaned posts and check that post_id works as a unique key:
# the two printed numbers should match.
nd_df = pd.read_csv(nd_cleaned_p)
unique_post_ids = nd_df['post_id'].unique()
print("Unique post ID Count: ", len(unique_post_ids))
print("Dataset size:", len(nd_df))
nd_df.head()

Unique post ID Count:  2811
Dataset size: 2811


Unnamed: 0,post_id,ShortLink,Author,post_text,post_text_count,Neighborhood
0,nd1,https://nextdoor.com/p/--3jc5nsXN58?view=detail,Hannah Lopez,how late can people be working on construction...,131,Corridor
1,nd2,https://nextdoor.com/p/--mjpdwdS3yx?view=detail,Tim Welch,rain has finally arrived in north park but las...,280,Montclair
2,nd3,https://nextdoor.com/p/-3GwdKj4_sMm?view=detail,News,dont we have a water shortage... jennifer that...,1250,
3,nd4,https://nextdoor.com/p/-4qn3_2yNk_Y?view=detail,Frank Negrete,guess nd didnt like my question about drinking...,82,Hillcrest Northeast
4,nd5,https://nextdoor.com/p/-5-J-BXgJ84y?view=detail,Dawn Burton,day time robbery marston hillsupdate. update u...,1853,Hillcrest Southeast


# Keywords

In [4]:
# Tunable parameters for the keyword-extraction stage.
load_stopwords = True    # read the custom stop-word list from stopword_p
max_keyword_length = 2   # handed to Rake as max_length
max_keywords = 10        # not referenced by the cells visible in this notebook
# Posts are kept only when post_text_count is strictly greater than this.
# NOTE(review): an earlier comment said this drops ~15k posts, but the dataset
# loaded above has only 2,811 rows, so that figure looks stale.
min_word_count = 20

In [5]:
# Keep only posts whose post_text_count exceeds min_word_count. The .copy()
# detaches the result from nd_df so later edits leave the raw frame untouched.
is_long_enough = nd_df["post_text_count"] > min_word_count
nd_pred_df = nd_df.loc[is_long_enough].copy()

In [6]:
# Last clean-up pass before keyword extraction: blank out every run of digits,
# then strip punctuation from post_text with the shared cleaner helper.
digits_blanked = nd_pred_df["post_text"].replace(r'\d+', ' ', regex=True)
nd_pred_df = remove_puncuations(nd_pred_df.assign(post_text=digits_blanked), "post_text")
nd_pred_df

Unnamed: 0,post_id,ShortLink,Author,post_text,post_text_count,Neighborhood
0,nd1,https://nextdoor.com/p/--3jc5nsXN58?view=detail,Hannah Lopez,how late can people be working on construction...,131,Corridor
1,nd2,https://nextdoor.com/p/--mjpdwdS3yx?view=detail,Tim Welch,rain has finally arrived in north park but las...,280,Montclair
2,nd3,https://nextdoor.com/p/-3GwdKj4_sMm?view=detail,News,dont we have a water shortage jennifer that’s ...,1250,
3,nd4,https://nextdoor.com/p/-4qn3_2yNk_Y?view=detail,Frank Negrete,guess nd didnt like my question about drinking...,82,Hillcrest Northeast
4,nd5,https://nextdoor.com/p/-5-J-BXgJ84y?view=detail,Dawn Burton,day time robbery marston hillsupdate update un...,1853,Hillcrest Southeast
...,...,...,...,...,...,...
2806,nd2817,https://nextdoor.com/p/zyBKcPsfG8p4?view=detail,Lisa Busalacchi,got this text today… since i’m expecting some ...,1026,Del Cerro Hearst
2807,nd2818,https://nextdoor.com/p/zzWdg8FDxMw4?view=detail,Eleanor Jacobs,this has to be sketchy scammy don’t click dele...,171,North Park Burlingame/Altadena
2808,nd2819,https://nextdoor.com/p/zzYsgLb5T2sb?view=detail,Rosie Hin,hi everyone this is my baby ish month old co...,63,University Heights Antique Row N
2809,nd2820,https://nextdoor.com/p/zzgTmx49yTM4?view=detail,Grace Joseph,ah yes pm the perfect time to play the game g...,68,North Park Burlingame/Altadena


In [7]:
# Stop words handed to RAKE.
# When load_stopwords is True the project's custom list is read from stopword_p.
# Otherwise stop_words is set to None so the name always exists; previously it
# was left undefined in that case and the RAKE cell failed with a NameError.
# With None, rake_nltk falls back to NLTK's stop words for its language.
if load_stopwords:
    stopwords_df = pd.read_csv(stopword_p)
    stop_words = stopwords_df["stopwords"].tolist()
else:
    stop_words = None

In [8]:
# Build the RAKE extractor once: its configuration is identical for every post,
# and extract_keywords_from_text() recomputes the ranking from scratch on each
# call, so re-creating the object per row only repeated the set-up work.
r = Rake(
    max_length=max_keyword_length,
    include_repeated_phrases=False,
    stopwords=stop_words,
    punctuations=['&#', ';', '-', '.', ',', '&', "'", '"', '."', '(', ')', '/', '’', ').'],
)

# One [post_id, ranked keyword phrases, post_text] record per post.
keyword_res = []
for post_id, text in zip(nd_pred_df['post_id'], nd_pred_df['post_text']):
    r.extract_keywords_from_text(text)
    # Copy the phrase list so each record owns its keywords even though the
    # extractor instance is shared across posts.
    keyword_res.append([post_id, list(r.get_ranked_phrases()), text])

In [9]:
# Turn the per-post records into a frame, persist it, and show the result.
keyword_columns = ["post_id", "keywords", "post_text"]
keyword_df = pd.DataFrame(data=keyword_res, columns=keyword_columns)
keyword_df.to_csv(path_or_buf=keyword_out_p, index=False)
keyword_df

Unnamed: 0,post_id,keywords,post_text
0,nd1,"[willful violation, news trying, means capturi...",how late can people be working on construction...
1,nd2,"[“ yeah, vehicles chance, shall rebuild, san d...",rain has finally arrived in north park but las...
2,nd3,"[… enough, water usage, water situation, trans...",dont we have a water shortage jennifer that’s ...
3,nd4,"[public facewithtearsofjoy, faces bios, delete...",guess nd didnt like my question about drinking...
4,nd5,"[yet nothing, violent felonies, unlawful behav...",day time robbery marston hillsupdate update un...
...,...,...,...
2760,nd2817,"[xxx amount, vacation home, uspspaula absolute...",got this text today… since i’m expecting some ...
2761,nd2818,"[sketchy scammy, senders email, scammichael sa...",this has to be sketchy scammy don’t click dele...
2762,nd2819,"[‘ charlie, yrs old, outcarol thank, hi everyo...",hi everyone this is my baby ish month old co...
2763,nd2820,"[tongueincheekdarn fireworks, seconds apart, p...",ah yes pm the perfect time to play the game g...


# Similarity

In [10]:
# Whitespace tokenizer used to split each post's joined keyword phrases into
# tokens before they are scored against a corpus.
# return_set=True presumably de-duplicates the tokens (per py_stringmatching) — TODO confirm.
ws_tok = sm.WhitespaceTokenizer(return_set=True)

In [11]:
def _unique_column_values(frame, column):
    """Return the distinct values of ``frame[column]`` as a list (order not guaranteed)."""
    return list(set(frame[column].tolist()))


# Load each corpus CSV and reduce its term column to a de-duplicated list.
crime_corpus_df = pd.read_csv(pd_crime_p)
crime_corpus = _unique_column_values(crime_corpus_df, "crime")

ethnicity_corpus_df = pd.read_csv(ethnicity_p)
ethnicity_corpus = _unique_column_values(ethnicity_corpus_df, "ethnicity")

neighborhood_corpus_df = pd.read_csv(neighborhood_p)
neighborhood_corpus = _unique_column_values(neighborhood_corpus_df, "neighborhood")

In [12]:
def get_jac_sim(*, target_tok, tok):
    """Return the Jaccard similarity between two token collections.

    Both arguments are keyword-only and are passed, in this order, to
    ``py_stringmatching``'s ``Jaccard.get_sim_score``.

    Args:
        target_tok: reference tokens (the corpus side of the comparison).
        tok: tokens to score against ``target_tok``.

    Returns:
        The score produced by ``Jaccard.get_sim_score``.
    """
    return sm.Jaccard().get_sim_score(target_tok, tok)

def calc_strs(data_df, col, corpus_df, tok):
    """Score every row of ``data_df[col]`` against a corpus with Jaccard similarity.

    Each cell of ``data_df[col]`` is expected to be an iterable of strings
    (keyword phrases). The phrases are joined with single spaces, tokenized
    with ``tok`` and compared to ``corpus_df`` via ``get_jac_sim``.

    Args:
        data_df: frame holding the column to score.
        col: name of the column containing the keyword phrases.
        corpus_df: reference token collection; despite the name, callers in
            this notebook pass a plain list of corpus terms.
        tok: tokenizer object exposing ``tokenize(str)``.

    Returns:
        A Series with one similarity score per row of ``data_df``.
    """
    def score_phrases(phrases):
        joined = ' '.join(phrases)
        return get_jac_sim(target_tok=corpus_df, tok=tok.tokenize(joined))

    return data_df[col].apply(score_phrases)

In [16]:
# Score each post's keywords against every reference corpus. The mapping gives
# the output column name for each corpus; insertion order fixes column order.
corpus_by_score_col = {
    "crime_score": crime_corpus,
    "ethnicity_score": ethnicity_corpus,
    "neighborhood_score": neighborhood_corpus,
}
for score_col, corpus_terms in corpus_by_score_col.items():
    # calc_strs only reads the "keywords" column, so the frame is passed as-is.
    keyword_df[score_col] = calc_strs(keyword_df, "keywords", corpus_terms, ws_tok)

# write to file
keyword_df.to_csv(keyword_sim_out_p, index=False)

In [17]:
# Posts whose keywords share at least one token with the crime corpus.
has_crime_overlap = keyword_df["crime_score"].gt(0)
keyword_df[has_crime_overlap]

Unnamed: 0,post_id,keywords,post_text,crime_score,ethnicity_score,neighborhood_score
0,nd1,"[willful violation, news trying, means capturi...",how late can people be working on construction...,0.005391,0.000000,0.0
2,nd3,"[… enough, water usage, water situation, trans...",dont we have a water shortage jennifer that’s ...,0.008448,0.000000,0.0
4,nd5,"[yet nothing, violent felonies, unlawful behav...",day time robbery marston hillsupdate update un...,0.042534,0.000000,0.0
5,nd6,"[unlocked report, police response, hood openno...",just a heads up the black nissan has had the h...,0.009321,0.017857,0.0
6,nd7,"[workers feel, still working, republic workers...",trash hauler republic services says union reje...,0.002653,0.000000,0.0
...,...,...,...,...,...,...
2759,nd2816,"[wording suggests, vast majority, usually rece...",strange note left in front yard hi all i came ...,0.013699,0.000000,0.0
2760,nd2817,"[xxx amount, vacation home, uspspaula absolute...",got this text today… since i’m expecting some ...,0.003181,0.000000,0.0
2761,nd2818,"[sketchy scammy, senders email, scammichael sa...",this has to be sketchy scammy don’t click dele...,0.001335,0.000000,0.0
2763,nd2820,"[tongueincheekdarn fireworks, seconds apart, p...",ah yes pm the perfect time to play the game g...,0.002759,0.000000,0.0


In [18]:
# Posts whose keywords share at least one token with the neighborhood corpus.
has_neighborhood_overlap = keyword_df["neighborhood_score"].gt(0)
keyword_df[has_neighborhood_overlap]

Unnamed: 0,post_id,keywords,post_text,crime_score,ethnicity_score,neighborhood_score
1,nd2,"[“ yeah, vehicles chance, shall rebuild, san d...",rain has finally arrived in north park but las...,0.000000,0.000000,0.002079
7,nd8,"[ustonya whattwice, tag cars, stories includin...",i just witnessed a guy on an electric skateboa...,0.007407,0.008772,0.001984
8,nd9,"[u relax, u call, tread visibledid, think perh...",did anyone hear the cars doing dounts on the c...,0.008621,0.000000,0.003210
11,nd12,"[zero access, yetdirty dogssure, years past, y...",last night at the stroke of midnight an army o...,0.007634,0.000000,0.003273
14,nd15,"[yard stuff, wits end, whole proposition, truc...",just a heads up we had our outdoor patio furni...,0.009479,0.000000,0.001852
...,...,...,...,...,...,...
2734,nd2790,"[“ make, young women, youd like, ww sometimes,...",hi my name is lillie i’m and a gemini upside...,0.004306,0.000000,0.001610
2738,nd2794,"[“ bless, working right, workers maybe, white ...",backpack stolen in front of house hi civita ne...,0.014493,0.004808,0.003350
2739,nd2795,"[window frame, weeks ago, traffic stop, thread...",my car was broken into between and or earl...,0.008274,0.006623,0.001848
2749,nd2805,"[time around, thief well, thanks filesdpd, sto...",hey everyone our backyard got broken into last...,0.019906,0.005917,0.001789


In [19]:
# Posts whose keywords share at least one token with the ethnicity corpus.
has_ethnicity_overlap = keyword_df["ethnicity_score"].gt(0)
keyword_df[has_ethnicity_overlap]

Unnamed: 0,post_id,keywords,post_text,crime_score,ethnicity_score,neighborhood_score
5,nd6,"[unlocked report, police response, hood openno...",just a heads up the black nissan has had the h...,0.009321,0.017857,0.000000
7,nd8,"[ustonya whattwice, tag cars, stories includin...",i just witnessed a guy on an electric skateboa...,0.007407,0.008772,0.001984
9,nd10,"[yards around, truly inhumane, texas st, tests...",if anyone knows the person who recently moved ...,0.003559,0.006944,0.000000
19,nd21,"[wander far, super sweet, super curious, simil...",found smilingfacewithopenhands smilingfacewith...,0.001202,0.007634,0.000000
40,nd43,"[” lbs, safethank god, brown eyes, blue shirt,...",this is a post from a local agencyfound safe t...,0.001359,0.028571,0.000000
...,...,...,...,...,...,...
2742,nd2798,"[workingronald biggica, woman team, window scr...",thieves broke into my house today somehow open...,0.017058,0.003968,0.000000
2747,nd2803,"[zero reason, young children, without dignity,...",stranger alert hey we just had a guy show up a...,0.013201,0.004566,0.000000
2749,nd2805,"[time around, thief well, thanks filesdpd, sto...",hey everyone our backyard got broken into last...,0.019906,0.005917,0.001789
2753,nd2809,"[white house, voters approved, trolley riders,...",i heard parking is being removed from park ave...,0.014925,0.004000,0.000000


# NER

In [4]:
# Load spaCy's "en_core_web_md" pipeline; the NER cells below call it on each
# post's text and read the recognised entities from the result.
nlp_spacy = spacy.load("en_core_web_md")

In [5]:
# Posts used for NER: only rows whose post_text_count is above 4.
# The filter result is copied explicitly because later cells add columns to
# ner_df and write individual cells; mutating a bare boolean-filtered frame
# makes pandas emit SettingWithCopyWarning. This also matches how the
# keyword-extraction frame is built earlier in the notebook.
has_enough_text = nd_df['post_text_count'] > 4
ner_df = nd_df.loc[has_enough_text].copy()
ner_df.head()

Unnamed: 0,post_id,ShortLink,Author,post_text,post_text_count,Neighborhood
0,nd1,https://nextdoor.com/p/--3jc5nsXN58?view=detail,Hannah Lopez,how late can people be working on construction...,131,Corridor
1,nd2,https://nextdoor.com/p/--mjpdwdS3yx?view=detail,Tim Welch,rain has finally arrived in north park but las...,280,Montclair
2,nd3,https://nextdoor.com/p/-3GwdKj4_sMm?view=detail,News,dont we have a water shortage... jennifer that...,1250,
3,nd4,https://nextdoor.com/p/-4qn3_2yNk_Y?view=detail,Frank Negrete,guess nd didnt like my question about drinking...,82,Hillcrest Northeast
4,nd5,https://nextdoor.com/p/-5-J-BXgJ84y?view=detail,Dawn Burton,day time robbery marston hillsupdate. update u...,1853,Hillcrest Southeast


In [6]:
# Check that post_id is still a unique key after filtering:
# the two printed numbers should match.
ner_unique_post_ids = ner_df['post_id'].unique()
print("Unique post ID Count: ", len(ner_unique_post_ids))
print("Dataset size:", len(ner_df))

Unique post ID Count:  2808
Dataset size: 2808


In [None]:
%%time
# Despite its name, ent_dict is a flat list holding one
# [post_id, entity label, entity text] record per entity spaCy recognises.
ent_dict = []
for post_id, post in zip(ner_df['post_id'], ner_df['post_text']):
    ent_dict.extend([post_id, ent.label_, ent.text] for ent in nlp_spacy(post).ents)

In [None]:
# Pull the label (second field) out of every entity record.
labels_seen = [record[1] for record in ent_dict]

# Distinct entity labels found in the Nextdoor posts, as ordered by np.unique.
distinct_labels = list(np.unique(labels_seen))

# Pair each label with spaCy's description of it.
final_lst = [[label, spacy.explain(label)] for label in distinct_labels]
final_lst

In [None]:
# Entity labels we keep for the analysis.
ent_labels = ['DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'NORP', 'ORG', 'PERSON', 'TIME']

# Add one column per label to the input dataframe, initialised to None.
# Iterate over ent_labels: the loop previously read ent_label_dict, which is
# only created by a later cell, so it raised NameError on a fresh kernel.
for lbl in ent_labels:
    ner_df[lbl] = None

In [None]:
%%time
# For every post, group the recognised entity texts by label and store each
# non-empty group as a list in that label's column.
# Labels of interest:
# DATE, EVENT, FAC, GPE, LANGUAGE, LAW, LOC, NORP, ORG, PERSON, TIME

for index, post in ner_df['post_text'].items():
    # Fresh bucket per label for this post.
    ent_label_dict = {label: [] for label in ent_labels}
    for ent in nlp_spacy(post).ents:
        bucket = ent_label_dict.get(ent.label_)
        if bucket is not None:
            bucket.append(ent.text)

    # Labels with no hits keep whatever the column already holds for this row.
    for label, texts in ent_label_dict.items():
        if texts:
            ner_df.at[index, label] = texts

In [None]:
# Persist the posts together with their per-label entity columns, then preview.
# NOTE(review): the entity columns hold Python lists; presumably they are written
# to CSV in their string form — confirm that downstream readers parse them back.
ner_df.to_csv(ner_out_p, index=False)
ner_df.head()

In [None]:
# Non-null cells per column; for the entity columns this is the number of posts
# in which at least one entity of that label was found.
ner_df.notna().sum()

In [None]:
# First 20 rows whose ORG column is populated.
org_entities = ner_df['ORG'].dropna()
org_entities.head(20)

In [None]:
# First 20 rows whose GPE column is populated.
gpe_entities = ner_df['GPE'].dropna()
gpe_entities.head(20)

In [None]:
# First 20 rows whose NORP column is populated.
norp_entities = ner_df['NORP'].dropna()
norp_entities.head(20)

In [None]:
# First 20 rows whose DATE column is populated.
date_entities = ner_df['DATE'].dropna()
date_entities.head(20)

# Merge NER and keywords