# Acronyms
* pb = "peanut butter" or "Pacific Beach" (a San Diego neighborhood), depending on context

In [1]:
from pathlib import Path 

import pandas as pd

import nltk
#nltk.download('punkt')
from nltk.corpus import stopwords

from rake_nltk import Rake

import py_stringmatching as sm

import sys

sys.path.insert(0, '../data_cleaning_and_integration')

from cleaner_lib import remove_puncuations, clean_str_col

In [2]:
# Tunable parameters for this keyword-extraction run.

# Posts are kept only when their word count is greater than this value
# (drops roughly 15k posts).
min_word_count = 20

# Stop-word source: True reads the saved stop-word CSV, False rebuilds the
# list from NLTK's English stop words plus extra slang terms.
load_stopwords = True

# RAKE settings.
max_keyword_length = 2  # passed to Rake as max_length
max_keywords = 10  # NOTE(review): not referenced by any cell visible in this notebook

In [3]:
# File-system locations, all expressed relative to the notebook's working directory.
data_p = Path('../data/')

# Processed Reddit data: the cleaned posts read by this notebook, and the
# directory the scored output is written to.
cleaned_p = data_p.joinpath('processed_reddit_data')
cleaned_reddit_p = cleaned_p.joinpath('cleaned_reddit_12-21_to_1115.csv')

# Reference word lists used for stop words and similarity scoring.
corpi_p = data_p.joinpath('corpi')
stopword_p = corpi_p.joinpath('stopwords.csv')
pd_crime_p = corpi_p.joinpath('crime_corpus.csv')
ethnicity_p = corpi_p.joinpath('ethnicity_corpus.csv')
neighborhood_p = corpi_p.joinpath('neighborhood_corpus.csv')

In [4]:
# Load the cleaned Reddit posts from `cleaned_reddit_p` and show the frame as
# the cell output.
reddit_df = pd.read_csv(cleaned_reddit_p)
reddit_df

Unnamed: 0,subreddit,title,post_id,post_author,post_utc,full_link,post_text,post_text_count
0,sandiego,going to visit san diego next week any places...,x4nzh2,Fearmkultra,2022-09-03 06:57:58+00:00,https://www.reddit.com/r/sandiego/comments/x4n...,going to visit san diego next week any places ...,12
1,sandiego,interesting illusion’s,x4ny4c,Break-these-cuffs,2022-09-03 06:55:24+00:00,https://www.reddit.com/r/sandiego/comments/x4n...,interesting illusion’s,2
2,sandiego,whaley house picture of ghost,x4ntm7,Open_Construction_31,2022-09-03 06:47:09+00:00,https://www.reddit.com/r/sandiego/comments/x4n...,whaley house picture of ghost as a kid i saw t...,199
3,sandiego,language exchange,x4n6xv,Poshorock,2022-09-03 06:07:46+00:00,https://www.reddit.com/r/sandiego/comments/x4n...,language exchange is there someone by there wh...,31
4,SanDiegan,chula vista police stopping cars going east on...,x4n5aj,kaptaincorn,2022-09-03 06:04:54+00:00,https://www.reddit.com/r/SanDiegan/comments/x4...,chula vista police stopping cars going east on...,57
...,...,...,...,...,...,...,...,...
45352,UCSD,new covid variant detected in at least 40 diff...,sca7fv,Yeezy75024,2022-01-25 09:58:30+00:00,https://www.reddit.com/r/UCSD/comments/sca7fv/...,new covid variant detected in at least 40 diff...,93
45353,sandiego,tmz baltimore maggots leaked video twitter sca...,sc9b5t,EdgeIQ,2022-01-25 08:54:03+00:00,https://www.reddit.com/r/sandiego/comments/sc9...,tmz baltimore maggots leaked video twitter sca...,14
45354,UCSD,still doing devious licks,sc974s,UCSuckDick,2022-01-25 08:45:58+00:00,https://www.reddit.com/r/UCSD/comments/sc974s/...,still doing devious licks,4
45355,UCSD,mailing services while school’s online,sc90i4,esppperanza,2022-01-25 08:32:43+00:00,https://www.reddit.com/r/UCSD/comments/sc90i4/...,mailing services while school’s online hey eve...,223


In [5]:
# Keep only posts whose recorded word count exceeds `min_word_count`.
# .copy() gives an independent frame, so the column assignment in the next
# cell does not operate on a slice of the full data.
long_enough = reddit_df["post_text_count"] > min_word_count
reddit_df = reddit_df[long_enough].copy()

In [6]:
# Final text clean-up before keyword extraction.
# 1. Replace every run of digits in the post text with a single space.
digits_removed = reddit_df["post_text"].replace(to_replace=r'\d+', value=' ', regex=True)
reddit_df["post_text"] = digits_removed
# 2. Run the project helper on the same column (presumably strips punctuation;
#    see cleaner_lib for the exact behavior).
reddit_df = remove_puncuations(reddit_df, "post_text")
reddit_df

Unnamed: 0,subreddit,title,post_id,post_author,post_utc,full_link,post_text,post_text_count
2,sandiego,whaley house picture of ghost,x4ntm7,Open_Construction_31,2022-09-03 06:47:09+00:00,https://www.reddit.com/r/sandiego/comments/x4n...,whaley house picture of ghost as a kid i saw t...,199
3,sandiego,language exchange,x4n6xv,Poshorock,2022-09-03 06:07:46+00:00,https://www.reddit.com/r/sandiego/comments/x4n...,language exchange is there someone by there wh...,31
4,SanDiegan,chula vista police stopping cars going east on...,x4n5aj,kaptaincorn,2022-09-03 06:04:54+00:00,https://www.reddit.com/r/SanDiegan/comments/x4...,chula vista police stopping cars going east on...,57
5,SanDiegan,todd gloria finalizes plan to change park blvd...,x4n2rv,Lemonade_IceCold,2022-09-03 06:00:38+00:00,https://www.reddit.com/r/SanDiegan/comments/x4...,todd gloria finalizes plan to change park blvd...,666
6,sandiego,ultimate adult tantrum,x4mz7c,oshunsorchard,2022-09-03 05:54:45+00:00,https://www.reddit.com/r/sandiego/comments/x4m...,ultimate adult tantrum do other business recei...,72
...,...,...,...,...,...,...,...,...
45350,SanDiegan,puppy play hours,scdswz,Emmyj123,2022-01-25 13:31:14+00:00,https://www.reddit.com/r/SanDiegan/comments/sc...,puppy play hours hi all wavinghand wavinghand ...,242
45351,UCSD,la jolla donor makes 50m research t that could...,scdqum,Yeezy75024,2022-01-25 13:28:21+00:00,https://www.reddit.com/r/UCSD/comments/scdqum/...,la jolla donor makes m research t that could ...,74
45352,UCSD,new covid variant detected in at least 40 diff...,sca7fv,Yeezy75024,2022-01-25 09:58:30+00:00,https://www.reddit.com/r/UCSD/comments/sca7fv/...,new covid variant detected in at least diffe...,93
45355,UCSD,mailing services while school’s online,sc90i4,esppperanza,2022-01-25 08:32:43+00:00,https://www.reddit.com/r/UCSD/comments/sc90i4/...,mailing services while school’s online hey eve...,223


In [7]:
# Build the stop-word list that is handed to RAKE.
if load_stopwords:
    # Reuse the previously saved list.
    stopwords_df = pd.read_csv(stopword_p)
    stop_words = stopwords_df.stopwords.tolist()
else:
    # Rebuild the list: NLTK's English stop words plus slang / filler terms.
    # NOTE(review): stopwords.words() presumably needs the NLTK 'stopwords'
    # corpus to be downloaded first — confirm it is available in this environment.
    add_words = ["haha", "ha", "was", "were", "lol", "lmao", "rofl", "bro", "sheesh", "wanna", "would", "im", "ive",
                 "...", "want", "went", "go", "get", "got", "going", "much", "rl", "irl", "iirc", "gonna", "yall",
                 "dont", "fyi", "hmu", "b", "nd", "k", "kk", "youre", "dm", "youll", "ngl", "tbh", "ez", "...", "ty", "l",
                 "th", "lmfao", "uwu", "noob", "etc", "af"
                 ]
    # Drop duplicate entries (e.g. "..." is listed twice) while keeping first-seen order.
    stop_words = list(dict.fromkeys(stopwords.words('english') + add_words))
    # Save to the same file the load branch reads (`stopword_p`); writing to
    # `data_p / 'stopwords.csv'` meant a rebuilt list was never picked up by a
    # later run with load_stopwords = True.
    pd.Series(stop_words, name="stopwords").to_csv(stopword_p, index=False)

In [8]:
# Extract RAKE keyword phrases for every post.
#
# The extractor's configuration is the same for every post, so it is built
# once and reused; each extract_keywords_from_text() call recomputes the
# ranking for the text it is given.
# NOTE(review): `max_keywords` from the params cell is not applied here, so
# every ranked phrase is kept — confirm whether a cap was intended.
rake = Rake(max_length=max_keyword_length, include_repeated_phrases=False,
            stopwords=stop_words,
            punctuations=['&#', ';', '-', '.', ',', '&', "'", '"', '."', '(', ')', '/', '’', ').'])

# One entry per post: [post_id, ranked phrases, cleaned text].
keyword_res = []
# Only two columns are needed, so iterate over them directly instead of
# materialising a full row Series per post with iterrows().
for post_id, text in zip(reddit_df['post_id'], reddit_df['post_text']):
    rake.extract_keywords_from_text(text)
    keyword_res.append([post_id, rake.get_ranked_phrases(), text])

In [11]:
# Collect the per-post results into a frame, save it alongside the other
# processed Reddit outputs, and display it.
keyword_df = pd.DataFrame(keyword_res, columns=["post_id", "keywords", "post_text"])
# Use the shared `cleaned_p` directory (same resolved location as the former
# hard-coded '../data/processed_reddit_data/' path) so all outputs follow the
# paths cell.
# NOTE(review): `keywords` holds Python lists; the CSV will presumably store
# them as text, so readers of this file may need to parse them back — confirm.
keyword_df.to_csv(cleaned_p / 'keyword_extraction.csv', index=False)
keyword_df

Unnamed: 0,post_id,keywords,post_text
0,x4ntm7,"[suddenly appeared, something hard, smoke weed...",whaley house picture of ghost as a kid i saw t...
1,x4n6xv,"[language exchange, practice spanish, practice...",language exchange is there someone by there wh...
2,x4n5aj,"[grand ave, seen, pb, holidays, end, east]",chula vista police stopping cars going east on...
3,x4n2rv,"[zoo uptown, working class, traffic elsewhere,...",todd gloria finalizes plan to change park blvd...
4,x4mz7c,"[verbal abuse, sell anything, extreme wind, bu...",ultimate adult tantrum do other business recei...
...,...,...,...
31410,scdswz,"[turning weeks, rohr park, north county, make ...",puppy play hours hi all wavinghand wavinghand ...
31411,scdqum,"[wasnt aware, san diego, never wondered, good ...",la jolla donor makes m research t that could ...
31412,sca7fv,"[sigma variant, new shot, like omicron, kill l...",new covid variant detected in at least diffe...
31413,sc90i4,"[thing thankfully, theyre forwarding, pretty i...",mailing services while school’s online hey eve...


# Similarity

In [12]:
# Create the whitespace tokenizer used to split the joined keyword phrases
# into tokens before scoring. return_set=True asks py_stringmatching to return
# each distinct token once rather than a bag with repeats.
ws_tok = sm.WhitespaceTokenizer(return_set=True)

In [13]:
# Load each reference corpus from CSV and reduce its term column to a list of
# distinct values (order is not preserved by the set round-trip).

# Crime terms.
crime_corpus_df = pd.read_csv(pd_crime_p)
crime_corpus = list(set(crime_corpus_df["crime"].tolist()))

# Ethnicity terms.
ethnicity_corpus_df = pd.read_csv(ethnicity_p)
ethnicity_corpus = list(set(ethnicity_corpus_df["ethnicity"].tolist()))

# Neighborhood names.
neighborhood_corpus_df = pd.read_csv(neighborhood_p)
neighborhood_corpus = list(set(neighborhood_corpus_df["neighborhood"].tolist()))

In [14]:
def get_jac_sim(*, target_tok, tok):
    """Return the Jaccard similarity score between two token collections.

    Both arguments are keyword-only and are passed unchanged to
    ``sm.Jaccard().get_sim_score``.

    Args:
        target_tok: Reference tokens (first argument to the scorer).
        tok: Tokens to compare against the reference.

    Returns:
        The score reported by py_stringmatching's Jaccard measure.
    """
    return sm.Jaccard().get_sim_score(target_tok, tok)

def calc_strs(data_df, col, corpus_df, tok):
    """Score each row's phrases in ``data_df[col]`` against a corpus.

    For every row, the phrases are joined with single spaces, tokenized with
    ``tok``, and compared to ``corpus_df`` via ``get_jac_sim``.

    Args:
        data_df: Frame holding the column to score; it is only read.
        col: Name of the column whose values are iterables of strings.
        corpus_df: Reference terms handed to the scorer as-is. Despite the
            name, the callers in this notebook pass a plain list.
        tok: Tokenizer object exposing ``tokenize(str)``.

    Returns:
        A Series of similarity scores aligned with ``data_df``'s index.

    NOTE(review): with a whitespace tokenizer the row tokens are single
    words, so corpus entries that contain spaces presumably can never match —
    confirm against the corpus files.
    """
    def _score(phrases):
        joined = ' '.join(phrases)
        return get_jac_sim(target_tok=corpus_df, tok=tok.tokenize(joined))

    return data_df[col].apply(_score)

In [15]:
# Score every post's keywords against each reference corpus and store the
# result in a "<name>_score" column (crime, ethnicity, neighborhood — in that
# order, matching the original column layout).
corpora = {
    "crime": crime_corpus,
    "ethnicity": ethnicity_corpus,
    "neighborhood": neighborhood_corpus,
}
for corpus_name, corpus in corpora.items():
    # calc_strs only reads the frame, so it is passed directly instead of
    # making a full copy of `keyword_df` for each corpus.
    keyword_df[corpus_name + "_score"] = calc_strs(keyword_df, "keywords", corpus, ws_tok)

# write to file
# NOTE(review): 'cime_related_text.csv' looks like a typo for 'crime_...';
# the name is kept unchanged because other code may read this exact file.
keyword_df.to_csv(cleaned_p / 'cime_related_text.csv', index=False)

In [16]:
# Posts with a non-zero crime similarity score.
keyword_df[keyword_df["crime_score"].gt(0)]

Unnamed: 0,post_id,keywords,post_text,crime_score,ethnicity_score,neighborhood_score
3,x4n2rv,"[zoo uptown, working class, traffic elsewhere,...",todd gloria finalizes plan to change park blvd...,0.003501,0.0,0.000000
4,x4mz7c,"[verbal abuse, sell anything, extreme wind, bu...",ultimate adult tantrum do other business recei...,0.002732,0.0,0.000000
7,x4m8lo,"[twice within, tracking number, stuff inside, ...",avoid suites on paseo like the plague they los...,0.007273,0.0,0.000000
8,x4lqvq,"[tent downtown, someone wants, joke made, hous...",one year anniversary of living in san diego tw...,0.001339,0.0,0.002294
9,x4loi4,"[win better, one ppl, one died, next time, lig...",guy in honda accord tried to run me off the ro...,0.008621,0.0,0.000000
...,...,...,...,...,...,...
31406,schiuk,"[sdg e, like tinder, internal resources, heard...",anyone know how to email human resources of sd...,0.004043,0.0,0.000000
31410,scdswz,"[turning weeks, rohr park, north county, make ...",puppy play hours hi all wavinghand wavinghand ...,0.001300,0.0,0.000000
31411,scdqum,"[wasnt aware, san diego, never wondered, good ...",la jolla donor makes m research t that could ...,0.001366,0.0,0.000000
31412,sca7fv,"[sigma variant, new shot, like omicron, kill l...",new covid variant detected in at least diffe...,0.001355,0.0,0.000000


In [17]:
# Posts with a non-zero neighborhood similarity score.
keyword_df[keyword_df["neighborhood_score"].gt(0)]

Unnamed: 0,post_id,keywords,post_text,crime_score,ethnicity_score,neighborhood_score
8,x4lqvq,"[tent downtown, someone wants, joke made, hous...",one year anniversary of living in san diego tw...,0.001339,0.0,0.002294
25,x4jity,"[stupid shit, short hair, plate number, light ...",homophobic harassment on broadway st in downto...,0.002594,0.0,0.002169
50,x4eq9k,"[vague plan, still variables, rest either, ren...",revelle ge dread diagnosis so i’m aware of the...,0.000000,0.0,0.002092
51,x4epox,"[rest either, renaissance scholar, reddit tryi...",revelle ge dread diagnosis and so i’m aware of...,0.000000,0.0,0.002188
56,x4dar7,"[renaissance scholar, put revelle, proficiency...",revelle ges okay so i’m aware of the general r...,0.000000,0.0,0.002227
...,...,...,...,...,...,...
31335,sflupb,"[yearround amenities, whoops …, well …, way dr...",this city is amazing and bike lanes are good a...,0.001182,0.0,0.003745
31341,saanvs,"[teachers decision, spotify playlist, simple h...",if anyone here needs a friend or someone to ve...,0.004902,0.0,0.001969
31399,scjmi4,"[withered away, wellmanicured lawn, verdant oa...",what the heck happened to gallagher square pic...,0.001209,0.0,0.001938
31408,scg1t7,"[work staying, tourism sub, side bar, send us,...",ill be in town this week for work staying in t...,0.000000,0.0,0.002232


In [18]:
# Posts with a non-zero ethnicity similarity score.
keyword_df[keyword_df["ethnicity_score"].gt(0)]

Unnamed: 0,post_id,keywords,post_text,crime_score,ethnicity_score,neighborhood_score
66,xc4ozk,"[tow hook, super chill, state law, small chanc...",sdpdsheriff attitude toward front license plat...,0.008284,0.006667,0.000000
160,xh4lw4,"[tecate bus, smart problem, sd lincoln, said w...",what are some of the worst high schools in sd ...,0.001287,0.013158,0.000000
224,xmhl90,"[wrong place, white jeep, walking towards, lea...",strangers in uh neighborhood keep asking about...,0.001326,0.018868,0.000000
232,xmfvf1,"[doesnt work, discussion section, chinese char...",chin an sections if anyone is in the am or ...,0.000000,0.045455,0.000000
285,xro6u0,"[tried asking, mahjong club, japanese mahjong,...",anyone play japanese riichi mahjong im a big f...,0.000000,0.033333,0.000000
...,...,...,...,...,...,...
30717,s1zwqz,"[unfortunately need, shit ton, pov someone, ne...",suggestions on good therapists in the area i m...,0.006510,0.014085,0.000000
30864,seorlr,"[white chest, total sweetheart, information pl...",missing dog north park hi looking for a lb b...,0.000000,0.045455,0.000000
31245,sg014r,"[midway drive, lost black, gray chihuahua, wea...",lost older chihuahua off midway drive in point...,0.000000,0.047619,0.002433
31284,sfoilc,"[marukai thanks, half chinese, wanted, surpris...",where to find lunar new year decorations hi al...,0.000000,0.045455,0.000000
