# Acronyms
* pb = peanut butter or Pacific Beach

In [1]:
from pathlib import Path 

import pandas as pd

import nltk
nltk.download('punkt')
from nltk.corpus import stopwords

from rake_nltk import Rake

import py_stringmatching as sm

from cleaner_lib import remove_puncuations, clean_str_col

[nltk_data] Downloading package punkt to /home/matthew/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Set params
min_word_count = 20 # post_text_count must be strictly greater than this (filter uses >); lose roughly 15k posts
max_keywords = 10 # NOTE(review): not referenced by any cell visible in this notebook
max_keyword_length = 2 # passed to Rake as max_length
load_stopwords = True # True: read stop words from stopword_p; False: rebuild them from NLTK plus extras and write the CSV

In [3]:
# paths
# Everything lives under the relative ./data directory.
data_p = Path('./data/')
cleaned_p = data_p.joinpath('cleaned')
cleaned_reddit_p = cleaned_p.joinpath('cleaned_reddit_12-21_to_1115.csv')

# Reference corpora and the persisted stop-word list
pd_crime_p = data_p.joinpath('crime_corpus.csv')
ethnicity_p = data_p.joinpath('ethnicity_corpus.csv')
stopword_p = data_p.joinpath('stopwords.csv')

In [4]:
# Load the cleaned Reddit posts CSV (path defined in the paths cell) and display it
reddit_df = pd.read_csv(cleaned_reddit_p)
reddit_df

Unnamed: 0,subreddit,title,post_id,post_author,post_utc,full_link,post_text,post_text_count
0,sandiego,going to visit san diego next week any places...,x4nzh2,Fearmkultra,2022-09-03 06:57:58+00:00,https://www.reddit.com/r/sandiego/comments/x4n...,going to visit san diego next week any places ...,12
1,sandiego,interesting illusion’s,x4ny4c,Break-these-cuffs,2022-09-03 06:55:24+00:00,https://www.reddit.com/r/sandiego/comments/x4n...,interesting illusion’s,2
2,sandiego,whaley house picture of ghost,x4ntm7,Open_Construction_31,2022-09-03 06:47:09+00:00,https://www.reddit.com/r/sandiego/comments/x4n...,whaley house picture of ghost as a kid i saw t...,199
3,sandiego,language exchange,x4n6xv,Poshorock,2022-09-03 06:07:46+00:00,https://www.reddit.com/r/sandiego/comments/x4n...,language exchange is there someone by there wh...,31
4,SanDiegan,chula vista police stopping cars going east on...,x4n5aj,kaptaincorn,2022-09-03 06:04:54+00:00,https://www.reddit.com/r/SanDiegan/comments/x4...,chula vista police stopping cars going east on...,57
...,...,...,...,...,...,...,...,...
45352,UCSD,new covid variant detected in at least 40 diff...,sca7fv,Yeezy75024,2022-01-25 09:58:30+00:00,https://www.reddit.com/r/UCSD/comments/sca7fv/...,new covid variant detected in at least 40 diff...,93
45353,sandiego,tmz baltimore maggots leaked video twitter sca...,sc9b5t,EdgeIQ,2022-01-25 08:54:03+00:00,https://www.reddit.com/r/sandiego/comments/sc9...,tmz baltimore maggots leaked video twitter sca...,14
45354,UCSD,still doing devious licks,sc974s,UCSuckDick,2022-01-25 08:45:58+00:00,https://www.reddit.com/r/UCSD/comments/sc974s/...,still doing devious licks,4
45355,UCSD,mailing services while school’s online,sc90i4,esppperanza,2022-01-25 08:32:43+00:00,https://www.reddit.com/r/UCSD/comments/sc90i4/...,mailing services while school’s online hey eve...,223


In [5]:
# Filter data df
# Strict comparison: a post whose count equals min_word_count is dropped too.
has_enough_text = reddit_df["post_text_count"] > min_word_count
# .copy() so the column assignments in later cells act on an independent frame
reddit_df = reddit_df.loc[has_enough_text].copy()

In [6]:
# do some last bit of cleaning
# Every run of digits in post_text becomes a single space.
digits_blanked = reddit_df["post_text"].replace(to_replace=r'\d+', value=' ', regex=True)
reddit_df["post_text"] = digits_blanked
# remove_puncuations comes from cleaner_lib; presumably it strips punctuation
# from the named column — confirm against that module.
reddit_df = remove_puncuations(reddit_df, "post_text")
reddit_df

Unnamed: 0,subreddit,title,post_id,post_author,post_utc,full_link,post_text,post_text_count
2,sandiego,whaley house picture of ghost,x4ntm7,Open_Construction_31,2022-09-03 06:47:09+00:00,https://www.reddit.com/r/sandiego/comments/x4n...,whaley house picture of ghost as a kid i saw t...,199
3,sandiego,language exchange,x4n6xv,Poshorock,2022-09-03 06:07:46+00:00,https://www.reddit.com/r/sandiego/comments/x4n...,language exchange is there someone by there wh...,31
4,SanDiegan,chula vista police stopping cars going east on...,x4n5aj,kaptaincorn,2022-09-03 06:04:54+00:00,https://www.reddit.com/r/SanDiegan/comments/x4...,chula vista police stopping cars going east on...,57
5,SanDiegan,todd gloria finalizes plan to change park blvd...,x4n2rv,Lemonade_IceCold,2022-09-03 06:00:38+00:00,https://www.reddit.com/r/SanDiegan/comments/x4...,todd gloria finalizes plan to change park blvd...,666
6,sandiego,ultimate adult tantrum,x4mz7c,oshunsorchard,2022-09-03 05:54:45+00:00,https://www.reddit.com/r/sandiego/comments/x4m...,ultimate adult tantrum do other business recei...,72
...,...,...,...,...,...,...,...,...
45350,SanDiegan,puppy play hours,scdswz,Emmyj123,2022-01-25 13:31:14+00:00,https://www.reddit.com/r/SanDiegan/comments/sc...,puppy play hours hi all wavinghand wavinghand ...,242
45351,UCSD,la jolla donor makes 50m research t that could...,scdqum,Yeezy75024,2022-01-25 13:28:21+00:00,https://www.reddit.com/r/UCSD/comments/scdqum/...,la jolla donor makes m research t that could ...,74
45352,UCSD,new covid variant detected in at least 40 diff...,sca7fv,Yeezy75024,2022-01-25 09:58:30+00:00,https://www.reddit.com/r/UCSD/comments/sca7fv/...,new covid variant detected in at least diffe...,93
45355,UCSD,mailing services while school’s online,sc90i4,esppperanza,2022-01-25 08:32:43+00:00,https://www.reddit.com/r/UCSD/comments/sc90i4/...,mailing services while school’s online hey eve...,223


In [7]:
# add stop words
if load_stopwords:
    # Reuse the list persisted by an earlier run of the else-branch below.
    stopwords_df = pd.read_csv(stopword_p)
    stop_words = stopwords_df["stopwords"].tolist()
else:
    # The NLTK stop-word corpus is a separate download from 'punkt' (the only
    # resource fetched in the import cell); without it stopwords.words()
    # raises LookupError on a machine that has never downloaded it.
    nltk.download('stopwords')
    stop_words = stopwords.words('english')
    # Extra slang / filler tokens to ignore on top of the NLTK list
    add_words = ["haha", "ha", "was", "were", "lol", "lmao", "rofl", "bro", "sheesh", "wanna", "would", "im", "ive",
                 "...", "want", "went", "go", "get", "got", "going", "much", "rl", "irl", "iirc", "gonna", "yall",
                 "dont", "fyi", "hmu", "b", "nd", "k", "kk", "youre", "dm", "youll", "ngl", "tbh", "ez", "...", "ty", "l",
                 "th", "lmfao", "uwu", "noob", "etc", "af"

                ]
    # dict.fromkeys drops repeated entries (e.g. "..." is listed twice above)
    # while keeping first-seen order, so the saved CSV has one row per word.
    stop_words = list(dict.fromkeys(stop_words + add_words))
    # Write to the same path the load branch reads from.
    pd.Series(stop_words, name="stopwords").to_csv(stopword_p, index=False)
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [8]:
# Build the RAKE extractor once: its configuration is the same for every post,
# so constructing it inside the loop only repeated identical setup work.
rake = Rake(max_length=max_keyword_length, include_repeated_phrases=False,
            stopwords=stop_words,
            punctuations=['&#', ';', '-', '.', ',', '&', "'", '"', '."', '(', ')', '/', '’', ').'])

# One [post_id, ranked phrases, text] record per post, in reddit_df row order
keyword_res = []
# Iterate the two needed columns directly instead of materialising a Series
# per row with iterrows().
for post_id, text in zip(reddit_df['post_id'], reddit_df['post_text']):
    rake.extract_keywords_from_text(text)
    keyword_res.append([post_id, rake.get_ranked_phrases(), text])

In [9]:
# One row per post: its id, the ranked keyword phrases, and the cleaned text
keyword_df = pd.DataFrame(keyword_res, columns=["post_id", "keywords", "post_text"])
# Reuse the shared cleaned-data directory rather than re-spelling the path.
# NOTE: `keywords` holds Python lists; to_csv stores their string form, so a
# later read_csv returns strings, not lists.
keyword_df.to_csv(cleaned_p / 'keyword_extraction.csv', index=False)
keyword_df

Unnamed: 0,post_id,keywords,post_text
0,x4ntm7,"[suddenly appeared, something hard, smoke weed...",whaley house picture of ghost as a kid i saw t...
1,x4n6xv,"[language exchange, practice spanish, practice...",language exchange is there someone by there wh...
2,x4n5aj,"[grand ave, seen, pb, holidays, end, east]",chula vista police stopping cars going east on...
3,x4n2rv,"[zoo uptown, working class, traffic elsewhere,...",todd gloria finalizes plan to change park blvd...
4,x4mz7c,"[verbal abuse, sell anything, extreme wind, bu...",ultimate adult tantrum do other business recei...
...,...,...,...
31410,scdswz,"[turning weeks, rohr park, north county, make ...",puppy play hours hi all wavinghand wavinghand ...
31411,scdqum,"[wasnt aware, san diego, never wondered, good ...",la jolla donor makes m research t that could ...
31412,sca7fv,"[sigma variant, new shot, like omicron, kill l...",new covid variant detected in at least diffe...
31413,sc90i4,"[thing thankfully, theyre forwarding, pretty i...",mailing services while school’s online hey eve...


# Similarity

In [10]:
# create whitespace tokenizer
# return_set=True: presumably yields a set of unique tokens rather than a bag
# with repeats — confirm against the py_stringmatching docs.
ws_tok = sm.WhitespaceTokenizer(return_set=True)

In [11]:
# load in files to build a corpus
# Each corpus is the de-duplicated values of one CSV column; going through
# set() means the resulting list order is not guaranteed.
crime_corpus_df = pd.read_csv(pd_crime_p)
crime_corpus = list(set(crime_corpus_df["crime"].tolist()))

ethnicity_corpus_df = pd.read_csv(ethnicity_p)
ethnicity_corpus = list(set(ethnicity_corpus_df["ethnicity"].tolist()))

In [12]:
def get_jac_sim(*, target_tok, tok):
    """Return the Jaccard similarity score of two token collections.

    Both arguments are keyword-only and are handed unchanged to
    ``sm.Jaccard().get_sim_score``.

    Args:
        target_tok: reference tokens (the corpus side of the comparison).
        tok: tokens to score against ``target_tok``.

    Returns:
        The score produced by ``get_sim_score``.
    """
    return sm.Jaccard().get_sim_score(target_tok, tok)

def calc_strs(data_df, col, corpus_df, tok):
    """Score every entry of ``data_df[col]`` against a corpus with Jaccard similarity.

    Each cell is expected to be an iterable of strings: it is joined with
    single spaces, tokenized with ``tok`` and compared to ``corpus_df`` through
    ``get_jac_sim``.

    Args:
        data_df: frame holding the column to score; it is only read.
        col: name of the column containing the keyword phrases.
        corpus_df: reference token collection. Despite the name, the caller in
            this notebook passes a plain list of corpus terms.
        tok: tokenizer object exposing ``tokenize(str)``.

    Returns:
        The result of ``data_df[col].apply(...)``: one score per row, on the
        same index as ``data_df``.
    """
    def _score(phrases):
        joined = ' '.join(phrases)
        return get_jac_sim(target_tok=corpus_df, tok=tok.tokenize(joined))

    return data_df[col].apply(_score)

In [13]:
# calc_strs only reads the frame, so keyword_df is passed directly instead of
# duplicating it with .copy() for each score.
# Each score compares a post's keyword tokens with the WHOLE corpus list, so
# the corpus size enters the Jaccard denominator and values stay small.

# crime scoring
keyword_df["crime_score"] = calc_strs(keyword_df, "keywords", crime_corpus, ws_tok)

# ethnicity scoring
keyword_df["ethnicity_score"] = calc_strs(keyword_df, "keywords", ethnicity_corpus, ws_tok)

# neighborhood scoring
# TODO

# write to file
# NOTE(review): 'cime' looks like a typo for 'crime'; the name is left as-is
# because other code may already read this file — rename both sides together.
keyword_df.to_csv(cleaned_p / 'cime_related_text.csv', index=False)

In [14]:
# Posts whose keywords share at least one token with the crime corpus
keyword_df[keyword_df["crime_score"].gt(0)]

Unnamed: 0,post_id,keywords,post_text,crime_score,ethnicity_score
9,x4loi4,"[win better, one ppl, one died, next time, lig...",guy in honda accord tried to run me off the ro...,0.003717,0.0
28,x4j1gi,"[stolen surfboards, personal message, knows su...",stolen surfboards in clairemont so one of my r...,0.005181,0.0
41,x4h1km,"[students stay, ea ea, charge unit, charge, we...",discount for ea ea is saying on their website ...,0.005780,0.0
42,x4grq2,"[stolen august, ring videos, mission ave, minu...",anyone have their catalytic converter stolen r...,0.005495,0.0
63,x4c6jj,"[wednesday august, via reddit, nordstrom rack,...",stolen vehicle toyota tacoma trd off road l...,0.005435,0.0
...,...,...,...,...,...
31073,ruf3ym,"[works despite, usually individuals, united st...",i got offered a scab job gta masters in engine...,0.003521,0.0
31125,saev0y,"[upstairs apartment, super recognizable, prett...",stolen bikes where do you think stolen bikes t...,0.004566,0.0
31221,sdfitb,"[toyota corollas, midway district, emblems mis...",stolen toyota grill emblem point loma someone ...,0.005650,0.0
31302,s1mj8k,"[many stickers, anyone reading, stolen, saw, m...",heavilystickered bike stolen hi if anyone read...,0.005814,0.0
