# Police nodes


# reddit nodes
## posts
* node id: post id
* attributes
    * url
    * post title
    * keywords (may be empty)
    * (future ideas: tot comments, upvotes, has media, etc) 

## users
* node id: author
* attributes
    * has_posted?
    * has commented?

# Crime corpus nodes
* node: crime type

# Neighborhood corpus nodes
* node: neighborhood location

# Relationships

## Reddit
* start_id = user id
* end_id = post id
* type: comment, post

## Crime (BELONGS_TO)
* start_id = crime post/call
* end_id = crime node
* source type = reddit, nextdoor, police calls
* time type = time bin
* neighborhood type?

## Crime (HAPPENED_AT)
* start_id = crime post/call
* end_id = neighborhood node

In [1]:
# import libraries
from pathlib import Path

import pandas as pd

In [2]:
# set paths
data_p             = Path("../data")

corpi_p            = data_p / "corpi"
neighborhood_p     = corpi_p / "neighborhood_corpus.csv"
crime_p            = corpi_p / "crime_corpus.csv"

reddit_processed_p = data_p / "processed_reddit_data"

# create out path
out_p = data_p / "neo4j_files"
out_p.mkdir(exist_ok=True)

node_p = out_p / "nodes"
node_p.mkdir(exist_ok=True)

relations_p = out_p / "relationships"
relations_p.mkdir(exist_ok=True)

## Make Corpi Nodes

In [3]:
# read in neighborhood corpus and write to node file
neighborhood_df = pd.read_csv(neighborhood_p)

# prepare corpus csv
neighborhood_df[":ID"] = neighborhood_df.index + 1
neighborhood_df[":LABEL"] = "neighborhood"

# rearrange columns
neighborhood_df = neighborhood_df[[":ID", "neighborhood", ":LABEL"]]

# Write out node csv
neighborhood_out_p = node_p / "nodes_neighborhood.csv"
neighborhood_df.to_csv(neighborhood_out_p, index=False)

neighborhood_df.head()

Unnamed: 0,:ID,neighborhood,:LABEL
0,1,clairemont mesa east,neighborhood
1,2,clairemont mesa west,neighborhood
2,3,bay ho,neighborhood
3,4,north clairemont,neighborhood
4,5,university city,neighborhood


In [None]:
# read in neighborhood corpus and write to node file
neighborhood_df = pd.read_csv(neighborhood_p)

# prepare corpus csv
neighborhood_df[":ID"] = neighborhood_df.index + 1
neighborhood_df[":LABEL"] = "neighborhood"

# rearrange columns
neighborhood_df = neighborhood_df[[":ID", "neighborhood", ":LABEL"]]

# Write out node csv
neighborhood_out_p = node_p / "nodes_neighborhood.csv"
neighborhood_df.to_csv(neighborhood_out_p, index=False)

neighborhood_df.head()

## Reddit: Merge NER and Rake results

In [2]:

ner_p = data_p / "cleaned_reddit_ner_12-21_to_1115.csv"
keywords_p = data_p / "keyword_extraction.csv"

In [3]:
ner_df = pd.read_csv(ner_p)
print(f"Total observations: {ner_df.shape[0]}")

# drop unamed index
ner_df.drop(columns=['Unnamed: 0'], inplace=True)

ner_df.head()

Total observations: 43421


Unnamed: 0,subreddit,title,post_id,post_author,post_utc,full_link,post_text,post_text_count,ORG,DATE,EVENT,FAC,GPE,LANGUAGE,LAW,LOC,NORP,PERSON,TIME
0,sandiego,going to visit san diego next week any places...,x4nzh2,Fearmkultra,2022-09-03 06:57:58+00:00,https://www.reddit.com/r/sandiego/comments/x4n...,going to visit san diego next week any places ...,12,['san diego'],['next week'],,,,,,,,,
1,sandiego,whaley house picture of ghost,x4ntm7,Open_Construction_31,2022-09-03 06:47:09+00:00,https://www.reddit.com/r/sandiego/comments/x4n...,whaley house picture of ghost as a kid i saw t...,199,"['whaley house', 'the whaley house']","['13', '25 yrs ago']",,,['san diegans'],,,,,,"['a minute later', 'late nightearly morning']"
2,sandiego,language exchange,x4n6xv,Poshorock,2022-09-03 06:07:46+00:00,https://www.reddit.com/r/sandiego/comments/x4n...,language exchange is there someone by there wh...,31,,,,,,['english'],,,['spanish'],['san diego'],
3,SanDiegan,chula vista police stopping cars going east on...,x4n5aj,kaptaincorn,2022-09-03 06:04:54+00:00,https://www.reddit.com/r/SanDiegan/comments/x4...,chula vista police stopping cars going east on...,57,,,,,['chula vista'],,,,,,
4,SanDiegan,todd gloria finalizes plan to change park blvd...,x4n2rv,Lemonade_IceCold,2022-09-03 06:00:38+00:00,https://www.reddit.com/r/SanDiegan/comments/x4...,todd gloria finalizes plan to change park blvd...,666,['gtonly'],,,['balboa park'],,,,,['north american'],"['todd gloria', 'kevin']",


In [4]:
ner_df.columns

Index(['subreddit', 'title', 'post_id', 'post_author', 'post_utc', 'full_link',
       'post_text', 'post_text_count', 'ORG', 'DATE', 'EVENT', 'FAC', 'GPE',
       'LANGUAGE', 'LAW', 'LOC', 'NORP', 'PERSON', 'TIME'],
      dtype='object')

In [5]:
keywords_df = pd.read_csv(keywords_p)
print(f"Total observations: {keywords_df.shape[0]}")
keywords_df.drop(columns=['post_text'], inplace=True)
keywords_df.head()

Total observations: 31415


Unnamed: 0,post_id,keywords
0,x4ntm7,"['suddenly appeared', 'something hard', 'smoke..."
1,x4n6xv,"['language exchange', 'practice spanish', 'pra..."
2,x4n5aj,"['grand ave', 'seen', 'pb', 'holidays', 'end',..."
3,x4n2rv,"['zoo uptown', 'working class', 'traffic elsew..."
4,x4mz7c,"['verbal abuse', 'sell anything', 'extreme win..."


In [6]:
ner_df.merge(keywords_df)

Unnamed: 0,subreddit,title,post_id,post_author,post_utc,full_link,post_text,post_text_count,ORG,DATE,EVENT,FAC,GPE,LANGUAGE,LAW,LOC,NORP,PERSON,TIME,keywords
0,sandiego,whaley house picture of ghost,x4ntm7,Open_Construction_31,2022-09-03 06:47:09+00:00,https://www.reddit.com/r/sandiego/comments/x4n...,whaley house picture of ghost as a kid i saw t...,199,"['whaley house', 'the whaley house']","['13', '25 yrs ago']",,,['san diegans'],,,,,,"['a minute later', 'late nightearly morning']","['suddenly appeared', 'something hard', 'smoke..."
1,sandiego,language exchange,x4n6xv,Poshorock,2022-09-03 06:07:46+00:00,https://www.reddit.com/r/sandiego/comments/x4n...,language exchange is there someone by there wh...,31,,,,,,['english'],,,['spanish'],['san diego'],,"['language exchange', 'practice spanish', 'pra..."
2,SanDiegan,chula vista police stopping cars going east on...,x4n5aj,kaptaincorn,2022-09-03 06:04:54+00:00,https://www.reddit.com/r/SanDiegan/comments/x4...,chula vista police stopping cars going east on...,57,,,,,['chula vista'],,,,,,,"['grand ave', 'seen', 'pb', 'holidays', 'end',..."
3,SanDiegan,todd gloria finalizes plan to change park blvd...,x4n2rv,Lemonade_IceCold,2022-09-03 06:00:38+00:00,https://www.reddit.com/r/SanDiegan/comments/x4...,todd gloria finalizes plan to change park blvd...,666,['gtonly'],,,['balboa park'],,,,,['north american'],"['todd gloria', 'kevin']",,"['zoo uptown', 'working class', 'traffic elsew..."
4,sandiego,ultimate adult tantrum,x4mz7c,oshunsorchard,2022-09-03 05:54:45+00:00,https://www.reddit.com/r/sandiego/comments/x4m...,ultimate adult tantrum do other business recei...,72,,,,,,,,,,,,"['verbal abuse', 'sell anything', 'extreme win..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31624,SanDiegan,puppy play hours,scdswz,Emmyj123,2022-01-25 13:31:14+00:00,https://www.reddit.com/r/SanDiegan/comments/sc...,puppy play hours hi all wavinghand wavinghand ...,242,,"['11 weeks', 'about 67 months']",,,['north county'],,,['rohr park'],['german'],['kamp kanine'],"['hours', 'hours']","['turning weeks', 'rohr park', 'north county',..."
31625,UCSD,la jolla donor makes 50m research t that could...,scdqum,Yeezy75024,2022-01-25 13:28:21+00:00,https://www.reddit.com/r/UCSD/comments/scdqum/...,la jolla donor makes 50m research t that could...,74,"['usc the san diego uniontribune i', 'usc']",,,,"['la jolla', 'san diego lmao']",,,,,"['usc', 'usc']",,"['wasnt aware', 'san diego', 'never wondered',..."
31626,UCSD,new covid variant detected in at least 40 diff...,sca7fv,Yeezy75024,2022-01-25 09:58:30+00:00,https://www.reddit.com/r/UCSD/comments/sca7fv/...,new covid variant detected in at least 40 diff...,93,,['every year'],,,,,,,,['wpec idk'],,"['sigma variant', 'new shot', 'like omicron', ..."
31627,UCSD,mailing services while school’s online,sc90i4,esppperanza,2022-01-25 08:32:43+00:00,https://www.reddit.com/r/UCSD/comments/sc90i4/...,mailing services while school’s online hey eve...,223,['clownface'],"['a couple weeks ago', 'the quarter', 'last we...",,,['hahaha'],,,,,,,"['thing thankfully', 'theyre forwarding', 'pre..."
