# This notebook is for showing intermediate results of data processing pipelines on Reddit user behavior data
### Note that the intermediate processed data are not saved in this notebook. In order to run notebook 2.1 for model training, please run the kedro pipeline first.  

### Download and check data 

In [4]:
import json
import requests
import ujson 
import pandas as pd
import os 
from pathlib import Path

In [19]:
URL = "https://files.pushshift.io/reddit/comments/RC_2008-05.zst"
reddit_raw_data_file_path = '../../data/01_raw/user_behavior/RC_2008-05.zst'

In [22]:
reddit_raw_data_file_path_abs = Path(reddit_raw_data_file_path).resolve()

In [24]:
reddit_raw_data_file_path, reddit_raw_data_file_path_abs

('../../data/01_raw/user_behavior/RC_2008-05.zst',
 PosixPath('/home/ec2-user/SageMaker/anomaly-detection-spatial-temporal-data/data/01_raw/user_behavior/RC_2008-05.zst'))

In [29]:
# Directory that will hold the raw download. Path.parent replaces the manual
# split/join on '/' and yields the same relative PosixPath.
reddit_raw_data_file_dir = Path(reddit_raw_data_file_path).parent
reddit_raw_data_file_dir

PosixPath('../../data/01_raw/user_behavior')

In [30]:
reddit_raw_data_file_dir.mkdir(parents=True, exist_ok=True)

In [17]:
# Download the compressed comment dump. Fail loudly on an HTTP error instead of
# silently writing an error page to disk, and close the file deterministically.
response = requests.get(URL, timeout=60)
response.raise_for_status()
with open(reddit_raw_data_file_path, "wb") as raw_file:
    bytes_written = raw_file.write(response.content)
# Display the number of bytes written, as the original cell did.
bytes_written

47173996

In [31]:
!unzstd ../../data/01_raw/user_behavior/RC_2008-05.zst --memory=2048MB

../../data/01_raw/user_behavior/RC_2008-05.zst: 310404232 bytes                


### read in data 

In [33]:
reddit_raw_data_file_path

'../../data/01_raw/user_behavior/RC_2008-05.zst'

In [34]:
# str.rstrip(".zst") strips any trailing '.', 'z', 's' or 't' characters, not the
# suffix; it only worked here because the file stem ends in '5'. with_suffix("")
# removes exactly the final extension.
reddit_decompressed_file_path = Path(reddit_raw_data_file_path).with_suffix("")

# The file holds one JSON object per line; the context manager closes the handle
# once every line has been parsed into the frame.
with open(reddit_decompressed_file_path, encoding="utf8") as raw_file:
    records = map(json.loads, raw_file)
    df = pd.DataFrame.from_records(records)

In [35]:
df.shape

(536380, 21)

In [37]:
df.head(10).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
link_id,t3_6hoxb,t3_6holm,t3_6hl0a,t3_6hq4l,t3_6hoyd,t3_6hoyd,t3_6hpzs,t3_6hnvn,t3_6hnim,t3_6hq4k
author_flair_css_class,,,,,,,,,,
retrieved_on,1425846806,1425846806,1425846806,1425846806,1425846806,1425846806,1425846806,1425846806,1425846806,1425846806
controversiality,0,0,0,0,0,0,0,0,0,0
archived,True,True,True,True,True,True,True,True,True,True
name,t1_c03vgla,t1_c03vgli,t1_c03vglj,t1_c03vglk,t1_c03vgll,t1_c03vglm,t1_c03vgln,t1_c03vglo,t1_c03vglp,t1_c03vglq
edited,False,False,False,False,False,False,False,False,False,False
subreddit,reddit.com,pics,pics,reddit.com,worldnews,worldnews,programming,business,politics,pics
score,1,1,1,0,0,1,2,2,1,1
created_utc,1209600017,1209600008,1209600078,1209600015,1209600034,1209600034,1209600050,1209600052,1209600134,1209600068


In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 536380 entries, 0 to 536379
Data columns (total 21 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   link_id                 536380 non-null  object
 1   author_flair_css_class  780 non-null     object
 2   retrieved_on            536380 non-null  int64 
 3   controversiality        536380 non-null  int64 
 4   archived                536380 non-null  bool  
 5   name                    536380 non-null  object
 6   edited                  536380 non-null  object
 7   subreddit               536380 non-null  object
 8   score                   536380 non-null  int64 
 9   created_utc             536380 non-null  object
 10  score_hidden            536380 non-null  bool  
 11  distinguished           0 non-null       object
 12  id                      536380 non-null  object
 13  author_flair_text       696 non-null     object
 14  gilded                  536380 non-n

## Observation about the data:
1. There are 536380 rows and 21 columns, where each row is a unique comment with 21 attributes/columns related to that comment
2. Most important attributes include author, sub-reddit, body and score. Body is the comment thread content, and score is the total votes received on Reddit (1 for one upvote and -1 for downvote). Each record represents one author posts something (body) related to the sub-reddit topic. 
3. Each unique author can have multiple comments across more than one subreddit with varying scores for each comment


## Data processing steps to get input for ELAND model. Steps include:
1. Drop records whose absolute score is less than 10
2. Drop users who have fewer than 10 remaining comments
3. Drop users that are [deleted]

### We don't have ground truth labels for training the model. To generate the user labels needed for the next step, we use a rule that groups users into either benign or anomalous users based on the scores of their comments.
   - Anomalous user: An author who has at least 10 comments (after the score filter above) and has at least one comment with a score less than or equal to -10
   - Benign user: An author who has at least 10 comments (after the score filter above) and whose every remaining score is greater than or equal to 10

In [39]:
# Discard every record whose absolute score is below 10
low_score_mask = df['score'].abs() < 10
df_score = df.loc[~low_score_mask]

In [52]:
df.shape, df_score.shape  #a lot of comments with less than score of 10

((536380, 21), (43343, 21))

In [40]:
#check lowest score and highest score
df_score.score.min(), df_score.score.max()

(-284, 1522)

In [41]:
df_score['author'].value_counts()

[deleted]           10453
nixonrichard          162
Poromenos             119
otakucode             115
UntakenUsername        96
                    ...  
Lucretius               1
redditto                1
Franks2000inchTV        1
sbchapm                 1
a_caspis                1
Name: author, Length: 8088, dtype: int64

In [84]:
df_score['subreddit'].value_counts()

reddit.com     11097
pics            5383
politics        4718
programming     4422
funny           3656
               ...  
psychology         1
guns               1
joel               1
lgbt               1
Anarchism          1
Name: subreddit, Length: 61, dtype: int64

In [42]:
# Discard authors that have fewer than 10 remaining comments
counts = df_score['author'].value_counts()
infrequent_authors = counts.index[(counts < 10).values]
res = df_score.loc[~df_score['author'].isin(infrequent_authors)]

In [44]:
# Discard the placeholder author '[deleted]'
is_deleted = res['author'] == '[deleted]'
res = res.loc[~is_deleted]

In [45]:
res['author'].value_counts()

nixonrichard       162
Poromenos          119
otakucode          115
UntakenUsername     96
7oby                85
                  ... 
Osmanthus           10
mipadi              10
myotheralt          10
Jivlain             10
Xiphorian           10
Name: author, Length: 787, dtype: int64

In [46]:
#Number of unique users
len(res.author.unique())

787

## Create user labels

In [71]:
benign = pd.DataFrame()
anomaly = pd.DataFrame()

In [72]:
benign.shape, anomaly.shape

((0, 0), (0, 0))

In [73]:
# Start the benign candidates from a copy of the filtered records.
# DataFrame.append is deprecated (removed in pandas 2.0), and appending onto the
# existing frame duplicated every row whenever this cell was re-run.
benign = res.copy()

In [74]:
benign.shape

(15529, 21)

In [75]:
# discard every record scoring below 10
scores_below_ten = benign['score'] < 10
benign = benign.loc[~scores_below_ten]

In [76]:
benign.shape

(14643, 21)

In [78]:
#check one example of benign author
benign.loc[benign['author'] == 'jonknee'].T

Unnamed: 0,230,14927,54120,113751,183996,238600,238693,299957,338384,338699,339453,353770,411425,412147,426088,426377,428278,431114,515403
link_id,t3_6hpa9,t3_6hta1,t3_6i3qt,t3_6iisd,t3_6j20f,t3_6jg07,t3_6jfve,t3_6jtyv,t3_6k4gz,t3_6k4gz,t3_6k4gz,t3_6k7u9,t3_6kn7n,t3_6kn7n,t3_6kq30,t3_6kqr3,t3_6kqr3,t3_6kr7s,t3_6lf1g
author_flair_css_class,,,,,,,,,,,,,,,,,,,
retrieved_on,1425846807,1425847139,1425847701,1425848538,1425849347,1425850043,1425850044,1425850814,1425851565,1425851567,1425851577,1425851835,1425852667,1425852676,1425852996,1425852999,1425853018,1425853102,1425854332
controversiality,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
archived,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
name,t1_c03vgrw,t1_c03vs4n,t1_c03wmef,t1_c03xwhz,t1_c03zeta,t1_c040l0r,t1_c040l3c,t1_c041whg,t1_c042q73,t1_c042qfu,t1_c042r0s,t1_c04323e,t1_c044any,t1_c044b80,t1_c044lzy,t1_c044m7z,t1_c044nos,t1_c044pvn,t1_c046izm
edited,False,True,False,False,False,False,True,False,True,False,False,False,True,False,False,False,False,True,True
subreddit,programming,reddit.com,reddit.com,reddit.com,entertainment,programming,programming,programming,programming,programming,programming,reddit.com,programming,programming,politics,reddit.com,reddit.com,reddit.com,reddit.com
score,23,11,10,10,15,15,29,11,35,28,11,25,12,17,13,10,12,48,47
created_utc,1209601191,1209672909,1209917033,1210200009,1210599323,1210820045,1210820440,1211125762,1211310090,1211310945,1211312809,1211384268,1211657168,1211660609,1211748243,1211749538,1211758839,1211773274,1212165734


In [79]:
##anomalous author
# Start the anomalous candidates from a copy of the filtered records.
# DataFrame.append is deprecated (removed in pandas 2.0), and appending onto the
# existing frame duplicated every row whenever this cell was re-run.
anomaly = res.copy()

In [80]:
# discard every record scoring above -10
scores_above_minus_ten = anomaly['score'] > -10
anomaly = anomaly.loc[~scores_above_minus_ten]

In [87]:
#Example author
anomaly.loc[anomaly['author'] == 'I_AM_A_NEOCON'].T

Unnamed: 0,1982,1983,1984,33267,33271,33276,33277,33278,33279,33280,...,369853,413071,413184,413376,423537,439728,439767,439768,474500,482592
link_id,t3_6hqjc,t3_6hqjc,t3_6hqjc,t3_6hymo,t3_6hymo,t3_6hymo,t3_6hymo,t3_6hymo,t3_6hymo,t3_6hymo,...,t3_6kcwp,t3_6knyr,t3_6ko0m,t3_6ko0m,t3_6kq2z,t3_6ku8s,t3_6ktds,t3_6ktds,t3_6l46t,t3_6l6f3
author_flair_css_class,,,,,,,,,,,...,,,,,,,,,,
retrieved_on,1425846829,1425846829,1425846829,1425847375,1425847375,1425847375,1425847375,1425847375,1425847375,1425847375,...,1425852109,1425852688,1425852688,1425852694,1425852881,1425853209,1425853209,1425853209,1425853715,1425853847
controversiality,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
archived,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
name,t1_c03vi4l,t1_c03vi4m,t1_c03vi4n,t1_c03w6ai,t1_c03w6am,t1_c03w6ar,t1_c03w6as,t1_c03w6at,t1_c03w6au,t1_c03w6av,...,t1_c043eix,t1_c044bxr,t1_c044c0x,t1_c044c69,t1_c044k13,t1_c044wjh,t1_c044wkk,t1_c044wkl,t1_c045nef,t1_c045tnd
edited,False,False,False,False,False,False,False,False,False,False,...,False,True,True,False,False,False,True,True,False,False
subreddit,reddit.com,reddit.com,reddit.com,reddit.com,reddit.com,reddit.com,reddit.com,reddit.com,reddit.com,reddit.com,...,worldnews,reddit.com,reddit.com,reddit.com,business,entertainment,pics,pics,politics,politics
score,-17,-15,-10,-12,-13,-12,-14,-11,-12,-12,...,-14,-17,-13,-14,-10,-18,-29,-22,-14,-16
created_utc,1209609935,1209609935,1209609935,1209760815,1209760822,1209760829,1209760829,1209760829,1209760829,1209760829,...,1211452473,1211665998,1211666598,1211667641,1211735668,1211828667,1211828825,1211828826,1211993319,1212023750


In [88]:
#same author can have high score comments and low score comments at the same time 
benign.loc[benign['author'] == 'I_AM_A_NEOCON'].T

Unnamed: 0,47535,47563,90755,172604,286327,302936,304864,305026,343045,409427,426252,434929,503026,503240
link_id,t3_6i2he,t3_6i2he,t3_6idwb,t3_6iz5d,t3_6jrnz,t3_6juwd,t3_6jvfu,t3_6jvf9,t3_6k5jv,t3_6kn92,t3_6kqr3,t3_6krxf,t3_6lbr7,t3_6lbr7
author_flair_css_class,,,,,,,,,,,,,,
retrieved_on,1425847559,1425847559,1425848153,1425849227,1425850649,1425850850,1425850877,1425850878,1425851668,1425852647,1425852997,1425853150,1425854086,1425854089
controversiality,0,0,0,0,0,0,0,0,0,0,0,0,0,0
archived,True,True,True,True,True,True,True,True,True,True,True,True,True,True
name,t1_c03whb7,t1_c03whbz,t1_c03xeqe,t1_c03z5zx,t1_c041lxd,t1_c041ysb,t1_c04209x,t1_c0420ef,t1_c042tsm,t1_c04494e,t1_c044m4i,t1_c044su1,t1_c0469fe,t1_c0469lc
edited,False,False,True,True,True,True,False,False,True,True,False,False,False,False
subreddit,pics,pics,pics,politics,pics,pics,pics,reddit.com,pics,business,reddit.com,reddit.com,reddit.com,reddit.com
score,13,85,10,30,20,11,14,14,12,12,10,17,196,74
created_utc,1209860326,1209860526,1210104280,1210532662,1211038785,1211140244,1211151122,1211152191,1211325658,1211649492,1211749017,1211802335,1212107460,1212108242


In [92]:
anomaly_author_names = anomaly.author.unique()
benign_author_names = benign.author.unique()

In [94]:
len(anomaly_author_names), len(benign_author_names)

(327, 787)

In [95]:
def common_member(a, b):
    """Return the set of elements that appear in both ``a`` and ``b``.

    Parameters
    ----------
    a, b : iterable of hashable
        Collections to compare; each one is converted to a ``set``.

    Returns
    -------
    set
        The intersection of ``a`` and ``b``. When nothing is shared, a message
        is printed and an empty set is returned (previously ``None``), so the
        result can always be passed to ``len()`` or ``Series.isin``.
    """
    shared = set(a) & set(b)
    if not shared:
        print("No common elements")
    return shared

In [99]:
overlap_authors = common_member(benign_author_names, anomaly_author_names)
len(overlap_authors)

327

In [102]:
benign.shape

(14643, 21)

In [104]:
benign = benign[~benign['author'].isin(overlap_authors)]
benign_author_names = benign.author.unique()
print("Number of benign users: ", len(benign.author.unique()))
print("Number of anomalous users: ", len(anomaly.author.unique()))

Number of benign users:  460
Number of anomalous users:  327


#### Each author is labelled as either 'benign' or 'anomaly', and the labels are saved in a csv file named user_labels

In [105]:
user_label_filepath = '../../data/02_intermediate/user_behavior/user_labels.csv'

In [109]:
# One row per author; label 0 marks a benign user and label 1 an anomalous user
benign_user_label = pd.DataFrame({'author': benign_author_names, 'label': 0})
anomalous_user_label = pd.DataFrame({'author': anomaly_author_names, 'label': 1})

In [110]:
benign_user_label.shape, anomalous_user_label.shape

((460, 2), (327, 2))

In [113]:
benign_user_label.head(2)

Unnamed: 0,author,label
0,ultimatt42,0
1,jonknee,0


In [114]:
anomalous_user_label.head(2)

Unnamed: 0,author,label
0,I_AM_A_NEOCON,1
1,moogle516,1


In [115]:
user_label = pd.concat([benign_user_label, anomalous_user_label])

In [118]:
user_label.shape

(787, 2)

In [150]:
len(user_label)

787

## Create other input feature 
#### Each subreddit topic is given an index and saved as a pickle file. We will be naming the file p2index.pkl
#### Each author is also given an index and saved as a pickle file. We will be naming the file u2index.pkl

In [122]:
benign_prod_names = benign.subreddit.unique()
benign_prod_names = benign_prod_names.tolist()

In [123]:
anomaly_prod_names = anomaly.subreddit.unique()
anomaly_prod_names = anomaly_prod_names.tolist()

In [198]:
# Union of the subreddits seen in either group, de-duplicated and sorted
unique_prod_names = set(benign_prod_names) | set(anomaly_prod_names)
total_prod_names = sorted(unique_prod_names)
print(total_prod_names)

['AskReddit', 'Drugs', 'Economics', 'Music', 'WTF', 'apple', 'area51', 'atheism', 'bestof', 'business', 'canada', 'cogsci', 'comics', 'entertainment', 'environment', 'funny', 'gadgets', 'gaming', 'geek', 'happy', 'lgbt', 'linux', 'lolcats', 'math', 'netsec', 'nsfw', 'obama', 'offbeat', 'philosophy', 'photography', 'pics', 'politics', 'programming', 'psychology', 'reddit.com', 'science', 'scifi', 'self', 'sex', 'software', 'sports', 'technology', 'videos', 'web_design', 'worldnews', 'xkcd', 'yourweek']


In [125]:
# Map each subreddit name to its position in total_prod_names (0-based).
# enumerate replaces the hand-maintained counter.
p2index = {subreddit: index for index, subreddit in enumerate(total_prod_names)}

In [126]:
p2index

{'yourweek': 0,
 'technology': 1,
 'software': 2,
 'atheism': 3,
 'xkcd': 4,
 'philosophy': 5,
 'psychology': 6,
 'Drugs': 7,
 'nsfw': 8,
 'WTF': 9,
 'lgbt': 10,
 'web_design': 11,
 'pics': 12,
 'scifi': 13,
 'gaming': 14,
 'science': 15,
 'business': 16,
 'sports': 17,
 'sex': 18,
 'math': 19,
 'apple': 20,
 'area51': 21,
 'canada': 22,
 'linux': 23,
 'gadgets': 24,
 'geek': 25,
 'funny': 26,
 'comics': 27,
 'bestof': 28,
 'netsec': 29,
 'worldnews': 30,
 'reddit.com': 31,
 'happy': 32,
 'Economics': 33,
 'AskReddit': 34,
 'self': 35,
 'cogsci': 36,
 'entertainment': 37,
 'Music': 38,
 'obama': 39,
 'environment': 40,
 'photography': 41,
 'videos': 42,
 'lolcats': 43,
 'offbeat': 44,
 'politics': 45,
 'programming': 46}

In [199]:
total_author_names = benign_author_names.tolist() + anomaly_author_names.tolist()
total_author_names = sorted(list(set(total_author_names)))

In [200]:
# Map each author name to its position in total_author_names (0-based).
# enumerate replaces the hand-maintained counter.
u2index = {author: index for index, author in enumerate(total_author_names)}
len(u2index)

787

In [201]:
u2index

{'0_o': 0,
 '138': 1,
 '13ren': 2,
 '1812overture': 3,
 '1esproc': 4,
 '315was_an_inside_job': 5,
 '43P04T34': 6,
 '7oby': 7,
 'AAjax': 8,
 'ABabyAteMyDingo': 9,
 'ANSICL': 10,
 'AbouBenAdhem': 11,
 'Aerik': 12,
 'Ajenthavoc': 13,
 'AliasHandler': 14,
 'AmericanGoyBlog': 15,
 'AngelaMotorman': 16,
 'AngledLuffa': 17,
 'Anonymous7777': 18,
 'AnteChronos': 19,
 'ApostrophePosse': 20,
 'ArcticCelt': 21,
 'Bagel': 22,
 'Battleloser': 23,
 'BedtimeForSheeple': 24,
 'BeetleB': 25,
 'Benny_Lava': 26,
 'Bensch': 27,
 'Bixie': 28,
 'Bloodlustt': 29,
 'Bloody_Eye': 30,
 'BlueBeard': 31,
 'BobGaffney': 32,
 'BraveSirRobin': 33,
 'BrianBoyko': 34,
 'Browzer': 35,
 'Burlapin': 36,
 'Busybyeski': 37,
 'CampusTour': 38,
 'CannedMango': 39,
 'Captain-Obliviouss': 40,
 'Chirp08': 41,
 'ChunkyLaFunga': 42,
 'Ciserus': 43,
 'Clothos': 44,
 'CodeMonkey1': 45,
 'Codebender': 46,
 'ColdSnickersBar': 47,
 'Cookie': 48,
 'CrackIsGoodForYou': 49,
 'CrimsonSun99': 50,
 'D-Style': 51,
 'DCGaymer': 52,
 'DOGA': 5

## get edge list data 

In [196]:
benign.shape, anomaly.shape

((8057, 21), (886, 21))

In [132]:
# Stack benign and anomalous records into one edge list with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which is deprecated and removed in pandas 2.0.
edgelist_df = pd.concat([benign, anomaly], ignore_index=True)
# NOTE(review): edges are ordered by 'retrieved_on' rather than by 'created_utc';
# confirm this is the intended notion of time for the model.
edgelist_df = edgelist_df.sort_values(by='retrieved_on')

In [195]:
edgelist_df.shape

(8943, 21)

In [136]:
edgelist_df[['author','subreddit','retrieved_on']].head(10)

Unnamed: 0,author,subreddit,retrieved_on
0,ultimatt42,science,1425846806
1,jonknee,programming,1425846807
4,burtonmkz,science,1425846810
5,pavel_lishin,reddit.com,1425846810
6,pavel_lishin,reddit.com,1425846810
7,sblinn,politics,1425846810
2,dons,programming,1425846811
3,Jedravent,politics,1425846811
8,WebZen,politics,1425846811
9,doodahdei,politics,1425846812


## get train/validation/test split 

In [137]:
import random

def generate_n_lists(num_of_lists, num_of_elements, value_from=0, value_to=100, seed=None):
    """Draw disjoint lists of distinct random integers from an inclusive range.

    Parameters
    ----------
    num_of_lists : int
        Number of lists to return.
    num_of_elements : int
        Number of integers in each list.
    value_from, value_to : int
        Inclusive bounds of the range the integers are sampled from.
    seed : int or None, optional
        When given, sampling uses a private ``random.Random(seed)`` so the
        result is reproducible. When ``None`` (default) the module-level
        ``random`` state is used, exactly as before.

    Returns
    -------
    list of list of int
        ``num_of_lists`` lists of ``num_of_elements`` integers each; no integer
        appears more than once across all lists.

    Raises
    ------
    ValueError
        Raised by ``sample`` when more values are requested than the range holds.
    """
    rng = random if seed is None else random.Random(seed)
    # One sample without replacement guarantees the lists never overlap.
    s = rng.sample(range(value_from, value_to + 1), num_of_lists * num_of_elements)
    return [s[i*num_of_elements:(i+1)*num_of_elements] for i in range(num_of_lists)]

l = generate_n_lists(2, 393, 0, 786)

In [140]:
len(l), len(l[0]), len(l[1])

(2, 393, 393)

In [142]:
import numpy as np

In [143]:
# Build a 3-tuple of index arrays: the first sampled list is cut at position 195
# into two parts and the second list is used whole.
# NOTE(review): presumably the order is (train, validation, test) — confirm against the consumer.
data_tvt = (np.array(l[0][:195]), np.array(l[0][195:]), np.array(l[1]))

In [147]:
type(data_tvt)

tuple

In [149]:
len(data_tvt[0]),len(data_tvt[1]), len(data_tvt[2])

(195, 198, 393)

# Get node features using NLP models
- To get node feature for user/author, we preprocess comments from each author, get their most frequently used word and feed these words into word2vec model to get embeddings as author node features.
- To get node feature for subreddit topic, we get the mostly mentioned word for each topic and feed these words into word2vec model to get embeddings as subreddit topic node features. 
#### Steps for comments/posts body processing are:
1. Convert words to lower
2. Remove numbers
3. Remove punctuation and symbols
4. Normalize the words (lemmatize and stem the words)

In [152]:
! pip install gensim

Collecting gensim
  Downloading gensim-4.2.0.tar.gz (23.2 MB)
     |████████████████████████████████| 23.2 MB 5.4 MB/s            
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: gensim
  Building wheel for gensim (setup.py) ... [?25ldone
[?25h  Created wheel for gensim: filename=gensim-4.2.0-cp36-cp36m-linux_x86_64.whl size=24166446 sha256=9774686b8c9ae3001e04a41854417313caf15d1fe168578fdb841821324e1232
  Stored in directory: /home/ec2-user/.cache/pip/wheels/44/1e/2b/b0056a533d057c3ed56c84fbdd79cca690496f4cd7c03c157c
Successfully built gensim
Installing collected packages: gensim
Successfully installed gensim-4.2.0


In [153]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
import re
import collections
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import gensim.downloader

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


### getting the pretrained models

In [154]:
# Load the pretrained 'word2vec-google-news-300' vectors through gensim's downloader.
# The recorded output shows a download of about 1.6 GB, so this cell is slow on first run.
vectors = gensim.downloader.load('word2vec-google-news-300')

[--------------------------------------------------] 1.4% 23.4/1662.8MB downloaded

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[=-------------------------------------------------] 3.4% 56.5/1662.8MB downloaded

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[==------------------------------------------------] 5.5% 91.9/1662.8MB downloaded

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[===-----------------------------------------------] 7.7% 128.3/1662.8MB downloaded

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[====----------------------------------------------] 9.9% 163.8/1662.8MB downloaded

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - '/home/ec2-user/nltk_data'
    - '/home/ec2-user/anaconda3/envs/pytorch_p36/nltk_data'
    - '/home/ec2-user/anaconda3/envs/pytorch_p36/share/nltk_data'
    - '/home/ec2-user/anaconda3/envs/pytorch_p36/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [158]:
# Make sure the NLTK stopword corpus is installed before reading it; on a fresh
# environment the lookup below raises LookupError otherwise. The call is a
# no-op (apart from a log line) when the corpus is already present.
nltk.download('stopwords')

# English stopwords as a set for O(1) membership tests during token filtering.
stopwords = set(nltk.corpus.stopwords.words('english'))
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Get the user node features (user2vec)

In [172]:
# Display the author-name -> integer index mapping; the index is used below as
# the row position of each user in the user feature matrix.
u2index

{'jumpyg1258': 0,
 'baxyjr': 1,
 'rogerssucks': 2,
 'haywire9000': 3,
 'Fidodo': 4,
 'The_Ultimate_Reality': 5,
 'Richeh': 6,
 'atomicthumbs': 7,
 'kareems': 8,
 'bananahead': 9,
 'FeedMePlease': 10,
 'argoff': 11,
 'Petrarch1603': 12,
 'danweber': 13,
 'robotevil': 14,
 'BeetleB': 15,
 'honus': 16,
 'Erudecorp': 17,
 'utexaspunk': 18,
 'BraveSirRobin': 19,
 'Oak': 20,
 'andrewd': 21,
 '315was_an_inside_job': 22,
 'syroncoda': 23,
 'Vreep-eep': 24,
 'machrider': 25,
 'formido': 26,
 'Grimalkin': 27,
 'lugfish': 28,
 'qgyh2': 29,
 'NSMike': 30,
 'jamesallen74': 31,
 'raedix': 32,
 'JasonDJ': 33,
 'casicatracha': 34,
 'dlds': 35,
 'thrakhath': 36,
 'Dildozer': 37,
 'martoo': 38,
 'apathy': 39,
 'RonObvious': 40,
 'onebit': 41,
 'cajolingwilhelm': 42,
 'wurtis16': 43,
 'kalazar': 44,
 'me_so_porny': 45,
 'rainman_104': 46,
 'Spacksack': 47,
 'CrimsonSun99': 48,
 'garyr_h': 49,
 'anions': 50,
 'fingers': 51,
 'Poromenos': 52,
 'btl': 53,
 'frutiger': 54,
 'Lystrodom': 55,
 'daisy0808': 56,

In [174]:
# Sanity-check the embedding lookup for a sample word: inspect the returned
# object's type and shape (the feature matrices below assume 300 dimensions).
type(vectors['hi']),vectors['hi'].shape

(numpy.ndarray, (300,))

In [189]:
# Build the user node features (user2vec): for every user, sum the word
# embeddings of the TOP_K most frequent non-stopword tokens across all of the
# user's comments. Row u2index[u] of the result holds the vector for user u.
EMBEDDING_DIM = 300  # dimensionality of the vectors returned by `vectors[word]`
TOP_K = 10           # number of most frequent words summed per user

final_user2vec_npy = np.zeros((len(u2index), EMBEDDING_DIM))

for u in u2index:
    user = edgelist_df.loc[edgelist_df['author'] == u]

    # Tokenize each comment after normalizing whitespace and case, stripping
    # digits, and replacing punctuation with spaces; then drop stopwords.
    comment_row_list = []
    for body in user['body']:
        text = body.replace('\n', " ").replace('\t', " ").lower()
        text = ''.join(ch for ch in text if not ch.isdigit())
        text = re.sub(r'[^\w\s]', ' ', text)
        tokens = word_tokenize(text)
        comment_row_list.append([tok for tok in tokens if tok not in stopwords])

    flat_list = [x for xs in comment_row_list for x in xs]
    counter = collections.Counter(flat_list)
    top10 = counter.most_common(TOP_K)
    print(f'top 10 words used by {u} are:', top10)

    # One row per top word; rows stay all-zero for words without an embedding
    # (and for unused slots when the user has fewer than TOP_K distinct words).
    final_vectors = np.zeros((TOP_K, EMBEDDING_DIM))
    for i, (word, _count) in enumerate(top10):
        try:
            final_vectors[i, :] = vectors[word]
        except KeyError:
            # Only an out-of-vocabulary lookup is expected here; any other
            # error (or a keyboard interrupt) must propagate.
            # NOTE(review): assumes `vectors` raises KeyError for unknown
            # words (dict / gensim KeyedVectors behavior) - confirm.
            print('no embeddings created for word: {}'.format(word))
    final_embeddings = np.sum(final_vectors, axis=0)

    # Show the intermediate arrays for the first user only, as a sanity check.
    if u2index[u] < 1:
        print(final_vectors.shape, final_embeddings.shape)
        print(final_vectors, final_embeddings)
    final_user2vec_npy[u2index[u], :] = final_embeddings

top 10 words used by jumpyg1258 are: [('says', 2), ('anything', 2), ('would', 2), ('place', 2), ('guy', 2), ('congress', 2), ('realize', 1), ('going', 1), ('ensure', 1), ('never', 1)]
(10, 300) (300,)
[[ 0.04370117 -0.11425781  0.06738281 ...  0.2578125   0.13867188
  -0.01177979]
 [ 0.07080078 -0.03491211  0.06542969 ... -0.20800781 -0.09326172
  -0.171875  ]
 [ 0.08935547  0.12988281  0.21289062 ... -0.15234375  0.00552368
  -0.10058594]
 ...
 [ 0.15136719  0.2421875   0.09326172 ... -0.06201172  0.05517578
  -0.16992188]
 [-0.484375    0.07714844 -0.13085938 ... -0.08740234  0.04443359
  -0.21972656]
 [ 0.02392578 -0.04614258  0.00390625 ... -0.26953125 -0.06884766
  -0.27539062]] [-1.67236328e-01  6.69433594e-01  1.32080078e-01  8.64501953e-01
 -5.40344238e-01 -4.69848633e-01  9.45678711e-01 -5.72448730e-01
  1.05920410e+00  1.19384766e-01 -4.04693604e-01 -1.31176758e+00
 -7.63000488e-01 -7.42492676e-02 -1.05029297e+00  9.28558350e-01
  5.77148438e-01  1.08013916e+00  1.99493408e-0

In [188]:
# Inspect the assembled user feature matrix and its shape
# (one row per entry of u2index, one column per embedding dimension).
final_user2vec_npy, final_user2vec_npy.shape

(array([[-0.16723633,  0.66943359,  0.13208008, ..., -0.4543457 ,
          0.16543579, -0.73339844],
        [-0.05300903,  0.44433594, -0.14715576, ..., -0.72937012,
         -1.85644531,  0.70336914],
        [ 0.81152344, -0.14349365, -0.23791504, ..., -0.68164062,
          1.07611084,  0.04150391],
        ...,
        [ 0.36010742, -0.33374023, -0.26541138, ..., -1.19122314,
         -0.17749023,  1.07963562],
        [ 0.47998047,  0.94696045,  0.20869446, ..., -0.46154785,
         -0.61923218,  0.2364502 ],
        [-0.52929688, -0.34033203,  0.05163574, ..., -1.22705078,
         -0.11096191,  0.47301102]]),
 (787, 300))

### Get the subreddit topic node features (prod2vec)

In [191]:
# Display the subreddit-name -> integer index mapping; the index is used below
# as the row position of each subreddit in the subreddit feature matrix.
p2index

{'yourweek': 0,
 'technology': 1,
 'software': 2,
 'atheism': 3,
 'xkcd': 4,
 'philosophy': 5,
 'psychology': 6,
 'Drugs': 7,
 'nsfw': 8,
 'WTF': 9,
 'lgbt': 10,
 'web_design': 11,
 'pics': 12,
 'scifi': 13,
 'gaming': 14,
 'science': 15,
 'business': 16,
 'sports': 17,
 'sex': 18,
 'math': 19,
 'apple': 20,
 'area51': 21,
 'canada': 22,
 'linux': 23,
 'gadgets': 24,
 'geek': 25,
 'funny': 26,
 'comics': 27,
 'bestof': 28,
 'netsec': 29,
 'worldnews': 30,
 'reddit.com': 31,
 'happy': 32,
 'Economics': 33,
 'AskReddit': 34,
 'self': 35,
 'cogsci': 36,
 'entertainment': 37,
 'Music': 38,
 'obama': 39,
 'environment': 40,
 'photography': 41,
 'videos': 42,
 'lolcats': 43,
 'offbeat': 44,
 'politics': 45,
 'programming': 46}

In [193]:
# Build the subreddit node features (prod2vec): for every subreddit, sum the
# word embeddings of the TOP_K most frequent non-stopword tokens across all
# comments posted in it. Row p2index[p] of the result holds the vector for p.
EMBEDDING_DIM = 300  # dimensionality of the vectors returned by `vectors[word]`
TOP_K = 10           # number of most frequent words summed per subreddit

final_prod2vec_npy = np.zeros((len(p2index), EMBEDDING_DIM))

for p in p2index:
    subreddit = edgelist_df.loc[edgelist_df['subreddit'] == p]

    # Tokenize each comment after normalizing whitespace and case, stripping
    # digits, and replacing punctuation with spaces; then drop stopwords.
    subreddit_row_list = []
    for body in subreddit['body']:
        text = body.replace('\n', " ").replace('\t', " ").lower()
        text = ''.join(ch for ch in text if not ch.isdigit())
        text = re.sub(r'[^\w\s]', ' ', text)
        tokens = word_tokenize(text)
        subreddit_row_list.append([tok for tok in tokens if tok not in stopwords])

    flat_list = [x for xs in subreddit_row_list for x in xs]
    counter = collections.Counter(flat_list)
    top10 = counter.most_common(TOP_K)
    print(f'top 10 words for subreddit topic {p} are:', top10)

    # One row per top word; rows stay all-zero for words without an embedding
    # (and for unused slots when fewer than TOP_K distinct words exist).
    final_vectors = np.zeros((TOP_K, EMBEDDING_DIM))
    for i, (word, _count) in enumerate(top10):
        try:
            final_vectors[i, :] = vectors[word]
        except KeyError:
            # Only an out-of-vocabulary lookup is expected here; any other
            # error (or a keyboard interrupt) must propagate.
            # NOTE(review): assumes `vectors` raises KeyError for unknown
            # words (dict / gensim KeyedVectors behavior) - confirm.
            print('no embeddings created for word: {}'.format(word))
    final_embeddings = np.sum(final_vectors, axis=0)
    final_prod2vec_npy[p2index[p], :] = final_embeddings

top 10 words for subreddit topic yourweek are: [('asbestos', 2), ('shut', 2), ('gypsum', 2), ('boring', 2), ('fork', 2), ('lift', 2), ('would', 2), ('vending', 2), ('fall', 2), ('summer', 2)]
top 10 words for subreddit topic technology are: [('gt', 66), ('people', 54), ('one', 52), ('like', 49), ('would', 46), ('get', 37), ('really', 31), ('craigslist', 31), ('want', 27), ('use', 26)]
top 10 words for subreddit topic software are: [('think', 1), ('meant', 1), ('x', 1)]
top 10 words for subreddit topic atheism are: [('religious', 18), ('god', 17), ('people', 12), ('like', 12), ('atheists', 11), ('christian', 10), ('gt', 9), ('would', 8), ('believe', 7), ('religion', 6)]
top 10 words for subreddit topic xkcd are: [('like', 3), ('xkcd', 2), ('several', 1), ('earlier', 1), ('comics', 1), ('touching', 1), ('biting', 1), ('hilarious', 1), ('always', 1), ('loved', 1)]
top 10 words for subreddit topic philosophy are: [('mathematics', 13), ('theory', 13), ('system', 9), ('rules', 9), ('one', 8)

In [194]:
# Confirm the subreddit feature matrix is a numpy array with one row per
# entry of p2index and one column per embedding dimension.
type(final_prod2vec_npy),final_prod2vec_npy.shape

(numpy.ndarray, (47, 300))

# References

Jason Baumgartner, Savvas Zannettou, Brian Keegan, Megan Squire, and Jeremy Blackburn. 2020. The Pushshift Reddit Dataset. In Proceedings of the International AAAI Conference on Web and Social Media (ICWSM 2020).