In [1]:
import os
import pickle as pk
from datetime import datetime

from convokit import Corpus, download

from preprocess import preprocess
# Directory where the pickled comment tables are written and read back.
# File names are appended by plain string concatenation, so the trailing
# slash is required.
data_dir = "./data/"

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Choose the subreddit, fetch its ConvoKit dataset, and load it; the value
# returned by `download` is handed to `Corpus` as the `filename` argument.
subreddit = 'snowboarding'
corpus_path = download(f'subreddit-{subreddit}')
corpus = Corpus(filename=corpus_path)


Downloading subreddit-snowboarding to /Users/wyw/.convokit/downloads/subreddit-snowboarding
Downloading subreddit-snowboarding from http://zissou.infosci.cornell.edu/convokit/datasets/subreddit-corpus/corpus-zipped/sneakermarketuk~-~soccer/snowboarding.corpus.zip (89.6MB)... Done


In [None]:
# Build the utterance table, keeping only utterances whose text has more than
# five whitespace-separated tokens.
df = corpus.get_utterances_dataframe(selector=lambda utt: len(utt.text.split()) > 5)
# Drop utterances by deleted accounts. The .copy() makes the filtered frame
# independent, so the column assignments below write to it directly rather
# than to a slice of the unfiltered frame (pandas' SettingWithCopyWarning).
df = df[df['speaker'] != "[deleted]"].copy()
# NOTE(review): datetime.fromtimestamp() converts using the machine's local
# timezone, so month boundaries depend on where this runs -- confirm intended.
df['datetime'] = df['timestamp'].apply(lambda ts: datetime.fromtimestamp(ts))
# Zero-padded "YYYY-MM" label used for grouping comments by month.
df['year-month'] = df['datetime'].apply(lambda d: f"{d.year}-{d.month:02d}")

In [None]:
# Keep only rows that reply to another utterance (non-null `reply_to`).
# The copy detaches the result from `df` so later edits do not touch it.
is_reply = df['reply_to'].notna()
comments = df[is_reply].copy()

In [None]:
# List the distinct year-month labels present among the comments.
comments['year-month'].unique()

In [None]:
# Number of comments per year-month label, shown as a plain dict.
comments.groupby(['year-month']).size().to_dict()

In [None]:
# Preview the first rows of the comment table.
comments.head()

In [None]:
# Columns retained in the comment table that gets pickled.
usecols = [
    'year-month',
    'timestamp',
    'text',
    'speaker',
]

In [None]:
# Narrow the table to the retained columns; the copy detaches the result
# from the wider frame it was selected from.
comments = comments.loc[:, usecols].copy()

In [None]:
# Keep the raw text next to the cleaned text. The guard makes the cell safe
# to re-run: without it, a second run would copy the already-preprocessed
# text into `original_text` and then preprocess it again.
if 'original_text' not in comments.columns:
    comments['original_text'] = comments['text']
# `text` is always derived from the untouched raw text.
comments['text'] = comments['original_text'].apply(preprocess)

In [None]:
# Preview the table again to inspect the preprocessed `text` column.
comments.head()

In [41]:
# Persist the retained columns of the comment table as a pickle named after
# the subreddit. The context manager closes the file handle deterministically
# (the original left the handle returned by open() unclosed).
with open(data_dir + f"{subreddit}-comments.pk", "wb") as out_file:
    pk.dump(comments[usecols], out_file)

In [18]:
# Load a previously pickled comment table for another subreddit.
subreddit = "movies"
# NOTE(security): unpickling can execute arbitrary code; only load pickle
# files that were produced by this notebook or another trusted source.
# The context manager closes the file handle once the table is loaded.
with open(data_dir + f"{subreddit}-comments.pk", "rb") as in_file:
    df = pk.load(in_file)

In [19]:
# Shape of the loaded table as (rows, columns).
df.shape

(17757184, 3)

In [16]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0_level_0,year-month,timestamp,text,speaker
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
c04malt,2008-07,1215513011,way to be a sterotypical jew,moogle516
c054y64,2008-08,1219432706,if you are the sort of jewish person who is ve...,marcstober
c063kbd,2008-10,1225377639,i would spell shlemile as shlemiel but what do...,Caper8888
c070qdr,2009-01,1231890143,rabbi so many of you have said to me this week...,egbert
c08gt10,2009-03,1238045719,i swear i saw shemale at first glance,sputterpop


In [17]:
# Parse the integer year out of each "year-month" label (the part before the
# first dash), keep rows from 2014 onwards, and show the resulting shape.
df['year'] = df['year-month'].map(lambda label: int(label.partition("-")[0]))
is_recent = df['year'] >= 2014
df = df[is_recent]
df.shape

(481116, 5)

## Large files: stream `utterances.jsonl` line by line

In [25]:
import jsonlines
import pandas as pd
from tqdm import tqdm

In [26]:
# Subreddit processed in this section; the name selects the downloaded
# corpus directory and is reused in the output pickle's file name.
subreddit = 'snowboarding'

In [28]:
# Stream the downloaded corpus's utterances.jsonl record by record and keep
# a slim record for each reply that passes the filters below.

# Resolve the ConvoKit download directory from the current user's home
# directory instead of a hard-coded /Users/<name> path, so the cell runs on
# machines other than the original author's.
utterances_path = os.path.join(
    os.path.expanduser("~"),
    ".convokit",
    "downloads",
    f"subreddit-{subreddit}",
    "utterances.jsonl",
)

# Cutoff in epoch seconds: 1388552400 is 2014-01-01 05:00:00 UTC.
# NOTE(review): that is midnight at UTC-5, and datetime.fromtimestamp() below
# also uses the machine's local timezone -- confirm local time is intended.
MIN_TIMESTAMP = 1388552400

comments = list()
with jsonlines.open(utterances_path) as reader:
    for obj in tqdm(reader):
        # only collect comments after 2014/1: must be newer than the cutoff
        # and have a truthy `reply_to`
        if not (obj['timestamp'] > MIN_TIMESTAMP and obj['reply_to']):
            continue
        # raw text must have more than five whitespace-separated tokens
        if len(obj['text'].split()) <= 5:
            continue
        text = preprocess(obj['text'])
        # preprocessed text must be longer than five characters
        if len(text) <= 5:
            continue
        dt = datetime.fromtimestamp(obj['timestamp'])
        # Build the output record directly rather than mutating `obj`.
        comments.append({
            'id': obj['id'],
            'year': dt.year,
            'month': dt.month,
            'year-month': f"{dt.year}-{dt.month:02d}",
            'text': text,
        })

712718it [00:25, 27664.73it/s] 


In [32]:
import numpy as np

# Draw at most 500,000 comment records without replacement. A seeded
# Generator replaces the unseeded global np.random state, so re-running the
# notebook on the same `comments` list yields the same sample.
SAMPLE_SEED = 0
rng = np.random.default_rng(SAMPLE_SEED)
sample_comments = rng.choice(comments, size=min(len(comments), 500000), replace=False)

In [33]:
# Display the sampled records (NumPy abbreviates long arrays with "...").
sample_comments

array([{'id': 'dlhude7', 'year': 2017, 'month': 8, 'year-month': '2017-08', 'text': 'i lost it at old school shred it'},
       {'id': 'dfh6pq8', 'year': 2017, 'month': 3, 'year-month': '2017-03', 'text': 'maybe or maybe not  you should have seen those pipes in person  i was at the north american open in 86 at sunshine with a similar pipe  no transition the same lumpy no vertical portion  compared to a modern perfectly groomed park they were way different'},
       {'id': 'du2atry', 'year': 2018, 'month': 2, 'year-month': '2018-02', 'text': 'no way should mark and max been 2 and 3 both their top laps were more technical that reds '},
       ...,
       {'id': 'cjvyekd', 'year': 2014, 'month': 8, 'year-month': '2014-08', 'text': 'no one is too old for this no one'},
       {'id': 'dxkuer5', 'year': 2018, 'month': 4, 'year-month': '2018-04', 'text': 'cork in triple back half out 4 flips 1 and a half spins '},
       {'id': 'cp4pphq', 'year': 2015, 'month': 3, 'year-month': '2015-03', 'te

In [14]:
# Turn the collected records into a table: one row per record, one column
# per key.
comments = pd.DataFrame(comments)


In [15]:
# Index the table by utterance id and normalise its columns before saving.
comments = comments.set_index("id")
# rename() skips labels that are absent, so this is a no-op for tables that
# have no `user` column.
comments = comments.rename(columns={"user": "speaker"})
# errors="ignore": a table built from records without a `reply_to` key has no
# such column, and a plain drop() would raise KeyError and abort the cell.
comments = comments.drop(columns="reply_to", errors="ignore")
comments.head()

Unnamed: 0_level_0,year-month,text,speaker
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ceeft1x,2013-12,maybe not in age of ultron but theyll probably...,_Valisk
ceefta0,2013-12,nobody cared about iron man before his movie p...,Nova178
ceefted,2013-12,i think its a dvd release since they get the d...,mardfet
ceeftg7,2013-12,people really oversell dc animated series spoi...,symon_says
ceeftn0,2013-12,wasnt she covered in blue at some point too,AtrumTalio


In [16]:
# Persist the comment table as a pickle named after the subreddit. The
# context manager closes the file handle deterministically (the original
# left the handle returned by open() unclosed).
with open(data_dir + f"{subreddit}-comments.pk", "wb") as out_file:
    pk.dump(comments, out_file)