In [10]:
import praw
from bs4 import BeautifulSoup
import re
import boto3
import pickle
import s3fs
import pyspark
import pandas as pd
import re

In [2]:
# Reddit API credentials
reddit = praw.Reddit(
    client_id ='**', 
    client_secret ='**',  
    user_agent ='**',  
    username ='**',  
)

In [3]:
# AWS credentials
AWS_ACCESS_KEY_ID = '**'
AWS_SECRET_ACCESS_KEY = '**'

## Read list of subreddits from HTML

In [4]:
def read_html(bucket, txt_file, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY):
    '''
    read html using beautiful soup
    '''
    fs = s3fs.S3FileSystem(key = AWS_ACCESS_KEY_ID, secret = AWS_SECRET_ACCESS_KEY)
    with fs.open(f'{bucket}/{txt_file}', 'rb') as file:
        subreddit_html = file.read()   
    soup = BeautifulSoup(subreddit_html, "html.parser")
    list_html = soup.find("div", {"class": "md wiki"})

    return list_html

In [5]:
def find_subreddits(html_text):
    '''
    search for list of subreddits
    '''
    subreddits_list = set()
    for line in html_text:
        matches = re.findall('(/r)\/([\w]+)', str(line))
        if matches:
            for match in matches:
                subreddits_list.add(match[1].lower())
    subreddits_list = list(subreddits_list)

    return subreddits_list

In [39]:
list_html = read_html(bucket, 'subreddit_list.txt', AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)

In [40]:
subreddits_list = find_subreddits(list_html)

In [41]:
qna_subreddits = [ent for ent in subreddits_list if 'ask' in ent]
qna_subreddits

['askhistorians',
 'askreddit',
 'trueaskreddit',
 'askmenover30',
 'askwomen',
 'shittyaskscience',
 'askredditafterdark',
 'tooafraidtoask',
 'asktransgender',
 'askgaybros',
 'nobodyasked',
 'askouija',
 'asksciencefiction',
 'askculinary',
 'askmen',
 'collegebasketball',
 'askhistory',
 'asksocialscience',
 'askengineers',
 'askscience',
 'askphilosophy',
 'askdocs']

In [42]:
len(qna_subreddits)

22

## Download submissions from each subreddit

In [11]:
# download subreddit submissions
submissions = []
for subred in qna_subreddits:
    for submission in reddit.subreddit(subred).hot(limit=None):
        submissions.append(submission)
    print(f'Done with {subred}')

Done with askhistorians
Done with askreddit
Done with trueaskreddit
Done with askmenover30
Done with askwomen
Done with shittyaskscience
Done with askredditafterdark
Done with tooafraidtoask
Done with asktransgender
Done with askgaybros
Done with nobodyasked
Done with askouija
Done with asksciencefiction
Done with askculinary
Done with askmen
Done with collegebasketball
Done with askhistory
Done with asksocialscience
Done with askengineers
Done with askscience
Done with askphilosophy
Done with askdocs


In [6]:
key='submissions.pkl'

In [7]:
bucket='reddit-tifu'
s3_resource = boto3.resource('s3')

In [17]:
# submissions_pkl = pickle.dumps(submissions) 
# s3_resource.Object(bucket,key).put(Body=submissions_pkl)

In [8]:
submissions = pickle.loads(s3_resource.Bucket(bucket).Object(key).get()['Body'].read())

In [9]:
len(submissions)

17327

## Extract text from each subreddit

In [16]:
def extract_info(sub):
    '''
    extract sentences from text
    '''
    title = sub.title
    main_text = sub.selftext
    sentences = re.split('(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', main_text)
    if title == '' or sentences == []:
        return []
    else:
        return sentences

In [17]:
sc = pyspark.SparkContext('local')

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/01 16:49:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [31]:
subset_size = 1000
n_subsets = 1 + len(submissions)//subset_size

In [20]:
def batch_extract(subset):
    '''
    parallelize sentence extraction
    '''
    rdd = sc.parallelize(subset)
    submissions_info = rdd.map(extract_info)
    submissions_data = submissions_info.collect()

    return submissions_data

In [None]:
# loop over batches
for n in range(n_subsets):
    start = n*subset_size
    if n < n_subsets - 1:
        end = (n + 1)*subset_size
        subset = submissions[start: end]
    else: 
        subset = submissions[start: ]
        
    submissions_data = batch_extract(subset)

    submissions_pkl = pickle.dumps(submissions_data) 
    key = f'batch_data/batch_{str(n)}.pkl'
    s3_resource.Object(bucket,key).put(Body=submissions_pkl)
    print(f'Done with {n}')

In [34]:
# save batch data in S3 bucket
submissions_data = []
for n in range(n_subsets):
    key = f'batch_data/batch_{str(n)}.pkl'
    submissions_data += pickle.loads(s3_resource.Bucket(bucket).Object(key).get()['Body'].read())

In [35]:
assert len(submissions_data) == len(submissions)

In [49]:
# convert list of text to dataframe
submissions_train_df = pd.DataFrame()
# submissions_test_df = pd.DataFrame()
# titles = []
train_texts = []
# test_texts = []
for content in submissions_data: 
    if content != [] and content != ['']:
       # titles.append(ent[0])     
        for ent in content:
            train_texts.append(ent)  
        
       # test_texts.append(ent[2])     

In [50]:
len(train_texts)

77920

In [51]:
# submissions_train_df['title'] = titles
submissions_train_df['text'] = train_texts
# submissions_test_df['text'] = test_texts

In [52]:
submissions_train_df.head()

Unnamed: 0,text
0,[Previous](https://www.reddit.com/r/AskHistori...
1,Nobody can read all the questions and answers ...
2,[Previous weeks!](https://www.reddit.com/r/Ask...
3,Mods *will* remove questions which we deem to ...
4,We *will* remove answers which don't include a...


In [53]:
s3_path = f"s3a://{bucket}/subreddits_train_data.json"
submissions_train_df.to_json(s3_path, orient = 'index')

In [15]:
s3_path = f"s3a://{bucket}/subreddits_test_data.json"
submissions_test_df.to_json(s3_path)