In [2]:
import praw
from bs4 import BeautifulSoup
import re
import boto3
import pickle
import s3fs
import pyspark
import pandas as pd
import re

In [3]:
# Reddit API credentials
reddit = praw.Reddit(
    client_id ='**', 
    client_secret ='**',  
    user_agent ='**',  
    username ='**',  
)

In [13]:
# AWS credentials
AWS_ACCESS_KEY_ID = '**'
AWS_SECRET_ACCESS_KEY = '**'
bucket = 'reddit-tifu'
s3_resource = boto3.resource('s3')
s3 = boto3.client("s3")
s3_bucket = s3_resource.Bucket(bucket)

# Read list of subreddits from HTML

In [4]:
def read_html(bucket, txt_file, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY):
    '''
    read html using beautiful soup
    '''
    fs = s3fs.S3FileSystem(key = AWS_ACCESS_KEY_ID, secret = AWS_SECRET_ACCESS_KEY)
    with fs.open(f'{bucket}/{txt_file}', 'rb') as file:
        subreddit_html = file.read()   
    soup = BeautifulSoup(subreddit_html, "html.parser")
    list_html = soup.find("div", {"class": "md wiki"})

    return list_html

In [5]:
def find_subreddits(html_text):
    '''
    search for list of subreddits
    '''
    subreddits_list = set()
    for line in html_text:
        matches = re.findall('(/r)\/([\w]+)', str(line))
        if matches:
            for match in matches:
                subreddits_list.add(match[1].lower())
    subreddits_list = list(subreddits_list)

    return subreddits_list

In [6]:
list_html = read_html(bucket, 'subreddit_list.txt', AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)

In [7]:
subreddits_list = find_subreddits(list_html)

In [8]:
len(subreddits_list)

1774

In [9]:
qna_subreddits = [ent for ent in subreddits_list if 'ask' in ent]
qna_subreddits

['askwomen',
 'askhistory',
 'askreddit',
 'shittyaskscience',
 'asksciencefiction',
 'asktransgender',
 'askgaybros',
 'tooafraidtoask',
 'askhistorians',
 'askdocs',
 'askscience',
 'askphilosophy',
 'trueaskreddit',
 'askmen',
 'askculinary',
 'askouija',
 'asksocialscience',
 'askmenover30',
 'collegebasketball',
 'askengineers',
 'askredditafterdark',
 'nobodyasked']

In [14]:
len(qna_subreddits)

22

# Download submissions from each subreddit 

In [11]:
other_topics = []
for topic in subreddits_list:
    if topic not in qna_subreddits:
        other_topics.append(topic)  

In [None]:
for ind, subred in enumerate(other_topics):
    try:
        s3.head_object(Bucket=bucket, Key=f'submissions_subreddit/submissions_{subred}.pkl')
        pass
    except:
        if subred != 'virginsvschad':
            submissions = []
            for submission in reddit.subreddit(subred).hot(limit=None):
                submissions.append(submission)
            key=f'submissions_subreddit/submissions_{subred}.pkl'
            
            submissions_pkl = pickle.dumps(submissions) 
            s3_resource.Object(bucket,key).put(Body=submissions_pkl)
            
            print(f'Done with {subred}')

In [16]:
subreddits_pkl = []
for object_summary in s3_bucket.objects.filter(Prefix="submissions_subreddit/"):
    subreddit_file = object_summary.key
    if subreddit_file != 'submissions_subreddit/':
        subreddits_pkl.append(subreddit_file)

In [17]:
len(subreddits_pkl)

694

# Extract text from each subreddit

In [18]:
def extract_info(sub):
    title = sub.title
    main_text = sub.selftext
    sentences = re.split('(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', main_text)
    if title == '' or sentences == []:
        return []
    else:
        return sentences

In [19]:
sc = pyspark.SparkContext('local')

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/13 18:55:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [20]:
def batch_extract(subset):
    rdd = sc.parallelize(subset)
    submissions_info = rdd.map(extract_info)
    submissions_data = submissions_info.collect()

    return submissions_data

In [43]:
subreddit.split('/')[1]

'submissions_13or30.pkl'

In [None]:
for ind, subreddit in enumerate(subreddits_pkl):
    key=subreddit
    submissions = pickle.loads(s3_resource.Bucket(bucket).Object(key).get()['Body'].read())
    submissions_data = batch_extract(submissions)

    submissions_pkl = pickle.dumps(submissions_data) 
    subreddit_split = subreddit.split('/')[1]
    key = f'submissions_subreddit/batch_data/{subreddit_split}'
    s3_resource.Object(bucket,key).put(Body=submissions_pkl)
    if ind%100 == 0:
        print(f'Done with {ind}, {subreddit_split}')

In [51]:
submissions_data = []
for subreddit in subreddits_pkl:
    subreddit_split = subreddit.split('/')[1]
    key = f'submissions_subreddit/batch_data/{subreddit_split}'
    submissions_data += pickle.loads(s3_resource.Bucket(bucket).Object(key).get()['Body'].read())

In [55]:
submissions_train_df = pd.DataFrame()
train_texts = []
for content in submissions_data: 
    if content != [] and content != ['']:
        for ent in content:
            train_texts.append(ent)  

In [56]:
len(train_texts)

1839912

In [57]:
submissions_train_df['text'] = train_texts

In [58]:
submissions_train_df.head()

Unnamed: 0,text
0,This post contains content not supported on ol...
1,[Click here to view the full post](https://sh....
2,I still get asked for my ID when I go to clubs.
3,My mum and dad don't get thid Charleston music...
4,I’m 31


In [59]:
s3_path = f"s3a://{bucket}/subreddits_train_data_3.json"
submissions_train_df.to_json(s3_path, orient = 'index')