In [1]:
import praw
from bs4 import BeautifulSoup
import re
import boto3
import pickle
import s3fs
import pyspark
import pandas as pd
import re

In [2]:
# Reddit API credentials
reddit = praw.Reddit(
    client_id ='**', 
    client_secret ='**',  
    user_agent ='**',  
    username ='**',  
)

In [3]:
# AWS credentials
AWS_ACCESS_KEY_ID = '**'
AWS_SECRET_ACCESS_KEY = '**'
s3_bucket = '**'

# Read list of subreddits from HTML

In [4]:
def read_html(bucket, txt_file, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY):
    '''
    read html using beautiful soup
    '''
    fs = s3fs.S3FileSystem(key = AWS_ACCESS_KEY_ID, secret = AWS_SECRET_ACCESS_KEY)
    with fs.open(f'{bucket}/{txt_file}', 'rb') as file:
        subreddit_html = file.read()   
    soup = BeautifulSoup(subreddit_html, "html.parser")
    list_html = soup.find("div", {"class": "md wiki"})

    return list_html

In [5]:
def find_subreddits(html_text):
    '''
    search for list of subreddits
    '''
    subreddits_list = set()
    for line in html_text:
        matches = re.findall('(/r)\/([\w]+)', str(line))
        if matches:
            for match in matches:
                subreddits_list.add(match[1].lower())
    subreddits_list = list(subreddits_list)

    return subreddits_list

In [6]:
list_html = read_html(s3_bucket, 'subreddit_list.txt', AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)

In [7]:
subreddits_list = find_subreddits(list_html)

In [8]:
len(subreddits_list)

1774

In [9]:
qna_subreddits = [ent for ent in subreddits_list if 'ask' in ent]
qna_subreddits

['askwomen',
 'askhistory',
 'askreddit',
 'shittyaskscience',
 'asksciencefiction',
 'asktransgender',
 'askgaybros',
 'tooafraidtoask',
 'askhistorians',
 'askdocs',
 'askscience',
 'askphilosophy',
 'trueaskreddit',
 'askmen',
 'askculinary',
 'askouija',
 'asksocialscience',
 'askmenover30',
 'collegebasketball',
 'askengineers',
 'askredditafterdark',
 'nobodyasked']

In [14]:
len(qna_subreddits)

22

# Download submissions from each subreddit 

In [11]:
other_topics = []
for topic in subreddits_list:
    if topic not in qna_subreddits:
        other_topics.append(topic)  

In [12]:
bucket='reddit-tifu'
s3_resource = boto3.resource('s3')

In [None]:
s3 = boto3.client("s3")
for ind, subred in enumerate(other_topics):
    try:
        s3.head_object(Bucket=bucket, Key=f'submissions_subreddit/submissions_{subred}.pkl')
        pass
    except:
        if subred != 'virginsvschad':
            submissions = []
            for submission in reddit.subreddit(subred).hot(limit=None):
                submissions.append(submission)
            key=f'submissions_subreddit/submissions_{subred}.pkl'
            
            submissions_pkl = pickle.dumps(submissions) 
            s3_resource.Object(bucket,key).put(Body=submissions_pkl)
            
            print(f'Done with {subred}')

In [35]:
submissions = pickle.loads(s3_resource.Bucket(bucket).Object(key).get()['Body'].read())

In [18]:
len(submissions)

41017

# Extract text from each subreddit

In [22]:
def extract_info(sub):
    title = sub.title
    main_text = sub.selftext
    sentences = re.split('(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', main_text)
    if title == '' or sentences == []:
        return []
    else:
        # comments = []
        # for top_level_comment in sub.comments:
        #     try:
        #         comments.append((top_level_comment.body, top_level_comment.score))  
        #     except:
        #         comments.append(())
        return sentences

In [23]:
sc = pyspark.SparkContext('local')

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/09 17:02:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [50]:
subset_size = 1000
n_subsets = 1 + len(submissions)//subset_size

In [25]:
def batch_extract(subset):
    rdd = sc.parallelize(subset)
    submissions_info = rdd.map(extract_info)
    submissions_data = submissions_info.collect()

    return submissions_data

In [None]:
for n in range(n_subsets):
    start = n*subset_size
    if n < n_subsets - 1:
        end = (n + 1)*subset_size
        subset = submissions[start: end]
    else: 
        subset = submissions[start: ]
        
    submissions_data = batch_extract(subset)

    submissions_pkl = pickle.dumps(submissions_data) 
    key = f'batch_data_2/batch_{str(n)}.pkl'
    s3_resource.Object(bucket,key).put(Body=submissions_pkl)
    print(f'Done with {n}')

In [52]:
submissions_data = []
for n in range(n_subsets):
    key = f'batch_data_2/batch_{str(n)}.pkl'
    submissions_data += pickle.loads(s3_resource.Bucket(s3_bucket).Object(key).get()['Body'].read())

In [53]:
assert len(submissions_data) == len(submissions)

In [54]:
submissions_train_df = pd.DataFrame()
# submissions_test_df = pd.DataFrame()
# titles = []
train_texts = []
# test_texts = []
for content in submissions_data: 
    if content != [] and content != ['']:
       # titles.append(ent[0])     
        for ent in content:
            train_texts.append(ent)  
        
       # test_texts.append(ent[2])     

In [55]:
len(train_texts)

85850

In [56]:
# submissions_train_df['title'] = titles
submissions_train_df['text'] = train_texts
# submissions_test_df['text'] = test_texts

In [57]:
submissions_train_df.head()

Unnamed: 0,text
0,Another home on the eastern plains of Colorado...
1,Colorado stopped using state Medicaid funds on...
2,Here’s one of many that are now abandoned-
3,More Blossoms & Bandos.
4,This house has a very interesting history.


In [58]:
s3_path = f"s3a://{bucket}/subreddits_train_data_2.json"
submissions_train_df.to_json(s3_path, orient = 'index')

In [15]:
s3_path = f"s3a://{bucket}/subreddits_test_data.json"
submissions_test_df.to_json(s3_path)