In [2]:
import os
import pickle
import re

import boto3
import pandas as pd
import praw
import pyspark
import s3fs
from bs4 import BeautifulSoup

In [3]:
# Reddit API client.
# The credentials are read from environment variables so that no secret is
# stored in the notebook. Required variables: REDDIT_CLIENT_ID,
# REDDIT_CLIENT_SECRET, REDDIT_USER_AGENT and REDDIT_USERNAME. A missing
# variable raises KeyError in this cell instead of failing later inside an
# API call.
reddit = praw.Reddit(
    client_id=os.environ['REDDIT_CLIENT_ID'],
    client_secret=os.environ['REDDIT_CLIENT_SECRET'],
    user_agent=os.environ['REDDIT_USER_AGENT'],
    username=os.environ['REDDIT_USERNAME'],
)

In [13]:
# AWS configuration.
# The key pair comes from the standard AWS environment variables instead of
# being written into the notebook. No keys are passed to boto3 below, so boto3
# resolves credentials on its own; the two constants are only handed to s3fs
# (see read_html). When a variable is unset the constant is None.
# NOTE(review): with key/secret set to None, s3fs is expected to fall back to
# its own default credential lookup — confirm for the installed version.
AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID')
AWS_SECRET_ACCESS_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY')
bucket = 'reddit-tifu'
s3_resource = boto3.resource('s3')
s3 = boto3.client("s3")
s3_bucket = s3_resource.Bucket(bucket)

# Read list of subreddits from HTML

In [4]:
def read_html(bucket, txt_file, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY):
    '''
    Fetch a saved HTML page from S3 and return its wiki container.

    The object `{bucket}/{txt_file}` is opened in binary mode through s3fs
    with the given key pair and parsed with BeautifulSoup ("html.parser").
    The result of looking up the first <div> whose class is "md wiki" is
    returned.
    '''
    s3_filesystem = s3fs.S3FileSystem(
        key=AWS_ACCESS_KEY_ID,
        secret=AWS_SECRET_ACCESS_KEY,
    )
    with s3_filesystem.open(f'{bucket}/{txt_file}', 'rb') as handle:
        raw_page = handle.read()

    parsed_page = BeautifulSoup(raw_page, "html.parser")
    return parsed_page.find("div", {"class": "md wiki"})

In [5]:
def find_subreddits(html_text):
    '''
    Collect the unique subreddit names referenced as "/r/<name>".

    Parameters
    ----------
    html_text : iterable or None
        Pieces of markup; every item is converted with str() before it is
        searched. None is treated as empty input.

    Returns
    -------
    list of str
        Lower-cased names without duplicates, sorted alphabetically so that
        repeated runs produce the same order.
    '''
    if html_text is None:
        return []

    # Raw string: "\w" inside a normal literal is an invalid escape sequence,
    # and "/" does not need escaping at all.
    pattern = re.compile(r'/r/(\w+)')

    names = set()
    for line in html_text:
        names.update(name.lower() for name in pattern.findall(str(line)))

    return sorted(names)

In [6]:
# Load the saved subreddit-list page from S3 and keep its "md wiki" container.
list_html = read_html(bucket, 'subreddit_list.txt', AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)

In [7]:
# Extract the unique, lower-cased subreddit names from the parsed container.
subreddits_list = find_subreddits(list_html)

In [8]:
# Number of distinct subreddit names found.
len(subreddits_list)

1774

In [9]:
# Subreddits whose name contains the substring 'ask' are set aside as Q&A
# communities.
# NOTE(review): a plain substring test also matches names where 'ask' is part
# of another word (the recorded output includes 'collegebasketball') — confirm
# whether that is intended.
qna_subreddits = list(filter(lambda name: 'ask' in name, subreddits_list))
qna_subreddits

['askwomen',
 'askhistory',
 'askreddit',
 'shittyaskscience',
 'asksciencefiction',
 'asktransgender',
 'askgaybros',
 'tooafraidtoask',
 'askhistorians',
 'askdocs',
 'askscience',
 'askphilosophy',
 'trueaskreddit',
 'askmen',
 'askculinary',
 'askouija',
 'asksocialscience',
 'askmenover30',
 'collegebasketball',
 'askengineers',
 'askredditafterdark',
 'nobodyasked']

In [14]:
# Number of subreddits flagged as Q&A.
len(qna_subreddits)

22

# Download submissions from each subreddit 

In [11]:
# Every subreddit that was not flagged as Q&A, in the order of subreddits_list.
other_topics = [topic for topic in subreddits_list if topic not in qna_subreddits]

In [None]:
# Download the "hot" submissions of every non-Q&A subreddit that is not stored
# in S3 yet. Subreddits whose pickle already exists are skipped, so re-running
# the cell resumes where the previous run stopped.
for subred in other_topics:
    # Explicitly excluded by name; the reason is not recorded in the notebook.
    if subred == 'virginsvschad':
        continue

    key = f'submissions_subreddit/submissions_{subred}.pkl'
    try:
        s3.head_object(Bucket=bucket, Key=key)
    except s3.exceptions.ClientError as err:
        # Only "object does not exist" means the subreddit still has to be
        # fetched. Any other failure (permissions, throttling, ...) is a real
        # error and must not be mistaken for a missing file.
        if err.response.get('Error', {}).get('Code') not in ('404', 'NoSuchKey', 'NotFound'):
            raise
    else:
        # The pickle is already in the bucket.
        continue

    # limit=None: take as many submissions as the listing yields.
    submissions = list(reddit.subreddit(subred).hot(limit=None))

    submissions_pkl = pickle.dumps(submissions)
    s3_resource.Object(bucket, key).put(Body=submissions_pkl)

    print(f'Done with {subred}')

In [16]:
# Keys of the raw per-subreddit pickles stored under "submissions_subreddit/".
# A prefix listing returns every key that starts with the prefix, which
# includes the files a later cell writes to "submissions_subreddit/batch_data/".
# Those are excluded here; otherwise a re-run would feed already-extracted
# data back into the extraction step.
subreddits_pkl = [
    object_summary.key
    for object_summary in s3_bucket.objects.filter(Prefix="submissions_subreddit/")
    if object_summary.key != 'submissions_subreddit/'
    and not object_summary.key.startswith('submissions_subreddit/batch_data/')
]

In [17]:
# Number of subreddit pickles found in the bucket.
len(subreddits_pkl)

694

# Extract text from each subreddit

In [18]:
def extract_info(sub):
    '''
    Split the body of one submission into sentences.

    Parameters
    ----------
    sub : object exposing `title` and `selftext` attributes

    Returns
    -------
    list of str
        The sentences of `sub.selftext`, or [] when the title or the body
        is empty.
    '''
    title = sub.title
    main_text = sub.selftext
    # re.split never returns an empty list (an empty string yields ['']), so
    # emptiness has to be checked on the text itself.
    if not title or not main_text:
        return []

    # Split on a whitespace character that follows '.' or '?', except right
    # after abbreviation-like tokens such as "e.g." or "Mr.".
    return re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', main_text)

In [19]:
# getOrCreate returns the already running SparkContext when this cell is
# executed again; calling the SparkContext constructor a second time in the
# same kernel raises an error.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local'))

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/13 18:55:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [20]:
def batch_extract(subset):
    '''
    Apply extract_info to every element of `subset` with Spark.

    `subset` is distributed through the notebook-level SparkContext `sc`,
    extract_info is mapped over it, and the per-element results are collected
    back into a list.
    '''
    return sc.parallelize(subset).map(extract_info).collect()

In [43]:
# Peek at the file-name part of the first stored key. The list is indexed
# directly (instead of using the loop variable `subreddit`, which only exists
# after the next cell has run) so this cell also works on a fresh kernel.
subreddits_pkl[0].split('/')[1]

'submissions_13or30.pkl'

In [44]:
# For every raw pickle: load the submissions, split them into sentences with
# Spark, and store the result under "submissions_subreddit/batch_data/" with
# the same file name.
# NOTE: pickle.loads can execute code embedded in the payload; acceptable only
# as long as the bucket holds nothing but files written by this notebook.
for ind, subreddit in enumerate(subreddits_pkl):
    raw_payload = s3_resource.Object(bucket, subreddit).get()['Body'].read()
    submissions = pickle.loads(raw_payload)
    submissions_data = batch_extract(submissions)

    subreddit_split = subreddit.split('/')[1]
    key = f'submissions_subreddit/batch_data/{subreddit_split}'
    submissions_pkl = pickle.dumps(submissions_data)
    s3_resource.Object(bucket, key).put(Body=submissions_pkl)

    # Progress line once every 100 files.
    if not ind % 100:
        print(f'Done with {ind}, {subreddit_split}')

25/05/13 19:24:25 WARN TaskSetManager: Stage 695 contains a task of very large size (2204 KiB). The maximum recommended task size is 1000 KiB.


Done with 0, submissions_13or30.pkl


25/05/13 19:24:26 WARN TaskSetManager: Stage 697 contains a task of very large size (3091 KiB). The maximum recommended task size is 1000 KiB.
25/05/13 19:24:27 WARN TaskSetManager: Stage 698 contains a task of very large size (4060 KiB). The maximum recommended task size is 1000 KiB.
25/05/13 19:24:27 WARN TaskSetManager: Stage 699 contains a task of very large size (2478 KiB). The maximum recommended task size is 1000 KiB.
25/05/13 19:24:28 WARN TaskSetManager: Stage 700 contains a task of very large size (2405 KiB). The maximum recommended task size is 1000 KiB.
25/05/13 19:24:28 WARN TaskSetManager: Stage 701 contains a task of very large size (2680 KiB). The maximum recommended task size is 1000 KiB.
25/05/13 19:24:29 WARN TaskSetManager: Stage 702 contains a task of very large size (2275 KiB). The maximum recommended task size is 1000 KiB.
25/05/13 19:24:29 WARN TaskSetManager: Stage 703 contains a task of very large size (1683 KiB). The maximum recommended task size is 1000 KiB.

Done with 100, submissions_budgetfood.pkl


25/05/13 19:25:10 WARN TaskSetManager: Stage 796 contains a task of very large size (2779 KiB). The maximum recommended task size is 1000 KiB.
25/05/13 19:25:11 WARN TaskSetManager: Stage 797 contains a task of very large size (3989 KiB). The maximum recommended task size is 1000 KiB.
25/05/13 19:25:11 WARN TaskSetManager: Stage 798 contains a task of very large size (2946 KiB). The maximum recommended task size is 1000 KiB.
25/05/13 19:25:12 WARN TaskSetManager: Stage 799 contains a task of very large size (3367 KiB). The maximum recommended task size is 1000 KiB.
25/05/13 19:25:12 WARN TaskSetManager: Stage 801 contains a task of very large size (2571 KiB). The maximum recommended task size is 1000 KiB.
25/05/13 19:25:13 WARN TaskSetManager: Stage 802 contains a task of very large size (2170 KiB). The maximum recommended task size is 1000 KiB.
25/05/13 19:25:13 WARN TaskSetManager: Stage 803 contains a task of very large size (4112 KiB). The maximum recommended task size is 1000 KiB.

Done with 200, submissions_eagles.pkl


25/05/13 19:25:53 WARN TaskSetManager: Stage 896 contains a task of very large size (1189 KiB). The maximum recommended task size is 1000 KiB.
25/05/13 19:25:53 WARN TaskSetManager: Stage 897 contains a task of very large size (3403 KiB). The maximum recommended task size is 1000 KiB.
25/05/13 19:25:54 WARN TaskSetManager: Stage 898 contains a task of very large size (2850 KiB). The maximum recommended task size is 1000 KiB.
25/05/13 19:25:54 WARN TaskSetManager: Stage 899 contains a task of very large size (2140 KiB). The maximum recommended task size is 1000 KiB.
25/05/13 19:25:55 WARN TaskSetManager: Stage 901 contains a task of very large size (1207 KiB). The maximum recommended task size is 1000 KiB.
25/05/13 19:25:55 WARN TaskSetManager: Stage 902 contains a task of very large size (2940 KiB). The maximum recommended task size is 1000 KiB.
25/05/13 19:25:56 WARN TaskSetManager: Stage 903 contains a task of very large size (1837 KiB). The maximum recommended task size is 1000 KiB.

Done with 300, submissions_hockey.pkl


25/05/13 19:26:37 WARN TaskSetManager: Stage 996 contains a task of very large size (1608 KiB). The maximum recommended task size is 1000 KiB.
25/05/13 19:26:37 WARN TaskSetManager: Stage 997 contains a task of very large size (3041 KiB). The maximum recommended task size is 1000 KiB.
25/05/13 19:26:38 WARN TaskSetManager: Stage 998 contains a task of very large size (2256 KiB). The maximum recommended task size is 1000 KiB.
25/05/13 19:26:38 WARN TaskSetManager: Stage 999 contains a task of very large size (3126 KiB). The maximum recommended task size is 1000 KiB.
25/05/13 19:26:39 WARN TaskSetManager: Stage 1000 contains a task of very large size (3700 KiB). The maximum recommended task size is 1000 KiB.
25/05/13 19:26:39 WARN TaskSetManager: Stage 1001 contains a task of very large size (3632 KiB). The maximum recommended task size is 1000 KiB.
25/05/13 19:26:40 WARN TaskSetManager: Stage 1002 contains a task of very large size (3469 KiB). The maximum recommended task size is 1000 K

Done with 400, submissions_microporn.pkl


25/05/13 19:27:24 WARN TaskSetManager: Stage 1096 contains a task of very large size (1160 KiB). The maximum recommended task size is 1000 KiB.
25/05/13 19:27:25 WARN TaskSetManager: Stage 1097 contains a task of very large size (2857 KiB). The maximum recommended task size is 1000 KiB.
25/05/13 19:27:25 WARN TaskSetManager: Stage 1098 contains a task of very large size (1392 KiB). The maximum recommended task size is 1000 KiB.
25/05/13 19:27:25 WARN TaskSetManager: Stage 1099 contains a task of very large size (4982 KiB). The maximum recommended task size is 1000 KiB.
25/05/13 19:27:26 WARN TaskSetManager: Stage 1100 contains a task of very large size (4380 KiB). The maximum recommended task size is 1000 KiB.
25/05/13 19:27:27 WARN TaskSetManager: Stage 1101 contains a task of very large size (3703 KiB). The maximum recommended task size is 1000 KiB.
25/05/13 19:27:27 WARN TaskSetManager: Stage 1102 contains a task of very large size (2534 KiB). The maximum recommended task size is 10

Done with 500, submissions_redditsings.pkl


25/05/13 19:28:07 WARN TaskSetManager: Stage 1196 contains a task of very large size (2370 KiB). The maximum recommended task size is 1000 KiB.
25/05/13 19:28:07 WARN TaskSetManager: Stage 1197 contains a task of very large size (1312 KiB). The maximum recommended task size is 1000 KiB.
25/05/13 19:28:08 WARN TaskSetManager: Stage 1198 contains a task of very large size (1256 KiB). The maximum recommended task size is 1000 KiB.
25/05/13 19:28:08 WARN TaskSetManager: Stage 1199 contains a task of very large size (2399 KiB). The maximum recommended task size is 1000 KiB.
25/05/13 19:28:09 WARN TaskSetManager: Stage 1200 contains a task of very large size (2790 KiB). The maximum recommended task size is 1000 KiB.
25/05/13 19:28:09 WARN TaskSetManager: Stage 1201 contains a task of very large size (1932 KiB). The maximum recommended task size is 1000 KiB.
25/05/13 19:28:10 WARN TaskSetManager: Stage 1202 contains a task of very large size (3872 KiB). The maximum recommended task size is 10

Done with 600, submissions_tattoo.pkl


25/05/13 19:28:57 WARN TaskSetManager: Stage 1296 contains a task of very large size (3456 KiB). The maximum recommended task size is 1000 KiB.
25/05/13 19:28:57 WARN TaskSetManager: Stage 1297 contains a task of very large size (1689 KiB). The maximum recommended task size is 1000 KiB.
25/05/13 19:28:57 WARN TaskSetManager: Stage 1298 contains a task of very large size (1352 KiB). The maximum recommended task size is 1000 KiB.
25/05/13 19:28:58 WARN TaskSetManager: Stage 1299 contains a task of very large size (2863 KiB). The maximum recommended task size is 1000 KiB.
25/05/13 19:28:58 WARN TaskSetManager: Stage 1300 contains a task of very large size (2550 KiB). The maximum recommended task size is 1000 KiB.
25/05/13 19:28:59 WARN TaskSetManager: Stage 1301 contains a task of very large size (2966 KiB). The maximum recommended task size is 1000 KiB.
25/05/13 19:28:59 WARN TaskSetManager: Stage 1302 contains a task of very large size (2516 KiB). The maximum recommended task size is 10

In [51]:
# Reload the extracted sentence lists of every subreddit and concatenate them
# into one list (one entry per submission).
submissions_data = []
for subreddit in subreddits_pkl:
    subreddit_split = subreddit.split('/')[1]
    key = f'submissions_subreddit/batch_data/{subreddit_split}'
    batch_payload = s3_resource.Object(bucket, key).get()['Body'].read()
    submissions_data.extend(pickle.loads(batch_payload))

In [55]:
# Flatten the per-submission sentence lists into one list of training texts,
# skipping submissions that produced no sentences ([] or ['']).
submissions_train_df = pd.DataFrame()
train_texts = [
    ent
    for content in submissions_data
    if content != [] and content != ['']
    for ent in content
]

In [56]:
# Total number of sentences collected for training.
len(train_texts)

1839912

In [57]:
# Store the sentences as the single 'text' column of the (so far empty) frame.
submissions_train_df['text'] = train_texts

In [58]:
# Preview the first rows of the training frame.
submissions_train_df.head()

Unnamed: 0,text
0,This post contains content not supported on ol...
1,[Click here to view the full post](https://sh....
2,I still get asked for my ID when I go to clubs.
3,My mum and dad don't get thid Charleston music...
4,I’m 31


In [59]:
# Write the training frame to the bucket as JSON.
# orient='index' serialises the frame as {row index -> {column -> value}}.
s3_path = f"s3a://{bucket}/subreddits_train_data_3.json"
submissions_train_df.to_json(s3_path, orient = 'index')