# Data Scraper

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import praw
import json
import re

In [3]:
from datetime import datetime as dt

In [4]:
import seaborn as sns
# import matplotlib.pyplot as plt

sns.set_style('whitegrid')
sns.set_context('talk')

In [5]:
DATA_FOLDER = '../data/'
FIGURE_FOLDER = '../reports/figures'
API_LOGIN_FN = '../data/config/reddit_auth.json'

In [6]:
date_time = {
    'start' : {
        'year' : 2013,
        'month' : 6,
        'day' : 1
    },
    
    'end' : {
        'year' : 2023,
        'month' : 6,
        'day' : 1
    }
}

start_time = int(dt(date_time['start']['year'], date_time['start']['month'], date_time['start']['day']).timestamp())
end_time = int(dt(date_time['end']['year'], date_time['end']['month'], date_time['end']['day']).timestamp())

In [7]:
with open(API_LOGIN_FN, 'r') as f:
    login_info = json.load(f)
    
# Check to ensure that login_info contains the necessary information.
assert('client_id' in login_info.keys())
assert('client_secret' in login_info.keys())
assert('user_agent' in login_info.keys())
assert('username' in login_info.keys())
assert('password' in login_info.keys())

In [8]:
reddit = praw.Reddit(**login_info, ratelimit_seconds=60)

Version 7.7.0 of praw is outdated. Version 7.7.1 was released Tuesday July 11, 2023.


In [9]:
reddit.read_only = True

In [10]:
print(reddit.user.me())

None


In [11]:
# Define a list of queries
queries = ['semaglutide' ,'ozempic', 'wegovy', 'rybelsus', 'liraglutide',
           'saxenda', 'dulaglutide', 'trulicity', 'tirzepatide', 'mounjaro',
           'victoza', 'bydureon', 'byetta', 'GLP-1RA','GLP-1', 'GLP1', 'GLP1RA', 
           'exenatide', 'bydureon bcise', 'orforglipron',
          'lixisenatide', 'adlyxin', 'retatrutide', 'ly3437943']
# queries = queries.sort()
# queries = ['ozempic']

num_subreddits = 0
subreddits_by_query = {}
# subscribers_threshold = 1000000
subscribers_threshold = 0

for query in queries:
    subreddits = reddit.subreddits.search(query)
    filtered_subreddits = []
    for subreddit in subreddits:
#         print(subreddit.submission_count)
        if subreddit.subscribers is None:
            continue
        elif subreddit.subscribers > subscribers_threshold:
            num_subreddits += 1
            filtered_subreddits.append(subreddit)
    subreddits_by_query[query] = filtered_subreddits
#     print(query, ": ", len(filtered_subreddits))
#     print(len(filtered_subreddits))

In [12]:
# print(len(queries))

In [13]:
# subreddits_by_query

In [14]:
import csv
with open(DATA_FOLDER +  'subreddits_by_query.csv', 'w') as csv_file:  
    writer = csv.writer(csv_file)
    for key, value in subreddits_by_query.items():
       writer.writerow([key, value])

In [15]:
final_subreddits = set()

In [16]:
for key, value in subreddits_by_query.items():
#     print(len(value))
#     if len(value) < 10:
#         continue
#     print(value)
    final_subreddits.update(value)

In [17]:
# final_subreddits

In [18]:
# len(final_subreddits)

In [19]:
POST_DF_MAPPING = {
    'id' : 'id',
    'author' : 'author',
    'title' : 'title',
    'date' : 'created_utc',
    'body' : 'selftext',
    'n_comments' : 'num_comments',
    'upvotes' : 'score',
}

COMM_DF_MAPPING = {
    'id' : 'id',
    'author' : 'author',
    'title' : 'title',
    'body' : 'body',
    'date' : 'created_utc',
    'link_id' : 'link_id',
    'parent_id' : 'parent_id',
    'upvotes' : 'score'
}

PSAW_MAPPINGS = {
    'post' : POST_DF_MAPPING,
    'comment' : COMM_DF_MAPPING
}

In [20]:
def api_result_conversion(res, type_of_res):
    """
    Convert the API result from PSAW into a pandas Series.    
    """
    
    mapping = PSAW_MAPPINGS[type_of_res]
    
    out = []
    
    for col, col_map in mapping.items():
        try:
            out.append(getattr(res, col_map))
        except:
            out.append('')
        
    return out

In [21]:
def scrape_reddit_posts(subreddits, search_words, save_name):
    """Scrapes Reddit for posts matching a set of search words in a set of subreddits.
    """

    posts = []
    cols = ['subreddit', 'query']
    cols.extend(POST_DF_MAPPING.keys())

    for subreddit in subreddits:
        for search_word in search_words:
            print("Now searching r/{0} for {1}".format(subreddit, search_word))

            for post in subreddit.search(search_word, limit=None):

                if start_time <= post.created_utc <= end_time:
                    # print(comment.body)
                    post_ext = [subreddit, search_word]
                    post_ext.extend(api_result_conversion(post, 'post'))
                    posts.append(post_ext)

            print("Completed search. Total {0} entries!".format(len(posts)))
                             
    post_df = pd.DataFrame(posts, columns=cols)
    post_df.set_index('id')

    post_df.to_csv(DATA_FOLDER + save_name)
    
    return post_df

In [22]:
# list(subreddits_by_query.values())[0]

In [23]:
# import time

# start_time = time.time()

# posts = scrape_reddit_posts(final_subreddits, subreddits_by_query.keys(), 'raw/posts.csv')

# end_time = time.time()
# elapsed_time = end_time - start_time

# print(elapsed_time)

In [24]:
def scrape_reddit_comments_in_post(posts, save_name):
    """Scrapes Reddit for posts matching a set of search words in a set of subreddits.
    """
    
#     import time

#     start_time = time.time()

    cols = ['subreddit', 'query']
    cols.extend(COMM_DF_MAPPING.keys())

    comments = []
    post_ids = posts.id
    post_queries = posts['query']
    post_subreddits = posts['subreddit']
    idx = 0
    for post_id in post_ids:
        # print("Now searching post <{0}>".format(post_id))

        submission = reddit.submission(post_id)
        submission.comments.replace_more(limit=None)

        for comment in submission.comments.list():
            query_idx = comment.body.find(post_queries[idx])
            if query_idx == -1:
                continue
            # print(comment.body)
            comment_ext = [post_subreddits[idx], post_queries[idx]]
            comment_row = api_result_conversion(comment, 'comment')
            comment_ext.extend(comment_row)
            comments.append(comment_ext)

        # print("Completed search. Total {0} comments!".format(len(comments)))
        idx += 1
        print(idx, end='\r')

    comm_df = pd.DataFrame(comments, columns=cols)
    comm_df.set_index('id')

    comm_df.to_csv(DATA_FOLDER + save_name)
    
#     end_time = time.time()
#     elapsed_time = end_time - start_time

#     print(elapsed_time)

    return comm_df

In [41]:
posts = pd.read_csv(DATA_FOLDER + 'raw/posts.csv')

In [42]:
posts.shape

(14391, 10)

In [43]:
posts.head()

Unnamed: 0.1,Unnamed: 0,subreddit,query,id,author,title,date,body,n_comments,upvotes
0,0,trt,semaglutide,11n6wmk,Zellenial,Trt works if you work it.. started 1.5 years a...,1678403000.0,,77,24
1,1,trt,semaglutide,116ihmh,7856970,Which TRT clinics offer semaglutide?,1676827000.0,,13,1
2,2,trt,semaglutide,10rz4pf,Bud1985,Semaglutide,1675368000.0,Anyone in here have experience taking Semaglut...,17,4
3,3,trt,semaglutide,wc4u7i,Hormonesforme-com,Semaglutide is a revolutionary fat loss medica...,1659208000.0,\n\nSemaglutide is a revolutionary new weigh...,0,2
4,4,trt,semaglutide,13epai9,Hey_its_Jack,Doctors office changed criteria for 'normal le...,1683816000.0,"37y/o male, 250lbs\n\nA few months ago, I got ...",66,10


In [44]:
# start_time = time.time()

comments = scrape_reddit_comments_in_post(posts, 'raw/comments.csv')

# end_time = time.time()
# elapsed_time = end_time - start_time

# print(elapsed_time)

570

TooManyRequests: received 429 HTTP response

In [None]:
comments = pd.read_csv(DATA_FOLDER + 'raw/comments.csv')

In [None]:
comments.shape

In [None]:
comments.head()

In [45]:
posts['date'] = posts['date'].apply(lambda x: dt.fromtimestamp(x))
comments['date'] = comments['date'].apply(lambda x: dt.fromtimestamp(x))

In [46]:
search_words_regex = re.compile("|".join(subreddits_by_query.keys()))

matching_rows = posts['body'].map(lambda x: True if search_words_regex.search(str(x)) else False) + \
    posts['title'].map(lambda x: True if search_words_regex.search(x) else False)

In [47]:
posts['content'] = posts['title'] + '. ' + posts['body']
comments['content'] = comments['body']

posts['type'] = 'post'
comments['type'] = 'comment'

In [48]:
posts.head()

Unnamed: 0.1,Unnamed: 0,subreddit,query,id,author,title,date,body,n_comments,upvotes,content,type
0,0,trt,semaglutide,11n6wmk,Zellenial,Trt works if you work it.. started 1.5 years a...,2023-03-09 18:01:51,,77,24,,post
1,1,trt,semaglutide,116ihmh,7856970,Which TRT clinics offer semaglutide?,2023-02-19 12:22:42,,13,1,,post
2,2,trt,semaglutide,10rz4pf,Bud1985,Semaglutide,2023-02-02 14:56:14,Anyone in here have experience taking Semaglut...,17,4,Semaglutide. Anyone in here have experience ta...,post
3,3,trt,semaglutide,wc4u7i,Hormonesforme-com,Semaglutide is a revolutionary fat loss medica...,2022-07-30 15:05:59,\n\nSemaglutide is a revolutionary new weigh...,0,2,Semaglutide is a revolutionary fat loss medica...,post
4,4,trt,semaglutide,13epai9,Hey_its_Jack,Doctors office changed criteria for 'normal le...,2023-05-11 10:38:49,"37y/o male, 250lbs\n\nA few months ago, I got ...",66,10,Doctors office changed criteria for 'normal le...,post


In [49]:
comments.head()

In [50]:
cols_to_keep = ['query', 'subreddit', 'type', 'author', 'content', 'date', 'upvotes', 'n_comments']
pc = pd.concat((posts[cols_to_keep], comments[cols_to_keep]))
# pc = posts[cols_to_keep]

In [51]:
pc['len_text'] = pc['content'].apply(lambda x : len(str(x).split()))

In [52]:
pc.head()

Unnamed: 0,query,subreddit,type,author,content,date,upvotes,n_comments,len_text
0,semaglutide,trt,post,Zellenial,,2023-03-09 18:01:51,24,77,1
1,semaglutide,trt,post,7856970,,2023-02-19 12:22:42,1,13,1
2,semaglutide,trt,post,Bud1985,Semaglutide. Anyone in here have experience ta...,2023-02-02 14:56:14,4,17,35
3,semaglutide,trt,post,Hormonesforme-com,Semaglutide is a revolutionary fat loss medica...,2022-07-30 15:05:59,2,0,523
4,semaglutide,trt,post,Hey_its_Jack,Doctors office changed criteria for 'normal le...,2023-05-11 10:38:49,10,66,154


In [53]:
print(pc.shape)

(14391, 9)


In [54]:
# Fill empty cells and remove some weird html tags
pc['content'].fillna("", inplace=True)
pc.content = pc.content.str.replace("http\S+", "")
pc.content = pc.content.str.replace("\\n", " ")
pc.content = pc.content.str.replace("&gt;", "")
pc.content = pc.content.str.replace("\\", "")

In [55]:
pc.head()

Unnamed: 0,query,subreddit,type,author,content,date,upvotes,n_comments,len_text
0,semaglutide,trt,post,Zellenial,,2023-03-09 18:01:51,24,77,1
1,semaglutide,trt,post,7856970,,2023-02-19 12:22:42,1,13,1
2,semaglutide,trt,post,Bud1985,Semaglutide. Anyone in here have experience ta...,2023-02-02 14:56:14,4,17,35
3,semaglutide,trt,post,Hormonesforme-com,Semaglutide is a revolutionary fat loss medica...,2022-07-30 15:05:59,2,0,523
4,semaglutide,trt,post,Hey_its_Jack,Doctors office changed criteria for 'normal le...,2023-05-11 10:38:49,10,66,154


In [56]:
print(pc.shape)

(14391, 9)


In [57]:
pc.to_csv(DATA_FOLDER + "/processed/posts_and_comments.csv")