In [1]:
import requests
import pandas as pd
import re
from psaw import PushshiftAPI
api = PushshiftAPI()
from datetime import datetime
import json
import pysolr
from tqdm import tqdm

In [2]:
# Subreddits passed to the topic/keyword searches.
# Every entry needs its own comma: adjacent string literals are silently
# concatenated by Python, which would merge two names into one bogus subreddit.
subreddits = ['science','worldnews','news','food','askscience','space','mildlyinteresting','history','UpliftingNews',
             'philosophy','technology','politics']
def getSubmissionsBySubReddit(subreddit,limit):
    """Search one subreddit for submissions and return them in the final schema.

    Args:
        subreddit: Name of the subreddit; it is wrapped in a single-element
            list before being handed to ``api.search_submissions``.
        limit: Forwarded unchanged as the search ``limit``.

    Returns:
        The result of ``convertSubmissionsTofinalSchema`` applied to the
        fetched rows, with ``topic`` set to None.
    """
    wanted_fields = ['id','url','author', 'title', 'score',
                     'subreddit','selftext','num_comments']
    hits = api.search_submissions(subreddit=[subreddit],
                                  filter=wanted_fields,
                                  limit=limit,
                                  num_comments=">10")
    # Each search result exposes its raw fields as a dict in `.d_`.
    records = [hit.d_ for hit in hits]
    # Missing values become empty strings before the schema conversion.
    frame = pd.DataFrame.from_dict(records).fillna("")
    return convertSubmissionsTofinalSchema(frame,topic=None)

def getSubmissionsBySubRedditsAndTopic(subreddits,topic,limit,keyword):
    """Search several subreddits for a keyword and label the hits with a topic.

    Args:
        subreddits: Passed unchanged as the ``subreddit`` argument of
            ``api.search_submissions``.
        topic: Label written into the ``topic`` column of the result.
        limit: Forwarded unchanged as the search ``limit``.
        keyword: Forwarded as the search query ``q``.

    Returns:
        The result of ``convertSubmissionsTofinalSchema`` applied to the
        fetched rows, with ``topic`` set to the given label.
    """
    wanted_fields = ['id','url','author', 'title', 'score',
                     'subreddit','selftext','num_comments']
    hits = api.search_submissions(subreddit=subreddits,
                                  filter=wanted_fields,
                                  limit=limit,
                                  num_comments=">10",
                                  q=keyword)
    # Each search result exposes its raw fields as a dict in `.d_`.
    records = [hit.d_ for hit in hits]
    # Missing values become empty strings before the schema conversion.
    frame = pd.DataFrame.from_dict(records).fillna("")
    return convertSubmissionsTofinalSchema(frame,topic=topic)

def convertSubmissionsTofinalSchema(submission_df,topic):
    """Filter raw submission rows and reshape them into the final schema.

    Rows whose ``selftext`` is ``'[removed]'``, ``'[deleted]'`` or the empty
    string are dropped. ``url`` is renamed to ``full_link`` and ``score`` to
    ``upvotes``; ``body`` is a copy of ``selftext``; ``is_submission`` is True;
    ``parent_id`` and ``parent_body`` are None.

    The input frame is not modified. Source columns that are absent from the
    input (for example when a search returned no rows at all) are treated as
    empty strings, so an empty input yields an empty frame with the final
    columns instead of raising.

    Args:
        submission_df: DataFrame of raw submission fields.
        topic: Value written to the ``topic`` column of every row (may be None).

    Returns:
        A new DataFrame with the columns id, subreddit, full_link, title, body,
        selftext, author, is_submission, parent_id, parent_body, topic,
        created_utc, upvotes.
    """
    source_columns = ['id','subreddit','url','title','selftext','author','created_utc','score']
    final_columns = ['id','subreddit','full_link','title','body','selftext','author','is_submission','parent_id'
                          ,'parent_body','topic','created_utc','upvotes']

    # reindex returns a new frame; fill_value only applies to columns that had
    # to be created, existing values (including NaN) are left untouched.
    raw = submission_df.reindex(columns=source_columns, fill_value="")

    has_text = ~raw['selftext'].isin(['[removed]','[deleted]',''])
    kept = raw[has_text]

    # rename/assign build a fresh frame, so nothing is written onto a slice of
    # the caller's data (the source of pandas' SettingWithCopyWarning).
    res = kept.rename(columns={'url':'full_link','score':'upvotes'}).assign(
        is_submission=True,
        body=kept['selftext'],
        parent_id=None,
        parent_body=None,
        topic=topic,
    )
    return res[final_columns]

def convertCommentsToFinalSchema(comments_df,topic=None):
    """Filter raw comment rows and reshape them into the shared schema.

    Rows whose ``body`` is ``'[removed]'``, ``'[deleted]'`` or the empty string
    are dropped. ``permalink`` is renamed to ``full_link``; ``is_submission``
    is False; ``parent_body``, ``selftext`` and ``title`` are None. For string
    ``parent_id`` values containing an underscore, the segment after the first
    underscore is kept (``'t1_abc'`` -> ``'abc'``); values without an
    underscore, and non-string values, are passed through unchanged instead of
    raising.

    The input frame is not modified. Source columns that are absent from the
    input are treated as empty strings, so an empty input yields an empty frame
    with the output columns.

    Args:
        comments_df: DataFrame of raw comment fields.
        topic: Value written to the ``topic`` column of every row.

    Returns:
        A new DataFrame with the columns id, subreddit, full_link, title, body,
        selftext, author, is_submission, parent_id, parent_body, topic,
        created_utc, score. Note the vote column is still named ``score`` here.
    """
    source_columns = ['id','subreddit','permalink','body','author','parent_id','created_utc','score']
    final_columns = ['id','subreddit','full_link','title','body','selftext','author','is_submission','parent_id'
                          ,'parent_body','topic','created_utc','score']

    def strip_prefix(parent_id):
        # Only strings can carry a "<kind>_" prefix; leave anything else alone.
        if not isinstance(parent_id, str):
            return parent_id
        parts = parent_id.split('_')
        return parts[1] if len(parts) > 1 else parent_id

    # reindex returns a new frame; fill_value only applies to columns that had
    # to be created, existing values are left untouched.
    raw = comments_df.reindex(columns=source_columns, fill_value="")

    has_text = ~raw['body'].isin(['[removed]','[deleted]',''])
    kept = raw[has_text]

    # rename/assign build a fresh frame rather than writing onto a slice.
    res = kept.rename(columns={'permalink':'full_link'}).assign(
        is_submission=False,
        topic=topic,
        parent_body=None,
        selftext=None,
        title=None,
        parent_id=kept['parent_id'].apply(strip_prefix),
    )
    return res[final_columns]

def fetchCommentsForSubmission(submission):
    """Fetch the comments of one submission and return them in the final schema.

    Args:
        submission: DataFrame whose first row supplies ``id``, ``subreddit``
            and ``topic`` (each is read with ``.iloc[0]``).

    Returns:
        The result of ``convertCommentsToFinalSchema`` for the fetched
        comments, labelled with the submission's topic. An empty DataFrame is
        returned when the search yields nothing or when the fetched rows have
        no ``permalink`` column (in which case 'no permalink' is printed).
    """
    submission_id = submission["id"].iloc[0]
    subreddit_name = submission["subreddit"].iloc[0]
    comment_fields = ['id','parent_id','permalink','author', 'title',
                      'subreddit','body','num_comments','score']

    hits = api.search_comments(link_id=submission_id, subreddit=[subreddit_name],
                               filter=comment_fields)
    # Each search result exposes its raw fields as a dict in `.d_`.
    records = [hit.d_ for hit in hits]

    if not records:
        return pd.DataFrame()

    comments_df = pd.DataFrame.from_dict(records).fillna("")

    if 'permalink' not in comments_df.columns:
        print('no permalink')
        return pd.DataFrame()

    return convertCommentsToFinalSchema(comments_df,topic=submission["topic"].iloc[0])

def populateComments(submission):
    """Return a submission together with its comments, each row carrying its parent's body.

    The comments are obtained from ``fetchCommentsForSubmission``. When that
    returns an empty frame, a copy of ``submission`` is returned unchanged.
    Otherwise the submission and comment rows are stacked, and every row whose
    ``parent_id`` equals the ``id`` of another row in the stack gets that row's
    ``body`` as ``parent_body`` (rows with no match get a missing value).

    Votes are unified into ``upvotes``: the submission's own ``upvotes`` value
    is kept and comment rows take their ``score``. Previously only the comment
    ``score`` survived the merge, leaving the submission row with NaN upvotes.

    Args:
        submission: DataFrame in the final schema describing the submission.

    Returns:
        DataFrame with the columns id, subreddit, full_link, title, body,
        selftext, author, is_submission, parent_id, parent_body, topic,
        created_utc, upvotes. In the merged case it has a fresh integer index.
    """
    final_columns = ['id','subreddit','full_link','title','body','selftext','author','is_submission','parent_id'
                          ,'parent_body','topic','created_utc','upvotes']

    comments = fetchCommentsForSubmission(submission)
    if comments.size == 0:
        return pd.concat([submission])

    # ignore_index gives each stacked row a unique label, so the column
    # arithmetic below lines up row by row.
    res = pd.concat([submission,comments], ignore_index=True)

    # Submissions carry votes in 'upvotes', comments in 'score'; fold both into
    # 'upvotes' before merging so neither side is lost.
    if 'score' in res.columns:
        if 'upvotes' in res.columns:
            res['upvotes'] = res['upvotes'].fillna(res['score'])
        else:
            res['upvotes'] = res['score']
        res = res.drop(columns=['score'])

    # One row per id, so a repeated id cannot multiply the child rows in the join.
    parent_lookup = (res[['id','body']]
                     .drop_duplicates(subset='id')
                     .rename(columns={'id':'_parent_key','body':'parent_body'}))

    merged = (res.drop(columns=['parent_body'], errors='ignore')
                 .merge(parent_lookup, left_on='parent_id', right_on='_parent_key', how='left'))

    return merged[final_columns]

In [3]:
# (topic, keyword) pairs; each keyword is searched across `subreddits`
# with a limit of 120 and the hits are labelled with the topic.
topic_keywords = [
    ('Politics', 'Democrats'),
    ('Politics', 'Republican'),
    ('Environment', 'Environment'),
    ('Environment', 'Climate change'),
    ('Environment', 'Solar energy'),
    ('Technology', 'Programming'),
    ('Technology', 'Computer'),
    ('Technology', 'Blockchain'),
    ('Healthcare', 'Healthcare'),
    ('Healthcare', 'Covid'),
    ('Education', 'Education'),
    ('Education', 'Learning'),
    ('Education', 'School'),
]
# Subreddits fetched on their own (limit 150, no topic label).
untagged_subreddits = ['ExplainLikeImFive', 'FoodForThought', 'ChangeMyView', 'TodayILearned']

submission_frames = [
    getSubmissionsBySubRedditsAndTopic(subreddits,topic=topic_name,limit=120,keyword=search_term)
    for topic_name, search_term in topic_keywords
]
submission_frames.extend(
    getSubmissionsBySubReddit(subreddit_name,150) for subreddit_name in untagged_subreddits
)
submission_df = pd.concat(submission_frames)
print(submission_df.shape)

In [4]:
%%time
# Collect one frame per submission and concatenate once at the end:
# calling pd.concat inside the loop re-copies everything gathered so far
# on every iteration, which is quadratic in the number of rows.
comment_frames = []
for i in tqdm(range(0,submission_df.shape[0])):
    # Double brackets keep the row as a one-row DataFrame rather than a Series.
    submission = submission_df.iloc[[i]]
    final = populateComments(submission)
    comment_frames.append(final)
# pd.concat raises on an empty list, so fall back to an empty frame.
submissionAndComment = pd.concat(comment_frames) if comment_frames else pd.DataFrame()

In [None]:
# Replace missing values in the 'upvotes' column with 1.
submissionAndComment['upvotes'] = submissionAndComment['upvotes'].fillna(value=1)

In [None]:
# Write the combined frame to CSV.
# NOTE(review): the destination is a hard-coded absolute Windows path.
submissionAndComment.to_csv(path_or_buf='D:\\scrape2.csv')

In [None]:
# Quick checks on the combined frame: the distinct topic labels, the dtype of
# the 'upvotes' column, and whether any missing upvotes remain.
print(submissionAndComment['topic'].unique())
print(submissionAndComment['upvotes'].dtype)
print(submissionAndComment['upvotes'].isna().unique())