# In [None]:
import pickle
import praw
import re

# In [None]:
def getSubredditID(url):
    """Extract the subreddit name from a www.reddit.com URL.

    Only the path segment directly after ``/r/`` is returned, so both a
    subreddit front page (``.../r/name/``) and a thread URL
    (``.../r/name/comments/<id>/<slug>/``) yield ``name``. A trailing slash
    is not required.

    Args:
        url: URL (or URL fragment) containing ``www.reddit.com/r/<name>``.

    Returns:
        The subreddit name, or the sentinel string
        ``'Unable to identify subreddit ID.'`` when the URL does not match.
    """
    # [^/?#]+ stops at the first path separator, query string or fragment;
    # the previous greedy ``.+`` swallowed the whole remaining path.
    match = re.search(r'www\.reddit\.com/r/([^/?#]+)', url)
    if match:
        return match.group(1)
    # Returning a sentinel string instead of raising keeps the contract
    # existing callers rely on.
    return 'Unable to identify subreddit ID.'

#Given a reddit URL, scrape the "hot" threads of the subreddit that the URL belongs to.
#Returns one dict per thread holding its title, metadata (author, URL, post time, scores, etc.) and its top-level comments.
def _collectTopLevelComments(reddit, submissionId, sortOrder, commentDepth):
    """Return up to ``commentDepth`` top-level comments of one thread.

    A fresh submission object is requested on every call: PRAW only honours
    ``comment_sort`` when it is set before the comments are first fetched, so
    re-using an already-loaded submission would return the old ordering.

    Args:
        reddit: Authenticated PRAW ``Reddit`` instance.
        submissionId: ID of the submission whose comments are wanted.
        sortOrder: Comment sort name understood by reddit (e.g. 'best', 'new').
        commentDepth: Maximum number of top-level comments to return.

    Returns:
        List of ``(body, score, created_utc, author)`` tuples.
    """
    commentSubmission = reddit.submission(id=submissionId)
    commentSubmission.comment_sort = sortOrder
    # Drop "load more comments" placeholders; they carry no body/score and
    # would otherwise raise AttributeError and lose the whole thread.
    commentSubmission.comments.replace_more(limit=0)

    collected = []
    for index, comment in enumerate(commentSubmission.comments):
        if index >= commentDepth:
            break
        collected.append(
            (comment.body, comment.score, comment.created_utc, comment.author)
        )
    return collected


def scrapeRedditPage(url, threadDepth=9999, commentDepth=9999):
    """Scrape the "hot" threads of the subreddit that ``url`` points into.

    Args:
        url: A reddit URL; the subreddit name is taken from it with
            ``getSubredditID``.
        threadDepth: Maximum number of hot threads requested from reddit.
        commentDepth: Maximum number of top-level comments kept per thread
            and per sort order.

    Returns:
        A list with one dict per successfully scraped thread. Keys: 'title',
        'url', 'timeStamp', 'author', 'ups', 'likes', 'downs', 'id',
        'numberComments', 'score', 'text', 'bestComments', 'newestComments'.
        The two comment lists hold ``(body, score, created_utc, author)``
        tuples. Threads that raise while being read are reported on stdout
        and skipped.
    """
    #Log onto reddit using praw
    reddit = loginToReddit()

    threadInformation = []
    for submission in reddit.subreddit(getSubredditID(url)).hot(limit=threadDepth):
        try:
            submissionInformation = {
                'title': submission.title,  #Title of the thread
                'url': submission.url,  #URL the thread points to, inside or outside reddit
                'timeStamp': submission.created_utc,  #Time the thread was created
                'author': submission.author,  #The author of the thread
                'ups': submission.ups,  #The number of "ups" for the thread
                # NOTE(review): per the reddit API this looks like the logged-in
                # user's own vote (True/False/None), not a count - confirm.
                'likes': submission.likes,
                'downs': submission.downs,  #The number of "downs" for the thread
                'id': submission.id,  #The ID used to access the comments page
                'numberComments': submission.num_comments,  #Number of comments in the thread
                'score': submission.score,  #The reddit score of the thread
                'text': submission.selftext,  #The text in the body of the thread post
            }
            #Top-level comments ordered best to worst
            submissionInformation['bestComments'] = _collectTopLevelComments(
                reddit, submissionInformation['id'], 'best', commentDepth)
            #Top-level comments ordered newest to oldest
            submissionInformation['newestComments'] = _collectTopLevelComments(
                reddit, submissionInformation['id'], 'new', commentDepth)
            threadInformation.append(submissionInformation)
        except Exception as error:
            # Best effort per thread: one bad submission must not abort the
            # scrape, but KeyboardInterrupt/SystemExit now propagate and the
            # cause is shown instead of being silently discarded.
            print("Error, skipped a submission thread.", repr(error))
    return threadInformation


def _loadPickle(path):
    """Return the object pickled at ``path``, closing the file afterwards."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


def _dumpPickle(value, path):
    """Pickle ``value`` to ``path``, closing the file afterwards."""
    with open(path, "wb") as handle:
        pickle.dump(value, handle)


def getAllTextFromRedditPage(redditName, maxThreads=999999, maxComments=99999):
    """Collect thread titles and best comments of a subreddit, with caching.

    Results are cached in three pickle files in the current working
    directory, named after ``redditName``, ``maxThreads`` and ``maxComments``.
    When all three can be opened they are loaded; otherwise the subreddit is
    scraped with ``scrapeRedditPage`` and the files are (re)written.

    Args:
        redditName: Subreddit name as it appears after ``/r/`` in the URL.
        maxThreads: Maximum number of threads to scrape.
        maxComments: Maximum number of top-level comments per thread.

    Returns:
        Tuple ``(redditAllText, sentenceInformation)``:
        ``redditAllText`` is every title and best-comment body concatenated
        with no separator. ``sentenceInformation`` maps each title to
        ``(url, threadAuthor, sentiment)`` and each comment body to
        ``(url, threadAuthor, sentiment, commentAuthor, threadTitle)``, where
        ``sentiment`` is whatever ``getSentencesSentiment`` returns. Identical
        texts share one key, so later entries overwrite earlier ones.
    """
    identifierString = redditName + "_" + str(maxThreads) + "_" + str(maxComments)
    allTextPath = "redditSearchAllText" + identifierString + ".p"
    urlsPath = "redditSearchUrls" + identifierString + ".p"
    sentencePath = "sentenceInformation" + identifierString + ".p"

    try:
        print("Attempting to load redditSearchAllText"+identifierString+".p variable from previous calculation.")
        # SECURITY NOTE: unpickling executes arbitrary code; only use cache
        # files this program wrote itself.
        redditAllText = _loadPickle(allTextPath)
        # The URL list is not returned; it is loaded so that a missing URL
        # cache file still triggers a full recalculation.
        _loadPickle(urlsPath)
        sentenceInformation = _loadPickle(sentencePath)
        print("Load successful.")
    except (OSError, IOError):
        print("Load un-successful. Calculating.")

        redditUrls = []
        textParts = []  # joined once at the end instead of repeated str +
        sentenceInformation = {}
        print("Getting reddit threads information.")
        threadInformation = scrapeRedditPage("https://www.reddit.com/r/"+redditName+"/", threadDepth=maxThreads, commentDepth=maxComments)
        print("Finished getting reddit threads information.")
        for thread in threadInformation:

            #Add Thread information
            url = thread["url"]
            threadAuthor = thread["author"]
            title = thread["title"]
            redditUrls.append(url)
            textParts.append(title)
            sentenceInformation[title] = (url, threadAuthor, getSentencesSentiment([title]))
            print("Found thread at: ", url, " by author: ", threadAuthor)

            #Add all comments; each entry is (body, score, created_utc, author)
            for commentText, _score, _createdUtc, commentAuthor in thread["bestComments"]:
                textParts.append(commentText)
                sentenceInformation[commentText] = (url, threadAuthor, getSentencesSentiment([commentText]), commentAuthor, title)
                print("Found comment at: ", url, " by author: ", commentAuthor)
            print("Size of sentence information: ", len(sentenceInformation))

        redditAllText = "".join(textParts)

        print("Writing calculated redditSearch"+identifierString+".p variable to pickle file.")

        _dumpPickle(redditAllText, allTextPath)
        _dumpPickle(redditUrls, urlsPath)
        _dumpPickle(sentenceInformation, sentencePath)

    return (redditAllText, sentenceInformation)
