Topic: Project 4   
Subject: Getting and Cleaning Reddit Comments by Subreddit  
Date: 11/10/2017   
Name: Zach Heick   

**Summary**: For each subreddit, I cleaned the comments and recorded metadata for modeling.

In [1]:
import os
import pickle
import re
from urllib.parse import quote_plus

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import porter
from pymongo import MongoClient
from textblob import TextBlob

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Target of a markdown link, e.g. the "(https://example.com)" part of "[text](https://example.com)".
_MARKDOWN_LINK_RE = re.compile(r'\(\s*(http[s]?://[^)]+)\s*\)')
# Any remaining bare URL: "http" followed by a run of non-whitespace characters.
_BARE_URL_RE = re.compile(r'http\S+', flags=re.MULTILINE)
# Raw comment bodies that carry no content and are skipped outright.
_IGNORED_COMMENTS = ('.', '[deleted]', '[removed]')


def _strip_markdown_and_links(raw_comment):
    """Return raw_comment with markdown syntax, hyperlinks and newlines removed."""
    no_markdown = _MARKDOWN_LINK_RE.sub(' ', raw_comment) \
        .replace('*', '') \
        .replace('&nbsp;', '') \
        .replace('[', ' ').replace(']', ' ') \
        .replace('(', ' ').replace(')', ' ')
    return _BARE_URL_RE.sub(' ', no_markdown).replace('\n', ' ')


def get_clean_comments(subreddit):
    """
    Connects to a subreddit collection in a MongoDB hosted on AWS.
    For each comment:
        -markdown syntax and hyperlinks are removed
        -metadata for each comment is recorded
        -words are lemmatized and stop words are removed

    A comment is skipped when it contains '>', is exactly '.', '[deleted]' or
    '[removed]', is blank after cleaning, or contains 'I am a bot'.

    The MongoDB user name and password are read from the MONGO_U and MONGO_P
    environment variables.

    :param subreddit: name of the subreddit
    :returns: list of dictionaries for each comment, with the keys
        max_subjectivity, min_subjectivity, max_polarity, min_polarity,
        overall_polarity, words_count, char_count, time, score and comment
    :raises KeyError: if MONGO_U or MONGO_P is not set in the environment
    """

    # Percent-escape the credentials so reserved characters such as '@', ':'
    # or '/' in a user name or password cannot corrupt the connection URI.
    mongo_user = quote_plus(os.environ['MONGO_U'])
    mongo_password = quote_plus(os.environ['MONGO_P'])

    # Connect to MongoDB and get raw comments; always release the connection,
    # even when the query fails part-way through.
    client = MongoClient('mongodb://' + mongo_user + ':' + mongo_password + '@13.59.55.238/reddit_comments_db')
    try:
        db = client['reddit_comments_db']
        collection = db['{}_comments_collection'.format(subreddit)]

        raw_comments = []
        for document in collection.find():
            raw_comments.extend(document['comments'])
    finally:
        client.close()

    # A set gives constant-time membership tests in the per-word loop below.
    # NOTE(review): no language is passed to stopwords.words(), so the list is
    # not restricted to English - confirm this is intended.
    stop_words = set(stopwords.words())
    lemmatizer = WordNetLemmatizer()

    comments = []

    for comment_tuple in raw_comments:
        raw_comment = comment_tuple[0]
        score = comment_tuple[1]
        created = comment_tuple[2]

        # Do not include quoted, deleted or removed comments.
        if '>' in raw_comment or raw_comment in _IGNORED_COMMENTS:
            continue

        cleaned_comment = _strip_markdown_and_links(raw_comment)

        # Nothing left after cleaning, or a self-identified bot comment.
        if cleaned_comment.strip() == '' or 'I am a bot' in cleaned_comment:
            continue

        # Record the polarity and subjectivity of each sentence in the comment.
        # Both lists start with 0, so the min/max below always include 0.
        subjectivities = [0]
        polarities = [0]
        for sentence in re.split('[?:!.]', cleaned_comment):
            # Empty fragments (e.g. after trailing punctuation) carry no
            # sentiment, so skip them before paying for the analysis.
            if sentence == '':
                continue
            sentence_sentiment = TextBlob(sentence).sentiment
            subjectivities.append(float('{0:.4f}'.format(sentence_sentiment.subjectivity)))
            polarities.append(float('{0:.4f}'.format(sentence_sentiment.polarity)))

        # Comment metadata
        overall_sentiment = TextBlob(cleaned_comment).sentiment
        words = cleaned_comment.split()

        comment_d = {}
        comment_d['max_subjectivity'] = max(subjectivities)
        comment_d['min_subjectivity'] = min(subjectivities)
        comment_d['max_polarity'] = max(polarities)
        comment_d['min_polarity'] = min(polarities)
        comment_d['overall_polarity'] = float('{0:.4f}'.format(overall_sentiment.polarity))
        comment_d['words_count'] = len(words)
        comment_d['char_count'] = len(cleaned_comment.strip())
        comment_d['time'] = float('{0:.2f}'.format(created))
        comment_d['score'] = score

        # Lemmatize the words and remove stopwords from the comment. The
        # comparison is case-insensitive so capitalised stop words
        # ("It", "The") are removed as well; kept words retain their case.
        cleaned_words = []
        for word in words:
            lemm_word = lemmatizer.lemmatize(word)
            if lemm_word.lower() not in stop_words:
                cleaned_words.append(lemm_word)
        comment_d['comment'] = ' '.join(cleaned_words)

        comments.append(comment_d)

    return comments

In [3]:
# Subreddits whose comments are fetched, cleaned and pickled below.
subreddits = [
    'politics',
    'atheism',
    'hiphopheads',
    'science',
    'worldnews',
]

In [4]:
# Clean every subreddit's comments and write them to '<subreddit>_comments.pickle'.
for subreddit in subreddits:
    # Fetch and clean first: opening the file in 'wb' mode truncates it, so a
    # failed fetch must not leave an empty pickle or wipe out an earlier one.
    comments = get_clean_comments(subreddit)
    with open('{}_comments.pickle'.format(subreddit), 'wb') as f:
        pickle.dump(comments, f)

In [5]:
# Save the list of subreddit names to 'subreddits.pickle'.
with open('subreddits.pickle', 'wb') as subreddits_file:
    pickle.dump(subreddits, subreddits_file)