In [2]:
from textblob import TextBlob
import json
import pickle
import os
import sys
from collections import defaultdict

In [2]:
def sentimentize_data(data, bucket_seconds=300, min_comments=60):
    """Average TextBlob sentiment per subreddit within fixed-width time buckets.

    Every comment's ``created_utc`` is rounded down to a multiple of
    ``bucket_seconds``; polarity and subjectivity are then averaged over all
    comments that share the same (bucket, subreddit) pair.

    Args:
        data: iterable of comment dicts. The keys ``'body'``, ``'created_utc'``
            and ``'subreddit'`` are read from each one.
        bucket_seconds: width of a time bucket in seconds (default 300, i.e.
            five-minute chunks).
        min_comments: subreddits with fewer than this many comments in ``data``
            are dropped from the result. The original note equates the default
            of 60 with "one comment per minute", which presumes ``data`` spans
            one hour -- TODO confirm against the input files.

    Returns:
        list of dicts ``{'t': bucket_start, 'r': subreddit, 'p': mean_polarity,
        's': mean_subjectivity}``, ordered by first appearance of each bucket
        and, within a bucket, of each subreddit.

    Side effects:
        Prints the number of subreddits that reach ``min_comments``.
    """
    # bucket -> subreddit -> {'total', 'polarity', 'subjectivity'} running sums
    sums = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
    subreddit_counts = defaultdict(int)

    for comment in data:
        sentiment = TextBlob(comment['body']).sentiment
        # Integer floor division keeps the bucket start an exact int and avoids
        # a round trip through float division.
        bucket = int(comment['created_utc']) // bucket_seconds * bucket_seconds
        subreddit = comment['subreddit']

        subreddit_counts[subreddit] += 1
        cell = sums[bucket][subreddit]
        cell['total'] += 1
        cell['polarity'] += sentiment.polarity
        cell['subjectivity'] += sentiment.subjectivity

    # Progress/diagnostic output: how many subreddits survive the filter.
    print(sum(1 for count in subreddit_counts.values() if count >= min_comments))

    return [
        {
            't': bucket,
            'r': subreddit,
            'p': cell['polarity'] / cell['total'],
            's': cell['subjectivity'] / cell['total'],
        }
        for bucket, per_subreddit in sums.items()
        for subreddit, cell in per_subreddit.items()
        if subreddit_counts[subreddit] >= min_comments
    ]

In [3]:
def main(rootdir='/Users/ashackelford/Desktop/Reddit/', outdir='all/'):
    """Run ``sentimentize_data`` on every JSON file under ``rootdir``.

    The tree below ``rootdir`` is walked recursively. For each ``<name>.json``
    the parsed content is passed to ``sentimentize_data`` and the result is
    written to ``<outdir>/<name>_sentiment.json`` (flat, regardless of how deep
    the input file was found).

    Args:
        rootdir: directory to scan for input files. The default is the
            machine-specific path the notebook was written with; pass your own
            location when running elsewhere.
        outdir: directory that receives the result files; created if missing.

    Side effects:
        Prints each processed file name and writes one JSON file per input.
    """
    if outdir:
        os.makedirs(outdir, exist_ok=True)

    for subdir, _dirs, files in os.walk(rootdir):
        for file in files:
            # Skip anything that is not a JSON dump (e.g. .DS_Store); feeding
            # such files to json.load would abort the whole run.
            if not file.endswith('.json'):
                continue
            with open(os.path.join(subdir, file), 'r') as f:
                print(file)
                output = sentimentize_data(json.load(f))
            out_name = file.rsplit('.', 1)[0] + '_sentiment' + '.json'
            with open(os.path.join(outdir, out_name), 'w') as f:
                json.dump(output, f)

In [4]:
# Run the sentiment pass with the default directories. For each input file the
# output below shows its name followed by the number printed by
# sentimentize_data (subreddits that met the comment threshold).
main()

2019-06-01-20.json
552
2019-06-01-00.json
529
2019-06-01-16.json
590
2019-06-01-17.json
590
2019-06-01-01.json
487
2019-06-01-21.json
532
2019-06-01-06.json
302
2019-06-01-10.json
281
2019-06-01-11.json
292
2019-06-01-07.json
272
2019-06-01-12.json
379
2019-06-01-04.json
402
2019-06-01-08.json
262
2019-06-01-09.json
240
2019-06-01-05.json
349
2019-06-01-13.json
456
2019-06-01-18.json
570
2019-06-01-22.json
525
2019-06-01-14.json
516
2019-06-01-02.json
488
2019-06-01-03.json
447
2019-06-01-15.json
579
2019-06-01-23.json
509
2019-06-01-19.json
541


In [3]:
# subreddit -> number of fill_all_subreddits() calls whose data mentioned it
all_subreddits = defaultdict(int)


def fill_all_subreddits(data):
    """Add one to the module-level tally for each distinct subreddit in ``data``.

    Args:
        data: iterable of dicts; only the ``'r'`` key (subreddit name) is read.

    A subreddit that occurs several times in ``data`` is still counted once per
    call. Nothing is returned; ``all_subreddits`` is updated in place.
    """
    for name in {record['r'] for record in data}:
        all_subreddits[name] += 1

In [4]:
def get_trimmed_subreddits(required_files=24):
    """Rebuild the global ``trimmed_subreddits`` set from ``all_subreddits``.

    A subreddit is kept only when its tally in ``all_subreddits`` equals
    ``required_files`` exactly.

    Args:
        required_files: tally a subreddit must have to be kept. The default of
            24 presumably corresponds to one input file per hour of a single
            day -- TODO confirm if the input set changes.

    Side effects:
        Rebinds the module-level ``trimmed_subreddits`` and prints its size.
    """
    global trimmed_subreddits
    trimmed_subreddits = {
        subreddit
        for subreddit, seen_in in all_subreddits.items()
        if seen_in == required_files
    }
    print(len(trimmed_subreddits))

In [5]:
def trim_data(data):
    """Return the items of ``data`` whose ``'r'`` value is in ``trimmed_subreddits``.

    Args:
        data: iterable of dicts carrying an ``'r'`` (subreddit) key.

    Returns:
        A new list with the matching items in their original order; the items
        themselves are not copied.
    """
    return [record for record in data if record['r'] in trimmed_subreddits]

In [6]:
def trim(rootdir='all/', outdir='trimmed/'):
    """Filter every sentiment file down to the subreddits kept by the tally.

    Works in two passes over the JSON files below ``rootdir``:

    1. feed each file to ``fill_all_subreddits`` and then call
       ``get_trimmed_subreddits`` to decide which subreddits to keep;
    2. run each file through ``trim_data`` and write the result under the same
       file name in ``outdir``.

    Args:
        rootdir: directory scanned recursively for ``.json`` input files.
        outdir: directory that receives the filtered files; created if missing.

    Side effects:
        Resets and refills the global ``all_subreddits`` tally, prints each file
        name once per pass, and writes one JSON file per input.
    """
    # Walk the tree once so both passes see exactly the same files; non-JSON
    # entries (e.g. .DS_Store) are ignored instead of crashing json.load.
    inputs = [
        (file, os.path.join(subdir, file))
        for subdir, _dirs, files in os.walk(rootdir)
        for file in files
        if file.endswith('.json')
    ]

    # Start from a clean tally: without this, running trim() a second time in
    # the same kernel adds to the previous counts and the selection changes.
    all_subreddits.clear()
    for file, path in inputs:
        with open(path, 'r') as f:
            print(file)
            fill_all_subreddits(json.load(f))

    get_trimmed_subreddits()

    if outdir:
        os.makedirs(outdir, exist_ok=True)
    for file, path in inputs:
        with open(path, 'r') as f:
            print(file)
            output = trim_data(json.load(f))
        # The input handle is closed before the output one is opened, so the
        # two no longer share a name while both are live.
        with open(os.path.join(outdir, file), 'w') as f:
            json.dump(output, f)

In [7]:
# Run the two-pass trim with the default directories. The output lists every
# file name once per pass, with the number of kept subreddits printed between
# the two passes.
trim()

2019-06-01-21_sentiment.json
2019-06-01-20_sentiment.json
2019-06-01-22_sentiment.json
2019-06-01-23_sentiment.json
2019-06-01-17_sentiment.json
2019-06-01-02_sentiment.json
2019-06-01-18_sentiment.json
2019-06-01-05_sentiment.json
2019-06-01-10_sentiment.json
2019-06-01-11_sentiment.json
2019-06-01-04_sentiment.json
2019-06-01-19_sentiment.json
2019-06-01-03_sentiment.json
2019-06-01-16_sentiment.json
2019-06-01-01_sentiment.json
2019-06-01-14_sentiment.json
2019-06-01-09_sentiment.json
2019-06-01-13_sentiment.json
2019-06-01-06_sentiment.json
2019-06-01-07_sentiment.json
2019-06-01-12_sentiment.json
2019-06-01-08_sentiment.json
2019-06-01-15_sentiment.json
2019-06-01-00_sentiment.json
144
2019-06-01-21_sentiment.json
2019-06-01-20_sentiment.json
2019-06-01-22_sentiment.json
2019-06-01-23_sentiment.json
2019-06-01-17_sentiment.json
2019-06-01-02_sentiment.json
2019-06-01-18_sentiment.json
2019-06-01-05_sentiment.json
2019-06-01-10_sentiment.json
2019-06-01-11_sentiment.json
2019-06-01