### Scrape Reddit Submissions using AWS Lambda

In [1]:
import boto3
from concurrent.futures import ThreadPoolExecutor
import json
import pandas as pd
import time
import praw
import re

In [2]:
# Subreddits to scrape. Names must be written without spaces
# (e.g. "WorldNews", not "World News").
subreddits = [
    "WorldNews",
    "WorldPolitics",
    "Politics",
    "USPolitics",
    "AmericanPolitics",
    "Republican",
    "Democrats",
    "Conservative",
    "Progressive",
    "Libertarian",
]

In [3]:
# Load each movement's search terms; the CSV is read for its "Key Terms" and
# "Movement" columns.
key_terms = pd.read_csv("../data/key_terms.csv")

# Build one Lambda payload per (subreddit, movement row) pair, ordered by
# subreddit first and then by row order in the CSV.
all_data = [
    {
        "subreddit_name": subreddit,
        "search_terms": term_row["Key Terms"],
        "movement": term_row["Movement"],
    }
    for subreddit in subreddits
    for _, term_row in key_terms.iterrows()
]

In [4]:
def invoke_lambdas(test_data):
    """Invoke the ``scrape_reddit`` Lambda synchronously and return its result.

    Args:
        test_data: JSON-serializable payload sent to the function. In this
            notebook it is a dict with ``subreddit_name``, ``search_terms``
            and ``movement`` keys.

    Returns:
        The function's response payload, decoded from JSON.

    Raises:
        RuntimeError: If Lambda reports that the function itself failed
            (the response carries a ``FunctionError`` field).
    """
    # A new client is created on every call.
    aws_lambda = boto3.client('lambda')

    response = aws_lambda.invoke(FunctionName='scrape_reddit',
                                 InvocationType='RequestResponse',
                                 Payload=json.dumps(test_data))
    payload = json.loads(response['Payload'].read())

    # With a RequestResponse invocation, an error raised inside the function
    # still comes back as a normal response: it is flagged by FunctionError
    # and the payload holds the error details instead of scrape results.
    # Fail loudly rather than letting that error object flow into the results.
    function_error = response.get('FunctionError')
    if function_error:
        raise RuntimeError(
            f"scrape_reddit failed ({function_error}) "
            f"for payload {test_data!r}: {payload!r}"
        )
    return payload

In [7]:
# Use one worker thread per subreddit/movement combination so that all Lambda
# invocations are in flight at once. ThreadPoolExecutor rejects
# max_workers=0, so fall back to a single worker when there is nothing to do.
workers = max(1, len(all_data))

with ThreadPoolExecutor(max_workers=workers) as executor:
    # executor.map returns a one-shot iterator. Materializing it here keeps
    # the results re-usable if a later cell is re-run, and makes any worker
    # exception surface in this cell instead of further downstream.
    results = list(executor.map(invoke_lambdas, all_data))
# Leaving the `with` block already waits for every worker to finish, so no
# additional sleep is needed before reading the results.

In [8]:
# Collect every Lambda response and tabulate them.
# NOTE(review): assumes each response supplies exactly the three fields
# named below, in this order -- confirm against the scrape_reddit function.
df_list = list(results)

df = pd.DataFrame(data=df_list, columns=["subreddit", "movement", "mentions"])

In [9]:
# Reshape to one row per movement and one column per subreddit, append a
# per-movement total across all subreddits, and display the table with the
# most-mentioned movements first.
mention_summary = df.pivot(
    index=["movement"],
    columns="subreddit",
    values="mentions",
)
mention_summary["total"] = mention_summary.sum(axis="columns")
mention_summary.sort_values(by="total", ascending=False)

subreddit,AmericanPolitics,Conservative,Democrats,Libertarian,Politics,Progressive,Republican,USPolitics,WorldNews,WorldPolitics,total
movement,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Libertarianism,0,0,0,17,0,0,0,0,0,0,17
Conservatism,3,1,1,1,0,3,0,3,0,0,12
Communism,1,1,0,4,0,0,1,0,0,0,7
Authoritarianism,1,1,1,1,1,0,1,0,0,0,6
Liberalism,1,0,1,3,0,1,0,0,0,0,6
Progressivism,1,0,0,0,2,2,0,1,0,0,6
Socialism,0,3,0,3,0,0,0,0,0,0,6
Fascism,1,1,0,2,0,1,0,0,0,0,5
QAnon,0,0,0,0,0,3,0,0,0,0,3
Nationalism,0,0,0,0,0,1,0,0,0,0,1


In [10]:
# Collapse the per-subreddit counts into a single total per movement, stored
# as a plain dict of Python ints.
# TODO: figure out how to send this to Kinesis as a JSON object.
summary = df.groupby("movement")["mentions"].sum()
movement_list = summary.index.tolist()
values = [int(total) for total in summary]
output = dict(zip(movement_list, values))