In [1]:
import numpy as np
import pandas as pd
import random
import json
import urllib
from urllib.request import urlopen
import requests
from requests.exceptions import HTTPError
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the January 2019 /r/worldnews comment export (pulled from Google BigQuery)
comments = pd.read_csv(filepath_or_buffer="data/reddit_wnews_01-2019.csv")

In [3]:
# Split the comments by moderation status.
# A '[deleted]' body marks a comment deleted by its author;
# a '[removed]' body marks a comment removed by a moderator.

not_removed_comments = comments.loc[~comments['body'].isin(['[removed]', '[deleted]'])]
removed_comments = comments.loc[comments['body'].eq('[removed]')]

In [4]:
# Get text for removed comments from the Pushshift comment-search endpoint.
# A comment id is appended to the `ids=` query parameter for each request.
url = "https://api.pushshift.io/reddit/search/comment?ids="

# Create id list and dictionary
def comment_ids_fields(var_list, df, limit=3):
    """Build an empty column accumulator and the list of comment ids to fetch.

    Parameters
    ----------
    var_list : list of str
        Field names; each becomes a key mapped to a new empty list.
    df : pandas.DataFrame
        Frame with an ``'id'`` column.
    limit : int or None, optional
        Maximum number of ids to take, counted by position from the top of
        ``df``. Defaults to 3, the value that used to be hard-coded. Pass
        ``None`` to take every row.

    Returns
    -------
    tuple of (dict, list)
        ``d`` maps every name in ``var_list`` to an empty list; ``id_list``
        holds the selected values of ``df['id']`` in row order.

    Raises
    ------
    KeyError
        If ``df`` has no ``'id'`` column.
    """
    d = {var: [] for var in var_list}

    id_column = df['id']
    if limit is not None:
        # Slicing (unlike indexing rows one by one) tolerates frames that
        # hold fewer than `limit` rows, including empty ones.
        id_column = id_column.iloc[:limit]
    id_list = id_column.tolist()

    return d, id_list
 
    
# Fields to collect for each removed comment, plus the accumulator and ids to query
var_list = ['id', 'body', 'author']
d, id_list = comment_ids_fields(var_list=var_list, df=removed_comments)


# Append field values to dictionary
def extract_fields_from_response(item, var_list, fields=None):
    """Append the requested fields of one record to the column accumulator.

    Parameters
    ----------
    item : dict
        One comment record; fields absent from it are stored as ``None``.
    var_list : list of str
        Names of the fields to copy out of ``item``.
    fields : dict of list, optional
        Accumulator mapping each field name to a list of values. When
        omitted, the module-level dictionary ``d`` is used.

    Returns
    -------
    dict
        The accumulator that was appended to.

    Raises
    ------
    KeyError
        If a name in ``var_list`` is not a key of the accumulator. Nothing
        is appended in that case.
    """
    if fields is None:
        fields = d

    # Validate and read everything before appending anything, so a failure
    # cannot leave the columns with different lengths.
    missing = [var for var in var_list if var not in fields]
    if missing:
        raise KeyError(missing[0])
    values = [item.get(var, None) for var in var_list]

    for var, value in zip(var_list, values):
        fields[var].append(value)
    return fields


# Request data from API
def get_fields(id_no, session, timeout=30):
    """Fetch one comment record from the Pushshift API (one request per call).

    Parameters
    ----------
    id_no : str
        Comment id; it is appended to the module-level ``url``.
    session : requests.Session
        Session used to issue the GET request.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    dict
        The first entry of the response's ``"data"`` list, or ``{}`` when the
        response has no ``"data"`` key.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status.
    IndexError
        If the response's ``"data"`` list is empty.
    Exception
        Any other request failure is re-raised after being printed.
    """
    url_ = url + id_no
    try:
        response = session.get(url_, timeout=timeout)
        response.raise_for_status()
    except HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
        # Re-raise: there is no usable response to parse, and carrying on
        # would only hide the real failure behind a secondary error.
        raise
    except Exception as err:
        print(f"An error ocurred: {err}")
        raise

    response_json = response.json()
    records = response_json.get("data", [{}])
    if not records:
        # Same exception type as indexing the empty list, but with a message
        # that identifies the comment.
        raise IndexError(f"No data returned for comment id {id_no}")
    return records[0]


# Fetch every queued comment id over one shared HTTP session.
# Collection is best-effort: an id that cannot be fetched or parsed is
# reported and skipped so one bad record does not abort the whole run.
with requests.Session() as session:
    for id_no in id_list:
        try:
            response = get_fields(id_no, session)
            extract_fields_from_response(response, var_list)
        except Exception as err:
            # Report instead of silently dropping the id, so missing rows in
            # the final table can be traced back to their cause.
            print(f"Skipping comment id {id_no}: {err!r}")

In [5]:
# Build the removed-comments table from the collected columns and flag every row as removed
removed_df = pd.DataFrame(data=d).assign(Removed=1)

In [6]:
# Persist the removed-comments table to disk
removed_df.to_csv(path_or_buf=r'data/wnews_removed_comments.csv')