# NYT Comment Analysis
* Build a CSV file of comments from NYT articles:
    * Read the raw bytes response from the NYT API
    * Convert it to JSON object(s)
    * Extract the data to CSV format

In [140]:
# Modules
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pandas as pd
import json
import math
import numpy as np
import scipy as sp
from scipy import stats
from scipy.stats import pearsonr

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.colors as col
import matplotlib.cm as cm
import seaborn as sns; sns.set()
from datetime import datetime

from collections import Counter
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen
import requests
import urllib3
import zipfile
import re
import glob
import os
import time

# Pandas view options
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 200)
# Use the fully qualified key: the bare 'precision' pattern is ambiguous on pandas
# versions that also define 'styler.format.precision' and raises OptionError there.
pd.set_option('display.precision', 4)

## Program Parameters

In [141]:
# File paths: set the NYT_DATA_PATH environment variable to your project data
# folder (or edit the fallback below). os.path.join(..., '') guarantees a trailing
# separator so the string concatenations that follow stay valid.
BASE_PATH = os.path.join(os.environ.get('NYT_DATA_PATH', '/Users/apelullo/Desktop/CDH/projects/nyt/data/'), '')
URL_PATH = BASE_PATH + 'urls/'
MASTER_PATH = BASE_PATH + 'master_data/'

# API credentials (keys are from 9 distinct APPS)
API_REQUEST = 'https://api.nytimes.com/svc/community/v3/user-content/url.json?api-key='
# Prefer supplying keys through NYT_API_KEYS (comma-separated) so that real
# credentials never have to be saved inside the notebook; the literal list is
# only used when the variable is unset or empty.
_env_keys = [k.strip() for k in os.environ.get('NYT_API_KEYS', '').split(',') if k.strip()]
KEYS = _env_keys or ['REDACTED','REDACTED','REDACTED',
                     'REDACTED','REDACTED','REDACTED',
                     'REDACTED','REDACTED','REDACTED']

# Constants
DEFAULT_SLEEP = 6    # seconds to pause after every request
TIMEOUT_SLEEP = 10   # extra seconds to pause after a 429 (rate limit) response
MAX_ATTEMPT = 10     # number of 429 responses tolerated before giving up on a request

## Functions

In [142]:
def get_response(url,num_requests,curr_key,offset,complete,request_timeout=30):
    """Request one page of comments for an article and return the parsed results.

    Parameters
    ----------
    url : str
        Article url without the scheme; 'http://' is prepended when the request
        url is built.
    num_requests : int
        Running count of requests issued so far. When it is a multiple of 10 on
        entry, the next key in KEYS is selected.
    curr_key : int
        Index into KEYS of the key currently in use.
    offset : int
        Pagination offset sent to the API; advanced by the number of top-level
        comments returned on success.
    complete : bool
        Completion flag; set to True when fewer than 25 top-level comments are
        returned.
    request_timeout : float, optional
        Seconds to wait for the server before the request is abandoned.

    Returns
    -------
    tuple
        (results, num_requests, curr_key, offset, complete). ``results`` is the
        'results' entry of the JSON payload, or None when the response was
        empty, the request failed, or the retry budget was exhausted.
    """
    page_size = 25  # a page with fewer top-level comments than this is the last one
    timeout = 0     # number of 429 responses seen for this request
    processed = False
    results = None

    # Build request url
    if num_requests%10==0:
        curr_key = (curr_key+1)%len(KEYS)
        # Report the key's position only: echoing key characters would leak
        # credentials into the saved cell output.
        print('Switching keys: key ' + str(curr_key+1) + ' of ' + str(len(KEYS)) + '...')
    request_url = API_REQUEST + KEYS[curr_key] + '&offset=' + str(offset) + '&url=http://' + url

    # Request handling: status codes 200,400,429
    while not processed:
        try:
            # Without a timeout a stalled connection would block the crawl forever
            response = requests.get(request_url, timeout=request_timeout)
        except requests.exceptions.RequestException as err:
            num_requests+=1
            # Only the exception type is printed: the full message can contain
            # the request url, which embeds the API key.
            print('There was an error executing the request!', type(err).__name__)
            break
        status = response.status_code
        num_requests+=1
        time.sleep(DEFAULT_SLEEP)

        if status==200:
            # Check for empty response content
            # NOTE(review): no parser is named, so Beautiful Soup picks whichever
            # is installed; the extracted text may differ between machines.
            content = soup(response.content)
            if len(content)==0:
                return None,num_requests,curr_key,offset,complete

            # Convert to json
            json_content = json.loads(content.text)
            results = json_content['results']

            # Pagination
            returned_comments = results['totalParentCommentsReturned']
            offset += returned_comments

            # Check for completion
            if returned_comments < page_size:
                complete=True

            processed=True
            print('Request successful!',returned_comments,'top-level comments returned...')
        elif status==400:
            results=None
            processed=True
            print('Bad request! Error code:', status)
        elif status==429:
            timeout+=1
            print('Rate limit exceeded! Error code:', status)
            if timeout < MAX_ATTEMPT:
                print('Sleeping for',TIMEOUT_SLEEP,'seconds and retrying...')
                time.sleep(TIMEOUT_SLEEP)
            else:
                results=None
                processed=True
                print('Too many attempts!')
        else:
            results=None
            processed=True
            print('There was an error executing the request! Error code:', status)

    return results,num_requests,curr_key,offset,complete

In [143]:
def get_article_meta(results,article_id):
    """Flatten the article-level fields of an API result into a row and its header.

    Every key of ``results`` except 'comments' and 'userData' is kept, in the
    dict's own order, and ``article_id`` is appended as the last column.

    Returns
    -------
    tuple
        (article_row, article_cols): the values and the matching column names.
    """
    skipped = ('comments','userData')
    article_cols = [key for key in results if key not in skipped]
    article_row = [results[key] for key in article_cols]

    # Tag the row with the caller's running article identifier
    article_cols.append('article_id')
    article_row.append(article_id)
    return article_row, article_cols

In [144]:
def get_comments(comments,article_id):
    """Flatten a list of comments, and their nested replies, into table rows.

    Each comment contributes one row holding every field except 'replies',
    followed by ``article_id``. Replies are flattened recursively and placed
    directly after their parent.

    Parameters
    ----------
    comments : list of dict
        Comment objects; may be empty. A missing or None 'replies' entry is
        treated as "no replies".
    article_id : int
        Identifier appended to every row.

    Returns
    -------
    tuple
        (comment_chunk, comment_cols). ``comment_cols`` is the header derived
        from the last comment of ``comments`` (an empty list when ``comments``
        is empty). Rows are built from each comment's own keys, so the header
        only fits every row if all comments share the same keys in the same
        order.
    """
    comment_chunk = []
    comment_cols = []  # defined up front so an empty input returns cleanly

    # Iterate through comment chunk
    for com in comments:
        comment_cols = [item for item in com if item != 'replies']

        # Build comment row
        comment_row = [com[col] for col in comment_cols]

        # Add article_id
        comment_row.append(article_id)
        comment_cols.append('article_id')

        # Add comment row to comment chunk
        comment_chunk.append(comment_row)

        # Handle replies recursively
        replies = com.get('replies') or []
        if replies:
            # Add replies to comment chunk
            print('Collecting comment replies...')
            reply_data,_ = get_comments(replies,article_id)
            comment_chunk.extend(reply_data)

    return comment_chunk,comment_cols

In [145]:
def process_urls(url_list):
    """Download article metadata and all comments for every url in ``url_list``.

    For each url, pages are requested through get_response until it reports
    completion, returns no results, or a page contains no top-level comments.
    Article metadata is recorded from the first successful page of each article.

    Parameters
    ----------
    url_list : list of str
        Article urls; the list index is used as the article id.

    Returns
    -------
    list
        [[article_data, article_cols], [comment_data, comment_cols]], i.e. rows
        and column names for the article table and the comment table. The
        column lists are empty when nothing was collected.
    """
    article_data = []
    comment_data = []
    # Bound up front so the return statement is valid even when the url list is
    # empty or no article/comment could be collected.
    article_cols = []
    comment_cols = []

    num_requests = 1
    curr_key = 0

    for article_id,url in enumerate(url_list):
        # Per-article pagination state
        offset = 0
        first = True
        complete = False

        print('\nProcessing url:', url)
        print('Article id:', article_id)
        while not complete:
            # Get response
            print('Executing Request...')
            results,num_requests,curr_key,offset,complete = get_response(url,num_requests,curr_key,offset,complete)

            # Check for empty results
            if results is None:
                print('Article does not exist! Moving on..')
                break

            # Article metadata
            if first:
                print('Collecting article metadata...')
                article_meta,article_cols = get_article_meta(results,article_id)
                article_data.append(article_meta)
                first=False

            # Check for empty comments
            returned_comments = results['totalParentCommentsReturned']
            if returned_comments==0:
                print('Article has no comments! Moving on..')
                break

            # Comments and replies
            print('Collecting article comments...')
            comments = results['comments']
            comment_chunk,chunk_cols = get_comments(comments,article_id)
            if chunk_cols:
                # Keep the last non-empty header
                comment_cols = chunk_cols
            comment_data.extend(comment_chunk)

            print(str(len(comment_data)) + ' comments and replies extracted.')

    print('All articles processed correctly! Our work here is done - See you next time!')
    return [[article_data,article_cols],[comment_data,comment_cols]]

## Read Data

In [146]:
# Reuse the saved url lists when both exist; otherwise rebuild them from the raw export
if not (os.path.exists(URL_PATH + 'privacy_urls.csv') and os.path.exists(URL_PATH + 'health_urls.csv')):
    print('Creating url lists.')
    with open(URL_PATH + 'privacy_urls_raw.txt') as file:
        contents = file.read()

    # A url starts right after 'href="https://' and stops at '">' + backslash + newline
    temp = contents.split('href="https://')[1:]
    urls = [item.split('">\\\n')[0] for item in temp]

    # all privacy articles
    privacy_urls = pd.DataFrame(urls, columns=['url'])
    privacy_urls.to_csv(URL_PATH + 'privacy_urls.csv', index=False)

    # health articles: the rows at positions 45 and 90 of the privacy list
    health_urls = privacy_urls.iloc[[45,90],:]
    health_urls.to_csv(URL_PATH + 'health_urls.csv', index=False)
else:
    print('Reading from file.')
    privacy_urls = pd.read_csv(URL_PATH + 'privacy_urls.csv')
    health_urls = pd.read_csv(URL_PATH + 'health_urls.csv')

Reading from file.


## Main Program

In [147]:
# Results keyed by url-list name; each value is what process_urls returns for that list
output = dict()
# Choose which url list(s) to crawl: exactly one url_dict assignment should be active
#url_dict={'health_urls':list(health_urls.url)}
url_dict={'privacy_urls':list(privacy_urls.url)}
#url_dict={'health_urls':list(health_urls.url),'privacy_urls':list(privacy_urls.url)}

# Lists are processed one after another; results of finished lists stay in
# `output` even if a later list raises
for key,urls in url_dict.items():
    output[key] = process_urls(urls)


Processing url: www.nytimes.com/2019/07/25/opinion/facebook-fine-5-billion.html?rref=collection%2Fseriescollection%2Fnew-york-times-privacy-project
Article id: 0
Executing Request...
Request successful! 25 top-level comments returned...
Collecting article metadata...
Collecting article comments...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
38 comments and replies extracted.
Executing Request...
Request successful! 12 top-level comments returned...
Collecting article comments...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
57 comments and replies extracted.

Processing url: www.nytimes.com/2019/07/23/opinion/google-ftc-facebook-fines.html?rref=collection%2Fseriescollection%2Fnew-york-times-privacy-project
Article id: 1
Executing Request...
Request 

Request successful! 25 top-level comments returned...
Collecting article metadata...
Collecting article comments...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
609 comments and replies extracted.
Executing Request...
Request successful! 25 top-level comments returned...
Collecting article comments...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
649 comments and replies extracted.
Executing Request...
Request successful! 25 top-level comments returned...
Collecting article comments...
Collecting comment replies...
Collecting comment replies...


Request successful! 10 top-level comments returned...
Collecting article comments...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
1176 comments and replies extracted.

Processing url: www.nytimes.com/2019/06/27/opinion/cities-privacy-surveillance.html?rref=collection%2Fseriescollection%2Fnew-york-times-privacy-project
Article id: 19
Executing Request...
Request successful! 19 top-level comments returned...
Collecting article metadata...
Collecting article comments...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
1202 comments and replies extracted.

Processing url: www.nytimes.com/2019/06/27/opinion/police-cam-facial-recognition.html?rref=collection%2Fseriescollection%2Fnew-york-times-privacy

Request successful! 25 top-level comments returned...
Collecting article comments...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
1808 comments and replies extracted.
Executing Request...
Request successful! 5 top-level comments returned...
Collecting article comments...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
1820 comments and replies extracted.

Processing url: www.nytimes.com/2019/06/13/opinion/timothy-carpenter-prison-privacy.html?rref=collection%2Fseriescollection%2Fnew-york-times-privacy-project
Article id: 28
Executing Request...
Switching keys: 27A0h9dxWJ...
Re

Request successful! 25 top-level comments returned...
Collecting article comments...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
2457 comments and replies extracted.
Executing Request...
Switching keys: 96zqzbAAur...
Request successful! 25 top-level comments returned...
Collecting article comments...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
2492 comments and replies extracted.
Executing Request...
Request successful! 25 top-level comments returned...
Collecting article comments...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...

Request successful! 16 top-level comments returned...
Collecting article metadata...
Collecting article comments...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
3039 comments and replies extracted.

Processing url: www.nytimes.com/2019/05/28/opinion/online-ads.html?rref=collection%2Fseriescollection%2Fnew-york-times-privacy-project
Article id: 42
Executing Request...
Switching keys: 7adp5kTspN...
Request successful! 0 top-level comments returned...
Collecting article metadata...
Article has no comments! Moving on..

Processing url: www.nytimes.com/2019/05/28/opinion/privacy-antitrust-facebook.html?rref=collection%2Fseriescollection%2Fnew-york-times-privacy-project
Article id: 43
Executing Request...
Request successful! 25 top-level comments returned...
Collecting article metadata...
Collecting article comments...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting

Request successful! 25 top-level comments returned...
Collecting article metadata...
Collecting article comments...
3869 comments and replies extracted.
Executing Request...
Request successful! 25 top-level comments returned...
Collecting article comments...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
3907 comments and replies extracted.
Executing Request...
Switching keys: MGDqrMfUG9...
Request successful! 25 top-level comments returned...
Collecting article comments...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
3940 comments and replies extracted.
Executing Request...
Request successful! 25 top-level comments returned...
Collecting article comments...
Collecting comment replies...
Collecting comment replies...
Colle

Request successful! 25 top-level comments returned...
Collecting article metadata...
Collecting article comments...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
4639 comments and replies extracted.
Executing Request...
Request successful! 25 top-level comments returned...
Collecting article comments...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
4683 comments and replies extracted.
Executing Request...
Request successful! 25 top-level comments returned...
Collecting article comments...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
4715 comments and replies ex

Request successful! 25 top-level comments returned...
Collecting article metadata...
Collecting article comments...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
5298 comments and replies extracted.
Executing Request...
Request successful! 25 top-level comments returned...
Collecting article comments...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
5329 comments and replies extracted.
Executing Request...
Request successful! 25 top-level comments returned...
Collecting article comments...
Collecting comment replies...
5356 comments and replies extracted.
Executing Request...
Request successful! 25 top-level comments returned...
Collecting article comments...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
5385 comments and replies extracted.
Executing Request...
Switching keys: ErlMB7lQkc...
Request successful! 25 top-level comm

Request successful! 25 top-level comments returned...
Collecting article comments...
Collecting comment replies...
6234 comments and replies extracted.
Executing Request...
Request successful! 25 top-level comments returned...
Collecting article comments...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
6266 comments and replies extracted.
Executing Request...
Request successful! 25 top-level comments returned...
Collecting article comments...
Collecting comment replies...
6294 comments and replies extracted.
Executing Request...
Switching keys: uqpjc4N8kA...
Request successful! 25 top-level comments returned...
Collecting article comments...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
6329 c

Request successful! 25 top-level comments returned...
Collecting article comments...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
6959 comments and replies extracted.
Executing Request...
Request successful! 1 top-level comments returned...
Collecting article comments...
Collecting comment replies...
6962 comments and replies extracted.

Processing url: www.nytimes.com/2019/04/30/opinion/facebook-ftc-privacy.html?rref=collection%2Fseriescollection%2Fnew-york-times-privacy-project
Article id: 66
Executing Request...
Request succes

Request successful! 25 top-level comments returned...
Collecting article metadata...
Collecting article comments...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
7625 comments and replies extracted.
Executing Request...
Request successful! 25 top-level comments returned...
Collecting article comments...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
7662 comments and replies extracted.
Executing Request...
Request successful! 25 top-level comments returned...
Collecting article comments...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies..

Request successful! 25 top-level comments returned...
Collecting article comments...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
8336 comments and replies extracted.
Executing Request...
Request successful! 12 top-level comments returned...
Collecting article comments...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
8354 comments and replies extracted.

Processing url: www.nytimes.com/2019/04/13/opinion/china-internet-privacy.html?rref=collection%2Fseriescollection%2Fnew-york-times-privacy-project
Article id: 78
Executing Request...
Request successful! 25 top-level comments returned...
Collecting article metadata...
Collecting article comments...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
8383 comments a

Request successful! 25 top-level comments returned...
Collecting article comments...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
9073 comments and replies extracted.
Executing Request...
Request successful! 25 top-level comments returned...
Collecting article comments...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
9106 comments and replies extracted.
Executing Request...
Request successful! 25 top-level comments returned...
Collecting article comments...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
9138 comments and replies extracted.
Executing Request...
Request successful! 25 top-level comments returned...
Collecting article comments...
Collec

Request successful! 25 top-level comments returned...
Collecting article comments...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
9883 comments and replies extracted.
Executing Request...
Request successful! 25 top-level comments returned...
Collecting article comments...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
9937 comments and repl

Request successful! 25 top-level comments returned...
Collecting article metadata...
Collecting article comments...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
10591 comments and replies extracted.
Executing Request...
Switching keys: 96zqzbAAur...
Request successful! 25 top-level comments returned...
Collecting article comments...
10616 comments and replies extracted.
Executing Request...
Request successful! 25 top-level comments returned...
Collecting article comments...
Collecting comment replies...
10642 comments and replies extracted.
Executing Request...
Request successful! 25 top-level comments returned...
Collecting article comments...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
10674 comments and replies extracted.
Executing Request...
Request successful! 25 top-level comments returned...
Collecting article comments...
Collecti

Request successful! 25 top-level comments returned...
Collecting article comments...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
11415 comments and replies extracted.
Executing Request...
Request successful! 25 top-level comments returned...
Collecting article comments...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment replies...
Collecting comment re

In [None]:
# NOTE: Threading/multiprocessing will work well here, assuming independent rate limits for keys from distinct apps

## Explore Data

### Create Dataframes and Save

In [9]:
# Create dataframes
if 'health_urls' in output:
    health_article_df = pd.DataFrame(output['health_urls'][0][0], columns=output['health_urls'][0][1])
    health_article_df.to_csv(MASTER_PATH + 'health_article_data.csv', index=False)

    health_comment_df = pd.DataFrame(output['health_urls'][1][0], columns=output['health_urls'][1][1])
    health_comment_df.to_csv(MASTER_PATH + 'health_comment_data.csv', index=False)
else:
    # The health list was not crawled in this session (see url_dict), so
    # output['health_urls'] would raise KeyError; load the previously saved
    # files instead.
    health_article_df = pd.read_csv(MASTER_PATH + 'health_article_data.csv')
    health_comment_df = pd.read_csv(MASTER_PATH + 'health_comment_data.csv')

In [148]:
# Create dataframes
if 'privacy_urls' in output:
    privacy_article_df = pd.DataFrame(output['privacy_urls'][0][0], columns=output['privacy_urls'][0][1])
    privacy_article_df.to_csv(MASTER_PATH + 'privacy_article_data.csv', index=False)

    privacy_comment_df = pd.DataFrame(output['privacy_urls'][1][0], columns=output['privacy_urls'][1][1])
    privacy_comment_df.to_csv(MASTER_PATH + 'privacy_comment_data.csv', index=False)
else:
    # The privacy list was not crawled in this session (see url_dict), so
    # output['privacy_urls'] would raise KeyError; load the previously saved
    # files instead.
    privacy_article_df = pd.read_csv(MASTER_PATH + 'privacy_article_data.csv')
    privacy_comment_df = pd.read_csv(MASTER_PATH + 'privacy_comment_data.csv')

### Health Data

In [10]:
# Row count, column count, then the first five rows
print(health_article_df.shape[0])
print(health_article_df.shape[1])
health_article_df.head(5)

2
20


Unnamed: 0,callerID,api_timestamp,depthLimit,filter,page,replyLimit,sort,totalCommentsFound,totalCommentsReturned,totalEditorsSelectionFound,totalEditorsSelectionReturned,totalParentCommentsFound,totalParentCommentsReturned,totalRecommendationsFound,totalRecommendationsReturned,totalReplyCommentsFound,totalReplyCommentsReturned,totalReporterReplyCommentsFound,totalReporterReplyCommentsReturned,article_id
0,,1564206858,2,,1,3,,200,26,7,0,162,25,144,13,38,1,0,0,0
1,,1564206899,2,,1,3,,451,31,14,2,266,25,291,15,185,6,0,0,1


In [11]:
# Row count, column count, then the first two rows
print(health_comment_df.shape[0])
print(health_comment_df.shape[1])
health_comment_df.head(n=2)

581
26


Unnamed: 0,commentID,status,commentSequence,userID,userDisplayName,userLocation,userTitle,userURL,picURL,commentTitle,commentBody,createDate,updateDate,approveDate,recommendations,replyCount,editorsSelection,parentID,parentUserDisplayName,depth,commentType,trusted,recommendedFlag,permID,isAnonymous,article_id
0,100725114,approved,100725114,0,Anne Fauvre,San Francisco,,,,<br\//>,With new technologies I don't think this has t...,1559246362,1559250380,1559246436,1,0,False,,,1,comment,0,0,100725114,False,0
1,100675020,approved,100675020,0,michael,tristate,,,,<br\//>,"Tsk, tsk, tsk.\n\nWhy don't you work on implem...",1558986958,1559061711,1559061711,0,0,False,,,1,comment,0,0,100675020,False,0


In [12]:
# Summary stats: counts are taken from boolean masks instead of filtered frames.
# Rows with a null parentID are counted as comments, rows with a parentID as replies.
print('Total unique articles:', health_comment_df.article_id.nunique(dropna=False))
print()
print('Total unique comments and replies:', health_comment_df.commentID.nunique(dropna=False))
for idx in health_comment_df.article_id.unique():
    print('\tArticle ' + str(idx) + ':', int((health_comment_df['article_id']==idx).sum()))
print()
print('Total unique comments:', int(health_comment_df['parentID'].isnull().sum()))
for idx in health_comment_df.article_id.unique():
    print('\tArticle ' + str(idx) + ':', int(((health_comment_df['article_id']==idx)&(health_comment_df['parentID'].isnull())).sum()))
print()
print('Total unique replies:', int(health_comment_df['parentID'].notnull().sum()))
for idx in health_comment_df.article_id.unique():
    print('\tArticle ' + str(idx) + ':', int(((health_comment_df['article_id']==idx)&(health_comment_df['parentID'].notnull())).sum()))

Total unique articles: 2

Total unique comments and replies: 581
	Article 0: 196
	Article 1: 385

Total unique comments: 428
	Article 0: 162
	Article 1: 266

Total unique replies: 153
	Article 0: 34
	Article 1: 119


In [None]:
# What else would be interesting to summarize?

In [None]:
# The possibilities are endless... =) 

### Privacy Data

In [149]:
# Row count, column count, then the first five rows
print(privacy_article_df.shape[0])
print(privacy_article_df.shape[1])
privacy_article_df.head(5)

94
20


Unnamed: 0,callerID,api_timestamp,depthLimit,filter,page,replyLimit,sort,totalCommentsFound,totalCommentsReturned,totalEditorsSelectionFound,totalEditorsSelectionReturned,totalParentCommentsFound,totalParentCommentsReturned,totalRecommendationsFound,totalRecommendationsReturned,totalReplyCommentsFound,totalReplyCommentsReturned,totalReporterReplyCommentsFound,totalReporterReplyCommentsReturned,article_id
0,,1564242487,2,,1,3,,59,38,3,3,37,25,13,7,22,13,0,0,0
1,,1564242501,2,,0,3,,0,0,0,0,0,0,0,0,0,0,0,0,1
2,,1564242507,2,,0,3,,0,0,0,0,0,0,0,0,0,0,0,0,2
3,,1564242513,2,,1,3,,86,38,0,0,52,25,69,31,34,13,0,0,3
4,,1564242533,2,,1,3,,43,28,0,0,32,25,21,11,11,3,0,0,4


In [150]:
# Row count, column count, then the first two rows
print(privacy_comment_df.shape[0])
print(privacy_comment_df.shape[1])
privacy_comment_df.head(n=2)

11555
26


Unnamed: 0,commentID,status,commentSequence,userID,userDisplayName,userLocation,userTitle,userURL,picURL,commentTitle,commentBody,createDate,updateDate,approveDate,recommendations,replyCount,editorsSelection,parentID,parentUserDisplayName,depth,commentType,trusted,recommendedFlag,permID,isAnonymous,article_id
0,101672372,approved,101672372,0,JRB,KCMO,,,,<br\//>,"$5 billion? Hey, somebody go over there and ge...",1564179032,1564230824,1564230824,0,0,False,,,1,comment,0,0,101672372,False,0
1,101677421,approved,101677421,0,Matt,Montreal,,,,<br\//>,I was one of the 80 million of Anthem Blue Cr...,1564224235,1564229710,1564229710,0,0,False,,,1,comment,0,0,101677421,False,0


In [151]:
# Summary stats: counts are taken from boolean masks instead of filtered frames.
# Rows with a null parentID are counted as comments, rows with a parentID as replies.
print('Total unique articles:', privacy_comment_df.article_id.nunique(dropna=False))
print()
print('Total unique comments and replies:', privacy_comment_df.commentID.nunique(dropna=False))
for idx in privacy_comment_df.article_id.unique():
    print('\tArticle ' + str(idx) + ':', int((privacy_comment_df['article_id']==idx).sum()))
print()
print('Total unique comments:', int(privacy_comment_df['parentID'].isnull().sum()))
for idx in privacy_comment_df.article_id.unique():
    print('\tArticle ' + str(idx) + ':', int(((privacy_comment_df['article_id']==idx)&(privacy_comment_df['parentID'].isnull())).sum()))
print()
print('Total unique replies:', int(privacy_comment_df['parentID'].notnull().sum()))
for idx in privacy_comment_df.article_id.unique():
    print('\tArticle ' + str(idx) + ':', int(((privacy_comment_df['article_id']==idx)&(privacy_comment_df['parentID'].notnull())).sum()))

Total unique articles: 68

Total unique comments and replies: 11555
	Article 0: 57
	Article 3: 84
	Article 4: 43
	Article 5: 214
	Article 8: 151
	Article 9: 25
	Article 10: 126
	Article 11: 45
	Article 12: 164
	Article 13: 90
	Article 16: 57
	Article 17: 14
	Article 18: 106
	Article 19: 26
	Article 20: 41
	Article 21: 18
	Article 23: 126
	Article 25: 104
	Article 27: 329
	Article 29: 55
	Article 31: 263
	Article 32: 164
	Article 33: 258
	Article 35: 56
	Article 36: 52
	Article 37: 159
	Article 39: 190
	Article 41: 22
	Article 43: 161
	Article 44: 448
	Article 45: 196
	Article 46: 215
	Article 48: 217
	Article 49: 65
	Article 50: 67
	Article 51: 203
	Article 52: 176
	Article 53: 67
	Article 54: 61
	Article 55: 118
	Article 56: 29
	Article 57: 207
	Article 58: 911
	Article 59: 270
	Article 61: 102
	Article 63: 176
	Article 65: 234
	Article 67: 143
	Article 68: 104
	Article 70: 283
	Article 71: 100
	Article 73: 200
	Article 74: 311
	Article 75: 126
	Article 77: 125
	Article 78: 463
	Artic

## Unused / Deprecated Code