In [1]:
import numpy as np
import pandas as pd

pd.options.display.max_columns = 100
idx = pd.IndexSlice

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sb
sb.set_style('whitegrid')

import requests
import json
from bs4 import BeautifulSoup
from urllib.request import urlretrieve
from urllib.parse import urlparse, quote, unquote
from datetime import datetime

from collections import Counter

# requires pip install tldextract
#import tldextract

import networkx as nx 
import time, os, re

_dir = 'E:/Dropbox/Workspace/Wikipedia_Trump/'

In [2]:
# http://stackoverflow.com/a/312464/1574687

def chunk_list(l,size=50): 
    """Yield successive n-sized chunks from l."""
    chunk_list = list()
    for i in range(0, len(l), size):
        chunk_list.append(l[i:i + size])
    return chunk_list

In [425]:
with open('bots.json','r') as f:
    bot_list = json.load(f)
    
bot_list = [name[5:] for name in bot_list]
print("There are {0:,} bots.".format(len(bot_list)))

There are 1,712 bots.


### Revisions

In [3]:
def get_page_revisions(page_title,lang='en'):
    """Takes Wikipedia page title and returns a DataFrame of revisions
    
    page_title - a string with the title of the page on Wikipedia
    lang - a string (typically two letter ISO 639-1 code) for the language edition,
        defaults to "en"
        
    Returns:
    revision_list - a list of dictionaries, where each dictionary is the revision
        meta-data such as parentid, revid,sha1, size, timestamp, and user name
    """
    
    revision_list = list()
    
    query_string = "https://{1}.wikipedia.org/w/api.php?action=query&titles={0}&prop=revisions&rvprop=ids|userid|comment|timestamp|user|size|sha1&rvlimit=500&rvdir=older&format=json&formatversion=2".format(page_title,lang)
    json_response = requests.get(query_string).json()
    subquery_revision_list = json_response['query']['pages'][0]['revisions']
    revision_list += subquery_revision_list
    
    while True:
    
        if 'continue' not in json_response:
            break
            
        else:
            query_continue = json_response['continue']['rvcontinue']
            query_string = "https://{1}.wikipedia.org/w/api.php?action=query&titles={0}&prop=revisions&rvprop=ids|timestamp|user|size|sha1&rvlimit=500&rvcontinue={2}&rvdir=older&format=json&formatversion=2".format(page_title,lang,query_continue)
            json_response = requests.get(query_string).json()
            subquery_revision_list = json_response['query']['pages'][0]['revisions']
            revision_list += subquery_revision_list
            #time.sleep(1)
    
    df = pd.DataFrame(revision_list)
    df['page'] = page_title
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['date'] = df['timestamp'].apply(lambda x:x.date())
    df['userid'] = df['userid'].fillna(0).apply(lambda x:str(int(x)))
    #df['lang'] = lang
    
    return df

### User contributions

In [411]:
def get_user_contributions(username,lang='en',start=pd.Timestamp('2015-01-01'),stop=pd.Timestamp('2017-11-09'),skip_power=True):
    """Takes Wikipedia username and returns a DataFrame of revisions
    
    username - a string with the title of the page on Wikipedia
    lang - a string (typically two letter ISO 639-1 code) for the language edition,
        defaults to "en"
    start - a datetime or Timestamp for the earliest user contributions to retrieve
    stop - a datetime or Timestamp for the latest user contributions to retrieve
    skip_power = If True, skips users who made more than 500 edits in a month
        
    Returns:
    revision_list - a DataFrame containing the revision meta-data such as 
        parentid, revid,sha1, size, timestamp, and user name
        
    API endpoint docs: https://www.mediawiki.org/wiki/API:Usercontribs
    """
    
    start_utc = datetime.strftime(start, '%Y-%m-%dT%H:%M:%SZ')
    stop_utc = datetime.strftime(stop, '%Y-%m-%dT%H:%M:%SZ')
    
    revision_list = list()
    
    query_string = "https://{1}.wikipedia.org/w/api.php?action=query&list=usercontribs&ucuser={0}&ucprop=ids|title|comment|timestamp|flags|size|sizediff&ucstart={2}&ucstop={3}&uclimit=500&ucdir=newer&format=json&formatversion=2".format(username,lang,start_utc,stop_utc)
    json_response = requests.get(query_string).json()
    subquery_revision_list = json_response['query']['usercontribs']
    
    # If the first 500 edits took place in less than 30 days, we've got ourselves a power user, bot, or cyborg
    earliest_first_500 = pd.to_datetime(json_response['query']['usercontribs'][0]['timestamp'])
    latest_first_500 = pd.to_datetime(json_response['query']['usercontribs'][-1]['timestamp'])
    days_elapsed_first_500 = latest_first_500 - earliest_first_500
    
    if len(subquery_revision_list) == 500 and days_elapsed_first_500 > np.timedelta64(30,'D'):
    
        revision_list += subquery_revision_list

        while True:

            if 'continue' not in json_response:
                break

            else:
                query_continue = json_response['continue']['uccontinue']
                query_string = "https://{1}.wikipedia.org/w/api.php?action=query&list=usercontribs&ucuser={0}&ucprop=ids|title|comment|timestamp|flags|size|sizediff&ucstart={2}&ucstop={3}&uclimit=500&ucdir=newer&uccontinue={4}&format=json&formatversion=2".format(username,lang,start_utc,stop_utc,query_continue)
                json_response = requests.get(query_string).json()
                subquery_revision_list = json_response['query']['usercontribs']
                revision_list += subquery_revision_list
                #time.sleep(1)
    
    elif 'continue' not in json_response:
        
        revision_list += subquery_revision_list
        
    df = pd.DataFrame(revision_list)

    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['date'] = df['timestamp'].apply(lambda x:x.date())
    df['userid'] = df['userid'].fillna(0).apply(lambda x:str(int(x)))

    return df

In [466]:
def get_user_info(username_list,lang='en'):
    """Takes a list of Wikipedia usernames and returns a JSON of their information
    
    username_list - a list of strings for all the usernames
    lang - a string (typically two letter ISO 639-1 code) for the language edition,
        defaults to "en"
        
    Returns:
    users_info - a list of information about users
    
    API endpoint docs: https://www.mediawiki.org/wiki/API:Users
    """
    users_info = []
    
    chunked_username_list = chunk_list(username_list)
    
    for chunk in chunked_username_list:
        usernames = '|'.join(chunk)
        query_string = "https://{1}.wikipedia.org/w/api.php?action=query&list=users&ususers={0}&usprop=blockinfo|groups|editcount|registration|gender&format=json&formatversion=2".format(usernames,lang)
        json_response = requests.get(query_string).json()
        if 'query' in json_response:
            users_info += json_response['query']['users']
    
    return users_info

### Inter-language

In [4]:
def get_interlanguage_links(page_title,lang='en'):
    """The function accepts a page_title and returns a dictionary containing 
    the title of the page in its other languages
       
    page_title - a string with the title of the page on Wikipedia
    lang - a string (typically two letter ISO 639-1 code) for the language edition, 
        defaults to "en"
       
    Returns:
    langlink_dict - a dictionary keyed by lang codes and page title as values
    """
    
    query_string = "https://{1}.wikipedia.org/w/api.php?action=query&format=json&prop=langlinks&formatversion=2&titles={0}&llprop=autonym|langname&lllimit=500".format(page_title,lang)
    json_response = requests.get(query_string).json()
    
    interlanguage_link_dict = dict()
    interlanguage_link_dict['en'] = page_title

    if 'langlinks' in json_response['query']['pages'][0]:
        langlink_dict = json_response['query']['pages'][0]['langlinks']

        for d in langlink_dict:
            lang = d['lang']
            title = d['title']
            interlanguage_link_dict[lang] = title
            
    return interlanguage_link_dict

In [5]:
def get_interlanguage_revisions(page_title,lang='en'):
    """Takes a Wikipedia page title and return the interlanguage revision history
    
    page_title - a string with the title of the page on Wikipedia
    lang - a string (typically two letter ISO 639-1 code) for the language edition,
        defaults to "en"
    
    Returns:
    extlinks_per_lang - a dictionary keyed by language returning a dictionary
        keyed by page title returning a Counter dictionary of external links'
        top-level domains and counts
    """
    revisions_df_dict = {}

    language_titles = get_interlanguage_links(page_title,lang)

    for lang,title in language_titles.items():
        try:
            revisions_df_dict[lang] = get_page_revisions(title,lang)
        
        except KeyboardInterrupt:
            raise
            
        except:
            print("Error getting revisions in {0} version of \"{1}\"".format(lang,title))
            pass
    
    concat_df = pd.concat(revisions_df_dict.values(),keys=revisions_df_dict.keys(),
                          names=['lang','rev_num']).reset_index()
    
    return concat_df

### External links

In [6]:
def get_rev_externallinks(revid,lang='en',redirects=1):
    """Takes a revision id and returns a list of external links on the revision
    
    revid - a numeric revision id as a string
    lang - a string (typically two letter ISO 639-1 code) for the language 
        edition, defaults to "en"
    redirects - 1 or 0 for whether to follow page redirects, defaults to 1
    parse - 1 or 0 for whether to return the raw HTML or paragraph text
    
    Returns:
    str - a list of strings with the URLs
    """
    
    # Get the response from the API for a query
    # After passing a page title, the API returns the HTML markup of the current article version within a JSON payload
    req = requests.get('https://{2}.wikipedia.org/w/api.php?action=parse&format=json&oldid={0}&redirects={1}&prop=externallinks&disableeditsection=1&disabletoc=1'.format(revid,redirects,lang))
    
    # Read the response into JSON to parse and extract the HTML
    json_string = json.loads(req.text)
    
    if 'parse' in json_string.keys():
        if 'externallinks' in json_string['parse']:
            return json_string['parse']['externallinks']

In [7]:
get_rev_externallinks('3953165')

['http://www.nbc.com/nbc/The_Apprentice/bios/Donald_J._Trump.html',
 'http://www.trumponline.com',
 'http://www.trump.com',
 'http://www.askmen.com/men/business_politics/38_donald_trump.html',
 'http://www.who2.com/donaldtrump.html',
 'http://www.bad-bad.com/gesch/d_trump.htm',
 'http://www.rasscass.com/templ/te_bio.php?PID=187&RID=1']

### Content

In [8]:
def get_rev_content(revid,lang='en',redirects=1,parsed_text=1):
    """Takes a revision id and returns a (large) string of the HTML content 
    of the revision.
    
    revid - a numeric revision id as a string
    lang - a string (typically two letter ISO 639-1 code) for the language 
        edition, defaults to "en"
    redirects - 1 or 0 for whether to follow page redirects, defaults to 1
    parse - 1 or 0 for whether to return the raw HTML or paragraph text
    
    Returns:
    str - a (large) string of the content of the revision
    """
    
    bad_titles = ['Special:','Wikipedia:','Help:','Template:','Category:','International Standard','Portal:','s:','File:','Digital object identifier','(page does not exist)']
    
    # Get the response from the API for a query
    # After passing a page title, the API returns the HTML markup of the current article version within a JSON payload
    req = requests.get('https://{2}.wikipedia.org/w/api.php?action=parse&format=json&oldid={0}&redirects={1}&prop=text&disableeditsection=1&disabletoc=1'.format(revid,redirects,lang))
    
    # Read the response into JSON to parse and extract the HTML
    json_string = json.loads(req.text)
    
    if 'parse' in json_string.keys():
        page_html = json_string['parse']['text']['*']

        # Parse the HTML into Beautiful Soup
        soup = BeautifulSoup(page_html,'lxml')
        
        # Remove sections at end
        bad_sections = ['See_also','Notes','References','Bibliography','External_links']
        sections = soup.find_all('h2')
        for section in sections:
            if section.span['id'] in bad_sections:
                
                # Clean out the divs
                div_siblings = section.find_next_siblings('div')
                for sibling in div_siblings:
                    sibling.clear()
                    
                # Clean out the ULs
                ul_siblings = section.find_next_siblings('ul')
                for sibling in ul_siblings:
                    sibling.clear()
        
        # Get all the paragraphs
        paras = soup.find_all('p')
        
        text_list = []
        
        for para in paras:
            if parsed_text:
                _s = para.text
                # Remove the citations
                _s = re.sub(r'\[[0-9]+\]','',_s)
                text_list.append(_s)
            else:
                text_list.append(str(para))
        
        return '\n'.join(text_list)

In [9]:
def get_rev_markup(revid,lang='en',redirects=1,parsed_text=1):
    """Takes a revision id and returns a (large) string of the HTML content 
    of the revision.
    
    revid - a numeric revision id as a string
    lang - a string (typically two letter ISO 639-1 code) for the language 
        edition, defaults to "en"
    redirects - 1 or 0 for whether to follow page redirects, defaults to 1
    parse - 1 or 0 for whether to return the raw HTML or paragraph text
    
    Returns:
    str - a (large) string of the content of the revision
    """
    
    bad_titles = ['Special:','Wikipedia:','Help:','Template:','Category:','International Standard','Portal:','s:','File:','Digital object identifier','(page does not exist)']
    
    # Get the response from the API for a query
    # After passing a page title, the API returns the HTML markup of the current article version within a JSON payload
    req = requests.get('https://{2}.wikipedia.org/w/api.php?action=parse&format=json&oldid={0}&redirects={1}&prop=text&disableeditsection=1&disabletoc=1'.format(revid,redirects,lang))
    
    # Read the response into JSON to parse and extract the HTML
    json_string = json.loads(req.text)
    
    if 'parse' in json_string.keys():
        page_html = json_string['parse']['text']['*']

        # Parse the HTML into Beautiful Soup
        soup = BeautifulSoup(page_html,'lxml')
        
        return str(soup)

In [10]:
def get_rev_outlinks(revid,lang='en',redirects=1):
    """Takes a page title and returns a list of wiki-links on the page. The 
    list may contain duplicates and the position in the list is approximately 
    where the links occurred.
    
    revid - a numeric revision id as a string
    lang - a string (typically two letter ISO 639-1 code) for the language 
        edition, defaults to "en"
    redirects - 1 or 0 for whether to follow page redirects, defaults to 1
    
    Returns:
    outlinks_per_lang - a dictionary keyed by language returning a dictionary 
        keyed by page title returning a list of outlinks
    """
        
    bad_titles = ['Special:','Wikipedia:','Help:','Template:','Category:','International Standard','Portal:','s:','File:','Digital object identifier','(page does not exist)']
    
    # Get the response from the API for a query
    # After passing a page title, the API returns the HTML markup of the current article version within a JSON payload
    req = requests.get('https://{2}.wikipedia.org/w/api.php?action=parse&format=json&oldid={0}&redirects={1}&prop=text&disableeditsection=1&disabletoc=1'.format(revid,redirects,lang))
    
    # Read the response into JSON to parse and extract the HTML
    json_string = json.loads(req.text)
    
    # Initialize an empty list to store the links
    outlinks_list = [] 
    
    if 'parse' in json_string.keys():
        page_html = json_string['parse']['text']['*']

        # Parse the HTML into Beautiful Soup
        soup = BeautifulSoup(page_html,'lxml')
        
        # Remove sections at end
        bad_sections = ['See_also','Notes','References','Bibliography','External_links']
        sections = soup.find_all('h2')
        for section in sections:
            if section.span['id'] in bad_sections:
                
                # Clean out the divs
                div_siblings = section.find_next_siblings('div')
                for sibling in div_siblings:
                    sibling.clear()
                    
                # Clean out the ULs
                ul_siblings = section.find_next_siblings('ul')
                for sibling in ul_siblings:
                    sibling.clear()

        # Delete tags associated with templates
        for tag in soup.find_all('tr'):
            tag.replace_with('')

        # For each paragraph tag, extract the titles within the links
        for para in soup.find_all('p'):
            for link in para.find_all('a'):
                if link.has_attr('title'):
                    title = link['title']
                    # Ignore links that aren't interesting or are redlinks
                    if all(bad not in title for bad in bad_titles) and 'redlink' not in link['href']:
                        outlinks_list.append(title)

        # For each unordered list, extract the titles within the child links
        for unordered_list in soup.find_all('ul'):
            for item in unordered_list.find_all('li'):
                for link in item.find_all('a'):
                    if link.has_attr('title'):
                        title = link['title']
                        # Ignore links that aren't interesting or are redlinks
                        if all(bad not in title for bad in bad_titles) and 'redlink' not in link['href']:
                            outlinks_list.append(title)

    return outlinks_list

In [11]:
def get_page_outlinks(page_title,lang='en',redirects=1):
    """Takes a page title and returns a list of wiki-links on the page. The 
    list may contain duplicates and the position in the list is approximately 
    where the links occurred.
    
    page_title - a string with the title of the page on Wikipedia
    lang - a string (typically two letter ISO 639-1 code) for the language 
        edition, defaults to "en"
    redirects - 1 or 0 for whether to follow page redirects, defaults to 1
    
    Returns:
    outlinks_per_lang - a dictionary keyed by language returning a dictionary 
        keyed by page title returning a list of outlinks
    """
    
    # Replace spaces with underscores
    page_title = page_title.replace(' ','_')
    
    bad_titles = ['Special:','Wikipedia:','Help:','Template:','Category:','International Standard','Portal:','s:','File:','Digital object identifier','(page does not exist)']
    
    # Get the response from the API for a query
    # After passing a page title, the API returns the HTML markup of the current article version within a JSON payload
    req = requests.get('https://{2}.wikipedia.org/w/api.php?action=parse&format=json&page={0}&redirects={1}&prop=text&disableeditsection=1&disabletoc=1'.format(page_title,redirects,lang))
    
    # Read the response into JSON to parse and extract the HTML
    json_string = json.loads(req.text)
    
    # Initialize an empty list to store the links
    outlinks_list = [] 
    
    if 'parse' in json_string.keys():
        page_html = json_string['parse']['text']['*']

        # Parse the HTML into Beautiful Soup
        soup = BeautifulSoup(page_html,'lxml')
        
        # Remove sections at end
        bad_sections = ['See_also','Notes','References','Bibliography','External_links']
        sections = soup.find_all('h2')
        for section in sections:
            if section.span['id'] in bad_sections:
                
                # Clean out the divs
                div_siblings = section.find_next_siblings('div')
                for sibling in div_siblings:
                    sibling.clear()
                    
                # Clean out the ULs
                ul_siblings = section.find_next_siblings('ul')
                for sibling in ul_siblings:
                    sibling.clear()
        
        # Delete tags associated with templates
        for tag in soup.find_all('tr'):
            tag.replace_with('')

        # For each paragraph tag, extract the titles within the links
        for para in soup.find_all('p'):
            for link in para.find_all('a'):
                if link.has_attr('title'):
                    title = link['title']
                    # Ignore links that aren't interesting or are redlinks
                    if all(bad not in title for bad in bad_titles) and 'redlink' not in link['href']:
                        outlinks_list.append(title)

        # For each unordered list, extract the titles within the child links
        for unordered_list in soup.find_all('ul'):
            for item in unordered_list.find_all('li'):
                for link in item.find_all('a'):
                    if link.has_attr('title'):
                        title = link['title']
                        # Ignore links that aren't interesting or are redlinks
                        if all(bad not in title for bad in bad_titles) and 'redlink' not in link['href']:
                            outlinks_list.append(title)

    return outlinks_list

### Categories

In [12]:
def get_category_memberships(page_title,lang='en'):
    """The function accepts a page_title and returns a list of categories
    the page is a member of
    
    category_title - a string of the page name
    
    Returns:
    members - a list containing strings of the categories of which the page is a mamber
    
    """
    _S="https://{1}.wikipedia.org/w/api.php?action=query&prop=categories&titles={0}&clprop=timestamp&clshow=!hidden&cllimit=500&format=json&formatversion=2".format(page_title,lang)
    json_response = requests.get(_S).json()

    categories = list()

    if 'pages' in json_response['query']:
        if 'categories' in json_response['query']['pages'][0]:
            for category in json_response['query']['pages'][0]['categories']:
                title = category['title']#.split(':')[1]
                categories.append(title)
                #timestamp = category['timestamp']
                #categories.append({title:timestamp})
            
    return categories

In [13]:
def get_category_subcategories(category_title,lang='en'):
    """The function accepts a category_title and returns a list of the category's sub-categories
    
    category_title - a string (including "Category:" prefix) of the category name
    lang - a string (typically two letter ISO 639-1 code) for the language edition,
        defaults to "en"
    
    Returns:
    members - a list containing strings of the sub-categories in the category
    
    """
    # Replace spaces with underscores
    category_title = category_title.replace(' ','_')
    
    # Make sure "Category:" appears in the title
    if 'Category:' not in category_title:
        category_title = 'Category:' + category_title
        
    _S="https://{1}.wikipedia.org/w/api.php?action=query&list=categorymembers&cmtitle={0}&cmtype=subcat&cmprop=title&cmlimit=500&format=json&formatversion=2".format(category_title,lang)
    json_response = requests.get(_S).json()

    members = list()
    
    if 'categorymembers' in json_response['query']:
        for member in json_response['query']['categorymembers']:
            members.append(member['title'])
            
    return members

In [14]:
def get_category_members(category_title,depth=1,lang='en'):
    """The function accepts a category_title and returns a list of category members
    
    category_title - a string (including "Category:" prefix) of the category name
    lang - a string (typically two letter ISO 639-1 code) for the language edition,
        defaults to "en"
    
    Returns:
    members - a list containing strings of the page titles in the category
    
    """
    # Replace spaces with underscores
    category_title = category_title.replace(' ','_')
    
    # Make sure "Category:" appears in the title
    if 'Category:' not in category_title:
        category_title = 'Category:' + category_title
    
    _S="https://{1}.wikipedia.org/w/api.php?action=query&list=categorymembers&cmtitle={0}&cmprop=title&cmnamespace=0&cmlimit=500&format=json&formatversion=2".format(category_title,lang)
    json_response = requests.get(_S).json()

    members = list()
    
    if depth < 0:
        return members
    
    if 'categorymembers' in json_response['query']:
        for member in json_response['query']['categorymembers']:
            members.append(member['title'])
            
    subcats = get_category_subcategories(category_title,lang=lang)
    
    for subcat in subcats:
        members += get_category_members(subcat,depth-1)
            
    return members

### External links

In [15]:
def get_external_links(page_title,lang='en'):
    external_links = list()
    
    query_string = "https://{1}.wikipedia.org/w/api.php?action=query&titles={0}&prop=extlinks&ellimit=500&format=json&formatversion=2".format(page_title,lang)
    json_response = requests.get(query_string).json()
    
    if 'missing' not in json_response['query']['pages'][0] and 'extlinks' in json_response['query']['pages'][0]:
        extlinks = json_response['query']['pages'][0]['extlinks']
        
        # Clean the extlinks
        cleaned_extlinks = list()
        
        for l in extlinks:
            if 'web.archive.org' in l['url']: # Internet Archives have two https in them, get the second
                raw_url = 'http://' + l['url'].split('/http://')[1]
            else:
                raw_url = l['url']
            
            # Try to use the tldextract function, otherwise fall back to urlparse
            try:
                netloc = "{0}.{1}".format(tldextract.extract(raw_url).domain, tldextract.extract(raw_url).suffix)
            except:
                netloc = urlparse(raw_url).netloc
                
            external_links.append(netloc)
    
    return external_links

### Redirects linking to a page

In [16]:
def get_redirects_linking_here(page_title,lang='en',namespace=0):
    """Takes a page title and returns a list of redirects linking to the page
    
    page_title - a string with the title of the page on Wikipedia
    lang - a string (typically two letter ISO 639-1 code) for the language 
        edition, defaults to "en"
    namespace - limit to pages from a specific namespace, defaults to 0
    
    Returns:
    linkshere - a list of strings with the redirect titles
    """
    
    # Get the response from the API for a query
    # After passing a page title, the API returns the HTML markup of the current article version within a JSON payload
    
    lh_list = list()
    
    query_string = 'https://{1}.wikipedia.org/w/api.php?action=query&titles={0}&prop=linkshere&lhprop=title|redirect&lhnamespace={2}&lhshow=redirect&lhlimit=500&format=json&formatversion=2'.format(page_title,lang,namespace)
    json_response = requests.get(query_string).json()
    
    if 'linkshere' in json_response['query']['pages'][0]:
        subquery_lh_list = json_response['query']['pages'][0]['linkshere']
        lh_list += subquery_lh_list
    
        while True:

            if 'continue' not in json_response:
                break

            else:
                query_continue = json_response['continue']['lhcontinue']
                query_string = 'https://{1}.wikipedia.org/w/api.php?action=query&titles={0}&lhcontinue={3}&prop=linkshere&lhprop=title|redirect&lhnamespace={2}&lhshow=redirect&lhlimit=500&format=json&formatversion=2'.format(page_title,lang,namespace,query_continue)
                json_response = requests.get(query_string).json()
                subquery_lh_list = json_response['query']['pages'][0]['linkshere']
                lh_list += subquery_lh_list
    
    return [i['title'] for i in lh_list]

### Log events 

In [17]:
def get_log_events(page_title,lang='en'):
    """Takes Wikipedia page title and returns a list of revisions
    
    page_title - a string with the title of the page on Wikipedia
    lang - a string (typically two letter ISO 639-1 code) for the language edition,
        defaults to "en"
        
    Returns:
    revision_list - a list of dictionaries, where each dictionary is the revision
        meta-data susch as parentid, revid,sha1, size, timestamp, and user name
    """
    
    event_list = list()
    
    query_string = "https://{1}.wikipedia.org/w/api.php?action=query&letitle={0}&list=logevents&leprop=ids|title|type|user|userid|timestamp|comment|tags&lelimit=500&format=json&formatversion=2".format(page_title,lang)
    json_response = requests.get(query_string).json()

    subquery_revision_list = json_response['query']['logevents']
    event_list += subquery_revision_list
    
    while True:
    
        if 'continue' not in json_response:
            break
            
        else:
            query_continue = json_response['continue']['lecontinue']
            query_string = "https://{1}.wikipedia.org/w/api.php?action=query&letitle={0}&list=logevents&leprop=ids|title|type|user|userid|timestamp|comment|tags&lelimit=500&lecontinue={2}&format=json&formatversion=2".format(page_title,lang,query_continue)
            json_response = requests.get(query_string).json()
            subquery_revision_list = json_response['query']['logevents']
            event_list += subquery_revision_list
            #time.sleep(1)
    
    df = pd.DataFrame(event_list)
    df['page'] = page_title
    
    if 'timestamp' in df.columns:
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        df['date'] = df['timestamp'].apply(lambda x:x.date())
    
    if 'userid' in df.columns:
        df['userid'] = df['userid'].fillna(0).apply(lambda x:str(int(x)))
    #df['lang'] = lang
    
    return df

### Pageviews

In [211]:
def get_pageviews(page_title,lang='en',date_from='20150701',date_to=str(datetime.today().date()).replace('-','')):
    """Takes Wikipedia page title and returns a all the various pageview records
    
    page_title - a string with the title of the page on Wikipedia
    lang - a string (typically two letter ISO 639-1 code) for the language edition,
        defaults to "en"
        datefrom - a date string in a YYYYMMDD format, defaults to 20150701
        dateto - a date string in a YYYYMMDD format, defaults to today
        
    Returns:
    revision_list - a DataFrame indexed by date and multi-columned by agent and access type
    """
    quoted_page_title = quote(page_title, safe='')
    
    df_list = []
    for access in ['all-access','desktop','mobile-app','mobile-web']:
        for agent in ['all-agents','user','spider','bot']:
            s = "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/{1}.wikipedia.org/{2}/{3}/{0}/daily/{4}/{5}".format(quoted_page_title,lang,access,agent,date_from,date_to)
            json_response = requests.get(s).json()
            df = pd.DataFrame(json_response['items'])
            df_list.append(df)

    concat_df = pd.concat(df_list)
    concat_df['timestamp'] = pd.to_datetime(concat_df['timestamp'],format='%Y%m%d%H')
    concat_df = concat_df.set_index(['timestamp','agent','access'])['views'].unstack([1,2]).sort_index(axis=1)
    concat_df[('page','page')] = page_title
    return concat_df

In [212]:
_pv = get_pageviews('Donald Trump')
_pv.head()

agent,all-agents,all-agents,all-agents,all-agents,bot,bot,bot,bot,spider,spider,spider,spider,user,user,user,user,page
access,all-access,desktop,mobile-app,mobile-web,all-access,desktop,mobile-app,mobile-web,all-access,desktop,mobile-app,mobile-web,all-access,desktop,mobile-app,mobile-web,page
timestamp,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2
2015-07-01,82159,35886,1292,44981,0,0,0,0,410,390,0,20,81749,35496,1292,44961,Donald Trump
2015-07-02,83458,37554,1453,44451,0,0,0,0,403,365,0,38,83055,37189,1453,44413,Donald Trump
2015-07-03,55615,22154,997,32464,0,0,0,0,441,419,0,22,55174,21735,997,32442,Donald Trump
2015-07-04,43865,16640,795,26430,0,0,0,0,492,469,0,23,43373,16171,795,26407,Donald Trump
2015-07-05,42220,16158,819,25243,0,0,0,0,306,291,0,15,41914,15867,819,25228,Donald Trump


# Start crawling

Get all the pages related to the Trump family.

In [193]:
trump_category_members = get_category_members('Category:Trump_family',depth=5)
print("There are {0:,} members of the 'Trump Family' category on English Wikipedia".format(len(set(trump_category_members))))

unique_trump_category_members = list(set(trump_category_members))

with open('trump_category_members.json','w') as f:
    json.dump(unique_trump_category_members,f)

There are 949 members of the 'Trump Family' category on English Wikipedia


In [24]:
clinton_category_members = get_category_members('Category:Family_of_Bill_and_Hillary_Clinton',depth=5)
print("There are {0:,} members of the 'Clinton Family' category on English Wikipedia".format(len(set(clinton_category_members))))

unique_clinton_category_members = list(set(clinton_category_members))

with open('clinton_category_members.json','w') as f:
    json.dump(unique_clinton_category_members,f)

There are 1,340 members of the 'Clinton Family' category on English Wikipedia


### Revisions
Get the revisions for each of these pages.

In [77]:
with open('trump_category_members.json','r') as f:
    unique_trump_category_members = json.load(f)
    
with open('clinton_category_members.json','r') as f:
    unique_clinton_category_members = json.load(f)

In [25]:
error_pages = []

for page in unique_trump_category_members:
#for page in unique_clinton_category_members:
    try:
        _df = get_page_revisions(page)
        renamed_page = page.replace(' ','_').replace('/','-').replace(':','_-').replace('?','')
        #_df.to_csv(_dir+'Data/Clinton/Revisions/{0}.csv'.format(renamed_page),encoding='utf8',index=False)
        _df.to_csv(_dir+'Data/Trump/Revisions/{0}.csv'.format(renamed_page),encoding='utf8',index=False)

    except KeyboardInterrupt:
        break
    except:
        print("Error on \"{0}\"".format(page))
        error_pages.append(page)
        pass

Error on "President Forever 2008 + Primaries"
Error on "Robert "Say" McIntosh"
Error on "South Park: Bigger, Longer & Uncut"


Fix errors.

In [248]:
error_pages += [i for i in unique_trump_category_members if ':' in i]
#error_pages += [i for i in unique_clinton_category_members if ':' in i]

for page in error_pages:
    _df = get_page_revisions(page)
    renamed_page = page.replace(' ','_').replace('/','-').replace(':','_-').replace('?','')
    _df.to_csv(_dir+'Data/Trump/Revisions/{0}.csv'.format(renamed_page),encoding='utf8',index=False)

Concatenate together.

In [415]:
clinton_revision_df_list = []
clinton_revision_files = os.listdir(_dir+'Data/Clinton/Revisions/')

for f in clinton_revision_files:
    try:
        df = pd.read_csv(_dir + 'Data/Clinton/Revisions/{0}'.format(f),engine='python',parse_dates=['timestamp','date'])
        clinton_revision_df_list.append(df)
    except KeyboardInterrupt:
        break
    except:
        print("Error on \"{0}\"".format(f))
        pass
    
all_clinton_revisions_df = pd.concat(clinton_revision_df_list)
all_clinton_revisions_df.reset_index(drop=True,inplace=True)

all_clinton_revisions_df['year'] = all_clinton_revisions_df['date'].apply(lambda x:x.year)
all_clinton_revisions_df['month'] = all_clinton_revisions_df['date'].apply(lambda x:x.month)
all_clinton_revisions_df['anon'].fillna(False,inplace=True)
all_clinton_revisions_df.sort_values(['page','timestamp'],ascending=True,inplace=True)
all_clinton_revisions_df.reset_index(inplace=True,drop=True)
all_clinton_revisions_df['size_diff'] = all_clinton_revisions_df.groupby('page')['size'].diff()
all_clinton_revisions_df['prev_user'] = all_clinton_revisions_df.groupby('page')['user'].shift()
all_clinton_revisions_df['rev_index'] = all_clinton_revisions_df.groupby('page')['timestamp'].apply(lambda x:x.argsort())
all_clinton_revisions_df['age'] = all_clinton_revisions_df.groupby('page')['timestamp'].apply(lambda x:round((x-x.min())/pd.Timedelta(1,'d'),0))
all_clinton_revisions_df['latency'] = all_clinton_revisions_df.groupby('page')['timestamp'].diff().apply(lambda x:x/pd.Timedelta(1,'s'))

#all_page_revisions_df.to_csv('all_trump_page_revisions.csv',encoding='utf8',index=False)
all_clinton_revisions_df.to_csv('all_clinton_page_revisions.csv',encoding='utf8',index=False)

all_clinton_revisions_df.head()

Unnamed: 0,anon,comment,commenthidden,date,page,parentid,revid,sha1,sha1hidden,size,suppressed,timestamp,user,userhidden,userid,year,month,size_diff,prev_user,rev_index,age,latency
0,False,,,2004-02-05,104th United States Congress,0,2322487,5d453b39808f887cf05cbda0875b086891e57b45,,19002,,2004-02-05 23:44:02,Seth Ilys,,0,2004,2,,,0,0.0,
1,False,,,2004-02-07,104th United States Congress,2322487,2348088,7eabcf8391925ce3efbf44d39d00411ba3c505de,,19030,,2004-02-07 01:07:11,Fabiform,,0,2004,2,28.0,Seth Ilys,1,1.0,91389.0
2,False,,,2004-02-09,104th United States Congress,2348088,2547608,7956424fe9b3195797bd3fd25234b37392e25009,,19451,,2004-02-09 20:33:48,Rmhermen,,0,2004,2,421.0,Fabiform,2,4.0,242797.0
3,False,,,2004-02-27,104th United States Congress,2547608,2915574,9c192346c871dc8882e6866290662300d2f484cd,,19478,,2004-02-27 08:50:35,Vardion,,0,2004,2,27.0,Rmhermen,3,21.0,1513007.0
4,True,,,2004-03-25,104th United States Congress,2915574,3475257,0cc6db379947ea9067c5433af6a08ea2f6db7c6d,,19485,,2004-03-25 18:44:40,66.167.49.186,,0,2004,3,7.0,Vardion,4,49.0,2368445.0


In [416]:
trump_revision_df_list = []
trump_revision_files = os.listdir(_dir+'Data/Trump/Revisions/')

for f in trump_revision_files:
    try:
        df = pd.read_csv(_dir + 'Data/Trump/Revisions/{0}'.format(f),engine='python',parse_dates=['timestamp','date'])
        trump_revision_df_list.append(df)
    except KeyboardInterrupt:
        break
    except:
        print("Error on \"{0}\"".format(f))
        pass
    
all_trump_revisions_df = pd.concat(trump_revision_df_list)
all_trump_revisions_df.reset_index(drop=True,inplace=True)

all_trump_revisions_df['year'] = all_trump_revisions_df['date'].apply(lambda x:x.year)
all_trump_revisions_df['month'] = all_trump_revisions_df['date'].apply(lambda x:x.month)
all_trump_revisions_df['anon'].fillna(False,inplace=True)
all_trump_revisions_df.sort_values(['page','timestamp'],ascending=True,inplace=True)
all_trump_revisions_df.reset_index(inplace=True,drop=True)
all_trump_revisions_df['size_diff'] = all_trump_revisions_df.groupby('page')['size'].diff()
all_trump_revisions_df['prev_user'] = all_trump_revisions_df.groupby('page')['user'].shift()
all_trump_revisions_df['rev_index'] = all_trump_revisions_df.groupby('page')['timestamp'].apply(lambda x:x.argsort())
all_trump_revisions_df['age'] = all_trump_revisions_df.groupby('page')['timestamp'].apply(lambda x:round((x-x.min())/pd.Timedelta(1,'d'),0))
all_trump_revisions_df['latency'] = all_trump_revisions_df.groupby('page')['timestamp'].diff().apply(lambda x:x/pd.Timedelta(1,'s'))

all_trump_revisions_df.to_csv('all_trump_page_revisions.csv',encoding='utf8',index=False)

all_trump_revisions_df.head()

Unnamed: 0,anon,comment,commenthidden,date,page,parentid,revid,sha1,sha1hidden,size,suppressed,timestamp,user,userhidden,userid,year,month,size_diff,prev_user,rev_index,age,latency
0,False,,,2016-07-30,/r/The Donald,0,732214190,4e740bd73bb2774f20bd07e47c82aecef915d2db,,42,,2016-07-30 11:42:31,Nyuszika7H,,0,2016,7,,,0,0.0,
1,False,,,2016-07-30,/r/The Donald,732214190,732214592,2f0d2e463baf26803e343652b3f9bf5cfed3d552,,87,,2016-07-30 11:46:38,Nyuszika7H,,0,2016,7,45.0,Nyuszika7H,1,0.0,247.0
2,False,,,2016-09-16,/r/The Donald,732214592,739695016,399fd716f6bea87cfe8d4378d8bca16f4fcfece3,,139,,2016-09-16 11:13:20,RussBot,,0,2016,9,52.0,Nyuszika7H,2,48.0,4145202.0
3,False,,,2016-10-08,/r/The Donald,739695016,743240915,1aca16fca4e79cf715ce3970edebafaebec922a4,,223,,2016-10-08 18:38:31,Paine Ellsworth,,0,2016,10,84.0,RussBot,3,70.0,1927511.0
4,False,,,2016-11-24,/r/The Donald,743240915,751319407,b7e9a0ae82d2f2337c03c10adbebcee2e7b0c3f2,,5309,,2016-11-24 21:31:53,Yoshiman6464,,0,2016,11,5086.0,Paine Ellsworth,4,117.0,4071202.0


### User contributions

In [535]:
all_clinton_subdf = all_clinton_revisions_df[(~all_clinton_revisions_df['anon']) & (~all_clinton_revisions_df['user'].isin(bot_list)) & (all_clinton_revisions_df['timestamp'] >= pd.Timestamp('2015-01-01')) & (all_clinton_revisions_df['timestamp'] <= pd.Timestamp('2017-11-09'))]
all_trump_subdf = all_trump_revisions_df[(~all_trump_revisions_df['anon']) & (~all_trump_revisions_df['user'].isin(bot_list))  & (all_trump_revisions_df['timestamp'] >= pd.Timestamp('2015-01-01')) & (all_trump_revisions_df['timestamp'] <= pd.Timestamp('2017-11-09'))]
both_subdf = pd.concat([all_clinton_subdf,all_trump_subdf])

all_clinton_articles = all_clinton_subdf['page'].unique()
all_trump_articles = all_trump_revisions_df['page'].unique()

#all_clinton_subdf_user_agg = all_clinton_subdf.groupby('user').agg({'sha1':pd.Series.nunique,'page':pd.Series.nunique,'timestamp':lambda x:(x.max() - x.min())/np.timedelta64(1,'D')})
#all_trump_subdf_user_agg = all_trump_subdf.groupby('user').agg({'sha1':pd.Series.nunique,'page':pd.Series.nunique,'timestamp':lambda x:(x.max() - x.min())/np.timedelta64(1,'D')})
both_subdf_user_agg = both_subdf.groupby('user').agg({'sha1':pd.Series.nunique,'page':pd.Series.nunique,'timestamp':lambda x:(x.max() - x.min())/np.timedelta64(1,'D')})

sha1_threshold = 5
page_threshold = 3
time_threshold = 1
#active_clinton_users_df = all_clinton_subdf_user_agg[(all_clinton_subdf_user_agg['sha1'] >= sha1_threshold) & (all_clinton_subdf_user_agg['page'] >= page_threshold) & (all_clinton_subdf_user_agg['timestamp'] >= time_threshold)]
#active_trump_users_df = all_trump_subdf_user_agg[(all_trump_subdf_user_agg['sha1'] >= sha1_threshold) & (all_trump_subdf_user_agg['page'] >= page_threshold) & (all_trump_subdf_user_agg['timestamp'] >= time_threshold)]
both_active_users_df = both_subdf_user_agg[(both_subdf_user_agg['sha1'] >= sha1_threshold) & (both_subdf_user_agg['page'] >= page_threshold) & (both_subdf_user_agg['timestamp'] >= time_threshold)]

clinton_active_users = both_subdf[(both_subdf['page'].isin(all_clinton_articles)) & (both_subdf['user'].isin(both_active_users_df.index))]['user'].unique()
trump_active_users = both_subdf[(both_subdf['page'].isin(all_trump_articles)) & (both_subdf['user'].isin(both_active_users_df.index))]['user'].unique()

#print("There are {0:,} active Clinton users.\nThere are {1:,} active Trump users\nThere are {2:,} in both sets.".format(len(active_clinton_users),len(active_trump_users),len(both_active_users_df)))
print("There are {0:,} active Clinton users.\nThere are {1:,} active Trump users\nThere are {2:,} in both sets.".format(len(clinton_active_users),len(trump_active_users),len(both_active_users_df)))

with open('active_users_clinton.json','w') as f:
    json.dump(list(clinton_active_users),f)
    
with open('active_users_trump.json','w') as f:
    json.dump(list(trump_active_users),f)
    
with open('active_users_both.json','w') as f:
    json.dump(list(both_active_users_df.index),f)

There are 2,467 active Clinton users.
There are 2,969 active Trump users
There are 3,016 in both sets.


In [536]:
len(set(clinton_active_users) & set(trump_active_users))

2420

Load it from disk if you haven't done all the crawling from above.

In [477]:
#with open('clinton_active_users.json','r') as f:
#    active_clinton_users = json.load(f)
    
#with open('trump_active_users.json','r') as f:
#    active_trump_users = json.load(f)

with open('active_users_both.json','r') as f:
    active_both_users = json.load(f)

Get top user information.

In [478]:
#active_clinton_userinfo = get_user_info(active_clinton_users)
#active_trump_userinfo = get_user_info(active_trump_users)

active_both_userinfo = get_user_info(active_both_users)

#with open('userinfo_active_users_clinton.json','w') as f:
#    json.dump(active_clinton_users,f)
#with open('userinfo_active_users_trump.json','w') as f:
#    json.dump(active_trump_userinfo,f)

with open('userinfo_active_users_both.json','w') as f:
    json.dump(active_both_userinfo,f)

In [479]:
#with open('userinfo_active_users_clinton.json','w') as f:
#    json.dump(active_clinton_users,f)
    
#with open('userinfo_active_users_trump.json','w') as f:
#    json.dump(active_trump_userinfo,f)

with open('userinfo_active_users_both.json','w') as f:
    json.dump(active_both_userinfo,f)

In [509]:
[i for i in active_both_userinfo if i['name'] == "Archer Rafferty"]

[{'blockedby': 'Someguy1221',
  'blockedbyid': 3315180,
  'blockedtimestamp': '2017-01-10T22:49:46Z',
  'blockexpiry': 'infinity',
  'blockid': 7125860,
  'blockreason': '[[WP:3RR|3RR]] violation on [[/r/The Donald]]',
  'editcount': 153,
  'gender': 'unknown',
  'groups': ['*', 'user', 'autoconfirmed'],
  'name': 'Archer Rafferty',
  'registration': '2016-08-10T12:14:24Z',
  'userid': 28931392}]

In [513]:
username = "Archer Rafferty"
lang = "en"
start,stop = pd.Timestamp('2013-01-01'), pd.Timestamp('2017-11-09')
start_utc = datetime.strftime(start, '%Y-%m-%dT%H:%M:%SZ')
stop_utc = datetime.strftime(stop, '%Y-%m-%dT%H:%M:%SZ')

query_string = "https://{1}.wikipedia.org/w/api.php?action=query&list=usercontribs&ucuser={0}&ucprop=ids|title|comment|timestamp|flags|size|sizediff&ucstart={2}&ucstop={3}&uclimit=500&ucdir=newer&format=json&formatversion=2".format(username,lang,start_utc,stop_utc)
json_response = requests.get(query_string).json()
subquery_revision_list = json_response['query']['usercontribs']
    
revision_list = list()

# If the first 500 edits took place in less than 30 days, we've got ourselves a power user, bot, or cyborg
earliest_first_500 = pd.to_datetime(json_response['query']['usercontribs'][0]['timestamp'])
latest_first_500 = pd.to_datetime(json_response['query']['usercontribs'][-1]['timestamp'])
days_elapsed_first_500 = latest_first_500 - earliest_first_500

if len(subquery_revision_list) == 500 and days_elapsed_first_500 > np.timedelta64(30,'D'):

    revision_list += subquery_revision_list

    while True:

        if 'continue' not in json_response:
            break

        else:
            query_continue = json_response['continue']['uccontinue']
            query_string = "https://{1}.wikipedia.org/w/api.php?action=query&list=usercontribs&ucuser={0}&ucprop=ids|title|comment|timestamp|flags|size|sizediff&ucstart={2}&ucstop={3}&uclimit=500&ucdir=newer&uccontinue={4}&format=json&formatversion=2".format(username,lang,start_utc,stop_utc,query_continue)
            json_response = requests.get(query_string).json()
            subquery_revision_list = json_response['query']['usercontribs']
            revision_list += subquery_revision_list
            #time.sleep(1)

elif 'continue' not in json_response:

    revision_list += subquery_revision_list

Get top user contributions.

In [518]:
usercontribs_dict = {}
#usercontribs_errors = []

for i,_payload in enumerate(active_both_userinfo[165:]):
    if 'name' in _payload:
        try:
            _username = _payload['name']
            _id = _payload['userid']
            _df = get_user_contributions(_username,start=pd.Timestamp('2013-01-01'),stop=pd.Timestamp('2017-11-09'))
            _df.to_csv(_dir + 'Data/Users/{0}.csv'.format(_id))
            time.sleep(.5)
            usercontribs_dict[_username] = _df
            if i % 300 == 0:
                print("At position: {0}".format(i))
        except KeyboardInterrupt:
            break
        except:
            print("Error on \"{0}\"".format(_payload['name']))
            usercontribs_errors.append(_payload['name'])
            pass

At position: 0
Error on "Arms"
Error on "Arxiloxos"
Error on "Asdasdasdff"
Error on "Ashvio"
Error on "Aspects"
Error on "AusLondonder"
Error on "AussieLegend"
Error on "AustralianRupert"
Error on "Averageguy007"
Error on "Azwu"
Error on "BD2412"
Error on "BDD"
Error on "BURSTHON3"
Error on "BabbaQ"
Error on "Backendgaming"
Error on "BaldBoris"
Error on "Barek"
Error on "BattlegroundsGames"
Error on "Bbb23"
Error on "Beaglemix"
Error on "Bearcat"
Error on "Bender the Bot"
Error on "Bender235"
Error on "Bennettn1997"
Error on "Benshim333"
Error on "BethNaught"
Error on "Beyond My Ken"
Error on "Bgwhite"
Error on "Bigmdude1"
Error on "BilCat"
Error on "Billy Hathorn"
Error on "Binksternet"
Error on "Biosthmors"
Error on "BizarreLoveTriangle"
Error on "Bjhillis"
Error on "Bkonrad"
Error on "BlackGhost2280"
Error on "BlackTerror"
Error on "BlazeKing252"
Error on "Blazoaustin"
Error on "Blb226"
Error on "BlobBlob98"
Error on "Bluehotel"
Error on "Bmclaughlin9"
Error on "Bobby232332"
Error o

Error on "Kookster66"
Error on "Ksenia2006"
Error on "Ksenia2727"
Error on "Kuioooooo"
Error on "KyleSolo2"
Error on "KylieTastic"
Error on "LDMaster1998"
Error on "La-Li-Lu-Le-Lo"
Error on "LacrimosaDiesIlla"
Error on "LadyofShalott"
Error on "Lance386"
Error on "Landingdude13"
Error on "LaughingNx"
Error on "Lectonar"
Error on "LeeBobBlack"
Error on "Leof616"
Error on "Lepricavark"
Error on "Lighthouse3050"
Error on "Lihaas"
Error on "LikkerdySplit"
Error on "Lilahdog568"
Error on "LittleWink"
Error on "Liz"
Error on "Localemediamonitor"
Error on "Lockley"
Error on "Look2See1"
Error on "Lotje"
Error on "Lourdes"
Error on "Luckiest0522"
Error on "Luckycat092710"
Error on "Ludwig van Mozart"
Error on "Lugnuts"
Error on "LukeStuartStar"
Error on "Lumbering in thought"
Error on "MX"
Error on "MZMcBride"
Error on "Machucanator1000"
Error on "Magic links bot"
Error on "Magicpotato123"
Error on "Magioladitis"
Error on "Majora"
Error on "Make America Great Again"
Error on "Malcolmxl5"
Error 

Error on "Usernamen1"
Error on "Utbindas"
Error on "Velostodon"
Error on "Versus001"
Error on "Vidatafazoli"
Error on "Vincelord"
Error on "Vincent5"
Error on "ViperSnake151"
Error on "Vjmlhds"
Error on "VoltaireEditor2016"
Error on "Volunteer Marek"
Error on "Vrrajkum"
Error on "Vsmith"
Error on "WWGB"
Error on "Waffles9761"
Error on "Washingtonediter"
Error on "Wavelength"
Error on "Wayne Elgin"
Error on "Wbm1058"
Error on "WeaponOfChoice1"
Error on "Weather28540"
Error on "WereSpielChequers"
Error on "Werldwayd"
Error on "What cat?"
Error on "WhatsUpWorld"
Error on "WhisperToMe"
Error on "Widefox"
Error on "Widr"
Error on "WikiDan61"
Error on "Wikipedical"
Error on "Wikipelli"
Error on "Wikishovel"
Error on "William Avery"
Error on "WilliamJE"
Error on "Woodensuperman"
At position: 2700
Error on "Woodstein52"
Error on "Wpeneditor"
Error on "Xandahar"
Error on "Xezbeth"
Error on "Xin Deui"
Error on "Y2kcrazyjoker4"
Error on "Yamaguchiå…ˆç”Ÿ"
Error on "Yintan"
Error on "Ymblanter"
Err

In [545]:
user_contribs[0]

'10001499.csv'

Unnamed: 0,comment,minor,new,ns,pageid,parentid,revid,size,sizediff,suppressed,texthidden,timestamp,title,top,user,userid,date
0,/* Japanese position */ c/e,False,False,0,29020730,530583585,530889949,108592,574,,,2013-01-02 08:58:59,Senkaku Islands dispute,False,Phoenix7777,10001499,2013-01-02
1,/* Japanese position */ moved a citation from ...,False,False,0,29020730,530889949,530890742,108466,-126,,,2013-01-02 09:08:37,Senkaku Islands dispute,False,Phoenix7777,10001499,2013-01-02
2,/* Question about the recent major edit */ bel...,False,False,1,29043602,530551814,531208987,24573,1714,,,2013-01-04 04:13:24,Talk:Senkaku Islands dispute,False,Phoenix7777,10001499,2013-01-04
3,Reverted edits by [[Special:Contributions/ 67....,False,False,0,330979,531235799,531240217,7455,-556,,,2013-01-04 09:32:59,Jap,False,Phoenix7777,10001499,2013-01-04
4,/* January 2013 */ new section,False,True,3,38116817,0,531240283,137,137,,,2013-01-04 09:33:54,User talk:67.225.9.144,False,Phoenix7777,10001499,2013-01-04
5,Undid revision 531860852 by [[Special:Contribu...,False,False,0,341418,531860852,531882538,97504,1121,,,2013-01-08 02:16:32,Korea under Japanese rule,False,Phoenix7777,10001499,2013-01-08
6,/* Japanese migration and land confiscation */...,False,False,0,341418,531882538,531883450,97992,488,,,2013-01-08 02:24:30,Korea under Japanese rule,False,Phoenix7777,10001499,2013-01-08
7,/* Airspace incursion */ The wording in inappr...,False,False,1,29043602,531890532,531893996,27627,525,,,2013-01-08 03:51:56,Talk:Senkaku Islands dispute,False,Phoenix7777,10001499,2013-01-08
8,/* Airspace incursion */ more logically explained,False,False,1,29043602,531893996,531919074,28206,579,,,2013-01-08 07:45:12,Talk:Senkaku Islands dispute,False,Phoenix7777,10001499,2013-01-08
9,/* Airspace incursion */ fix,False,False,1,29043602,531919074,531921889,28209,3,,,2013-01-08 08:16:39,Talk:Senkaku Islands dispute,False,Phoenix7777,10001499,2013-01-08


pandas.errors.ParserError

In [562]:
user_contribs = os.listdir(_dir + 'Data/Users')
usercontribs_l = []

for f in user_contribs:
    try:
        _df = pd.read_csv(_dir + 'Data/Users/' + f,encoding='latin1',index_col=0,parse_dates=['timestamp'],low_memory=False)
        try:
            _df = _df[_df['timestamp'] > pd.Timestamp('2014-01-01')]
        except TypeError:
            print("Type error in: " + f)
        usercontribs_l.append(_df)
    except KeyboardInterrupt:
        break
    except pd.errors.ParserError:
        print(f)
        pass

all_user_contribs_df = pd.concat(usercontribs_l)
all_user_contribs_df.to_csv('active_user_contribs.csv',encoding='utf8',index=False)

9784415.csv


In [None]:
del usercontribs_l
del all_user_contribs

### Redirects

In [None]:
clinton_redirect_members = {}
trump_redirect_members = {}

clinton_errors = []
trump_errors = []

for member in clinton_category_members:
    try:
        clinton_redirect_members[member] = get_redirects_linking_here(member)
    except:
        clinton_errors.append(member)
        pass
    clinton_redirect_members[member].append(member)
    
for member in trump_category_members:
    try:
        trump_redirect_members[member] = get_redirects_linking_here(member)
    except:
        trump_errors.append(member)
        pass
    trump_redirect_members[member].append(member)

with open('clinton_category_members_redirects.json','w') as f:
    json.dump(clinton_redirect_members,f)
    
with open('trump_category_members_redirects.json','w') as f:
    json.dump(trump_redirect_members,f)

### Log events

In [28]:
all_clinton_le_dict = {}
all_clinton_le_error_pages = []

for page in redirects_clinton_category_members:
    try:
        _df = get_log_events(page)
        all_clinton_le_dict[page] = _df
    except KeyboardInterrupt:
        break
    except:
        print("Error on \"{0}\"".format(page))
        all_clinton_le_error_pages.append(page)
        pass
    
all_trump_le_dict = {}
all_trump_le_error_pages = []

for page in redirects_trump_category_members:
    try:
        _df = get_log_events(page)
        all_trump_le_dict[page] = _df
    except KeyboardInterrupt:
        break
    except:
        print("Error on \"{0}\"".format(page))
        all_trump_le_error_pages.append(page)
        pass

Unnamed: 0,action,comment,commenthidden,date,logid,logpage,ns,page,pageid,tags,timestamp,title,type,user,userid
0,patrol,,,2008-04-13,14781556.0,0.0,0.0,Robert N. Chatigny,16729727.0,[],2008-04-13 13:39:20,Robert N. Chatigny,patrol,DragonflySixtyseven,62058
1,patrol,,,2009-03-05,20953870.0,0.0,0.0,John R. Tunheim,21834325.0,[],2009-03-05 23:39:29,John R. Tunheim,patrol,Polbot,4477315
2,delete_redir,[[WP:CSD#G6|G6]]: Deleted to make way for move,,2017-01-21,80218809.0,4067205.0,0.0,Bernard Nussbaum,3040825.0,[],2017-01-21 19:05:41,Bernard Nussbaum,delete,Arbor to SJ,4322169
3,delete,content was: '==Bernard Nussbaum==Former Chief...,,2005-11-26,995795.0,0.0,0.0,Bernard Nussbaum,3040825.0,[],2005-11-26 20:16:50,Bernard Nussbaum,delete,Lucky 6.9,51717
4,restore,6 revisions restored: restore for merge,,2009-04-02,21551687.0,0.0,0.0,Sidney H. Stein,21832373.0,[],2009-04-02 18:37:03,Sidney H. Stein,delete,BD2412,196446


In [None]:
all_clinton_le_df = pd.concat(all_clinton_le_dict.values(),keys=all_clinton_le_dict.keys())
all_clinton_le_df = all_clinton_le_df.reset_index(drop=True)
all_clinton_le_df['redirect_page'] = all_clinton_le_df['page'].map(clinton_redirect_mapping)
all_clinton_le_df['redirect_page'].fillna(all_clinton_le_df['page'],inplace=True)
all_clinton_le_df.to_csv('logevents_clinton.csv',encoding='utf8',index=False)

all_trump_le_df = pd.concat(all_trump_le_dict.values(),keys=all_trump_le_dict.keys())
all_trump_le_df = all_trump_le_df.reset_index(drop=True)
all_trump_le_df['redirect_page'] = all_trump_le_df['page'].map(trump_redirect_mapping)
all_trump_le_df['redirect_page'].fillna(all_trump_le_df['page'],inplace=True)
all_trump_le_df.to_csv('logevents_trump.csv',encoding='utf8',index=False)
all_trump_le_df.head()

### Pageviews

In [259]:
with open('clinton_category_members_redirects.json','r') as f:
    clinton_redirect_members = json.load(f)
    
with open('trump_category_members_redirects.json','r') as f:
    trump_redirect_members = json.load(f)
    
clinton_redirect_members_l = [r for target, redirects in clinton_redirect_members.items() for r in redirects]
trump_redirect_members_l = [r for target, redirects in trump_redirect_members.items() for r in redirects]

all_clinton_articles_w_redirects = list(set(unique_clinton_category_members) | set(clinton_redirect_members_l))
all_trump_articles_w_redirects = list(set(unique_trump_category_members) | set(trump_redirect_members_l))

print("There are {0:,} Clinton articles (with redirects).\nThere are {1:,} Trump articles (with redirects).".format(len(all_clinton_articles_w_redirects),len(all_trump_articles_w_redirects)))

There are 4,464 Clinton articles (with redirects).
There are 3,489 Trump articles (with redirects).


(Temporary) Some of the names are case sensitive differences, but these case-sensitive files are over-written in the file system. Re-crawl and add some fuzz so they're not overwritten.

In [263]:
clinton_case_sensitive = []
trump_case_sensitive = []

for a1 in all_clinton_articles_w_redirects:
    for a2 in all_clinton_articles_w_redirects:
        if a1.lower() == a2.lower() and a1 != a2:
            clinton_case_sensitive.append(a1)
            
for a1 in all_trump_articles_w_redirects:
    for a2 in all_trump_articles_w_redirects:
        if a1.lower() == a2.lower() and a1 != a2:
            trump_case_sensitive.append(a1)

clinton_case_sensitive = list(set(clinton_case_sensitive))
trump_case_sensitive = list(set(trump_case_sensitive))
            
print("There are {0:,} Clinton articles and {1:,} Trump articles affected by case sensitivity.".format(len(clinton_case_sensitive),len(trump_case_sensitive)))

There are 490 Clinton articles and 369 Trump articles affected by case sensitivity.


Crawl each candidate's pageview data and save it to a file (which we'll concatenate together later).

In [267]:
clinton_error_pages = []

#for i,page in enumerate(all_clinton_articles_w_redirects):
for i,page in enumerate(clinton_case_sensitive):
    try:
        _df = get_pageviews(page)
        renamed_page = quote(page.replace(' ','_').replace('/','-').replace(':','_-').replace('?',''), safe='')
        _df.to_csv(_dir+'Data/Clinton/Pageviews/{1}-{0}.csv'.format(renamed_page,i),encoding='utf8')
    except KeyboardInterrupt:
        break
    except:
        print("Error on \"{0}\"".format(page))
        clinton_error_pages.append(page)
        pass

In [268]:
trump_error_pages = []

#for i,page in enumerate(all_trump_articles_w_redirects):
for i,page in enumerate(trump_case_sensitive):
    try:
        _df = get_pageviews(page)
        renamed_page = quote(page.replace(' ','_').replace('/','-').replace(':','_-').replace('?',''), safe='')
        _df.to_csv(_dir+'Data/Trump/Pageviews/{1}-{0}.csv'.format(renamed_page,i),encoding='utf8')
    except KeyboardInterrupt:
        break
    except:
        print("Error on \"{0}\"".format(page))
        trump_error_pages.append(page)
        pass

Fix errors.

In [None]:
error_pages = set(os.listdir(_dir+'/Data/Revisions')) - set(os.listdir(_dir+'/Data/Pageviews'))
error_pages = ['/r/The Donald','American Horror Story','Bigan Kian','Fahrenheit 11/9','Trump: The Kremin Candidate','Trump: What\'s the Deal?','Where My Country Gone']

for page in error_pages:
    try:
        quoted_page = quote(page, safe='')
        _df = get_pageviews(quoted_page)
        renamed_page = page.replace(' ','_').replace('/','-').replace(':','_-').replace('?','')
        _df.to_csv(_dir+'Data/Pageviews/{0}.csv'.format(renamed_page),encoding='utf8')
    except KeyboardInterrupt:
        break
    except:
        print("Error on \"{0}\"".format(page))
        pass

(Temporary) Clean up duplicates.

In [276]:
for i,page in enumerate(clinton_case_sensitive):
    renamed_page = quote(page.replace(' ','_').replace('/','-').replace(':','_-').replace('?',''), safe='')
    path = _dir+'Data/Clinton/Pageviews/{0}.csv'.format(renamed_page,i)
    try:
        os.remove(path)
    except FileNotFoundError:
        pass
    
for i,page in enumerate(trump_case_sensitive):
    renamed_page = quote(page.replace(' ','_').replace('/','-').replace(':','_-').replace('?',''), safe='')
    path = _dir+'Data/Trump/Pageviews/{0}.csv'.format(renamed_page,i)
    try:
        os.remove(path)
    except FileNotFoundError:
        pass

Concatenate together.

In [349]:
redirects_clinton_pageview_df_list = []
clinton_pageview_files = os.listdir(_dir+'Data/Clinton/Pageviews/')

for f in clinton_pageview_files:
    try:
        df = pd.read_csv(_dir + 'Data/Clinton/Pageviews/{0}'.format(f),engine='python',header=[0,1],skiprows=[2],index_col=0)
        df.index = pd.to_datetime(df.index)        
        df[('page','page')] = df[('page','page')].apply(lambda x:unquote(x))
        df[('redirected_page','')] = df[('page','page')]
        redirects_clinton_pageview_df_list.append(df)
    except KeyboardInterrupt:
        break
    except:
        print("Error on \"{0}\"".format(f))
        pass
    
redirects_clinton_pageviews_df = pd.concat(redirects_clinton_pageview_df_list)
redirects_clinton_pageviews_df = redirects_clinton_pageviews_df.reset_index().set_index([('page','page'),('index','')])
redirects_clinton_pageviews_df.index.names = ['page','date']
redirects_clinton_pageviews_df.columns.names = ['','']
redirects_clinton_pageviews_df.to_csv('redirects_clinton_pageviews.csv',encoding='utf8')
redirects_clinton_pageviews_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,all-agents,all-agents,all-agents,all-agents,bot,bot,bot,bot,spider,spider,spider,spider,user,user,user,user,redirected_page
Unnamed: 0_level_1,Unnamed: 1_level_1,all-access,desktop,mobile-app,mobile-web,all-access,desktop,mobile-app,mobile-web,all-access,desktop,mobile-app,mobile-web,all-access,desktop,mobile-app,mobile-web,Unnamed: 18_level_1
page,date,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
"""Basket of deplorables""",2017-06-05,11.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,9.0,0.0,0.0,2.0,2.0,0.0,0.0,"""Basket of deplorables"""
"""Basket of deplorables""",2017-06-06,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,"""Basket of deplorables"""
"""Basket of deplorables""",2017-06-07,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,"""Basket of deplorables"""
"""Basket of deplorables""",2017-06-08,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,3.0,3.0,0.0,0.0,"""Basket of deplorables"""
"""Basket of deplorables""",2017-06-09,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,"""Basket of deplorables"""


In [350]:
redirects_trump_pageview_df_list = []
trump_pageview_files = os.listdir(_dir+'Data/Trump/Pageviews/')

for f in trump_pageview_files:
    try:
        df = pd.read_csv(_dir + 'Data/Trump/Pageviews/{0}'.format(f),engine='python',header=[0,1],skiprows=[2],index_col=0)
        df.index = pd.to_datetime(df.index)
        df[('page','page')] = df[('page','page')].apply(lambda x:unquote(x))
        df[('redirected_page','')] = df[('page','page')]
        redirects_trump_pageview_df_list.append(df)
    except KeyboardInterrupt:
        break
    except:
        print("Error on \"{0}\"".format(f))
        pass
    
redirects_trump_pageviews_df = pd.concat(redirects_trump_pageview_df_list)
redirects_trump_pageviews_df = redirects_trump_pageviews_df.reset_index().set_index([('page','page'),('index','')])
redirects_trump_pageviews_df.index.names = ['page','date']
redirects_trump_pageviews_df.columns.names = ['','']
redirects_trump_pageviews_df.to_csv('redirects_trump_pageviews.csv',encoding='utf8')
redirects_trump_pageviews_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,all-agents,all-agents,all-agents,all-agents,bot,bot,bot,bot,spider,spider,spider,spider,user,user,user,user,redirected_page
Unnamed: 0_level_1,Unnamed: 1_level_1,all-access,desktop,mobile-app,mobile-web,all-access,desktop,mobile-app,mobile-web,all-access,desktop,mobile-app,mobile-web,all-access,desktop,mobile-app,mobile-web,Unnamed: 18_level_1
page,date,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
"""Basket of deplorables""",2017-06-05,11.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,9.0,0.0,0.0,2.0,2.0,0.0,0.0,"""Basket of deplorables"""
"""Basket of deplorables""",2017-06-06,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,"""Basket of deplorables"""
"""Basket of deplorables""",2017-06-07,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,"""Basket of deplorables"""
"""Basket of deplorables""",2017-06-08,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,3.0,3.0,0.0,0.0,"""Basket of deplorables"""
"""Basket of deplorables""",2017-06-09,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,"""Basket of deplorables"""


Add the pageviews from the redirects to the parent article.

In [351]:
clinton_redirects_mapping = {} 
for target,redirects in clinton_redirect_members.items():
    clinton_redirects_mapping[target] = target
    for r in redirects:
        clinton_redirects_mapping[r] = target
        
trump_redirects_mapping = {}
for target,redirects in trump_redirect_members.items():
    trump_redirects_mapping[target] = target
    for r in redirects:
        trump_redirects_mapping[r] = target

In [352]:
#redirects_clinton_pageviews_df['redirected_page'] = redirects_clinton_pageviews_df.reset_index(0)[('page','')].values
#redirects_trump_pageviews_df['redirected_page'] = redirects_trump_pageviews_df.reset_index(0)[('page','')].values

redirects_clinton_pageviews_df['redirected_page'] = redirects_clinton_pageviews_df['redirected_page'].map(clinton_redirects_mapping)
redirects_trump_pageviews_df['redirected_page'] = redirects_trump_pageviews_df['redirected_page'].map(trump_redirects_mapping)

pv_columns = redirects_clinton_pageviews_df.columns.get_values()[:-1]
agg_func = dict(zip(pv_columns,[np.sum]*len(pv_columns)))

combined_clinton_pageviews_df = redirects_clinton_pageviews_df.reset_index().groupby([('redirected_page',''),('date','')]).agg(agg_func)
combined_trump_pageviews_df = redirects_trump_pageviews_df.reset_index().groupby([('redirected_page',''),('date','')]).agg(agg_func)

combined_clinton_pageviews_df.index.names = ['page','date']
combined_trump_pageviews_df.index.names = ['page','date']

combined_clinton_pageviews_df.to_csv('all_clinton_pageviews_redirects.csv',encoding='utf8')
combined_trump_pageviews_df.to_csv('all_trump_pageviews_redirects.csv',encoding='utf8')

The July 1, 2015 number should be north of 11,000 -- if it's not, we lost the main Hillary Clinton article somewhere along the way.

In [355]:
combined_clinton_pageviews_df.loc['Hillary Clinton',('all-agents','all-access')].head()

date
2015-07-01    13624.0
2015-07-02    12686.0
2015-07-03    11251.0
2015-07-04    10373.0
2015-07-05    11907.0
Name: (all-agents, all-access), dtype: float64

In [356]:
combined_trump_pageviews_df.loc['Donald Trump',('all-agents','all-access')].head()

date
2015-07-01    89519.0
2015-07-02    90804.0
2015-07-03    60450.0
2015-07-04    47489.0
2015-07-05    45924.0
Name: (all-agents, all-access), dtype: float64

### Revision content

First, find the revision id for each date that is closest in value to the median length of the article on that day. The goal here is to find a "typical" revision for that day by ignoring vandalism or other anomalous edits.

In [32]:
def closest_revid_to_median_size(_df):
    # https://stackoverflow.com/questions/30112202/how-do-i-find-the-closest-values-in-a-pandas-series-to-an-input-number
    return _df.iloc[(_df['size']-_df['size'].median()).abs().argsort()]['revid'].values[0]

closest_revid_to_median_size_df = all_page_revisions_df.groupby(['page','date']).apply(closest_revid_to_median_size)
closest_revid_to_median_size_df.head()

page                          date      
104th United States Congress  2004-02-05    2322487
                              2004-02-07    2348088
                              2004-02-09    2547608
                              2004-02-27    2915574
                              2004-03-25    3475257
dtype: int64

In [39]:
#median_trump_revids = closest_revid_to_median_size_df.loc['Donald Trump'].apply(str).to_dict()
median_clinton_revids = closest_revid_to_median_size_df.loc['Hillary Clinton'].apply(str).to_dict()

median_clinton_revids = {str(date.date()):revid for date,revid in median_clinton_revids.items()}

#print("There are {0:,} daily revisions for \"Donald Trump\"".format(len(median_trump_revids)))
print("There are {0:,} daily revisions for \"Hillary Clinton\"".format(len(median_clinton_revids)))

with open('median_clinton_revids.json','w') as f:
    json.dump(median_clinton_revids,f)

There are 2,720 daily revisions for "Hillary Clinton"


Second, pull the revision content (outlinks, external links, markup *etc.*) for each of these revisions.

In [41]:
pd.Series(median_clinton_revids).head()

2001-08-01    256189
2001-12-07    256190
2002-02-25     72270
2002-05-18     72370
2002-07-07    112733
dtype: object

In [42]:
#for date,revid in median_trump_revids.items():
for date,revid in median_clinton_revids.items():
    #_d = {'date':date,'revid':revid,'page':'Donald Trump'}
    _d = {'date':date,'revid':revid,'page':'Hillary Clinton'}
    _d['content'] = get_rev_content(revid)
    with open(_dir+'Data/Clinton/Content/{0}.json'.format(revid),'w') as f:
        json.dump(_d,f)

In [51]:
#for date,revid in median_trump_revids.items():
current_revids = [revid[:-5] for revid in os.listdir(_dir+'Data/Clinton/Links/')]
for date,revid in median_clinton_revids.items():
    #_d = {'date':date,'revid':revid,'page':'Donald Trump'}
    if revid not in current_revids:
        _d = {'date':date,'revid':revid,'page':'Hillary Clinton'}
        _d['links'] = get_rev_outlinks(revid)
        time.sleep(1)
        with open(_dir+'Data/Clinton/Links/{0}.json'.format(revid),'w') as f:
            json.dump(_d,f)

In [54]:
#current_revids = [revid[:-5] for revid in os.listdir(_dir+'Data/Trump/Markup/')]
#for date,revid in median_trump_revids.items():

current_revids = [revid[:-5] for revid in os.listdir(_dir+'Data/Clinton/Markup/')]
for date,revid in median_clinton_revids.items():
    
    if revid not in current_revids:
        #_d = {'date':date,'revid':revid,'page':'Donald Trump'}
        _d = {'date':date,'revid':revid,'page':'Hillary Clinton'}
        _d['markup'] = get_rev_markup(revid)
        time.sleep(1)
        with open(_dir+'Data/Clinton/Markup/{0}.json'.format(revid),'w') as f:
            json.dump(_d,f)

### Get all revisions for year before and after election

In [21]:
start, stop = pd.Timestamp('2015-11-07'), pd.Timestamp('2017-11-09')

In [27]:
all_trump_rev_df = pd.read_csv('all_trump_page_revisions.csv',low_memory=False,parse_dates=['date','timestamp'])
all_clinton_rev_df = pd.read_csv('all_clinton_page_revisions.csv',low_memory=False,parse_dates=['date','timestamp'])

In [48]:
trump_rev_df = all_trump_rev_df[all_trump_rev_df['page'] == "Donald Trump"]
hillary_rev_df = all_clinton_rev_df[all_clinton_rev_df['page'] == "Hillary Clinton"]

trump_rev_df = trump_rev_df[(trump_rev_df['date'] > start) & (trump_rev_df['date'] < stop)]
hillary_rev_df = hillary_rev_df[(hillary_rev_df['date'] > start) & (hillary_rev_df['date'] < stop)]

print("There are {0:,} revisions on the \"Donald Trump\" article over the year preceding and following the election".format(len(trump_rev_df)))
print("There are {0:,} revisions on the \"Hillary Clinton\" article over the year preceding and following the election".format(len(hillary_rev_df)))

There are 13,970 revisions on the "Donald Trump" article over the year preceding and following the election
There are 2,667 revisions on the "Hillary Clinton" article over the year preceding and following the election


In [70]:
unique_trump_revids = list(trump_rev_df.groupby('sha1').agg({'revid':lambda x:list(x)[0]})['revid'].values)
unique_hillary_revids = list(hillary_rev_df.groupby('sha1').agg({'revid':lambda x:list(x)[0]})['revid'].values)
print("There are {0:,} unique revisions on the \"Donald Trump\" article over the year preceding and following the election".format(len(unique_trump_revids)))
print("There are {0:,} unique revisions on the \"Hillary Clinton\" article over the year preceding and following the election".format(len(unique_hillary_revids)))

There are 12,800 unique revisions on the "Donald Trump" article over the year preceding and following the election
There are 2,216 unique revisions on the "Hillary Clinton" article over the year preceding and following the election


In [79]:
current_hillary_revids = [revid[:-5] for revid in os.listdir(_dir+'Data/Clinton/Markup/')]
for revid in unique_hillary_revids:   
    if revid not in current_hillary_revids:
        date = str(pd.to_datetime(hillary_rev_df[hillary_rev_df['revid'] == revid]['date'].values[0]).date())
        _d = {'date':date,'revid':int(revid),'page':'Hillary Clinton'}
        _d['markup'] = get_rev_markup(revid)
        time.sleep(.5)
        with open(_dir+'Data/Clinton/Markup/{0}.json'.format(revid),'w') as f:
            json.dump(_d,f)

In [80]:
current_trump_revids = [revid[:-5] for revid in os.listdir(_dir+'Data/Trump/Markup/')]
for revid in unique_trump_revids:
    if revid not in current_trump_revids:
        date = str(pd.to_datetime(trump_rev_df[trump_rev_df['revid'] == revid]['date'].values[0]).date())
        _d = {'date':date,'revid':int(revid),'page':'Donald Trump'}
        _d['markup'] = get_rev_markup(revid)
        time.sleep(.5)
        with open(_dir+'Data/Trump/Markup/{0}.json'.format(revid),'w') as f:
            json.dump(_d,f)