In [1]:
import pandas as pd

dataset = pd.read_csv('wikipedia_dataset_hasan/wikipedia.csv')

In [2]:
titles = pd.DataFrame(dataset, columns = ['page_title']).head(5)

titles.to_csv('wikipages.txt', index=None, header=None)

titles

Unnamed: 0,page_title
0,Casino Royale (2006 film)
1,Procellariidae
2,Kakapo
3,2005 NFL Draft
4,Danish football champions


In [35]:
import requests

S = requests.Session()
URL = "https://en.wikipedia.org/w/api.php"

def get_revisions_info(title, date_start, date_end):
    """Resquest revisions info of a wikipedia article by its title, in a period determined by start and end dates
 
    Parameters:
        title (str): The title of the article
        date_start (str): The start date of querying in the format ISO 8601: 2001-01-15T14:56:00Z
        date_end (str): The end date of querying in the format ISO 8601: 2001-01-15T14:56:00Z

    Returns:
        response (json): raw result of the request

    """
    params = {
        "action": "query",
        "prop": "revisions",
        "format": "json",
        "titles": title,
        "rvlimit": 500,
        "rvstart": date_start,
        "rvend": date_end,
        "rvdir": "older",
    }
    
    response = S.get(url=URL, params=params)    
    
    return response.json()

date_start = "2017-10-01T00:00:00Z"
date_end = "2015-10-01T00:00:00Z"
response = get_revisions_info("Procellariidae", date_start, date_end)
response

# with open("response.json", 'w') as fp:
#     json.dump(content, fp)

{'batchcomplete': '',
 'query': {'pages': {'224443': {'pageid': 224443,
    'ns': 0,
    'title': 'Procellariidae',
    'revisions': [{'revid': 802073837,
      'parentid': 793516133,
      'user': 'InternetArchiveBot',
      'timestamp': '2017-09-23T20:48:43Z',
      'comment': 'Rescuing 1 sources and tagging 2 as dead. #IABot (v1.5.3) ([[User:FA RotBot|FA RotBot]])'},
     {'revid': 793516133,
      'parentid': 787540085,
      'minor': '',
      'user': 'Rjwilmsi',
      'timestamp': '2017-08-02T07:22:42Z',
      'comment': 'Journal cites: fix page range, templated 1 journal cites (Diberri fmt authors) using [[Project:AWB|AWB]] (12158)'},
     {'revid': 787540085,
      'parentid': 786273570,
      'minor': '',
      'user': 'PrimeBOT',
      'timestamp': '2017-06-25T23:52:40Z',
      'comment': 'Replace [[Help:Magic_links|magic links]] with templates per [[Special:PermaLink/772743896#Future_of_magic_links|local RfC]] - [[User:PrimeBOT/13|BRFA]]'},
     {'revid': 786273570,
      'p

In [4]:
import json
with open("response.json", 'w') as fp:
    json.dump(response, fp)

In [6]:
date_start = "2017-10-01T00:00:00Z"
date_end = "2015-10-01T00:00:00Z"
    
revisions_info = parse_revisions_info_monthly(response, date_start, date_end)
response

In [8]:
date_start = "2017-10-01T00:00:00Z"
date_end = "2017-8-01T00:00:00Z"

def date_range_monthly(date_start, date_end):
        return pd.date_range(date_start, date_end, freq='MS').strftime("%Y-%m-%dT%H:%M:%SZ").tolist()[::-1]
    
date_range_monthly( date_end, date_start)

['2017-10-01T00:00:00Z', '2017-09-01T00:00:00Z', '2017-08-01T00:00:00Z']

In [9]:
import datetime
import dateutil.parser

def parse_date(date):
    """Parse date from ISO 8601 format to datetime
 
    Parameters:
        date (str): date in the format ISO 8601: 2001-01-15T14:56:00Z
        
    Returns:
        date (datetime): date parsed

    """
    return dateutil.parser.parse(date)

def format_date(date):
    """Format date ISO 8601 (i.e. 2001-01-15T14:56:00Z)
 
    Parameters:
        date (datetime): date
        
    Returns:
        date (str): ISO 8601 formatted date
    
    """
    s = "%Y-%m-%dT%H:%M:%SZ"
    return date.strftime(s)

date_idx=0
dates = pd.date_range("2017-01-03T01:32:59Z","2017-04-03T01:32:59Z", freq='MS').strftime("%Y-%m-%dT%H:%M:%SZ").tolist()

while date_idx < len(dates):
    print(dates[date_idx])
    date_idx += 1

2017-02-01T01:32:59Z
2017-03-01T01:32:59Z
2017-04-01T01:32:59Z


In [None]:
from pandas.io.json import json_normalize

#revisions = data["query"]["pages"][page_id]["revisions"]
revisions = list(data["query"]["pages"].values())[0]["revisions"]

json_normalize(revisions)

In [19]:
import requests

S = requests.Session()
URL = "https://en.wikipedia.org/w/api.php"

def get_revision(title, access_date):
    # title = "Procellariidae"
    # date = "2017-04-03T01:32:59.000Z"
    params = {
        "action": "query",
        "prop": "revisions",
        "titles": title,
        "rvprop": "timestamp|user|comment|content|ids",
        "rvslots": "main",
        "formatversion": "2",
        "format": "json",
        "rvlimit": 1,
        "rvstart": access_date,
        "rvdir": "older",
    }

    response = S.get(url=URL, params=params)
    
    return response.json()

def parse_revision_response(response, date):
    page = list(response["query"]["pages"])[0]
    revision = list(page["revisions"])[0]
    return (page["pageid"], page["title"], revision["user"], revision["timestamp"], revision["comment"], revision["slots"]["main"]["content"])

date = "2017-04-03T01:32:59.000Z"
response = request_revision("Procellariidae", "2017-04-03T01:32:59.000Z")
parse_revision_response(response, date)


(224443,
 'Procellariidae',
 'Pvmoutside',
 '2016-12-15T20:26:53Z',
 '',
 '{{pp-move-indef}}\n{{Automatic Taxobox\n| name = Procellariidae\n| image =Damier du Cap - Cape Petrel.jpg\n| image_width =\n| image_caption = [[Cape petrel]] (\'\'Daption capense\'\')\n| taxon = Procellariidae\n| diversity_ref = <ref name=Taxonomicon/>\n| diversity = 16 genera and about 70 species\n| diversity_link = List of Procellariidae\n| authority = [[William Elford Leach|Leach]], 1820<ref name=Taxonomicon>{{cite web| url=http://taxonomicon.taxonomy.nl/TaxonTree.aspx?id=51489&tree=0.1 | title=Systema Naturae 2000 / Classification&nbsp;— Superfamily Procellarioidea | accessdate=12 Feb 2009 | last=Brands| first=Sheila | date=Aug 14, 2008 | work=Project: The Taxonomicon}}</ref>\n| subdivision_ranks = Genera\n| subdivision =\n\'\'[[Macronectes]]\'\'<br />\n\'\'[[Fulmarus]]\'\'<br />\n\'\'[[Thalassoica]]\'\'<br />\n\'\'[[Daption]]\'\'<br />\n\'\'[[Snow petrel|Pagodroma]]\'\'<br />\n\'\'[[Halobaena]]\'\'<br />\n\

In [42]:
import requests

S = requests.Session()
URL = "https://en.wikipedia.org/w/api.php"

def get_revisions_class(title, date):
    # title = "Procellariidae"
    # date = "2017-04-03T01:32:59.000Z"
    params = {
        "action": "query",
        "prop": "revisions",
        "format": "json",
        "titles": title,
        "rvlimit": 50,
        "rvprop": "timestamp|user|comment|ids|content",
        "rvstart": date_start,
        "rvend": date_end,
        "rvdir": "older",
        "format": "json",
        "rvslots": "main",
    }
    response = S.get(url=URL, params=params)
    return response.json()

def parse_revision_class(response, date):
    page = list(response["query"]["pages"])[0]
    revision = list(page["revisions"])[0]
    return (page["pageid"], page["title"], revision["user"], revision["timestamp"], revision["comment"])

date = "2017-04-03T01:32:59.000Z"
get_revisions_class("Talk:Procellariidae", "2017-04-03T01:32:59.000Z")
#parse_revision_class(response, date)


{'batchcomplete': '',
 'query': {'pages': {'4613294': {'pageid': 4613294,
    'ns': 1,
    'title': 'Talk:Procellariidae',
    'revisions': [{'revid': 802073841,
      'parentid': 781334494,
      'user': 'InternetArchiveBot',
      'timestamp': '2017-09-23T20:48:44Z',
      'slots': {'main': {'contentmodel': 'wikitext',
        'contentformat': 'text/x-wiki',
        '*': '{{ArticleHistory|action1=PR\n|action1date=08:16, 2 August 2006\n|action1link=Wikipedia:Peer review/Procellariidae/archive1\n|action1result=reviewed\n|action1oldid=67204734\n\n|action2=FAC\n|action2date=01:30, 23 August 2006\n|action2link=Wikipedia:Featured article candidates/Procellariidae\n|action2result=promoted\n|action2oldid=71254405\n\n|maindate=October 22, 2006\n|currentstatus=FA\n}}\n{{Vital article|level=4|topic=Science|class=FA|subpage=Biology}}\n{{WikiProject Birds|class=FA|importance=High}}\n{{WP1.0|v0.7=pass|class=FA|category=Natsci}}\n\n==Splitting out species list==\nThis article is going to be too lon