# Coletando revisões da API da Wikipédia

A wikipédia disponibiliza algumas formas de coletar seus dados. 
Pode-se brincar bastante com essa API por meio da interface https://meta.wikimedia.org/wiki/Special:ApiSandbox.
para a realização destes exemplos os parâmetros foram escolhidos nesse sandbox e depois passados para cá.

Os artigos utilizados como exemplo fazem parte do dataset da dissertação do Professor Daniel Hassan.

In [None]:
import pandas as pd

dataset = pd.read_csv('wikipedia_dataset_hasan/wikipedia.csv')
titles = pd.DataFrame(dataset, columns = ['page_title']).head(5).values
titles

Abaixo um exemplo de uso da API da wikipédia. O paramêtro "format" como json faz com que o retorno venha no formato de dicionário em python, ao invés de xml ou html, que é mais fácil de ser manipulado.

In [None]:
import requests

S = requests.Session()
URL = "https://en.wikipedia.org/w/api.php"
title = titles[0]

params = {
    "action": "query",
    "format": "json",
    "titles": title,
}

response = S.get(url=URL, params=params).json()
response

Os valores da _response_ podem ser facilmente acessados como um dicionário em python, por meio da sintaxe dict[chave].

In [None]:
query = response["query"]
print(query)

In [None]:
pages = query["pages"] # or response["query"]["pages"]
print(pages)

In [None]:
pages_list = list(pages.values()) # or list(response["query"]["pages"].values()) 
print(pages_list)

first_page_title = pages_list[0]["title"] # or list(response["query"]["pages"].values())[0]["title"]
print(first_page_title)

É possível coletar as revisões, dentre outras informações. A [documentação](https://en.wikipedia.org/w/api.php) pode ser lida em  

[outra API](https://en.wikipedia.org/api/rest_v1/#/) que ainda não foi explorada.

In [10]:
from wiki_revision_crawler import get_revisions_info, date_range_monthly, parse_revisions_info_monthly

date_start = "2017-10-01T00:00:00Z"
date_end = "2017-8-01T00:00:00Z"
 
print(date_range_monthly(date_end, date_start))

date_start = "2006-01-03T00:00:00Z"

response = get_revisions_info("Dypsis onilahensis", date_start, date_end)
print(response)

revisions_info = parse_revisions_info_monthly(response, date_start, date_end)


['2017-10-01T00:00:00Z', '2017-09-01T00:00:00Z', '2017-08-01T00:00:00Z']
{'error': {'code': 'badtimestamp_rvend', 'info': 'Invalid value "2017-8-01T00:00:00Z" for timestamp parameter "rvend".', '*': 'See https://en.wikipedia.org/w/api.php for API usage. Subscribe to the mediawiki-api-announce mailing list at &lt;https://lists.wikimedia.org/mailman/listinfo/mediawiki-api-announce&gt; for notice of API deprecations and breaking changes.'}, 'servedby': 'mw1234'}


KeyError: 'query'

In [4]:
revisions_info

([{'access': '2019-07-01T00:00:00Z',
   'revision': {'revid': 833068079,
    'parentid': 825632911,
    'user': 'Tom.Reding',
    'timestamp': '2018-03-29T14:04:16Z',
    'comment': '+[[:Category:Taxonomy articles created by Polbot\u200e\u200e]]; cleanup; [[WP:GenFixes]] on, using [[Project:AWB|AWB]]'}},
  {'access': '2019-06-01T00:00:00Z',
   'revision': {'revid': 833068079,
    'parentid': 825632911,
    'user': 'Tom.Reding',
    'timestamp': '2018-03-29T14:04:16Z',
    'comment': '+[[:Category:Taxonomy articles created by Polbot\u200e\u200e]]; cleanup; [[WP:GenFixes]] on, using [[Project:AWB|AWB]]'}},
  {'access': '2019-05-01T00:00:00Z',
   'revision': {'revid': 833068079,
    'parentid': 825632911,
    'user': 'Tom.Reding',
    'timestamp': '2018-03-29T14:04:16Z',
    'comment': '+[[:Category:Taxonomy articles created by Polbot\u200e\u200e]]; cleanup; [[WP:GenFixes]] on, using [[Project:AWB|AWB]]'}},
  {'access': '2019-04-01T00:00:00Z',
   'revision': {'revid': 833068079,
    'pare

In [None]:
import json
with open("response.json", 'w') as fp:
    json.dump(response, fp)

In [None]:
import datetime
import dateutil.parser

def parse_date(date):
    """Parse date from ISO 8601 format to datetime
 
    Parameters:
        date (str): date in the format ISO 8601: 2001-01-15T14:56:00Z
        
    Returns:
        date (datetime): date parsed

    """
    return dateutil.parser.parse(date)

def format_date(date):
    """Format date ISO 8601 (i.e. 2001-01-15T14:56:00Z)
 
    Parameters:
        date (datetime): date
        
    Returns:
        date (str): ISO 8601 formatted date
    
    """
    s = "%Y-%m-%dT%H:%M:%SZ"
    return date.strftime(s)

date_idx=0
dates = pd.date_range("2017-01-03T01:32:59Z","2017-04-03T01:32:59Z", freq='MS').strftime("%Y-%m-%dT%H:%M:%SZ").tolist()

while date_idx < len(dates):
    print(dates[date_idx])
    date_idx += 1

In [None]:
from pandas.io.json import json_normalize

#revisions = data["query"]["pages"][page_id]["revisions"]
#revisions = list(data["query"]["pages"].values())[0]["revisions"]

json_normalize(revisions_info)

In [11]:
import requests

S = requests.Session()
URL = "https://en.wikipedia.org/w/api.php"

def get_revision(title, access_date):
    # title = "Procellariidae"
    # date = "2017-04-03T01:32:59.000Z"
    params = {
        "action": "query",
        "prop": "revisions",
        "titles": title,
        "rvprop": "timestamp|user|comment|content|ids",
        "rvslots": "main",
        "formatversion": "2",
        "format": "json",
        "rvlimit": 1,
        "rvstart": access_date,
        "rvdir": "older",
    }

    response = S.get(url=URL, params=params)
    
    return response.json()

def parse_revision_response(response, date):
    page = list(response["query"]["pages"])[0]
    revision = list(page["revisions"])[0]
    return (page["pageid"], page["title"], revision["user"], revision["timestamp"], revision["comment"], revision["slots"]["main"]["content"])

date = "2009-01-03T00:00:00.000Z"
response = get_revision("Procellariidae", date)
#parse_revision_response(response, date)
#category = parse_revision_category_content(response)
page = list(response["query"]["pages"])[0]
revision = list(page["revisions"])[0]
content = revision["slots"]["main"]["content"]
content

KeyError: 'slots'

In [47]:
def get_revision_info(title, access_date):
    # title = "Procellariidae"
    # date = "2017-04-03T01:32:59.000Z"
    params = {
        "action": "query",
        "prop": "revisions",
        "titles": title,
        "rvprop": "timestamp|user|comment|ids",
        "rvslots": "main",
        "formatversion": "2",
        "format": "json",
        "rvlimit": 500,
        "rvstart": access_date,
        "rvdir": "older",
    }

    response = S.get(url=URL, params=params)
    
    return response.json()
title = "Ubuntu (operating system)"
date_end = "2009-09-03T00:00:00Z"
get_revision_info(title, date_end)

{'batchcomplete': True,
 'query': {'pages': [{'pageid': 30404350,
    'ns': 0,
    'title': 'Ubuntu (operating system)'}]}}

In [43]:
def get_page_redirect(title):
    PARAMS = {
        'action': "query",
        'format': "json",
        'titles': title,
        'prop': "redirects"
    }

    R = S.get(url=URL, params=PARAMS)
    DATA = R.json()
    return DATA

# title = 'List of Danish football champions'
# print(get_page_redirect(title))

title = 'Variegated fairywren'
print(get_page_redirect(title))

# title = 'List of Danish football champions'
# print(get_page_redirect(title))

{'batchcomplete': '', 'query': {'pages': {'4096922': {'pageid': 4096922, 'ns': 0, 'title': 'Variegated fairywren', 'redirects': [{'pageid': 4098228, 'ns': 0, 'title': 'Malurus lamberti'}, {'pageid': 7659390, 'ns': 0, 'title': 'Variegated Fairy-Wren'}, {'pageid': 24164108, 'ns': 0, 'title': 'Variegated Fairy-wren'}, {'pageid': 42782291, 'ns': 0, 'title': 'Variegated Fairywren'}, {'pageid': 43905868, 'ns': 0, 'title': 'Variegated fairy-wren'}, {'pageid': 55816394, 'ns': 0, 'title': 'Variegated fairy wren'}, {'pageid': 55816399, 'ns': 0, 'title': 'Variegated Fairy Wren'}, {'pageid': 55816537, 'ns': 0, 'title': 'Malurus Lamberti'}, {'pageid': 55816604, 'ns': 0, 'title': 'Variegated wren'}, {'pageid': 55816605, 'ns': 0, 'title': 'Variegated Wren'}]}}}}


In [None]:
def parse_revision_category_content(text):
    classes = []
    if text is not None:
        lines = text.replace("{{","}}").replace("\n","").split("}}")
        for line in lines:
            if "class" in line:
                print(line)
                atributes = line.split("|")
                wiki_project = ""
                wiki_class = "-"
                for idx, atribute in enumerate(atributes):
                    atribute_values = atribute.split("=")
                    if idx == 0:
                        wiki_project = atribute_values[0]
                    if atribute_values[0] == "class" and len(atribute_values) > 1 :
                        wiki_class = atribute_values[1]
                classes.append((wiki_project, wiki_class))
    return classes
parse_revision_category_content(content)