# Coletando revisões da API da Wikipédia

A wikipédia disponibiliza algumas formas de coletar seus dados. 
Pode-se brincar bastante com essa API por meio da interface https://meta.wikimedia.org/wiki/Special:ApiSandbox.
para a realização destes exemplos os parâmetros foram escolhidos nesse sandbox e depois passados para cá.

Os artigos utilizados como exemplo fazem parte do dataset da dissertação do Professor Daniel Hassan.

In [None]:
import pandas as pd

dataset = pd.read_csv('wikipedia_dataset_hasan/wikipedia.csv')
titles = pd.DataFrame(dataset, columns = ['page_title']).head(5).values
titles

Abaixo um exemplo de uso da API da wikipédia. O paramêtro "format" como json faz com que o retorno venha no formato de dicionário em python, ao invés de xml ou html, que é mais fácil de ser manipulado.

In [None]:
import requests

S = requests.Session()
URL = "https://en.wikipedia.org/w/api.php"
title = titles[0]

params = {
    "action": "query",
    "format": "json",
    "titles": title,
}

response = S.get(url=URL, params=params).json()
response

Os valores da _response_ podem ser facilmente acessados como um dicionário em python, por meio da sintaxe dict[chave].

In [None]:
query = response["query"]
print(query)

In [None]:
pages = query["pages"] # or response["query"]["pages"]
print(pages)

In [None]:
pages_list = list(pages.values()) # or list(response["query"]["pages"].values()) 
print(pages_list)

first_page_title = pages_list[0]["title"] # or list(response["query"]["pages"].values())[0]["title"]
print(first_page_title)

In [None]:
regressão, SVM

É possível coletar as revisões, dentre outras informações. A [documentação](https://en.wikipedia.org/w/api.php) pode ser lida em  

[outra API](https://en.wikipedia.org/api/rest_v1/#/) que ainda não foi explorada.

In [1]:
from wiki_revision_crawler import get_revisions_info, date_range_monthly, parse_revisions_info_monthly

date_start = "2017-10-01T00:00:00Z"
date_end = "2017-8-01T00:00:00Z"
 
print(date_range_monthly(date_end, date_start))

date_start = "2006-01-03T00:00:00Z"

response = get_revisions_info("Dypsis onilahensis", date_start, date_end)
print(response)

revisions_info = parse_revisions_info_monthly(response, date_start, date_end)


['2017-10-01T00:00:00Z', '2017-09-01T00:00:00Z', '2017-08-01T00:00:00Z']
{'error': {'code': 'badtimestamp_rvend', 'info': 'Invalid value "2017-8-01T00:00:00Z" for timestamp parameter "rvend".', '*': 'See https://en.wikipedia.org/w/api.php for API usage. Subscribe to the mediawiki-api-announce mailing list at &lt;https://lists.wikimedia.org/mailman/listinfo/mediawiki-api-announce&gt; for notice of API deprecations and breaking changes.'}, 'servedby': 'mw1287'}


KeyError: 'query'

In [4]:
revisions_info

([{'access': '2019-07-01T00:00:00Z',
   'revision': {'revid': 833068079,
    'parentid': 825632911,
    'user': 'Tom.Reding',
    'timestamp': '2018-03-29T14:04:16Z',
    'comment': '+[[:Category:Taxonomy articles created by Polbot\u200e\u200e]]; cleanup; [[WP:GenFixes]] on, using [[Project:AWB|AWB]]'}},
  {'access': '2019-06-01T00:00:00Z',
   'revision': {'revid': 833068079,
    'parentid': 825632911,
    'user': 'Tom.Reding',
    'timestamp': '2018-03-29T14:04:16Z',
    'comment': '+[[:Category:Taxonomy articles created by Polbot\u200e\u200e]]; cleanup; [[WP:GenFixes]] on, using [[Project:AWB|AWB]]'}},
  {'access': '2019-05-01T00:00:00Z',
   'revision': {'revid': 833068079,
    'parentid': 825632911,
    'user': 'Tom.Reding',
    'timestamp': '2018-03-29T14:04:16Z',
    'comment': '+[[:Category:Taxonomy articles created by Polbot\u200e\u200e]]; cleanup; [[WP:GenFixes]] on, using [[Project:AWB|AWB]]'}},
  {'access': '2019-04-01T00:00:00Z',
   'revision': {'revid': 833068079,
    'pare

In [None]:
import datetime
import dateutil.parser

def parse_date(date):
    """Parse date from ISO 8601 format to datetime
 
    Parameters:
        date (str): date in the format ISO 8601: 2001-01-15T14:56:00Z
        
    Returns:
        date (datetime): date parsed

    """
    return dateutil.parser.parse(date)

def format_date(date):
    """Format date ISO 8601 (i.e. 2001-01-15T14:56:00Z)
 
    Parameters:
        date (datetime): date
        
    Returns:
        date (str): ISO 8601 formatted date
    
    """
    s = "%Y-%m-%dT%H:%M:%SZ"
    return date.strftime(s)

date_idx=0
dates = pd.date_range("2017-01-03T01:32:59Z","2017-04-03T01:32:59Z", freq='MS').strftime("%Y-%m-%dT%H:%M:%SZ").tolist()

while date_idx < len(dates):
    print(dates[date_idx])
    date_idx += 1

In [None]:
from pandas.io.json import json_normalize

#revisions = data["query"]["pages"][page_id]["revisions"]
#revisions = list(data["query"]["pages"].values())[0]["revisions"]

json_normalize(revisions_info)

In [11]:
import requests

S = requests.Session()
URL = "https://en.wikipedia.org/w/api.php"

def get_revision(title, access_date):
    # title = "Procellariidae"
    # date = "2017-04-03T01:32:59.000Z"
    params = {
        "action": "query",
        "prop": "revisions",
        "titles": title,
       # "rvprop": "timestamp|user|comment|content|ids",
        "rvprop": "content",
        "rvslots": "main",
        "formatversion": "2",
        "format": "json",
        "rvlimit": 1,
        "rvstart": access_date,
        "rvdir": "older",
    }

    response = S.get(url=URL, params=params)
    
    return response.json()


def parse_revision_response(response, date):
    page = list(response["query"]["pages"])[0]
    revision = list(page["revisions"])[0]
   # return (page["pageid"], page["title"], revision["user"], revision["timestamp"], revision["comment"], revision["slots"]["main"]["content"])
    return revision["slots"]["main"]["content"]

date = "2009-01-03T00:00:00.000Z"
response = get_revision("Procellariidae", date)
parse_revision_response(response, date)
#category = parse_revision_category_content(response)
# page = list(response["query"]["pages"])[0]
# revision = list(page["revisions"])[0]
# content = revision["slots"]["main"]["content"]
# content

In [7]:
response

{'continue': {'rvcontinue': '20081113210734|251630846', 'continue': '||'},
 'query': {'pages': [{'pageid': 224443,
    'ns': 0,
    'title': 'Procellariidae',
    'revisions': [{'slots': {'main': {'contentmodel': 'wikitext',
        'contentformat': 'text/x-wiki',
        'content': '{{Taxobox\n| name = Procellariidae\n| image = Cape Petrel (Pintado) at Antarctic Convergence Zone.jpg\n| image_width = 250px\n| image_caption = [[Cape Petrel]] \'\'Daption capense\'\'\n| regnum = [[Animal]]ia\n| phylum = [[Chordate|Chordata]]\n| classis = [[bird|Aves]]\n| ordo = [[Procellariiformes]]\n| familia = \'\'\'Procellariidae\'\'\'\n| familia_authority = [[William Elford Leach|Leach]], 1820\n| subdivision_ranks = Genera\n| subdivision = \nSeveral, [[List of Procellariidae]].\n}}\nThe [[family (biology)|family]] \'\'\'Procellariidae\'\'\' is a group of [[seabird]]s that comprises the [[fulmarine petrel]]s, the [[gadfly petrel]]s, the [[prion (bird)|prions]], and the [[shearwater]]s. This family is p

In [27]:
def get_revision_info(title, access_date):
    # title = "Procellariidae"
    # date = "2017-04-03T01:32:59.000Z"
    params = {
        "action": "query",
        "prop": "revisions",
        "titles": title,
        "rvprop": "timestamp|user|comment|ids",
        "rvslots": "main",
        "formatversion": "2",
        "format": "json",
        "rvlimit": 500,
        "rvstart": access_date,
        "rvdir": "older",
              
    }

    response = S.get(url=URL, params=params)
    
    return response.json()
title = 'Spoo'
date_end = "2009-01-03T00:00:00Z"
get_revision_info(title, date_end)

{'batchcomplete': True,
 'query': {'pages': [{'pageid': 59643877, 'ns': 0, 'title': 'Spoo'}]}}

In [28]:
def get_page_redirect(title):
    """ redirect page
    """
    PARAMS = {
        'action': "query",
        'format': "json",
        'titles': title,
       # 'prop': "redirects"
        'redirects' : 1,
    }

    R = S.get(url=URL, params=PARAMS)
    DATA = R.json()
    return DATA

title = 'Spoo'
print(get_page_redirect(title))

# title = 'Variegated fairywren'
# print(get_page_redirect(title))

title = 'List of Danish football champions'
print(get_page_redirect(title))

{'batchcomplete': '', 'query': {'pages': {'59643877': {'pageid': 59643877, 'ns': 0, 'title': 'Spoo'}}}}
{'batchcomplete': '', 'query': {'pages': {'4862862': {'pageid': 4862862, 'ns': 0, 'title': 'List of Danish football champions'}}}}


In [302]:
result = re.findall('(class=(.+?)}|class=(.+?)\||class=(.+?)\n)', '|class=FA\n|')
result
#return result[0][1] if result[0][0][-1:] == '}' else result[0][2] if result[0][0][-1:] == '|' else result[0][3]


[('class=FA\n', '', '', 'FA')]

In [318]:
def parse_revision_category_content(text):
    result = re.findall('(class=(.+?)}|class=(.+?)\||class=(.+?)\n)', content)
    return result[0][1] if result[0][0][-1:] == '}' else result[0][2] if result[0][0][-1:] == '|' else result[0][3]

title = "GNOME"
date = "2007-01-03T00:00:00.000Z"

response = get_revision(f"Talk:{title}", date)
content = parse_revision_response(response, date)
parse_revision_category_content(content)

'GA|importance=Top|bc-current=yes|nested=yes'

In [319]:
content

'{{talkheader}}\n{{ArticleHistory\n|action1=GAN\n|action1date=02:39, 16 December 2005\n|action1result=listed\n|action1oldid=31516584\n\n|action2=FAC\n|action2date=17:57, 18 December 2006\n|action2link=Wikipedia:Featured article candidates/Brazil/archive1\n|action2result=failed\n|action2oldid=95121865\n\n|action3=GAR\n|action3date=21:22, 10 January 2007\n|action3result=delisted\n|action3oldid=99625709\n\n|action4=GAN\n|action4date=14:33, 7 May 2007\n|action4result=listed\n|action4oldid=128902215\n\n|action5=GAR\n|action5date=03:42, 11 May 2007\n|action5link=Wikipedia:Good article review/Archive 18#Brazil\n|action5result=delisted\n|action5oldid=130215626\n\n|action6=FAC\n|action6date=00:11, 12 May 2007\n|action6link=Wikipedia:Featured article candidates/Brazil/archive2\n|action6result=not promoted\n|action6oldid=130215626\n\n|action7=GAN\n|action7date=02:35, 12 May 2007\n|action7result=failed\n|action7oldid=130215626\n\n|action8=GAN\n|action8date=28 June 2007\n|action8result=failed\n|act

In [64]:
import os
folder = "collected_data/revision_info_200701-200901/data"
titles = os.listdir(folder)
titles[:10]

['The Lion King',
 'Luther Burbank',
 'Avatar: The Last Airbender',
 'Freak the Sheep Vol. 2',
 'Heroes of Wrestling',
 'Lady',
 'New York State Route 345',
 'Universe of Kingdom Hearts',
 'Helpless Automaton',
 'Samuel G. Arnold']

In [107]:
import pandas as pd
title = "Nelvana"
input = f"{folder}/{title}"
df = pd.read_csv(input)

In [336]:
# Pope, Nelvana, Big Bang, Munster, Fremen, Chicago 19, GNOME, Stargate, Enron, Namco, Pholcidae, Freenet
title = "Pope"
def get_category(title, date):
    response = get_revision(f"Talk:{title}", date)
    content = parse_revision_response(response, date)
    #return re.findall("{{.*class=.*}}", content)
   # return re.search('class=(.+?)\||}}]', content).group(1)
    result = re.findall('(class=(.+?)^[|\n ]}|class=(.+?)\||class=(.+?)\n)', content)
    return result[0][1] if result[0][0][-1:] == '}' else result[0][2] if result[0][0][-1:] == '|' else result[0][3]# if result[0][0][-1:] == ' ' else result[0][4]

for i, row in df.iterrows():
    date = row['revision.timestamp']
    category = get_category(title, date)
    print(category)
    df.loc[i, 'raw_category'] = str(category)
df

    

B
B
A
A
A
A
A
A
A
A
A
A
A


KeyError: 'revisions'

In [342]:
date_start = "2002-01-03T00:00:00Z"
date_end = "2011-01-03T00:00:00Z"

dates = pd.date_range(date_start, date_end, freq='3MS').strftime("%Y-%m-%dT%H:%M:%SZ").tolist()[::-1]
dates

['2010-11-01T00:00:00Z',
 '2010-08-01T00:00:00Z',
 '2010-05-01T00:00:00Z',
 '2010-02-01T00:00:00Z',
 '2009-11-01T00:00:00Z',
 '2009-08-01T00:00:00Z',
 '2009-05-01T00:00:00Z',
 '2009-02-01T00:00:00Z',
 '2008-11-01T00:00:00Z',
 '2008-08-01T00:00:00Z',
 '2008-05-01T00:00:00Z',
 '2008-02-01T00:00:00Z',
 '2007-11-01T00:00:00Z',
 '2007-08-01T00:00:00Z',
 '2007-05-01T00:00:00Z',
 '2007-02-01T00:00:00Z',
 '2006-11-01T00:00:00Z',
 '2006-08-01T00:00:00Z',
 '2006-05-01T00:00:00Z',
 '2006-02-01T00:00:00Z',
 '2005-11-01T00:00:00Z',
 '2005-08-01T00:00:00Z',
 '2005-05-01T00:00:00Z',
 '2005-02-01T00:00:00Z',
 '2004-11-01T00:00:00Z',
 '2004-08-01T00:00:00Z',
 '2004-05-01T00:00:00Z',
 '2004-02-01T00:00:00Z',
 '2003-11-01T00:00:00Z',
 '2003-08-01T00:00:00Z',
 '2003-05-01T00:00:00Z',
 '2003-02-01T00:00:00Z',
 '2002-11-01T00:00:00Z',
 '2002-08-01T00:00:00Z',
 '2002-05-01T00:00:00Z',
 '2002-02-01T00:00:00Z']

In [345]:
# Pope, Nelvana, Big Bang, Munster, Fremen, Chicago 19, GNOME, Stargate, Enron, Namco, Pholcidae, Freenet, Kakapo
title = "Pigment"
for date in dates:
    try:
        category = get_category(title, date)
        print(f"{date} {category}")
    except:
        pass
{{class=A}}
{{class=A|aodkaod}}
{{class=A\n|}}
{{class=A }}

2010-11-01T00:00:00Z B
2010-08-01T00:00:00Z B
2010-05-01T00:00:00Z B
2010-02-01T00:00:00Z B
2009-11-01T00:00:00Z B
2009-08-01T00:00:00Z B
2009-05-01T00:00:00Z GA
2009-02-01T00:00:00Z GA
2008-11-01T00:00:00Z GA
2008-08-01T00:00:00Z GA
2008-05-01T00:00:00Z GA
2008-02-01T00:00:00Z GA
2007-11-01T00:00:00Z GA
2007-08-01T00:00:00Z GA
2007-05-01T00:00:00Z GA
2007-02-01T00:00:00Z GA
2006-11-01T00:00:00Z GA
2006-08-01T00:00:00Z "thumb tleft">
