## Importing libraries

In [8]:
import json, time, urllib.parse
import os

import pandas as pd
# The 'requests' module is not a standard Python module. You will need to install this with pip/pip3 if you do not already have it
import requests

## Read csv

In [None]:
# Load the cities-by-state table from the provided CSV file, then pull out the
# 'page_title' column as a plain Python list of article titles to look up
cities_by_state = pd.read_csv('us_cities_by_state_SEPT.2023.csv')
ARTICLE_TITLES = cities_by_state['page_title'].tolist()

## Page info requests

In [9]:
#########
#
#    CONSTANTS
#

# Endpoint of the English Wikipedia API that the page info requests are sent to
API_ENWIKIPEDIA_ENDPOINT = "https://en.wikipedia.org/w/api.php"

# Requests are throttled so that we stay polite to a free data resource. The wait
# before each request is 1/100 of a second, less the latency we assume the API and
# network already add on their own.
API_LATENCY_ASSUMED = 0.002       # seconds - roughly 2ms assumed for API plus network
API_THROTTLE_WAIT = 0.01 - API_LATENCY_ASSUMED

# Automated requests should identify the person making them. Put a contact email
# in the User-Agent string - your UW email is a good choice.
REQUEST_HEADERS = dict([
    ('User-Agent', '<uwnetid@uw.edu>, University of Washington, MSDS DATA 512 - AUTUMN 2023'),
])

In [11]:
# Additional page properties to ask for alongside the basic page info (see the
# Info documentation for the available values). Set this to the empty string ""
# if no extended properties are wanted.
PAGEINFO_EXTENDED_PROPERTIES = "url"

# Base set of query parameters for a page info request. "titles" is left blank
# here and filled in per request - to keep things simple, one page title at a time.
PAGEINFO_PARAMS_TEMPLATE = dict(
    action="query",
    format="json",
    titles="",
    prop="info",
    inprop=PAGEINFO_EXTENDED_PROPERTIES,
)


In [12]:
#########
#
#    PROCEDURES/FUNCTIONS
#

def request_pageinfo_per_article(article_title = None,
                                 endpoint_url = API_ENWIKIPEDIA_ENDPOINT,
                                 request_template = PAGEINFO_PARAMS_TEMPLATE,
                                 headers = REQUEST_HEADERS,
                                 timeout = 30.0):
    """Request page info for a single article and return the decoded JSON response.

    Parameters:
        article_title:    title of the article to look up; when omitted, the 'titles'
                          value already present in request_template is used
        endpoint_url:     URL the GET request is sent to
        request_template: dict of query parameters; it is copied, never modified
        headers:          HTTP headers sent with the request
        timeout:          seconds to wait for the server before giving up
                          (passed through to requests.get)

    Returns:
        The value of response.json(), or None when the request or the JSON decoding
        raised an exception (the exception is printed).

    Raises:
        Exception: if no article title was supplied by either route.
    """
    # Work on a copy: writing the title into the caller's dict (by default the shared
    # module-level template) would leak it into every later call that omits a title
    request_params = dict(request_template)

    # article title can be as a parameter to the call or in the request_template
    if article_title:
        request_params['titles'] = article_title

    if not request_params.get('titles'):
        raise Exception("Must supply an article title to make a pageinfo request.")

    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free
        # data source like Wikipedia - or any other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        # without a timeout a stalled connection would hang the whole notebook run
        response = requests.get(endpoint_url, headers=headers, params=request_params, timeout=timeout)
        json_response = response.json()
    except Exception as e:
        # best effort: report the problem and let the caller decide what to do with None
        print(e)
        json_response = None
    return json_response


## ORES Requests

In [32]:
#########
#
#    CONSTANTS
#

#    The current LiftWing ORES API endpoint and prediction model
#
API_ORES_LIFTWING_ENDPOINT = "https://api.wikimedia.org/service/lw/inference/v1/models/{model_name}:predict"
API_ORES_EN_QUALITY_MODEL = "enwiki-articlequality"

#
#    The throttling rate is a function of the Access token that you are granted when you request the token. The constants
#    come from dissecting the token and getting the rate limits from the granted token: its 'ratelimit' claim is
#    requests_per_unit=5000 with unit=HOUR, so the wait between requests is one hour (60*60 seconds) divided by 5000,
#    less the latency we assume the API and network add anyway.
#
#    NOTE: these two names are also defined by the page info constants earlier in the notebook. Once this cell has run,
#    every function that reads API_THROTTLE_WAIT at call time - including request_pageinfo_per_article - waits this long.
#
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = ((60.0*60.0)/5000.0)-API_LATENCY_ASSUMED

#    Automated requests should carry something that identifies the person making them - include an email
#    address (your UW email is a good choice).
#
#    Every LiftWing API request also has to be authenticated, so the access token travels in the
#    header as well.
#
#    The '{email_address}' and '{access_token}' placeholders are filled in with str.format() when a request is made.
#
REQUEST_HEADER_TEMPLATE = dict([
    ('User-Agent', "<{email_address}>, University of Washington, MSDS DATA 512 - AUTUMN 2023"),
    ('Content-Type', 'application/json'),
    ('Authorization', "Bearer {access_token}"),
])
#
#    The values that get substituted into the header template above - both start out empty
#
REQUEST_HEADER_PARAMS_TEMPLATE = dict(
    email_address="",         # your email address should go here
    access_token="",          # the access token you create will need to go here
)

#
#    Sample data for the ORES scoring example: English Wikipedia article titles (keys) mapped to a revision ID (values)
#
ARTICLE_REVISIONS = {
    'Bison': 1085687913,
    'Northern flicker': 1086582504,
    'Red squirrel': 1083787665,
    'Chinook salmon': 1085406228,
    'Horseshoe bat': 1060601936,
}

#
#    Payload template for a scoring request to the ORES model
#
ORES_REQUEST_DATA_TEMPLATE = dict(
    lang="en",          # must be english - we're scoring English Wikipedia revisions
    rev_id="",          # a revision id is required; filled in per request
    features=True,
)

#
#    Credential placeholders - given empty values here so the names exist before they are filled in later
#
USERNAME, ACCESS_TOKEN = "", ""
#

In [33]:
USERNAME = "Ameyabhamare"
# SECURITY: an access token is a credential and must never be stored in the notebook itself.
# Export it in the environment before starting Jupyter, e.g.
#     export WIKIMEDIA_ACCESS_TOKEN="<your token>"
# If the variable is not set this is the empty string, and request_ores_score_per_article
# raises "Must provide an 'access_token' value" instead of sending an unauthenticated request.
# Any token that was previously hardcoded here should be treated as leaked and revoked.
ACCESS_TOKEN = os.environ.get("WIKIMEDIA_ACCESS_TOKEN", "")

In [34]:
#########
#
#    PROCEDURES/FUNCTIONS
#

def request_ores_score_per_article(article_revid = None, email_address=None, access_token=None,
                                   endpoint_url = API_ORES_LIFTWING_ENDPOINT,
                                   model_name = API_ORES_EN_QUALITY_MODEL,
                                   request_data = ORES_REQUEST_DATA_TEMPLATE,
                                   header_format = REQUEST_HEADER_TEMPLATE,
                                   header_params = REQUEST_HEADER_PARAMS_TEMPLATE,
                                   timeout = 30.0):
    """Request a model score for one article revision and return the decoded JSON response.

    Values passed directly (article_revid, email_address, access_token) take priority
    over whatever is already present in request_data / header_params.

    Parameters:
        article_revid:  revision id to score
        email_address:  contact email substituted into the request headers
        access_token:   access token substituted into the request headers
        endpoint_url:   URL template containing a '{model_name}' placeholder
        model_name:     model name substituted into endpoint_url
        request_data:   dict used as the JSON payload; it is copied, never modified
        header_format:  dict of header name -> format string
        header_params:  dict of values for the header format strings; copied, never modified
        timeout:        seconds to wait for the server before giving up
                        (passed through to requests.post)

    Returns:
        The value of response.json(), or None when the request or the JSON decoding
        raised an exception (the exception is printed).

    Raises:
        Exception: if the revision id, email address or access token is missing.
    """
    # Work on copies: writing into the caller's dicts (by default the shared module-level
    # templates) would keep the access token and the last rev_id around in global state
    # and silently reuse them on later calls
    payload = dict(request_data)
    credentials = dict(header_params)

    #    Make sure we have an article revision id, email and token
    #    This approach prioritizes the parameters passed in when making the call
    if article_revid:
        payload['rev_id'] = article_revid
    if email_address:
        credentials['email_address'] = email_address
    if access_token:
        credentials['access_token'] = access_token

    #   Making a request requires a revision id - an email address - and the access token
    if not payload.get('rev_id'):
        raise Exception("Must provide an article revision id (rev_id) to score articles")
    if not credentials.get('email_address'):
        raise Exception("Must provide an 'email_address' value")
    if not credentials.get('access_token'):
        raise Exception("Must provide an 'access_token' value")

    # Create the request URL with the specified model parameter - default is a article quality score request
    request_url = endpoint_url.format(model_name=model_name)

    # Create a compliant request header from the template and the supplied parameters
    headers = {str(key): value.format(**credentials) for key, value in header_format.items()}

    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free data
        # source like ORES - or other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        # without a timeout a stalled connection would hang the whole notebook run
        response = requests.post(request_url, headers=headers, data=json.dumps(payload), timeout=timeout)
        json_response = response.json()
    except Exception as e:
        # best effort: report the problem and let the caller decide what to do with None
        print(e)
        json_response = None
    return json_response


## Calculating article quality by retrieving the revision ID and making an ORES request

In [48]:
# One record per article is collected here so the predictions survive the loop;
# previously each iteration overwrote the last result and nothing was kept.
article_quality_records = []
for title in ARTICLE_TITLES:
    revision_id = None
    article_quality = None

    # Both request helpers return None when the request fails, and a page that does not
    # exist has no 'lastrevid', so every lookup is guarded instead of crashing the run.
    article_response = request_pageinfo_per_article(title)
    try:
        article_info = list(article_response['query']['pages'].values())[0]
        revision_id = article_info['lastrevid']
    except (KeyError, IndexError, TypeError, AttributeError):
        print(f"No current revision found for article '{title}'")

    if revision_id:
        ores_score = request_ores_score_per_article(article_revid=revision_id,
                                           email_address="ameyarb@uw.edu",
                                           access_token=ACCESS_TOKEN)
        try:
            article_quality = list(ores_score['enwiki']['scores'].values())[0]['articlequality']['score']['prediction']
        except (KeyError, IndexError, TypeError, AttributeError):
            print(f"No quality prediction returned for article '{title}' (revision {revision_id})")

    # Articles that could not be scored are still recorded (with None) so they can be retried
    article_quality_records.append({
        'page_title': title,
        'revision_id': revision_id,
        'article_quality': article_quality,
    })

C
C
C
GA
C
C


KeyboardInterrupt: 