In [2]:
#you may need to pip install pandas and tqdm before using this script
import pandas as pd
import requests
import time 
import json
import urllib.parse
from tqdm import tqdm
import numpy as np

In [239]:
politicians = pd.read_csv("politicians_by_country_SEPT.2022.csv")

In [240]:
populations = pd.read_csv("population_by_country_2022.csv")
#lets add a column to the populations table that will map each country to its most specific region. We will name this column "region"
#in this case each country will be mapped to the MOST SPECIFIC region so instead of AFRICA a country may be marked as WEST AFRICA or whatever specific region tag it is assigned.
regions_dict = {}
region = ""
for geography in populations["Geography"]:
    if geography.isupper():
        region = geography
    regions_dict[geography] = region
populations["region"] = populations["Geography"].map(regions_dict)

In [241]:
#########
#
#    CONSTANTS for WIKIPEDIA AND ORES look ups
#

# The basic English Wikipedia API endpoint
API_ENWIKIPEDIA_ENDPOINT = "https://en.wikipedia.org/w/api.php"
# The current ORES API endpoint
API_ORES_SCORE_ENDPOINT = "https://ores.wikimedia.org/v3"
# A template for mapping to the URL
API_ORES_SCORE_PARAMS = "/scores/{context}/{revid}/{model}"

# We'll assume that there needs to be some throttling for these requests - we should always be nice to a free data resource
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = (1.0/100.0)-API_LATENCY_ASSUMED

# When making automated requests we should include something that is unique to the person making the request
# This should include an email - your UW email would be good to put in there
REQUEST_HEADERS = {
    'User-Agent': 'shurygin@uw.edu, University of Washington, MSDS DATA 512 - AUTUMN 2022',
}

# This is just a list of all the politicians we will be looking at
ARTICLE_TITLES = politicians["name"]

# This is a string of additional page properties that can be returned see the Info documentation for
# what can be included. If you don't want any this can simply be the empty string
PAGEINFO_EXTENDED_PROPERTIES = "talkid|url|watched|watchers"

# This template lists the basic parameters for making this
PAGEINFO_PARAMS_TEMPLATE = {
    "action": "query",
    "format": "json",
    "titles": "",           # to simplify this should be a single page title at a time, will be filled in before lookup
    "prop": "info",
    "inprop": PAGEINFO_EXTENDED_PROPERTIES
}

ORES_PARAMS_TEMPLATE = {
    "context": "enwiki",        # which WMF project for the specified revid
    "revid" : "",               # the revision to be scored - this will probably change each call
    "model": "articlequality"   # the AI/ML scoring model to apply to the reviewion
}


In [242]:
"""
This method looks up a particular article title in the wikipedia API and returns information about it. For our purposes we only really desire the last_rev_id.
Which is the ID of the last revision given on the article.

This method does not throw errors, however will print them and return None when they occur.
"""
def request_pageinfo_per_article(article_title = None, 
                                 endpoint_url = API_ENWIKIPEDIA_ENDPOINT, 
                                 request_template = PAGEINFO_PARAMS_TEMPLATE,
                                 headers = REQUEST_HEADERS):
    # Make sure we have an article title
    if not article_title: return None
    
    request_template['titles'] = article_title
        
    # make the request
    try:
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(endpoint_url, headers=headers, params=request_template)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response

In [243]:
"""
This method looks up a particular revision ID in the ORES API and returns the wikipedia assigned quality score given to that version of the wikipedia article.
article_revid is an integer, sourced from the information given by the request_pageinfo_per_article() method.

This method does not throw errors, however will print them and return None when they occur.
"""
def request_ores_score_per_article(article_revid = None, 
                                   endpoint_url = API_ORES_SCORE_ENDPOINT, 
                                   endpoint_params = API_ORES_SCORE_PARAMS, 
                                   request_template = ORES_PARAMS_TEMPLATE,
                                   headers = REQUEST_HEADERS,
                                   features=False):
    # Make sure we have an article revision id
    if not article_revid: return None
    
    # set the revision id into the template
    request_template['revid'] = article_revid
    
    # now, create a request URL by combining the endpoint_url with the parameters for the request
    request_url = endpoint_url+endpoint_params.format(**request_template)
    
    # the features used by the ML model can sometimes be returned as well as scores
    if features:
        request_url = request_url+"?features=true"
    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free
        # data source like ORES - or other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(request_url, headers=headers)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response

In [244]:
# this helper function takes in a row of the "politicians_by_country_SEPT.2022.csv" table and looks up the name of the article in the wikipedia API.
# its main role is error handling if the CSV is malformed or if the request_pageinfo_per_article() encounters an error retrieving information.
#will print information and return None if any errors are encountered (either malformed information in the CSV, unexpected inputs, or connectivity issues with the API call)
def rev_id_helper(row):
    name = row[0] #the name of the article will always come in first column, unless schema of politicians table is changed
    json_info = request_pageinfo_per_article(name)
    if json_info is None: #we will log this error later
        return None
    json_info = json_info["query"]["pages"]
    page_id = list(json_info.keys())[0] #this pageid is only used as a key in the JSON returned by the API.
    if "lastrevid" in json_info[page_id].keys():
        rev_id = json_info[page_id]["lastrevid"]
    else: #we will log this error later
        print(row)
        rev_id = None
    return rev_id

#looks up each politician using request_pageinfo_per_article and appends their last revision ID onto the pandas dataframe.
# see rev_id_helper for implementation details.
def append_last_rev_ids(politicians):
    politicians["rev_id"] = politicians.progress_apply(lambda row: rev_id_helper(row), axis=1)
    return politicians

# this helper function takes in a row of the "politicians_by_country_SEPT.2022.csv" table AFTER the append_last_rev_ids call has been applied onto it
# This method then looks up the last revision id with the ORES API to find a quality score for the article.
# its main role is error handling if the CSV is malformed or if the request_pageinfo_per_article() encounters an error retrieving information.
#will print information and return None if any errors are encountered (either malformed information in the CSV, unexpected inputs, or connectivity issues with the API call)
def ores_helper(row):
    revid = row[3]
    json_info = request_ores_score_per_article(revid)
    if json_info is None: #we have a connection error to this specific endpoint, return None so we can log them later
        return "json_error"
    json_info = json_info["enwiki"]["scores"]
    page_id = list(json_info.keys())[0] #this pageid is only used as a key in the JSON returned by the API.
    json_info = json_info[page_id]["articlequality"]
    if "score" in json_info.keys():
        ores = json_info["score"]["prediction"]
        return ores
    else: #we have an ORES error, we should log this... We will give it a None value and log all Nones later
        print(row)
        return None
#looks up each revision id in a table (after append_last_rev_ids has been applied) and calls the ORES API to obtain a quality score, saving it in the "ores" column of the input table
def append_ores_scores(df):
    df["ores"] = df.progress_apply(lambda row: ores_helper(row), axis=1)
    return df

In [245]:
tqdm.pandas()
politicians = append_last_rev_ids(politicians)


 32%|███▏      | 2448/7584 [09:13<18:49,  4.55it/s]

name                                     Prince Ofosu Sefah
url        https://en.wikipedia.org/wiki/Prince_Ofosu_Sefah
country                                               Ghana
Name: 2446, dtype: object


 39%|███▉      | 2987/7584 [11:15<16:59,  4.51it/s]

name                                    Harjit Kaur Talwandi
url        https://en.wikipedia.org/wiki/Harjit_Kaur_Talw...
country                                                India
Name: 2985, dtype: object


 50%|████▉     | 3786/7584 [14:16<14:06,  4.49it/s]

name                                     Kang Sun-nam
url        https://en.wikipedia.org/wiki/Kang_Sun-nam
country                                  Korea, North
Name: 3784, dtype: object


 64%|██████▍   | 4881/7584 [18:28<09:36,  4.69it/s]

name                                Segun “Aeroland” Adewale
url        https://en.wikipedia.org/wiki/Segun_”Aeroland”...
country                                              Nigeria
Name: 4879, dtype: object


 77%|███████▋  | 5803/7584 [22:00<06:16,  4.74it/s]

name                                     Roman Konoplev
url        https://en.wikipedia.org/wiki/Roman_Konoplev
country                                          Russia
Name: 5801, dtype: object


 84%|████████▎ | 6346/7584 [24:04<04:59,  4.14it/s]

name                                  Nhlanhla “Lux” Dlamini
url        https://en.wikipedia.org/wiki/Nhlanhla_”Lux”_D...
country                                         South Africa
Name: 6344, dtype: object


100%|██████████| 7584/7584 [28:48<00:00,  4.39it/s]


In [246]:
#We have been logging any errors in rev_id lookup. So lets quickly identify those articles and exclude them.
print("The following politicians are excluded from the ORES quality search and subsequent analysis because information on their last revision ID was not available.")
print(politicians[politicians["rev_id"].isna()]["name"])
politicians = politicians[politicians["rev_id"].notna()]
politicians["rev_id"] = politicians["rev_id"].astype(int)

The following politicians are excluded from the ORES quality search and subsequent analysis because information on their last revision ID was not available.
2446          Prince Ofosu Sefah
2985        Harjit Kaur Talwandi
3784                Kang Sun-nam
4879    Segun “Aeroland” Adewale
5801              Roman Konoplev
6344      Nhlanhla “Lux” Dlamini
Name: name, dtype: object


In [247]:
tqdm.pandas()
politicians = append_ores_scores(politicians)


100%|██████████| 7578/7578 [25:16<00:00,  5.00it/s]


In [248]:
#We have been logging any errors in rev_id lookup. So lets quickly identify those articles and exclude them.
print("The following politicians are excluded from subsequent analysis because information on their ORES score was not available.")
print(politicians[politicians["ores"].isna()]["name"])
politicians = politicians[politicians["ores"].notna()]

The following politicians are excluded from subsequent analysis because information on their ORES score was not available.
Series([], Name: name, dtype: object)


In [249]:
#At this point I modified the populations CSV to add a combined north and south korea as some of the politicians had listed countrys of just "korean"
#Which did not exist in the populations csv.

wp_politicians_by_country = politicians.merge(populations, how="left", left_on="country", right_on="Geography")
wp_politicians_by_country = wp_politicians_by_country[["country", "region", "Population (millions)", "name", "rev_id", "ores"]]
countries_no_matches = populations.merge(politicians, how="left", left_on="Geography", right_on = "country")
countries_no_matches = countries_no_matches[countries_no_matches["name"].isna()]["Geography"]
countries_no_matches.to_csv("wp_countries-no_match.txt", index=False)
wp_politicians_by_country.to_csv("wp_politicians_by_country.csv", index=False)

In [3]:
wp_politicians_by_country = pd.read_csv("wp_politicians_by_country.csv")
high_quality_ores_codes = ["FA", "GA"]
wp_politicians_by_country_high_quality_only = wp_politicians_by_country[wp_politicians_by_country["ores"].isin(high_quality_ores_codes)]

In [27]:
"""
STEP 5: RESULTS.
Lets generate all of the tables using the analysis we have done. We will then display them in 6 separate cells below this one.
"""
#top_10_countries_by_coverage = wp_politicians_by_country.groupby(["country"])["name"].count().sort_values(ascending=False)[:10]
top_10_countries_by_coverage = wp_politicians_by_country.groupby(["country"]).agg({"Population (millions)": ["count", "sum"]})
top_10_countries_by_coverage["per_million_capita"] = top_10_countries_by_coverage[('Population (millions)','count')] / top_10_countries_by_coverage[('Population (millions)','sum')]
top_10_countries_by_coverage = top_10_countries_by_coverage.sort_values(by='per_million_capita', ascending=False)
top_10_countries_by_coverage = top_10_countries_by_coverage[top_10_countries_by_coverage["per_million_capita"] != np.inf][:10]


#bottom_10_countries_by_coverage = wp_politicians_by_country.groupby(["country"])["name"].count().sort_values(ascending=True)[:10]
bottom_10_countries_by_coverage = wp_politicians_by_country.groupby(["country"]).agg({"Population (millions)": ["count", "sum"]})
bottom_10_countries_by_coverage["per_million_capita"] = bottom_10_countries_by_coverage[('Population (millions)','count')] / bottom_10_countries_by_coverage[('Population (millions)','sum')]
bottom_10_countries_by_coverage = bottom_10_countries_by_coverage.sort_values(by='per_million_capita', ascending=True)
bottom_10_countries_by_coverage = bottom_10_countries_by_coverage[bottom_10_countries_by_coverage["per_million_capita"] != np.inf][:10]


#top_10_countries_by_quality = wp_politicians_by_country_high_quality_only.groupby(["country"])["name"].count().sort_values(ascending=False)[:10]
top_10_countries_by_quality = wp_politicians_by_country_high_quality_only.groupby(["country"]).agg({"Population (millions)": ["count", "sum"]})
top_10_countries_by_quality["per_million_capita"] = top_10_countries_by_quality[('Population (millions)','count')] / top_10_countries_by_quality[('Population (millions)','sum')]
top_10_countries_by_quality = top_10_countries_by_quality.sort_values(by='per_million_capita', ascending=False)
top_10_countries_by_quality = top_10_countries_by_quality[top_10_countries_by_quality["per_million_capita"] != np.inf][:10]


#bottom_10_countries_by_quality = wp_politicians_by_country_high_quality_only.groupby(["country"])["name"].count().sort_values(ascending=True)[:10]
bottom_10_countries_by_quality = wp_politicians_by_country_high_quality_only.groupby(["country"]).agg({"Population (millions)": ["count", "sum"]})
bottom_10_countries_by_quality["per_million_capita"] = bottom_10_countries_by_quality[('Population (millions)','count')] / bottom_10_countries_by_quality[('Population (millions)','sum')]
bottom_10_countries_by_quality = bottom_10_countries_by_quality.sort_values(by='per_million_capita', ascending=True)
bottom_10_countries_by_quality = bottom_10_countries_by_quality[bottom_10_countries_by_quality["per_million_capita"] != np.inf][:10]

top_10_regions_by_coverage = wp_politicians_by_country.groupby(["region"]).agg({"Population (millions)": ["count", "sum"]})
top_10_regions_by_coverage["per_million_capita"] = top_10_regions_by_coverage[('Population (millions)','count')] / top_10_regions_by_coverage[('Population (millions)','sum')]
top_10_regions_by_coverage = top_10_regions_by_coverage.sort_values(by='per_million_capita', ascending=False)

top_10_regions_by_quality = wp_politicians_by_country_high_quality_only.groupby(["region"]).agg({"Population (millions)": ["count", "sum"]})
top_10_regions_by_quality["per_million_capita"] = top_10_regions_by_quality[('Population (millions)','count')] / top_10_regions_by_quality[('Population (millions)','sum')]
top_10_regions_by_quality = top_10_regions_by_quality.sort_values(by='per_million_capita', ascending=False)

In [28]:
"""
STEP 5: RESULTS: Table 1 Top 10 countries by coverage
"""
top_10_countries_by_coverage

Unnamed: 0_level_0,Population (millions),Population (millions),per_million_capita
Unnamed: 0_level_1,count,sum,Unnamed: 3_level_1
country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Marshall Islands,9,0.9,10.0
Dominica,4,0.4,10.0
Federated States of Micronesia,13,1.3,10.0
Andorra,10,1.0,10.0
Grenada,5,0.5,10.0
Antigua and Barbuda,17,1.7,10.0
St. Vincent and the Grenadines,3,0.3,10.0
St. Kitts-Nevis,3,0.3,10.0
Seychelles,6,0.6,10.0
Tonga,3,0.3,10.0


In [29]:
"""
STEP 5: RESULTS: Table 2 Bottom 10 countries by coverage
"""
bottom_10_countries_by_coverage

Unnamed: 0_level_0,Population (millions),Population (millions),per_million_capita
Unnamed: 0_level_1,count,sum,Unnamed: 3_level_1
country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
China,2,2873.2,0.000696
India,178,252261.6,0.000706
Indonesia,111,30580.5,0.00363
Pakistan,120,28296.0,0.004241
Nigeria,221,48288.5,0.004577
Brazil,89,19117.2,0.004655
Bangladesh,56,9587.2,0.005841
Russia,173,24963.9,0.00693
Mexico,1,127.5,0.007843
Japan,108,13489.2,0.008006


In [30]:
"""
STEP 5: RESULTS: Table 3 Top 10 countries by high quality
"""
top_10_countries_by_quality

Unnamed: 0_level_0,Population (millions),Population (millions),per_million_capita
Unnamed: 0_level_1,count,sum,Unnamed: 3_level_1
country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Andorra,2,0.2,10.0
Montenegro,3,1.8,1.666667
Suriname,1,0.6,1.666667
Estonia,1,1.3,0.769231
Kosovo,1,1.8,0.555556
Slovenia,2,4.2,0.47619
Gabon,2,4.8,0.416667
Albania,6,16.8,0.357143
Lithuania,3,8.4,0.357143
Armenia,1,3.0,0.333333


In [31]:
"""
STEP 5: RESULTS: Table 4 Bottom 10 countries by high quality
"""
bottom_10_countries_by_quality

Unnamed: 0_level_0,Population (millions),Population (millions),per_million_capita
Unnamed: 0_level_1,count,sum,Unnamed: 3_level_1
country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
India,6,8503.2,0.000706
Indonesia,14,3857.0,0.00363
Pakistan,5,1179.0,0.004241
Nigeria,4,874.0,0.004577
Russia,16,2308.8,0.00693
Japan,2,249.8,0.008006
Ethiopia,3,370.2,0.008104
Vietnam,2,198.8,0.01006
Iran,2,177.2,0.011287
Germany,4,333.2,0.012005


In [32]:
"""
STEP 5: RESULTS: Table 5 Geographic regions by total coverage
"""
top_10_regions_by_coverage

Unnamed: 0_level_0,Population (millions),Population (millions),per_million_capita
Unnamed: 0_level_1,count,sum,Unnamed: 3_level_1
region,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
OCEANIA,86,110.1,0.781108
NORTHERN EUROPE,262,1348.4,0.194304
CARIBBEAN,201,1239.5,0.162162
CENTRAL AMERICA,195,1755.7,0.111067
CENTRAL ASIA,106,1788.4,0.059271
SOUTHERN EUROPE,890,19444.6,0.045771
WESTERN ASIA,687,15577.1,0.044103
EASTERN AFRICA,650,19068.0,0.034089
NORTHERN AFRICA,227,7639.9,0.029712
MIDDLE AFRICA,203,7919.0,0.025635


In [33]:
"""
STEP 5: RESULTS: Table 6 Geographic regions by high quality coverage
"""
top_10_regions_by_quality

Unnamed: 0_level_0,Population (millions),Population (millions),per_million_capita
Unnamed: 0_level_1,count,sum,Unnamed: 3_level_1
region,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
OCEANIA,2,9.3,0.215054
NORTHERN EUROPE,8,46.6,0.171674
MIDDLE AFRICA,5,43.9,0.113895
CENTRAL AMERICA,10,102.2,0.097847
CARIBBEAN,8,89.6,0.089286
CENTRAL ASIA,3,45.2,0.066372
WESTERN ASIA,28,538.8,0.051967
SOUTHERN EUROPE,46,910.6,0.050516
NORTHERN AFRICA,6,125.8,0.047695
SOUTH AMERICA,13,302.3,0.043004
