# Homework 1 - Professionalism & Reproducibility
### DATA 512 - Human-Centered Data Science
#### Aamir Darukhanawalla

#### Import the required libraries

In [541]:
import numpy as np
import pandas as pd

import json
import requests
import time

from math import ceil

### Step 1: Getting the Article and Population Data

In [542]:
POLITICIANS_FILE = 'Data/politicians_by_country_SEPT.2022.csv.xlsx'
POPULATION_FILE = 'Data/population_by_country_2022.csv.xlsx'
politicians_list = pd.read_excel(POLITICIANS_FILE)
population_list = pd.read_excel(POPULATION_FILE)

#### Analyze data for politicians by country

In [543]:
# Check for duplicate entries in the data
politicians_list.value_counts().head()

name                               url                                                              country    
Ibrahim Megag Samatar              https://en.wikipedia.org/wiki/Ibrahim_Megag_Samatar              Somalia        2
Abdirahman Aw Ali Farrah           https://en.wikipedia.org/wiki/Abdirahman_Aw_Ali_Farrah           Somalia        2
8th National Assembly of Slovenia  https://en.wikipedia.org/wiki/8th_National_Assembly_of_Slovenia  Slovenia       1
Mohammad Hashem Taufiqui           https://en.wikipedia.org/wiki/Mohammad_Hashem_Taufiqui           Afghanistan    1
Mohammad Reza Kalaei               https://en.wikipedia.org/wiki/Mohammad_Reza_Kalaei               Iran           1
dtype: int64

In [544]:
# Check for duplicate article names
politicians_list['name'].value_counts().head()

Torokul Dzhanuzakov    4
Eduard Hedvicek        2
Jožef Krajnc           2
Sergey Abisov          2
Vladimir Pavićević     2
Name: name, dtype: int64

In [545]:
# Check an example having duplicate article names
politicians_list[politicians_list['name'] == 'Torokul Dzhanuzakov']

Unnamed: 0,name,url,country
3626,Torokul Dzhanuzakov,https://en.wikipedia.org/wiki/Torokul_Dzhanuzakov,Kazakhstan
3983,Torokul Dzhanuzakov,https://en.wikipedia.org/wiki/Torokul_Dzhanuzakov,Kyrgyzstan
6894,Torokul Dzhanuzakov,https://en.wikipedia.org/wiki/Torokul_Dzhanuzakov,Tajikistan
7341,Torokul Dzhanuzakov,https://en.wikipedia.org/wiki/Torokul_Dzhanuzakov,Uzbekistan


Politician Dataset Considerations:
- The politicians dataset consists of 48 politicians having duplicate names.
- However, these duplicate politicians happen to be associated with different countries. Hence, we choose to keep these records.
- There are 2 records that have duplicate values for all fields. We drop the duplicate values.


In [546]:
# Drop records having duplicate entries in all columns
politicians_list.drop_duplicates(inplace = True)

#### Analyze data for population by country

In [547]:
population_list.loc[population_list['Population (millions)'].sort_values().index].head(10)

Unnamed: 0,Geography,Population (millions)
183,Liechtenstein,0.0
231,Tuvalu,0.0
185,Monaco,0.0
211,San Marino,0.0
226,Palau,0.0
223,Nauru,0.0
91,St. Kitts-Nevis,0.1
83,Dominica,0.1
221,Kiribati,0.1
78,Antigua and Barbuda,0.1


Population Dataset Considerations:
- Since the population has been stored in millions rounded to 1 decimal place, a few countries happen to have a population of 0.0 million. These countries will raise errors while calculating metrics per capita. We do not make any changes for now, but we shall need to handle these cases while performing per capita analysis.
- We need to identify countries and regions by checking if the Geography is all caps. We then assign each country to a region (its closest parent).

#### Populate region for the respective country
We identify regions from countries based on whether they have all capital letters.
We then populate the region column for regions and add Null values for countries.
Using forward fill, we populate the regions of their respective countries.

In [548]:
population_list['Region'] = population_list['Geography'].map(lambda name: name if name.isupper() else None)

In [549]:
population_list['Region'] = population_list['Region'].ffill()

In [550]:
population_list.head()

Unnamed: 0,Geography,Population (millions),Region
0,WORLD,7963.0,WORLD
1,AFRICA,1419.0,AFRICA
2,NORTHERN AFRICA,251.0,NORTHERN AFRICA
3,Algeria,44.9,NORTHERN AFRICA
4,Egypt,103.5,NORTHERN AFRICA


### Step 2: Getting Article Quality Predictions

#### API Call to the MediaWiki API to retrieve the last revision ID from the Wikipedia article title

In [551]:
# The basic English Wikipedia API endpoint
API_ENWIKIPEDIA_ENDPOINT = "https://en.wikipedia.org/w/api.php"

# We'll assume that there needs to be some throttling for these requests - we should always be nice to a free data resource
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = (1.0 / 100.0) - API_LATENCY_ASSUMED

# When making automated requests we should include something that is unique to the person making the request
# This should include an email - your UW email would be good to put in there
REQUEST_HEADERS = {
    'User-Agent': 'aamird@uw.edu, University of Washington, MSDS DATA 512 - AUTUMN 2022',
}

# This is just a list of English Wikipedia article titles that we can use for example requests
ARTICLE_TITLES = politicians_list['name'].unique()

# This is a string of additional page properties that can be returned see the Info documentation for
# what can be included. If you don't want any this can simply be the empty string
PAGEINFO_EXTENDED_PROPERTIES = "talkid|url|watched|watchers"
#PAGEINFO_EXTENDED_PROPERTIES = ""

# This template lists the basic parameters for making this
PAGEINFO_PARAMS_TEMPLATE = {
    "action": "query",
    "format": "json",
    "titles": '|'.join(politicians_list['name'].head(50)),
    "prop": "info",
    "inprop": PAGEINFO_EXTENDED_PROPERTIES
}

In [552]:
def request_pageinfo_per_article(article_titles = None,
                                 endpoint_url = API_ENWIKIPEDIA_ENDPOINT,
                                 request_template = PAGEINFO_PARAMS_TEMPLATE,
                                 headers = REQUEST_HEADERS):
    # Make sure we have an article title
    if not article_titles: return None

    request_template['titles'] = article_titles

    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free
        # data source like Wikipedia - or any other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(endpoint_url, headers=headers, params=request_template)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response


We group the list of article titles into chunks of 50 articles. We then request the data for 50 articles in each API call.

JSON responses that have a negative key indicate articles that were not found.
We do not add these articles to our DataFrame and store them in a separate list.

In [553]:
info = {}
unknown_articles = []
for chunk in np.array_split(ARTICLE_TITLES, ceil(len(ARTICLE_TITLES) / 50)):
    pages = request_pageinfo_per_article('|'.join(chunk))['query']['pages']
    for page_ix in list(pages.keys()):
        if int(page_ix) < 0:
            unknown_articles.append(pages[page_ix]['title'])
            del pages[page_ix]
        else:
            break
    info.update(pages)
df_politicians = pd.DataFrame.from_dict(info, orient='index', columns=['title','lastrevid'])

In [554]:
print('The articles for the following politicians do not return a valid response from the Wikipedia API:')
unknown_articles

The articles for the following politicians do not return a valid response from the Wikipedia API:


['Prince Ofosu Sefah',
 'Harjit Kaur Talwandi',
 'Abd al-Razzaq al-Hasani',
 'Kang Sun-nam',
 'Abiodun Abimbola Orekoya',
 'Roman Konoplev']

#### API Call to the ORES API to get the predicted article quality from its last revision ID

In [555]:
# The current ORES API endpoint
API_ORES_SCORE_ENDPOINT = "https://ores.wikimedia.org/v3"
# A template for mapping to the URL
API_ORES_SCORE_PARAMS = "/scores/{context}/?models={model}&revids={revids}"

# Use some delays so that we do not hammer the API with our requests
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = (1.0 / 100.0) - API_LATENCY_ASSUMED

# When making automated requests we should include something that is unique to the person making the request
# This should include an email - your UW email would be good to put in there
REQUEST_HEADERS = {
    'User-Agent': 'aamird@uw.edu, University of Washington, MSDS DATA 512 - AUTUMN 2022'
}

# A dictionary of English Wikipedia article titles (keys) and sample revision IDs that can be used for this ORES scoring example
ARTICLE_REVISIONS = df_politicians['lastrevid'].apply(str)

# This template lists the basic parameters for making an ORES request
ORES_PARAMS_TEMPLATE = {
    "context": "enwiki",        # which WMF project for the specified revid
    "revids" : "",              # the revision to be scored - this will probably change each call
    "model": "articlequality"   # the AI/ML scoring model to apply to the reviewion
}

In [556]:
def request_ores_score_per_article(article_revids = None,
                                   endpoint_url = API_ORES_SCORE_ENDPOINT,
                                   endpoint_params = API_ORES_SCORE_PARAMS,
                                   request_template = ORES_PARAMS_TEMPLATE,
                                   headers = REQUEST_HEADERS,
                                   features=False):
    # Make sure we have an article revision id
    if not article_revids: return None

    # set the revision id into the template
    request_template['revids'] = article_revids

    # now, create a request URL by combining the endpoint_url with the parameters for the request
    request_url = endpoint_url+endpoint_params.format(**request_template)

    # the features used by the ML model can sometimes be returned as well as scores
    if features:
        request_url = request_url+"?features=true"

    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free
        # data source like ORES - or other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(request_url, headers=headers)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response

We again request the data for 50 articles in each API call by dividing the revision IDs list into chunks of 50.

In [557]:
scores_dict = {}
for chunk in np.array_split(ARTICLE_REVISIONS, ceil(len(ARTICLE_REVISIONS) / 50)):
    scores = request_ores_score_per_article('|'.join(chunk))['enwiki']['scores']
    for rev_id in scores:
        scores_dict[rev_id] = scores[rev_id]['articlequality']['score']['prediction']

We add the article quality to the politicians DataFrame

In [558]:
df_politicians['Article Quality'] = df_politicians['lastrevid'].map(str).map(scores_dict)

In [559]:
df_politicians.head()

Unnamed: 0,title,lastrevid,Article Quality
65412901,Abas Basir,1098419766,C
27428272,Abdul Baqi Turkistani,889226470,Stub
42972519,Abdul Ghafar Lakanwal,943562276,Start
29443640,Abdul Ghani Ghani,1072441893,Stub
44098744,Abdul Malik Hamwar,1100874645,Stub


### Step 3: Combining the Datasets
Create a single DataFrame containing politician article data, the article quality, and their respective country's population data

In [561]:
df = df_politicians.merge(
    politicians_list,
    left_on='title',
    right_on='name',
    how='left'
).merge(
    population_list[~population_list['Geography'].str.isupper()],
    left_on='country',
    right_on='Geography',
    how='outer'
)

In [562]:
df.head()

Unnamed: 0,title,lastrevid,Article Quality,name,url,country,Geography,Population (millions),Region
0,Abas Basir,1098420000.0,C,Abas Basir,https://en.wikipedia.org/wiki/Abas_Basir,Afghanistan,Afghanistan,41.1,SOUTH ASIA
1,Abdul Baqi Turkistani,889226500.0,Stub,Abdul Baqi Turkistani,https://en.wikipedia.org/wiki/Abdul_Baqi_Turki...,Afghanistan,Afghanistan,41.1,SOUTH ASIA
2,Abdul Ghafar Lakanwal,943562300.0,Start,Abdul Ghafar Lakanwal,https://en.wikipedia.org/wiki/Abdul_Ghafar_Lak...,Afghanistan,Afghanistan,41.1,SOUTH ASIA
3,Abdul Ghani Ghani,1072442000.0,Stub,Abdul Ghani Ghani,https://en.wikipedia.org/wiki/Abdul_Ghani_Ghani,Afghanistan,Afghanistan,41.1,SOUTH ASIA
4,Abdul Malik Hamwar,1100875000.0,Stub,Abdul Malik Hamwar,https://en.wikipedia.org/wiki/Abdul_Malik_Hamwar,Afghanistan,Afghanistan,41.1,SOUTH ASIA


#### Identifying countries having cannot find a match to join on between the politicians and population data

In [563]:
# Countries having no match
set(df['Geography']) ^ set(df['country'])

{'Australia',
 'Brunei',
 'Canada',
 'China,  Hong Kong SAR',
 'China,  Macao SAR',
 'Curacao',
 'French Guiana',
 'French Polynesia',
 'Guadeloupe',
 'Guam',
 'Ireland',
 'Kiribati',
 'Korean',
 'Martinique',
 'Mauritius',
 'Mayotte',
 'New Caledonia',
 'New Zealand',
 'Philippines',
 'Puerto Rico',
 'Reunion',
 'Sao Tome and Principe',
 'United Kingdom',
 'United States',
 'Western Sahara',
 'eSwatini'}

In [564]:
# Save the list of countries having no match to a text file
with open('Output/wp_countries-no_match.txt', 'w') as file:
    file.write('\n'.join(sorted(list(set(df['Geography']) ^ set(df['country'])))))

#### Filter the DataFrame to only contain the required columns and drop unrequired data

In [565]:
# Remove records having missing values for 'country' or 'Geography'
# Select only the required columns
df = df[df['Geography'].notnull() & df['country'].notnull()][['country', 'Region', 'Population (millions)', 'title', 'lastrevid', 'Article Quality']]

In [566]:
# Rename columns to the required names
df.rename(columns={
    'Region': 'region',
    'Population (millions)': 'population',
    'title': 'article_title',
    'lastrevid': 'revision_id',
    'Article Quality': 'article_quality'
}, inplace=True)

In [567]:
df.head()

Unnamed: 0,country,region,population,article_title,revision_id,article_quality
0,Afghanistan,SOUTH ASIA,41.1,Abas Basir,1098420000.0,C
1,Afghanistan,SOUTH ASIA,41.1,Abdul Baqi Turkistani,889226500.0,Stub
2,Afghanistan,SOUTH ASIA,41.1,Abdul Ghafar Lakanwal,943562300.0,Start
3,Afghanistan,SOUTH ASIA,41.1,Abdul Ghani Ghani,1072442000.0,Stub
4,Afghanistan,SOUTH ASIA,41.1,Abdul Malik Hamwar,1100875000.0,Stub


In [568]:
# Save the above DataFrame to a CSV file
df.to_csv('Output/wp_politicians_by_country.csv', index=False)

### Step 4: Analysis

In [569]:
# Add an indicator variable feature to identify high quality articles
df['high_quality'] = df['article_quality'].isin(['FA', 'GA'])

In [570]:
# Get the number of articles per capita for each country
article_count_country = df.groupby('country')['article_title'].count() / (df.groupby('country')['population'].mean() * 10**6)
article_count_country.head()

country
Afghanistan    2.871046e-06
Albania        2.964286e-05
Algeria        7.572383e-07
Andorra        1.000000e-04
Angola         1.179775e-06
dtype: float64

In [571]:
# Get the number of high quality articles per capita for each country
high_quality_article_count_country = df.groupby('country')['high_quality'].sum() / (df.groupby('country')['population'].mean() * 10**6)
high_quality_article_count_country.head()

country
Afghanistan    1.459854e-07
Albania        2.142857e-06
Algeria        0.000000e+00
Andorra        2.000000e-05
Angola         0.000000e+00
dtype: float64

#### Add the populations for the respective regions

In [572]:
df = df.merge(
    population_list,
    left_on='region',
    right_on='Geography'
).drop(
    columns=['Geography', 'Region']
).rename(
    columns={'Population (millions)': 'region_population'}
)

In [573]:
# Get the number of articles per capita for each region
article_count_region = df.groupby('region')['article_title'].count() / (df.groupby('region')['region_population'].mean() * 10**6)
article_count_region.head()

region
CARIBBEAN          4.568182e-06
CENTRAL AMERICA    1.095506e-06
CENTRAL ASIA       1.358974e-06
EAST ASIA          1.463560e-07
EASTERN AFRICA     1.369979e-06
dtype: float64

In [574]:
# Get the number of articles per capita for each region
high_quality_article_count_region = df.groupby('region')['high_quality'].sum() / (df.groupby('region')['region_population'].mean() * 10**6)
high_quality_article_count_region.head()

region
CARIBBEAN          1.818182e-07
CENTRAL AMERICA    5.617978e-08
CENTRAL ASIA       3.846154e-08
EAST ASIA          9.557945e-09
EASTERN AFRICA     3.171247e-08
dtype: float64

### Step 5: Results

##### Top 10 countries by coverage: The 10 countries with the highest total articles per capita (in descending order) .

In [575]:
article_count_country[article_count_country < np.inf].sort_values(ascending=False).head(10)

country
Antigua and Barbuda               0.000170
Federated States of Micronesia    0.000130
Andorra                           0.000100
Barbados                          0.000093
Marshall Islands                  0.000090
Montenegro                        0.000060
Seychelles                        0.000060
Luxembourg                        0.000053
Bhutan                            0.000051
Grenada                           0.000050
dtype: float64

##### Bottom 10 countries by coverage: The 10 countries with the lowest total articles per capita (in ascending order) .

In [576]:
article_count_country.sort_values(ascending=True).head(10)

country
China           1.392176e-09
Mexico          7.843137e-09
Saudi Arabia    8.174387e-08
Romania         1.052632e-07
India           1.255998e-07
Sri Lanka       1.339286e-07
Egypt           1.352657e-07
Ethiopia        2.025932e-07
Taiwan          2.155172e-07
Vietnam         2.716298e-07
dtype: float64

##### Top 10 countries by high quality: The 10 countries with the highest high quality articles per capita (in descending order) .

In [577]:
high_quality_article_count_country[high_quality_article_count_country < np.inf].sort_values(ascending=False).head(10)

country
Andorra                  2.000000e-05
Montenegro               5.000000e-06
Albania                  2.142857e-06
Suriname                 1.666667e-06
Bosnia-Herzegovina       1.470588e-06
Lithuania                1.071429e-06
Croatia                  1.052632e-06
Slovenia                 9.523810e-07
Palestinian Territory    9.259259e-07
Gabon                    8.333333e-07
dtype: float64

##### Bottom 10 countries by high quality: The 10 countries with the lowest high quality articles per capita (in ascending order).

In [578]:
high_quality_article_count_country.sort_values(ascending=True).head(10)

country
Laos                0.0
Mongolia            0.0
Moldova             0.0
Mexico              0.0
Marshall Islands    0.0
Malta               0.0
Maldives            0.0
Malawi              0.0
Madagascar          0.0
Luxembourg          0.0
dtype: float64

In [579]:
(high_quality_article_count_country == 0).sum()

86

We notice that 86 countries have no high quality articles.
Hence, we also display the result for countries having at least one high quality article.

In [580]:
high_quality_article_count_country[high_quality_article_count_country > 0].sort_values(ascending=True).head(10)

country
India       4.233700e-09
Thailand    1.497006e-08
Japan       1.601281e-08
Nigeria     1.830664e-08
Vietnam     2.012072e-08
Colombia    2.036660e-08
Uganda      2.118644e-08
Pakistan    2.120441e-08
Sudan       2.132196e-08
Iran        2.257336e-08
dtype: float64

##### Geographic regions by total coverage: A rank ordered list of geographic regions (in descending order) by total articles per capita.

In [581]:
article_count_region.sort_values(ascending=False)

region
SOUTHERN EUROPE    5.894040e-06
CARIBBEAN          4.568182e-06
WESTERN EUROPE     3.548223e-06
EASTERN EUROPE     2.560976e-06
NORTHERN EUROPE    2.448598e-06
WESTERN ASIA       2.333333e-06
OCEANIA            1.954545e-06
SOUTHERN AFRICA    1.710145e-06
EASTERN AFRICA     1.369979e-06
CENTRAL ASIA       1.358974e-06
SOUTH AMERICA      1.329493e-06
WESTERN AFRICA     1.320930e-06
CENTRAL AMERICA    1.095506e-06
MIDDLE AFRICA      1.035714e-06
NORTHERN AFRICA    9.043825e-07
SOUTHEAST ASIA     6.094675e-07
SOUTH ASIA         3.227092e-07
EAST ASIA          1.463560e-07
dtype: float64

##### Geographic regions by high quality coverage: Rank ordered list of geographic regions (in descending order) by high quality articles per capita.

In [582]:
high_quality_article_count_region.sort_values(ascending=False)

region
SOUTHERN EUROPE    3.046358e-07
CARIBBEAN          1.818182e-07
EASTERN EUROPE     1.324042e-07
WESTERN EUROPE     1.116751e-07
WESTERN ASIA       9.523810e-08
NORTHERN EUROPE    7.476636e-08
SOUTHERN AFRICA    5.797101e-08
CENTRAL AMERICA    5.617978e-08
OCEANIA            4.545455e-08
CENTRAL ASIA       3.846154e-08
SOUTHEAST ASIA     3.550296e-08
EASTERN AFRICA     3.171247e-08
WESTERN AFRICA     3.023256e-08
SOUTH AMERICA      2.995392e-08
NORTHERN AFRICA    2.788845e-08
MIDDLE AFRICA      2.551020e-08
SOUTH ASIA         1.145418e-08
EAST ASIA          9.557945e-09
dtype: float64

Let us also analyze the ration of high quality articles per country and region

In [585]:
(high_quality_article_count_country / article_count_country).sort_values(ascending=False)

country
Romania         1.000000
Saudi Arabia    0.666667
Gabon           0.333333
Andorra         0.200000
Korea, South    0.154762
                  ...   
Monaco               NaN
Nauru                NaN
Palau                NaN
San Marino           NaN
Tuvalu               NaN
Length: 184, dtype: float64

In [586]:
(high_quality_article_count_region / article_count_region).sort_values(ascending=False)

region
EAST ASIA          0.065306
SOUTHEAST ASIA     0.058252
EASTERN EUROPE     0.051701
SOUTHERN EUROPE    0.051685
CENTRAL AMERICA    0.051282
WESTERN ASIA       0.040816
CARIBBEAN          0.039801
SOUTH ASIA         0.035494
SOUTHERN AFRICA    0.033898
WESTERN EUROPE     0.031474
NORTHERN AFRICA    0.030837
NORTHERN EUROPE    0.030534
CENTRAL ASIA       0.028302
MIDDLE AFRICA      0.024631
OCEANIA            0.023256
EASTERN AFRICA     0.023148
WESTERN AFRICA     0.022887
SOUTH AMERICA      0.022530
dtype: float64