# Loading data

In [127]:
# Importing Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime

import json, time, urllib.parse
import requests

In [128]:
# load the data files
df_politician = pd.read_csv("politicians_by_country_SEPT.2022.csv")
df_population = pd.read_csv("population_by_country_2022.csv")

df_politician.shape, df_population.shape

((7584, 3), (233, 2))

# 1. Performing Basic Data Validation
This section is checking the overall format of the dataframe, number of unique values , missing values and datatype.

### 1.1 Politician Dataset

In [129]:
# printing first few rows
df_politician.head()

Unnamed: 0,name,url,country
0,Shahjahan Noori,https://en.wikipedia.org/wiki/Shahjahan_Noori,Afghanistan
1,Abdul Ghafar Lakanwal,https://en.wikipedia.org/wiki/Abdul_Ghafar_Lak...,Afghanistan
2,Majah Ha Adrif,https://en.wikipedia.org/wiki/Majah_Ha_Adrif,Afghanistan
3,Haroon al-Afghani,https://en.wikipedia.org/wiki/Haroon_al-Afghani,Afghanistan
4,Tayyab Agha,https://en.wikipedia.org/wiki/Tayyab_Agha,Afghanistan


In [130]:
# checking basic metrics about the dataframe
pd.DataFrame({'col_name': df_politician.columns,
              'null_vals': df_politician.isnull().sum(),
             'dtypes': df_politician.dtypes,
             'nuniques': df_politician.nunique()})

Unnamed: 0,col_name,null_vals,dtypes,nuniques
name,name,0,object,7534
url,url,0,object,7534
country,country,0,object,185


### 1.2 Dealing with duplicates in Politics Dataset

In [131]:
# looking at duplicated rows
df_politician.drop_duplicates(subset=['name', 'url', 'country'], inplace = True)
df_politician.shape

(7582, 3)

**Comments:**
- There are still some duplicates with name and url but different country. 
- Cannot delete these rows without more information

### 1.3 Population Dataset

In [133]:
# Fixing the regions and countries
df_population['shifted'] = df_population['Geography'].shift(-1)
df_population = df_population[~((df_population['Geography'].str.isupper() == True) & (df_population['shifted'].str.isupper() == True))].iloc[:,0:2].reset_index().drop('index', axis = 1)

# Finding the capitalised strings
regions = pd.DataFrame()
regions['region'] = df_population[df_population['Geography'].str.isupper()]['Geography']
regions['flag'] = np.arange(1, len(regions['region']) + 1)

# Merging capitalised strings and existing countries
df = df_population.merge(regions, left_on = "Geography", right_on = "region", how = 'left').iloc[:,[0,1,3]]

# fill in region to populate
df['flag'] = df['flag'].expanding().max()
df_population = df.merge(regions, on = "flag", how = 'inner')
df_population = df_population.iloc[:,[0, 1, 3]]

# keep countries in the geography column
df_population = df_population[df_population['Geography'] != df_population['region']]
df_population.head()

Unnamed: 0,Geography,Population (millions),region
1,Algeria,44.9,NORTHERN AFRICA
2,Egypt,103.5,NORTHERN AFRICA
3,Libya,6.8,NORTHERN AFRICA
4,Morocco,36.7,NORTHERN AFRICA
5,Sudan,46.9,NORTHERN AFRICA


# 2. Data Extraction

### 2.1 Setting the parameters for PAI page request
In order to use the API to get information about the wikipedia pages, there are some basic paramters that need to be set. Setting the constants here - 

In [134]:
# English Wikipedia API endpoint
API_ENWIKIPEDIA_ENDPOINT = "https://en.wikipedia.org/w/api.php"

# Assuming roughly 2ms latency on the API and network
API_LATENCY_ASSUMED = 0.002      
API_THROTTLE_WAIT = (1.0/100.0) - API_LATENCY_ASSUMED

# In case something goes wrong
REQUEST_HEADERS = {'User-Agent': 'aish25@uw.edu, University of Washington, MSDS DATA 512 - AUTUMN 2022'}

# Titles will be changed 
ARTICLE_TITLES = None

# String of additional page properties that can be returned
# PAGEINFO_EXTENDED_PROPERTIES = "talkid|url|watched|watchers"
PAGEINFO_EXTENDED_PROPERTIES = ""

# This template lists the basic parameters for making this
PAGEINFO_PARAMS_TEMPLATE = {
    "action": "query",
    "format": "json",
    "titles": "",           # to simplify this should be a single page title at a time
    "prop": "info",
    "inprop": PAGEINFO_EXTENDED_PROPERTIES
}

### 2.2 Defining function to make page info requests
The parameters defined above will be used to make API requests. We need to make multiple requests and hence writing a function for the same. 

In [135]:
# defining function to make multiple requests
def request_pageinfo_per_article(article_title = None, 
                                 endpoint_url = API_ENWIKIPEDIA_ENDPOINT, 
                                 request_template = PAGEINFO_PARAMS_TEMPLATE,
                                 headers = REQUEST_HEADERS):
    # Make sure we have an article title
    if not article_title: return None
    
    request_template['titles'] = article_title
        
    # make the request
    try:
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(endpoint_url, headers=headers, params=request_template)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response

### 2.3 Getting the page info
Using the defined paramters and function, generating a dataframe containing the page information. 

In [67]:
ARTICLE_TITLES = list(df_politician['name'].unique())

In [69]:
lastrevid_list = []
available_lastrevid = []
unavailable_lastrevid = []
for i in range(0,len(ARTICLE_TITLES)):
    try:
        info = request_pageinfo_per_article(ARTICLE_TITLES[i])
        lastrevid_list.append(info['query']['pages'][list(info['query']['pages'].keys())[0]]['lastrevid'])
        available_lastrevid.append(ARTICLE_TITLES[i])
    except:
        unavailable_lastrevid.append(ARTICLE_TITLES[i])
        print("Couldn't get the page info for: ", ARTICLE_TITLES[i])

('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))
Couldn't get the page info for:  Jiří Ventruba
Couldn't get the page info for:  Prince Ofosu Sefah
Couldn't get the page info for:  Harjit Kaur Talwandi
Couldn't get the page info for:  Abd al-Razzaq al-Hasani
Couldn't get the page info for:  Abiodun Abimbola Orekoya
Couldn't get the page info for:  Segun “Aeroland” Adewale
Couldn't get the page info for:  Roman Konoplev
Couldn't get the page info for:  Nhlanhla “Lux” Dlamini


**Comments:**
- Information for 8 article titles was not fetched.
- Certain pages have quotes which are different from quotes in python.

In [71]:
# saving into csv as intermediate file
info_df = pd.DataFrame({"name": available_lastrevid, 
                        "lastrevid": lastrevid_list})
info_df.to_csv('lastrevid.csv', index=False)

### 2.4 Setting paramters for ORES request

In [72]:
# The current ORES API endpoint
API_ORES_SCORE_ENDPOINT = "https://ores.wikimedia.org/v3"

# A template for mapping to the URL
API_ORES_SCORE_PARAMS = "/scores/{context}/{revid}/{model}"

# Use some delays so that we do not hammer the API with our requests
API_LATENCY_ASSUMED = 0.002 
API_THROTTLE_WAIT = (1.0/100.0) - API_LATENCY_ASSUMED

# in case of errors
REQUEST_HEADERS = {
    'User-Agent': 'aish25@uw.edu, University of Washington, MSDS DATA 512 - AUTUMN 2022'
}

# Dictionary of Wikipedia article titles (keys) and revision IDs 
ARTICLE_REVISIONS = None

# Basic parameters for making an ORES request
ORES_PARAMS_TEMPLATE = {
    "context": "enwiki",        # which WMF project for the specified revid
    "revid" : "",               # the revision to be scored - this will probably change each call
    "model": "articlequality"   # the AI/ML scoring model to apply to the reviewion
}

### 2.5 Getting ORES Predictions

In [78]:
# function to get the ores scores
def request_ores_score_per_article(article_revid = None, 
                                   endpoint_url = API_ORES_SCORE_ENDPOINT, 
                                   endpoint_params = API_ORES_SCORE_PARAMS, 
                                   request_template = ORES_PARAMS_TEMPLATE,
                                   headers = REQUEST_HEADERS,
                                   features=False):
    
    # Make sure we have an article revision id
    if not article_revid: return None
    
    # set the revision id into the template
    request_template['revid'] = article_revid
    
    # now, create a request URL by combining the endpoint_url with the parameters for the request
    request_url = endpoint_url+endpoint_params.format(**request_template)
    
    # the features used by the ML model can sometimes be returned as well as scores
    if features:
        request_url = request_url+"?features=true"
    
    # make the request
    try:
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(request_url, headers=headers)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response

In [81]:
# setting empty lists to store information
ores_pred = []
unavailable_oresscore = []
available_oresscore = []
for i in range(0,len(info_df['lastrevid'])):
    try:
        response = request_ores_score_per_article(info_df['lastrevid'][i])
        prediction = response['enwiki']['scores'][list(response['enwiki']['scores'].keys())[0]]['articlequality']['score']['prediction']
        ores_pred.append(prediction)
        available_oresscore.append(info_df['lastrevid'][i])
    except:
        unavailable_oresscore.append(info_df['lastrevid'][i])
        print("Couldn't get the ores scores for: ", info_df['lastrevid'][i])

('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))
Couldn't get the ores scores for:  1058539988


In [82]:
len(ores_pred)

7525

In [84]:
# saving into csv as intermediate file
preds_df = pd.DataFrame({"lastrevid": available_oresscore, 
                        "ores_pred": ores_pred})
preds_df.to_csv('ores_preds.csv', index=False)

# 3. Generating final dataset
Combining the information collected and stores in  info_df and preds_df dataframes about the lastrevid and scores. Joining this information with the initial df_politician and df_population files. 

Finally identifying missing values and storing them separately. Saving the cleaned dataset with no missing values.

### 3.1 Merging data files

In [108]:
info_df.shape, preds_df.shape

((7526, 2), (7525, 2))

In [109]:
df_combined = info_df.merge(preds_df, on=['lastrevid'], how = 'left')

In [110]:
df_combined = df_politician[['name', 'country']].merge(df_combined, on = "name", how = 'left')
df_combined.head()

Unnamed: 0,name,country,lastrevid,ores_pred
0,Shahjahan Noori,Afghanistan,1099689000.0,GA
1,Abdul Ghafar Lakanwal,Afghanistan,943562300.0,Start
2,Majah Ha Adrif,Afghanistan,852404100.0,Start
3,Haroon al-Afghani,Afghanistan,1095102000.0,B
4,Tayyab Agha,Afghanistan,1104998000.0,Start


In [111]:
# Consolidated data table with all politicians, article quality scores and population information
df_combined = df_combined.merge(df_population, left_on = 'country', right_on = 'Geography', how = 'outer')
print(df_combined.shape)
df_combined.head()

(7609, 7)


Unnamed: 0,name,country,lastrevid,ores_pred,Geography,Population (millions),region
0,Shahjahan Noori,Afghanistan,1099689000.0,GA,Afghanistan,41.1,SOUTH ASIA
1,Abdul Ghafar Lakanwal,Afghanistan,943562300.0,Start,Afghanistan,41.1,SOUTH ASIA
2,Majah Ha Adrif,Afghanistan,852404100.0,Start,Afghanistan,41.1,SOUTH ASIA
3,Haroon al-Afghani,Afghanistan,1095102000.0,B,Afghanistan,41.1,SOUTH ASIA
4,Tayyab Agha,Afghanistan,1104998000.0,Start,Afghanistan,41.1,SOUTH ASIA


### 3.2 Identifying countries with missing wiki entries

In [116]:
# List of countries with no wikipedia data
missing_wiki_list = df_combined.loc[df_combined['country'].isnull()]['Geography'].unique()
missing_wiki_list

array(['Western Sahara', 'Mauritius', 'Mayotte', 'Reunion',
       'Sao Tome and Principe', 'eSwatini', 'Canada', 'United States',
       'Curacao', 'Guadeloupe', 'Martinique', 'Puerto Rico',
       'French Guiana', 'Brunei', 'Philippines', 'China,  Hong Kong SAR',
       'China,  Macao SAR', 'Ireland', 'United Kingdom', 'Australia',
       'French Polynesia', 'Guam', 'Kiribati', 'New Caledonia',
       'New Zealand'], dtype=object)

In [117]:
# List of countries with no population data
misisng_population_list = df_combined.loc[df_combined['Geography'].isnull()]['country'].unique()
misisng_population_list

array(['Korean'], dtype=object)

In [120]:
# combining and saving the list
no_match = list(set(np.append(missing_wiki_list, misisng_population_list)))

with open('wp_countries-no_match.txt', 'w') as f:
    for i in no_match:
        f.write(i)
        f.write('\n')

### 3.3 Saving Data for Analysis
After identifying the countries with missing values, saving the rest of the rows in a csv file for data analysis

In [122]:
df_final = df_combined[(~df_combined['country'].isnull()) & (~df_combined['Geography'].isnull())]
df_final = df_final.drop(['Geography'], axis = 1)

In [123]:
df_final.columns

Index(['name', 'country', 'lastrevid', 'ores_pred', 'Population (millions)',
       'region'],
      dtype='object')

In [126]:
df_final = df_final.rename(columns = {'country': 'country', 
                                      'region': 'region',
                                      'Population (millions)': 'population', 
                                      'name': 'article_title', 
                                      'latestrevid': 'revision_id', 
                                      'ores_pred': 'article_quality'})

df_final.to_csv('wp_politicians_by_country.csv', index = False)
df_final.head()

Unnamed: 0,article_title,country,lastrevid,article_quality,population,region
0,Shahjahan Noori,Afghanistan,1099689000.0,GA,41.1,SOUTH ASIA
1,Abdul Ghafar Lakanwal,Afghanistan,943562300.0,Start,41.1,SOUTH ASIA
2,Majah Ha Adrif,Afghanistan,852404100.0,Start,41.1,SOUTH ASIA
3,Haroon al-Afghani,Afghanistan,1095102000.0,B,41.1,SOUTH ASIA
4,Tayyab Agha,Afghanistan,1104998000.0,Start,41.1,SOUTH ASIA
