# Assignment 2: Bias

In [24]:
import numpy as np
import pandas as pd
import json
import requests

Filter out the rows in the page_data data frame that contain "Template:" in the "page" column.

In [11]:
page_data = pd.read_csv('country/data/page_data.csv')
page_data = page_data[page_data['page'].str.contains('Template:', na = False) == 0]
page_data

Unnamed: 0,page,country,rev_id
1,Bir I of Kanem,Chad,355319463
10,Information Minister of the Palestinian Nation...,Palestinian Territory,393276188
12,Yos Por,Cambodia,393822005
23,Julius Gregr,Czech Republic,395521877
24,Edvard Gregr,Czech Republic,395526568
...,...,...,...
47192,Yahya Jammeh,Gambia,807482007
47193,Lucius Fairchild,United States,807483006
47194,Fahd of Saudi Arabia,Saudi Arabia,807483153
47195,Francis Fessenden,United States,807483270


Filter out the data frame to fields without capital letters, and store the capital-only fields in a separate variable for later anaylsis.

In [18]:
wpds = pd.read_csv('WPDS_2020_data.csv')
wpds_caps = wpds[wpds['Name'].str.isupper()]
wpds = wpds[wpds['Name'].str.isupper() == 0]

Write the grouping function that will batch the API call into 50 at a time.

In [None]:
def grouping(count, lst):
    for i in range(0,len(lst),count):
        yield lst[i:i+count]

Write the API call function that uses the endpoint to access the score predictions group them.

In [33]:
def api_call(rev_id):
    headers = {
    'User-Agent': 'https://github.com/anantr98',
    'From': 'anantr@uw.edu'
    }
    endpoint = 'https://ores.wikimedia.org/v3/scores/enwiki/?models=articlequality&revids={rev_id}'
    call = requests.get(endpoint.format(rev_id = rev_id), headers=headers)
    response = call.json()
    qual_preds = []
    for rev_id, val in response['enwiki']['scores'].items():
        val_dict = val['articlequality']
        if "error" not in val_dict:
            prediction = {
                'rev_id': int(rev_id),
                'prediction': val_dict['score']['prediction']
            }
            qual_preds.append(prediction)
    return qual_preds

Get the predictions from the call.

In [None]:
id_group = list(grouping(50,page_data['rev_id']))
predictions=[]
for id_val in id_group:
    predictions.append(api_call("|".join(str(x) for x in id_val)))

Create a data frame with solely the rev_ids and the prediction scores for that particular ID.

In [57]:
rev_id = []
prediction = []
for val in predictions:
    for innerVal in val: 
        rev_id.append(innerVal['rev_id'])
        prediction.append(innerVal['prediction'])
wiki_data = pd.DataFrame({'rev_id' : rev_id,'prediction':prediction})

Merge the wiki data and the population data together.

In [84]:
merge1 = pd.merge(wiki_data,page_data,on='rev_id',how='left')
merge1 = merge1.rename(columns={'country':'Name'})
merge2 = pd.merge(merge1, wpds, on = 'Name', how = 'left')

Separate the data frame into two separate data frames, those with matches and those without matches for population data.

In [87]:
wp_wpds_politicians_by_country = merge2.dropna()
wp_wpds_countries_no_match = merge2[merge2.isna().any(axis=1)]

Filter out the data frame to include only the 5 columns of concern.

In [91]:
wp_wpds_politicians_by_country = wp_wpds_politicians_by_country[['Name', 'page', 'rev_id', 'prediction', 'Population']]
wp_wpds_politicians_by_country = wp_wpds_politicians_by_country.rename(columns={'Name':'country', 
                                                                                'page':'article_name', 
                                                                                'rev_id':'revision_id', 
                                                                                'prediction': 'article_quality_est.', 
                                                                                'Population': 'population'})
#wp_wpds_politicians_by_country.head()

Write the two new data frames to the csv.

In [95]:
wp_wpds_countries_no_match.to_csv('wp_wpds_countries_no_match.csv')
wp_wpds_politicians_by_country.to_csv('wp_wpds_politicians_by_country.csv')

Top 10 countries by coverage: 10 highest-ranked countries in terms of number of politician articles as a proportion of country population

In [122]:
countries = {}
for country in wp_wpds_politicians_by_country['country'].unique():
    countries[country] = wp_wpds_politicians_by_country['country'].value_counts()[country]/wp_wpds_politicians_by_country['population'][wp_wpds_politicians_by_country['country']==country].unique()[0]
top_ten_countries_by_proportion = pd.DataFrame(countries, index=[0]).T.sort_values(by=[0], ascending=False)[0:10]
top_ten_countries_by_proportion

Unnamed: 0,0
Tuvalu,0.0054
Nauru,0.004727
San Marino,0.002382
Monaco,0.001053
Liechtenstein,0.000718
Marshall Islands,0.000649
Tonga,0.000636
Iceland,0.000546
Andorra,0.000415
Federated States of Micronesia,0.00034


Bottom 10 countries by coverage: 10 lowest-ranked countries in terms of number of politician articles as a proportion of country population

In [123]:
bottom_ten_countries_by_proportion = pd.DataFrame(countries, index=[0]).T.sort_values(by=[0], ascending=True)[0:10]
bottom_ten_countries_by_proportion

Unnamed: 0,0
India,6.913792e-07
Indonesia,7.691204e-07
China,8.050571e-07
Uzbekistan,8.193363e-07
Ethiopia,8.789029e-07
Zambia,1.359878e-06
"Korea, North",1.396486e-06
Thailand,1.68335e-06
Mozambique,1.861002e-06
Bangladesh,1.866803e-06


Top 10 countries by relative quality: 10 highest-ranked countries in terms of the relative proportion of politician articles that are of GA and FA-quality

In [149]:
good_quality_by_country = wp_wpds_politicians_by_country[(wp_wpds_politicians_by_country['article_quality_est.']=='GA') | (wp_wpds_politicians_by_country['article_quality_est.']=='FA')]
countries = {}
for country in good_quality_by_country['country'].unique():
    good_count = len(good_quality_by_country[good_quality_by_country['country']==country])
    total = len(wp_wpds_politicians_by_country[wp_wpds_politicians_by_country['country']==country])
    countries[country] = good_count/total
top_ten_countries_by_relative_quality = pd.DataFrame(countries, index=[0]).T.sort_values(by=[0], ascending=False)[0:10]
top_ten_countries_by_relative_quality

Unnamed: 0,0
"Korea, North",0.222222
Saudi Arabia,0.128205
Romania,0.122449
Central African Republic,0.121212
Uzbekistan,0.107143
Mauritania,0.104167
Guatemala,0.084337
Dominica,0.083333
Syria,0.078125
Benin,0.076923


Bottom 10 countries by relative quality: 10 lowest-ranked countries in terms of the relative proportion of politician articles that are of GA and FA-quality

In [150]:
bottom_ten_countries_by_relative_quality = pd.DataFrame(countries, index=[0]).T.sort_values(by=[0], ascending=True)[0:10]
bottom_ten_countries_by_relative_quality

Unnamed: 0,0
Belgium,0.001927
Tanzania,0.002475
Switzerland,0.002488
Nepal,0.002809
Peru,0.002857
Nigeria,0.002959
Portugal,0.003145
Colombia,0.003509
Lithuania,0.004098
Morocco,0.004854


Geographic regions by coverage: Ranking of geographic regions (in descending order) in terms of the total count of politician articles from countries in each region as a proportion of total regional population

In [259]:
wp_wpds_politicians_by_country = wp_wpds_politicians_by_country.reset_index(drop=False)
## Define the regions
wpds_original = pd.read_csv('WPDS_2020_data.csv')
northern_africa = wpds_original[3:10]
western_africa = wpds_original[11:27]
eastern_africa = wpds_original[28:48]
middle_africa = wpds_original[49:58]
southern_africa = wpds_original[59:64]
northern_america = wpds_original[65:67]
central_america = wpds_original[69:77]
caribbean = wpds_original[78:95]
south_america = wpds_original[96:110]
western_asia = wpds_original[111:129]
central_asia = wpds_original[130:135]
south_asia = wpds_original[136:145]
southeast_asia = wpds_original[146:157]
east_asia = wpds_original[158:166]
northern_europe = wpds_original[168:179]
western_europe = wpds_original[180:189]
eastern_europe = wpds_original[190:200]
southern_europe = wpds_original[201:216]
oceania = wpds_original[217:233]
sub_regions = ['NORTHERN AFRICA', 'WESTERN AFRICA',
       'EASTERN AFRICA', 'MIDDLE AFRICA', 'SOUTHERN AFRICA',
       'NORTHERN AMERICA','CENTRAL AMERICA', 'CARIBBEAN', 'SOUTH AMERICA',
       'WESTERN ASIA', 'CENTRAL ASIA', 'SOUTH ASIA', 'SOUTHEAST ASIA',
       'EAST ASIA', 'NORTHERN EUROPE', 'WESTERN EUROPE',
       'EASTERN EUROPE', 'SOUTHERN EUROPE', 'OCEANIA']
subsets = [northern_africa, western_africa, eastern_africa, middle_africa,
           southern_africa, northern_america,central_america, caribbean,
           south_america, western_asia, central_asia, south_asia, 
           southeast_asia, east_asia, northern_europe, western_europe, 
           eastern_europe, southern_europe, oceania]

region = []
for i in range(0,len(subsets)):
    for j in range(0,len(subsets[i])):
        region.append(sub_regions[i])            
wpds['region'] = region
wpds = wpds.rename(columns={'Name':'country'})

wpds_merged = pd.merge(wp_wpds_politicians_by_country, wpds[['country', 'region']],on='country',how='left')

sub_region_counts = {}
for subreg in wpds_merged['region'].unique():
    sub_region_counts[subreg] = wpds_merged['region'].value_counts()[subreg]/int(wpds_caps['Population'][wpds_caps['Name']==subreg])

top_ten_subregions_by_proportion = pd.DataFrame(sub_region_counts, index=[0]).T.sort_values(by=[0], ascending=False)[0:10]
top_ten_subregions_by_proportion

Unnamed: 0,0
NORTHERN EUROPE,3.9e-05
OCEANIA,3.6e-05
SOUTHERN EUROPE,3.1e-05
WESTERN EUROPE,2.2e-05
CARIBBEAN,1.6e-05
EASTERN EUROPE,1.4e-05
WESTERN ASIA,9e-06
CENTRAL AMERICA,9e-06
SOUTH AMERICA,8e-06
SOUTHERN AFRICA,7e-06


Geographic regions by coverage: Ranking of geographic regions (in descending order) in terms of the relative proportion of politician articles from countries in each region that are of GA and FA-quality

In [265]:
good_quality_by_subregion = wpds_merged[(wpds_merged['article_quality_est.']=='GA') | (wpds_merged['article_quality_est.']=='FA')]
good_quality_subregion = {}
for country in good_quality_by_subregion['region'].unique():
    good_quality_subregion[country] = good_quality_by_subregion['region'].value_counts()[country]/wpds_merged['region'].value_counts()[country]

top_ten_subregions_by_quality = pd.DataFrame(good_quality_subregion, index=[0]).T.sort_values(by=[0], ascending=False)[0:10]
top_ten_subregions_by_quality

Unnamed: 0,0
NORTHERN AMERICA,0.054708
CENTRAL ASIA,0.041152
SOUTHEAST ASIA,0.035884
WESTERN ASIA,0.034314
EASTERN EUROPE,0.028912
EAST ASIA,0.026786
NORTHERN EUROPE,0.025635
MIDDLE AFRICA,0.02406
SOUTHERN EUROPE,0.022647
NORTHERN AFRICA,0.021135
