In [1]:
import numpy as np
import pandas as pd
import json
import requests

In [2]:
# load the two datasets
page_data = pd.read_csv('page_data.csv')
pop_data = pd.read_csv('WPDS_2020_data.csv')

In [3]:
# get rid of the rows that starts with 'template:' for the page column
page_data_cleaned = page_data[~page_data['page'].str.startswith('Template:')].reset_index()
page_data_cleaned.drop(columns='index',inplace=True)

In [4]:
# separate country data with sub-region data
region = pop_data[pop_data['Name'].str.isupper()].reset_index()
country = pop_data[~pop_data['Name'].str.isupper()].reset_index()
region.drop(columns='index',inplace=True)
country.drop(columns='index',inplace=True)

In [12]:
# prepare to get data from API endpoint
endpoint = 'https://ores.wikimedia.org/v3/scores/enwiki/?models=articlequality&revids={rev_id}'
# create headers 
headers = {
    'User-Agent': 'https://github.com/aixinransummer',
    'From': 'aixinran@uw.edu'
}
# create a function to get data
def api_call(endpoint, parameter):
    call = requests.get(endpoint.format(rev_id = parameter), headers=headers)
    response = call.json()
    return response

In [63]:
# configure rev_id to the ORES API in batch size of 50
# iterate through the article dataset to get prediction scores by calling the function defined above
batch_size = 50
list_id = []
list_pred = []
for i in range(0,len(page_data_cleaned), batch_size):
    index_start = i
    index_end = min(i+batch_size,len(page_data_cleaned))
    cur_data = api_call(endpoint, '|'.join(str(x) for x in page_data_cleaned.rev_id.iloc[index_start:index_end]))
    for key, value in cur_data['enwiki']['scores'].items():
        if 'score' in value['articlequality']:
            list_id.append(key)
            list_pred.append(value['articlequality']['score']['prediction'])
        elif 'error' in value['articlequality']:
            list_id.append(key)
            list_pred.append(value['articlequality']['error']['type'])

In [69]:
# create a dataframe to record the scores for each article
article_pred = pd.DataFrame(list(zip(list_id, list_pred)), columns =['rev_id', 'prediction'])
pred_allowed = ['Stub', 'Start','C','B','GA','FA']
article_no_pred = article_pred[~article_pred['prediction'].isin(pred_allowed)].reset_index()
article_pred_cleaned = article_pred[article_pred['prediction'].isin(pred_allowed)].reset_index()
article_no_pred.drop(columns='index',inplace=True)
article_pred_cleaned.drop(columns='index',inplace=True)
# log articles that don't have scores
article_no_pred.to_csv('article_no_score.csv',index=False)

In [107]:
# merge three dataframes together
page_pop_merge = page_data_cleaned.merge(country, left_on='country', right_on='Name', how='outer')
article_pred_cleaned['rev_id']=article_pred_cleaned['rev_id'].astype(int)
data_with_na = page_pop_merge.merge(article_pred_cleaned, on='rev_id',how='left')
data_with_na

Unnamed: 0,page,country,rev_id,FIPS,Name,Type,TimeFrame,Data (M),Population,prediction
0,Bir I of Kanem,Chad,355319463.0,TD,Chad,Country,2019.0,16.877,16877000.0,Stub
1,Abdullah II of Kanem,Chad,498683267.0,TD,Chad,Country,2019.0,16.877,16877000.0,Stub
2,Salmama II of Kanem,Chad,565745353.0,TD,Chad,Country,2019.0,16.877,16877000.0,Stub
3,Kuri I of Kanem,Chad,565745365.0,TD,Chad,Country,2019.0,16.877,16877000.0,Stub
4,Mohammed I of Kanem,Chad,565745375.0,TD,Chad,Country,2019.0,16.877,16877000.0,Stub
...,...,...,...,...,...,...,...,...,...,...
46723,,,,PF,French Polynesia,Country,2019.0,0.280,280000.0,
46724,,,,GU,Guam,Country,2019.0,0.175,175000.0,
46725,,,,NC,New Caledonia,Country,2019.0,0.295,295000.0,
46726,,,,PW,Palau,Country,2019.0,0.018,18000.0,


In [108]:
# output the no-match table and the final dataset to csv files
data_no_match = data_with_na[data_with_na.isna().any(axis=1)].reset_index()
data_no_match.drop(columns=['FIPS','Name','Type','TimeFrame','Data (M)','index'], inplace=True)
data_no_match.to_csv('wp_wpds_countries-no_match.csv',index=False)
data_cleaned = data_with_na[~data_with_na.isna().any(axis=1)].reset_index()
data_cleaned.drop(columns=['FIPS','Name','Type','TimeFrame','Data (M)','index'], inplace=True)
data_cleaned.to_csv('wp_wpds_politicians_by_country.csv',index=False)

## Data Analysis

In [266]:
# calcualte articles-per-population rate for each country
num_articles_by_country = data_cleaned.groupby(['country'])['page'].count().reset_index()
num_articles_by_country.rename(columns={'page':'num_articles'}, inplace=True)
population_by_country = data_cleaned[['country','Population']].drop_duplicates().reset_index()
population_by_country.drop(columns=['index'],inplace=True)
articles_per_population = num_articles_by_country.merge(population_by_country, on='country', how='right')

In [268]:
articles_per_population['articles_per_pop'] = articles_per_population['num_articles'] * 100.0/articles_per_population['Population']

In [269]:
articles_per_population

Unnamed: 0,country,num_articles,Population,articles_per_pop
0,Chad,96,16877000.0,0.000569
1,Palestinian Territory,179,5008000.0,0.003574
2,Cambodia,213,15497000.0,0.001374
3,Canada,839,38190000.0,0.002197
4,Egypt,234,100803000.0,0.000232
...,...,...,...,...
177,Barbados,14,287000.0,0.004878
178,Belize,16,419000.0,0.003819
179,Djibouti,37,988000.0,0.003745
180,Zambia,25,18384000.0,0.000136


In [270]:
# calcualte good-articles rate for each country
high_quality_article = data_cleaned[data_cleaned['prediction'].isin(['FA','GA'])].reset_index()
good_articles_by_country = high_quality_article.groupby(['country'])['page'].count().reset_index()
percent_good_articles = num_articles_by_country.merge(good_articles_by_country, on='country', how='left')
percent_good_articles.rename(columns={'page':'num_good_articles'},inplace=True)
percent_good_articles['num_good_articles']= percent_good_articles['num_good_articles'].fillna(0)
percent_good_articles['percent_good_articles'] = percent_good_articles['num_good_articles'] * 100.0/percent_good_articles['num_articles']

In [271]:
percent_good_articles

Unnamed: 0,country,num_articles,num_good_articles,percent_good_articles
0,Afghanistan,319,13.0,4.075235
1,Albania,456,3.0,0.657895
2,Algeria,116,2.0,1.724138
3,Andorra,34,0.0,0.000000
4,Angola,106,0.0,0.000000
...,...,...,...,...
177,Venezuela,130,3.0,2.307692
178,Vietnam,187,13.0,6.951872
179,Yemen,116,3.0,2.586207
180,Zambia,25,0.0,0.000000


In [272]:
# Identify which country belongs to which region
cur_region = ''
pop_data_copy = pop_data.copy()
pop_data_copy['region'] = np.nan
for row_num in range(len(pop_data_copy)):
    if pop_data_copy.iloc[row_num,1].isupper():
        cur_region = pop_data_copy.iloc[row_num,1]
    pop_data_copy.loc[row_num,'region'] = cur_region

In [273]:
pop_data_copy

Unnamed: 0,FIPS,Name,Type,TimeFrame,Data (M),Population,region
0,WORLD,WORLD,World,2019,7772.850,7772850000,WORLD
1,AFRICA,AFRICA,Sub-Region,2019,1337.918,1337918000,AFRICA
2,NORTHERN AFRICA,NORTHERN AFRICA,Sub-Region,2019,244.344,244344000,NORTHERN AFRICA
3,DZ,Algeria,Country,2019,44.357,44357000,NORTHERN AFRICA
4,EG,Egypt,Country,2019,100.803,100803000,NORTHERN AFRICA
...,...,...,...,...,...,...,...
229,WS,Samoa,Country,2019,0.200,200000,OCEANIA
230,SB,Solomon Islands,Country,2019,0.715,715000,OCEANIA
231,TO,Tonga,Country,2019,0.099,99000,OCEANIA
232,TV,Tuvalu,Country,2019,0.010,10000,OCEANIA


In [274]:
# calculate the articles-per-population rate for each sub-region
region=region[region['Name']!= 'WORLD'].reset_index()
region.drop(columns='index',inplace=True)
population_by_region = region[['Name','Population']]
data_cleaned_region_all = data_cleaned.merge(pop_data_copy, left_on='country', right_on='Name',how='left')
data_cleaned_region_all = data_cleaned_region_all[['page','country','region']]
num_articles_by_region = data_cleaned_region_all.groupby(['region'])['page'].count().reset_index()
num_articles_by_region.rename(columns={'page':'num_articles'}, inplace=True)
articles_per_population_region = num_articles_by_region.merge(population_by_region, left_on='region',right_on='Name', how='right')
articles_per_population_region.drop(columns='region',inplace=True)
articles_per_population_region.rename(columns={'Name':'region','page':'num_articles'},inplace=True)

In [276]:
# calculate the articles-per-population rate for each continent by summing up numbers of sub-regions
articles_per_population_region.loc[0,'num_articles'] = articles_per_population_region.loc[1:6,'num_articles'].sum()
articles_per_population_region.loc[7,'num_articles'] = articles_per_population_region.loc[8:10,'num_articles'].sum()
articles_per_population_region.loc[11,'num_articles'] = articles_per_population_region.loc[12:16,'num_articles'].sum()
articles_per_population_region.loc[17,'num_articles'] = articles_per_population_region.loc[18:21,'num_articles'].sum()

In [277]:
articles_per_population_region['articles_per_pop'] = articles_per_population_region['num_articles'] * 100.0/articles_per_population_region['Population']

In [278]:
articles_per_population_region

Unnamed: 0,num_articles,region,Population,articles_per_pop
0,8578.0,AFRICA,1337918000,0.000641
1,899.0,NORTHERN AFRICA,244344000,0.000368
2,2139.0,WESTERN AFRICA,401115000,0.000533
3,2502.0,EASTERN AFRICA,444970000,0.000562
4,665.0,MIDDLE AFRICA,179757000,0.00037
5,472.0,SOUTHERN AFRICA,67732000,0.000697
6,1901.0,NORTHERN AMERICA,368193000,0.000516
7,5270.0,LATIN AMERICA AND THE CARIBBEAN,651036000,0.000809
8,1543.0,CENTRAL AMERICA,178611000,0.000864
9,695.0,CARIBBEAN,43233000,0.001608


In [279]:
# calculate the high-quality-articles rate for each sub-region
data_cleaned_region = high_quality_article.merge(pop_data_copy, left_on='country', right_on='Name',how='left')
data_cleaned_region = data_cleaned_region[['page','country','region']]
good_articles_by_region = data_cleaned_region.groupby(['region'])['page'].count().reset_index()
good_articles_by_region.rename(columns={'page':'num_good_articles'})
percent_good_articles_region = num_articles_by_region.merge(good_articles_by_region, on='region')
percent_good_articles_region = percent_good_articles_region.merge(region[['Name']], left_on='region',right_on='Name',how='right')
percent_good_articles_region.drop(columns='region',inplace=True)
percent_good_articles_region.rename(columns={'page':'num_good_articles','Name':'region'}, inplace=True)

In [280]:
# calculate the high-quality-articles rate for each continent by summing up numbers of sub-regions
percent_good_articles_region.loc[0,'num_good_articles'] = percent_good_articles_region.loc[1:6,'num_good_articles'].sum()
percent_good_articles_region.loc[7,'num_good_articles'] = percent_good_articles_region.loc[8:10,'num_good_articles'].sum()
percent_good_articles_region.loc[11,'num_good_articles'] = percent_good_articles_region.loc[12:16,'num_good_articles'].sum()
percent_good_articles_region.loc[17,'num_good_articles'] = percent_good_articles_region.loc[18:21,'num_good_articles'].sum()
percent_good_articles_region.loc[0,'num_articles'] = percent_good_articles_region.loc[1:6,'num_articles'].sum()
percent_good_articles_region.loc[7,'num_articles'] = percent_good_articles_region.loc[8:10,'num_articles'].sum()
percent_good_articles_region.loc[11,'num_articles'] = percent_good_articles_region.loc[12:16,'num_articles'].sum()
percent_good_articles_region.loc[17,'num_articles'] = percent_good_articles_region.loc[18:21,'num_articles'].sum()

In [281]:
percent_good_articles_region['percent_good_articles'] = percent_good_articles_region['num_good_articles']/percent_good_articles_region['num_articles']

In [282]:
percent_good_articles_region

Unnamed: 0,num_articles,num_good_articles,region,percent_good_articles
0,8578.0,223.0,AFRICA,0.025997
1,899.0,19.0,NORTHERN AFRICA,0.021135
2,2139.0,40.0,WESTERN AFRICA,0.0187
3,2502.0,35.0,EASTERN AFRICA,0.013989
4,665.0,16.0,MIDDLE AFRICA,0.02406
5,472.0,9.0,SOUTHERN AFRICA,0.019068
6,1901.0,104.0,NORTHERN AMERICA,0.054708
7,5270.0,76.0,LATIN AMERICA AND THE CARIBBEAN,0.014421
8,1543.0,23.0,CENTRAL AMERICA,0.014906
9,695.0,13.0,CARIBBEAN,0.018705


## Results

In [283]:
# Top 10 countries by coverage aka articles-per-population rate
articles_per_population.sort_values('articles_per_pop', ascending=False, ignore_index=True)[:10]

Unnamed: 0,country,num_articles,Population,articles_per_pop
0,Tuvalu,54,10000.0,0.54
1,Nauru,52,11000.0,0.472727
2,San Marino,81,34000.0,0.238235
3,Monaco,40,38000.0,0.105263
4,Liechtenstein,28,39000.0,0.071795
5,Marshall Islands,37,57000.0,0.064912
6,Tonga,63,99000.0,0.063636
7,Iceland,201,368000.0,0.05462
8,Andorra,34,82000.0,0.041463
9,Federated States of Micronesia,36,106000.0,0.033962


In [292]:
# Bottom 10 countries by coverage
articles_per_population['Population'] = articles_per_population['Population'].map(lambda x: '%.1f' % x)
articles_per_population.sort_values('articles_per_pop', ascending=True, ignore_index=True)[:10]

Unnamed: 0,country,num_articles,Population,articles_per_pop
0,India,968,1400100000.0,6.9e-05
1,Indonesia,209,271739000.0,7.7e-05
2,China,1129,1402385000.0,8.1e-05
3,Uzbekistan,28,34174000.0,8.2e-05
4,Ethiopia,101,114916000.0,8.8e-05
5,Zambia,25,18384000.0,0.000136
6,"Korea, North",36,25779000.0,0.00014
7,Thailand,112,66534000.0,0.000168
8,Mozambique,58,31166000.0,0.000186
9,Bangladesh,317,169809000.0,0.000187


In [295]:
# Top 10 countries by relative quality aka high-quailty-articles rate
percent_good_articles.sort_values('percent_good_articles', ascending=False, ignore_index=True)[:10]

Unnamed: 0,country,num_articles,num_good_articles,percent_good_articles
0,"Korea, North",36,8.0,22.2
1,Saudi Arabia,117,15.0,12.8
2,Romania,343,42.0,12.2
3,Central African Republic,66,8.0,12.1
4,Uzbekistan,28,3.0,10.7
5,Mauritania,48,5.0,10.4
6,Guatemala,83,7.0,8.4
7,Dominica,12,1.0,8.3
8,Syria,128,10.0,7.8
9,Benin,91,7.0,7.7


In [301]:
# Bottom 10 countries by relative quality 
percent_good_articles.sort_values('percent_good_articles', ascending=True, ignore_index=True)[:10]

Unnamed: 0,country,num_articles,num_good_articles,percent_good_articles
0,Estonia,148,0.0,0.0
1,Djibouti,37,0.0,0.0
2,Lesotho,29,0.0,0.0
3,Finland,569,0.0,0.0
4,Kazakhstan,78,0.0,0.0
5,Zambia,25,0.0,0.0
6,Eritrea,16,0.0,0.0
7,Guadeloupe,49,0.0,0.0
8,Kiribati,30,0.0,0.0
9,Grenada,36,0.0,0.0


In [302]:
# Geographic regions by coverage
articles_per_population_region['articles_per_pop'] = articles_per_population_region['articles_per_pop'].map(lambda x: '%.6f' % x)
articles_per_population_region.sort_values('articles_per_pop', ascending=False,ignore_index=True)

Unnamed: 0,num_articles,region,Population,articles_per_pop
0,3126.0,OCEANIA,43155000,0.007244
1,3763.0,NORTHERN EUROPE,105990000,0.00355
2,3710.0,SOUTHERN EUROPE,153251000,0.002421
3,4560.0,WESTERN EUROPE,195479000,0.002333
4,15765.0,EUROPE,746622000,0.002112
5,695.0,CARIBBEAN,43233000,0.001608
6,3732.0,EASTERN EUROPE,291902000,0.001279
7,2563.0,WESTERN ASIA,280927000,0.000912
8,1543.0,CENTRAL AMERICA,178611000,0.000864
9,5270.0,LATIN AMERICA AND THE CARIBBEAN,651036000,0.000809


In [304]:
# Geographic regions by relative quality
percent_good_articles_region['percent_good_articles'] = percent_good_articles_region['percent_good_articles'].map(lambda x: '%.6f' % x)
percent_good_articles_region.sort_values('percent_good_articles',ascending=False, ignore_index=True)

Unnamed: 0,num_articles,num_good_articles,region,percent_good_articles
0,1901.0,104.0,NORTHERN AMERICA,0.054708
1,2020.0,73.0,SOUTHEAST ASIA,0.036139
2,2563.0,89.0,WESTERN ASIA,0.034725
3,3732.0,118.0,EASTERN EUROPE,0.031618
4,2473.0,76.0,EAST ASIA,0.030732
5,245.0,7.0,CENTRAL ASIA,0.028571
6,3763.0,102.0,NORTHERN EUROPE,0.027106
7,11667.0,316.0,ASIA,0.027085
8,8578.0,223.0,AFRICA,0.025997
9,665.0,16.0,MIDDLE AFRICA,0.02406


## Reflection

From tables that report coverage data, I noticed that countries with high article-per-population percentage tend to have way smaller population than countries associated with low articles-per-population percentage. In fact, the top 10 countries by coverage all have less than 500 thousand people while the bottom 10 countries by coverage all have more than 18 million people. The coverage metric seems to be inversely proportional to countries' population in a way. As reflected by the region table as well, Oceania, as the sub-region that has the least population, actually has the highest articles-per-population rate. 

I was surprised to see that North Korea ranks the top one country with the highest relative article quality among all countries. It's also interesting to see that besides North Korea, all the other top 10 countries with high relative quality rate are not English-speaking countries, which contradicts to what I previously expected. There are two potential reasons. First, there are many factors that can be used by ORES to predict article qualities, and those countries might be accidently good at certain factors that heavily influence the prediction score. Second, the high relative quality rate might just be due to low number of articles. For example, Dominica has 12 articles about politicians in total, and one good article will make the good-articles rate as high as 8 percent. The same 8 percent might not be achieved as easily if a country had 100 articles in total. 

In general, I learned that as we make analysis and conclusions based on data, it's important to notice any potential bias existing in the data or in the way we process the data. Otherwise, false statements might be made based on unknown confounding factors and applied to other data-related research. Without questioning the potential bias in the ORES algorithm or limitations in the dataset itself, it's dangerous to blindly trust the results that were extracted from the data. 

### Response to Questions
Before started working on the assignment, I thought one type of bias would be that English-speaking countries will tend to have higher good-article rates than non-English-speaking countries. After all, our data is from English Wikipedia pages, so articles from those countires are more likely to be very well-written as their readers are native Englih speakers. Altough North America has a very high good-article rate, I found that the top 10 countries and regions by relative quality aren't mostly English-speaking countries/regions. 

As I processed the data, I found that there are countries that are not included in the final data due to missing information. For example, if the dataset doesn't have population information about a country, then all the articles of that country will not be included in the final dataset. Or if a big portion of a country's articles don't have prediction scores, then the total number of valid articles will be significantly less. Cases like these may introduce bias into our analysis process because while most countries are retained, some are excluded. Another potential source of bias may come from the prediction mechanism itself. The prediction score may not truely reflect the quality of the article, rather it may be correlated with some external factors, such as the average education level of the politician's country that's being evaluated. Because the algorithm was trained on articles that were truely peer-reviewed by editors, if there were bias in articles that were highly scored, it's likely to see a biased algorithm due to the limitation of the original training sample.

The results indicate that there is a systematic bias in English Wikipedia as a data source. As suggested by the tables above, there is a big difference between countries with least and most article-per-population rate. The rate isn't supposed to vary too much across countries since the rate of politicians should stay relatively stable, and definately not as fluctuating as indicated from the results. That is, country with more population should have more coverage of politician articles in general. The inconsistent rate suggests that there is bias in the data itself that will potentially create misleading results. 