<a href="https://colab.research.google.com/github/aly-such/data-512-a2/blob/main/hcds_a2_bias.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# install ores (run only one time)
# !pip install ores

# imports
import requests
import json
import pandas as pd
import numpy as np
from ores import api

In [16]:
# Clone the assignment repository into the working directory (shell command).
# NOTE(review): the CSVs further down are read from raw.githubusercontent.com URLs,
# so this clone does not appear to be required by the visible cells. git refuses to
# clone into an existing non-empty directory, so re-running this cell will likely fail.
!git clone https://github.com/aly-such/data-512-a2.git

https://github.com/aly-such/data-512-a2/blob/fa7970bf43f3d94a0dc5fc3514365818a8fda4c5/WPDS_2020_data.csv

Cloning into 'data-512-a2'...
remote: Enumerating objects: 11, done.[K
remote: Counting objects: 100% (11/11), done.[K
remote: Compressing objects: 100% (10/10), done.[K
remote: Total 11 (delta 2), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (11/11), done.


In [20]:
# Locations of the two input CSVs in the assignment repository
PAGE_DATA_URL = 'https://raw.githubusercontent.com/aly-such/data-512-a2/main/page_data.csv'
WPDS_DATA_URL = 'https://raw.githubusercontent.com/aly-such/data-512-a2/main/WPDS_2020_data.csv'

# Politicians by Country dataset (originally from Figshare)
wiki = pd.read_csv(PAGE_DATA_URL)
# Population data (originally from the Population Reference Bureau)
pop = pd.read_csv(WPDS_DATA_URL)

In [21]:
# Preview the first rows of the politicians-by-country data (page, country, rev_id)
wiki.head()

Unnamed: 0,page,country,rev_id
0,Template:ZambiaProvincialMinisters,Zambia,235107991
1,Bir I of Kanem,Chad,355319463
2,Template:Zimbabwe-politician-stub,Zimbabwe,391862046
3,Template:Uganda-politician-stub,Uganda,391862070
4,Template:Namibia-politician-stub,Namibia,391862409


In [22]:
# Pages whose title starts with "Template:" are not wiki articles, so drop them.
# - startswith() matches the literal prefix only (str.contains would match the text
#   anywhere in the title and interpret it as a regular expression).
# - na=False treats a missing page title as "not a template" instead of producing a
#   NaN in the mask, which would make the `~` negation fail.
# - .copy() gives an independent frame so later column assignments do not write into
#   a slice of the original data (avoids SettingWithCopyWarning).
is_template = wiki.page.str.startswith("Template:", na=False)
wiki = wiki[~is_template].copy()

In [23]:
# Display the filtered frame to confirm the Template: rows are gone
# (pandas abbreviates the rendering of long frames).
wiki

Unnamed: 0,page,country,rev_id
1,Bir I of Kanem,Chad,355319463
10,Information Minister of the Palestinian Nation...,Palestinian Territory,393276188
12,Yos Por,Cambodia,393822005
23,Julius Gregr,Czech Republic,395521877
24,Edvard Gregr,Czech Republic,395526568
...,...,...,...
47192,Yahya Jammeh,Gambia,807482007
47193,Lucius Fairchild,United States,807483006
47194,Fahd of Saudi Arabia,Saudi Arabia,807483153
47195,Francis Fessenden,United States,807483270


In [109]:
# Drop the population columns that are not used in the analysis.
# errors='ignore' keeps this cell idempotent: because `pop` is reassigned to its own
# trimmed version, re-running the cell would otherwise raise KeyError on the
# already-removed columns.
pop = pop.drop(columns=['TimeFrame', 'FIPS', 'Data (M)'], errors='ignore')
pop.head()

Unnamed: 0,Name,Type,Population
0,WORLD,World,7772850000
1,AFRICA,Sub-Region,1337918000
2,NORTHERN AFRICA,Sub-Region,244344000
3,Algeria,Country,44357000
4,Egypt,Country,100803000


In [110]:
# Split the population table into region-level and country-level rows.
# Region-level rows are identified by an all-caps Name (e.g. "WORLD", "AFRICA").
is_region = pop.Name.str.isupper()

# Country-level counts: names that are NOT all caps.
# These rows are the ones expected to match country values in page_data.csv
country_pop = pop[~is_region]

# Regional-level counts: names in all caps.
# These rows are not expected to have a match in page_data.csv
region_pop = pop[is_region]

In [111]:
# Preview the region-level rows (all-caps names)
region_pop.head()

Unnamed: 0,Name,Type,Population
0,WORLD,World,7772850000
1,AFRICA,Sub-Region,1337918000
2,NORTHERN AFRICA,Sub-Region,244344000
10,WESTERN AFRICA,Sub-Region,401115000
27,EASTERN AFRICA,Sub-Region,444970000


In [112]:
# Preview the country-level rows
country_pop.head()

Unnamed: 0,Name,Type,Population
3,Algeria,Country,44357000
4,Egypt,Country,100803000
5,Libya,Country,6891000
6,Morocco,Country,35952000
7,Sudan,Country,43849000


In [59]:
# Start an ORES session using the `ores` api package.
# First argument is the ORES host; the second string is presumably sent as the
# user-agent identifying this client (course, assignment, contact) — TODO confirm
# against the ores package documentation.
ores_session = api.Session('https://ores.wikimedia.org', 'DATA512 A2 ams884@uw.edu')

In [60]:
# Request 'articlequality' scores from the 'enwiki' context for every revision id
# left in `wiki` after the Template: filter.
# NOTE(review): `result` is consumed by iteration in the next cell; if the package
# returns a lazy generator the requests happen there and it can only be iterated
# once — confirm before re-running downstream cells.
result = ores_session.score('enwiki', ['articlequality'], wiki['rev_id'])

In [61]:
# Build the list of predicted quality labels, one entry per item yielded by `result`,
# in the order they are returned.
predictions = []

# Each item is expected to carry the label at ['articlequality']['score']['prediction'].
# When that path is missing, record the sentinel -1 so the list stays aligned with
# the rows of `wiki`.
for prediction in result:
  try:
    predictions.append(prediction['articlequality']['score']['prediction'])
  except (KeyError, TypeError):
    # Only lookup failures mean "no prediction"; anything else (including
    # KeyboardInterrupt) should surface rather than be silently recorded as -1.
    predictions.append(-1) # appends -1 where there is no prediction

In [64]:
# Create a new dataframe that includes the wiki data as well as the score data.
# assign() returns a new frame, so `wiki` itself is left unchanged; plain
# `wiki_scores = wiki` would only alias the same object and add the column to both.
# It also avoids SettingWithCopyWarning, since `wiki` is a filtered slice.
# `predictions` must have exactly one entry per row of `wiki` (pandas raises
# ValueError on a length mismatch).
wiki_scores = wiki.assign(article_quality=predictions)

wiki_scores.head()

Unnamed: 0,page,country,rev_id,article_quality
1,Bir I of Kanem,Chad,355319463,Stub
10,Information Minister of the Palestinian Nation...,Palestinian Territory,393276188,Stub
12,Yos Por,Cambodia,393822005,Stub
23,Julius Gregr,Czech Republic,395521877,Stub
24,Edvard Gregr,Czech Republic,395526568,Stub


In [65]:
# Attach country-level population to every scored article.
# Inner join on wiki `country` == population `Name`: articles whose country has no
# matching population row (and vice versa) are dropped from the result.
merge_df = pd.merge(
    wiki_scores,
    country_pop,
    how='inner',
    left_on='country',
    right_on='Name',
)

In [66]:
# Preview the merged article/population rows
merge_df.head()

Unnamed: 0,page,country,rev_id,article_quality,FIPS,Name,Type,TimeFrame,Data (M),Population
0,Bir I of Kanem,Chad,355319463,Stub,TD,Chad,Country,2019,16.877,16877000
1,Abdullah II of Kanem,Chad,498683267,Stub,TD,Chad,Country,2019,16.877,16877000
2,Salmama II of Kanem,Chad,565745353,Stub,TD,Chad,Country,2019,16.877,16877000
3,Kuri I of Kanem,Chad,565745365,Stub,TD,Chad,Country,2019,16.877,16877000
4,Mohammed I of Kanem,Chad,565745375,Stub,TD,Chad,Country,2019,16.877,16877000


In [75]:
# Build the single analysis dataframe of wikipedia, prediction, and country
# population data: keep the five columns of interest (in this order) and give
# them intuitive names.
column_names = {
    'country': 'country',
    'page': 'article_name',
    'rev_id': 'revision_id',
    'article_quality': 'article_quality_est.',
    'Population': 'population',
}

wikipedia_df = merge_df[list(column_names)].rename(columns=column_names)

In [94]:
# Plain-text preview of the renamed analysis frame
print(wikipedia_df.head())

  country          article_name  revision_id article_quality_est.  population
0    Chad        Bir I of Kanem    355319463                 Stub    16877000
1    Chad  Abdullah II of Kanem    498683267                 Stub    16877000
2    Chad   Salmama II of Kanem    565745353                 Stub    16877000
3    Chad       Kuri I of Kanem    565745365                 Stub    16877000
4    Chad   Mohammed I of Kanem    565745375                 Stub    16877000


In [95]:
# Keep only the rows that produced a prediction (-1 is the "no prediction" sentinel)
has_score = wikipedia_df['article_quality_est.'].ne(-1)
wikipedia_df_final = wikipedia_df[has_score]

wikipedia_df_final.head()

Unnamed: 0,country,article_name,revision_id,article_quality_est.,population
0,Chad,Bir I of Kanem,355319463,Stub,16877000
1,Chad,Abdullah II of Kanem,498683267,Stub,16877000
2,Chad,Salmama II of Kanem,565745353,Stub,16877000
3,Chad,Kuri I of Kanem,565745365,Stub,16877000
4,Chad,Mohammed I of Kanem,565745375,Stub,16877000


In [97]:
# Collect the rows that did NOT produce a prediction (sentinel -1) into their own
# frame; it is written to a separate csv in a later cell.
missing_score = wikipedia_df['article_quality_est.'].eq(-1)
wikipedia_no_score = wikipedia_df[missing_score]

wikipedia_no_score.head()

Unnamed: 0,country,article_name,revision_id,article_quality_est.,population
36,Chad,Kalthouma Nguembang,762816132,-1,16877000
534,Canada,Pierre-Luc Paquette,708813010,-1,38190000
568,Canada,James H. Stuart,715457941,-1,38190000
598,Canada,René Matteau,723308478,-1,38190000
602,Canada,David J. Reimer,724052271,-1,38190000


In [98]:
# Write the two result sets to CSV files in the working directory.
# Both outputs carry the .csv extension (the first one previously had none).
# Scored articles with country population:
wikipedia_df_final.to_csv('wp_wpds_politicians_by_country.csv', sep = ',')
# Articles for which no quality prediction was returned:
wikipedia_no_score.to_csv('wp_wpds_countries-no_match.csv', sep = ',')

In [99]:
# Number of scored articles per country, as a two-column frame
# (country, article_count)
art_country = (
    wikipedia_df_final
    .groupby('country')['article_name']
    .count()
    .astype(int)
    .reset_index(name='article_count')
)
art_country.head()

Unnamed: 0,country,article_count
0,Afghanistan,319
1,Albania,456
2,Algeria,116
3,Andorra,34
4,Angola,106


In [113]:
# Join the per-country article counts with the country populations
# (inner join on country == Name)
counts_with_pop = pd.merge(
    art_country,
    country_pop,
    how='inner',
    left_on='country',
    right_on='Name',
)

# Articles as a percentage of the country's population:
# article_count * 100 / Population
art_prop = counts_with_pop.assign(
    percentage=lambda df: (df['article_count'] * 100) / df['Population']
)

art_prop.head()

Unnamed: 0,country,article_count,Name,Type,Population,percentage
0,Afghanistan,319,Afghanistan,Country,38928000,0.000819
1,Albania,456,Albania,Country,2838000,0.016068
2,Algeria,116,Algeria,Country,44357000,0.000262
3,Andorra,34,Andorra,Country,82000,0.041463
4,Angola,106,Angola,Country,32522000,0.000326


In [114]:
# High-quality articles are those predicted as 'FA' or 'GA'.
# Stack the FA rows first, then the GA rows.
quality_col = wikipedia_df_final['article_quality_est.']
high_quality = pd.concat(
    [wikipedia_df_final.loc[quality_col == grade] for grade in ('FA', 'GA')]
)

# Count of high quality articles per country
high_quality_group = (
    high_quality
    .groupby('country')
    .count()['article_name']
    .reset_index()
)

# Two-column frame: country and its high quality article count
high_quality_df = high_quality_group.rename(
    columns={'article_name': 'high_quality_article_count'}
)
high_quality_df.head()

Unnamed: 0,country,high_quality_article_count
0,Afghanistan,13
1,Albania,3
2,Algeria,2
3,Argentina,16
4,Armenia,5


In [115]:
# Merge the high quality counts with the total article counts per country.
# Inner join: only countries that have at least one high quality article appear here.
high_quality_prop = high_quality_df.merge(art_country, on = 'country', how = 'inner')

# Percentage of each country's articles that are high quality.
# Both operands come from the merged frame so numerator and denominator are
# guaranteed to belong to the same row; taking the numerator from high_quality_df
# relied on its index happening to line up with the merge result.
high_quality_prop['Percentage of Quality Articles'] = (
    high_quality_prop['high_quality_article_count'] * 100
) / high_quality_prop['article_count']

high_quality_prop.head()

Unnamed: 0,country,high_quality_article_count,article_count,Percentage of Quality Articles
0,Afghanistan,13,319,4.075235
1,Albania,3,456,0.657895
2,Algeria,2,116,1.724138
3,Argentina,16,491,3.258656
4,Armenia,5,193,2.590674


In [120]:
# Rank countries by articles-per-population percentage, highest first
rank_countries = art_prop.sort_values(by='percentage', ascending=False)

# Rank countries by share of high quality articles, highest first
rank__countries_high_quality = high_quality_prop.sort_values(
    by='Percentage of Quality Articles',
    ascending=False,
)

In [117]:
# Top 10 countries with the largest proportion of articles to country population
# (`percentage` = article_count * 100 / Population; frame is sorted descending)
rank_countries.head(10)

Unnamed: 0,country,article_count,Name,Type,Population,percentage
169,Tuvalu,54,Tuvalu,Country,10000,0.54
117,Nauru,52,Nauru,Country,11000,0.472727
138,San Marino,81,San Marino,Country,34000,0.238235
110,Monaco,40,Monaco,Country,38000,0.105263
95,Liechtenstein,28,Liechtenstein,Country,39000,0.071795
104,Marshall Islands,37,Marshall Islands,Country,57000,0.064912
164,Tonga,63,Tonga,Country,99000,0.063636
70,Iceland,201,Iceland,Country,368000,0.05462
3,Andorra,34,Andorra,Country,82000,0.041463
52,Federated States of Micronesia,36,Federated States of Micronesia,Country,106000,0.033962


In [118]:
# Bottom 10 countries: smallest proportion of articles to country population
# (the frame is sorted descending, so tail() returns the lowest values)
rank_countries.tail(10)

Unnamed: 0,country,article_count,Name,Type,Population,percentage
13,Bangladesh,317,Bangladesh,Country,169809000,0.000187
114,Mozambique,58,Mozambique,Country,31166000,0.000186
162,Thailand,112,Thailand,Country,66534000,0.000168
84,"Korea, North",36,"Korea, North",Country,25779000,0.00014
181,Zambia,25,Zambia,Country,18384000,0.000136
51,Ethiopia,101,Ethiopia,Country,114916000,8.8e-05
176,Uzbekistan,28,Uzbekistan,Country,34174000,8.2e-05
34,China,1129,China,Country,1402385000,8.1e-05
72,Indonesia,209,Indonesia,Country,271739000,7.7e-05
71,India,968,India,Country,1400100000,6.9e-05


In [121]:
# Top 10 countries with the largest proportion of high quality (FA/GA) articles
# to total number of articles
rank__countries_high_quality.head(10)

Unnamed: 0,country,high_quality_article_count,article_count,Percentage of Quality Articles
63,"Korea, North",8,36,22.222222
109,Saudi Arabia,15,117,12.820513
106,Romania,42,343,12.244898
23,Central African Republic,8,66,12.121212
140,Uzbekistan,3,28,10.714286
82,Mauritania,5,48,10.416667
46,Guatemala,7,83,8.433735
33,Dominica,1,12,8.333333
125,Syria,10,128,7.8125
11,Benin,7,91,7.692308


In [122]:
# Bottom 10 countries: smallest proportion of high quality articles to total number
# of articles (the frame is sorted descending, so tail() returns the lowest values).
# NOTE: the ranked frame comes from an inner merge with the high quality counts, so
# countries with zero high quality articles are not represented in this list.
rank__countries_high_quality.tail(10)

Unnamed: 0,country,high_quality_article_count,article_count,Percentage of Quality Articles
87,Morocco,1,206,0.485437
73,Lithuania,1,244,0.409836
27,Colombia,1,285,0.350877
104,Portugal,1,318,0.314465
94,Nigeria,2,676,0.295858
101,Peru,1,350,0.285714
89,Nepal,1,356,0.280899
124,Switzerland,1,402,0.248756
128,Tanzania,1,404,0.247525
10,Belgium,1,519,0.192678


In [124]:
# Join the scored articles against the full population table (country rows and
# region rows alike), matching wiki `country` to population `Name`.
# NOTE(review): region names are all caps, so they presumably never equal a wiki
# country value and this inner join likely keeps country-level rows only — verify.
merge_pop_df = pd.merge(
    wiki_scores,
    pop,
    how='inner',
    left_on='country',
    right_on='Name',
)

merge_pop_df.head()

Unnamed: 0,page,country,rev_id,article_quality,Name,Type,Population
0,Bir I of Kanem,Chad,355319463,Stub,Chad,Country,16877000
1,Abdullah II of Kanem,Chad,498683267,Stub,Chad,Country,16877000
2,Salmama II of Kanem,Chad,565745353,Stub,Chad,Country,16877000
3,Kuri I of Kanem,Chad,565745365,Stub,Chad,Country,16877000
4,Mohammed I of Kanem,Chad,565745375,Stub,Chad,Country,16877000
