In [1]:
import csv
import json
import os
import pandas as pd
import pickle
import requests

### Data Cleaning

##### Read the data into Pandas Dataframes.

In [2]:
page_data_file_path = "./page_data.csv"
wpds_data_file_path = "./WPDS_2018_data.csv"
page_data_df = pd.read_csv(page_data_file_path)
wpds_df = pd.read_csv(wpds_data_file_path)

In [3]:
page_data_df.head()

Unnamed: 0,page,country,rev_id
0,Template:ZambiaProvincialMinisters,Zambia,235107991
1,Bir I of Kanem,Chad,355319463
2,Template:Zimbabwe-politician-stub,Zimbabwe,391862046
3,Template:Uganda-politician-stub,Uganda,391862070
4,Template:Namibia-politician-stub,Namibia,391862409


In [4]:
wpds_df.head()

Unnamed: 0,Geography,Population mid-2018 (millions)
0,AFRICA,1284.0
1,Algeria,42.7
2,Egypt,97.0
3,Libya,6.5
4,Morocco,35.2


##### Clean page_data by removing the pages which represent templates.

In [5]:
is_template = page_data_df['page'].str.match('Template:')
page_data_cleaned_df = page_data_df[~is_template]

##### Clean wpds data by removing the rows representing cumulative regions or continents.

In [6]:
wpds_df["is_continent"] = wpds_df.Geography.str.isupper()
wpds_countries_df = wpds_df[~wpds_df["is_continent"]]
wpds_continents_df = wpds_df[wpds_df["is_continent"]]

##### Showing the wpds rows corresponding to Cumulative regions (continents).

In [7]:
wpds_continents_df

Unnamed: 0,Geography,Population mid-2018 (millions),is_continent
0,AFRICA,1284,True
56,NORTHERN AMERICA,365,True
59,LATIN AMERICA AND THE CARIBBEAN,649,True
95,ASIA,4536,True
144,EUROPE,746,True
189,OCEANIA,41,True


##### Map each country to its region.

In [8]:
country_region_dict = {}
cur_region = None
for row in wpds_df.iterrows():
    geography = row[1]["Geography"]
    if geography.isupper():
        cur_region = geography
    else:
        country_region_dict[geography] = cur_region

country_region_df = pd.DataFrame(list(country_region_dict.items()), columns=['country', 'region'])

In [9]:
country_region_df.head()

Unnamed: 0,country,region
0,Algeria,AFRICA
1,Egypt,AFRICA
2,Libya,AFRICA
3,Morocco,AFRICA
4,Sudan,AFRICA


### Getting article quality predictions from ORES.

##### Making ORES requests using REST API. Alternatively, the ORES python package can be used, but it has additional dependencies which may cause trouble while installing.

In [10]:
# Copied from Demo: "https://github.com/Ironholds/data-512-a2/blob/master/hcds-a2-bias_demo.ipynb".
headers = {'User-Agent' : 'https://github.com/bhuvi3', 'From' : 'msbhuvan@uw.edu'}

def get_ores_data(revision_ids, headers, batch_size=100):
    def chunker(seq, size):
        """
        Taken from Stack Overflow answer by 'nosklo': https://stackoverflow.com/questions/434287/what-is-the-most-pythonic-way-to-iterate-over-a-list-in-chunks.
        """
        return (seq[pos:pos + size] for pos in range(0, len(seq), size))

    # Define the endpoint
    endpoint = 'https://ores.wikimedia.org/v3/scores/{project}/?models={model}&revids={revids}'

    aggregated_response = {}
    for rev_ids_group in chunker(revision_ids, batch_size):
        # Specify the parameters - smushing all the revision IDs together separated by | marks.
        # Yes, 'smush' is a technical term, trust me I'm a scientist.
        # What do you mean "but people trusting scientists regularly goes horribly wrong" who taught you tha- oh.
        params = {'project' : 'enwiki',
                  'model'   : 'wp10',
                  'revids'  : '|'.join(str(x) for x in rev_ids_group)
                  }
        uri = endpoint.format(**params)
        api_call = requests.get(uri)
        cur_response = api_call.json()
        aggregated_response.update(cur_response['enwiki']['scores'])

    return aggregated_response


##### The API call over all the revision ids might take few minutes. The ORES REST API was throwing errors when queried for more than approx. 200 revision ids in a single call. Hence, I am querying the revision ids in batches. Also, I am storing the queries results in a local pickle file, so that we can avoid making API calls if running this multiple times.

In [11]:
# Note: This cell may take few minutes to run (~5 min)
# For each revision_id in our data, we get ORES quality class predictions.
ores_res_cache_file = "cached_ores_api_call_res.pickle"
if os.path.exists(ores_res_cache_file):
    with open(ores_res_cache_file, "rb") as fp:
        ores_call_res = pickle.load(fp)
else:
    revision_ids = []
    for row in page_data_cleaned_df.iterrows():
        row_series = row[1]
        revision_ids.append(int(row_series["rev_id"]))

    ores_call_res = get_ores_data(revision_ids, headers)

##### Parse the API call result and add the article_quality to the page_data. Ignore the article for which the ORES quality could not be retrieved, and store these article revision ids in a file locally.

In [12]:
quality_categories_dict = {}
missed_rev_ids = []
for key, value in ores_call_res.items():
    try:
        quality_categories_dict[key] = value["wp10"]["score"]['prediction']
    except:
        quality_categories_dict[key] = "missed"
        missed_rev_ids.append(key)

missed_rev_ids_file = "ores_missed_rev_ids.txt"
with open(missed_rev_ids_file, "w") as fp:
    for rev_id in missed_rev_ids:
        fp.write("%s\n" % rev_id)

print("Total number of articles for which ORES quality could not be retrieved: %s. "
      "The revision_ids of these articles have been written to %s"
      % (len(missed_rev_ids), missed_rev_ids_file))

page_quality_df = pd.DataFrame(list(quality_categories_dict.items()), columns=['rev_id', 'article_quality']).astype({'rev_id': 'int64'})
page_data_joined_df = page_data_cleaned_df.merge(page_quality_df, on="rev_id", how="inner")
page_data_joined_filtered_df = page_data_joined_df[page_data_joined_df["article_quality"] != "missed"]

Total number of articles for which ORES quality could not be retrieved: 155. The revision_ids of these articles have been written to ores_missed_rev_ids.txt


In [13]:
page_data_joined_filtered_df = page_data_joined_filtered_df.rename(columns={"rev_id": "revision_id", "page": "article_name"})
page_data_joined_filtered_df.head()

Unnamed: 0,article_name,country,revision_id,article_quality
0,Bir I of Kanem,Chad,355319463,Stub
1,Information Minister of the Palestinian Nation...,Palestinian Territory,393276188,Stub
2,Yos Por,Cambodia,393822005,Stub
3,Julius Gregr,Czech Republic,395521877,Stub
4,Edvard Gregr,Czech Republic,395526568,Stub


In [14]:
wpds_countries_df["Population mid-2018 (millions)"] = wpds_countries_df["Population mid-2018 (millions)"].str.replace(',', '')
wpds_countries_df = wpds_countries_df.astype({"Population mid-2018 (millions)": "float32"})
wpds_countries_df["population"] = wpds_countries_df["Population mid-2018 (millions)"] * 1000000
wpds_countries_df = wpds_countries_df.drop(columns=["is_continent", "Population mid-2018 (millions)"])
wpds_countries_df = wpds_countries_df.rename(columns={"Geography": "country"})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [15]:
wpds_countries_df.head()

Unnamed: 0,country,population
1,Algeria,42700000.0
2,Egypt,97000000.0
3,Libya,6500000.0
4,Morocco,35200000.0
5,Sudan,41700000.0


##### Combine the Wikipedia and Population data (from WPDS).

In [16]:
page_wpds_merged_df = page_data_joined_filtered_df.merge(wpds_countries_df, on="country", how="left")

is_no_match = page_wpds_merged_df["population"].isnull()

no_match_rows_file = "wp_wpds_countries-no_match.csv"
page_wpds_merged_df_no_match = page_wpds_merged_df[is_no_match]
page_wpds_merged_df_no_match.to_csv(no_match_rows_file, index=False)
print("Rows which did not match have been saved at %s" % no_match_rows_file)

page_wpds_merged_df_matched = page_wpds_merged_df[~is_no_match]
matched_rows_file = "wp_wpds_politicians_by_country.csv"
page_wpds_merged_df_matched.to_csv(matched_rows_file, index=False)
print("Rows matched have been saved at %s" % matched_rows_file)

Rows which did not match have been saved at wp_wpds_countries-no_match.csv
Rows matched have been saved at wp_wpds_politicians_by_country.csv


In [17]:
# Rows where the countries did not match.
page_wpds_merged_df_no_match.head()

Unnamed: 0,article_name,country,revision_id,article_quality,population
1,Information Minister of the Palestinian Nation...,Palestinian Territory,393276188,Stub,
3,Julius Gregr,Czech Republic,395521877,Stub,
4,Edvard Gregr,Czech Republic,395526568,Stub,
18,Presidents of the General Council of French Gu...,French Guiana,546364151,Stub,
29,Timoteo Menéndez,Salvadoran,566504165,Start,


In [18]:
# Rows where countries matched.
page_wpds_merged_df_matched.head()

Unnamed: 0,article_name,country,revision_id,article_quality,population
0,Bir I of Kanem,Chad,355319463,Stub,15400000.0
2,Yos Por,Cambodia,393822005,Stub,16000000.0
5,Robert Douglas Cook,Canada,401577829,Stub,37200000.0
6,List of Grand Viziers of Egypt,Egypt,442937236,Stub,97000000.0
7,Sehba Musharraf,Pakistan,448555418,Stub,200600000.0


### Analysis

##### Create an analysis df with the following metrics for analying the bias.
- coverage: The percentage of articles by population. If a country has a population of 10,000 people, and you found 10 articles about politicians from that country, then the percentage of articles-per-population would be .1%.
- relative_quality: The percentage of high-quality articles. If a country has 10 articles about politicians, and 2 of them are FA or GA class articles, then the percentage of high-quality articles would be 20%.

In [19]:
# Find number of articles per country.
country_article_counts_df = page_wpds_merged_df_matched.groupby("country").size().reset_index(name='article_count')

# Find number of high quality articles per country.
is_high_quality = (page_wpds_merged_df_matched["article_quality"] == "FA") | (page_wpds_merged_df_matched["article_quality"] == "GA")
country_high_quality_article_count_df = page_wpds_merged_df_matched[is_high_quality].groupby("country").size().reset_index(name='high_quality_article_count')

# Make an analysis dataframe with computed metrics.
analysis_df = country_article_counts_df.merge(wpds_countries_df, on="country", how="inner")
analysis_df = analysis_df.merge(country_high_quality_article_count_df, on="country", how="left")
analysis_df['high_quality_article_count'] = analysis_df['high_quality_article_count'].fillna(value=0).astype("int64")

# Add the percentage metrics.
analysis_df["coverage_perc"] = (analysis_df["article_count"] / analysis_df["population"]) * 100
analysis_df["relative_quality"] = (analysis_df["high_quality_article_count"] / analysis_df["article_count"]) * 100

In [20]:
analysis_df.head()

Unnamed: 0,country,article_count,population,high_quality_article_count,coverage_perc,relative_quality
0,Afghanistan,320,36500000.0,12,0.000877,3.75
1,Albania,457,2900000.0,3,0.015759,0.656455
2,Algeria,116,42700000.0,2,0.000272,1.724138
3,Andorra,34,80000.0,0,0.0425,0.0
4,Angola,106,30400000.0,0,0.000349,0.0


##### Add region-wise metrics.

In [21]:
region_analysis_df = analysis_df.drop(columns=["coverage_perc", "relative_quality"]).merge(country_region_df, on="country", how="inner")
region_analysis_df = region_analysis_df.groupby("region").sum()

region_analysis_df["coverage_perc"] = (region_analysis_df["article_count"] / region_analysis_df["population"]) * 100
region_analysis_df["relative_quality"] = (region_analysis_df["high_quality_article_count"] / region_analysis_df["article_count"]) * 100

In [22]:
region_analysis_df

Unnamed: 0_level_0,article_count,population,high_quality_article_count,coverage_perc,relative_quality
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AFRICA,6851,1172400000.0,125,0.000584,1.824551
ASIA,11531,4513100000.0,310,0.000256,2.688405
EUROPE,15864,734590000.0,322,0.00216,2.029753
LATIN AMERICA AND THE CARIBBEAN,5169,628270000.0,69,0.000823,1.334881
NORTHERN AMERICA,1921,365200000.0,99,0.000526,5.153566
OCEANIA,3128,39780000.0,66,0.007863,2.109974


### Analysis Results

##### Top 10 countries by coverage: 10 highest-ranked countries in terms of number of politician articles as a proportion of country population.

In [23]:
# Additional columns have been retained to allow for observation.
analysis_df.sort_values("coverage_perc", ascending=False).head(10)

Unnamed: 0,country,article_count,population,high_quality_article_count,coverage_perc,relative_quality
166,Tuvalu,54,9999.999776,5,0.54,9.259259
115,Nauru,52,9999.999776,0,0.52,0.0
135,San Marino,81,29999.999329,0,0.27,0.0
108,Monaco,40,39999.999106,0,0.1,0.0
93,Liechtenstein,28,39999.999106,0,0.07,0.0
161,Tonga,63,100000.00149,0,0.063,0.0
103,Marshall Islands,37,59999.998659,0,0.061667,0.0
68,Iceland,201,400000.00596,2,0.05025,0.995025
3,Andorra,34,79999.998212,0,0.0425,0.0
61,Grenada,36,100000.00149,1,0.036,2.777778


##### Bottom 10 countries by coverage: 10 lowest-ranked countries in terms of number of politician articles as a proportion of country population.

In [24]:
analysis_df.sort_values("coverage_perc", ascending=True).head(10)

Unnamed: 0,country,article_count,population,high_quality_article_count,coverage_perc,relative_quality
69,India,980,1371300000.0,17,7.1e-05,1.734694
70,Indonesia,210,265200000.0,10,7.9e-05,4.761905
34,China,1130,1393800000.0,41,8.1e-05,3.628319
173,Uzbekistan,28,32900000.0,2,8.5e-05,7.142857
51,Ethiopia,101,107500000.0,2,9.4e-05,1.980198
82,"Korea, North",36,25600000.0,7,0.000141,19.444444
178,Zambia,25,17700000.0,0,0.000141,0.0
159,Thailand,112,66200000.0,3,0.000169,2.678571
112,Mozambique,58,30500000.0,0,0.00019,0.0
13,Bangladesh,319,166400000.0,3,0.000192,0.940439


##### Top 10 countries by relative quality: 10 highest-ranked countries in terms of the relative proportion of politician articles that are of GA and FA-quality.

In [25]:
analysis_df.sort_values("relative_quality", ascending=False).head(10)

Unnamed: 0,country,article_count,population,high_quality_article_count,coverage_perc,relative_quality
82,"Korea, North",36,25600000.0,7,0.000141,19.444444
137,Saudi Arabia,118,33400000.0,15,0.000353,12.711864
104,Mauritania,48,4500000.0,6,0.001067,12.5
31,Central African Republic,66,4700000.0,8,0.001404,12.121212
132,Romania,343,19500000.0,39,0.001759,11.370262
166,Tuvalu,54,10000.0,5,0.54,9.259259
19,Bhutan,33,800000.0,3,0.004125,9.090909
44,Dominica,12,70000.0,1,0.017143,8.333333
155,Syria,128,18300000.0,10,0.000699,7.8125
18,Benin,91,11500000.0,7,0.000791,7.692308


##### Bottom 10 countries by relative quality: 10 lowest-ranked countries in terms of the relative proportion of politician articles that are of GA and FA-quality.

In [26]:
analysis_df.sort_values("relative_quality", ascending=True).head(10)

Unnamed: 0,country,article_count,population,high_quality_article_count,coverage_perc,relative_quality
143,Slovakia,116,5400000.0,0,0.002148,0.0
114,Namibia,162,2500000.0,0,0.00648,0.0
30,Cape Verde,37,600000.0,0,0.006167,0.0
112,Mozambique,58,30500000.0,0,0.00019,0.0
38,Costa Rica,147,5000000.0,0,0.00294,0.0
108,Monaco,40,40000.0,0,0.1,0.0
43,Djibouti,37,1000000.0,0,0.0037,0.0
107,Moldova,423,3500000.0,0,0.012086,0.0
167,Uganda,185,44100000.0,0,0.00042,0.0
49,Eritrea,16,6000000.0,0,0.000267,0.0


##### Geographic regions by coverage: Ranking of geographic regions (in descending order) in terms of the total count of politician articles from countries in each region as a proportion of total regional population.

In [27]:
region_analysis_df.sort_values("coverage_perc", ascending=False)

Unnamed: 0_level_0,article_count,population,high_quality_article_count,coverage_perc,relative_quality
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
OCEANIA,3128,39780000.0,66,0.007863,2.109974
EUROPE,15864,734590000.0,322,0.00216,2.029753
LATIN AMERICA AND THE CARIBBEAN,5169,628270000.0,69,0.000823,1.334881
AFRICA,6851,1172400000.0,125,0.000584,1.824551
NORTHERN AMERICA,1921,365200000.0,99,0.000526,5.153566
ASIA,11531,4513100000.0,310,0.000256,2.688405


##### Geographic regions by  relative quality: Ranking of geographic regions (in descending order) in terms of the relative proportion of politician articles from countries in each region that are of GA and FA-quality.

In [28]:
region_analysis_df.sort_values("relative_quality", ascending=False)

Unnamed: 0_level_0,article_count,population,high_quality_article_count,coverage_perc,relative_quality
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
NORTHERN AMERICA,1921,365200000.0,99,0.000526,5.153566
ASIA,11531,4513100000.0,310,0.000256,2.688405
OCEANIA,3128,39780000.0,66,0.007863,2.109974
EUROPE,15864,734590000.0,322,0.00216,2.029753
AFRICA,6851,1172400000.0,125,0.000584,1.824551
LATIN AMERICA AND THE CARIBBEAN,5169,628270000.0,69,0.000823,1.334881


### Reflections and implications: Please refer to the README file in the repository root for reflections and implications of the analyses provided in this project.