# Homework 2 - Considering Bias in Data

### Step 1: Getting the data

In [3]:
## import statements
import pandas as pd
import json, time, urllib.parse
import requests

import aiohttp
import asyncio

In [4]:
#fetching the data
us_cities_by_state = pd.read_csv('data/us_cities_by_state_SEPT.2023.csv')
#state_populations = pd.read_excel('data/NST-EST2022-POP.xlsx')
#region_reference = pd.read_excel("data/US States by Region - US Census Bureau.xlsx")


In [5]:
# print(f"Getting page info data for: {ARTICLE_TITLES[3]}")
# info = request_pageinfo_per_article(ARTICLE_TITLES[3])
# print(json.dumps(info,indent=4))

#### Cleaning up the dataframe

In [6]:
## inconsistencies with the state list

# Dropping duplicates
us_cities_by_state.drop_duplicates(inplace=True, ignore_index=True)


In [7]:
#checking to see if we missed any unwanted entries
a = list(us_cities_by_state.page_title)
b = {}
c = set()

for state in a:
    if state in b:
        c.add(state)
    else:
        b[state] = 1
us_cities_by_state[us_cities_by_state['page_title'].isin(c)]

Unnamed: 0,state,page_title,url
1683,Colorado,2020 United States census,https://en.wikipedia.org/wiki/2020_United_Stat...
1684,Colorado,2010 United States census,https://en.wikipedia.org/wiki/2010_United_Stat...
2030,Florida,County (United States),https://en.wikipedia.org/wiki/County_(United_S...
5196,Iowa,County (United States),https://en.wikipedia.org/wiki/County_(United_S...
12947,New_York,Population,https://en.wikipedia.org/wiki/Population
18433,Tennessee,County (United States),https://en.wikipedia.org/wiki/County_(United_S...
18780,Texas,Population,https://en.wikipedia.org/wiki/Population
18781,Texas,2020 United States census,https://en.wikipedia.org/wiki/2020_United_Stat...
18782,Texas,2010 United States census,https://en.wikipedia.org/wiki/2010_United_Stat...
21234,Wisconsin,County (United States),https://en.wikipedia.org/wiki/County_(United_S...


We now need to remove these rows from the original dataframe

In [8]:
us_cities_by_state.drop(us_cities_by_state[us_cities_by_state['page_title'].isin(c)].index, inplace=True)


In [9]:
us_cities_by_state

Unnamed: 0,state,page_title,url
0,Alabama,"Abbeville, Alabama","https://en.wikipedia.org/wiki/Abbeville,_Alabama"
1,Alabama,"Adamsville, Alabama","https://en.wikipedia.org/wiki/Adamsville,_Alabama"
2,Alabama,"Addison, Alabama","https://en.wikipedia.org/wiki/Addison,_Alabama"
3,Alabama,"Akron, Alabama","https://en.wikipedia.org/wiki/Akron,_Alabama"
4,Alabama,"Alabaster, Alabama","https://en.wikipedia.org/wiki/Alabaster,_Alabama"
...,...,...,...
21520,Wyoming,"Wamsutter, Wyoming","https://en.wikipedia.org/wiki/Wamsutter,_Wyoming"
21521,Wyoming,"Wheatland, Wyoming","https://en.wikipedia.org/wiki/Wheatland,_Wyoming"
21522,Wyoming,"Worland, Wyoming","https://en.wikipedia.org/wiki/Worland,_Wyoming"
21523,Wyoming,"Wright, Wyoming","https://en.wikipedia.org/wiki/Wright,_Wyoming"


### Step 2: Getting last revision id for an article



In [51]:

### Define constants
API_ENWIKIPEDIA_ENDPOINT = "https://en.wikipedia.org/w/api.php"
API_THROTTLE_WAIT = 0.1

# Define other constants and variables
REQUEST_HEADERS = {
    'User-Agent': '<aadi2000@uw.edu>, University of Washington, MSDS DATA 512 - AUTUMN 2023',
}

PAGEINFO_EXTENDED_PROPERTIES = "talkid|url|watched|watchers"

PAGEINFO_PARAMS_TEMPLATE = {
    "action": "query",
    "format": "json",
    "titles": "",  # to simplify this should be a single page title at a time
    "prop": "info",
    "inprop": PAGEINFO_EXTENDED_PROPERTIES
}
       

async def request_pageinfo_per_article(session, article_title, endpoint_url=API_ENWIKIPEDIA_ENDPOINT,
                                       request_template=PAGEINFO_PARAMS_TEMPLATE, headers=REQUEST_HEADERS):
    request_template['titles'] = article_title

    if not request_template['titles']:
        raise Exception("Must supply an article title to make a pageinfo request.")

    async with session.get(endpoint_url, headers=headers, params=request_template) as response:
        try:
            json_response = await response.json()
            # Get the page ID for the article
            page_id = list(json_response['query']['pages'].keys())[0]
            lastrevid = json_response['query']['pages'][page_id]['lastrevid']
            return lastrevid
        except Exception as e:
            print(f"Failed to retrieve data for {article_title}: {e}")
            return None


async def main(article_titles, page_info_dict, unable_to_get):
    async with aiohttp.ClientSession() as session:
        tasks = []
        count = 0  # Initialize a counter
        for i, title in enumerate(article_titles):
            task = request_pageinfo_per_article(session, title)
            tasks.append(task)
            if i % 10 == 0:
                print(f"{i} of {len(article_titles)} have been requested")
                
            if (i + 1) % 1000 == 0:
                print(f"{i + 1} articles' IDs have been stored")
                count += 1
        responses = await asyncio.gather(*tasks)
        for i, response in enumerate(responses):
            if response is not None:
                page_info_dict[article_titles[i]] = response
            else:
                print(f"Failed to retrieve data for {article_titles[i]}")
                unable_to_get[article_titles[i]] = 1
        print(f"{count * 1000} articles are done")
    return page_info_dict, unable_to_get

# Define the list of articles
article_list = list(us_cities_by_state["page_title"])


0 of 3 have been requested
0 articles are done


({'Worland, Wyoming': 1166347917,
  'Wright, Wyoming': 1166334449,
  'Yoder, Wyoming': 1171182284},
 {})

In [94]:
import json

# Define the batch size
batch_size = 1000
batch_number = 0

# Create a list to store the results of each batch
results = []

# Iterate over the article_list in batches of size batch_size
for i in range(0, len(article_list), batch_size):
    batch_number += 1
    article_list1 = article_list[i:i + batch_size]
    page_info_dict = {}
    unable_to_get = {}
    await main(article_list1, page_info_dict, unable_to_get)
    
    # Save the results of this batch to a JSON file
    batch_result = {
        "batch_number": batch_number,
        "page_info_dict": page_info_dict,
        "unable_to_get": unable_to_get
    }
    results.append(batch_result)

    await asyncio.sleep(10) #to reduce the burden on the servers and get less failures

0 of 1000 have been requested
10 of 1000 have been requested
20 of 1000 have been requested
30 of 1000 have been requested
40 of 1000 have been requested
50 of 1000 have been requested
60 of 1000 have been requested
70 of 1000 have been requested
80 of 1000 have been requested
90 of 1000 have been requested
100 of 1000 have been requested
110 of 1000 have been requested
120 of 1000 have been requested
130 of 1000 have been requested
140 of 1000 have been requested
150 of 1000 have been requested
160 of 1000 have been requested
170 of 1000 have been requested
180 of 1000 have been requested
190 of 1000 have been requested
200 of 1000 have been requested
210 of 1000 have been requested
220 of 1000 have been requested
230 of 1000 have been requested
240 of 1000 have been requested
250 of 1000 have been requested
260 of 1000 have been requested
270 of 1000 have been requested
280 of 1000 have been requested
290 of 1000 have been requested
300 of 1000 have been requested
310 of 1000 have be

In [95]:
#Check for any values that were not retreived
for batch_result in results:
    print(f"Batch {batch_result['batch_number']}:")
    print(f"page_info_dict length: {len(batch_result['page_info_dict'])}")
    print(f"unable_to_get length: {len(batch_result['unable_to_get'])}")

Batch 1:
page_info_dict length: 1000
unable_to_get length: 0
Batch 2:
page_info_dict length: 1000
unable_to_get length: 0
Batch 3:
page_info_dict length: 1000
unable_to_get length: 0
Batch 4:
page_info_dict length: 1000
unable_to_get length: 0
Batch 5:
page_info_dict length: 1000
unable_to_get length: 0
Batch 6:
page_info_dict length: 1000
unable_to_get length: 0
Batch 7:
page_info_dict length: 1000
unable_to_get length: 0
Batch 8:
page_info_dict length: 1000
unable_to_get length: 0
Batch 9:
page_info_dict length: 1000
unable_to_get length: 0
Batch 10:
page_info_dict length: 1000
unable_to_get length: 0
Batch 11:
page_info_dict length: 1000
unable_to_get length: 0
Batch 12:
page_info_dict length: 1000
unable_to_get length: 0
Batch 13:
page_info_dict length: 1000
unable_to_get length: 0
Batch 14:
page_info_dict length: 1000
unable_to_get length: 0
Batch 15:
page_info_dict length: 1000
unable_to_get length: 0
Batch 16:
page_info_dict length: 1000
unable_to_get length: 0
Batch 17:
page_in

In [96]:
# Merge all page_info_dict dictionaries
all_page_info_dict = {}
for batch_result in results:
    all_page_info_dict.update(batch_result['page_info_dict'])

print(len(all_page_info_dict))

with open('data/all_page_info_dict.json', 'w') as json_file:
    json.dump(all_page_info_dict, json_file)

In [None]:
rev_id = pd.DataFrame(list(all_page_info_dict.items()), columns=['page_title', 'rev_id'])
us_cities_by_state = us_cities_by_state.merge(rev_id,on='page_title')

In [123]:
us_cities_by_state.isna().sum()

state         0
page_title    0
url           0
rev_id        0
dtype: int64

In [120]:
us_cities_by_state.to_csv('data/cleaned_us_cities_by_state_with_id.csv',index=False)

There seems to be no empty values, so it seems to be good for further processes

### Getting the ORES scores

In [124]:
us_cities_by_state = pd.read_csv('data/cleaned_us_cities_by_state_with_id.csv')

In [125]:
import json, time, urllib.parse
import requests

In [126]:
email = ""
username = ""
access_token = ""

In [127]:
#    The current LiftWing ORES API endpoint and prediction model
#
API_ORES_LIFTWING_ENDPOINT = "https://api.wikimedia.org/service/lw/inference/v1/models/{model_name}:predict"
API_ORES_EN_QUALITY_MODEL = "enwiki-articlequality"

#
#    The throttling rate is a function of the Access token that you are granted when you request the token. The constants
#    come from dissecting the token and getting the rate limits from the granted token. An example of that is below.
#
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = (60.0/5000.0)-API_LATENCY_ASSUMED

#    When making automated requests we should include something that is unique to the person making the request
#    This should include an email - your UW email would be good to put in there
#    
#    Because all LiftWing API requests require some form of authentication, you need to provide your access token
#    as part of the header too
#
REQUEST_HEADER_TEMPLATE = {
    'User-Agent': "<"+email+">, University of Washington, MSDS DATA 512 - AUTUMN 2023",
    'Content-Type': 'application/json',
    'Authorization': "Bearer "+access_token
}
#
#    This is a template for the parameters that we need to supply in the headers of an API request
#
REQUEST_HEADER_PARAMS_TEMPLATE = {
    'email_address' : email,    
    'access_token'  : access_token
}

#
#    A dictionary of English Wikipedia article titles (keys) and sample revision IDs that can be used for this ORES scoring example
#
ex_article_revisions = { 'Bison':1085687913 , 'Northern flicker':1086582504 , 'Red squirrel':1083787665 , 'Chinook salmon':1085406228 , 'Horseshoe bat':1060601936 }

#
#    This is a template of the data required as a payload when making a scoring request of the ORES model
#
ORES_REQUEST_DATA_TEMPLATE = {
    "lang":        "en",     # required that its english - we're scoring English Wikipedia revisions
    "rev_id":      "",       # this request requires a revision id
    "features":    True
}

#
#    These are used later - defined here so they, at least, have empty values
#
USERNAME = username
ACCESS_TOKEN = access_token
#

In [128]:
#ORES API puller fcn
def request_ores_score_per_article(article_revid = None, email_address=None, access_token=None,
                                   endpoint_url = API_ORES_LIFTWING_ENDPOINT, 
                                   model_name = API_ORES_EN_QUALITY_MODEL, 
                                   request_data = ORES_REQUEST_DATA_TEMPLATE, 
                                   header_format = REQUEST_HEADER_TEMPLATE, 
                                   header_params = REQUEST_HEADER_PARAMS_TEMPLATE):
    
    #    Make sure we have an article revision id, email and token
    #    This approach prioritizes the parameters passed in when making the call
    if article_revid:
        request_data['rev_id'] = article_revid
    if email_address:
        header_params['email_address'] = email_address
    if access_token:
        header_params['access_token'] = access_token
    
    #   Making a request requires a revision id - an email address - and the access token
    if not request_data['rev_id']:
        raise Exception("Must provide an article revision id (rev_id) to score articles")
    if not header_params['email_address']:
        raise Exception("Must provide an 'email_address' value")
    if not header_params['access_token']:
        raise Exception("Must provide an 'access_token' value")
    
    # Create the request URL with the specified model parameter - default is a article quality score request
    request_url = endpoint_url.format(model_name=model_name)
    
    # Create a compliant request header from the template and the supplied parameters
    headers = dict()
    for key in header_format.keys():
        headers[str(key)] = header_format[key].format(**header_params)
    
    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free data
        # source like ORES - or other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        #response = requests.get(request_url, headers=headers)
        response = requests.post(request_url, headers=headers, data=json.dumps(request_data))
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response

In [None]:
#   Which article - the key for the article dictionary defined above
article_title = "Bison"
#
print(f"Getting LiftWing ORES scores for '{article_title}' with revid: {ex_article_revisions[article_title]:d}")
#
#    Make the call, just pass in the article revision ID, email address, and access token
score = request_ores_score_per_article(article_revid=ex_article_revisions[article_title],
                                       email_address=email,
                                       access_token=ACCESS_TOKEN)
#
#    Output the result
print(json.dumps(score,indent=4))


## Scratchpad