# Homework 2 - Considering Bias in Data

### Step 1: Getting the data

In [106]:
## import statements
import pandas as pd
import json, time, urllib.parse
import requests

import aiohttp
import asyncio

In [107]:
#fetching the data
us_cities_by_state = pd.read_csv('data/us_cities_by_state_SEPT.2023.csv')
#state_populations = pd.read_excel('data/NST-EST2022-POP.xlsx')
#region_reference = pd.read_excel("data/US States by Region - US Census Bureau.xlsx")


In [108]:
# print(f"Getting page info data for: {ARTICLE_TITLES[3]}")
# info = request_pageinfo_per_article(ARTICLE_TITLES[3])
# print(json.dumps(info,indent=4))

#### Cleaning up the dataframe

In [109]:
## inconsistencies with the state list

# Dropping duplicates
us_cities_by_state.drop_duplicates(inplace=True, ignore_index=True)


In [110]:
#checking to see if we missed any unwanted entries
a = list(us_cities_by_state.page_title)
b = {}
c = set()

for state in a:
    if state in b:
        c.add(state)
    else:
        b[state] = 1
us_cities_by_state[us_cities_by_state['page_title'].isin(c)]

Unnamed: 0,state,page_title,url
1683,Colorado,2020 United States census,https://en.wikipedia.org/wiki/2020_United_Stat...
1684,Colorado,2010 United States census,https://en.wikipedia.org/wiki/2010_United_Stat...
2030,Florida,County (United States),https://en.wikipedia.org/wiki/County_(United_S...
5196,Iowa,County (United States),https://en.wikipedia.org/wiki/County_(United_S...
12947,New_York,Population,https://en.wikipedia.org/wiki/Population
18433,Tennessee,County (United States),https://en.wikipedia.org/wiki/County_(United_S...
18780,Texas,Population,https://en.wikipedia.org/wiki/Population
18781,Texas,2020 United States census,https://en.wikipedia.org/wiki/2020_United_Stat...
18782,Texas,2010 United States census,https://en.wikipedia.org/wiki/2010_United_Stat...
21234,Wisconsin,County (United States),https://en.wikipedia.org/wiki/County_(United_S...


We now need to remove these rows from the original dataframe

In [111]:
us_cities_by_state.drop(us_cities_by_state[us_cities_by_state['page_title'].isin(c)].index, inplace=True)


In [112]:
us_cities_by_state

Unnamed: 0,state,page_title,url
0,Alabama,"Abbeville, Alabama","https://en.wikipedia.org/wiki/Abbeville,_Alabama"
1,Alabama,"Adamsville, Alabama","https://en.wikipedia.org/wiki/Adamsville,_Alabama"
2,Alabama,"Addison, Alabama","https://en.wikipedia.org/wiki/Addison,_Alabama"
3,Alabama,"Akron, Alabama","https://en.wikipedia.org/wiki/Akron,_Alabama"
4,Alabama,"Alabaster, Alabama","https://en.wikipedia.org/wiki/Alabaster,_Alabama"
...,...,...,...
21520,Wyoming,"Wamsutter, Wyoming","https://en.wikipedia.org/wiki/Wamsutter,_Wyoming"
21521,Wyoming,"Wheatland, Wyoming","https://en.wikipedia.org/wiki/Wheatland,_Wyoming"
21522,Wyoming,"Worland, Wyoming","https://en.wikipedia.org/wiki/Worland,_Wyoming"
21523,Wyoming,"Wright, Wyoming","https://en.wikipedia.org/wiki/Wright,_Wyoming"


### Step 2: Getting last revision id for an article



In [113]:

### Define constants
API_ENWIKIPEDIA_ENDPOINT = "https://en.wikipedia.org/w/api.php"
API_THROTTLE_WAIT = 0.1

# Define other constants and variables
REQUEST_HEADERS = {
    'User-Agent': '<aadi2000@uw.edu>, University of Washington, MSDS DATA 512 - AUTUMN 2023',
}

PAGEINFO_EXTENDED_PROPERTIES = "talkid|url|watched|watchers"

PAGEINFO_PARAMS_TEMPLATE = {
    "action": "query",
    "format": "json",
    "titles": "",  # to simplify this should be a single page title at a time
    "prop": "info",
    "inprop": PAGEINFO_EXTENDED_PROPERTIES
}
       

async def request_pageinfo_per_article(session, article_title, endpoint_url=API_ENWIKIPEDIA_ENDPOINT,
                                       request_template=PAGEINFO_PARAMS_TEMPLATE, headers=REQUEST_HEADERS):
    request_template['titles'] = article_title

    if not request_template['titles']:
        raise Exception("Must supply an article title to make a pageinfo request.")

    async with session.get(endpoint_url, headers=headers, params=request_template) as response:
        try:
            json_response = await response.json()
            # Get the page ID for the article
            page_id = list(json_response['query']['pages'].keys())[0]
            lastrevid = json_response['query']['pages'][page_id]['lastrevid']
            return lastrevid
        except Exception as e:
            print(f"Failed to retrieve data for {article_title}: {e}")
            return None


async def main(article_titles, page_info_dict, unable_to_get):
    async with aiohttp.ClientSession() as session:
        tasks = []
        count = 0  # Initialize a counter
        for i, title in enumerate(article_titles):
            task = request_pageinfo_per_article(session, title)
            tasks.append(task)
            if i % 10 == 0:
                print(f"{i} of {len(article_titles)} have been requested")
                
            if (i + 1) % 1000 == 0:
                print(f"{i + 1} articles' IDs have been stored")
                count += 1
        responses = await asyncio.gather(*tasks)
        for i, response in enumerate(responses):
            if response is not None:
                page_info_dict[article_titles[i]] = response
            else:
                print(f"Failed to retrieve data for {article_titles[i]}")
                unable_to_get[article_titles[i]] = 1
        print(f"{count * 1000} articles are done")
    return page_info_dict, unable_to_get

# Define the list of articles
article_list = list(us_cities_by_state["page_title"])


In [114]:
import json

# Define the batch size
batch_size = 1000
batch_number = 0

# Create a list to store the results of each batch
results = []

# Iterate over the article_list in batches of size batch_size
for i in range(0, len(article_list), batch_size):
    batch_number += 1
    article_list1 = article_list[i:i + batch_size]
    page_info_dict = {}
    unable_to_get = {}
    await main(article_list1, page_info_dict, unable_to_get)
    
    # Save the results of this batch to a JSON file
    batch_result = {
        "batch_number": batch_number,
        "page_info_dict": page_info_dict,
        "unable_to_get": unable_to_get
    }
    results.append(batch_result)

    await asyncio.sleep(10) #to reduce the burden on the servers and get less failures

0 of 1000 have been requested
10 of 1000 have been requested
20 of 1000 have been requested
30 of 1000 have been requested
40 of 1000 have been requested
50 of 1000 have been requested
60 of 1000 have been requested
70 of 1000 have been requested
80 of 1000 have been requested
90 of 1000 have been requested
100 of 1000 have been requested
110 of 1000 have been requested
120 of 1000 have been requested
130 of 1000 have been requested
140 of 1000 have been requested
150 of 1000 have been requested
160 of 1000 have been requested
170 of 1000 have been requested
180 of 1000 have been requested
190 of 1000 have been requested
200 of 1000 have been requested
210 of 1000 have been requested
220 of 1000 have been requested
230 of 1000 have been requested
240 of 1000 have been requested
250 of 1000 have been requested
260 of 1000 have been requested
270 of 1000 have been requested
280 of 1000 have been requested
290 of 1000 have been requested
300 of 1000 have been requested
310 of 1000 have be

1000 articles are done


CancelledError: 

In [None]:
#Check for any values that were not retreived
for batch_result in results:
    print(f"Batch {batch_result['batch_number']}:")
    print(f"page_info_dict length: {len(batch_result['page_info_dict'])}")
    print(f"unable_to_get length: {len(batch_result['unable_to_get'])}")

Batch 1:
page_info_dict length: 1000
unable_to_get length: 0
Batch 2:
page_info_dict length: 1000
unable_to_get length: 0
Batch 3:
page_info_dict length: 1000
unable_to_get length: 0
Batch 4:
page_info_dict length: 1000
unable_to_get length: 0
Batch 5:
page_info_dict length: 1000
unable_to_get length: 0
Batch 6:
page_info_dict length: 1000
unable_to_get length: 0
Batch 7:
page_info_dict length: 1000
unable_to_get length: 0
Batch 8:
page_info_dict length: 1000
unable_to_get length: 0
Batch 9:
page_info_dict length: 1000
unable_to_get length: 0
Batch 10:
page_info_dict length: 1000
unable_to_get length: 0
Batch 11:
page_info_dict length: 1000
unable_to_get length: 0
Batch 12:
page_info_dict length: 1000
unable_to_get length: 0
Batch 13:
page_info_dict length: 1000
unable_to_get length: 0
Batch 14:
page_info_dict length: 1000
unable_to_get length: 0
Batch 15:
page_info_dict length: 1000
unable_to_get length: 0
Batch 16:
page_info_dict length: 1000
unable_to_get length: 0
Batch 17:
page_in

In [None]:
# Merge all page_info_dict dictionaries
all_page_info_dict = {}
for batch_result in results:
    all_page_info_dict.update(batch_result['page_info_dict'])

print(len(all_page_info_dict))

with open('data/all_page_info_dict.json', 'w') as json_file:
    json.dump(all_page_info_dict, json_file)

In [None]:
rev_id = pd.DataFrame(list(all_page_info_dict.items()), columns=['page_title', 'rev_id'])
us_cities_by_state = us_cities_by_state.merge(rev_id,on='page_title')

In [None]:
us_cities_by_state.isna().sum()

state         0
page_title    0
url           0
rev_id        0
dtype: int64

In [None]:
us_cities_by_state.to_csv('data/cleaned_us_cities_by_state_with_id.csv',index=False)

There seems to be no empty values, so it seems to be good for further processes

### Step 3: Getting the ORES scores

In [None]:
us_cities_by_state = pd.read_csv('data/cleaned_us_cities_by_state_with_id.csv')

In [None]:
import json, time, urllib.parse
import requests

In [135]:
# import tracemalloc
# tracemalloc.start()

In [None]:
email = "aadi2000@uw.edu"
username = "DrSniperwolf"
access_token = "<ENTER YOUR ACCESS TOKEN HERE"


In [None]:
### Using async functions to generate multiple requests
import asyncio
import json
import requests
import aiohttp


API_ORES_LIFTWING_ENDPOINT = "https://api.wikimedia.org/service/lw/inference/v1/models/{model_name}:predict"
API_ORES_EN_QUALITY_MODEL = "enwiki-articlequality"
API_LATENCY_ASSUMED = 0.002
API_THROTTLE_WAIT = (60.0 / 5000.0) - API_LATENCY_ASSUMED

REQUEST_HEADER_TEMPLATE = {
    'User-Agent': "<"+email+">, University of Washington, MSDS DATA 512 - AUTUMN 2023",
    'Content-Type': 'application/json',
    'Authorization': "Bearer "+access_token
}

REQUEST_HEADER_PARAMS_TEMPLATE = {
    'email_address': email,
    'access_token': access_token
}

ex_article_revisions = {'Bison': 1085687913, 'Northern flicker': 1086582504, 'Red squirrel': 1083787665,
                        'Chinook salmon': 1085406228, 'Horseshoe bat': 1060601936}

ORES_REQUEST_DATA_TEMPLATE = {
    "lang": "en",
    "rev_id": "",
    "features": True
}



article_list = list(us_cities_by_state.page_title)
article_list = article_list[21500:25000]

ores_scores = {}
no_prediction = []
key_error_list = []

article_id = {}

with open('data/all_page_info_dict.json', 'r') as file:
    article_id = json.load(file)

In [None]:
import asyncio
import json
import aiohttp

# ... Your existing constants and data ...

async def request_ores_score_per_article(session, article_revid=None, email_address=None, access_token=None,
                                        endpoint_url=API_ORES_LIFTWING_ENDPOINT,
                                        model_name=API_ORES_EN_QUALITY_MODEL,
                                        request_data=ORES_REQUEST_DATA_TEMPLATE,
                                        header_format=REQUEST_HEADER_TEMPLATE,
                                        header_params=REQUEST_HEADER_PARAMS_TEMPLATE):
    if article_revid:
        request_data['rev_id'] = article_revid
    if email_address:
        header_params['email_address'] = email_address
    if access_token:
        header_params['access_token'] = access_token

    if not request_data['rev_id']:
        raise Exception("Must provide an article revision id (rev_id) to score articles")
    if not header_params['email_address']:
        raise Exception("Must provide an 'email_address' value")
    if not header_params['access_token']:
        raise Exception("Must provide an 'access_token' value")

    request_url = endpoint_url.format(model_name=model_name)

    headers = {}
    for key in header_format.keys():
        headers[str(key)] = header_format[key].format(**header_params)

    try:
        print(request_data)
        async with session.post(request_url, headers=headers, data=json.dumps(request_data)) as response:
            json_response = await response.json()
        print(f"Task completed for rev_id {article_revid}")  # Add this line for debugging
        #print(json_response)
    except Exception as e:
        print(e)
        json_response = None
    return json_response

In [None]:
import time

async def process_articles():
    async with aiohttp.ClientSession() as session:
        article_count = 0
        tasks = []
        articles_processed = 0  # Counter for articles processed
        articles_per_iteration = 1000 # Number of articles to process in each iteration

        for article in article_list:
            task = request_ores_score_per_article(session, article_revid=article_id[article],
                                                  email_address=email, access_token=access_token)
            tasks.append(task)

            # Increment the counter
            articles_processed += 1

            if articles_processed % articles_per_iteration == 0:
                # If 1000 articles have been processed, gather the results
                results = await asyncio.gather(*tasks)
                for article, score in zip(article_list[articles_processed - articles_per_iteration:articles_processed], results):
                    if score is None:
                        no_prediction.append(article)
                    else:
                        try:
                            print(article, score)
                            ores_scores[article] = score['enwiki']['scores'][str(article_id[article])]['articlequality']['score']['prediction']
                        except KeyError:
                            key_error_list.append(article)

                # Reset tasks for the next iteration
                tasks = []

                # Wait for 5 seconds
                time.sleep(5)

        # Save the final output to JSON
        ores_scores_json_object = json.dumps(ores_scores, indent=4)
        with open(f'ores_scores_final_6.json', 'w') as outfile:
            outfile.write(ores_scores_json_object)


In [None]:
await process_articles()

  await process_articles()
Object allocated at (most recent call last):
  File "C:\Users\aadit\AppData\Local\Temp\ipykernel_21684\958723189.py", lineno 11
    task = request_ores_score_per_article(session, article_revid=article_id[article],


### Step 4: Merging all the files

We have to merge merged_data_ores.json and the us_cities_by_state-cleaned to a new one

In [122]:
with open("data/merged_data_ores.json", 'r') as file:
    ores_final = json.load(file)

us_cities_by_state = pd.read_csv('data/cleaned_us_cities_by_state_with_id.csv')

In [123]:
print(us_cities_by_state.shape[0],len(ores_final))

21515 21515


Seems like we have captured the values for all the articles. Let us join these 2 datasets

In [124]:
# Merge on 'page_title'
us_cities_by_state['quality'] = us_cities_by_state['page_title'].map(ores_final)

In [125]:
us_cities_by_state

Unnamed: 0,state,page_title,url,rev_id,quality
0,Alabama,"Abbeville, Alabama","https://en.wikipedia.org/wiki/Abbeville,_Alabama",1171163550,C
1,Alabama,"Adamsville, Alabama","https://en.wikipedia.org/wiki/Adamsville,_Alabama",1177621427,C
2,Alabama,"Addison, Alabama","https://en.wikipedia.org/wiki/Addison,_Alabama",1168359898,C
3,Alabama,"Akron, Alabama","https://en.wikipedia.org/wiki/Akron,_Alabama",1165909508,GA
4,Alabama,"Alabaster, Alabama","https://en.wikipedia.org/wiki/Alabaster,_Alabama",1179139816,C
...,...,...,...,...,...
21510,Wyoming,"Wamsutter, Wyoming","https://en.wikipedia.org/wiki/Wamsutter,_Wyoming",1169591845,GA
21511,Wyoming,"Wheatland, Wyoming","https://en.wikipedia.org/wiki/Wheatland,_Wyoming",1176370621,GA
21512,Wyoming,"Worland, Wyoming","https://en.wikipedia.org/wiki/Worland,_Wyoming",1166347917,GA
21513,Wyoming,"Wright, Wyoming","https://en.wikipedia.org/wiki/Wright,_Wyoming",1166334449,GA


In [126]:
us_cities_by_state[us_cities_by_state['quality'].isna()]

Unnamed: 0,state,page_title,url,rev_id,quality


Let us now add the population data and the regions information for each state

In [127]:
# Load the US population data from an Excel file, skipping the first 4 rows of metadata.
us_population_data = pd.read_excel('data/NST-EST2022-POP.xlsx', skiprows=4)

# Remove unnecessary rows and reset the index.
us_population_data = us_population_data[4:]
us_population_data.reset_index(drop=True, inplace=True)

# Rename the columns for clarity.
us_population_data.columns = ['state', '2020_est', '2020', '2021', '2022']

# Filter and clean the data: Keep only rows where the 'state' names start with a period.
us_population_data = us_population_data[us_population_data['state'].str.contains('^\.', na=False)]
us_population_data['state'] = us_population_data['state'].str.slice(1)
us_population_data = us_population_data[['state', '2022']].reset_index(drop=True)

# Rename the columns to reflect the year.
us_population_data.columns = ['state', 'Population_2022']

#us_population_data


In [128]:
# Load regional division data from an Excel file
us_regions = pd.read_excel('data/US States by Region - US Census Bureau.xlsx')

# Fill missing REGION and DIVISION values with the previous valid value
us_regions['REGION'].ffill(inplace=True)
us_regions['DIVISION'].ffill(inplace=True)

# Filter rows with valid STATE entries (remove non-state rows)
us_regions = us_regions.dropna(subset=['STATE'])

# Convert column names to lowercase for consistency
us_regions.columns = us_regions.columns.str.lower()

#us_regions



In [129]:
us_cities_by_state['state'].unique()

array(['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
       'Colorado', 'Delaware', 'Florida', 'Georgia_(U.S._state)',
       'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas',
       'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts',
       'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana',
       'Nevada', 'New_Hampshire', 'New_Jersey', 'New_Mexico', 'New_York',
       'North_Carolina', 'North_Dakota', 'Ohio', 'Oklahoma', 'Oregon',
       'Pennsylvania', 'Rhode_Island', 'South_Carolina', 'South_Dakota',
       'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
       'West_Virginia', 'Wisconsin', 'Wyoming'], dtype=object)

Looks like we need to clean some of the state names to merge with the other datasets

In [130]:
# Clean state names, handling any specific cases
us_cities_by_state['state'] = us_cities_by_state['state'].apply(lambda x: 'Georgia' if x == 'Georgia_(U.S._state)' else x)

# The cityxstate data had values like New_york, fixing that
us_cities_by_state['state'] = us_cities_by_state.state.str.replace('_', ' ')

In [143]:
# merge all the datasets
temp1 = us_regions.merge(us_population_data, on = 'state').merge(us_cities_by_state, on='state')
temp1.shape

(21515, 8)

In [144]:
temp1.drop(columns = ['url','region'], inplace = True)


In [146]:
renaming = {
    'division': 'regional_division',
    'Population_2022': 'population',
    'page_title':'article_title',
    'rev_id': 'revision_id',
    'quality':'article_quality'
}

temp1.rename(columns=renaming, inplace=True)

In [148]:
temp1.sort_values('article_title')

Unnamed: 0,regional_division,state,population,article_title,revision_id,article_quality
20851,Pacific,Hawaii,1440196.0,"'Ewa Gentry, Hawaii",419689771,Stub
15396,East South Central,Alabama,5074296.0,"Abbeville, Alabama",1171163550,C
14015,South Atlantic,Georgia,10912876.0,"Abbeville, Georgia",1171167087,C
17424,West South Central,Louisiana,4590241.0,"Abbeville, Louisiana",1178840199,C
16278,East South Central,Mississippi,2940057.0,"Abbeville, Mississippi",1171172603,C
...,...,...,...,...,...,...
14014,South Atlantic,Florida,22244823.0,"Zolfo Springs, Florida",1171166674,GA
11771,West North Central,Minnesota,5717184.0,"Zumbro Falls, Minnesota",1165908705,C
11252,West North Central,Minnesota,5717184.0,"Zumbrota, Minnesota",1170033989,C
11011,West North Central,Iowa,3200517.0,"Zwingle, Iowa",1171169323,C


In [149]:
temp1.to_csv('data/wp_scored_city_articles_by_state.csv',index=False)

## Scratchpad