# Homework 2 - Considering Bias in Data

### Step 1: Getting the data

In [2]:
## import statements
import pandas as pd
import json, time, urllib.parse
import requests

In [3]:
# Load the list of US city articles (one row per Wikipedia page) from the local data folder.
CITIES_BY_STATE_CSV = 'data/us_cities_by_state_SEPT.2023.csv'
us_cities_by_state = pd.read_csv(filepath_or_buffer=CITIES_BY_STATE_CSV)

# The two spreadsheets below are not loaded yet; the calls are kept for later steps.
#state_populations = pd.read_excel('data/NST-EST2022-POP.xlsx')
#region_reference = pd.read_excel("data/US States by Region - US Census Bureau.xlsx")


In [4]:
#########
#
#    CONSTANTS
#

# The basic English Wikipedia API endpoint
API_ENWIKIPEDIA_ENDPOINT = "https://en.wikipedia.org/w/api.php"

# We'll assume that there needs to be some throttling for these requests - we should always be nice to a free data resource
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_MAX_REQUESTS_PER_SECOND = 100.0   # Request rate the throttle below is derived from
# Seconds to sleep before each request so that sleep + assumed latency stays at the target rate
API_THROTTLE_WAIT = (1.0/API_MAX_REQUESTS_PER_SECOND)-API_LATENCY_ASSUMED

# When making automated requests we should include something that is unique to the person making the request.
# The template placeholder has been replaced with the contact address used elsewhere in this notebook,
# so the API operators can actually reach the person running it.
REQUEST_HEADERS = {
    'User-Agent': 'aadi2000@uw.edu, University of Washington, MSDS DATA 512 - AUTUMN 2023',
}

# This is just a list of English Wikipedia article titles that we can use for example requests
ARTICLE_TITLES = [ 'Bison', 'Northern flicker', 'Red squirrel', 'Chinook salmon', 'Horseshoe bat' ]

# This is a string of additional page properties that can be returned; see the Info documentation for
# what can be included. If you don't want any, this can simply be the empty string
PAGEINFO_EXTENDED_PROPERTIES = "talkid|url|watched|watchers"
#PAGEINFO_EXTENDED_PROPERTIES = ""

# This template lists the basic parameters for making a page info request
PAGEINFO_PARAMS_TEMPLATE = {
    "action": "query",
    "format": "json",
    "titles": "",           # to simplify this should be a single page title at a time
    "prop": "info",
    "inprop": PAGEINFO_EXTENDED_PROPERTIES
}

#########
#
#    PROCEDURES/FUNCTIONS
#

def request_pageinfo_per_article(article_title = None, 
                                 endpoint_url = API_ENWIKIPEDIA_ENDPOINT, 
                                 request_template = PAGEINFO_PARAMS_TEMPLATE,
                                 headers = REQUEST_HEADERS):
    """Request page info for a single article from a MediaWiki API endpoint.

    Parameters
    ----------
    article_title : str, optional
        Title to look up. When omitted, the 'titles' entry already present in
        `request_template` is used instead.
    endpoint_url : str
        URL the GET request is sent to.
    request_template : dict
        Query parameters for the request. It is copied before use, so the
        caller's dict (including the shared module-level default) is never modified.
    headers : dict
        HTTP headers sent with the request.

    Returns
    -------
    dict or None
        The decoded JSON response, or None when the request or the JSON
        decoding raised (the exception is printed).

    Raises
    ------
    Exception
        If no title is available from either `article_title` or the template.
    """
    # Copy the template: writing the title into the shared default dict would leak it
    # into later calls, which would then silently reuse a stale title instead of raising.
    params = dict(request_template)

    # article title can be as a parameter to the call or in the request_template
    if article_title:
        params['titles'] = article_title

    if not params.get('titles'):
        raise Exception("Must supply an article title to make a pageinfo request.")

    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free
        # data source like Wikipedia - or any other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        # an explicit timeout keeps one stalled connection from hanging a long batch of requests
        response = requests.get(endpoint_url, headers=headers, params=params, timeout=30)
        json_response = response.json()
    except Exception as e:
        # best effort: report the problem and let the caller decide what to do with None
        print(e)
        json_response = None
    return json_response

In [5]:
# Smoke-test the request helper against one of the sample titles and pretty-print the reply.
example_title = ARTICLE_TITLES[3]
print(f"Getting page info data for: {example_title}")
info = request_pageinfo_per_article(example_title)
pretty_info = json.dumps(info,indent=4)
print(pretty_info)

Getting page info data for: Chinook salmon
{
    "batchcomplete": "",
    "query": {
        "pages": {
            "1212891": {
                "pageid": 1212891,
                "ns": 0,
                "title": "Chinook salmon",
                "contentmodel": "wikitext",
                "pagelanguage": "en",
                "pagelanguagehtmlcode": "en",
                "pagelanguagedir": "ltr",
                "touched": "2023-10-10T22:39:15Z",
                "lastrevid": 1178125499,
                "length": 49187,
                "watchers": 102,
                "talkid": 3909817,
                "fullurl": "https://en.wikipedia.org/wiki/Chinook_salmon",
                "editurl": "https://en.wikipedia.org/w/index.php?title=Chinook_salmon&action=edit",
                "canonicalurl": "https://en.wikipedia.org/wiki/Chinook_salmon"
            }
        }
    }
}


#### Cleaning up the dataframe

In [8]:
## inconsistencies with the state list

# Remove rows that are exact copies of an earlier row (the first occurrence is kept)
# and renumber the index from 0 so it stays contiguous.
us_cities_by_state.drop_duplicates(
    subset=None,
    keep='first',
    inplace=True,
    ignore_index=True,
)


In [21]:
# Checking to see if we missed any unwanted entries: after exact-duplicate rows are gone,
# find every page_title that still occurs on more than one row.
title_is_repeated = us_cities_by_state['page_title'].duplicated(keep=False)

# `c` is the set of repeated titles; the name is kept because the cell that drops these rows reads it.
c = set(us_cities_by_state.loc[title_is_repeated, 'page_title'])

# Show all rows that share a title with some other row (empty when every title is unique).
us_cities_by_state[title_is_repeated]

Unnamed: 0,state,page_title,url,state_in_page_title


We now need to remove these rows (every row whose `page_title` appears more than once) from the original dataframe.

In [10]:
# Drop, in place, every row whose page_title is in the set of repeated titles `c`.
repeated_title_mask = us_cities_by_state['page_title'].isin(c)
rows_to_remove = us_cities_by_state.index[repeated_title_mask]
us_cities_by_state.drop(index=rows_to_remove, inplace=True)


In [59]:
# Show the table as it stands after the clean-up cells above have run.
us_cities_by_state

Unnamed: 0,state,page_title,url
0,Alabama,"Abbeville, Alabama","https://en.wikipedia.org/wiki/Abbeville,_Alabama"
1,Alabama,"Adamsville, Alabama","https://en.wikipedia.org/wiki/Adamsville,_Alabama"
2,Alabama,"Addison, Alabama","https://en.wikipedia.org/wiki/Addison,_Alabama"
3,Alabama,"Akron, Alabama","https://en.wikipedia.org/wiki/Akron,_Alabama"
4,Alabama,"Alabaster, Alabama","https://en.wikipedia.org/wiki/Alabaster,_Alabama"
...,...,...,...
21520,Wyoming,"Wamsutter, Wyoming","https://en.wikipedia.org/wiki/Wamsutter,_Wyoming"
21521,Wyoming,"Wheatland, Wyoming","https://en.wikipedia.org/wiki/Wheatland,_Wyoming"
21522,Wyoming,"Worland, Wyoming","https://en.wikipedia.org/wiki/Worland,_Wyoming"
21523,Wyoming,"Wright, Wyoming","https://en.wikipedia.org/wiki/Wright,_Wyoming"


### Step 2: Getting Article Quality Predictions

## Scratchpad

In [63]:
#!pip install wikipedia-api
import wikipediaapi


In [67]:
def is_place_or_city_wikipedia(title, wiki=None):
    """Guess from its text whether an English Wikipedia article is about a settlement.

    This is a keyword heuristic: the article counts as a place when its text
    contains one of the words city, town, municipality, capital or village
    (singular or plural) as a whole word. Whole-word matching avoids hits
    inside unrelated words such as "capacity" or "electricity".

    Parameters
    ----------
    title : str
        Title of the article to look up.
    wiki : object, optional
        Client exposing ``page(title)`` whose result offers ``exists()`` and
        ``text``. When omitted, a ``wikipediaapi.Wikipedia`` client is created
        on first use and reused by later calls.

    Returns
    -------
    bool or None
        True when a keyword is found, False when none is found, and None when
        the article does not exist.
    """
    import re  # local import: `re` is not imported anywhere before this cell

    if wiki is None:
        # Reuse one client across calls instead of rebuilding it for every title.
        wiki = getattr(is_place_or_city_wikipedia, '_client', None)
        if wiki is None:
            # Identify ourselves once through `user_agent` only; supplying a second,
            # placeholder User-Agent via `headers` made it ambiguous which one is sent.
            wiki = wikipediaapi.Wikipedia(
                language='en',
                extract_format=wikipediaapi.ExtractFormat.WIKI,
                user_agent='aadi2000@uw.edu, University of Washington, MSDS DATA 512 - AUTUMN 2023',
            )
            is_place_or_city_wikipedia._client = wiki

    page = wiki.page(title)

    if not page.exists():
        return None  # Article not found

    # Lower-case the text once so the match is case-insensitive.
    text = page.text.lower()
    keyword_pattern = r'\b(?:city|cities|towns?|municipalit(?:y|ies)|capitals?|villages?)\b'
    return re.search(keyword_pattern, text) is not None

# Run the heuristic over every article title in the city table and print one verdict per title.
# Note: this performs one lookup per row, so it takes a long time on the full table.
for title in us_cities_by_state['page_title']:
    verdict = is_place_or_city_wikipedia(title)
    if verdict is False:
        print(f"{title} is not a place or city.")
    elif verdict:
        print(f"{title} is likely a place or city.")
    else:
        # None means the article could not be found
        print(f"Unable to determine for {title}.")


Abbeville, Alabama is likely a place or city.
Adamsville, Alabama is likely a place or city.
Addison, Alabama is likely a place or city.
Akron, Alabama is likely a place or city.
Alabaster, Alabama is likely a place or city.
Albertville, Alabama is likely a place or city.
Alexander City, Alabama is likely a place or city.
Aliceville, Alabama is likely a place or city.
Allgood, Alabama is likely a place or city.
Altoona, Alabama is likely a place or city.
Andalusia, Alabama is likely a place or city.
Anderson, Lauderdale County, Alabama is likely a place or city.
Anniston, Alabama is likely a place or city.


KeyboardInterrupt: 

In [69]:
import wikipediaapi
import re

def is_place_or_city_wikipedia(title, wiki=None):
    """Guess from its text whether an English Wikipedia article is about a settlement.

    The article counts as a place when its text contains one of the words
    city, town, municipality, capital or village (singular or plural) as a
    whole word; an unanchored pattern would also fire inside unrelated words
    such as "capacity" or "electricity".

    Parameters
    ----------
    title : str
        Title of the article to look up.
    wiki : object, optional
        Client exposing ``page(title)`` whose result offers ``exists()`` and
        ``text``. When omitted, a ``wikipediaapi.Wikipedia`` client is created
        on first use and reused by later calls.

    Returns
    -------
    bool or None
        True when a keyword is found, False when none is found, and None when
        the article does not exist.
    """
    if wiki is None:
        # Reuse one client across calls instead of rebuilding it for every title.
        wiki = getattr(is_place_or_city_wikipedia, '_client', None)
        if wiki is None:
            # Identify ourselves through `user_agent` only; the 'Your User Agent'
            # placeholder header that used to be passed alongside it has been removed.
            wiki = wikipediaapi.Wikipedia(
                language='en',
                extract_format=wikipediaapi.ExtractFormat.WIKI,
                user_agent='aadi2000@uw.edu, University of Washington, MSDS DATA 512 - AUTUMN 2023',
            )
            is_place_or_city_wikipedia._client = wiki

    page = wiki.page(title)

    if not page.exists():
        return None  # Article not found

    # Get the article text and convert it to lowercase for case-insensitive matching
    text = page.text.lower()

    # Whole-word search for any of the keywords (or their plurals) in the text
    keyword_pattern = r'\b(?:city|cities|towns?|municipalit(?:y|ies)|capitals?|villages?)\b'
    if re.search(keyword_pattern, text):
        return True  # It's likely a place or city

    return False  # Not a place or city

# Try the heuristic on one settlement and two non-settlements to see how well it discriminates.
titles = ['New York City', 'Mount Everest', 'Banana Republic']
for title in titles:
    is_location = is_place_or_city_wikipedia(title)
    if is_location:
        message = f"{title} is likely a place or city."
    else:
        # A literal False means "no keyword found"; anything else falsy means the lookup failed
        message = (
            f"{title} is not a place or city."
            if is_location is False
            else f"Unable to determine for {title}."
        )
    print(message)


New York City is likely a place or city.
Mount Everest is likely a place or city.
Banana Republic is likely a place or city.


In [None]:
# us_cities_by_state['state_in_page_title'] = us_cities_by_state.apply(lambda row: row['state'] in row['page_title'], axis=1)
# us_cities_by_state[us_cities_by_state['state_in_page_title']==False].head(20)

#us_cities_by_state[us_cities_by_state['page_title'].str.contains('Denver')]
