# Homework 2
Aditi shrivastava

### License
This code example was developed by Dr. David W. McDonald for use in DATA 512, a course in the UW MS Data Science degree program. This code is provided under the [Creative Commons](https://creativecommons.org) [CC-BY license](https://creativecommons.org/licenses/by/4.0/). Revision 1.1 - August 14, 2023

In [60]:
import json, time, urllib.parse
import requests
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
import numpy as np

### Step 1: Requesting page info from the English Wikipedia API

The API request will be made using one procedure. The idea is to make this reusable. The procedure is parameterized, but relies on the constants defined in the next cell for the important parameters. The underlying assumption is that this will be used to request data for a set of article pages. Therefore, the parameter most likely to change is the article_title.

In [77]:
#########
#
#    CONSTANTS
#
#    Settings used by the page info request procedure: endpoint, throttling,
#    request headers, the article titles to look up and the query parameters.
#

# The basic English Wikipedia API endpoint
API_ENWIKIPEDIA_ENDPOINT = "https://en.wikipedia.org/w/api.php"

# We'll assume that there needs to be some throttling for these requests - we should always be nice to a free data resource
API_LATENCY_ASSUMED = 0.004       # Assumed latency of the API and network, in seconds (0.004 s = 4 ms)
# Seconds to sleep before each request: a budget of 1/100 s per request, less the assumed latency
API_THROTTLE_WAIT = (1.0/100.0)-API_LATENCY_ASSUMED

# When making automated requests we should include something that is unique to the person making the request
# This should include an email - your UW email would be good to put in there
REQUEST_HEADERS = {
    'User-Agent': '<adts@uw.edu>, University of Washington, MSDS DATA 512 - AUTUMN 2023',
}

# The English Wikipedia article titles to request, read from the page_title column of the input CSV
# (.values yields a numpy array of titles)
ARTICLE_TITLES = pd.read_csv('input_data/us_cities_by_state_SEPT.2023.csv').page_title.values

# This is a string of additional page properties that can be returned see the Info documentation for
# what can be included. If you don't want any this can simply be the empty string
# Example of a non-empty value: "talkid|url|watched|watchers"
PAGEINFO_EXTENDED_PROPERTIES = ""

# This template lists the basic parameters for making a page info query;
# 'titles' is left empty here and filled in for each request
PAGEINFO_PARAMS_TEMPLATE = {
    "action": "query",
    "format": "json",
    "titles": "",           # to simplify this should be a single page title at a time
    "prop": "info",
    "inprop": PAGEINFO_EXTENDED_PROPERTIES
}

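### Page info request procedure
The function below makes a single page info request and returns the parsed JSON response, or `None` if the request fails.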

In [78]:
#########
#
#    PROCEDURES/FUNCTIONS
#

def request_pageinfo_per_article(article_title = None, 
                                 endpoint_url = API_ENWIKIPEDIA_ENDPOINT, 
                                 request_template = PAGEINFO_PARAMS_TEMPLATE,
                                 headers = REQUEST_HEADERS):
    """Request page info for one or more article titles from a MediaWiki API endpoint.

    Args:
        article_title: Title(s) to look up. When given, it takes precedence over
            any 'titles' value already present in ``request_template``.
        endpoint_url: URL of the API endpoint to query.
        request_template: Dict of query parameters. It is never modified; the
            request is built from a copy.
        headers: HTTP headers sent with the request.

    Returns:
        The decoded JSON response, or ``None`` if the request or the JSON
        decoding failed (the error is printed).

    Raises:
        Exception: If no title is supplied, either as ``article_title`` or in
            ``request_template['titles']``.
    """
    # Build the parameters from a copy: the default template is a shared module-level
    # dict, and writing the title into it would leak state from one call to the next
    params = dict(request_template)

    # article title can be as a parameter to the call or in the request_template
    if article_title:
        params['titles'] = article_title

    if not params['titles']:
        raise Exception("Must supply an article title to make a pageinfo request.")

    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free
        # data source like Wikipedia - or any other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(endpoint_url, headers=headers, params=params, timeout=(10, 60))
        json_response = response.json()
    except Exception as e:
        # Best effort by design: report the problem and let the caller decide what to do with None
        print(e)
        json_response = None
    return json_response


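### Collecting revision IDs
Article titles are sent to the API in batches of 50, and the latest revision ID of each returned page is saved to `revision_ids.csv`.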

In [None]:
# Look up the latest revision id of every article, several titles per request,
# and save the result to revision_ids.csv.
dic = {}              # page id -> article title, state and latest revision id
failed_titles = []    # titles from batches whose request returned no usable response

titles_per_request = 50   # number of titles joined with '|' into one request

for start in range(0, len(ARTICLE_TITLES), titles_per_request):
    chunk = ARTICLE_TITLES[start:start + titles_per_request]
    request_info = PAGEINFO_PARAMS_TEMPLATE.copy()
    request_info['titles'] = '|'.join(chunk)
    info = request_pageinfo_per_article(request_template=request_info)

    # The request procedure returns None on failure, and an error response has no 'query' section
    if not info or 'query' not in info or 'pages' not in info['query']:
        failed_titles.extend(chunk)
        continue

    for page_id, page in info['query']['pages'].items():
        # Pages the API could not resolve carry no 'lastrevid'; there is nothing to score for them
        if 'lastrevid' not in page:
            continue
        # Keying by page id means a page that is returned more than once is stored only once
        dic[page_id] = {
            'article_title': page['title'],
            # The state is whatever follows the last ', ' in the title
            'state': page['title'].split(', ')[-1],
            'revision_id': page['lastrevid'],
        }

if failed_titles:
    print(str(len(failed_titles)) + ' titles were skipped because their request failed')

revids = pd.DataFrame.from_dict(dic, orient='index')
revids.to_csv('revision_ids.csv')

In [105]:
# Report how many input titles have no row in the saved revision-id file
print(
    f"There appear to be "
    f"{len(ARTICLE_TITLES) - len(pd.read_csv('revision_ids.csv')['article_title'])} "
    f"duplicate values that were removed after scraping"
)

There appear to be 638 duplicate values that were removed after scraping


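### Step 2: Requesting ORES article quality scores
The constants and procedure below are used to request an article quality prediction from the LiftWing ORES endpoint for each revision ID.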

In [2]:
#########
#
#    CONSTANTS
#
#    Settings used by the ORES scoring procedure: endpoint, throttling,
#    credentials, request headers and the request payload template.
#

import os

#    The current LiftWing ORES API endpoint and prediction model
#
API_ORES_LIFTWING_ENDPOINT = "https://api.wikimedia.org/service/lw/inference/v1/models/{model_name}:predict"
API_ORES_EN_QUALITY_MODEL = "enwiki-articlequality"


#
#    The throttling rate is a function of the Access token that you are granted when you request the token.
#    The token used for this notebook grants 5000 requests per HOUR, so the time budget per request is
#    (60*60)/5000 seconds, less the assumed latency.
#
#    NOTE: API_LATENCY_ASSUMED and API_THROTTLE_WAIT are also assigned in the Wikipedia page info constants
#    cell. Running this cell rebinds them, so the page info procedure uses this wait if it is called afterwards.
#
API_LATENCY_ASSUMED = 0.004       # Assumed latency of the API and network, in seconds (0.004 s = 4 ms)
API_THROTTLE_WAIT = ((60.0*60.0)/5000.0)-API_LATENCY_ASSUMED

#    Credentials used to fill in the request headers.
#
#    The access token is a secret and must not be stored in the notebook. It is read from the
#    WIKIMEDIA_ACCESS_TOKEN environment variable and falls back to an empty string, in which case the
#    scoring procedure raises "Must provide an 'access_token' value".
#    NOTE(review): a token was previously hard-coded in this cell - it should be revoked and re-issued.
#
USERNAME = "adts0000@gmail.com"
ACCESS_TOKEN = os.environ.get("WIKIMEDIA_ACCESS_TOKEN", "")

#    When making automated requests we should include something that is unique to the person making the request
#    This should include an email - your UW email would be good to put in there
#    
#    Because all LiftWing API requests require some form of authentication, you need to provide your access token
#    as part of the header too
#
REQUEST_HEADER_TEMPLATE = {
    'User-Agent': "<adts0000@gmail.com>, University of Washington, MSDS DATA 512 - AUTUMN 2023",
    'Content-Type': 'application/json',
    'Authorization': "Bearer {access_token}"
}

#    This is a template for the parameters that we need to supply in the headers of an API request
REQUEST_HEADER_PARAMS_TEMPLATE = {
    'email_address' : USERNAME,         # your email address should go here
    'access_token'  : ACCESS_TOKEN      # the access token you create will need to go here
}

#    A dictionary of English Wikipedia article titles (keys) and sample revision IDs that can be used for this ORES scoring example
#    To score the scraped articles instead: dict(zip(revids.article_title, revids.revision_id))
ARTICLE_REVISIONS = { 'Bison':1085687913 , 'Northern flicker':1086582504 , 'Red squirrel':1083787665 , 'Chinook salmon':1085406228 , 'Horseshoe bat':1060601936 }

#    This is a template of the data required as a payload when making a scoring request of the ORES model
ORES_REQUEST_DATA_TEMPLATE = {
    "lang":        "en",     # required that its english - we're scoring English Wikipedia revisions
    "rev_id":      "",       # this request requires a revision id
    "features":    True
}

In [3]:
#########
#
#    PROCEDURES/FUNCTIONS
#

def request_ores_score_per_article(article_revid):
    """Request an ORES article quality score for one revision from the LiftWing endpoint.

    The endpoint, model, credentials and templates are taken from the module-level
    constants (USERNAME, ACCESS_TOKEN, API_ORES_LIFTWING_ENDPOINT,
    API_ORES_EN_QUALITY_MODEL, ORES_REQUEST_DATA_TEMPLATE, REQUEST_HEADER_TEMPLATE,
    REQUEST_HEADER_PARAMS_TEMPLATE).

    Args:
        article_revid: Revision id of the article revision to score.

    Returns:
        The decoded JSON response, or ``None`` if the request or the JSON
        decoding failed (the error is printed).

    Raises:
        Exception: If the revision id, email address or access token is missing.
    """
    email_address = USERNAME
    access_token = ACCESS_TOKEN
    endpoint_url = API_ORES_LIFTWING_ENDPOINT
    model_name = API_ORES_EN_QUALITY_MODEL
    header_format = REQUEST_HEADER_TEMPLATE

    # Work on copies of the shared templates. This function is submitted to a thread pool
    # elsewhere in this file; writing the revision id into the shared dict would let one
    # call overwrite the id another call is about to send.
    request_data = dict(ORES_REQUEST_DATA_TEMPLATE)
    header_params = dict(REQUEST_HEADER_PARAMS_TEMPLATE)

    #    Make sure we have an article revision id, email and token
    #    This approach prioritizes the parameters passed in when making the call
    if article_revid:
        request_data['rev_id'] = article_revid
    if email_address:
        header_params['email_address'] = email_address
    if access_token:
        header_params['access_token'] = access_token

    #   Making a request requires a revision id - an email address - and the access token
    if not request_data['rev_id']:
        raise Exception("Must provide an article revision id (rev_id) to score articles")
    if not header_params['email_address']:
        raise Exception("Must provide an 'email_address' value")
    if not header_params['access_token']:
        raise Exception("Must provide an 'access_token' value")

    # Create the request URL with the specified model parameter - default is a article quality score request
    request_url = endpoint_url.format(model_name=model_name)

    # Create a compliant request header from the template and the supplied parameters
    headers = {str(key): value.format(**header_params) for key, value in header_format.items()}

    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free data
        # source like ORES - or other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        # Same (connect, read) timeout as the page info request, so a stalled connection cannot hang a worker
        response = requests.post(request_url, headers=headers, data=json.dumps(request_data), timeout=(10, 60))
        json_response = response.json()
    except Exception as e:
        # Best effort by design: report the problem and let the caller decide what to do with None
        print(e)
        json_response = None
    return json_response


In [None]:
# Request an ORES quality prediction for every scraped revision and save the
# (revision_id, article_quality) table to article_scores.csv.
dic = {}              # revision id -> 'articlequality' section of the ORES response
failed_revids = []    # revisions whose response was missing or not in the expected shape
rev_ids = revids.revision_id.values

batch_size = 20
revid_batches = [rev_ids[i:i + batch_size] for i in range(0, len(rev_ids), batch_size)]

# The personal API token is limited to 5000 requests per hour, so a batch of
# `batch_size` requests may be started at most once every batch_size * 3600 / 5000 seconds
requests_per_hour = 5000
batch_pause = batch_size * 3600.0 / requests_per_hour

def process_batch(batch):
    """Score every revision id in ``batch``, recording failures instead of aborting the batch."""
    for revid in batch:
        revid = int(revid)
        data = request_ores_score_per_article(revid)
        try:
            quality = data['enwiki']['scores'][str(revid)]['articlequality']
            has_prediction = 'prediction' in quality['score']
        except (KeyError, TypeError):
            # data is None (request failed) or an error response without the expected keys
            has_prediction = False
        if has_prediction:
            dic[revid] = quality
        else:
            failed_revids.append(revid)

futures = []
with ThreadPoolExecutor(10) as executor:
    for batch in revid_batches:
        futures.append(executor.submit(process_batch, batch))
        time.sleep(batch_pause)

# submit() stores a worker's exception on its future; asking for the result re-raises it here
for future in futures:
    future.result()

if failed_revids:
    print(f'{len(failed_revids)} revisions could not be scored')

# One row per scored revision; every stored entry was checked for a prediction above
scores = pd.DataFrame({
    'revision_id': list(dic),
    'article_quality': [quality['score']['prediction'] for quality in dic.values()],
})
scores.to_csv('article_scores.csv')

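### Step 3: Combining the datasets
Preview the revision and score tables, then load the state populations and regional divisions.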

In [46]:
# Preview the first five rows of the revision-id table
revids.head(5)

Unnamed: 0,article_title,state,revision_id
104730,"Abbeville, Alabama",Alabama,1171163550
104761,"Adamsville, Alabama",Alabama,1177621427
105188,"Addison, Alabama",Alabama,1168359898
104726,"Akron, Alabama",Alabama,1165909508
105109,"Alabaster, Alabama",Alabama,1179139816


In [47]:
# Preview the first five rows of the article quality table
scores.head(5)

Unnamed: 0,revision_id,article_quality
0,1171163550,C
1,1177621427,C
2,1168359898,C
3,1165909508,GA
4,1179139816,C


In [171]:
# Load the population sheet and give its five columns working names
population = pd.read_excel('input_data/NST-EST2022-POP.xlsx')
population.columns = ['state', 'base', '2020pop', '2021pop', 'population']
# Remove any '.' characters from both ends of each state label
population['state'] = population['state'].apply(lambda label: str(label).strip('.'))
# Keep only the states that occur in the article table, and only the two columns used later
population = population.loc[
    population['state'].isin(np.unique(revids['state'])),
    ['state', 'population'],
]
population.to_csv('state_populations.csv')
population.head(5)

Unnamed: 0,state,population
8,Alabama,5074296.0
9,Alaska,733583.0
10,Arizona,7359197.0
11,Arkansas,3045637.0
12,California,39029342.0


In [172]:
# Load the division/state sheet, keeping only the two columns used here
region = pd.read_excel('input_data/US States by Region - US Census Bureau.xlsx')
region = region.loc[:, ['DIVISION', 'STATE']]
region.columns = ['regional_division', 'state']
# Drop the rows in which both columns are empty
region = region.dropna(how='all')
# Division labels are assigned by position: each (name, count) pair labels `count`
# consecutive remaining rows, so this depends on the row order of the sheet
region['regional_division'] = [
    division
    for division, row_count in (
        ('New England', 7),
        ('Middle Atlantic', 4),
        ('East North Central', 6),
        ('West North Central', 8),
        ('South Atlantic', 9),
        ('East South Central', 5),
        ('West South Central', 5),
        ('Mountain', 9),
        ('Pacific', 6),
    )
    for _ in range(row_count)
]
# Keep only the rows whose state has a population entry
region = region.loc[region['state'].isin(population['state'])]
region.to_csv('regional_divisions.csv')
region.head(5)

Unnamed: 0,regional_division,state
3,New England,Maine
4,New England,Massachusetts
5,New England,New Hampshire
6,New England,Rhode Island
7,New England,Vermont


Merge the article, region, score, and population tables into a single table.

In [120]:
# One row per article: attach its regional division, its quality score and its state's population
df = (
    revids
    .merge(region, how='inner', on='state')
    .merge(scores, how='inner', on='revision_id')
    .merge(population, how='inner', on='state')
)
df.head(5)

Unnamed: 0,article_title,state,revision_id,regional_division,article_quality,population
0,"Abbeville, Alabama",Alabama,1171163550,East South Central,C,5074296.0
1,"Adamsville, Alabama",Alabama,1177621427,East South Central,C,5074296.0
2,"Addison, Alabama",Alabama,1168359898,East South Central,C,5074296.0
3,"Akron, Alabama",Alabama,1165909508,East South Central,GA,5074296.0
4,"Alabaster, Alabama",Alabama,1179139816,East South Central,C,5074296.0


In [121]:
# Save the merged article/region/score/population table
df.to_csv('wp_scored_city_articles_by_state.csv')

## Analysis
The tables below report articles per capita, by state and by regional division.


#### Top 10 states by coverage
The ten states with the most articles per capita.

In [141]:
# Number of articles per state
df1 = df.groupby('state')['article_title'].count().reset_index(name='num_articles')
# Attach each state's population and compute articles per person
df1 = population.merge(df1, on='state')
df1['coverage'] = df1['num_articles'] / df1['population']
# Ten states with the highest coverage
df1.sort_values('coverage', ascending=False).head(10)

Unnamed: 0,state,population,num_articles,coverage
42,Vermont,647064.0,328,0.000507
31,North Dakota,779261.0,356,0.000457
17,Maine,1385340.0,483,0.000349
38,South Dakota,909824.0,310,0.000341
13,Iowa,3200517.0,1042,0.000326
1,Alaska,733583.0,148,0.000202
35,Pennsylvania,12972008.0,2554,0.000197
20,Michigan,10034113.0,1772,0.000177
47,Wyoming,581381.0,99,0.00017
26,New Hampshire,1395231.0,234,0.000168


#### Bottom 10 states by coverage
The ten states with the fewest articles per capita.

In [142]:
# Same per-state table ordered from lowest to highest coverage; show the ten lowest
df2 = df1.sort_values('coverage')
df2.head(10)

Unnamed: 0,state,population,num_articles,coverage
30,North Carolina,10698973.0,50,5e-06
25,Nevada,3177772.0,18,6e-06
4,California,39029342.0,476,1.2e-05
2,Arizona,7359197.0,91,1.2e-05
43,Virginia,8683619.0,133,1.5e-05
7,Florida,22244823.0,409,1.8e-05
33,Oklahoma,4019800.0,74,1.8e-05
14,Kansas,2937150.0,63,2.1e-05
18,Maryland,6164660.0,157,2.5e-05
46,Wisconsin,5892539.0,190,3.2e-05


#### Top 10 states by high-quality coverage
The ten states with the most FA or GA articles per capita.

In [147]:
# Restrict to the articles predicted as FA or GA
df3 = df.loc[df['article_quality'].isin(['FA', 'GA'])]
# Number of such articles per state
df3 = df3.groupby('state')['article_title'].count().reset_index(name='num_high_quality_articles')
# Attach each state's population and compute high-quality articles per person
df3 = population.merge(df3, on='state')
df3['coverage'] = df3['num_high_quality_articles'] / df3['population']
# Ten states with the highest high-quality coverage
df3.sort_values('coverage', ascending=False).head(10)

Unnamed: 0,state,population,num_high_quality_articles,coverage
42,Vermont,647064.0,45,7e-05
47,Wyoming,581381.0,39,6.7e-05
38,South Dakota,909824.0,56,6.2e-05
45,West Virginia,1775156.0,106,6e-05
24,Montana,1122867.0,54,4.8e-05
26,New Hampshire,1395231.0,63,4.5e-05
35,Pennsylvania,12972008.0,565,4.4e-05
23,Missouri,6177957.0,262,4.2e-05
1,Alaska,733583.0,31,4.2e-05
27,New Jersey,9261699.0,378,4.1e-05


#### Bottom 10 states by high-quality coverage
The ten states with the fewest FA or GA articles per capita.

In [148]:
# Same high-quality table ordered from lowest to highest coverage; show the ten lowest
df4 = df3.sort_values('coverage')
df4.head(10)

Unnamed: 0,state,population,num_high_quality_articles,coverage
30,North Carolina,10698973.0,20,2e-06
43,Virginia,8683619.0,18,2e-06
25,Nevada,3177772.0,7,2e-06
2,Arizona,7359197.0,24,3e-06
4,California,39029342.0,170,4e-06
7,Florida,22244823.0,117,5e-06
29,New York,19677151.0,111,6e-06
18,Maryland,6164660.0,42,7e-06
14,Kansas,2937150.0,22,7e-06
33,Oklahoma,4019800.0,31,8e-06


#### Regional divisions by coverage
Articles per capita for each regional division, from highest to lowest.

In [166]:
# Number of articles per regional division
df5 = df.groupby('regional_division').count().reset_index()[['regional_division', 'article_title']]
df5.columns = ['regional_division', 'num_articles']
# Total population per division. Only the population column is summed: summing the whole
# merged frame would also "sum" the string state column, which pandas >= 2.0 no longer
# drops and instead turns into a column of concatenated state names.
df5 = (
    population.merge(region, on='state')
    .groupby('regional_division')['population'].sum()
    .reset_index()
    .merge(df5, on='regional_division')
)
# Articles per person, highest first
df5['coverage'] = df5.num_articles/df5.population
df5.sort_values(by='coverage', ascending=False)

Unnamed: 0,regional_division,population,num_articles,coverage
7,West North Central,19721893.0,3570,0.000181
4,New England,11503343.0,1433,0.000125
0,East North Central,47097779.0,4748,0.000101
2,Middle Atlantic,41910858.0,3771,9e-05
1,East South Central,19578002.0,1524,7.8e-05
8,West South Central,41685250.0,2093,5e-05
3,Mountain,25514320.0,1181,4.6e-05
6,South Atlantic,66781137.0,1845,2.8e-05
5,Pacific,53229044.0,1295,2.4e-05


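#### Regional divisions by high-quality coverage
FA or GA articles per capita for each regional division, from highest to lowest.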

In [168]:
df6 = df[df.article_quality.isin(['FA', 'GA'])]
df6 = df6.groupby('regional_division').count().reset_index()[['regional_division', 'article_title']]
df6.columns = ['regional_division', 'num_high_quality_articles']
df6 = population.merge(region, on='state').groupby('regional_division').sum().reset_index().merge(df6, on='regional_division')
df6['coverage'] = df6.num_high_quality_articles/df6.population
df6.sort_values(by='coverage', ascending=False)

Unnamed: 0,regional_division,population,num_high_quality_articles,coverage
7,West North Central,19721893.0,636,3.2e-05
2,Middle Atlantic,41910858.0,1054,2.5e-05
4,New England,11503343.0,224,1.9e-05
1,East South Central,19578002.0,315,1.6e-05
0,East North Central,47097779.0,714,1.5e-05
8,West South Central,41685250.0,630,1.5e-05
3,Mountain,25514320.0,332,1.3e-05
5,Pacific,53229044.0,487,9e-06
6,South Atlantic,66781137.0,524,8e-06
