In [108]:
# 
# These are standard python modules
import json, time, urllib.parse
#
# The 'requests' module is not a standard Python module. You will need to install this with pip/pip3 if you do not already have it
import requests
import pandas

The CSV file is imported. The list of articles is extracted from the resulting dataframe.

In [193]:
# Import the CSV file listing one Wikipedia city article per row.
df = pandas.read_csv('us_cities_by_state_SEPT.2023.csv')
# De-duplicate the titles while keeping the order in which they appear in the file.
# list(set(...)) produces a different ordering on every kernel start (string hashing is
# randomized per Python process), so any later cell that resumes work by positional
# index into this list would silently point at different articles after a restart.
article_name_list = df['page_title'].drop_duplicates().tolist()

A subset of the above dataframe containing just the article title and the state is saved to be used later.

In [368]:
# Two-column view (article title, state) of the source table, kept for later use.
state_article_mapping = df.loc[:, ["page_title", "state"]]
state_article_mapping

Unnamed: 0,page_title,state
0,"Abbeville, Alabama",Alabama
1,"Adamsville, Alabama",Alabama
2,"Addison, Alabama",Alabama
3,"Akron, Alabama",Alabama
4,"Alabaster, Alabama",Alabama
...,...,...
22152,"Wamsutter, Wyoming",Wyoming
22153,"Wheatland, Wyoming",Wyoming
22154,"Worland, Wyoming",Wyoming
22155,"Wright, Wyoming",Wyoming


A few constants are defined that make it easier to access certain values to enhance readability. 

In [113]:
#########
#
#    CONSTANTS (English Wikipedia page-info requests)
#

# Endpoint of the English Wikipedia Action API
API_ENWIKIPEDIA_ENDPOINT = "https://en.wikipedia.org/w/api.php"

# Be polite to a free data resource: pause between requests. The pause is a
# 1/100 s budget per request minus the latency we assume the call itself costs.
API_LATENCY_ASSUMED = 0.002       # assumed API + network latency, in seconds
API_THROTTLE_WAIT = 1.0/100.0 - API_LATENCY_ASSUMED

# Automated requests should identify who is making them (an email address is expected)
REQUEST_HEADERS = {
    'User-Agent': '<anair4@uw.edu>, University of Washington, MSDS DATA 512 - AUTUMN 2023',
}

# The article titles to request page info for
ARTICLE_TITLES = article_name_list

# Extra page properties to return, pipe-separated (see the API "info" documentation).
# Use an empty string if no extended properties are wanted.
PAGEINFO_EXTENDED_PROPERTIES = "|".join(["talkid", "url", "watched", "watchers"])

# Base parameter set for a page-info query; 'titles' is filled in per request
# (a single page title at a time keeps things simple).
PAGEINFO_PARAMS_TEMPLATE = dict(
    action="query",
    format="json",
    titles="",
    prop="info",
    inprop=PAGEINFO_EXTENDED_PROPERTIES,
)


In [18]:
#########
#
#    CONSTANTS
#

#    The current LiftWing ORES API endpoint and prediction model
#
API_ORES_LIFTWING_ENDPOINT = "https://api.wikimedia.org/service/lw/inference/v1/models/{model_name}:predict"
API_ORES_EN_QUALITY_MODEL = "enwiki-articlequality"

#
#    The throttling rate is a function of the access token that you are granted when you request the token.
#
#    Personal Wikimedia API tokens are rate limited to 5000 requests per HOUR, so the time budget per
#    request is 3600 seconds / 5000 requests. The earlier value of 60.0/5000.0 treated the limit as
#    per-minute, which is consistent with the HTTP 429 (Too Many Requests) responses this notebook
#    reports after a few thousand calls.
#    NOTE(review): confirm the unit against the 'ratelimit' entry printed by the token-decoding cell
#    for your own token.
#
#    NOTE: API_LATENCY_ASSUMED and API_THROTTLE_WAIT are also assigned by the Wikipedia page-info
#    constants cell; whichever cell ran last determines the wait used by every request function.
#
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = (3600.0/5000.0)-API_LATENCY_ASSUMED

#    When making automated requests we should include something that is unique to the person making the request
#    This should include an email - your UW email would be good to put in there
#    
#    Because all LiftWing API requests require some form of authentication, you need to provide your access token
#    as part of the header too
#
REQUEST_HEADER_TEMPLATE = {
    'User-Agent': "anair4@uw.edu, University of Washington, MSDS DATA 512 - AUTUMN 2023",
    'Content-Type': 'application/json',
    'Authorization': "Bearer {access_token}"
}
#
#    This is a template for the parameters that we need to supply in the headers of an API request.
#    Only '{access_token}' is referenced by REQUEST_HEADER_TEMPLATE above; 'email_address' is still
#    required by the request function but is not substituted into any header value.
#
REQUEST_HEADER_PARAMS_TEMPLATE = {
    'email_address' : "",         # your email address should go here
    'access_token'  : ""          # the access token you create will need to go here
}

#
#    This is a template of the data required as a payload when making a scoring request of the ORES model
#
ORES_REQUEST_DATA_TEMPLATE = {
    "lang":        "en",     # required that its english - we're scoring English Wikipedia revisions
    "rev_id":      "",       # this request requires a revision id
    "features":    True
}

#
#    These are used later - defined here so they, at least, have empty values
#
USERNAME = ""
ACCESS_TOKEN = ""
#

The request_pageinfo_per_article is a function that creates and sends a request by combining the endpoint url with the parameters for the request.

In [9]:
#########
#
#    PROCEDURES/FUNCTIONS
#

def request_pageinfo_per_article(article_title = None, 
                                 endpoint_url = API_ENWIKIPEDIA_ENDPOINT, 
                                 request_template = PAGEINFO_PARAMS_TEMPLATE,
                                 headers = REQUEST_HEADERS):
    """Request page info for a single article and return the parsed JSON response.

    Parameters:
        article_title: title of the page to look up. If omitted, the 'titles' entry
            already present in request_template is used.
        endpoint_url: URL the GET request is sent to.
        request_template: dict of query parameters; it is copied, never modified.
        headers: HTTP headers sent with the request.

    Returns:
        The decoded JSON body, or None if the request or the JSON decoding failed
        (the error is printed).

    Raises:
        Exception: if no article title is available from either source.
    """
    # Work on a private copy: the default argument is the shared module-level template,
    # and writing the title into it would leak one call's title into every later call.
    request_params = dict(request_template)

    # article title can be as a parameter to the call or in the request_template
    if article_title:
        request_params['titles'] = article_title

    if not request_params['titles']:
        raise Exception("Must supply an article title to make a pageinfo request.")

    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free
        # data source like Wikipedia - or any other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        # A timeout keeps one stalled connection from hanging a multi-hour run forever.
        response = requests.get(endpoint_url, headers=headers, params=request_params, timeout=30)
        json_response = response.json()
    except Exception as e:
        print(f"Page info request for '{request_params['titles']}' failed: {e}")
        json_response = None
    return json_response

The function created above is run for each of the articles which are stored in a list which is iterated through. The last revision ID is extracted from the response and compiled into a dictionary which is stored in a JSON file. 

In [None]:
# Collect the latest revision id of every article, keyed by article title.
rev_id_info = dict()
# Titles for which no revision id could be obtained (failed request or unknown page).
rev_id_failures = []
for article_title in ARTICLE_TITLES:
    print(f"Getting page info data for: {article_title}")
    info = request_pageinfo_per_article(article_title)
    # The request helper returns None on failure, and a page entry may come back without
    # 'lastrevid'; either case used to raise here and discard everything collected so far.
    pages = (info or {}).get('query', {}).get('pages', {})
    # One title is requested at a time, so only the first page entry is of interest.
    page = next(iter(pages.values()), {})
    if 'lastrevid' in page:
        rev_id_info[article_title] = page['lastrevid']
    else:
        rev_id_failures.append(article_title)

with open("rev-id-details.json", "w") as final:
    json.dump(rev_id_info, final)

if rev_id_failures:
    print(f"No revision id found for {len(rev_id_failures)} article(s): {rev_id_failures}")

The username and access token were generated from the Wikimedia API portal. The link is https://api.wikimedia.org/wiki/Authentication#:~:text=Create%20token,place%2C%20like%20a%20password%20manager.

In [29]:
import os

# SECURITY: never store the access token in the notebook. A bearer token committed to a
# shared file can be used by anyone who reads it; the token that was previously pasted
# here should be revoked and replaced.
# Supply the credentials through the environment before starting the kernel, e.g.
#   export WIKIMEDIA_ACCESS_TOKEN="<your token>"
# If the variable is unset ACCESS_TOKEN stays empty and the ORES request function
# refuses to run with "Must provide an 'access_token' value".
USERNAME = os.environ.get("WIKIMEDIA_USERNAME", "Anair12")
ACCESS_TOKEN = os.environ.get("WIKIMEDIA_ACCESS_TOKEN", "")

In [None]:
#
#   Decode the Wikimedia JWT Access token
#
#   NOTE: This is not required to use LiftWing to request ORES scores. This is just being done to satisfy my curiosity.
#
import base64


def _decode_jwt_segment(segment):
    """Decode one JWT segment (header or payload) into the Python object it encodes."""
    # JWT segments use the URL-safe base64 alphabet with the trailing '=' padding removed.
    # Plain b64decode rejects unpadded input and does not understand '-' / '_', so restore
    # the padding and decode with the URL-safe variant.
    padded = segment + "=" * (-len(segment) % 4)
    return json.loads(base64.urlsafe_b64decode(padded).decode())


print("Decoding the ACCESS_TOKEN:")
try:
    token_components = ACCESS_TOKEN.split(".")
    if len(token_components) == 3:
        header = _decode_jwt_segment(token_components[0])
        payload = _decode_jwt_segment(token_components[1])
        print("Token Header:",json.dumps(header,indent=4))
        print("Token Payload:",json.dumps(payload,indent=4))
        # The signature is deliberately not printed.
        print("Token Signature: <value_suppressed>")
        #
        #  One should be able to use public/private keys to actually validate that signature - left as an exercise for later
        #
    else:
        # An empty token also ends up here: "".split(".") yields a single component.
        print(f"The ACCESS_TOKEN appears to be improperly structured. It should have 3 components and it has {len(token_components)}")
except Exception:
    # Reached when ACCESS_TOKEN is not defined or a segment is not valid base64url/JSON.
    print("The ACCESS_TOKEN is undefined or could not be decoded")
    raise

The extracted revision IDs are used as parameters, along with the username and access token, to create and send a request to the ORES API, which allows us to retrieve the predicted quality score of each article.

In [31]:
#########
#
#    PROCEDURES/FUNCTIONS
#

def request_ores_score_per_article(article_revid = None, email_address=None, access_token=None,
                                   endpoint_url = API_ORES_LIFTWING_ENDPOINT, 
                                   model_name = API_ORES_EN_QUALITY_MODEL, 
                                   request_data = ORES_REQUEST_DATA_TEMPLATE, 
                                   header_format = REQUEST_HEADER_TEMPLATE, 
                                   header_params = REQUEST_HEADER_PARAMS_TEMPLATE):
    """Request a model score for one article revision and return the parsed JSON response.

    Parameters:
        article_revid: revision id to score; overrides request_data['rev_id'] when given.
        email_address: overrides header_params['email_address'] when given.
        access_token: overrides header_params['access_token'] when given.
        endpoint_url: URL template containing a '{model_name}' placeholder.
        model_name: value substituted into endpoint_url.
        request_data: dict sent as the JSON request body; copied, never modified.
        header_format: dict of header-name -> format string.
        header_params: dict of values substituted into header_format; copied, never modified.

    Returns:
        The decoded JSON body of a successful response, or None when the request failed,
        the server answered with an HTTP error status, or the body was not valid JSON
        (the problem is printed).

    Raises:
        Exception: if the revision id, email address or access token is missing.
    """
    # Work on private copies: the defaults are shared module-level dicts, and writing into
    # them would carry one call's revision id / token over into every later call.
    request_data = dict(request_data)
    header_params = dict(header_params)

    #    Make sure we have an article revision id, email and token
    #    This approach prioritizes the parameters passed in when making the call
    if article_revid:
        request_data['rev_id'] = article_revid
    if email_address:
        header_params['email_address'] = email_address
    if access_token:
        header_params['access_token'] = access_token
    
    #   Making a request requires a revision id - an email address - and the access token
    if not request_data['rev_id']:
        raise Exception("Must provide an article revision id (rev_id) to score articles")
    if not header_params['email_address']:
        raise Exception("Must provide an 'email_address' value")
    if not header_params['access_token']:
        raise Exception("Must provide an 'access_token' value")
    
    # Create the request URL with the specified model parameter - default is a article quality score request
    request_url = endpoint_url.format(model_name=model_name)
    
    # Create a compliant request header from the template and the supplied parameters
    headers = {str(key): value.format(**header_params) for key, value in header_format.items()}
    
    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free data
        # source like ORES - or other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        # A timeout keeps one stalled connection from hanging a multi-hour run forever.
        response = requests.post(request_url, headers=headers, data=json.dumps(request_data), timeout=60)
        if not response.ok:
            # An error body (for example a 429 rate-limit reply) is not a score; report it
            # as a failure instead of handing it to the caller as if it were one.
            print(f"Scoring request for rev_id {request_data['rev_id']} failed with HTTP {response.status_code}: {response.text[:200]}")
            return None
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response

The above function is called on all of the articles and the predicted quality scores are stored in a dictionary which is written into a JSON file. Ensure that the rate limit is not exceeded.

In [None]:
# Load the revision ids collected earlier (article title -> last revision id).
with open("rev-id-details.json", "r") as rev_id_file:
    rev_id_scores = json.load(rev_id_file)

rev_id_scores_final = {}   # article title -> {"revision_id": ..., "score": ...}
bad_recs = {}              # article title -> {"rev_id": ...} for every article that could not be scored
for j, article_title in enumerate(ARTICLE_TITLES):
    print(article_title, j)
    rev_id = rev_id_scores.get(article_title)
    score = None
    if rev_id is not None:
        rev_id = int(rev_id)
        #    Make the call, just pass in the article revision ID, email address, and access token
        score = request_ores_score_per_article(article_revid=rev_id,
                                               email_address="anair4@uw.edu",
                                               access_token=ACCESS_TOKEN)
    # A usable response carries the scores under "enwiki"; anything else (None, or an
    # error payload) is recorded as a failure rather than raising and ending the run.
    if score is not None and "enwiki" in score:
        rev_id_scores_final[article_title] = {"revision_id": rev_id,
                                              "score": score["enwiki"]["scores"]}
    else:
        # The entry has to be created here: indexing bad_recs[article_title] before it
        # exists raises KeyError.
        bad_recs[article_title] = {"rev_id": rev_id}
#

In [None]:
Dump the results into a JSON file to save.

In [152]:
#    Persist the scores gathered so far as JSON
with open("score-details.json", "w") as score_file:
    json.dump(rev_id_scores_final, score_file)
#

In [132]:
# Read the saved revision-id file back from disk.
# NOTE(review): the variable name suggests scores, but the file read here is the
# revision-id mapping, and no cell visible in this notebook reads
# rev_id_scores_final2 afterwards - confirm what was intended.
with open("rev-id-details.json", "r") as rev_id_file:
    rev_id_scores_final2 = json.load(rev_id_file)

Because the number of requests was so high, the API stopped responding after every 3000-4000 requests, returning a 429 error code (too many requests). The dictionary of responses received so far was therefore saved to a file, and the loop was run again from the point of failure to update the dictionary with the remaining values. The process takes about 7 hours in total. A dictionary to store the unclassified articles was also created, but all the articles were given a classification.

In [None]:
# Resume scoring at position 8925 of ARTICLE_TITLES. The index is only meaningful while
# ARTICLE_TITLES has the same order it had in the run that stopped there.
bad_recs2 = {}   # article title -> {"rev_id": ...} for articles this pass could not score
for j in range(8925, len(ARTICLE_TITLES)):
    article_title = ARTICLE_TITLES[j]
    print(article_title, j)
    rev_id = rev_id_scores.get(article_title)
    score = None
    if rev_id is not None:
        rev_id = int(rev_id)
        #    Make the call, just pass in the article revision ID, email address, and access token
        score = request_ores_score_per_article(article_revid=rev_id,
                                               email_address="anair4@uw.edu",
                                               access_token=ACCESS_TOKEN)
    # A usable response carries the scores under "enwiki"; anything else is a failure.
    if score is not None and "enwiki" in score:
        rev_id_scores_final[article_title] = {"revision_id": rev_id,
                                              "score": score["enwiki"]["scores"]}
    else:
        # Record the failure in this pass's own dictionary. Writing to
        # bad_recs[article_title]["rev_id"] raised KeyError because the inner entry
        # was never created, and it targeted the first pass's dictionary.
        bad_recs2[article_title] = {"rev_id": rev_id}
#

In [154]:
#    Persist the scores gathered so far as JSON
with open("score-details2.json", "w") as score_file:
    json.dump(rev_id_scores_final, score_file)
#

In [None]:
# Resume scoring at position 11778 of ARTICLE_TITLES. The index is only meaningful while
# ARTICLE_TITLES has the same order it had in the run that stopped there.
bad_recs3 = {}   # article title -> {"rev_id": ...} for articles this pass could not score
for j in range(11778, len(ARTICLE_TITLES)):
    article_title = ARTICLE_TITLES[j]
    print(article_title, j)
    rev_id = rev_id_scores.get(article_title)
    score = None
    if rev_id is not None:
        rev_id = int(rev_id)
        #    Make the call, just pass in the article revision ID, email address, and access token
        score = request_ores_score_per_article(article_revid=rev_id,
                                               email_address="anair4@uw.edu",
                                               access_token=ACCESS_TOKEN)
    # A usable response carries the scores under "enwiki"; anything else is a failure.
    if score is not None and "enwiki" in score:
        rev_id_scores_final[article_title] = {"revision_id": rev_id,
                                              "score": score["enwiki"]["scores"]}
    else:
        # Record the failure in this pass's own dictionary. Writing to
        # bad_recs[article_title]["rev_id"] raised KeyError because the inner entry
        # was never created, and it targeted the first pass's dictionary.
        bad_recs3[article_title] = {"rev_id": rev_id}
#

In [156]:
#    Persist the scores gathered so far as JSON
with open("score-details3.json", "w") as score_file:
    json.dump(rev_id_scores_final, score_file)
#

In [None]:
# Resume scoring at position 14533 of ARTICLE_TITLES. The index is only meaningful while
# ARTICLE_TITLES has the same order it had in the run that stopped there.
bad_recs4 = {}   # article title -> {"rev_id": ...} for articles this pass could not score
for j in range(14533, len(ARTICLE_TITLES)):
    article_title = ARTICLE_TITLES[j]
    print(article_title, j)
    rev_id = rev_id_scores.get(article_title)
    score = None
    if rev_id is not None:
        rev_id = int(rev_id)
        #    Make the call, just pass in the article revision ID, email address, and access token
        score = request_ores_score_per_article(article_revid=rev_id,
                                               email_address="anair4@uw.edu",
                                               access_token=ACCESS_TOKEN)
    # A usable response carries the scores under "enwiki"; anything else is a failure.
    if score is not None and "enwiki" in score:
        rev_id_scores_final[article_title] = {"revision_id": rev_id,
                                              "score": score["enwiki"]["scores"]}
    else:
        # The failure branch referred to bad_rec4, a name that is never defined
        # (NameError); the dictionary declared above is bad_recs4.
        bad_recs4[article_title] = {"rev_id": rev_id}
#

In [159]:
#    Persist the scores gathered so far as JSON
with open("score-details4.json", "w") as score_file:
    json.dump(rev_id_scores_final, score_file)
#

In [None]:
# Resume scoring at position 21026 of ARTICLE_TITLES. The index is only meaningful while
# ARTICLE_TITLES has the same order it had in the run that stopped there.
bad_recs5 = {}   # article title -> {"rev_id": ...} for articles this pass could not score
for j in range(21026, len(ARTICLE_TITLES)):
    article_title = ARTICLE_TITLES[j]
    print(article_title, j)
    rev_id = rev_id_scores.get(article_title)
    score = None
    if rev_id is not None:
        rev_id = int(rev_id)
        #    Make the call, just pass in the article revision ID, email address, and access token
        score = request_ores_score_per_article(article_revid=rev_id,
                                               email_address="anair4@uw.edu",
                                               access_token=ACCESS_TOKEN)
    # A usable response carries the scores under "enwiki"; anything else is a failure.
    if score is not None and "enwiki" in score:
        rev_id_scores_final[article_title] = {"revision_id": rev_id,
                                              "score": score["enwiki"]["scores"]}
    else:
        # The failure branch referred to bad_rec5, a name that is never defined
        # (NameError); the dictionary declared above is bad_recs5.
        bad_recs5[article_title] = {"rev_id": rev_id}
#

In [161]:
#    Persist the scores gathered so far as JSON
with open("score-details5.json", "w") as score_file:
    json.dump(rev_id_scores_final, score_file)
#

In [None]:
# Resume scoring at position 21144 of ARTICLE_TITLES. The index is only meaningful while
# ARTICLE_TITLES has the same order it had in the run that stopped there.
# This pass gets its own failure dictionary: re-initialising bad_recs5 here would discard
# whatever the previous pass recorded. (The output file of this pass is numbered 6 too.)
bad_recs6 = {}   # article title -> {"rev_id": ...} for articles this pass could not score
for j in range(21144, len(ARTICLE_TITLES)):
    article_title = ARTICLE_TITLES[j]
    print(article_title, j)
    rev_id = rev_id_scores.get(article_title)
    score = None
    if rev_id is not None:
        rev_id = int(rev_id)
        #    Make the call, just pass in the article revision ID, email address, and access token
        score = request_ores_score_per_article(article_revid=rev_id,
                                               email_address="anair4@uw.edu",
                                               access_token=ACCESS_TOKEN)
    # A usable response carries the scores under "enwiki"; anything else is a failure.
    if score is not None and "enwiki" in score:
        rev_id_scores_final[article_title] = {"revision_id": rev_id,
                                              "score": score["enwiki"]["scores"]}
    else:
        # The failure branch referred to bad_rec5, a name that is never defined (NameError).
        bad_recs6[article_title] = {"rev_id": rev_id}
#

In [164]:
#    Persist the scores gathered so far as JSON
with open("score-details6.json", "w") as score_file:
    json.dump(rev_id_scores_final, score_file)
#

In [379]:
# Title -> state lookup built once. Keeping the first row per title reproduces the
# "first matching row" choice of a per-title filter, without scanning the whole
# frame again for every single article.
state_by_title = df.drop_duplicates(subset='page_title').set_index('page_title')['state']

# Parallel lists, one entry per scored article, used to assemble the analysis table.
article_title_list = []
state_list = []
rev_id_list = []
classification_list = []
for key, record in rev_id_scores_final.items():
    rev_id = record["revision_id"]
    article_title_list.append(key)
    # State labels use '_' in place of spaces in the source table.
    state_list.append(state_by_title[key].replace('_', ' '))
    rev_id_list.append(rev_id)
    # The scores are keyed by the revision id as a string.
    classification_list.append(record["score"][str(rev_id)]["articlequality"]["score"]["prediction"])


A new dataframe is created by combining the article title, revision id, state and the article quality which is the classification of the article from the ORES API. This information is taken from the dictionary that was created above.

In [449]:
# Assemble the per-article table from the parallel lists and save it as CSV.
data = dict(
    state=state_list,
    article_title=article_title_list,
    revision_id=rev_id_list,
    article_quality=classification_list,
)
data_to_analyze = pandas.DataFrame(data)
data_to_analyze.to_csv('wp_scored_city_articles_by_state.csv')

The US States by Region - US Census Bureau sheet is read in which specifies the regional division that each state belongs to. Using the ffill() function, the file is formatted in order to create a mapping between regional division and state.

In [383]:
# Read the region/division sheet and forward-fill REGION and DIVISION so that every
# row carries the most recent non-empty value above it.
region_df = pandas.read_csv('US States by Region - US Census Bureau - Sheet1.csv')
cols = ['REGION', 'DIVISION']
region_df[cols] = region_df[cols].ffill()
# Keep only the division -> state mapping, under the column names used by the merges.
region_df = (
    region_df
    .rename(columns={'DIVISION': 'regional_division', 'STATE': 'state'})
    .drop(columns='REGION')
)
region_df

Unnamed: 0,regional_division,state
0,,
1,New England,
2,New England,Connecticut
3,New England,Maine
4,New England,Massachusetts
...,...,...
58,Pacific,Alaska
59,Pacific,California
60,Pacific,Hawaii
61,Pacific,Oregon


The file containing the populations of the states and regions is downloaded from https://www.census.gov/data/tables/time-series/demo/popest/2020s-state-total.html. The file is then read in and formatted by dropping all the unnecessary columns and rows. The leading "." before each state name is removed, as are the commas in the population values, because the population is converted to a float during the analysis.

In [None]:
# Raw population sheet as downloaded.
pop_df = pandas.read_csv('state_population.csv')
# Drop the columns that are not needed.
rel_columns = pop_df.drop(columns=['Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3'])
# Drop the unwanted rows (by row label), name the two remaining columns, then strip
# the leading '.' and every ',' from the cell values.
# NOTE(review): the row labels are hard-coded for this particular download - re-check
# them if the source file changes.
rel_rows = (
    rel_columns
    .drop(index=[0, 1, 2, 61, 62, 63, 64, 65, 59])
    .reset_index(drop=True)
    .rename(columns={'table with row headers in column A and column headers in rows 3 through 4. (leading dots indicate sub-parts)':'state', 'Unnamed: 4': 'population'})
    .replace('^\\.', '', regex=True)
    .replace(',', '', regex=True)
)

The populations of each state are merged into the final dataset. An inner join is used over the state which acts as a foreign key. All values which are not states are ignored.

In [None]:
# Attach each state's population to its articles (inner join on 'state').
# NOTE(review): rows whose state label has no exact match in the population table are
# dropped silently by the inner join - worth checking which labels fail to match.
merged_data = data_to_analyze.merge(rel_rows, how='inner', left_on='state', right_on='state')

The regional divisions of each state are merged into the final dataset as well. An inner join is used over the state which acts as a foreign key.

In [None]:
# Attach each state's regional division (inner join on 'state').
final_data = merged_data.merge(region_df, how='inner', left_on='state', right_on='state')

In order to identify the 10 US states with the highest total articles per capita (in descending order), the subset of the relevant columns are first identified. It is then grouped by state and population and the total number of articles is identified. This is then divided by the state population to get the highest total articles per capita. The data frame is then sorted according to the requirements of the question. 

In [437]:
# Only the state and its population are needed for a per-state article count
q1_data = final_data.loc[:, ['state', 'population']]
# One row per (state, population) pair; column 0 holds the number of articles
q1_data_grouped = q1_data.groupby(['state', 'population']).size().reset_index()
# Turn the count into articles per capita (population is cast to float first)
q1_data_grouped[0] = q1_data_grouped[0].div(q1_data_grouped['population'].astype(float))
# Two orderings of the same table: highest coverage first, and lowest coverage first
sorted_q1_data = q1_data_grouped.sort_values(by=0, ascending=False)
sorted_q2_data = q1_data_grouped.sort_values(by=0, ascending=True)

The 10 US states with the highest total articles per capita (in descending order) are identified.

In [438]:
# Label the per-capita column and show the first ten rows with a fresh 0..9 index.
sorted_q1_data = sorted_q1_data.rename(columns={0: "Top 10 US states by coverage"})
sorted_q1_data.reset_index(drop=True).head(10)

Unnamed: 0,state,population,Top 10 US states by coverage
0,Vermont,647064,0.000508
1,North Dakota,779261,0.000457
2,Maine,1385340,0.000349
3,South Dakota,909824,0.000342
4,Iowa,3200517,0.000326
5,Alaska,733583,0.000203
6,Pennsylvania,12972008,0.000197
7,Michigan,10034113,0.000177
8,Wyoming,581381,0.00017
9,New Hampshire,1395231,0.000168


The 10 US states with the lowest total articles per capita (in ascending order) are identified.

In [439]:
# Label the per-capita column and show the first ten rows with a fresh 0..9 index.
sorted_q2_data = sorted_q2_data.rename(columns={0: "Bottom 10 US states by coverage"})
sorted_q2_data.reset_index(drop=True).head(10)

Unnamed: 0,state,population,Bottom 10 US states by coverage
0,North Carolina,10698973,5e-06
1,Nevada,3177772,6e-06
2,California,39029342,1.2e-05
3,Arizona,7359197,1.2e-05
4,Virginia,8683619,1.5e-05
5,Florida,22244823,1.9e-05
6,Oklahoma,4019800,1.9e-05
7,Kansas,2937150,2.1e-05
8,Maryland,6164660,2.5e-05
9,Wisconsin,5892539,3.2e-05


High quality articles are defined as the ones that have been classified as either FA or GA which stands for Featured Article or Good Article. To identify the number of high quality articles per capita, they are extracted into a new dataframe. They are then grouped by state and population and the total number of articles is identified which is then divided by the state population to get the highest total high quality articles per capita. The data frame is then sorted according to the requirements of the question. 

In [440]:
# Columns needed for the high-quality article count
q3_data = final_data.loc[:, ['state', 'population', 'article_quality']]
# Keep only the articles classified FA or GA
is_high_quality = q3_data['article_quality'].isin(['FA', 'GA'])
q3_data = q3_data[is_high_quality]
# One row per (state, population) pair; column 0 holds the number of such articles
q3_data_grouped = q3_data.groupby(['state', 'population']).size().reset_index()
# Turn the count into high quality articles per capita (population is cast to float first)
q3_data_grouped[0] = q3_data_grouped[0].div(q3_data_grouped['population'].astype(float))
# Two orderings of the same table: highest first, and lowest first
sorted_q3_data = q3_data_grouped.sort_values(by=0, ascending=False)
sorted_q4_data = q3_data_grouped.sort_values(by=0, ascending=True)

The 10 US states with the highest high quality articles per capita (in descending order) are identified.

In [445]:
# Label the per-capita column and show the first ten rows with a fresh 0..9 index.
sorted_q3_data = sorted_q3_data.rename(columns={0: "Top 10 US states by high quality"})
sorted_q3_data.head(10).reset_index(drop=True)

Unnamed: 0,state,population,Top 10 US states by high quality
0,Vermont,647064,7e-05
1,Wyoming,581381,6.7e-05
2,South Dakota,909824,6.2e-05
3,West Virginia,1775156,6e-05
4,Montana,1122867,4.9e-05
5,New Hampshire,1395231,4.5e-05
6,Pennsylvania,12972008,4.4e-05
7,Missouri,6177957,4.3e-05
8,Alaska,733583,4.2e-05
9,New Jersey,9261699,4.1e-05


The 10 US states with the lowest high quality articles per capita (in ascending order) are identified.

In [446]:
# Label the per-capita column and show the first ten rows with a fresh 0..9 index.
sorted_q4_data = sorted_q4_data.rename(columns={0: "Bottom 10 US states by high quality"})
sorted_q4_data.head(10).reset_index(drop=True)

Unnamed: 0,state,population,Bottom 10 US states by high quality
0,North Carolina,10698973,2e-06
1,Virginia,8683619,2e-06
2,Nevada,3177772,3e-06
3,Arizona,7359197,3e-06
4,California,39029342,4e-06
5,Florida,22244823,5e-06
6,New York,19677151,6e-06
7,Maryland,6164660,7e-06
8,Kansas,2937150,7e-06
9,Oklahoma,4019800,8e-06


In order to identify total articles per capita for a regional division, we need to group by regional division and calculate the number of articles that are associated with that division. The other steps all remain the same as the above.

In [447]:
#Identify the relevant columns; take an explicit copy so that the column assignment below
#modifies this frame rather than a slice of final_data (this is what triggered pandas'
#"value is trying to be set on a copy of a slice" warning)
q5_data = final_data[['regional_division', 'state', 'population']].copy()
#Convert the population column from string to float
q5_data['population'] = q5_data['population'].astype(float)
#Population per regional division: count every state once, then sum within each division
q5_data_grouped_population = (
    q5_data
    .drop_duplicates()
    .groupby(['regional_division'])['population']
    .sum()
    .reset_index()
)
#Group by regional division and get the number of articles for each regional division
q5_data_grouped = q5_data.groupby(['regional_division']).size().reset_index()
#Divide the two for per capita values, matching rows by division name rather than by
#row position so the result does not depend on both frames sharing the same row order
division_population = q5_data_grouped_population.set_index('regional_division')['population']
q5_data_grouped[0] = q5_data_grouped[0]/q5_data_grouped['regional_division'].map(division_population)
#sort in descending order
sorted_q5_data = q5_data_grouped.sort_values(by=0, ascending=False)
sorted_q5_data = sorted_q5_data.rename(columns = {0:"Census divisions by total coverage"})
sorted_q5_data.reset_index(drop=True)[0:10]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  q5_data['population'] = q5_data['population'].astype(float)


Unnamed: 0,regional_division,Census divisions by total coverage
0,West North Central,0.000181
1,New England,0.000125
2,East North Central,0.000101
3,Middle Atlantic,9e-05
4,East South Central,7.8e-05
5,West South Central,5e-05
6,Mountain,4.7e-05
7,Pacific,2.4e-05
8,South Atlantic,2.3e-05


In order to identify the number of total high quality articles per capita for a regional division, we need to group by regional division and calculate the number of articles that are associated with that division among the entries that are classified as either FA or GA. The other steps all remain the same as the above.

In [448]:
#Identify the relevant columns
q6_data = final_data[['regional_division','state','article_quality','population']]
#Extract the values classified as FA and GA
q6_data = q6_data.loc[q6_data['article_quality'].isin(['FA','GA'])]
#Convert the population column from string to float; assign() returns a new frame, so
#nothing is written into a slice of final_data
q6_data = q6_data.assign(population=q6_data['population'].astype(float))
#Population per regional division, computed here from ALL states in final_data.
#The per-capita denominator is the whole division's population, not only the population of
#states that happen to have an FA/GA article. The division previously borrowed
#q5_data_grouped_population from another cell (a hidden cross-cell dependency) while the
#population frame computed in this cell went unused.
q6_data_grouped_population = (
    final_data[['regional_division', 'state', 'population']]
    .drop_duplicates()
    .assign(population=lambda frame: frame['population'].astype(float))
    .groupby(['regional_division'])['population']
    .sum()
    .reset_index()
)
#Group by regional division and get the number of articles for each regional division
q6_data_grouped = q6_data.groupby(['regional_division']).size().reset_index()
#Divide the two for per capita values, matching rows by division name: a division with no
#FA/GA article is absent from q6_data_grouped, and dividing by row position would then pair
#counts with the wrong division's population
division_population = q6_data_grouped_population.set_index('regional_division')['population']
q6_data_grouped[0] = q6_data_grouped[0]/q6_data_grouped['regional_division'].map(division_population)
#sort in descending order
sorted_q6_data = q6_data_grouped.sort_values(by=0, ascending=False)
sorted_q6_data = sorted_q6_data.rename(columns = {0:"Census divisions by high quality coverage"})
sorted_q6_data.reset_index(drop=True)[0:10]

Unnamed: 0,regional_division,Census divisions by high quality coverage
0,West North Central,3.2e-05
1,Middle Atlantic,2.5e-05
2,New England,2e-05
3,East South Central,1.6e-05
4,East North Central,1.5e-05
5,West South Central,1.5e-05
6,Mountain,1.3e-05
7,Pacific,9e-06
8,South Atlantic,8e-06
