Import the Required Packages:

In [2]:
# standard python packages
import json, time, urllib.parse
import requests

# packages for data manipulation
import pandas as pd
from pandas.io.json import json_normalize
import numpy as np

# package to ignore the warnings
import warnings
warnings.filterwarnings("ignore")

API call to get the Wikipedia Articles

In [3]:
#########
#This code has been taken from Dr David Mcdonald's repository
#
#    CONSTANTS
#

# The basic English Wikipedia API endpoint
API_ENWIKIPEDIA_ENDPOINT = "https://en.wikipedia.org/w/api.php"

# We'll assume that there needs to be some throttling for these requests - we should always be nice to a free data resource
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = (1.0/100.0)-API_LATENCY_ASSUMED

# When making automated requests we should include something that is unique to the person making the request
# This should include an email - your UW email would be good to put in there
REQUEST_HEADERS = {
    'User-Agent': '<uwnetid@uw.edu>, University of Washington, MSDS DATA 512 - AUTUMN 2023',
}

# This is a string of additional page properties that can be returned see the Info documentation for
# what can be included. If you don't want any this can simply be the empty string
PAGEINFO_EXTENDED_PROPERTIES = "talkid|url|watched|watchers"
#PAGEINFO_EXTENDED_PROPERTIES = ""

# This template lists the basic parameters for making this
PAGEINFO_PARAMS_TEMPLATE = {
    "action": "query",
    "format": "json",
    "titles": "",           # to simplify this should be a single page title at a time
    "prop": "info",
    "inprop": PAGEINFO_EXTENDED_PROPERTIES
}

In [4]:
# importing states by region list

region = pd.read_excel('/content/US States by Region - US Census Bureau.xlsx')
region.head()

Unnamed: 0,REGION,DIVISION,STATE
0,Northeast,,
1,,New England,
2,,,Connecticut
3,,,Maine
4,,,Massachusetts


In [5]:
# importing U.S. City Data from CSV into a Pandas DataFrame
states = pd.read_csv('/content/us_cities_by_state_SEPT.2023.csv')
states.head()

Unnamed: 0,state,page_title,url
0,Alabama,"Abbeville, Alabama","https://en.wikipedia.org/wiki/Abbeville,_Alabama"
1,Alabama,"Adamsville, Alabama","https://en.wikipedia.org/wiki/Adamsville,_Alabama"
2,Alabama,"Addison, Alabama","https://en.wikipedia.org/wiki/Addison,_Alabama"
3,Alabama,"Akron, Alabama","https://en.wikipedia.org/wiki/Akron,_Alabama"
4,Alabama,"Alabaster, Alabama","https://en.wikipedia.org/wiki/Alabaster,_Alabama"


In [6]:
# list of Wikipedia article titles
ARTICLE_TITLES = states['state'].tolist()

In [7]:
#########
#
#    PROCEDURES/FUNCTIONS
#

def request_pageinfo_per_article(article_title = None,
                                 endpoint_url = API_ENWIKIPEDIA_ENDPOINT,
                                 request_template = PAGEINFO_PARAMS_TEMPLATE,
                                 headers = REQUEST_HEADERS):

    # article title can be as a parameter to the call or in the request_template
    if article_title:
        request_template['titles'] = article_title

    if not request_template['titles']:
        raise Exception("Must supply an article title to make a pageinfo request.")

    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free
        # data source like Wikipedia - or any other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(endpoint_url, headers=headers, params=request_template)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response


In [8]:
# fetching the articles lastrevid
all_info = []
for i in range(0, len(ARTICLE_TITLES), 50):

    # joining 50 articles name in one string and then passing it through the API call function
    info = request_pageinfo_per_article("|".join(ARTICLE_TITLES[i:i+50]))

    # appending the output to the final list
    all_info.extend(list(info['query']['pages'].values()))

In [11]:
from tqdm import tqdm

# Request and store Page Info for Wikipedia Articles in a CSV file
data = []
for title in tqdm(ARTICLE_TITLES):
    info = request_pageinfo_per_article(title)
    if 'query' in info and 'pages' in info['query']:
        pages = info['query']['pages']
        for key, value in pages.items():
            if 'lastrevid' in value and 'title' in value:
                data.append({'Title': value['title'], 'Last_Revision_ID': value['lastrevid']})

# Convert the data into a DataFrame and store it in a CSV file
df_info = pd.DataFrame(data)
df_info.to_csv('/content/wiki_page_info.csv', index=False)

100%|██████████| 22157/22157 [1:29:55<00:00,  4.11it/s]


Now we have the lastrevid of each article, so making an API call to ORES to get the Score of article.

In [9]:
# Constants for the ORES API
API_ORES_LIFTWING_ENDPOINT = "https://api.wikimedia.org/service/lw/inference/v1/models/{model_name}:predict"
API_ORES_EN_QUALITY_MODEL = "enwiki-articlequality"
API_LATENCY_ASSUMED = 0.002
API_THROTTLE_WAIT = (60.0 / 5000.0) - API_LATENCY_ASSUMED

In [10]:
# Other necessary constants
USERNAME = "<Qwertyishank09>"
ACCESS_TOKEN = "eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJhdWQiOiI2NmQxMzVlOWRjNTMxZTE3NjQzMDEyYWEzOTJhYTc4YSIsImp0aSI6ImU4Y2Q3ODRjMmIzN2FhZDE2YTU1OWQ1MTY5MmFmZDM4Y2ZiMzVlNTgzMDNkZTU1YWJjNWEwNTg1ODIzZTA4Njk3NWU2M2M2NmNlYjAwMTA5IiwiaWF0IjoxNjk3NDQyMDEzLjk5NzMyMSwibmJmIjoxNjk3NDQyMDEzLjk5NzMyNCwiZXhwIjozMzI1NDM1MDgxMy45OTU4MSwic3ViIjoiNzQwMTUxODIiLCJpc3MiOiJodHRwczovL21ldGEud2lraW1lZGlhLm9yZyIsInJhdGVsaW1pdCI6eyJyZXF1ZXN0c19wZXJfdW5pdCI6NTAwMCwidW5pdCI6IkhPVVIifSwic2NvcGVzIjpbImJhc2ljIl19.ZXkfn9L3vw_ddmS-ZHFF4RKEuF5qW8mFnSzlwYHEtfiGhrKrVAc0bwD4S2nQHDE0CyhFx6MstEtl9d7qrxukZae3Xbib0sX4sZ0FFSFt1-bxWsMhnAM2qpbd13wr9y2wzEnDooApXDjo2m7y9gtifhlJ-_JhVj3ww7HaDFuhjOGpnebZz-UNIWRpMkLVmMLFpuEziJ-xpf5eUP5EPUWn46h4D20dH67Y8EpngVhrOyhmhWqmNGolqRppX4ARidyinX4RIdMBtbH870UscgLUrx4xfVGf5PdC6as1v7aKvw1TpWigZT9MmWGnMmmcnIjV2K5htT21snUeSVoCtGuTof_lNTnpyYXJ3AlSCdsAQilPVHjZ10wZJeYrfsYBwN3n8eHyAbqBb-gOSo6RpZRFTPG7emXGirSbzTIwRsGvfJx-M8gR0ylC6kUFtTSwFAM4JdxOtEXWArPqRrxcY_aOv-bRHfx9sb3E_dBtCbOKgfr45pqcl6DnJ8716ZK2QRhEEgOqRQOc60o6vYLrq4IB78U1tHFDG1VZiIRNFCH5Ef8HPk5hNEt_oiz7ZT9-J1pbNqE3KwkNL9FV7iMyslIn6kxIodtD7czwaTnyjTC68QL1ZvZnZFaVxhX-zozFe-5nTkZMraCfun7K3KPh-5bw52lLpiggZ97D56YXqBGGscM"

# Function to make the ORES API request
def request_ores_score_per_article(article_title, article_revid, email_address, access_token):
    request_data = {
        "lang": "en",
        "rev_id": article_revid,
        "features": True
    }

    header_params = {
        'email_address': email_address,
        'access_token': access_token
    }

    headers = {
        'User-Agent': f"{email_address}, University of Washington, MSDS DATA 512 - AUTUMN 2023",
        'Content-Type': 'application/json',
        'Authorization': f"Bearer {access_token}"
    }

    request_url = API_ORES_LIFTWING_ENDPOINT.format(model_name=API_ORES_EN_QUALITY_MODEL)
    try_count = 0
    while try_count < 10:
        try:
            if API_THROTTLE_WAIT > 0.0:
                time.sleep(API_THROTTLE_WAIT)
            response = requests.post(request_url, headers=headers, data=json.dumps(request_data))
            response.raise_for_status()  # Check for non-200 status
            json_response = response.json()
            return json_response
        except requests.exceptions.HTTPError as http_err:
            print(f"HTTP error occurred: {http_err}")
            print(f"Status code: {response.status_code}")
            print(f"Retrying after {2 ** try_count} seconds.")
            time.sleep(2 ** try_count)
            try_count += 1
        except requests.exceptions.RequestException as e:
            print(f"Request failed for {article_title}. Retrying after {2 ** try_count} seconds.")
            time.sleep(2 ** try_count)
            try_count += 1
    return None

Converting the wikipedia page information from csv to a dictionary:

In [11]:
# Your CSV file name
csv_file = '/content/wiki_page_info.csv'
# Reading the CSV file and converting it to a list of dictionaries
articles = pd.read_csv(csv_file).to_dict(orient='records')
articles


[{'Title': 'Abbeville, Alabama', 'Last_Revision_ID': 1171163550},
 {'Title': 'Adamsville, Alabama', 'Last_Revision_ID': 1177621427},
 {'Title': 'Addison, Alabama', 'Last_Revision_ID': 1168359898},
 {'Title': 'Akron, Alabama', 'Last_Revision_ID': 1165909508},
 {'Title': 'Alabaster, Alabama', 'Last_Revision_ID': 1179139816},
 {'Title': 'Albertville, Alabama', 'Last_Revision_ID': 1179198677},
 {'Title': 'Alexander City, Alabama', 'Last_Revision_ID': 1179140073},
 {'Title': 'Aliceville, Alabama', 'Last_Revision_ID': 1167792390},
 {'Title': 'Allgood, Alabama', 'Last_Revision_ID': 1165909718},
 {'Title': 'Altoona, Alabama', 'Last_Revision_ID': 1165909823},
 {'Title': 'Andalusia, Alabama', 'Last_Revision_ID': 1179141586},
 {'Title': 'Anderson, Lauderdale County, Alabama',
  'Last_Revision_ID': 662691565},
 {'Title': 'Anniston, Alabama', 'Last_Revision_ID': 1176049382},
 {'Title': 'Arab, Alabama', 'Last_Revision_ID': 1171375371},
 {'Title': 'Ardmore, Alabama', 'Last_Revision_ID': 1176903479},


In [13]:
import csv
from tqdm import tqdm

# Loop over each article to get the ORES score
for article in tqdm(articles):
    title = article["Title"]
    rev_id = article["Last_Revision_ID"]

    score = request_ores_score_per_article(title, rev_id, USERNAME, ACCESS_TOKEN)



    if score is not None:
        try:
            prediction = score["enwiki"]["scores"][str(rev_id)]["articlequality"]["score"]["prediction"]
        except KeyError as e:
            print(f"KeyError: {e}. One of the required keys is missing in the response for Title: {title}, Rivision Id: {rev_id}")
            prediction = "N/A"  # Set default value for prediction

        # Define the file path in Google Colab
        csv_path = '/content/ores_predictions.csv'

        # Write data to the CSV file
        with open(csv_path, mode='a', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(['Title', 'Revision_ID', 'Prediction'])
            # Write data row
            writer.writerow([title, rev_id, prediction])
    else:
        print(f"Failed to get score for {title}.")

  0%|          | 80/22157 [00:47<2:57:22,  2.07it/s]

HTTP error occurred: 504 Server Error: Gateway Timeout for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 504
Retrying after 1 seconds.


 20%|█▉        | 4381/22157 [37:02<2:01:02,  2.45it/s]

HTTP error occurred: 504 Server Error: Gateway Timeout for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 504
Retrying after 1 seconds.


 20%|██        | 4476/22157 [38:18<2:42:38,  1.81it/s]

HTTP error occurred: 504 Server Error: Gateway Timeout for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 504
Retrying after 1 seconds.


 21%|██        | 4688/22157 [40:22<1:59:34,  2.43it/s]

HTTP error occurred: 504 Server Error: Gateway Timeout for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 504
Retrying after 1 seconds.


 22%|██▏       | 4823/22157 [41:54<2:11:10,  2.20it/s]

HTTP error occurred: 504 Server Error: Gateway Timeout for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 504
Retrying after 1 seconds.


 23%|██▎       | 4990/22157 [43:38<2:07:58,  2.24it/s]

HTTP error occurred: 429 Client Error: Too Many Requests for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 429
Retrying after 1 seconds.
HTTP error occurred: 429 Client Error: Too Many Requests for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 429
Retrying after 2 seconds.
HTTP error occurred: 429 Client Error: Too Many Requests for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 429
Retrying after 4 seconds.
HTTP error occurred: 429 Client Error: Too Many Requests for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 429
Retrying after 8 seconds.
HTTP error occurred: 429 Client Error: Too Many Requests for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 429
Retrying after 16 seconds.
HTTP error occurred

 29%|██▉       | 6401/22157 [58:44<2:31:43,  1.73it/s]

HTTP error occurred: 504 Server Error: Gateway Timeout for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 504
Retrying after 1 seconds.


 29%|██▉       | 6500/22157 [59:59<1:40:58,  2.58it/s]

HTTP error occurred: 504 Server Error: Gateway Timeout for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 504
Retrying after 1 seconds.


 45%|████▌     | 9988/22157 [1:25:33<1:10:51,  2.86it/s]

HTTP error occurred: 429 Client Error: Too Many Requests for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 429
Retrying after 1 seconds.
HTTP error occurred: 429 Client Error: Too Many Requests for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 429
Retrying after 2 seconds.
HTTP error occurred: 429 Client Error: Too Many Requests for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 429
Retrying after 4 seconds.
HTTP error occurred: 429 Client Error: Too Many Requests for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 429
Retrying after 8 seconds.
HTTP error occurred: 429 Client Error: Too Many Requests for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 429
Retrying after 16 seconds.
HTTP error occurred

 45%|████▌     | 9989/22157 [1:42:39<1041:13:58, 308.06s/it]

Failed to get score for Seneca Township, Michigan.
HTTP error occurred: 429 Client Error: Too Many Requests for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 429
Retrying after 1 seconds.
HTTP error occurred: 429 Client Error: Too Many Requests for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 429
Retrying after 2 seconds.
HTTP error occurred: 429 Client Error: Too Many Requests for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 429
Retrying after 4 seconds.
HTTP error occurred: 429 Client Error: Too Many Requests for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 429
Retrying after 8 seconds.
HTTP error occurred: 429 Client Error: Too Many Requests for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code:

 45%|████▌     | 10033/22157 [1:51:35<1:48:36,  1.86it/s]

HTTP error occurred: 504 Server Error: Gateway Timeout for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 504
Retrying after 1 seconds.


 45%|████▌     | 10034/22157 [1:52:07<33:39:03,  9.99s/it]

HTTP error occurred: 504 Server Error: Gateway Timeout for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 504
Retrying after 1 seconds.


 46%|████▋     | 10262/22157 [1:54:12<1:22:52,  2.39it/s]

HTTP error occurred: 504 Server Error: Gateway Timeout for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 504
Retrying after 1 seconds.


 60%|██████    | 13368/22157 [2:21:29<1:29:30,  1.64it/s]

HTTP error occurred: 504 Server Error: Gateway Timeout for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 504
Retrying after 1 seconds.


 62%|██████▏   | 13690/22157 [2:24:36<1:48:16,  1.30it/s]

HTTP error occurred: 504 Server Error: Gateway Timeout for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 504
Retrying after 1 seconds.


 68%|██████▊   | 14984/22157 [2:35:16<43:56,  2.72it/s]

HTTP error occurred: 429 Client Error: Too Many Requests for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 429
Retrying after 1 seconds.
HTTP error occurred: 429 Client Error: Too Many Requests for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 429
Retrying after 2 seconds.
HTTP error occurred: 429 Client Error: Too Many Requests for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 429
Retrying after 4 seconds.
HTTP error occurred: 429 Client Error: Too Many Requests for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 429
Retrying after 8 seconds.
HTTP error occurred: 429 Client Error: Too Many Requests for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 429
Retrying after 16 seconds.
HTTP error occurred

 68%|██████▊   | 14985/22157 [2:52:22<613:44:24, 308.07s/it]

Failed to get score for Grafton, Ohio.


 75%|███████▍  | 16511/22157 [3:03:47<31:07,  3.02it/s]

HTTP error occurred: 504 Server Error: Gateway Timeout for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 504
Retrying after 1 seconds.


 75%|███████▌  | 16631/22157 [3:05:05<36:40,  2.51it/s]

HTTP error occurred: 504 Server Error: Gateway Timeout for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 504
Retrying after 1 seconds.


 77%|███████▋  | 17048/22157 [3:08:12<31:26,  2.71it/s]

HTTP error occurred: 503 Server Error: Service Unavailable for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 503
Retrying after 1 seconds.


 84%|████████▍ | 18614/22157 [3:18:09<23:35,  2.50it/s]

HTTP error occurred: 504 Server Error: Gateway Timeout for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 504
Retrying after 1 seconds.


 84%|████████▍ | 18616/22157 [3:18:41<7:01:19,  7.14s/it]

HTTP error occurred: 504 Server Error: Gateway Timeout for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 504
Retrying after 1 seconds.


 90%|█████████ | 19980/22157 [3:30:00<13:26,  2.70it/s]

HTTP error occurred: 429 Client Error: Too Many Requests for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 429
Retrying after 1 seconds.
HTTP error occurred: 429 Client Error: Too Many Requests for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 429
Retrying after 2 seconds.
HTTP error occurred: 429 Client Error: Too Many Requests for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 429
Retrying after 4 seconds.
HTTP error occurred: 429 Client Error: Too Many Requests for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 429
Retrying after 8 seconds.
HTTP error occurred: 429 Client Error: Too Many Requests for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 429
Retrying after 16 seconds.
HTTP error occurred

 90%|█████████ | 19981/22157 [3:47:06<186:14:09, 308.11s/it]

Failed to get score for Lone Star, Texas.
HTTP error occurred: 429 Client Error: Too Many Requests for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 429
Retrying after 1 seconds.
HTTP error occurred: 429 Client Error: Too Many Requests for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 429
Retrying after 2 seconds.
HTTP error occurred: 429 Client Error: Too Many Requests for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 429
Retrying after 4 seconds.
HTTP error occurred: 429 Client Error: Too Many Requests for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 429
Retrying after 8 seconds.
HTTP error occurred: 429 Client Error: Too Many Requests for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 429
Retr

 90%|█████████ | 20012/22157 [3:49:30<18:53,  1.89it/s]

HTTP error occurred: 500 Server Error: Internal Server Error for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 500
Retrying after 1 seconds.


 90%|█████████ | 20026/22157 [3:49:38<15:10,  2.34it/s]

HTTP error occurred: 500 Server Error: Internal Server Error for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 500
Retrying after 1 seconds.


 90%|█████████ | 20040/22157 [3:49:46<14:25,  2.44it/s]

HTTP error occurred: 500 Server Error: Internal Server Error for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 500
Retrying after 1 seconds.


 94%|█████████▍| 20878/22157 [3:55:26<09:17,  2.29it/s]

HTTP error occurred: 500 Server Error: Internal Server Error for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 500
Retrying after 1 seconds.


 95%|█████████▌| 21099/22157 [3:56:57<08:40,  2.03it/s]

HTTP error occurred: 504 Server Error: Gateway Timeout for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 504
Retrying after 1 seconds.


 95%|█████████▌| 21103/22157 [3:57:30<1:05:36,  3.74s/it]

HTTP error occurred: 504 Server Error: Gateway Timeout for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 504
Retrying after 1 seconds.


 96%|█████████▌| 21165/22157 [3:58:39<08:46,  1.88it/s]

HTTP error occurred: 504 Server Error: Gateway Timeout for url: https://api.wikimedia.org/service/lw/inference/v1/models/enwiki-articlequality:predict
Status code: 504
Retrying after 1 seconds.


100%|██████████| 22157/22157 [4:08:07<00:00,  1.49it/s]


In [14]:
#Reading the ores prediction file into pandas dataframe

# Create a list of the header values
headers = ["title", "rev_id", "prediction"]


data_rows = []
with open('/content/ores_predictions.csv', 'r') as file:
    csv_reader = csv.reader(file)
    data_rows = [row for row in csv_reader]

# Rewrite the data with headers to the same file
with open('/content/ores_predictions.csv', 'w', newline='') as file:
    csv_writer = csv.writer(file)
    csv_writer.writerow(headers)  # Write the headers first
    csv_writer.writerows(data_rows)  # Write the existing data rows below the headers


# Reading the CSV file
df = pd.read_csv('/content/ores_predictions.csv')

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44316 entries, 0 to 44315
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   title       44316 non-null  object
 1   rev_id      44316 non-null  object
 2   prediction  44316 non-null  object
dtypes: object(3)
memory usage: 1.0+ MB


In [17]:
# This part defines a list of US states, for use in data cleaning and filtering operations.
# The list includes the names of all 50 states in the United States.

# List of US states
us_states = [
    "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut",
    "Delaware", "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa",
    "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan",
    "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire",
    "New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio", "Oklahoma",
    "Oregon", "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", "Tennessee",
    "Texas", "Utah", "Vermont", "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming"
]

In [18]:
# The clean_title function takes a title as input, cleans the title by removing
#leading and trailing whitespaces, and checks for the presence of a US state.
#It returns the cleaned title if a US state is found else, it returns None.

# Function to clean the title and check for the presence of a US state
def clean_title(title):
    title = title.strip()
    parts = re.split(r',\s*', title)
    if parts[-1].title() in us_states:
        return ", ".join(parts[-2:]).strip()
    else:
        return None

In [19]:
# Applying the clean_title function to the 'title' column of the DataFrame.
#It will update the 'title' column with the cleaned versions

# Applying the cleaning function to the 'title' column

import re

df['cleaned_title'] = df['title'].apply(clean_title)

In [20]:
#printing rows which dont have US states in their names


removed_rows = df[df['cleaned_title'].isnull()]  # Storing the removed rows

# Printing the rows to be removed - probable
print("The following rows can be removed, as they dont have state in their name:")
print(removed_rows)

The following rows can be removed, as they dont have state in their name:
       title       rev_id  prediction cleaned_title
0      Title  Revision_ID  Prediction          None
2      Title  Revision_ID  Prediction          None
4      Title  Revision_ID  Prediction          None
6      Title  Revision_ID  Prediction          None
8      Title  Revision_ID  Prediction          None
...      ...          ...         ...           ...
44306  Title  Revision_ID  Prediction          None
44308  Title  Revision_ID  Prediction          None
44310  Title  Revision_ID  Prediction          None
44312  Title  Revision_ID  Prediction          None
44314  Title  Revision_ID  Prediction          None

[22218 rows x 4 columns]


In [21]:
# Now we filter out the rows whoch dont have the name of state in them

# List of known cities or use an external dataset for this purpose
known_cities = [
    "Los Angeles", "San Diego", "San Francisco", "Denver", "Miami", "Atlanta", "New Orleans", "Eastwood, Syracuse",
    "Boston", "Nantucket", "Hyde Park, Boston", "Detroit", "Minneapolis", "Philadelphia","Echols County","Riverview, St. Louis","Wailua, Kauai",
    "Pittsburgh", "Oklahoma City", "Salt Lake City", "Seattle", "Milwaukee", "Indianapolis", "St. Louis", "Las Vegas", "New York City", ""
]

# Filtering out the rows that are not cities
to_be_removed = df[df['cleaned_title'].isnull() & ~df['title'].isin(known_cities)]
to_be_removed

Unnamed: 0,title,rev_id,prediction,cleaned_title
0,Title,Revision_ID,Prediction,
2,Title,Revision_ID,Prediction,
4,Title,Revision_ID,Prediction,
6,Title,Revision_ID,Prediction,
8,Title,Revision_ID,Prediction,
...,...,...,...,...
44306,Title,Revision_ID,Prediction,
44308,Title,Revision_ID,Prediction,
44310,Title,Revision_ID,Prediction,
44312,Title,Revision_ID,Prediction,


In [22]:
to_be_removed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22192 entries, 0 to 44314
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   title          22192 non-null  object
 1   rev_id         22192 non-null  object
 2   prediction     22192 non-null  object
 3   cleaned_title  0 non-null      object
dtypes: object(4)
memory usage: 866.9+ KB


In [23]:
# Now we filter out the rows from the dataframe where the 'title' column does not contain US State or city

df = df[~df['title'].isin(to_be_removed['title'])]
df = df.drop('cleaned_title', axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22124 entries, 1 to 44315
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   title       22124 non-null  object
 1   rev_id      22124 non-null  object
 2   prediction  22124 non-null  object
dtypes: object(3)
memory usage: 691.4+ KB


In [26]:
# Saving the cleaned data to a new CSV file
df.to_csv('/content/cleaned_data.csv', index=False)