In [1]:
# Cell 1: Project Setup and Configuration

# This first cell imports the necessary libraries and loads the project's configuration from the 'project.json' file.
# This ensures that all subsequent steps have access to the project's root path and settings.


from pathlib import Path
import json

# Find the project's root directory. This allows the notebook to be
# run from the 'notebooks' subfolder without breaking file paths.
ROOT = Path.cwd()
if ROOT.name == "notebooks":
    ROOT = ROOT.parent

# Load the main configuration file.
# This file contains all the key parameters for the project, such as
# the starting category, API settings, and language.
CONF_PATH = ROOT / "conf" / "project.json"

# Open the file in a `with` block so the handle is closed as soon as the
# JSON has been parsed, and pin the encoding so parsing does not depend on
# the platform's default text encoding.
with CONF_PATH.open(encoding="utf-8") as conf_file:
    CONF = json.load(conf_file)

# Print the root path and the loaded configuration to verify
# that everything has been loaded correctly before proceeding.
print(f"✅ Project Root: {ROOT}")
print(f"✅ Config Loaded: {CONF}")

✅ Project Root: C:\Users\drrahman\wiki-gaps-project
✅ Config Loaded: {'project': 'wiki-gaps', 'created': '2025-10-04T22:34:57', 'language': 'en', 'seed_categories': ['Category:Living people'], 'recurse_depth': 0, 'api_sleep': 0.2, 'api_maxlag': 5, 'attrs': {'gender': 'P21', 'country': 'P27', 'occupation': 'P106'}, 'time_windows': {'start_month': '2015-01', 'end_month': None}, 'ethics': {'aggregate_only': True, 'min_cell': 20}}


In [2]:
# Cell 2: API Session and Request Handling 

# Uses a direct `requests.get` for each call. 
# This ensures every API request is completely independent and stateless, which is more robust against rare, state-related network issues that can occur during very long-running jobs.

import time
import requests
import pandas as pd
from tqdm.notebook import tqdm

# MediaWiki Action API endpoint for English Wikipedia
ENWIKI_API = "https://en.wikipedia.org/w/api.php"

# Pause between requests (seconds) and the `maxlag` request parameter,
# both taken from the loaded project configuration
SLEEP, MAXLAG = CONF["api_sleep"], CONF["api_maxlag"]

# Client identification; HEADERS is attached to every request made by mw_get
USER_AGENT = "WikiGaps/0.1 (contact: ashhik96@gmail.com)"
HEADERS = {"User-Agent": USER_AGENT}

def mw_get(params: dict, max_retries: int = 3):
    """
    A stateless wrapper for making GET requests to the MediaWiki API.

    Args:
        params: Query parameters for the call. The dict is copied, not
            mutated; `format=json`, `formatversion=2` and `maxlag=MAXLAG`
            are added to the copy before sending.
        max_retries: How many times the same request is re-sent after the
            server answers with a `maxlag` error. With 0 the request is
            sent once and a lagged reply yields None.

    Returns:
        The decoded JSON response (a dict), or None when the HTTP request
        failed, the body was not valid JSON, or the server was still
        reporting lag after the last retry. API errors other than `maxlag`
        are returned to the caller unchanged inside the JSON.
    """
    p = params.copy()
    p.update({"format": "json", "formatversion": 2, "maxlag": MAXLAG})

    total_attempts = max(0, max_retries) + 1
    for attempt in range(1, total_attempts + 1):
        try:
            # Use a simple, stateless `requests.get()` for each call
            response = requests.get(ENWIKI_API, params=p, headers=HEADERS, timeout=60)
            response.raise_for_status()
            js = response.json()
        except requests.exceptions.JSONDecodeError:
            # Must be listed before RequestException: JSONDecodeError is a
            # subclass of it, so the other order makes this branch unreachable.
            print(f"Failed to decode JSON. Status: {response.status_code}, Text: {response.text[:100]}")
            return None
        except requests.exceptions.RequestException as e:
            print(f"An API request failed: {e}")
            return None

        # Anything other than a maxlag error is a final answer
        if not ("error" in js and js["error"].get("code") == "maxlag"):
            return js

        # The server asked us to back off. Wait at least one second so a
        # fractional lag value cannot turn the retry loop into a busy loop.
        wait_time = max(1, int(js["error"].get("lag", 5)))
        if attempt < total_attempts:
            print(f"Server lag detected. Waiting {wait_time}s before retry {attempt} of {total_attempts - 1}.")
        else:
            print(f"Server lag detected. Waiting {wait_time}s; retries exhausted, skipping this request.")
        time.sleep(wait_time)

    return None

print("✅ Stateless API helper function is ready.")

✅ Stateless API helper function is ready.


In [3]:
# Cell 3: Category Walking Functions

# This cell defines the functions needed to get a list of all articles
# within a specific Wikipedia category. It's designed to handle very
# large categories by fetching members in pages of 500 at a time.

def get_category_members(category_title: str, namespace: int = 0, max_retries: int = 3) -> pd.DataFrame:
    """
    Fetches all members of a single category page.

    Results are requested 500 at a time and the loop follows the API's
    `cmcontinue` token until no further token is returned.

    Args:
        category_title: The full title of the category (e.g., "Category:Living people").
        namespace: The namespace to search (0 for articles, 14 for subcategories).
        max_retries: How many times a page request that came back empty
            (mw_get returned None) is repeated before the walk is abandoned.

    Returns:
        A pandas DataFrame with the 'pageid' and 'title' of each member,
        de-duplicated. If the walk had to be abandoned part-way, the members
        collected so far are returned and a warning is printed.
    """
    member_list = []
    continuation_token = None
    consecutive_failures = 0

    # The API returns results in pages, so we loop until the 'continue' token is gone
    while True:
        params = {
            "action": "query",
            "list": "categorymembers",
            "cmtitle": category_title,
            "cmnamespace": namespace,
            "cmlimit": 500,  # Request the maximum number of members per page
        }

        # If the API gave us a continuation token, add it to the next request
        if continuation_token:
            params["cmcontinue"] = continuation_token

        # Make the API call
        result = mw_get(params)
        if not result or "query" not in result:
            consecutive_failures += 1
            # A JSON reply carrying an "error" object will not improve on
            # retry; only repeat requests that produced no response at all.
            api_error = result.get("error") if isinstance(result, dict) else None
            if api_error is None and consecutive_failures <= max_retries:
                time.sleep(SLEEP)
                continue

            # Do not truncate silently: the caller would otherwise treat a
            # partial listing as the complete category.
            reason = f"API error: {api_error}" if api_error else "request failed"
            print(
                f"⚠️ Stopped fetching '{category_title}' early ({reason}); "
                f"{len(member_list):,} members collected, the list may be incomplete."
            )
            break
        consecutive_failures = 0

        # Add the retrieved members to our list
        members = result.get("query", {}).get("categorymembers", [])
        member_list.extend(members)

        # Check for a new continuation token to get the next page
        continuation_token = result.get("continue", {}).get("cmcontinue")
        if not continuation_token:
            break  # No more pages, so we're done

        time.sleep(SLEEP)  # Be polite and pause between requests

    if not member_list:
        return pd.DataFrame(columns=["pageid", "title"])

    # Convert the list of results into a clean DataFrame
    return pd.DataFrame(member_list)[["pageid", "title"]].drop_duplicates()

print("✅ Category walking functions are ready.")

✅ Category walking functions are ready.


In [4]:
# Cell 4: Execute the Category Walk

# This cell runs the main process to enumerate all articles in the seed categories.
# It uses the 'get_category_members' function from the previous cell and a
# progress bar to track the process for each starting category.

# Walk every seed category listed in the configuration and collect its pages.
seed_categories = CONF["seed_categories"]
all_pages_frames = []

print(f"Starting to walk through {len(seed_categories)} seed categor(y/ies)...")

for category in tqdm(seed_categories, desc="Processing Categories"):
    print(f"Fetching members for: {category}...")

    # namespace=0 restricts the listing to article pages
    pages_df = get_category_members(category, namespace=0)
    if pages_df.empty:
        continue

    # Record which seed category produced these rows
    pages_df["seed_category"] = category
    all_pages_frames.append(pages_df)

if not all_pages_frames:
    print("\n⚠️ No pages found. Check your seed categories in project.json.")
    # Empty frame with the expected columns so later cells do not fail
    seed_pages_df = pd.DataFrame(columns=["pageid", "title", "seed_category"])
else:
    # Stack the per-category frames, keep the first row for each pageid
    # (categories may overlap), then order by pageid with a fresh index.
    seed_pages_df = pd.concat(all_pages_frames, ignore_index=True)
    seed_pages_df = seed_pages_df.drop_duplicates(subset=["pageid"])
    seed_pages_df = seed_pages_df.sort_values("pageid").reset_index(drop=True)

    # Report the total and show the first few rows
    print(f"\n✅ Found a total of {len(seed_pages_df):,} unique pages.")
    print("Sample of the seed pages DataFrame:")
    display(seed_pages_df.head())

Starting to walk through 1 seed categor(y/ies)...


Processing Categories:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching members for: Category:Living people...

✅ Found a total of 1,128,338 unique pages.
Sample of the seed pages DataFrame:


Unnamed: 0,pageid,title,seed_category
0,340,Alain Connes,Category:Living people
1,595,Andre Agassi,Category:Living people
2,890,Anna Kournikova,Category:Living people
3,910,Arne Kaijser,Category:Living people
4,1020,Anatoly Karpov,Category:Living people


In [5]:
# Cell 5: Page ID to Wikidata QID Mapping Function 

import math

def map_pageids_to_qids(pages_df: pd.DataFrame, batch_size: int = 50) -> pd.DataFrame:
    """
    Looks up the Wikidata item ID (the `wikibase_item` page property) for each page.

    Page IDs are sent to the API in pipe-separated batches with
    `prop=pageprops`. Pages whose response carries no `wikibase_item`
    are left out of the result.

    Args:
        pages_df: DataFrame with a 'pageid' column.
        batch_size: Number of page IDs sent per request. Must be at least 1.

    Returns:
        A DataFrame with columns 'pageid', 'title' and 'qid', one row per
        page for which a QID was returned. Empty (with those columns) when
        nothing could be mapped.

    Raises:
        ValueError: If batch_size is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    result_columns = ['pageid', 'title', 'qid']
    pageids = pages_df["pageid"].tolist()
    if not pageids:
        # Nothing to look up, so skip the progress bar and the API entirely
        return pd.DataFrame(columns=result_columns)

    all_mapped_pages = []
    failed_batches = 0

    batch_range = range(0, len(pageids), batch_size)
    for i in tqdm(batch_range, desc="Mapping Page IDs to QIDs"):
        id_batch = pageids[i:i + batch_size]
        id_string = "|".join(map(str, id_batch))

        params = {
            "action": "query",
            "prop": "pageprops",
            "ppprop": "wikibase_item",
            "pageids": id_string,
            "redirects": 1,
        }

        result = mw_get(params)

        # The page list lives under result["query"]["pages"]; a missing
        # response or a response without that key counts as a failed batch.
        pages = (result or {}).get("query", {}).get("pages")
        if pages is None:
            failed_batches += 1
        else:
            for page_info in pages:
                qid = page_info.get("pageprops", {}).get("wikibase_item")
                if qid:
                    all_mapped_pages.append({
                        "pageid": page_info.get("pageid"),
                        "title": page_info.get("title"),
                        "qid": qid
                    })

        time.sleep(SLEEP)

    # Surface dropped batches so a shortfall in the result is not mistaken
    # for pages that simply have no Wikidata item.
    if failed_batches:
        print(
            f"⚠️ {failed_batches:,} of {len(batch_range):,} batches returned no data; "
            f"up to {failed_batches * batch_size:,} pages were not mapped."
        )

    # Handle the case where no QIDs were found at all
    if not all_mapped_pages:
        return pd.DataFrame(columns=result_columns)

    return pd.DataFrame(all_mapped_pages)

print("✅ Corrected Page ID to QID mapping function is ready.")

✅ Corrected Page ID to QID mapping function is ready.


In [6]:
# Cell 6A: Small-Scale Test Run

# Before running the full multi-hour process, this cell tests the entire mapping and cleaning pipeline on a small sample of 500 pages.
# If this cell completes successfully, we can be confident the full run will work.

print("--- Starting a small-scale test on 500 pages ---")

# Take the first 500 rows of the seed page table as the test sample
sample_df = seed_pages_df.head(500)
print(f"Sample size: {len(sample_df)} pages.")

# Same mapping function as the full run, just on the sample
test_qids_df = map_pageids_to_qids(sample_df)

if test_qids_df.empty or 'qid' not in test_qids_df.columns:
    print("\n⚠️ TEST FAILED: The mapping process returned no data even for a small sample.")
    print("There may still be an underlying network or API issue.")
else:
    # Same cleaning steps as the full run: drop rows without a QID, keep one
    # row per QID, order by pageid and renumber the index.
    test_qids_df_unique = test_qids_df.dropna(subset=["qid"]).drop_duplicates(subset=["qid"])
    test_qids_df_unique = test_qids_df_unique.sort_values("pageid").reset_index(drop=True)

    print(f"\n✅ TEST SUCCESSFUL: Mapped {len(test_qids_df_unique)} pages to unique QIDs.")
    print("Sample of the test results:")
    display(test_qids_df_unique.head())

--- Starting a small-scale test on 500 pages ---
Sample size: 500 pages.


Mapping Page IDs to QIDs:   0%|          | 0/10 [00:00<?, ?it/s]


✅ TEST SUCCESSFUL: Mapped 500 pages to unique QIDs.
Sample of the test results:


Unnamed: 0,pageid,title,qid
0,340,Alain Connes,Q313590
1,595,Andre Agassi,Q7407
2,890,Anna Kournikova,Q131120
3,910,Arne Kaijser,Q4794599
4,1020,Anatoly Karpov,Q131674


In [7]:
# Cell 6B: Execute the Page ID to QID Mapping 

# This cell calls the mapping function from the previous step to fetch the Wikidata QID for every page. 
# Includes a check to ensure data was actually collected before attempting to clean it.

print("Starting the mapping process. This will take a long time...")

qids_df = map_pageids_to_qids(seed_pages_df)

if qids_df.empty or 'qid' not in qids_df.columns:
    # Nothing usable came back from the mapping step
    print("\n⚠️ Error: The mapping process completed but returned no data.")
    print("This might be due to a network issue or a problem with the API.")
    print("Please check your internet connection and consider re-running this cell.")
    # Empty frame with the expected columns so later cells do not fail
    qids_df_unique = pd.DataFrame(columns=['pageid', 'title', 'qid'])
else:
    # More than one page can carry the same QID. Keep a single row per QID so
    # each item appears once, then order by pageid and renumber the index.
    qids_df_unique = qids_df.dropna(subset=["qid"]).drop_duplicates(subset=["qid"])
    qids_df_unique = qids_df_unique.sort_values("pageid").reset_index(drop=True)

    # Report the total and show the first few rows
    print(f"\n✅ Successfully mapped {len(qids_df_unique):,} pages to unique QIDs.")
    print("Sample of the final QID DataFrame:")
    display(qids_df_unique.head())

Starting the mapping process. This will take a long time...


Mapping Page IDs to QIDs:   0%|          | 0/22567 [00:00<?, ?it/s]


✅ Successfully mapped 1,125,607 pages to unique QIDs.
Sample of the final QID DataFrame:


Unnamed: 0,pageid,title,qid
0,340,Alain Connes,Q313590
1,595,Andre Agassi,Q7407
2,890,Anna Kournikova,Q131120
3,910,Arne Kaijser,Q4794599
4,1020,Anatoly Karpov,Q131674


In [8]:
# Cell 7A: Fetch Creation Timestamps Function 

# The Wikipedia API requires us to ask for the first revision of each page individually, rather than in batches.
# This function loops through each pageid and makes a separate request.

from datetime import datetime

def _first_revision_of(result):
    """Return (pageid, timestamp) from a revisions query response, or None if it has none."""
    if not result:
        return None
    # `or` guards cover both a missing key and an empty list, either of
    # which would otherwise raise IndexError on the [0] lookups below.
    pages = result.get("query", {}).get("pages") or []
    if not pages:
        return None
    page_info = pages[0]
    revisions = page_info.get("revisions") or [{}]
    timestamp = revisions[0].get("timestamp")
    if not timestamp:
        return None
    return page_info.get("pageid"), timestamp


def get_creation_timestamps(pages_df: pd.DataFrame, sleep_seconds: float = 0.02) -> pd.DataFrame:
    """
    Fetches the creation timestamp for a list of pageids, one at a time.

    For every page a single revision is requested in oldest-first order
    (`rvlimit=1`, `rvdir=newer`); its timestamp is stored as the page's
    creation time.

    Args:
        pages_df: DataFrame with a 'pageid' column.
        sleep_seconds: Pause after each request, in seconds.

    Returns:
        A DataFrame with columns 'pageid' and 'creation_timestamp'. Pages
        for which no timestamp came back are omitted and counted in a
        printed warning.
    """
    result_columns = ['pageid', 'creation_timestamp']
    pageids = pages_df["pageid"].tolist()
    if not pageids:
        # Nothing to fetch, so skip the progress bar and the API entirely
        return pd.DataFrame(columns=result_columns)

    timestamps = []
    pages_without_timestamp = 0

    # Loop through each pageid individually
    for pageid in tqdm(pageids, desc="Fetching Creation Timestamps"):
        params = {
            "action": "query",
            "prop": "revisions",
            "rvprop": "timestamp",
            "rvlimit": 1,
            "rvdir": "newer",
            "pageids": pageid,  # Send only one pageid at a time
        }

        first_revision = _first_revision_of(mw_get(params))
        if first_revision is None:
            pages_without_timestamp += 1
        else:
            found_pageid, timestamp = first_revision
            timestamps.append({
                "pageid": found_pageid,
                "creation_timestamp": timestamp
            })

        time.sleep(sleep_seconds)

    if pages_without_timestamp:
        print(f"⚠️ No creation timestamp was returned for {pages_without_timestamp:,} of {len(pageids):,} pages.")

    if not timestamps:
        return pd.DataFrame(columns=result_columns)

    return pd.DataFrame(timestamps)

print("✅ (one-by-one) timestamp function is ready.")

✅ (one-by-one) timestamp function is ready.


In [9]:
# Cell 7B: Small-Scale Test for Timestamps

# Run the timestamp-fetching process on a small sample before starting the full run.

print("--- Starting a small-scale test for timestamps on 500 pages ---")

# First 500 rows of the de-duplicated QID table built by the full mapping run
sample_df = qids_df_unique.head(500)
test_timestamps_df = get_creation_timestamps(sample_df)

if test_timestamps_df.empty:
    print("\n⚠️ TIMESTAMP TEST FAILED.")
else:
    print("\n✅ TIMESTAMP TEST SUCCESSFUL.")
    print("Sample of the test results:")
    display(test_timestamps_df.head())

--- Starting a small-scale test for timestamps on 500 pages ---


Fetching Creation Timestamps:   0%|          | 0/500 [00:00<?, ?it/s]


✅ TIMESTAMP TEST SUCCESSFUL.
Sample of the test results:


Unnamed: 0,pageid,creation_timestamp
0,340,2001-09-08T15:21:56Z
1,595,2001-02-06T20:50:01Z
2,890,2001-08-28T13:25:02Z
3,910,2001-05-19T15:58:12Z
4,1020,2001-06-15T16:43:42Z


In [10]:
# Cell 8: Execute Timestamp Fetching (with Incremental Saves)

# This version saves progress to a CSV file after every 10,000 pages.
# This means you can safely stop the script at any time and it will automatically resume where it left off the next time you run it.

print("Starting to fetch creation timestamps...")

# Progress is written to disk each time the running total of collected
# timestamps reaches a multiple of this value.
SAVE_EVERY = 10000

# Define the output path and check for existing data to resume from
output_path = ROOT / "data" / "processed" / "timestamps_partial.csv"
# Create the folder up front so the first save cannot fail on a missing
# directory after thousands of requests have already been made.
output_path.parent.mkdir(parents=True, exist_ok=True)
timestamps_list = []
processed_pageids = set()

if output_path.exists():
    print(f"Resuming from existing file: {output_path.name}")
    existing_df = pd.read_csv(output_path)
    timestamps_list = existing_df.to_dict('records')
    processed_pageids = set(existing_df['pageid'])
    print(f"Loaded {len(processed_pageids):,} existing timestamps. Resuming...")

# Filter out pages we already have timestamps for
pages_to_fetch_df = qids_df_unique[~qids_df_unique['pageid'].isin(processed_pageids)]

if pages_to_fetch_df.empty:
    print("All timestamps have already been fetched.")
else:
    # Loop through each pageid that still needs to be fetched
    for pageid in tqdm(pages_to_fetch_df['pageid'].tolist(), desc="Fetching Creation Timestamps"):
        params = {
            "action": "query", "prop": "revisions", "rvprop": "timestamp",
            "rvlimit": 1, "rvdir": "newer", "pageids": pageid,
        }

        result = mw_get(params)

        # The `or` fallbacks cover a failed request as well as empty
        # 'pages' / 'revisions' lists, which would otherwise raise IndexError.
        page_info = ((result or {}).get("query", {}).get("pages") or [{}])[0]
        timestamp = (page_info.get("revisions") or [{}])[0].get("timestamp")
        if timestamp:
            timestamps_list.append({
                "pageid": page_info.get("pageid"),
                "creation_timestamp": timestamp
            })

            # --- Incremental Save Logic ---
            # Checked only right after an append, so a run of failed requests
            # at a multiple of SAVE_EVERY does not rewrite the file each time.
            if len(timestamps_list) % SAVE_EVERY == 0:
                pd.DataFrame(timestamps_list).to_csv(output_path, index=False)
                print(f"\nSaved progress: {len(timestamps_list):,} total timestamps collected.")

        time.sleep(0.02)

# Final save at the end. The explicit column list keeps the frame mergeable
# on 'pageid' even when nothing at all was collected.
timestamps_df = pd.DataFrame(timestamps_list, columns=["pageid", "creation_timestamp"])
if not timestamps_df.empty:
    timestamps_df.to_csv(output_path, index=False)

# Only claim completeness when every page in the QID table has a timestamp
missing_count = int((~qids_df_unique['pageid'].isin(timestamps_df['pageid'])).sum())
if missing_count:
    print(
        f"\n⚠️ Fetched timestamps for {len(timestamps_df):,} pages; "
        f"{missing_count:,} pages still have none (re-run this cell to retry them)."
    )
else:
    print(f"\n✅ Successfully fetched all timestamps for {len(timestamps_df):,} pages.")
print("Sample of the final timestamps DataFrame:")
display(timestamps_df.head())

Starting to fetch creation timestamps...
Resuming from existing file: timestamps_partial.csv
Loaded 940,000 existing timestamps. Resuming...


Fetching Creation Timestamps:   0%|          | 0/185702 [00:00<?, ?it/s]


Saved progress: 950,000 total timestamps collected.

Saved progress: 960,000 total timestamps collected.

Saved progress: 970,000 total timestamps collected.

Saved progress: 980,000 total timestamps collected.

Saved progress: 990,000 total timestamps collected.

Saved progress: 1,000,000 total timestamps collected.

Saved progress: 1,010,000 total timestamps collected.

Saved progress: 1,020,000 total timestamps collected.

Saved progress: 1,030,000 total timestamps collected.

Saved progress: 1,040,000 total timestamps collected.

Saved progress: 1,050,000 total timestamps collected.

Saved progress: 1,060,000 total timestamps collected.

Saved progress: 1,070,000 total timestamps collected.

Saved progress: 1,080,000 total timestamps collected.

Saved progress: 1,090,000 total timestamps collected.

Saved progress: 1,100,000 total timestamps collected.

Saved progress: 1,110,000 total timestamps collected.

Saved progress: 1,120,000 total timestamps collected.

✅ Successfully fetc

Unnamed: 0,pageid,creation_timestamp
0,340,2001-09-08T15:21:56Z
1,595,2001-02-06T20:50:01Z
2,890,2001-08-28T13:25:02Z
3,910,2001-05-19T15:58:12Z
4,1020,2001-06-15T16:43:42Z


In [11]:
# Cell 9: Merge Data and Save Final Output

# Merge the DataFrame containing the QIDs with the DataFrame containing the creation timestamps and save the result to a single CSV file in the 'data/raw' directory.

print("Merging QIDs and timestamps...")

# The timestamp table can be loaded from a resumed partial file, so make sure
# it holds one row per pageid; otherwise the left merge below would emit
# several output rows for the same page.
timestamps_unique_df = timestamps_df.drop_duplicates(subset=["pageid"])

# Merge the two DataFrames on the 'pageid' column.
# A left merge keeps every page from the main QID list, including pages that
# have no timestamp (their timestamp is left empty).
final_df = pd.merge(
    qids_df_unique,
    timestamps_unique_df,
    on="pageid",
    how="left",
    validate="many_to_one",
)

# Rename the 'creation_timestamp' column to 'first_edit_ts' to match the project schema.
final_df = final_df.rename(columns={"creation_timestamp": "first_edit_ts"})

# Select and reorder the columns for the final output file.
output_columns = ["pageid", "title", "qid", "first_edit_ts"]
final_df = final_df[output_columns]

# Generate a timestamped filename for the output file.
ts = datetime.now().strftime("%Y%m%d-%H%M%S")
output_path = ROOT / "data" / "raw" / f"seed_enwiki_{ts}.csv"
# to_csv does not create missing folders, so make sure the target exists
output_path.parent.mkdir(parents=True, exist_ok=True)

# Save the final DataFrame to a CSV file.
final_df.to_csv(output_path, index=False)

print(f"\n✅ Success! Notebook 01 is complete.")
print(f"Final dataset saved to: {output_path.name}")
print(f"Total rows: {len(final_df):,}")

# Make gaps visible instead of leaving them hidden in the file
rows_without_timestamp = int(final_df["first_edit_ts"].isna().sum())
if rows_without_timestamp:
    print(f"⚠️ Rows without a first_edit_ts: {rows_without_timestamp:,}")

print("Sample of the final output:")
display(final_df.head())

Merging QIDs and timestamps...

✅ Success! Notebook 01 is complete.
Final dataset saved to: seed_enwiki_20251007-213232.csv
Total rows: 1,125,607
Sample of the final output:


Unnamed: 0,pageid,title,qid,first_edit_ts
0,340,Alain Connes,Q313590,2001-09-08T15:21:56Z
1,595,Andre Agassi,Q7407,2001-02-06T20:50:01Z
2,890,Anna Kournikova,Q131120,2001-08-28T13:25:02Z
3,910,Arne Kaijser,Q4794599,2001-05-19T15:58:12Z
4,1020,Anatoly Karpov,Q131674,2001-06-15T16:43:42Z
