# Wikipedia Page Reference Analysis

This Jupyter notebook contains Python code for analyzing the reference structure of Wikipedia pages. It includes two main components:

1. A `Counter` class for processing and analyzing page references
2. Interactive cells for user input and result display

## Features:

- Efficient data handling using Pandas and Parquet
- Analysis of reference degrees (how many steps away pages are from a given page)
- Calculation of weighted mean distance, total nodes, and network diameter
- Option to consider only the first reference or all references for each page

## Dependencies:

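- pandas (reading Parquet files also requires an engine such as pyarrow or fastparquet)
- os (standard library)
- typing (standard library)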

## Note:

The analysis can be performed considering either only the first reference of each page or all references. This allows for different perspectives on the Wikipedia link structure.

In [1]:
import pandas as pd
from os.path import join

from typing import List, Set  # For type hinting


def get_ordinal_suffix(n: int) -> str:
    """
    Return the English ordinal suffix that belongs to an integer.

    Args:
        n (int): The number that is going to be written as an ordinal.

    Returns:
        str: One of 'st', 'nd', 'rd' or 'th' (e.g. 1 -> 'st', 12 -> 'th').
    """

    # Numbers whose last two digits are 10..20 always take "th"
    # (this is what keeps 11, 12 and 13 from becoming "11st", "12nd", "13rd").
    last_two_digits = n % 100
    if last_two_digits >= 10 and last_two_digits <= 20:
        return "th"

    # Otherwise the last digit alone decides.
    last_digit = n % 10
    if last_digit == 1:
        return "st"
    if last_digit == 2:
        return "nd"
    if last_digit == 3:
        return "rd"
    return "th"


def weighted_mean(values: List[int]) -> float:
    """
    Calculate the mean 1-based position in a list, weighted by the list's values.

    Each entry ``values[i]`` acts as the weight of position ``i + 1``, so the
    result is ``sum((i + 1) * values[i]) / sum(values)``.

    Parameters:
    values (list of float or int): Weights, one per position (position 1 first).

    Returns:
    float: The weighted mean position rounded to 2 decimal places, or 0.0 when
    the list is empty or its values sum to zero (there is nothing to average).
    """

    # The values themselves are the weights
    sum_of_weights = sum(values)

    # Guard against division by zero for [] or an all-zero list such as [0]
    if not sum_of_weights:
        return 0.0

    # Positions are 1-based, hence start=1
    weighted_sum = sum(
        position * value for position, value in enumerate(values, start=1)
    )

    return round(weighted_sum / sum_of_weights, 2)


class Counter:
    # Rows of the input frame that have at least one reference, with titles
    # and references lowercased.
    df_filtered: pd.DataFrame

    def __init__(self, df: pd.DataFrame):
        """
        Build the normalized, filtered frame that later counts run against.

        Args:
            df (pd.DataFrame): Wikipedia page data with 'Page Title' and
                'Page References' columns. The frame itself is not modified;
                the Counter works on its own copy.
        """
        print("[INFO] Applying modifiers and filters to DataFrame")

        def lowercase_all(refs):
            return [ref.lower() for ref in refs]

        # Keep only rows whose 'Page References' is present and non-empty
        references = df["Page References"]
        has_references = references.notna() & (references.str.len() > 0)
        filtered = df.loc[has_references].copy()

        # Normalize case so titles and references can be compared directly
        filtered["Page Title"] = filtered["Page Title"].str.lower()
        filtered["Page References"] = filtered["Page References"].apply(lowercase_all)

        self.df_filtered = filtered

    def count(self, page_title: str, first_ref: bool):
        """Run the module-level `count` on the filtered frame and return its result."""
        return count(self.df_filtered, page_title, first_ref)


def read_parquet(wikinamedate: str) -> pd.DataFrame:
    """
    Reads a processed Parquet file containing Wikipedia page data and returns it as a DataFrame.

    The file is looked up at '../output/<wikinamedate>/processed.parquet',
    with every '/' in `wikinamedate` replaced by '-'.

    Args:
        wikinamedate (str): The date and name string used to locate the Parquet file.

    Returns:
        pd.DataFrame: The DataFrame containing the processed page data.
    """

    # Compute the directory name once; it is used for both the log line and the path
    dump_dir = wikinamedate.replace("/", "-")

    print(f"[INFO] Reading '{dump_dir}/processed.parquet'")
    return pd.read_parquet(join("../output/", dump_dir, "processed.parquet"))


def count(df: pd.DataFrame, page_title: str, first_ref: bool) -> None:
    """
    Print how many pages lead to `page_title`, grouped by degree of reference.

    Starting from `page_title`, each step collects the pages whose
    'Page References' contain at least one page found in the previous step.
    Pages already found in an earlier step are not counted again. The walk
    stops when a step finds no new page.

    Args:
        df (pd.DataFrame): The DataFrame containing Wikipedia page data. It must
            have 'Page Title' and 'Page References' columns and is not modified.
            Titles and references are compared exactly as stored; only
            `page_title` is lowercased here.
        page_title (str): The title of the page to start counting references from.
        first_ref (bool): Whether to only consider the first reference in each list of references.

    Returns:
        None: This function prints the degree series, mean distance, total
        nodes, diameter and the `first_ref` flag, and does not return any value.
    """

    titles = df["Page Title"]
    references = df["Page References"]

    # If first_ref is True, only consider the first reference in each list.
    # Slicing (instead of refs[0]) keeps an empty list empty rather than
    # raising IndexError, and working on the column alone avoids copying df.
    if first_ref:
        references = references.apply(lambda refs: list(refs[:1]))

    # Initialize the sets for processed pages and pages to process
    # NOTE(review): the start page is not marked as processed, so it is counted
    # like any other page if some chain of references leads back to it.
    processed_pages: Set[str] = set()
    pages_to_process: Set[str] = {page_title.lower()}
    counts: List[int] = []

    # Process pages in degrees of reference
    while pages_to_process:
        degree = len(counts) + 1

        print(
            f"Checking {degree}{get_ordinal_suffix(degree)} degree of distance",
            end="\r",
        )

        # Mark all rows where any reference is in pages_to_process.
        # astype(bool) keeps the mask boolean even when the frame has no rows.
        is_referrer = references.apply(
            lambda refs: not pages_to_process.isdisjoint(refs)
        ).astype(bool)

        # Pages found for the first time at this degree
        new_pages = set(titles[is_referrer]) - processed_pages

        # Add current batch count to the counter
        counts.append(len(new_pages))

        # Update processed pages
        processed_pages.update(new_pages)

        # Only the newly found pages need to be expanded in the next iteration
        pages_to_process = new_pages

    total_found = sum(counts)

    # With no referrers at all counts is [0]; skip the mean to avoid dividing by zero
    mean_distance = weighted_mean(counts) if total_found else 0.0

    total_pages = titles.count()
    share = round(100 * total_found / total_pages, 2) if total_pages else 0.0

    print(f"Degree Series: {counts}")
    print(f"Mean Distance: {mean_distance} nodes")
    print(f"Total Nodes: {total_found} ({share}%)")
    # NOTE(review): counts always ends with the 0 of the step that found nothing,
    # so this is one more than the farthest degree that actually has pages.
    print(f"Diameter: {len(counts)} nodes")
    print(f"First Reference: {first_ref}")


In [2]:
%%time
# Restore `wikinamedate`, which was saved with %store in a different Jupyter notebook
%store -r wikinamedate

# Read the processed Parquet file and build the Counter, which keeps a
# lowercased copy of the rows that have at least one page reference
df = read_parquet(wikinamedate)
counter = Counter(df)
# Counter works on its own copy of the data, so the raw frame is not used by
# it afterwards; uncomment the next line to drop this reference to it:
# del df
print()


[INFO] Reading 'ptwiki-20240720/processed.parquet'
[INFO] Applying modifiers and filters to DataFrame

CPU times: user 53.3 s, sys: 4.74 s, total: 58.1 s
Wall time: 53.7 s


In [3]:
%%time
# NOTE: %%time measures the whole cell, so the reported wall time also
# includes the time spent waiting at the input() prompt.

page_title = input('Type a Page Title:')
print()

# Follow only the first reference of each page (first_ref=True)
counter.count(page_title, first_ref=True)
print()


Type a Page Title: Filosofia



Degree Series: [386, 1102, 1090, 3039, 6954, 9894, 46271, 19748, 20979, 15232, 27173, 27683, 24657, 17524, 12703, 9222, 6269, 4558, 4019, 1777, 965, 418, 207, 82, 16, 1, 0]
Mean Distance: 10.72 nodes
Total Nodes: 261969 (13.75%)
Diameter: 27 nodes

CPU times: user 1min 18s, sys: 366 ms, total: 1min 19s
Wall time: 1min 21s
