# WIKI-SHEAF

@ Alessio Borgi
@ Stefano Previti

### 0: IMPORTING LIBRARIES

In [9]:
!pip install requests wikipedia-api wptools

Collecting wptools
  Downloading wptools-0.4.17-py2.py3-none-any.whl.metadata (14 kB)
Collecting html2text (from wptools)
  Downloading html2text-2024.2.26.tar.gz (56 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.5/56.5 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pycurl (from wptools)
  Downloading pycurl-7.45.6-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.6 kB)
Downloading wptools-0.4.17-py2.py3-none-any.whl (38 kB)
Downloading pycurl-7.45.6-cp311-cp311-manylinux_2_28_x86_64.whl (4.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m37.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: html2text
  Building wheel for html2text (setup.py) ... [?25l[?25hdone
  Created wheel for html2text: filename=html2text-2024.2.26-py3-none-any.whl size=33111 sha256=4817ce6c31f294bf50c573db3b036cc5f22af02557f22ef1498c1f8d2eee87d1
  Stored

In [None]:
import requests
import json
import pprint
import wikipediaapi  # Library to interact with Wikipedia easily
import wptools      # Library to extract additional Wikipedia data
from urllib.parse import urlparse, unquote

### 1: GATHERING NECESSARY INFORMATION

In [14]:
def get_wikipedia_data(page_url):
    '''
    Function to extract data from an English Wikipedia page.
    It retrieves:
      - Page title
      - Summary
      - Categories
      - Infobox data
      - Internal links
      - External links
      - References and citations
      - Geographical coordinates
      - List of headings (titles and subtitles)
    '''

    # 1) Title extraction.
    parsed_url = urlparse(page_url)
    title = unquote(parsed_url.path.split("/")[-1])

    # 2) Wikipedia API object for English Wikipedia.
    page = wikipediaapi.Wikipedia(
        user_agent="WikiSheaf/1.0",
        language='en'
    ).page(title)

    if not page.exists():
        return None

    # 3) Extracted data Dictionary creation.
    data = {}

    # 4) Title, Summary and Categories Inclusion in the Dictionary.
    data['title'] = page.title
    data['summary'] = page.summary
    data['categories'] = list(page.categories.keys())

    # 5) Infobox, internal & external links, references & citations, geo info
    #    and headings additional information.
    try:
        # Create a wptools page instance for the same title.
        wp_page = wptools.page(title, lang="en")
        # Fetch parsed data from Wikipedia.
        wp_page.get_parse()
        extra = wp_page.data

        # Extract infobox data if available.
        data["infobox"] = extra.get("infobox", None)

        # Extract internal links (links to other Wikipedia pages).
        data["internal_links"] = extra.get("links", None)

        # Extract external links from the page.
        data["external_links"] = extra.get("externallinks", None)

        # Extract references and citations.
        data["references"] = extra.get("references", None)

        # Extract geographical coordinates, if present.
        data["geographical_coordinates"] = extra.get("coordinates", None)

        # Extract the list of headings (titles and subtitles).
        data["headings"] = extra.get("sections", None)

    except Exception as e:
        print(f"Error fetching extra data from wptools for page '{title}':", e)
        data["infobox"] = None
        data["internal_links"] = None
        data["external_links"] = None
        data["references"] = None
        data["geographical_coordinates"] = None
        data["headings"] = None

    return data

# Example usage
if __name__ == "__main__":
    # Example Wikipedia URL.
    wikipedia_url = "https://en.wikipedia.org/wiki/Pizza"
    wiki_data = get_wikipedia_data(wikipedia_url)
    pprint.pprint(wiki_data)


en.wikipedia.org (parse) Pizza


{'categories': ['Category:All Wikipedia articles written in American English',
                'Category:All articles with vague or ambiguous time',
                'Category:Articles with short description',
                'Category:CS1 Greek-language sources (el)',
                'Category:CS1 Italian-language sources (it)',
                'Category:CS1 Spanish-language sources (es)',
                'Category:CS1 Swedish-language sources (sv)',
                'Category:CS1 uses Greek-language script (el)',
                'Category:Cheese dishes',
                'Category:Commons category link is on Wikidata',
                'Category:Convenience foods',
                'Category:Flatbread dishes',
                'Category:Food combinations',
                'Category:Italian cuisine',
                'Category:Italian inventions',
                'Category:Mediterranean cuisine',
                'Category:National dishes',
                'Category:Neapolitan cuisine',
     

en.wikipedia.org (imageinfo) File:Pizza-3007395.jpg
Pizza (en) data
{
  image: <list(1)> {'kind': 'parse-image', 'file': 'File:Pizza-300...
  infobox: <dict(12)> name, image, image_size, caption, country, r...
  iwlinks: <list(6)> https://commons.wikimedia.org/wiki/Category:P...
  pageid: 24768
  parsetree: <str(77266)> <root><template><title>Short description...
  requests: <list(2)> parse, imageinfo
  title: Pizza
  wikibase: Q177
  wikidata_url: https://www.wikidata.org/wiki/Q177
  wikitext: <str(61267)> {{Short description|Italian dish with a f...
}


In [6]:




# ------------------------------------------------------------------------------
# Function to fetch data from Wikidata using its API.
# Given a Wikidata ID, this function retrieves a set of properties, including:
#   - English label and description.
#   - "Instance of" (P31) information.
#   - Inception date (P571).
#   - Image (P18).
#   - Country (P17).
#   - Official website (P856).
#   - And the English Wikipedia URL from the "sitelinks".
# ------------------------------------------------------------------------------
def get_wikidata_data(wikidata_id):
    URL = f"https://www.wikidata.org/wiki/Special:EntityData/{wikidata_id}.json"
    try:
        response = requests.get(URL)
        data = response.json()
        entity = data.get("entities", {}).get(wikidata_id, {})
        wikidata_info = {}
        wikidata_info["id"] = wikidata_id
        # Retrieve the English label and description.
        wikidata_info["label"] = entity.get("labels", {}).get("en", {}).get("value", None)
        wikidata_info["description"] = entity.get("descriptions", {}).get("en", {}).get("value", None)

        # Extract claims: a dictionary of properties.
        claims = entity.get("claims", {})

        # Instance of (P31): may include multiple values.
        if "P31" in claims:
            wikidata_info["instance_of"] = [
                claim["mainsnak"]["datavalue"]["value"]["id"]
                for claim in claims["P31"]
                if "mainsnak" in claim and "datavalue" in claim["mainsnak"]
            ]
        else:
            wikidata_info["instance_of"] = None

        # Inception (P571): usually a time or date value.
        if "P571" in claims:
            wikidata_info["inception"] = [
                claim["mainsnak"]["datavalue"]["value"]["time"]
                for claim in claims["P571"]
                if "mainsnak" in claim and "datavalue" in claim["mainsnak"]
            ]
        else:
            wikidata_info["inception"] = None

        # Image (P18): the file name of the image.
        if "P18" in claims:
            wikidata_info["image"] = [
                claim["mainsnak"]["datavalue"]["value"]
                for claim in claims["P18"]
                if "mainsnak" in claim and "datavalue" in claim["mainsnak"]
            ]
        else:
            wikidata_info["image"] = None

        # Country (P17): the country associated with the item.
        if "P17" in claims:
            wikidata_info["country"] = [
                claim["mainsnak"]["datavalue"]["value"]["id"]
                for claim in claims["P17"]
                if "mainsnak" in claim and "datavalue" in claim["mainsnak"]
            ]
        else:
            wikidata_info["country"] = None

        # Official website (P856): the URL of the official website.
        if "P856" in claims:
            wikidata_info["official_website"] = [
                claim["mainsnak"]["datavalue"]["value"]
                for claim in claims["P856"]
                if "mainsnak" in claim and "datavalue" in claim["mainsnak"]
            ]
        else:
            wikidata_info["official_website"] = None

        # Extract the English Wikipedia URL from sitelinks.
        if "sitelinks" in entity and "enwiki" in entity["sitelinks"]:
            wikidata_info["wikipedia_url"] = entity["sitelinks"]["enwiki"].get("url", None)
        else:
            wikidata_info["wikipedia_url"] = None

        return wikidata_info
    except Exception as e:
        print("Error fetching Wikidata data for", wikidata_id, ":", e)
    return None

# ------------------------------------------------------------------------------
# Helper function to parse the Wikidata ID from a given Wikidata URL.
# For example, given 'https://www.wikidata.org/wiki/Q177', it returns 'Q177'.
# ------------------------------------------------------------------------------
def parse_wikidata_id(wikidata_url):
    parsed_url = urlparse(wikidata_url)
    # Extract and return the last segment of the path.
    return unquote(parsed_url.path.split("/")[-1])

# ------------------------------------------------------------------------------
# Main function to iterate over the dataset and aggregate data from both Wikidata
# and Wikipedia for each item.
# ------------------------------------------------------------------------------
def gather_data(dataset):
    aggregated_data = []

    # Iterate over each entry in the dataset.
    for item in dataset:
        entry = {}
        # Each dataset entry has a Wikidata URL (under the "item" key),
        # along with a type and a category.
        wikidata_url = item.get("item")
        entry["original_item"] = item

        # Parse the Wikidata ID from the URL.
        wikidata_id = parse_wikidata_id(wikidata_url)
        entry["wikidata_id"] = wikidata_id

        # Fetch Wikidata data.
        wikidata_data = get_wikidata_data(wikidata_id)
        if wikidata_data is None:
            print(f"Wikidata data not found for {wikidata_url}")
            wikidata_data = {}
        entry["wikidata_data"] = wikidata_data

        # Retrieve the English Wikipedia URL from the Wikidata data.
        wikipedia_url = wikidata_data.get("wikipedia_url") if wikidata_data else None
        entry["wikipedia_url"] = wikipedia_url

        # Fetch Wikipedia data if a valid URL is present.
        if wikipedia_url:
            wiki_data = get_wikipedia_data(wikipedia_url)
            if wiki_data is None:
                print(f"Wikipedia page not found for {wikipedia_url}")
                wiki_data = {}
        else:
            wiki_data = {}
        entry["wikipedia_data"] = wiki_data

        aggregated_data.append(entry)

    return aggregated_data

# ------------------------------------------------------------------------------
# If run as the main script, create a fake dataset and gather data.
# ------------------------------------------------------------------------------
if __name__ == "__main__":
    # Create a fake dataset with Wikidata links.
    dataset = [
        {"item": "https://www.wikidata.org/wiki/Q177", "type": "Concept", "category": "Food"},
        {"item": "https://www.wikidata.org/wiki/Q12345", "type": "Concept", "category": "Music"},
        {"item": "https://www.wikidata.org/wiki/Q12418", "type": "Named Entity", "category": "Visual Arts"}
    ]

    # Gather data from both Wikidata and Wikipedia.
    aggregated_data = gather_data(dataset)

    # Pretty-print the aggregated data to view the results.
    pprint.pprint(aggregated_data)[0]




[{'original_item': {'category': 'Food',
                    'item': 'https://www.wikidata.org/wiki/Q177',
                    'type': 'Concept'},
  'wikidata_data': {'country': None,
                    'description': 'Italian universal popular dish with a flat '
                                   'dough-based base and toppings',
                    'id': 'Q177',
                    'image': ['Eq it-na pizza-margherita sep2005 sml.jpg'],
                    'inception': None,
                    'instance_of': ['Q19861951'],
                    'label': 'pizza',
                    'official_website': None,
                    'wikipedia_url': 'https://en.wikipedia.org/wiki/Pizza'},
  'wikidata_id': 'Q177',
  'wikipedia_data': {'categories': ['Category:All Wikipedia articles written '
                                    'in American English',
                                    'Category:All articles with vague or '
                                    'ambiguous time',
                

TypeError: 'NoneType' object is not subscriptable

### 2: BUILDING THE GRAPHS

### 3: HETEROPHILY INDEX

### 4: SHEAF NEURAL NETWORK