In [None]:
#requirements
!pip install wikidata
!pip install wikipedia
!pip install datasets
!pip install -q pandas beautifulsoup4
!pip install -q pandas requests

Collecting wikidata
  Downloading Wikidata-0.8.1-py3-none-any.whl.metadata (3.0 kB)
Downloading Wikidata-0.8.1-py3-none-any.whl (29 kB)
Installing collected packages: wikidata
Successfully installed wikidata-0.8.1
Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11678 sha256=036c2b66ed63a05e14782ab98395612740718f4320959e3ffa3c93cdf5491d35
  Stored in directory: /root/.cache/pip/wheels/8f/ab/cb/45ccc40522d3a1c41e1d2ad53b8f33a62f394011ec38cd71c6
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0
Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (1

In [None]:
#imports
import pandas as pd
import requests
import time
import re
import random
from datetime import datetime, timedelta
from urllib.parse import quote
from wikidata.client import Client
from bs4 import BeautifulSoup
from google.colab import files
from tqdm import tqdm

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Feature extraction and dataset enrichment

Numerical information

In [None]:
def get_num_properties(qid):
    """Return the number of distinct properties (claims) of a Wikidata entity.

    Calls the `wbgetentities` action of the API at WIKIDATA_API and counts the
    keys of the entity's "claims" mapping.

    Args:
        qid: Wikidata identifier such as "Q42".

    Returns:
        The number of claim properties, or 0 when `qid` is empty or the
        request / JSON decoding fails.
    """
    if not qid:
        return 0
    try:
        res = requests.get(
            WIKIDATA_API,
            params={"action": "wbgetentities", "ids": qid, "format": "json"},
            headers=HEADERS,  # identify the client, like the other helpers do
            timeout=10,
        )
        claims = res.json().get("entities", {}).get(qid, {}).get("claims", {})
        return len(claims)
    except Exception:
        # Best effort: any failure counts as "no properties". Unlike a bare
        # `except:`, this still lets KeyboardInterrupt stop a long run.
        return 0

def get_num_languages(qid):
    """Return the number of sitelinks recorded for a Wikidata entity.

    The entity is loaded through the module-level `client` and the length of
    its `sitelinks` mapping is returned.
    NOTE(review): the count covers every sitelink in the entity data, which
    presumably includes non-Wikipedia projects as well — confirm this is the
    intended "number of languages".

    Args:
        qid: Wikidata identifier such as "Q42".

    Returns:
        The sitelink count, or 0 when `qid` is empty or loading fails.
    """
    if not qid:
        return 0
    try:
        entity = client.get(qid, load=True)
        return len(entity.data.get('sitelinks', {}))
    except Exception:
        # Best effort; `except Exception` keeps Ctrl-C working during long runs.
        return 0

def get_en_wikipedia_title(qid):
    """Return the title of the English Wikipedia page linked to a Wikidata entity.

    Reads the entity JSON from Special:EntityData and returns the title stored
    under the `enwiki` sitelink.

    Args:
        qid: Wikidata identifier such as "Q42".

    Returns:
        The page title, or None when `qid` is empty, the entity has no
        `enwiki` sitelink, or the request fails.
    """
    if not qid:
        return None
    try:
        r = requests.get(
            f'https://www.wikidata.org/wiki/Special:EntityData/{qid}.json',
            headers=HEADERS,
            timeout=10,  # without a timeout one stalled request blocks the whole run
        )
        r.raise_for_status()
        return r.json()['entities'][qid]['sitelinks']['enwiki']['title']
    except Exception:
        # Best effort; `except Exception` keeps Ctrl-C working during long runs.
        return None

def get_average_daily_views(title):
    """Return the average daily page views of an English Wikipedia article.

    Queries the Wikimedia pageviews REST endpoint for the window that ends
    yesterday and starts 30 days before that, then returns the floor of the
    mean of the daily view counts.

    Args:
        title: Article title as stored in the `enwiki` sitelink.

    Returns:
        The integer average, or None when `title` is empty, no data points
        are returned, or the request fails.
    """
    if not title:
        return None
    try:
        end = datetime.today() - timedelta(days=1)
        start = end - timedelta(days=30)
        # The title is a single path segment: spaces become underscores and
        # every reserved character (including "/") must be percent-encoded,
        # otherwise titles such as "AC/DC" address a different route.
        article = quote(title.replace(' ', '_'), safe='')
        url = (
            'https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/'
            f'en.wikipedia.org/all-access/all-agents/{article}/daily/{start:%Y%m%d}/{end:%Y%m%d}'
        )
        r = requests.get(url, headers=HEADERS, timeout=10)
        r.raise_for_status()
        views = [d['views'] for d in r.json().get('items', [])]
        return sum(views) // len(views) if views else None
    except Exception:
        # Best effort; `except Exception` keeps Ctrl-C working during long runs.
        return None

def get_english_wikipedia_url(qid):
    """Return the URL of the English Wikipedia page linked to a Wikidata entity.

    Reads the entity JSON from Special:EntityData and returns the `url` field
    of the `enwiki` sitelink.

    Args:
        qid: Wikidata identifier such as "Q42".

    Returns:
        The URL, or None when `qid` is empty, there is no `enwiki` sitelink,
        or the request fails (the failure is printed).
    """
    if not qid:
        return None
    try:
        url = f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json"
        # Bounded wait: without a timeout one stalled request blocks the whole run.
        r = requests.get(url, headers=HEADERS, timeout=10)
        r.raise_for_status()
        data = r.json()
        sitelinks = data['entities'][qid].get('sitelinks', {})
        enwiki = sitelinks.get('enwiki', {})
        return enwiki.get('url')
    except Exception as e:
        print(f"[URL_API] Error in taking the url for {qid}: {e}")
        return None

def get_word_count_from_wikipedia(url, idx=None):
    """Count the whitespace-separated words in the body of a Wikipedia page.

    The page is downloaded, parsed with BeautifulSoup, and the text of the
    `div#mw-content-text` element is split on whitespace.

    Args:
        url: Address of the page to download.
        idx: Optional row identifier, used only as a prefix in log messages.

    Returns:
        The word count, or 0 when the URL is empty, the request does not
        answer with status 200, the page is missing, the content container is
        absent, or any exception occurs. Every such case is printed.
    """
    try:
        if not url:
            print(f"[{idx}] URL is None")
            return 0

        page = requests.get(url, headers=HEADERS, timeout=10)
        if page.status_code != 200:
            print(f"[{idx}] Request failed with status {page.status_code} → {url}")
            return 0

        parsed = BeautifulSoup(page.text, 'html.parser')

        # Text looked for to detect a missing article.
        if "Wikipedia does not have an article with this exact name" in parsed.text:
            print(f"[{idx}] Wikipedia page not found → {url}")
            return 0

        body = parsed.find('div', {'id': 'mw-content-text'})
        if not body:
            print(f"[{idx}] No content found in div#mw-content-text → {url}")
            return 0

        words = body.get_text(" ", strip=True).split()
        total = len(words)
        if total == 0:
            print(f"[{idx}] Content found but word count is 0 → {url}")
        return total

    except Exception as err:
        print(f"[{idx}] Exception in get_word_count_from_wikipedia: {err} → {url}")
        return 0



Word data

In [None]:
def get_label(entity, pid):
    """Return the English label(s) of the values of one property of an entity.

    The previous implementation read `entity[prop]`, which for a Wikidata
    client entity yields a single value, so its "multiple values" branch was
    never taken. All values are now read with `getlist` and joined.

    Args:
        entity: Entity object obtained from the module-level `client`.
        pid: Property identifier such as "P31".

    Returns:
        The English labels joined with ", " ('null' stands in for a value
        without an English label), or 'null' when the property is absent or
        anything fails.
    """
    try:
        prop = client.get(pid)  # the property is itself an entity
        if hasattr(entity, 'getlist'):
            values = list(entity.getlist(prop))
        else:
            # Fallback for mapping-like objects without `getlist`.
            value = entity[prop]
            values = value if isinstance(value, list) else [value]
        if not values:
            return 'null'
        return ', '.join(v.label.get('en', 'null') for v in values)
    except Exception:
        return 'null'


def get_entity_data(qid):
    """Collect the selected textual properties of a Wikidata entity.

    Args:
        qid: Wikidata identifier such as "Q42".

    Returns:
        A dict with the keys 'instance_of' (P31), 'country' (P17),
        'country_of_origin' (P495) and 'English_description'. Every value is
        'null' when loading the entity or reading its data raises.
    """
    labelled_properties = (
        ('instance_of', 'P31'),
        ('country', 'P17'),
        ('country_of_origin', 'P495'),
    )
    try:
        item = client.get(qid, load=True)
        record = {column: get_label(item, pid) for column, pid in labelled_properties}
        record['English_description'] = item.description.get('en', 'null')
        return record
    except Exception:
        columns = [column for column, _ in labelled_properties] + ['English_description']
        return dict.fromkeys(columns, 'null')


def clean_categories(categories_string):
    """Lowercase a ', '-separated category list and drop maintenance entries.

    Wikipedia maintenance categories (those starting with "articles" or
    "all articles" after lowercasing) carry no information about the entity,
    so they are removed.

    Args:
        categories_string: Category names joined with ', '.

    Returns:
        The remaining category names, lowercased and joined with ', '.
    """
    maintenance_prefixes = ('articles', 'all articles')
    lowered = (name.lower() for name in categories_string.split(', '))
    return ', '.join(name for name in lowered if not name.startswith(maintenance_prefixes))

def get_categories_from_wikipedia(entity):
    """Fetch the visible categories of an entity's English Wikipedia page.

    The page title is taken from the entity's `enwiki` sitelink and the
    categories are requested from the MediaWiki Action API
    (`prop=categories`, hidden categories excluded).

    Args:
        entity: Entity object exposing a `data` mapping with 'sitelinks'.

    Returns:
        - the cleaned categories joined with ", " (may be an empty string
          when the page has none);
        - 'RATE_LIMITED' when the API answers HTTP 429, so the caller can
          pause the extraction;
        - 'null' when the entity is unusable, has no `enwiki` sitelink, the
          API answers with another error status, or an exception occurs.
    """
    try:
        # Reject entities that cannot provide sitelinks at all.
        if not entity or not hasattr(entity, 'data') or entity.data is None:
            print("Empty entity or without data attribute")
            return 'null'

        sitelinks = entity.data.get('sitelinks')
        site_link = sitelinks.get('enwiki') if isinstance(sitelinks, dict) else None
        if not site_link:
            print("No enwiki link for this entity")
            return 'null'

        title = site_link['title'].replace(' ', '_')
        # Pass the query through `params` so requests percent-encodes the
        # title: characters such as "+" or "&" in a title would otherwise
        # corrupt a hand-built query string.
        r = requests.get(
            "https://en.wikipedia.org/w/api.php",
            params={
                'action': 'query',
                'format': 'json',
                'prop': 'categories',
                'clshow': '!hidden',  # visible categories only
                'cllimit': 'max',     # the default limit returns only the first few
                'titles': title,
            },
            headers={'User-Agent': 'MyWikipediaBot/1.0 (alessiass46@gmail.com)'},
            timeout=10,
        )

        if r.status_code == 429:  # too many requests
            print("Wikipedia block. Wait an hour at least")
            return 'RATE_LIMITED'
        elif r.status_code != 200:
            print(f"Error {r.status_code} on {title}")
            return 'null'

        # Response shape: {"query": {"pages": {<id>: {"categories": [...]}}}}
        pages = r.json()['query']['pages']
        categories = []
        for page in pages.values():
            for cat in page.get('categories', []):
                categories.append(cat['title'].replace('Category:', ''))
        return clean_categories(', '.join(categories))

    except Exception as e:
        print(f"Error in get_categories_from_wikipedia: {e}")
        return 'null'

def get_categories_from_wikipedia_scrape(qid):
    """Scrape the visible categories from an entity's English Wikipedia page.

    Used as a fallback when the API lookup yields nothing. Categories are read
    from the `div#mw-normal-catlinks` element, lowercased so they match the
    output of the API-based path, and maintenance categories (starting with
    "articles" / "all articles") are dropped.

    Args:
        qid: Wikidata identifier such as "Q42".

    Returns:
        The categories joined with ", ", or 'null' when the page or its
        category box cannot be obtained, no category remains, or an exception
        occurs.
    """
    try:
        url = get_english_wikipedia_url(qid) if qid else None
        if not url:
            print(f"[SCRAPE] No enwiki URL for {qid}")
            return 'null'

        # Bounded wait: without a timeout one stalled request blocks the whole run.
        r = requests.get(url, headers=HEADERS, timeout=10)
        if r.status_code != 200:
            print(f"[SCRAPE] HTTP {r.status_code} for {url}")
            return 'null'

        soup = BeautifulSoup(r.text, 'html.parser')
        cat_div = soup.find('div', id='mw-normal-catlinks')
        if not cat_div:
            print(f"[SCRAPE] No category div in {url}")
            return 'null'

        categories = []
        for li in cat_div.find_all('li'):
            # Lowercase so both extraction paths fill the column in the same form.
            cat = li.get_text(strip=True).lower()
            if not cat.startswith(('articles', 'all articles')):
                categories.append(cat)

        return ', '.join(categories) if categories else 'null'
    except Exception as e:
        print(f"[SCRAPE] Error for {qid}: {e}")
        return 'null'

def get_categories_from_wikipedia_combined(entity, qid):
    """Return the Wikipedia categories of an entity, API first and scraping second.

    Args:
        entity: Entity object handed to the API-based lookup.
        qid: Wikidata identifier handed to the scraping fallback.

    Returns:
        'RATE_LIMITED' when the API lookup reports a rate limit, the API
        result when it is usable, otherwise the result of the scraping
        fallback; 'null' when an exception occurs.
    """
    unusable = ['null', '', None]
    try:
        from_api = get_categories_from_wikipedia(entity)
        if from_api == 'RATE_LIMITED':
            # Hand the signal to the caller so it can pause the extraction.
            return 'RATE_LIMITED'
        if from_api in unusable:
            print(f"[{qid}] API failed, trying scraping...")
            return get_categories_from_wikipedia_scrape(qid)
        return from_api
    except Exception as err:
        print(f"[COMBINED] Error for {qid}: {err}")
        return 'null'

# Train set

In [None]:
# Load the training split; the cells below add the enrichment columns to this frame.
df = pd.read_csv("dataset_train.csv")

In [None]:
#INITIAL CONFIGURATION
# Module-level names read by the helper functions defined earlier
# (WIKIDATA_API, HEADERS, client) and by the extraction loop below.
WIKIDATA_API = "https://www.wikidata.org/w/api.php"  # endpoint used by get_num_properties
HEADERS = {'User-Agent': 'ColabPageViewsScript/1.0 (alessiass46@gmail.com)'}  # sent with the HTTP requests
CHECKPOINT = 'checkpoint.txt'  # file holding the index of the next row to process
BATCH_SIZE = 300  # rows processed between two saves of the CSV and the checkpoint
DATASET_PATH = "dataset_train.csv"  # NOTE(review): not referenced by the visible cells
SAVE_PATH = "dataset_train.csv"  # same file as the input, so every save overwrites it
client = Client()  # wikidata client shared by the helper functions

# EXTRACT QID
def extract_qid(url):
    """Extract the Wikidata identifier (e.g. "Q42") from an entity URL.

    Args:
        url: Entity URL such as "http://www.wikidata.org/entity/Q42".

    Returns:
        The first "Q<digits>" match, or None when there is no match or `url`
        is not a string (e.g. a NaN read from an empty CSV cell).
    """
    if not isinstance(url, str):
        # re.search raises TypeError on non-strings such as NaN.
        return None
    match = re.search(r'Q\d+', url)
    return match.group(0) if match else None

# Derive the QID of every row from its 'item' URL; the extraction loop keys on this column.
df['wikidata_id'] = df['item'].apply(extract_qid)

Data extraction

In [None]:
# Make sure the categories column exists before individual rows are filled in.
if 'wikipedia_categories' not in df.columns:
    df['wikipedia_categories'] = ''

# Resume from the row index stored by a previous run. A missing checkpoint means
# a fresh start; an empty or corrupt one is reported and also restarts from row 0
# instead of aborting the cell with a ValueError.
try:
    with open(CHECKPOINT, 'r') as f:
        start_idx = int(f.read().strip())
except FileNotFoundError:
    start_idx = 0
except ValueError:
    print(f"Unreadable checkpoint in {CHECKPOINT}, restarting from row 0")
    start_idx = 0

while start_idx < len(df):
    end_idx = min(start_idx + BATCH_SIZE, len(df))  # exclusive end of this batch
    print(f"\nCompute categories from row {start_idx} to {end_idx - 1}\n")
    got_rate_limited = False  # set when the categories lookup reports a rate limit

    for idx in range(start_idx, end_idx):
        qid = df.at[idx, 'wikidata_id']
        if not isinstance(qid, str):
            continue  # no QID could be extracted for this row
        try:
            #GET PROPERTIES
            entity = client.get(qid, load=True)
            props = get_entity_data(qid)
            for k, v in props.items():
                df.at[idx, k] = v

            #GET NUMERICAL DATA
            df.at[idx, "num_properties"] = get_num_properties(qid)
            df.at[idx, "num_languages"] = get_num_languages(qid)
            title = get_en_wikipedia_title(qid)
            df.at[idx, "wikipedia_title"] = title
            df.at[idx, "average_daily_views"] = get_average_daily_views(title)
            url = get_english_wikipedia_url(qid)
            # Pass the row index so the helper's log lines name the row
            # instead of printing "[None]".
            df.at[idx, "word_count_en"] = get_word_count_from_wikipedia(url, idx)

            #GET CATEGORIES
            categories = get_categories_from_wikipedia_combined(entity, qid)
            if categories == 'RATE_LIMITED':
                got_rate_limited = True
                break  # leave the batch; this row is retried after the pause
            df.at[idx, 'wikipedia_categories'] = categories

            print(f"[{idx}] {qid} → {title or 'no title'} | {categories[:60]}...")
        except Exception as e:
            df.at[idx, 'wikipedia_categories'] = 'null'
            print(f"[{idx}] {qid} → Error: {e}")
        time.sleep(1)  # pause between rows to limit the request rate

    # Row to resume from: the interrupted row after a rate limit, else the next batch.
    next_idx = idx if got_rate_limited else end_idx

    # Save updated file
    df.to_csv(SAVE_PATH, index=False)

    # Update checkpoint
    with open(CHECKPOINT, 'w') as f:
        f.write(str(next_idx))

    print(f"Saved up to row {idx if got_rate_limited else end_idx - 1}")
    if got_rate_limited:
        print("Wikipedia block. Wait 10 minutes")
        time.sleep(600)

    start_idx = next_idx

print("\n Extraction finished ( ͡° ͜ʖ ͡°)")

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
[1510] Q31273854 → Dataism | big data, data, digital revolution, philosophical schools an...
[1511] Q44158 → Dave Bautista | 1969 births, 20th-century american sportsmen, 21st-century a...
[1512] Q183337 → David Attenborough | 1926 births, 20th-century british biologists, 20th-century b...
[1513] Q192 → David Cameron | 1966 births, 20th-century anglicans, 21st-century anglicans,...
[1514] Q1628 → David Cecil, 6th Marquess of Exeter | 1905 births, 1981 deaths, 20th-century english sportsmen, al...
[1515] Q65468 → David Chytraeus | 1530 births, 1600 deaths, 16th-century german protestant the...
[1516] Q2071 → David Lynch | 1946 births, 2025 deaths, 20th-century american male musicia...
[1517] Q3018644 → David Production | 2014 mergers and acquisitions, animation studios in tokyo, d...
[1518] Q14715 → Davor Slamnig | 1956 births, croatian male short story writers, croatian mal...
[Q154005] API failed, trying scraping...
[1519

In [None]:
# Preview the first rows to check the enrichment columns.
df.head()

Unnamed: 0,item,name,description,type,category,subcategory,label,wikidata_id,wikipedia_categories,instance_of,country,country_of_origin,English_description,num_properties,num_languages,wikipedia_title,average_daily_views,word_count_en
0,http://www.wikidata.org/entity/Q32786,916,2012 film by M. Mohanan,entity,films,film,cultural exclusive,Q32786,"2010s malayalam-language films, 2012 drama fil...",film,,India,2012 film by M. Mohanan,26.0,6.0,916 (film),69.0,507.0
1,http://www.wikidata.org/entity/Q371,!!!,American dance-punk band from California,entity,music,musical group,cultural representative,Q371,"!!!, 1996 establishments in california, americ...",musical group,,United States,American dance-punk band from California,59.0,30.0,!!!,191.0,1745.0
2,http://www.wikidata.org/entity/Q3729947,¡Soborno!,Mort & Phil comic,entity,comics and anime,comics,cultural representative,Q3729947,"1977 graphic novels, 1977 in comics, mort & ph...",comics,,Spain,Mort & Phil comic,11.0,4.0,¡Soborno!,13.0,726.0
3,http://www.wikidata.org/entity/Q158611,+44,American band,entity,music,musical group,cultural representative,Q158611,"Alternative rock groups from California, Ameri...",musical group,,United States,American band,33.0,38.0,+44 (band),479.0,3743.0
4,http://www.wikidata.org/entity/Q280375,1 Monk Street,"building in Monmouth, Wales",entity,architecture,building,cultural exclusive,Q280375,"buildings and structures in monmouth, wales, g...",building,United Kingdom,,"building in Monmouth, Wales",15.0,2.0,"1 Monk Street, Monmouth",16.0,930.0


In [None]:
# Write the enriched frame to CSV and hand it to google.colab's `files.download`.
# NOTE(review): `df` was loaded from dataset_train.csv in this section, yet the
# export is named "dataset_test_enrich.csv" — confirm whether
# "dataset_train_enrich.csv" was intended.
df.to_csv("dataset_test_enrich.csv", index=False)
files.download('dataset_test_enrich.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Validation set

In [None]:
# Load the validation split; this rebinds `df`, replacing the frame used for the train set.
df = pd.read_csv("dataset_validation.csv")

In [None]:
#INITIAL CONFIGURATION
# Rebinds the module-level names read by the helper functions
# (WIKIDATA_API, HEADERS, client) and by the extraction loop below.
WIKIDATA_API = "https://www.wikidata.org/w/api.php"  # endpoint used by get_num_properties
HEADERS = {'User-Agent': 'ColabPageViewsScript/1.0 (alessiass46@gmail.com)'}  # sent with the HTTP requests
CHECKPOINT = 'checkpointval.txt'  # separate checkpoint file for the validation run
BATCH_SIZE = 300  # rows processed between two saves of the CSV and the checkpoint
DATASET_PATH = "dataset_validation.csv"  # NOTE(review): not referenced by the visible cells
SAVE_PATH = "dataset_validation.csv"  # same file as the input, so every save overwrites it
client = Client()  # wikidata client shared by the helper functions

# EXTRACT QID
def extract_qid(url):
    """Extract the Wikidata identifier (e.g. "Q42") from an entity URL.

    Args:
        url: Entity URL such as "http://www.wikidata.org/entity/Q42".

    Returns:
        The first "Q<digits>" match, or None when there is no match or `url`
        is not a string (e.g. a NaN read from an empty CSV cell).
    """
    if not isinstance(url, str):
        # re.search raises TypeError on non-strings such as NaN.
        return None
    match = re.search(r'Q\d+', url)
    return match.group(0) if match else None

# Derive the QID of every row from its 'item' URL; the extraction loop keys on this column.
df['wikidata_id'] = df['item'].apply(extract_qid)

In [None]:
# Make sure the categories column exists before individual rows are filled in.
if 'wikipedia_categories' not in df.columns:
    df['wikipedia_categories'] = ''

# Resume from the row index stored by a previous run. A missing checkpoint means
# a fresh start; an empty or corrupt one is reported and also restarts from row 0
# instead of aborting the cell with a ValueError.
try:
    with open(CHECKPOINT, 'r') as f:
        start_idx = int(f.read().strip())
except FileNotFoundError:
    start_idx = 0
except ValueError:
    print(f"Unreadable checkpoint in {CHECKPOINT}, restarting from row 0")
    start_idx = 0

while start_idx < len(df):
    end_idx = min(start_idx + BATCH_SIZE, len(df))  # exclusive end of this batch
    print(f"\nCompute categories from row {start_idx} to {end_idx - 1}\n")
    got_rate_limited = False  # set when the categories lookup reports a rate limit

    for idx in range(start_idx, end_idx):
        qid = df.at[idx, 'wikidata_id']
        if not isinstance(qid, str):
            continue  # no QID could be extracted for this row
        try:
            #GET PROPERTIES
            entity = client.get(qid, load=True)
            props = get_entity_data(qid)
            for k, v in props.items():
                df.at[idx, k] = v

            #GET NUMERICAL DATA
            df.at[idx, "num_properties"] = get_num_properties(qid)
            df.at[idx, "num_languages"] = get_num_languages(qid)
            title = get_en_wikipedia_title(qid)
            df.at[idx, "wikipedia_title"] = title
            df.at[idx, "average_daily_views"] = get_average_daily_views(title)
            url = get_english_wikipedia_url(qid)
            # Pass the row index so the helper's log lines name the row
            # instead of printing "[None]".
            df.at[idx, "word_count_en"] = get_word_count_from_wikipedia(url, idx)

            #GET CATEGORIES
            categories = get_categories_from_wikipedia_combined(entity, qid)
            if categories == 'RATE_LIMITED':
                got_rate_limited = True
                break  # leave the batch; this row is retried after the pause
            df.at[idx, 'wikipedia_categories'] = categories

            print(f"[{idx}] {qid} → {title or 'no title'} | {categories[:60]}...")
        except Exception as e:
            df.at[idx, 'wikipedia_categories'] = 'null'
            print(f"[{idx}] {qid} → Error: {e}")
        time.sleep(1)  # pause between rows to limit the request rate

    # Row to resume from: the interrupted row after a rate limit, else the next batch.
    next_idx = idx if got_rate_limited else end_idx

    # Save updated file
    df.to_csv(SAVE_PATH, index=False)

    # Update checkpoint
    with open(CHECKPOINT, 'w') as f:
        f.write(str(next_idx))

    print(f"Saved up to row {idx if got_rate_limited else end_idx - 1}")
    if got_rate_limited:
        print("Wikipedia block. Wait 10 minutes")
        time.sleep(600)

    start_idx = next_idx

print("\n Extraction finished ( ͡° ͜ʖ ͡°)")


Compute categories from row 0 to 299

[0] Q15786 → 1. FC Nürnberg | 1. fc nürnberg, 1900 establishments in germany, 2. bundeslig...
[1] Q268530 → 77 Records | blues record labels, british jazz record labels, folk record...
[2] Q216153 → A Bug's Life | 1990s english-language films, 1990s children's animated film...
[3] Q593 → A Gang Story | 2010s french film stubs, 2010s french films, 2011 drama film...
[4] Q192185 → Aaron Copland | 1900 births, 1990 deaths, 20th-century american jews, 20th-c...
[5] Q265890 → Aarwangen Castle | castles in the canton of bern, cultural property of national...
[6] Q305718 → Abaya | arabic clothing, dresses, islamic female clothing, outerwear...
[7] Q337267 → Academy of San Carlos | 1540s establishments in mexico, 1780s establishments in mexi...
[8] Q15 → Africa | africa, continents...
[9] Q388170 → African-American literature | african-american culture, african-american literature, afric...
[10] Q11453 → Irrigation | agricultural soil science, agronomy, e

In [None]:
# Write the enriched validation frame to CSV and hand it to google.colab's `files.download`.
df.to_csv("dataset_validation_enrich.csv", index=False)
files.download('dataset_validation_enrich.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>