<a href="https://colab.research.google.com/github/aristeakon/country-explorer/blob/main/evaluating_3_languages.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Setup κελί (imports + constants)




In [None]:
# Εγκατάσταση απαραίτητων πακέτων
!pip -q install SPARQLWrapper textdistance

# Imports
from SPARQLWrapper import SPARQLWrapper, JSON
import json
from collections import defaultdict
import pandas as pd

# Ρυθμίσεις για Wikidata Query Service
WDQS_URL = "https://query.wikidata.org/sparql"

# Βάλε εδώ ένα δικό σου user agent με email ή site σου
USER_AGENT = "papes-poi-quality/0.1 (mailto:your-email@example.com)"


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/587.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━[0m [32m501.8/587.2 kB[0m [31m14.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.2/587.2 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25h

### 1. Data extraction from Wikidata (per country)


In [None]:
def get_pois_for_country(country_qid: str, local_lang: str, local_wiki_domain: str, limit: int | None = None):
    """
    Full SPARQL για μία χώρα:
    - local + en labels / descriptions
    - type, admin, coords, image, website
    - aliases (local + en)
    - localwiki + enwiki
    - sources (P973)

    Αν limit != None, προσθέτει LIMIT στο τέλος για testing.
    """
    sparql = SPARQLWrapper(WDQS_URL, agent=USER_AGENT)
    sparql.setMethod("POST")
    sparql.setReturnFormat(JSON)
    sparql.setTimeout(90)

    limit_clause = f"\nLIMIT {limit}" if limit is not None else ""

    query = f"""
    PREFIX bd: <http://www.bigdata.com/rdf#>
    PREFIX wikibase: <http://wikiba.se/ontology#>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    PREFIX schema: <http://schema.org/>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

    SELECT DISTINCT ?item ?qid
           ?itemLabel_local ?itemLabel_en
           ?desc_local ?desc_en
           ?typeLabel ?adminLabel
           ?coordinates ?image ?website
           ?localwiki ?enwiki
           ?alias_local ?alias_en
           ?describedAt
    WHERE {{
      VALUES ?country {{ wd:{country_qid} }}
      ?item wdt:P17 ?country .

      {{
        {{ ?item wdt:P31 wd:Q839954 }} UNION   # archaeological site
        {{ ?item wdt:P31 wd:Q33506 }} UNION    # museum
        {{ ?item wdt:P31 wd:Q4989906 }} UNION  # monument
        {{ ?item wdt:P31 wd:Q120560 }} UNION   # beach
        {{ ?item wdt:P31 wd:Q44782 }} UNION    # island
        {{ ?item wdt:P31 wd:Q2065736 }} UNION  # national park
        {{ ?item wdt:P31 wd:Q174782 }}         # castle
      }}

      OPTIONAL {{
        ?item wdt:P31 ?type .
        ?type rdfs:label ?typeLabel .
        FILTER(LANG(?typeLabel) IN ("{local_lang}","en"))
      }}

      OPTIONAL {{
        ?item wdt:P131 ?admin .
        ?admin rdfs:label ?adminLabel .
        FILTER(LANG(?adminLabel) IN ("{local_lang}","en"))
      }}

      OPTIONAL {{ ?item wdt:P625 ?coordinates }}
      OPTIONAL {{ ?item wdt:P18 ?image }}
      OPTIONAL {{ ?item wdt:P856 ?website }}
      OPTIONAL {{ ?item wdt:P973 ?describedAt }}

      # Labels
      OPTIONAL {{
        ?item rdfs:label ?itemLabel_local .
        FILTER(LANG(?itemLabel_local)="{local_lang}")
      }}
      OPTIONAL {{
        ?item rdfs:label ?itemLabel_en .
        FILTER(LANG(?itemLabel_en)="en")
      }}

      # Descriptions
      OPTIONAL {{
        ?item schema:description ?desc_local .
        FILTER(LANG(?desc_local)="{local_lang}")
      }}
      OPTIONAL {{
        ?item schema:description ?desc_en .
        FILTER(LANG(?desc_en)="en")
      }}

      # Aliases: local + en
      OPTIONAL {{
        ?item skos:altLabel ?alias_local .
        FILTER(LANG(?alias_local)="{local_lang}")
      }}
      OPTIONAL {{
        ?item skos:altLabel ?alias_en .
        FILTER(LANG(?alias_en)="en")
      }}

      # Local sitelink
      OPTIONAL {{
        ?localArticle schema:about ?item ;
                      schema:isPartOf <{local_wiki_domain}> .
        BIND(STR(?localArticle) AS ?localwiki)
      }}
      # English sitelink
      OPTIONAL {{
        ?enArticle schema:about ?item ;
                   schema:isPartOf <https://en.wikipedia.org/> .
        BIND(STR(?enArticle) AS ?enwiki)
      }}

      BIND(STRAFTER(STR(?item), "entity/") AS ?qid)
    }}
    {limit_clause}
    """

    sparql.setQuery(query)
    results = sparql.query().convert()
    return results


### 2. Κανονικοποίηση αποτελεσμάτων SPARQL σε επίπεδο POI

Σε αυτό το κελί παίρνω το raw αποτέλεσμα της SPARQL (`raw`) για μία χώρα και το μετατρέπω σε "καθαρή" λίστα προορισμών (POIs).  
Συγκεκριμένα:
- Συγχωνεύω όλες τις γραμμές που αντιστοιχούν στο ίδιο `qid` σε ένα ενιαίο record.
- Μαζεύω `aliases_local`, `aliases_en` και `sources` σε λίστες χωρίς διπλότυπα.
- Συμπληρώνω τα βασικά πεδία (labels, descriptions, type, admin, links) όπου λείπουν.

Το output της `build_country_items` είναι μια λίστα από dictionaries (ένα ανά POI), έτοιμη να ενωθεί με τα άλλα countries και να αποθηκευτεί σε JSON / να φορτωθεί σε DataFrame για ανάλυση.

In [None]:
def v(b, k):
    """Βοηθητικό: ασφαλής πρόσβαση σε binding value."""
    return b.get(k, {}).get("value", "")

def build_country_items(raw, country_code: str, local_lang: str):
    rows = raw["results"]["bindings"]
    items = {}
    aliases_local = defaultdict(set)
    aliases_en = defaultdict(set)
    sources = defaultdict(set)

    for b in rows:
        qid = v(b, "qid")
        name_local = v(b, "itemLabel_local")
        name_en    = v(b, "itemLabel_en")
        desc_local = v(b, "desc_local")
        desc_en    = v(b, "desc_en")
        typ  = v(b, "typeLabel")
        adm  = v(b, "adminLabel")
        wkt  = v(b, "coordinates")
        img  = v(b, "image")
        site = v(b, "website")
        lwp  = v(b, "localwiki")
        enwp = v(b, "enwiki")
        url  = v(b, "describedAt")

        if qid not in items:
            items[qid] = {
                "qid": qid,
                "country": country_code,
                "local_lang": local_lang,
                "label_local": name_local,
                "label_en": name_en,
                "desc_local": desc_local,
                "desc_en": desc_en,
                "type": typ,
                "admin": adm,
                "coordinates_wkt": wkt,
                "image": img,
                "website": site,
                "localwiki": lwp,
                "enwiki": enwp,
                "aliases_local": [],
                "aliases_en": [],
                "sources": [],
            }
        else:
            it = items[qid]
            if not it["label_local"] and name_local: it["label_local"] = name_local
            if not it["label_en"]   and name_en:    it["label_en"]    = name_en
            if not it["desc_local"] and desc_local: it["desc_local"] = desc_local
            if not it["desc_en"]    and desc_en:    it["desc_en"]    = desc_en
            if not it["type"] and typ: it["type"] = typ
            if not it["admin"] and adm: it["admin"] = adm
            if not it["coordinates_wkt"] and wkt: it["coordinates_wkt"] = wkt
            if not it["image"] and img: it["image"] = img
            if not it["website"] and site: it["website"] = site
            if not it["localwiki"] and lwp: it["localwiki"] = lwp
            if not it["enwiki"] and enwp: it["enwiki"] = enwp

        al_loc = v(b, "alias_local")
        if al_loc:
            aliases_local[qid].add(al_loc)

        al_en = v(b, "alias_en")
        if al_en:
            aliases_en[qid].add(al_en)

        if url:
            sources[qid].add(url)

    for qid, it in items.items():
        it["aliases_local"] = sorted(list(aliases_local[qid]))
        it["aliases_en"] = sorted(list(aliases_en[qid]))
        it["sources"] = sorted(list(sources[qid]))

    return list(items.values())


### 4. Δοκιμαστική ανάκτηση POIs για GR/ES/IT (με LIMIT)

Σε αυτό το βήμα καλώ τη `get_pois_for_country` για Ελλάδα, Ισπανία, Ιταλία
με ένα μικρό LIMIT (π.χ. 500) ώστε να ελέγξω ότι όλα δουλεύουν σωστά
χωρίς να φορτώσω υπερβολικά το Wikidata.


In [None]:
# Δοκιμή: μικρό δείγμα, π.χ. 500 γραμμές ανά χώρα
raw_gr_test = get_pois_for_country("Q41", "el", "https://el.wikipedia.org/", limit=500)
raw_es_test = get_pois_for_country("Q29", "es", "https://es.wikipedia.org/", limit=500)
raw_it_test = get_pois_for_country("Q38", "it", "https://it.wikipedia.org/", limit=500)

len(raw_gr_test["results"]["bindings"]), len(raw_es_test["results"]["bindings"]), len(raw_it_test["results"]["bindings"])


NameError: name 'get_pois_for_country' is not defined

### 3. Ανάκτηση POIs από Wikidata για Ελλάδα, Ισπανία, Ιταλία

Σε αυτό το βήμα καλώ τη `get_pois_for_country` για κάθε χώρα (GR/ES/IT),
μετατρέπω τα raw SPARQL αποτελέσματα σε καθαρές εγγραφές POI με τη
`build_country_items` και τα ενώνω σε μία ενιαία λίστα `all_items`.


In [None]:
all_items = []

# 1. ΤΡΑΒΑΩ RAW ΓΙΑ ΚΑΘΕ ΧΩΡΑ (ΜΙΑ ΦΟΡΑ)

raw_gr = get_pois_for_country("Q41", "el", "https://el.wikipedia.org/", limit=10000000)
raw_es = get_pois_for_country("Q29", "es", "https://es.wikipedia.org/", limit=10000000)
raw_it = get_pois_for_country("Q38", "it", "https://it.wikipedia.org/", limit=10000000)

# 2. ΠΡΟΑΙΡΕΤΙΚΟ CHECK: ΠΟΣΑ QIDs ΕΧΩ ΑΠΟ ΤΗΝ ΚΑΘΕ ΧΩΡΑ;

qids_gr = { b["qid"]["value"] for b in raw_gr["results"]["bindings"] if "qid" in b }
qids_es = { b["qid"]["value"] for b in raw_es["results"]["bindings"] if "qid" in b }
qids_it = { b["qid"]["value"] for b in raw_it["results"]["bindings"] if "qid" in b }

print("Διαφορετικά QIDs για GR:", len(qids_gr))
print("Διαφορετικά QIDs για ES:", len(qids_es))
print("Διαφορετικά QIDs για IT:", len(qids_it))

# (προαιρετικό) πόσες "γραμμές" (bindings) έχει το raw για GR
num_rows_gr = len(raw_gr["results"]["bindings"])
print("Αριθμός γραμμών (bindings) για GR:", num_rows_gr)

# 3. ΧΤΙΖΩ ΤΑ POIs ΑΝΑ ΧΩΡΑ ΜΕ ΤΗ build_country_items ΚΑΙ ΤΑ ΒΑΖΩ ΣΤΟ all_items

items_gr = build_country_items(raw_gr, "GR", "el")
items_es = build_country_items(raw_es, "ES", "es")
items_it = build_country_items(raw_it, "IT", "it")

all_items += items_gr
all_items += items_es
all_items += items_it

print("Σύνολο POIs (όλων των χωρών):", len(all_items))


NameError: name 'get_pois_for_country' is not defined

. Αποθήκευση σε JSON

In [None]:
import json

with open("destinations_multicountry.json", "w", encoding="utf-8") as f:
    json.dump(all_items, f, ensure_ascii=False, indent=2)

print("Saved", len(all_items), "POIs to destinations_multicountry.json")


Saved 0 POIs to destinations_multicountry.json


In [None]:
import pandas as pd

df = pd.DataFrame(all_items)
df.shape, df['country'].value_counts()


KeyError: 'country'

In [None]:
df.to_csv("destinations_multicountry.csv", index=False)
print("Saved DataFrame to destinations_multicountry.csv")


NameError: name 'df' is not defined

**Aπό εδώ και πέρα, για ανάλυση, μπορείς να ξεκινάς από:**



In [None]:
import pandas as pd
df = pd.read_csv("destinations_multicountry.csv")


### Βήμα 1 – Φόρτωση του πολυχώρου dataset σε pandas DataFrame

Σε αυτό το βήμα φορτώνω το `destinations_multicountry.csv` ή `destinations_multicountry.json`
και κάνω ένα πρώτο check:
- πόσες εγγραφές (POIs) έχουμε συνολικά
- πόσα POIs έχει κάθε χώρα (GR/ES/IT)
- ποιες στήλες υπάρχουν στο dataset


In [None]:
import pandas as pd
import json
import os

df = None
source = None

# Προσπαθώ πρώτα από CSV (αν υπάρχει)
if os.path.exists("destinations_multicountry.csv"):
    df = pd.read_csv("destinations_multicountry.csv")
    source = "csv"
# αλλιώς από JSON
elif os.path.exists("destinations_multicountry.json"):
    with open("destinations_multicountry.json", "r", encoding="utf-8") as f:
        data = json.load(f)
    df = pd.DataFrame(data)
    source = "json"
else:
    raise FileNotFoundError("Δεν βρήκα ούτε destinations_multicountry.csv ούτε destinations_multicountry.json")

print("Loaded dataset from:", source)
print("Shape (rows, columns):", df.shape)
print("\nColumns:")
print(df.columns.tolist())

print("\nPOIs per country:")
print(df["country"].value_counts())

df.head()


Loaded dataset from: csv
Shape (rows, columns): (36693, 17)

Columns:
['qid', 'country', 'local_lang', 'label_local', 'label_en', 'desc_local', 'desc_en', 'type', 'admin', 'coordinates_wkt', 'image', 'website', 'localwiki', 'enwiki', 'aliases_local', 'aliases_en', 'sources']

POIs per country:
country
ES    25401
IT     9271
GR     2021
Name: count, dtype: int64


Unnamed: 0,qid,country,local_lang,label_local,label_en,desc_local,desc_en,type,admin,coordinates_wkt,image,website,localwiki,enwiki,aliases_local,aliases_en,sources
0,Q132472,GR,el,Μίεζα,Mieza,κώμη στην αρχαία Μακεδονία,village in Ancient Macedon,αρχαιολογική θέση,Naousa Municipality,Point(22.122222222 40.644166666),http://commons.wikimedia.org/wiki/Special:File...,,https://el.wikipedia.org/wiki/%CE%9C%CE%AF%CE%...,https://en.wikipedia.org/wiki/Mieza_(Macedonia),[],[],['http://odysseus.culture.gr/h/2/gh251.jsp?obj...
1,Q152348,GR,el,Κασταλία πηγή,Castalian Spring,,sacred fountain at Delphi,υδάτινη πηγή,Delfi Municipality,Point(22.505555555 38.483055555),http://commons.wikimedia.org/wiki/Special:File...,,https://el.wikipedia.org/wiki/%CE%9A%CE%B1%CF%...,https://en.wikipedia.org/wiki/Castalian_Spring,['Κασταλία κρήνη'],['Castalian fountain'],['http://odysseus.culture.gr/h/2/gh251.jsp?obj...
2,Q140345,GR,el,Ζαγορά Άνδρου,Zagora,,"archaeological site in Andros island, Greece",archaeological site,Δήμος Άνδρου,Point(24.86555556 37.77416667),,,,,[],[],['http://odysseus.culture.gr/h/3/gh351.jsp?obj...
3,Q174552,GR,el,Τεχνόπολη Δήμου Αθηναίων,Technopolis City of Athens,κέντρο πολιτισμού και μουσείο στην Αθήνα,"museum and cultural center in Athens, Greece",museum,Athens Municipality,Point(23.7141 37.9781),http://commons.wikimedia.org/wiki/Special:File...,https://www.athens-technopolis.gr/,https://el.wikipedia.org/wiki/%CE%A4%CE%B5%CF%...,https://en.wikipedia.org/wiki/Technopolis_(Gazi),"['Τεχνόπολη', 'Τεχνόπολις']","['Gazi', 'Industrial gas museum of Athens', 'T...",[]
4,Q193093,GR,el,Ακαδημία Πλάτωνος,Platonic Academy,"φιλοσοφικό, ερευνητικό και εκπαιδευτικό κέντρο...","ancient philosophical, research and educative ...",αρχιτεκτονική κατασκευή,Κολωνός,Point(23.708059 37.992359),http://commons.wikimedia.org/wiki/Special:File...,,https://el.wikipedia.org/wiki/%CE%91%CE%BA%CE%...,https://en.wikipedia.org/wiki/Platonic_Academy,"['Ακαδημία του Πλάτωνα', 'Πλατωνική Ακαδημία']","['Academy of Plato', ""Plato's Academy""]",['http://odysseus.culture.gr/h/2/gh251.jsp?obj...


Τι κάνουμε εδώ με απλά λόγια

to_list_safe: ξαναμετατρέπει τα aliases_local, aliases_en, sources σε πραγματικές Python λίστες.

Φτιάχνουμε flags has_* (True/False) και lengths (πόσοι χαρακτήρες ή πόσα στοιχεία).

Στο τέλος έχουμε ένα df_feat που είναι το αναλυτικό dataset με όλα τα quality metrics.

In [None]:
import ast
import numpy as np

# Δουλεύουμε σε αντίγραφο για να κρατήσουμε το original αν χρειαστεί
df_feat = df.copy()

# 2.1. Μετατροπή των aliases_local, aliases_en, sources από string -> list (αν χρειάζεται)
def to_list_safe(x):
    if isinstance(x, list):
        return x
    if isinstance(x, str):
        x = x.strip()
        # κενό string ή 'nan'
        if x == "" or x.lower() == "nan":
            return []
        # αν μοιάζει με λίστα π.χ. "['a', 'b']"
        if x.startswith("[") and x.endswith("]"):
            try:
                return ast.literal_eval(x)
            except Exception:
                return []
        # οτιδήποτε άλλο: το κάνουμε singleton list
        return [x]
    return []

for col in ["aliases_local", "aliases_en", "sources"]:
    df_feat[col] = df_feat[col].apply(to_list_safe)

# 2.2. Βοηθητικό για μήκος λίστας
def list_len(x):
    return len(x) if isinstance(x, list) else 0

# 2.3. Features για labels
df_feat["has_label_local"] = df_feat["label_local"].fillna("").ne("")
df_feat["has_label_en"]    = df_feat["label_en"].fillna("").ne("")

df_feat["len_label_local"] = df_feat["label_local"].fillna("").str.len()
df_feat["len_label_en"]    = df_feat["label_en"].fillna("").str.len()

# 2.4. Features για descriptions
df_feat["has_desc_local"]  = df_feat["desc_local"].fillna("").ne("")
df_feat["has_desc_en"]     = df_feat["desc_en"].fillna("").ne("")

df_feat["len_desc_local"]  = df_feat["desc_local"].fillna("").str.len()
df_feat["len_desc_en"]     = df_feat["desc_en"].fillna("").str.len()

# 2.5. Features για aliases
df_feat["n_aliases_local"] = df_feat["aliases_local"].apply(list_len)
df_feat["n_aliases_en"]    = df_feat["aliases_en"].apply(list_len)

# 2.6. Wikipedia sitelinks
df_feat["has_localwiki"] = df_feat["localwiki"].fillna("").ne("")
df_feat["has_enwiki"]    = df_feat["enwiki"].fillna("").ne("")

# 2.7. Πηγές (sources)
df_feat["n_sources"] = df_feat["sources"].apply(list_len)

# 2.8. Γρήγορο summary για να δούμε ότι όλα είναι ΟΚ
print("Shape:", df_feat.shape)

print("\nΜέσος αριθμός aliases_local / aliases_en ανά χώρα:")
print(
    df_feat.groupby("country")[["n_aliases_local", "n_aliases_en"]]
           .mean()
           .round(2)
)

print("\nCoverage labels/descriptions per country (τοπική γλώσσα):")
print(
    df_feat.groupby("country")[["has_label_local", "has_desc_local"]]
           .mean()
           .round(3)
)

print("\nCoverage English labels/descriptions per country:")
print(
    df_feat.groupby("country")[["has_label_en", "has_desc_en"]]
           .mean()
           .round(3)
)

print("\nCoverage Wikipedia sitelinks per country:")
print(
    df_feat.groupby("country")[["has_localwiki", "has_enwiki"]]
           .mean()
           .round(3)
)

df_feat.head()


Shape: (36693, 30)

Μέσος αριθμός aliases_local / aliases_en ανά χώρα:
         n_aliases_local  n_aliases_en
country                               
ES                  0.54          0.07
GR                  0.38          0.50
IT                  0.65          0.24

Coverage labels/descriptions per country (τοπική γλώσσα):
         has_label_local  has_desc_local
country                                 
ES                 0.900           0.872
GR                 0.779           0.456
IT                 0.957           0.826

Coverage English labels/descriptions per country:
         has_label_en  has_desc_en
country                           
ES              0.804        0.555
GR              0.972        0.837
IT              0.747        0.797

Coverage Wikipedia sitelinks per country:
         has_localwiki  has_enwiki
country                           
ES               0.195       0.048
GR               0.297       0.298
IT               0.457       0.155


Unnamed: 0,qid,country,local_lang,label_local,label_en,desc_local,desc_en,type,admin,coordinates_wkt,...,len_label_en,has_desc_local,has_desc_en,len_desc_local,len_desc_en,n_aliases_local,n_aliases_en,has_localwiki,has_enwiki,n_sources
0,Q132472,GR,el,Μίεζα,Mieza,κώμη στην αρχαία Μακεδονία,village in Ancient Macedon,αρχαιολογική θέση,Naousa Municipality,Point(22.122222222 40.644166666),...,5,True,True,26,26,0,0,True,True,1
1,Q152348,GR,el,Κασταλία πηγή,Castalian Spring,,sacred fountain at Delphi,υδάτινη πηγή,Delfi Municipality,Point(22.505555555 38.483055555),...,16,False,True,0,25,1,1,True,True,1
2,Q140345,GR,el,Ζαγορά Άνδρου,Zagora,,"archaeological site in Andros island, Greece",archaeological site,Δήμος Άνδρου,Point(24.86555556 37.77416667),...,6,False,True,0,44,0,0,False,False,1
3,Q174552,GR,el,Τεχνόπολη Δήμου Αθηναίων,Technopolis City of Athens,κέντρο πολιτισμού και μουσείο στην Αθήνα,"museum and cultural center in Athens, Greece",museum,Athens Municipality,Point(23.7141 37.9781),...,26,True,True,40,44,2,6,True,True,0
4,Q193093,GR,el,Ακαδημία Πλάτωνος,Platonic Academy,"φιλοσοφικό, ερευνητικό και εκπαιδευτικό κέντρο...","ancient philosophical, research and educative ...",αρχιτεκτονική κατασκευή,Κολωνός,Point(23.708059 37.992359),...,16,True,True,94,80,2,2,True,True,1


Ποσοστό POIs με τουλάχιστον 1 alias (local / English)


In [None]:
import ast
import pandas as pd

# Ξεκινάμε από το df που έχεις ήδη φορτώσει
# df: columns ['qid','country','local_lang','label_local','label_en',...,'aliases_local','aliases_en','sources']

# 1. Βοηθητικό: μετατροπή string -> list για aliases
def to_list_safe(x):
    if isinstance(x, list):
        return x
    if isinstance(x, str):
        x = x.strip()
        if x == "" or x.lower() == "nan":
            return []
        if x.startswith("[") and x.endswith("]"):
            try:
                return ast.literal_eval(x)
            except Exception:
                return []
        return [x]
    return []

df_feat = df.copy()

# 2. Εξασφαλίζουμε ότι aliases_local / aliases_en είναι λίστες
for col in ["aliases_local", "aliases_en"]:
    df_feat[col] = df_feat[col].apply(to_list_safe)

# 3. Μετράμε πόσα aliases έχει κάθε POI
df_feat["n_aliases_local"] = df_feat["aliases_local"].apply(lambda x: len(x))
df_feat["n_aliases_en"]    = df_feat["aliases_en"].apply(lambda x: len(x))

# 4. Flags: έχει τουλάχιστον 1 alias;
df_feat["has_alias_local"] = df_feat["n_aliases_local"] > 0
df_feat["has_alias_en"]    = df_feat["n_aliases_en"] > 0

# 5. Ποσοστό ανά χώρα (σε %)
alias_coverage = (
    df_feat
    .groupby("country")[["has_alias_local", "has_alias_en"]]
    .mean()
    .multiply(100)
    .round(1)
)

print("Ποσοστό POIs με ≥1 alias (local / English) ανά χώρα (%):")
print(alias_coverage)


Ποσοστό POIs με ≥1 alias (local / English) ανά χώρα (%):
         has_alias_local  has_alias_en
country                               
ES                  28.6           5.4
GR                  26.6          32.5
IT                  37.1          16.5


**Κελί ανάλυσης για labels**

In [None]:
import pandas as pd

# Ξεκινάμε από το df που έχεις ήδη φορτώσει από το CSV
df_labels = df.copy()

# 1. Flags: έχει label local / English;
df_labels["has_label_local"] = df_labels["label_local"].fillna("").ne("")
df_labels["has_label_en"]    = df_labels["label_en"].fillna("").ne("")

# 2. Coverage ανά χώρα (σε %)
label_coverage = (
    df_labels
    .groupby("country")[["has_label_local", "has_label_en"]]
    .mean()
    .multiply(100)
    .round(1)
)

print("Label coverage per country (%, local vs English):")
print(label_coverage)

# 3. Μοτίβα: μόνο local, μόνο English, και τα δύο, κανένα
def label_pattern(row):
    if row["has_label_local"] and row["has_label_en"]:
        return "both_local_and_en"
    elif row["has_label_local"] and not row["has_label_en"]:
        return "local_only"
    elif not row["has_label_local"] and row["has_label_en"]:
        return "en_only"
    else:
        return "no_label"

df_labels["label_pattern"] = df_labels.apply(label_pattern, axis=1)

pattern_counts = (
    df_labels
    .groupby(["country", "label_pattern"])
    .size()
    .unstack(fill_value=0)
)

pattern_perc = (
    pattern_counts
    .div(pattern_counts.sum(axis=1), axis=0)
    .multiply(100)
    .round(1)
)

print("\nLabel patterns per country (counts):")
print(pattern_counts)

print("\nLabel patterns per country (%):")
print(pattern_perc)


Label coverage per country (%, local vs English):
         has_label_local  has_label_en
country                               
ES                  90.0          80.4
GR                  77.9          97.2
IT                  95.7          74.7

Label patterns per country (counts):
label_pattern  both_local_and_en  en_only  local_only  no_label
country                                                        
ES                         18276     2142        4595       388
GR                          1541      424          34        22
IT                          6605      318        2266        82

Label patterns per country (%):
label_pattern  both_local_and_en  en_only  local_only  no_label
country                                                        
ES                          71.9      8.4        18.1       1.5
GR                          76.2     21.0         1.7       1.1
IT                          71.2      3.4        24.4       0.9


Θα κάνουμε ακριβώς ό,τι κάναμε για τα labels, αλλά τώρα για:

desc_local

desc_en

Δηλαδή:

Coverage: πόσα POIs έχουν description στη local γλώσσα / στα αγγλικά, ανά χώρα.

Patterns:

και local + English,

μόνο local,

μόνο English,

καμία description.

In [None]:
import pandas as pd

# Ξεκινάμε από το df που έχεις ήδη (το αρχικό με τα 36,693 POIs)
df_desc = df.copy()

# 1. Flags: έχει description local / English;
df_desc["has_desc_local"] = df_desc["desc_local"].fillna("").ne("")
df_desc["has_desc_en"]    = df_desc["desc_en"].fillna("").ne("")

# 2. Coverage ανά χώρα (σε %)
desc_coverage = (
    df_desc
    .groupby("country")[["has_desc_local", "has_desc_en"]]
    .mean()
    .multiply(100)
    .round(1)
)

print("Description coverage per country (%, local vs English):")
print(desc_coverage)

# 3. Μοτίβα: μόνο local, μόνο English, και τα δύο, καμία
def desc_pattern(row):
    if row["has_desc_local"] and row["has_desc_en"]:
        return "both_local_and_en"
    elif row["has_desc_local"] and not row["has_desc_en"]:
        return "local_only"
    elif not row["has_desc_local"] and row["has_desc_en"]:
        return "en_only"
    else:
        return "no_desc"

df_desc["desc_pattern"] = df_desc.apply(desc_pattern, axis=1)

pattern_counts_desc = (
    df_desc
    .groupby(["country", "desc_pattern"])
    .size()
    .unstack(fill_value=0)
)

pattern_perc_desc = (
    pattern_counts_desc
    .div(pattern_counts_desc.sum(axis=1), axis=0)
    .multiply(100)
    .round(1)
)

print("\nDescription patterns per country (counts):")
print(pattern_counts_desc)

print("\nDescription patterns per country (%):")
print(pattern_perc_desc)


Description coverage per country (%, local vs English):
         has_desc_local  has_desc_en
country                             
ES                 87.2         55.5
GR                 45.6         83.7
IT                 82.6         79.7

Description patterns per country (counts):
desc_pattern  both_local_and_en  en_only  local_only  no_desc
country                                                      
ES                        12239     1866        9912     1384
GR                          891      800          31      299
IT                         6446      947        1210      668

Description patterns per country (%):
desc_pattern  both_local_and_en  en_only  local_only  no_desc
country                                                      
ES                         48.2      7.3        39.0      5.4
GR                         44.1     39.6         1.5     14.8
IT                         69.5     10.2        13.1      7.2


per POI type (museums, archaeological sites, natural κλπ.) + per country.



Παίρνει το df που έχεις ήδη (από το CSV).

Φτιάχνει μια νέα στήλη poi_group (Museum, Archaeological, Natural κ.λπ.).

Ξαναϋπολογίζει labels / descriptions / aliases flags.

Βγάζει πίνακες με στατιστικά ανά χώρα + κατηγορία POI.

In [None]:
import pandas as pd
import ast

# Ξεκινάμε από το df που έχεις ήδη φορτώσει από το CSV
df_type = df.copy()

# 1. Mapping από raw 'type' σε μερικές μεγάλες κατηγορίες POIs
def map_poi_group(t):
    if pd.isna(t):
        return "Other"
    t_low = str(t).lower()

    # Museums / galleries
    if any(sub in t_low for sub in [
        "museum", "museo", "museu", "μουσείο", "galería", "galleria", "gallery", "pinacoteca"
    ]):
        return "Museum / Gallery"

    # Archaeological / historical sites
    if any(sub in t_low for sub in [
        "archaeological", "arqueológico", "arqueologico", "arqueológica",
        "αρχαιολογ", "archäolog", "sito archeologico", "historic site"
    ]):
        return "Archaeological / historical site"

    # Monuments / memorials
    if any(sub in t_low for sub in [
        "monument", "monumento", "μνημείο", "memorial"
    ]):
        return "Monument / Memorial"

    # Beaches
    if any(sub in t_low for sub in [
        "beach", "playa", "spiaggia", "παραλία"
    ]):
        return "Beach"

    # Islands
    if any(sub in t_low for sub in [
        "island", "isla", "isola", "νησί"
    ]):
        return "Island"

    # National / natural parks
    if any(sub in t_low for sub in [
        "national park", "parque nacional", "parco nazionale",
        "εθνικός δρυμός", "natural park", "parque natural", "παράρτημα δρυμός"
    ]):
        return "National / Natural park"

    return "Other"

df_type["poi_group"] = df_type["type"].apply(map_poi_group)

# 2. Labels & descriptions: flags για ύπαρξη local / English
for col in ["label_local", "label_en", "desc_local", "desc_en"]:
    df_type[col] = df_type[col].fillna("")

df_type["has_label_local"] = df_type["label_local"] != ""
df_type["has_label_en"]    = df_type["label_en"]   != ""

df_type["has_desc_local"]  = df_type["desc_local"] != ""
df_type["has_desc_en"]     = df_type["desc_en"]    != ""

# 3. Aliases: μετατροπή σε λίστα + counters + flags
def to_list_safe(x):
    if isinstance(x, list):
        return x
    if isinstance(x, str):
        x = x.strip()
        if x == "" or x.lower() == "nan":
            return []
        if x.startswith("[") and x.endswith("]"):
            try:
                return ast.literal_eval(x)
            except Exception:
                return []
        return [x]
    return []

for col in ["aliases_local", "aliases_en"]:
    df_type[col] = df_type[col].apply(to_list_safe)

df_type["n_aliases_local"] = df_type["aliases_local"].apply(len)
df_type["n_aliases_en"]    = df_type["aliases_en"].apply(len)

df_type["has_alias_local"] = df_type["n_aliases_local"] > 0
df_type["has_alias_en"]    = df_type["n_aliases_en"]    > 0

# 4. Composite documentation score (0–3)
# +1 αν έχει ΚΑΙ local+EN label, +1 αν έχει ΚΑΙ local+EN description, +1 αν έχει >=1 alias σε οποιαδήποτε γλώσσα
df_type["score_labels"] = ((df_type["has_label_local"]) & (df_type["has_label_en"])).astype(int)
df_type["score_desc"]   = ((df_type["has_desc_local"]) & (df_type["has_desc_en"])).astype(int)
df_type["score_alias"]  = ((df_type["has_alias_local"]) |  (df_type["has_alias_en"])).astype(int)

df_type["doc_score_0_3"] = df_type["score_labels"] + df_type["score_desc"] + df_type["score_alias"]

# 5. Aggregation ανά χώρα + κατηγορία POI

group_cols = ["country", "poi_group"]

# 5a. Coverage (ποσοστά σε %)
coverage_by_type = (
    df_type
    .groupby(group_cols)[[
        "has_label_local", "has_label_en",
        "has_desc_local", "has_desc_en",
        "has_alias_local", "has_alias_en"
    ]]
    .mean()
    .multiply(100)
    .round(1)
)

print("Coverage per country & POI group (%, labels / descriptions / aliases):")
print(coverage_by_type)

# 5b. Μέσος αριθμός aliases ανά POI
aliases_mean_by_type = (
    df_type
    .groupby(group_cols)[["n_aliases_local", "n_aliases_en"]]
    .mean()
    .round(2)
)

print("\nAverage number of aliases per POI, per country & POI group:")
print(aliases_mean_by_type)

# 5c. Τεκμηρίωση: μέσο documentation score (0–3)
docscore_by_type = (
    df_type
    .groupby(group_cols)["doc_score_0_3"]
    .agg(["mean", "median"])
    .round(2)
)

print("\nDocumentation score (0–3) per country & POI group (mean / median):")
print(docscore_by_type)


Coverage per country & POI group (%, labels / descriptions / aliases):
                                          has_label_local  has_label_en  \
country poi_group                                                         
ES      Archaeological / historical site             91.8          83.4   
        Island                                      100.0         100.0   
        Monument / Memorial                          83.7          69.6   
        Museum / Gallery                             97.8          64.6   
        Other                                        89.9          83.9   
GR      Archaeological / historical site             83.7          97.6   
        Island                                      100.0         100.0   
        Monument / Memorial                          76.2          95.2   
        Museum / Gallery                             63.5          95.4   
        Other                                        80.9          98.1   
IT      Archaeological / hist

ΤΙ POI GROUPS ΥΠΑΡΧΟΥΝ

In [None]:
import pandas as pd

# Πόσα μοναδικά 'type' υπάρχουν;
n_unique_types = df["type"].nunique()
print("Μοναδικοί τύποι (type):", n_unique_types)

# Οι 50 πιο συχνοί τύποι (για να πάρουμε μια αίσθηση)

print(df["type"].value_counts().head(1180))


Μοναδικοί τύποι (type): 1180
type
yacimiento arqueológico    6221
archaeological site        3927
monument                   3880
human settlement           2437
museum                     1987
                           ... 
pedestrian zone               1
wall                          1
plano                         1
pilaster                      1
cruz de término               1
Name: count, Length: 1180, dtype: int64


In [None]:
print("Μοναδικοί τύποι ανά χώρα:\n")
for c in df["country"].unique():
    print(f"=== {c} ===")
    sub = df[df["country"] == c]
    print("POIs:", len(sub))
    print("Unique types:", sub["type"].nunique())
    print(sub["type"].value_counts().head(20))
    print()


Μοναδικοί τύποι ανά χώρα:

=== GR ===
POIs: 2021
Unique types: 207
type
αρχαιολογική θέση         505
archaeological site       340
museum                    248
μουσείο                   210
πλατεία                    51
port                       49
ancient city               33
λιμάνι                     30
square                     29
polis                      25
αρχαία πόλη                24
ancient Greek temple       23
πόλις                      20
temple                     20
hieron                     19
αρχαίο ελληνικό ιερό       18
ναός                       13
αρχαίος ελληνικός ναός     10
μνημείο                    10
castle                     10
Name: count, dtype: int64

=== ES ===
POIs: 25401
Unique types: 662
type
yacimiento arqueológico    6221
monument                   3355
archaeological site        2898
human settlement           2431
plaza                      1248
museum                      725
monumento                   650
square                      579

Φτιάχνουμε πίνακα με όλους τους τύπους + counts (για να τον δεις/ανοίξεις)

In [None]:
type_counts = (
    df["type"]
    .value_counts(dropna=False)
    .reset_index()
    .rename(columns={"index": "type", "type": "count"})
)

print(type_counts.head(20))

# Αν θέλεις να τον ανοίξεις από το Colab σαν CSV:
type_counts.to_csv("type_counts.csv", index=False)
print("\nΑποθηκεύτηκε το type_counts.csv (μπορείς να το κατεβάσεις από τα Files).")


                      count  count
0   yacimiento arqueológico   6221
1       archaeological site   3927
2                  monument   3880
3          human settlement   2437
4                    museum   1987
5                    square   1723
6                     museo   1413
7                     plaza   1248
8                 monumento   1151
9         sito archeologico    922
10                   piazza    648
11        αρχαιολογική θέση    505
12   agricultural structure    419
13                    grave    253
14                     port    222
15                   pueblo    215
16                  μουσείο    210
17                   castle    199
18          basilica minore    164
19          church building    164

Αποθηκεύτηκε το type_counts.csv (μπορείς να το κατεβάσεις από τα Files).


In [None]:
all_types_df["in_GR"] = all_types_df["GR"] > 0
all_types_df["in_ES"] = all_types_df["ES"] > 0
all_types_df["in_IT"] = all_types_df["IT"] > 0

common_all3 = all_types_df[
    all_types_df["in_GR"] & all_types_df["in_ES"] & all_types_df["in_IT"]
].copy()

common_all3 = common_all3.sort_values("count_total", ascending=False)
common_all3


country,type,ES,GR,IT,count_total,in_GR,in_ES,in_IT
1,archaeological site,2898,340,689,3927,True,True,True
2,monument,3355,7,518,3880,True,True,True
4,museum,725,248,1014,1987,True,True,True
5,square,579,29,1115,1723,True,True,True
14,port,105,49,68,222,True,True,True
17,castle,185,10,4,199,True,True,True
18,church building,102,3,59,164,True,True,True
28,palace,106,5,7,118,True,True,True
31,building,108,5,2,115,True,True,True
41,defensive wall,67,1,1,69,True,True,True


In [None]:
import pandas as pd

# Αν έχεις ήδη df, δεν χρειάζεται να το ξαναφορτώσεις.
# df = pd.read_csv("destinations.csv")

df_types = df[["qid", "country", "type"]].copy()
df_types["type"] = df_types["type"].fillna("UNKNOWN")

# 1. Pivot: πόσα POIs έχει κάθε type ανά χώρα
type_country = (
    df_types
    .pivot_table(
        index="type",
        columns="country",
        values="qid",
        aggfunc="count",
        fill_value=0
    )
)

# Σιγουρευόμαστε ότι έχουμε και τις 3 στήλες
for c in ["GR", "ES", "IT"]:
    if c not in type_country.columns:
        type_country[c] = 0

# 2. Ποιοι τύποι είναι κοινοί και στις τρεις χώρες;
type_country["in_GR"] = type_country["GR"] > 0
type_country["in_ES"] = type_country["ES"] > 0
type_country["in_IT"] = type_country["IT"] > 0

common_mask = type_country["in_GR"] & type_country["in_ES"] & type_country["in_IT"]
common_types = type_country[common_mask].index.tolist()

print("Σύνολο διαφορετικών types:", type_country.shape[0])
print("Κοινοί types σε ΟΛΕΣ τις χώρες:", len(common_types))
print("Παράδειγμα κοινών types:", common_types[:10], "\n")

# 3. ΜΗ-κοινοί types ανά χώρα (όλοι οι άλλοι)

def non_common_types_for_country(country_code):
    sub = df_types[df_types["country"] == country_code]
    sub = sub[~sub["type"].isin(common_types)]
    counts = (
        sub["type"]
        .value_counts()
        .reset_index()
        .rename(columns={"index": "type", "type": "count"})
    )
    return counts

noncommon_GR = non_common_types_for_country("GR")
noncommon_ES = non_common_types_for_country("ES")
noncommon_IT = non_common_types_for_country("IT")

print("=== ΜΗ-κοινοί types για GR ===")
display(noncommon_GR)

print("=== ΜΗ-κοινοί types για ES ===")
display(noncommon_ES)

print("=== ΜΗ-κοινοί types για IT ===")
display(noncommon_IT)

# 4. Αν θες, τα σώζουμε και σε CSV για Excel
noncommon_GR.to_csv("poi_types_noncommon_GR.csv", index=False)
noncommon_ES.to_csv("poi_types_noncommon_ES.csv", index=False)
noncommon_IT.to_csv("poi_types_noncommon_IT.csv", index=False)

print("\nΑποθηκεύτηκαν τα:")
print(" - poi_types_noncommon_GR.csv")
print(" - poi_types_noncommon_ES.csv")
print(" - poi_types_noncommon_IT.csv")


common types code

In [None]:
import pandas as pd

# Ξεκινάμε από το df που έχεις ήδη
df_types = df[["qid", "country", "type"]].copy()
df_types["type"] = df_types["type"].fillna("UNKNOWN")

# ---------------------------------------------------
# 1. Βρίσκουμε τους ΤΥΠΟΥΣ που είναι κοινοί σε ΟΛΕΣ τις χώρες
# ---------------------------------------------------

type_country = (
    df_types
    .pivot_table(
        index="type",
        columns="country",
        values="qid",
        aggfunc="count",
        fill_value=0
    )
)

# Σιγουρευόμαστε ότι έχουμε και τις 3 στήλες
for c in ["GR", "ES", "IT"]:
    if c not in type_country.columns:
        type_country[c] = 0

type_country["in_GR"] = type_country["GR"] > 0
type_country["in_ES"] = type_country["ES"] > 0
type_country["in_IT"] = type_country["IT"] > 0

common_mask = type_country["in_GR"] & type_country["in_ES"] & type_country["in_IT"]
common_types = type_country[common_mask].index.tolist()

print("Πλήθος ΟΛΩΝ των types:", type_country.shape[0])
print("Πλήθος ΚΟΙΝΩΝ types (GR+ES+IT):", len(common_types))
print("Μερικοί κοινοί types:", common_types[:10])

# ---------------------------------------------------
# 2. ΜΗ-κοινοί types για κάθε χώρα
# ---------------------------------------------------

# Ελλάδα (GR)
sub_gr = df_types[df_types["country"] == "GR"]
sub_gr_noncommon = sub_gr[~sub_gr["type"].isin(common_types)]
noncommon_gr_counts = (
    sub_gr_noncommon["type"]
    .value_counts()
    .reset_index()
    .rename(columns={"index": "type", "type": "count"})
)

print("\n=== ΜΗ-κοινοί types για GR ===")
display(noncommon_gr_counts)

# Ισπανία (ES)
sub_es = df_types[df_types["country"] == "ES"]
sub_es_noncommon = sub_es[~sub_es["type"].isin(common_types)]
noncommon_es_counts = (
    sub_es_noncommon["type"]
    .value_counts()
    .reset_index()
    .rename(columns={"index": "type", "type": "count"})
)

print("\n=== ΜΗ-κοινοί types για ES ===")
display(noncommon_es_counts)

# Ιταλία (IT)
sub_it = df_types[df_types["country"] == "IT"]
sub_it_noncommon = sub_it[~sub_it["type"].isin(common_types)]
noncommon_it_counts = (
    sub_it_noncommon["type"]
    .value_counts()
    .reset_index()
    .rename(columns={"index": "type", "type": "count"})
)

print("\n=== ΜΗ-κοινοί types για IT ===")
display(noncommon_it_counts)


Πλήθος ΟΛΩΝ των types: 1024
Πλήθος ΚΟΙΝΩΝ types (GR+ES+IT): 25
Μερικοί κοινοί types: ['Roman aqueduct', 'Roman temple', 'Roman villa', 'ancient Roman structure', 'ancient city', 'archaeological museum', 'archaeological site', 'archive', 'art museum', 'basilica']

=== ΜΗ-κοινοί types για GR ===


Unnamed: 0,count,count.1
0,αρχαιολογική θέση,505
1,μουσείο,210
2,πλατεία,51
3,λιμάνι,30
4,polis,25
...,...,...
177,στρατιωτικό νεκροταφείο,1
178,λιθοσωρός,1
179,tourist destination,1
180,Lyceum,1



=== ΜΗ-κοινοί types για ES ===


Unnamed: 0,count,count.1
0,yacimiento arqueológico,6221
1,human settlement,2431
2,plaza,1248
3,monumento,650
4,museo,568
...,...,...
632,toy museum,1
633,manorialism,1
634,torre de telegrafía óptica,1
635,glorieta,1



=== ΜΗ-κοινοί types για IT ===


Unnamed: 0,count,count.1
0,sito archeologico,226
1,museo,197
2,piazza,187
3,basilica minore,164
4,porto,155
...,...,...
273,riparo sotto roccia,1
274,ex chiesa,1
275,domus de janas,1
276,art collection,1




In [None]:
import pandas as pd

# Αν δεν έχεις ήδη df από το CSV, ξεσχόλιασε:
# df = pd.read_csv("destinations.csv")

df_types = df[["qid", "country", "type"]].copy()
df_types["type"] = df_types["type"].fillna("UNKNOWN")

# 1. Pivot: πόσα POIs έχει κάθε type ανά χώρα
type_country = (
    df_types
    .pivot_table(
        index="type",
        columns="country",
        values="qid",
        aggfunc="count",
        fill_value=0
    )
)

# Σιγουρευόμαστε ότι υπάρχουν στήλες GR/ES/IT
for c in ["GR", "ES", "IT"]:
    if c not in type_country.columns:
        type_country[c] = 0

type_country["in_GR"] = type_country["GR"] > 0
type_country["in_ES"] = type_country["ES"] > 0
type_country["in_IT"] = type_country["IT"] > 0

common_mask = type_country["in_GR"] & type_country["in_ES"] & type_country["in_IT"]
common_types = type_country[common_mask].index.tolist()

print("Πλήθος ΟΛΩΝ των types:", type_country.shape[0])
print("Πλήθος ΚΟΙΝΩΝ types (GR+ES+IT):", len(common_types))
print("Μερικοί κοινοί types:", common_types[:10])

# 2. ΜΗ-κοινοί types για κάθε χώρα

# Ελλάδα (GR)
sub_gr = df_types[df_types["country"] == "GR"]
sub_gr_noncommon = sub_gr[~sub_gr["type"].isin(common_types)]
noncommon_gr_counts = (
    sub_gr_noncommon["type"]
    .value_counts()
    .reset_index()
    .rename(columns={"index": "type", "type": "count"})
)

print("\n=== ΜΗ-κοινοί types για GR ===")
display(noncommon_gr_counts)

# Ισπανία (ES)
sub_es = df_types[df_types["country"] == "ES"]
sub_es_noncommon = sub_es[~sub_es["type"].isin(common_types)]
noncommon_es_counts = (
    sub_es_noncommon["type"]
    .value_counts()
    .reset_index()
    .rename(columns={"index": "type", "type": "count"})
)

print("\n=== ΜΗ-κοινοί types για ES ===")
display(noncommon_es_counts)

# Ιταλία (IT)
sub_it = df_types[df_types["country"] == "IT"]
sub_it_noncommon = sub_it[~sub_it["type"].isin(common_types)]
noncommon_it_counts = (
    sub_it_noncommon["type"]
    .value_counts()
    .reset_index()
    .rename(columns={"index": "type", "type": "count"})
)

print("\n=== ΜΗ-κοινοί types για IT ===")
display(noncommon_it_counts)

# 3. Export σε 3 Excel αρχεία (.xlsx)

noncommon_gr_counts.to_excel("poi_types_noncommon_GR.xlsx", index=False)
noncommon_es_counts.to_excel("poi_types_noncommon_ES.xlsx", index=False)
noncommon_it_counts.to_excel("poi_types_noncommon_IT.xlsx", index=False)

print("\nΑποθηκεύτηκαν τα Excel αρχεία:")
print(" - poi_types_noncommon_GR.xlsx")
print(" - poi_types_noncommon_ES.xlsx")
print(" - poi_types_noncommon_IT.xlsx")
print("Μπορείς να τα κατεβάσεις από το panel 'Files' στο Colab.")


Πλήθος ΟΛΩΝ των types: 1024
Πλήθος ΚΟΙΝΩΝ types (GR+ES+IT): 25
Μερικοί κοινοί types: ['Roman aqueduct', 'Roman temple', 'Roman villa', 'ancient Roman structure', 'ancient city', 'archaeological museum', 'archaeological site', 'archive', 'art museum', 'basilica']

=== ΜΗ-κοινοί types για GR ===


Unnamed: 0,count,count.1
0,αρχαιολογική θέση,505
1,μουσείο,210
2,πλατεία,51
3,λιμάνι,30
4,polis,25
...,...,...
177,στρατιωτικό νεκροταφείο,1
178,λιθοσωρός,1
179,tourist destination,1
180,Lyceum,1



=== ΜΗ-κοινοί types για ES ===


Unnamed: 0,count,count.1
0,yacimiento arqueológico,6221
1,human settlement,2431
2,plaza,1248
3,monumento,650
4,museo,568
...,...,...
632,toy museum,1
633,manorialism,1
634,torre de telegrafía óptica,1
635,glorieta,1



=== ΜΗ-κοινοί types για IT ===


Unnamed: 0,count,count.1
0,sito archeologico,226
1,museo,197
2,piazza,187
3,basilica minore,164
4,porto,155
...,...,...
273,riparo sotto roccia,1
274,ex chiesa,1
275,domus de janas,1
276,art collection,1



Αποθηκεύτηκαν τα Excel αρχεία:
 - poi_types_noncommon_GR.xlsx
 - poi_types_noncommon_ES.xlsx
 - poi_types_noncommon_IT.xlsx
Μπορείς να τα κατεβάσεις από το panel 'Files' στο Colab.


εδω κανω κια mapping και στατιστικα

In [None]:
import pandas as pd
import ast

# ============================================================
# 0. Φόρτωση δεδομένων
# ============================================================

# Αν ΕΧΕΙΣ ΗΔΗ df, άσε το όπως είναι.
# Αν ΟΧΙ, ξεσχόλιασε την επόμενη γραμμή και βάλε το σωστό όνομα αρχείου.
# df = pd.read_csv("destinations.csv")

df_groups = df.copy()
df_groups["type"] = df_groups["type"].fillna("UNKNOWN")

# ============================================================
# 1. Λίστες keywords για mapping type -> poi_group
#    (Όλα σε lower-case, αγγλικά + ιταλικά + ισπανικά + ελληνικά)
# ============================================================

MUSEUM_TERMS = [
    # γενικά
    "museum", "museo", "museu", "μουσείο",
    "art museum", "museo d'arte", "museo de arte", "μουσείο τέχνης",
    "archaeological museum", "museo archeologico", "museo arqueológico", "αρχαιολογικό μουσείο",
    "history museum", "museo storico", "museo de historia",
    "ethnographic museum", "museo etnografico", "museo etnográfico",
    "natural history museum", "museo di storia naturale",
    "technology museum", "museo della tecnologia", "μουσείο τεχνολογίας",
    "science museum", "museo científico",
    "toy museum", "miniatures museum", "museo paleontológico",
    "military museum", "museo militar",
    "maritime museum", "museo marítimo",
    "glyptotheque", "pinacoteca", "pinacotheca",
    "museographic collection", "colección museográfica",
    "ecomuseum", "ecomuseo",
    "museum of a public entity", "museo di un ente pubblico",
    "museum of the italian ministry of culture",
    "museo nazionale italiano", "museo nazionale",
    "religious museum", "εβραϊκό μουσείο",
    "sculpture museum", "computer museum",
    "public aquarium", "δημόσιο ενυδρείο",
    "folklore collection"
]

ARCHAEO_TERMS = [
    "archaeological site", "yacimiento arqueológico", "sito archeologico",
    "listed archaeological site", "archaeological culture",
    "archaeological park", "parco archeologico",
    "ancient city", "ancient greek archaeological site",
    "ancient greek temple", "ancient greek stadium",
    "ancient port", "ancient monument",
    "αρχαιολογική θέση", "αρχαίο μνημείο",
    "αρχαίο ελληνικό ιερό", "αρχαίος ελληνικός ναός",
    "αρχαίο ελληνικό θέατρο", "ancient greek theatre",
    "αρχαία πόλη", "πόλις", "polis", "città antica", "ciudad antigua",
    "ρωμαϊκή έπαυλη", "roman villa", "villa romana",
    "roman archaeological site", "ancient roman structure", "edificio romano",
    "roman city", "città romana", "roman colony",
    "roman theatre", "teatro romano", "anfiteatro romano", "roman amphitheatre",
    "roman road", "calzada romana",
    "roman bridge", "puente romano",
    "roman aqueduct", "acueducto romano", "roman aqueduct",
    "prehistoric archaeological site", "prehistoric site",
    "yacimiento arqueológico prehistórico",
    "paleontological site", "yacimiento paleontológico",
    "necropoli", "necropolis", "necrópolis", "necropoli preistorica",
    "catacombs", "catacombe", "κατακόμβες",
    "dolmen", "cromlech", "megalith", "megalito", "megalítico",
    "talayot", "talaiot",
    "tomba dei giganti", "tomba etrusca", "etruscan necropolis",
    "Samian Ware Discovery Site", "Samian Ware Production centre",
    "domus de janas", "domus",
    "tell", "τελλ",
    "Lagerstätte"
]

RELIGIOUS_TERMS = [
    "church", "chiesa", "iglesia", "ναός", "χριστιανικός ναός",
    "parish church", "chiesa parrocchiale", "iglesia parroquial",
    "catholic parish church", "iglesia parroquial católica",
    "catholic church building",
    "cathedral", "cattedrale", "catedral", "cathedral church",
    "basilica", "basilica minore", "minor basilica",
    "παλαιοχριστιανική βασιλική", "early christian basilica",
    "abbey", "abbazia", "abbazia benedettina",
    "monastery", "monastero", "monasterio",
    "convent", "convento", "nunnery",
    "cistercian monastery", "monasterio cisterciense",
    "cistercian nunnery",
    "santuario", "sanctuary", "santuario mariano",
    "ιερό κορυφής", "hieron", "ιερό", "temple", "tempio",
    "synagogue", "sinagoga", "συναγωγή",
    "mosque", "mezquita", "τζαμί",
    "chapel", "ermita", "hermitage", "oratory", "oratorio",
    "pilgrimage church", "church of pilgrimage",
    "co-cathedral", "concattedrale", "concatedral",
    "structure of worship", "sufi lodge"
]

FORTIFICATION_TERMS = [
    "castle", "castello", "castillo", "κάστρο",
    "alcazaba", "alcázar",
    "fortress", "fortaleza", "fortificação", "fortification",
    "fort", "forte", "fortín", "οχυρό", "φρούριο",
    "citadel", "acropolis", "ακρόπολη",
    "hillfort", "castro", "oppidum",
    "city walls", "muralla", "muralla urbana", "mura cittadine",
    "defensive wall", "οχυρωματικό τείχος",
    "tower", "torre", "watchtower", "atalaya",
    "fortified tower", "torre defensiva", "οχυρωματικός πύργος",
    "αμυντικός πύργος", "bell tower", "torre campanario",
    "tower house",
    "búnker", "blockhouse"
]

MONUMENT_TERMS = [
    "monument", "monumento", "μνημείο",
    "monument (spain)", "monumento conmemorativo",
    "memorial", "memorial del genocidio",
    "statue", "estatue", "estatua", "statua", "άγαλμα",
    "bust", "busto",
    "sculpture", "escultura", "scultura",
    "relief sculpture", "grupo escultórico", "group of sculptures",
    "γλυπτό σύμπλεγμα",
    "obelisk", "obelisco",
    "triumphal arch", "arco de triunfo", "arco trionfale",
    "commemorative plaque", "placa conmemorativa",
    "historic landmark"
]

URBAN_TERMS = [
    "square", "piazza", "plaza", "πλατεία", "plaza mayor",
    "porticoed square", "plaza porticada",
    "urban ensemble", "conjunto urbano", "urban area",
    "barrio", "neighborhood", "συνοικία",
    "city", "ciudad", "città",
    "palace", "palacio", "palazzo", "ανάκτορο",
    "public building", "edificio público", "δημαρχείο",
    "town hall",
    "market", "mercado",
    "street", "calle", "roadway", "avenue", "avenida", "δρόμος",
    "roundabout", "traffic circle", "rotonda"
]

SETTLEMENT_TERMS = [
    "human settlement", "settlement site", "οικιστική μονάδα",
    "village", "χωριό", "aldea", "hamlet", "frazione",
    "pueblo", "polis", "πόλις",
    "ghost town", "abandoned village", "deserted medieval village",
    "former settlement", "πρώην οικισμός", "former place",
    "οχυρωμένη πόλη"
]

NATURAL_TERMS = [
    "mountain", "montaña", "montagna", "βουνό",
    "hill", "collina",
    "lake", "lago",
    "bay", "baia",
    "cape", "peninsula", "ακτή",
    "cave", "cueva", "caverna", "grotta", "σπήλαιο", "show cave",
    "rock shelter", "rock art", "arte rupestre", "pintura rupestre",
    "hot spring", "aguas termales", "spring", "manantial",
    "river", "curso de agua",
    "park", "parco", "parque", "urban green space",
    "protected area", "área protegida", "natura 2000 site",
    "botanical garden", "jardín botánico", "garden", "jardín"
]

INDUSTRIAL_TERMS = [
    "factory", "fábrica", "foundry", "fundición",
    "industrial building", "edificio industrial",
    "mine", "mina", "sand mine",
    "quarry", "cantera",
    "ironworks",
    "ceramic factory",
    "shipyard", "astillero", "naval arsenal",
    "patrimonio industrial",
    "workshop", "tannery",
]

CEMETERY_TERMS = [
    "cemetery", "cementerio", "cimitero", "necropolis", "necrópolis", "necropoli",
    "grave", "sepultura", "tumba", "tomb", "τάφος",
    "mausoleum", "mausoleo", "μαυσωλείο",
    "ossuary", "ossario", "ossario", "ossario",
    "columbarium", "columbario",
    "burial plot", "ταφικό μνημείο",
    "military cemetery", "στρατιωτικό νεκροταφείο",
    "islamic cemetery"
]

TRANSPORT_TERMS = [
    "port", "porto", "puerto", "λιμάνι",
    "harbour", "harbor", "harbour",
    "marina", "porto marittimo", "puerto deportivo", "puerto pesquero",
    "railway station", "former railway station",
    "bus station", "autostazione",
    "airport", "aeródromo",
    "bridge", "puente", "stone bridge", "roman bridge", "gέφυρα"
]

# ============================================================
# 2. Συνάρτηση ταξινόμησης type -> poi_group
# ============================================================

def classify_type(t: str) -> str:
    t_low = str(t).lower()

    if any(k in t_low for k in MUSEUM_TERMS):
        return "Museum / Gallery"
    if any(k in t_low for k in ARCHAEO_TERMS):
        return "Archaeological / Historical site"
    if any(k in t_low for k in RELIGIOUS_TERMS):
        return "Religious / Sacred site"
    if any(k in t_low for k in FORTIFICATION_TERMS):
        return "Fortification"
    if any(k in t_low for k in MONUMENT_TERMS):
        return "Monument / Artwork"
    if any(k in t_low for k in URBAN_TERMS):
        return "Urban place"
    if any(k in t_low for k in SETTLEMENT_TERMS):
        return "Settlement"
    if any(k in t_low for k in NATURAL_TERMS):
        return "Natural / Park"
    if any(k in t_low for k in INDUSTRIAL_TERMS):
        return "Industrial / Infrastructure"
    if any(k in t_low for k in CEMETERY_TERMS):
        return "Cemetery / Burial"
    if any(k in t_low for k in TRANSPORT_TERMS):
        return "Transport"

    return "Other"

df_groups["poi_group"] = df_groups["type"].apply(classify_type)

print("POIs per poi_group (overall):")
print(df_groups["poi_group"].value_counts(), "\n")

print("POIs per country & poi_group:")
print(df_groups.groupby(["country", "poi_group"]).size(), "\n")

# ============================================================
# 3. Υπολογισμός quality metrics (labels / descriptions / aliases)
# ============================================================

# Labels & descriptions: presence flags
for col in ["label_local", "label_en", "desc_local", "desc_en"]:
    df_groups[col] = df_groups[col].fillna("")

df_groups["has_label_local"] = df_groups["label_local"] != ""
df_groups["has_label_en"]    = df_groups["label_en"]   != ""

df_groups["has_desc_local"]  = df_groups["desc_local"] != ""
df_groups["has_desc_en"]     = df_groups["desc_en"]    != ""

# Aliases: μετατροπή σε λίστα + counters + flags
def to_list_safe(x):
    if isinstance(x, list):
        return x
    if isinstance(x, str):
        x = x.strip()
        if x == "" or x.lower() == "nan":
            return []
        if x.startswith("[") and x.endswith("]"):
            try:
                return ast.literal_eval(x)
            except Exception:
                return []
        return [x]
    return []

for col in ["aliases_local", "aliases_en"]:
    df_groups[col] = df_groups[col].apply(to_list_safe)

df_groups["n_aliases_local"] = df_groups["aliases_local"].apply(len)
df_groups["n_aliases_en"]    = df_groups["aliases_en"].apply(len)

df_groups["has_alias_local"] = df_groups["n_aliases_local"] > 0
df_groups["has_alias_en"]    = df_groups["n_aliases_en"]    > 0

# Composite documentation score 0–3
df_groups["score_labels"] = ((df_groups["has_label_local"]) & (df_groups["has_label_en"])).astype(int)
df_groups["score_desc"]   = ((df_groups["has_desc_local"]) & (df_groups["has_desc_en"])).astype(int)
df_groups["score_alias"]  = ((df_groups["has_alias_local"]) |  (df_groups["has_alias_en"])).astype(int)

df_groups["doc_score_0_3"] = df_groups["score_labels"] + df_groups["score_desc"] + df_groups["score_alias"]

# ============================================================
# 4. Στατιστικά ανά χώρα + poi_group
# ============================================================

group_cols = ["country", "poi_group"]

# 4a. Coverage σε % (labels / descriptions / aliases)
coverage_by_group = (
    df_groups
    .groupby(group_cols)[[
        "has_label_local", "has_label_en",
        "has_desc_local", "has_desc_en",
        "has_alias_local", "has_alias_en"
    ]]
    .mean()
    .multiply(100)
    .round(1)
)

print("Coverage per country & POI group (%, labels / descriptions / aliases):")
print(coverage_by_group, "\n")

# 4b. Μέσος αριθμός aliases
aliases_mean_by_group = (
    df_groups
    .groupby(group_cols)[["n_aliases_local", "n_aliases_en"]]
    .mean()
    .round(2)
)

print("Average number of aliases per POI, per country & POI group:")
print(aliases_mean_by_group, "\n")

# 4c. Documentation score (0–3) mean / median
docscore_by_group = (
    df_groups
    .groupby(group_cols)["doc_score_0_3"]
    .agg(["mean", "median", "count"])
    .round(2)
)

print("Documentation score (0–3) per country & POI group (mean / median / N):")
print(docscore_by_group)


POIs per poi_group (overall):
poi_group
Archaeological / Historical site    11000
Monument / Artwork                   4311
Settlement                           2783
Other                                2598
Urban place                          2587
Museum / Gallery                     2502
Religious / Sacred site              1335
Fortification                        1276
Transport                             610
Natural / Park                        428
Cemetery / Burial                     397
Industrial / Infrastructure           339
Name: count, dtype: int64 

POIs per country & poi_group:
country  poi_group                       
ES       Archaeological / Historical site    9352
         Cemetery / Burial                    370
         Fortification                       1227
         Industrial / Infrastructure          337
         Monument / Artwork                  4218
         Museum / Gallery                    1399
         Natural / Park                       383
      

In [None]:
import pandas as pd
import numpy as np
import ast

# ============================================
# 1. Φόρτωμα dataset
# ============================================
# Αν χρειάζεται, άλλαξε το filename εδώ
df = pd.read_csv("destinations_multicountry.csv")

print("Shape:", df.shape)
print("Columns:", df.columns.tolist())
display(df.head(3))

# ============================================
# 2. Parsing aliases: string -> list
# ============================================

def parse_alias_cell(x):
    """
    Μετατρέπει το περιεχόμενο ενός κελιού aliases σε λίστα (list[str]).
    - NaN ή κενό -> []
    - "[...]" -> list μέσω ast.literal_eval
    - άλλο string -> [string]
    """
    if pd.isna(x) or x == "":
        return []
    if isinstance(x, list):
        return x
    if isinstance(x, str):
        x = x.strip()
        try:
            v = ast.literal_eval(x)
            if isinstance(v, list):
                return v
            else:
                return [str(v)]
        except Exception:
            return [x]
    return [str(x)]

df["aliases_local_list"] = df["aliases_local"].apply(parse_alias_cell)
df["aliases_en_list"]    = df["aliases_en"].apply(parse_alias_cell)

print("\nΠαράδειγμα aliases μετά το parsing:")
display(df[["aliases_local", "aliases_local_list",
            "aliases_en", "aliases_en_list"]].head(5))

# ============================================
# 3. Boolean flags για label / description / alias
#    (local language & English)
# ============================================

# Local language (Greek για GR, Spanish για ES, Italian για IT)
df["has_label_local"] = df["label_local"].fillna("").str.strip() != ""
df["has_desc_local"]  = df["desc_local"].fillna("").str.strip() != ""
df["has_alias_local"] = df["aliases_local_list"].apply(lambda x: len(x) > 0)

# English
df["has_label_en"] = df["label_en"].fillna("").str.strip() != ""
df["has_desc_en"]  = df["desc_en"].fillna("").str.strip() != ""
df["has_alias_en"] = df["aliases_en_list"].apply(lambda x: len(x) > 0)

print("\nΈλεγχος flags (πρώτες γραμμές):")
display(
    df[
        [
            "country",
            "label_local", "has_label_local",
            "label_en",    "has_label_en",
            "desc_local",  "has_desc_local",
            "desc_en",     "has_desc_en",
            "aliases_local_list", "has_alias_local",
            "aliases_en_list",    "has_alias_en",
        ]
    ].head(5)
)

# ============================================
# 4. Documentation scores 0–3
# ============================================

# Local documentation score: 1 για label, 1 για description, 1 για ≥1 alias (στη local γλώσσα)
df["doc_score_local"] = (
    df["has_label_local"].astype(int)
    + df["has_desc_local"].astype(int)
    + df["has_alias_local"].astype(int)
)

# English documentation score: 1 για English label, 1 για English description, 1 για ≥1 English alias
df["doc_score_en"] = (
    df["has_label_en"].astype(int)
    + df["has_desc_en"].astype(int)
    + df["has_alias_en"].astype(int)
)

print("\nΠαράδειγμα scores (πρώτες γραμμές):")
display(df[["country", "doc_score_local", "doc_score_en"]].head(10))

# ============================================
# 5. Coverage stats (για να αντιστοιχούν στα Tables 1,3,4)
# ============================================

coverage = (
    df
    .groupby("country")
    .agg(
        has_label_local=("has_label_local", "mean"),
        has_label_en=("has_label_en", "mean"),
        has_desc_local=("has_desc_local", "mean"),
        has_desc_en=("has_desc_en", "mean"),
        has_alias_local=("has_alias_local", "mean"),
        has_alias_en=("has_alias_en", "mean"),
    )
)

coverage = (coverage * 100).round(1)

print("\nCoverage per country (% of POIs with at least one field):")
display(coverage)

# ============================================
# 6. Global documentation scores ανά χώρα
#    (mean/median + % score=3 + πλήθος POIs)
# ============================================

def share_full(s):
    # ποσοστό POIs με score=3
    return (s == 3).mean()

doc_global = (
    df
    .groupby("country")
    .agg(
        mean_doc_local=("doc_score_local", "mean"),
        median_doc_local=("doc_score_local", "median"),
        full_doc_local=("doc_score_local", share_full),
        mean_doc_en=("doc_score_en", "mean"),
        median_doc_en=("doc_score_en", "median"),
        full_doc_en=("doc_score_en", share_full),
        n_pois=("country", "size"),
    )
    .reset_index()
)

doc_global["full_doc_local"] = (doc_global["full_doc_local"] * 100).round(1)
doc_global["full_doc_en"]    = (doc_global["full_doc_en"] * 100).round(1)
doc_global = doc_global.round(
    {"mean_doc_local": 2, "median_doc_local": 2,
     "mean_doc_en": 2, "median_doc_en": 2}
)

print("\nGlobal documentation stats per country (doc_score_local / doc_score_en):")
display(doc_global)


Shape: (36693, 17)
Columns: ['qid', 'country', 'local_lang', 'label_local', 'label_en', 'desc_local', 'desc_en', 'type', 'admin', 'coordinates_wkt', 'image', 'website', 'localwiki', 'enwiki', 'aliases_local', 'aliases_en', 'sources']


Unnamed: 0,qid,country,local_lang,label_local,label_en,desc_local,desc_en,type,admin,coordinates_wkt,image,website,localwiki,enwiki,aliases_local,aliases_en,sources
0,Q132472,GR,el,Μίεζα,Mieza,κώμη στην αρχαία Μακεδονία,village in Ancient Macedon,αρχαιολογική θέση,Naousa Municipality,Point(22.122222222 40.644166666),http://commons.wikimedia.org/wiki/Special:File...,,https://el.wikipedia.org/wiki/%CE%9C%CE%AF%CE%...,https://en.wikipedia.org/wiki/Mieza_(Macedonia),[],[],['http://odysseus.culture.gr/h/2/gh251.jsp?obj...
1,Q152348,GR,el,Κασταλία πηγή,Castalian Spring,,sacred fountain at Delphi,υδάτινη πηγή,Delfi Municipality,Point(22.505555555 38.483055555),http://commons.wikimedia.org/wiki/Special:File...,,https://el.wikipedia.org/wiki/%CE%9A%CE%B1%CF%...,https://en.wikipedia.org/wiki/Castalian_Spring,['Κασταλία κρήνη'],['Castalian fountain'],['http://odysseus.culture.gr/h/2/gh251.jsp?obj...
2,Q140345,GR,el,Ζαγορά Άνδρου,Zagora,,"archaeological site in Andros island, Greece",archaeological site,Δήμος Άνδρου,Point(24.86555556 37.77416667),,,,,[],[],['http://odysseus.culture.gr/h/3/gh351.jsp?obj...



Παράδειγμα aliases μετά το parsing:


Unnamed: 0,aliases_local,aliases_local_list,aliases_en,aliases_en_list
0,[],[],[],[]
1,['Κασταλία κρήνη'],[Κασταλία κρήνη],['Castalian fountain'],[Castalian fountain]
2,[],[],[],[]
3,"['Τεχνόπολη', 'Τεχνόπολις']","[Τεχνόπολη, Τεχνόπολις]","['Gazi', 'Industrial gas museum of Athens', 'T...","[Gazi, Industrial gas museum of Athens, Techno..."
4,"['Ακαδημία του Πλάτωνα', 'Πλατωνική Ακαδημία']","[Ακαδημία του Πλάτωνα, Πλατωνική Ακαδημία]","['Academy of Plato', ""Plato's Academy""]","[Academy of Plato, Plato's Academy]"



Έλεγχος flags (πρώτες γραμμές):


Unnamed: 0,country,label_local,has_label_local,label_en,has_label_en,desc_local,has_desc_local,desc_en,has_desc_en,aliases_local_list,has_alias_local,aliases_en_list,has_alias_en
0,GR,Μίεζα,True,Mieza,True,κώμη στην αρχαία Μακεδονία,True,village in Ancient Macedon,True,[],False,[],False
1,GR,Κασταλία πηγή,True,Castalian Spring,True,,False,sacred fountain at Delphi,True,[Κασταλία κρήνη],True,[Castalian fountain],True
2,GR,Ζαγορά Άνδρου,True,Zagora,True,,False,"archaeological site in Andros island, Greece",True,[],False,[],False
3,GR,Τεχνόπολη Δήμου Αθηναίων,True,Technopolis City of Athens,True,κέντρο πολιτισμού και μουσείο στην Αθήνα,True,"museum and cultural center in Athens, Greece",True,"[Τεχνόπολη, Τεχνόπολις]",True,"[Gazi, Industrial gas museum of Athens, Techno...",True
4,GR,Ακαδημία Πλάτωνος,True,Platonic Academy,True,"φιλοσοφικό, ερευνητικό και εκπαιδευτικό κέντρο...",True,"ancient philosophical, research and educative ...",True,"[Ακαδημία του Πλάτωνα, Πλατωνική Ακαδημία]",True,"[Academy of Plato, Plato's Academy]",True



Παράδειγμα scores (πρώτες γραμμές):


Unnamed: 0,country,doc_score_local,doc_score_en
0,GR,2,2
1,GR,2,3
2,GR,1,2
3,GR,3,3
4,GR,3,3
5,GR,3,3
6,GR,2,2
7,GR,3,3
8,GR,1,2
9,GR,2,2



Coverage per country (% of POIs with at least one field):


Unnamed: 0_level_0,has_label_local,has_label_en,has_desc_local,has_desc_en,has_alias_local,has_alias_en
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ES,90.0,80.4,87.2,55.5,28.6,5.4
GR,77.9,97.2,45.6,83.7,26.6,32.5
IT,95.7,74.7,82.6,79.7,37.1,16.5



Global documentation stats per country (doc_score_local / doc_score_en):


Unnamed: 0,country,mean_doc_local,median_doc_local,full_doc_local,mean_doc_en,median_doc_en,full_doc_en,n_pois
0,ES,2.06,2.0,27.6,1.41,1.0,4.9,25401
1,GR,1.5,2.0,19.2,2.13,2.0,31.8,2021
2,IT,2.15,2.0,33.4,1.71,2.0,15.1,9271


In [None]:
import pandas as pd
import matplotlib.pyplot as plt


def compute_score_distribution(df: pd.DataFrame, country_col: str, score_col: str) -> pd.DataFrame:
    """
    Returns a DataFrame with index=country and columns=[0,1,2,3]
    containing percentages (0-100) of POIs per documentation score.
    """
    # Keep only needed cols and drop missing rows safely
    data = df[[country_col, score_col]].dropna()

    # Ensure score is integer-like (0..3)
    data[score_col] = data[score_col].astype(int)

    # Count per (country, score)
    counts = (
        data
        .groupby([country_col, score_col])
        .size()
        .unstack(fill_value=0)
    )

    # Ensure all score columns exist (0..3), even if missing in a country
    for s in [0, 1, 2, 3]:
        if s not in counts.columns:
            counts[s] = 0
    counts = counts[[0, 1, 2, 3]]

    # Convert to percentages
    percentages = counts.div(counts.sum(axis=1), axis=0) * 100

    return percentages


def plot_stacked_distribution(percentages: pd.DataFrame, title: str, output_path: str = None):
    """
    Plots a stacked bar chart for score distribution percentages.
    If output_path is given, saves figure to file (e.g., .png, .pdf).
    """
    ax = percentages.plot(kind="bar", stacked=True, figsize=(7, 5))

    ax.set_ylabel("Percentage of POIs")
    ax.set_xlabel("Country")
    ax.set_title(title)

    ax.legend(
        title="Documentation score",
        bbox_to_anchor=(0.5, -0.2),
        loc="upper center",
        ncol=4,
        frameon=False
    )

    plt.tight_layout()

    if output_path:
        plt.savefig(output_path, dpi=300, bbox_inches="tight")
    plt.show()


# =========================
# MAIN (run everything)
# =========================

# ---- IMPORTANT ----
# Change these if your columns have different names
COUNTRY_COL = "country"
LOCAL_SCORE_COL = "doc_score_local"
EN_SCORE_COL = "doc_score_en"

# Optional: enforce a nice ordering of countries (adjust if needed)
country_order = ["Greece", "Spain", "Italy"]
# If your df uses codes like "GR", "ES", "IT", change this list accordingly.

# Make sure country is string
df[COUNTRY_COL] = df[COUNTRY_COL].astype(str)

# (Optional) reorder categories if they exist
df[COUNTRY_COL] = pd.Categorical(df[COUNTRY_COL], categories=country_order, ordered=True)

# ---- Local distribution ----
local_pct = compute_score_distribution(df, COUNTRY_COL, LOCAL_SCORE_COL)
local_pct = local_pct.sort_index()  # respects categorical order if set

plot_stacked_distribution(
    local_pct,
    title="Distribution of local documentation scores (0–3)",
    output_path="doc_score_distribution_local.png"  # set None if you don't want saving
)

# ---- English distribution ----
en_pct = compute_score_distribution(df, COUNTRY_COL, EN_SCORE_COL)
en_pct = en_pct.sort_index()

plot_stacked_distribution(
    en_pct,
    title="Distribution of English documentation scores (0–3)",
    output_path="doc_score_distribution_english.png"  # set None if you don't want saving
)

# ---- Optional: print tables for paper appendix ----
print("\nLocal score distribution (%):")
print(local_pct.round(2))

print("\nEnglish score distribution (%):")
print(en_pct.round(2))


KeyError: "['doc_score_local'] not in index"