In [None]:
import pandas as pd
import numpy as np
import re
from langdetect import detect, DetectorFactory
from bs4 import BeautifulSoup
import requests
import time

In [None]:
# Source workbook, available here: https://open-research-europe.ec.europa.eu/articles/5-241
lex = pd.read_excel(
    io="FinalLexicon_02052025.xlsx",
    sheet_name="8.Europe_and_Japan",
)

In [None]:
# Display the sheet as loaded by read_excel.
lex

In [None]:
# Normalise header labels by trimming stray whitespace
# (e.g. "City_Website " becomes "City_Website"). Because every string label is
# stripped here, no separate rename of space-suffixed column names is needed.
lex.columns = [c.strip() if isinstance(c, str) else c for c in lex.columns]

# Strip whitespace from string values. astype(str) turns missing values into the
# literal "nan", so that literal is mapped back to a real NaN afterwards.
for col in lex.select_dtypes(include=["object"]).columns:
    lex[col] = lex[col].astype(str).str.strip().replace({"nan": np.nan})

# Forward fill country info where merged cells were blank
for c in ["Country_Name", "Country_ID_Number"]:
    if c in lex.columns:
        lex[c] = lex[c].ffill()

# Drop rows with no country and no city. Plain indexing is used (rather than
# DataFrame.get) so a missing column fails with a clear KeyError.
lex = lex[~(lex["City_Name"].isna() & lex["Country_Name"].isna())].copy()

# Remove Japan (rows whose Country_Name is NaN compare unequal and are kept)
europe = lex[lex["Country_Name"].str.lower() != "japan"].copy()

# Convert numeric columns; unparseable values become <NA> in the nullable Int64 dtype
for c in ["Population", "Year", "Country_ID_Number"]:
    if c in europe.columns:
        europe[c] = pd.to_numeric(europe[c], errors="coerce").astype("Int64")

# Normalize URLs
def normalize_url(u):
    """Return ``u`` as a URL with an explicit scheme, or NaN when there is none.

    Parameters
    ----------
    u : object
        Raw cell value. Anything that is not a non-blank string (NaN, None,
        numbers, empty or whitespace-only text) is treated as missing.

    Returns
    -------
    str or float
        The trimmed URL, prefixed with ``http://`` when it does not already
        start with ``http://`` or ``https://`` (case-insensitive); ``np.nan``
        for missing input.
    """
    # Type check first: it covers NaN/None and avoids calling pd.isna on
    # list-like values, whose array result cannot be used in a boolean test.
    if not isinstance(u, str):
        return np.nan
    u = u.strip()
    # Check emptiness *after* stripping so "   " does not become "http://".
    if not u:
        return np.nan
    if re.match(r"^https?://", u, flags=re.I):
        return u
    # Protocol-relative form ("//host/path"): drop the leading slashes so the
    # result is "http://host/path" rather than "http:////host/path".
    if u.startswith("//"):
        u = u[2:]
    return "http://" + u

if "City_Website" in europe.columns:
    europe["City_Website"] = europe["City_Website"].apply(normalize_url)

# Remove rows without a city name, then rows whose name is purely numeric (e.g. "63").
europe = europe[europe["City_Name"].notna()].copy()
europe = europe[~europe["City_Name"].str.isnumeric()]
# Rows containing "no city above 50k" are not dropped here; a later cell
# rewrites them for the countries listed in micro_map.

# Reorder columns for readability. The labels must match the frame's column names
# exactly ("Country_ID_Number", no embedded space), otherwise a column silently
# falls through to other_cols and ends up at the back.
ordered_cols = [c for c in [
    "Country_ID_Number","Country_Name","Capital","City_Name","State",
    "Population","Year","City_Website","Local_adm_unit","Validated",
    "Source","Note"
] if c in europe.columns]
other_cols = [c for c in europe.columns if c not in ordered_cols]
eu = europe[ordered_cols + other_cols].reset_index(drop=True)

In [None]:
# Display the cleaned frame with its reordered columns.
eu

In [None]:
# Keep only the columns used downstream. The explicit .copy() makes `eu` an
# independent frame, so the later `eu.loc[...] = ...` and `eu["..."] = ...`
# assignments do not trigger SettingWithCopyWarning.
eu = eu[["City_Name", "State", "Country_Name", "City_Website"]].copy()

In [None]:
# Lookup keyed by country name; each value is the pair
# (capital, official/municipal website).
micro_map = {
    country: (capital, website)
    for country, capital, website in [
        ("Andorra", "Andorra la Vella", "https://www.andorralavella.ad/"),
        ("Liechtenstein", "Vaduz", "https://www.vaduz.li/"),
        ("Malta", "Valletta", np.nan),  # no website recorded
        ("San Marino", "San Marino", "https://www.gov.sm/"),  # state portal
        ("Vatican City", "Vatican City", "https://www.vatican.va/"),
    ]
}

In [None]:
# Flag rows whose City_Name holds the "no city above 50k" placeholder text.
mask_placeholder = (
    eu["City_Name"]
    .astype(str)
    .str.contains(r"\bno city above 50k\b", case=False, na=False)
)

# Country names, trimmed once up front; the loop below never writes to this column.
country_clean = eu["Country_Name"].astype(str).str.strip()

# For every country in micro_map, overwrite the placeholder row(s) with the
# capital's name and website.
for country, (capital, url) in micro_map.items():
    m = mask_placeholder & country_clean.eq(country)
    eu.loc[m, "City_Name"] = capital
    eu.loc[m, "City_Website"] = url

In [None]:
# Display the frame after the placeholder rows were rewritten.
eu

In [None]:
# Fix langdetect's seed so repeated runs give the same detection results.
DetectorFactory.seed = 0  # deterministic

In [None]:
def fetch_lang_from_site(url, timeout=10):
    """Best-effort detection of the language of the page at ``url``.

    Parameters
    ----------
    url : str or missing
        Address to fetch. NaN, None and empty strings yield ``None`` without
        any network access.
    timeout : int or float, default 10
        Timeout passed to ``requests.get``.

    Returns
    -------
    str or None
        The lower-cased primary subtag of the ``<html lang="...">`` attribute
        (``"en-GB"`` and ``"en_GB"`` both give ``"en"``) when one is declared;
        otherwise the code ``langdetect.detect`` reports for the page text;
        ``None`` when the page cannot be fetched or nothing can be detected.
        No exception is propagated to the caller.
    """
    if pd.isna(url) or not url:
        return None
    try:
        r = requests.get(url, timeout=timeout, headers={"User-Agent": "Mozilla/5.0"})
        r.raise_for_status()
        # Hand BeautifulSoup the raw bytes so it can work out the encoding itself
        # (including <meta charset> declarations). r.text is decoded from the
        # HTTP header guess alone, which garbles pages that only declare their
        # charset in the markup and misleads the text-based fallback below.
        soup = BeautifulSoup(r.content, "html.parser")
        # 1) Try <html lang="xx">
        html_tag = soup.find("html")
        declared = html_tag.get("lang") if html_tag else None
        if declared:
            # Accept both "xx-YY" and "xx_YY"; ignore surrounding whitespace.
            primary = str(declared).strip().replace("_", "-").split("-")[0].lower()
            if primary:
                return primary
            # A blank attribute carries no information: fall through to detection.
        # 2) Fallback: detect from text
        text = soup.get_text(separator=" ", strip=True)
        if text:
            return detect(text)
    except Exception:
        # Deliberately broad: network, HTTP, parsing and detection failures all
        # mean "language unknown" for this row; the crawl must keep going.
        return None
    return None

In [None]:
# Primary language subtag -> English language name. Built once at module level
# instead of on every call.
_LANGUAGE_NAMES = {
    "en": "English", "de": "German", "fr": "French", "es": "Spanish", "it": "Italian",
    "pt": "Portuguese", "nl": "Dutch", "da": "Danish", "sv": "Swedish",
    # Norwegian pages usually declare Bokmål ("nb") or Nynorsk ("nn") rather than "no".
    "no": "Norwegian", "nb": "Norwegian", "nn": "Norwegian",
    "fi": "Finnish", "et": "Estonian", "lv": "Latvian", "lt": "Lithuanian",
    "pl": "Polish", "cs": "Czech", "sk": "Slovak", "hu": "Hungarian",
    "ro": "Romanian", "bg": "Bulgarian", "el": "Greek", "sq": "Albanian",
    "mk": "Macedonian", "sr": "Serbian", "hr": "Croatian", "bs": "Bosnian",
    "sl": "Slovenian", "uk": "Ukrainian", "be": "Belarusian", "ru": "Russian",
    "is": "Icelandic", "mt": "Maltese", "ca": "Catalan", "eu": "Basque",
    "gl": "Galician", "ga": "Irish", "cy": "Welsh",
}


def language_name_from_code(code):
    """Translate a language code into its English language name.

    Parameters
    ----------
    code : str or None
        Language code such as ``"de"``, ``"EN-gb"`` or ``"pt_BR"``. Case,
        surrounding whitespace and any region subtag are ignored.

    Returns
    -------
    str or None
        The language name, or ``None`` when ``code`` is not a string or its
        primary subtag is not in the table.
    """
    if not isinstance(code, str):
        return None
    primary = code.strip().replace("_", "-").split("-")[0].lower()
    return _LANGUAGE_NAMES.get(primary)

# The phrase "artificial intelligence" in each language, keyed by the
# English language name.
ai_translations = {
    "English": "artificial intelligence",
    "German": "künstliche Intelligenz",
    "French": "intelligence artificielle",
    "Spanish": "inteligencia artificial",
    "Italian": "intelligenza artificiale",
    "Portuguese": "inteligência artificial",
    "Dutch": "kunstmatige intelligentie",
    "Danish": "kunstig intelligens",
    "Swedish": "artificiell intelligens",
    "Norwegian": "kunstig intelligens",
    "Finnish": "tekoäly",
    "Estonian": "tehisintellekt",
    "Latvian": "mākslīgais intelekts",
    "Lithuanian": "dirbtinis intelektas",
    "Polish": "sztuczna inteligencja",
    "Czech": "umělá inteligence",
    "Slovak": "umelá inteligencia",
    "Hungarian": "mesterséges intelligencia",
    "Romanian": "inteligență artificială",
    "Bulgarian": "изкуствен интелект",
    "Greek": "τεχνητή νοημοσύνη",
    "Albanian": "inteligjencë artificiale",
    "Macedonian": "вештачка интелигенција",
    "Serbian": "вештачка интелигенција",
    "Croatian": "umjetna inteligencija",
    "Bosnian": "umjetna inteligencija",
    "Slovenian": "umetna inteligenca",
    "Ukrainian": "штучний інтелект",
    "Belarusian": "штучны інтэлект",
    "Russian": "искусственный интеллект",
    "Icelandic": "gervigreind",
    "Maltese": "intelliġenza artifiċjali",
    "Catalan": "intel·ligència artificial",
    "Basque": "adimen artifiziala",
    "Galician": "intelixencia artificial",
    "Irish": "intleacht shaorga",
    "Welsh": "deallusrwydd artiffisial",
}


In [None]:
# Visit every city's website, record the detected language name and the
# matching translation of "artificial intelligence".
langs = []
translations = []
for url in eu["City_Website"]:
    has_site = not (pd.isna(url) or not url)
    if has_site:
        code = fetch_lang_from_site(url)
        lang_name = language_name_from_code(code) if code else None
        langs.append(lang_name)
        # Unknown or unmapped languages fall back to the English phrase.
        translations.append(ai_translations.get(lang_name, "artificial intelligence"))
        # Pause between requests.
        time.sleep(0.5)
    else:
        # No website for this row: nothing to fetch, leave both fields empty.
        langs.append(None)
        translations.append(None)

eu["language"] = langs
eu["ai_translate"] = translations

In [None]:
# Display the frame with the new language and ai_translate columns.
eu

In [None]:
# Placeholder column for raw language codes. It is created with object dtype
# because string codes are written into it cell by cell later on; a float64
# NaN column would need an implicit upcast on the first string assignment,
# which pandas has deprecated.
eu["lang_code"] = pd.Series(np.nan, index=eu.index, dtype="object")

In [None]:
# Select the rows that have a non-blank website but no resolved language name.
has_site = eu["City_Website"].notna() & eu["City_Website"].astype(str).str.strip().ne("")
no_language = eu["language"].isna() | eu["language"].astype(str).eq("None")
mask_targets = has_site & no_language

# Re-fetch only those rows and keep the raw code that fetch_lang_from_site returns.
for idx, url in eu.loc[mask_targets, "City_Website"].items():
    code = fetch_lang_from_site(url)
    eu.at[idx, "lang_code"] = code or np.nan
    time.sleep(0.5)

In [None]:
# Write the crawl result to disk as UTF-8, without the index column.
eu.to_csv(path_or_buf="eu_test.csv", encoding="utf-8", index=False)

In [None]:
# NOTE(review): "eu_checked1.csv" is not written anywhere in the visible notebook;
# presumably it is a manually reviewed copy of eu_test.csv. Confirm its provenance.
temp = pd.read_csv(filepath_or_buffer="eu_checked1.csv")

In [None]:
# Set the language and its AI term on every row whose Country_Name is Russia.
is_russia = temp["Country_Name"] == "Russia"
temp.loc[is_russia, "language"] = "Russian"
temp.loc[is_russia, "ai_translate"] = "искусственный интеллект"

In [None]:
# Save the corrected table as UTF-8, without the index column.
temp.to_csv(path_or_buf="eu_checked2.csv", encoding="utf-8", index=False)