In [33]:
import pandas as pd
import requests
import time

import os

# The Google Maps Platform key is read from the environment so it is never
# stored in (or committed with) the notebook. Export GOOGLE_MAPS_API_KEY
# before starting the kernel.
# NOTE(review): a literal key used to be hardcoded on this line; treat that
# key as leaked and rotate it.
API_KEY = os.environ.get("GOOGLE_MAPS_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "GOOGLE_MAPS_API_KEY is not set; export it before running this notebook."
    )

SEARCH_URL = "https://maps.googleapis.com/maps/api/place/textsearch/json"
DETAILS_URL = "https://maps.googleapis.com/maps/api/place/details/json"

# Load your own restaurant names (the loop below reads the "Name" column).
df = pd.read_csv("haarlem_today_restaurants.csv")

# Prepare new columns for Google data. Each starts out as an empty string so
# that restaurants without a match still produce a blank cell in the output CSV.
GOOGLE_COLUMNS = [
    "Google Name",
    "Rating",
    "Total Ratings",
    "Google Address",
    "Place ID",
    "Types",
    "Price Level",
    "Google Website",
    "Phone Number",
    "Opening Hours",
]
for column in GOOGLE_COLUMNS:
    df[column] = ""

REQUEST_TIMEOUT = 10  # seconds; requests waits indefinitely when no timeout is given
PAUSE_BETWEEN_RESTAURANTS = 2  # seconds; be kind to the API limits
PRICE_SYMBOLS = {1: "$", 2: "$$", 3: "$$$", 4: "$$$$"}


def map_price_level(level):
    """Return the dollar-sign label for a price level.

    Levels 1-4 map to "$" .. "$$$$"; any other value (including None and 0)
    maps to an empty string.
    """
    return PRICE_SYMBOLS.get(level, "")


def get_json_or_none(url, params):
    """GET ``url`` with ``params`` and return the decoded JSON body.

    Returns None (after printing a short warning) when the request fails,
    times out, returns an HTTP error status, or the body is not valid JSON,
    so that one bad response does not abort the whole run.
    """
    try:
        response = requests.get(url, params=params, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        return response.json()
    except (requests.RequestException, ValueError) as exc:
        # Only the exception type is printed: requests' error messages embed
        # the full request URL, whose query string contains the API key.
        print(f"⚠️ Request failed ({type(exc).__name__}): {url}")
        return None


for idx, row in df.iterrows():
    raw_name = row["Name"]
    # Test for a missing value directly instead of comparing the string "nan",
    # which would also skip a restaurant that is literally called "Nan".
    if pd.isna(raw_name):
        continue
    restaurant_name = str(raw_name).strip()
    if not restaurant_name:
        continue

    # Step 1: Text Search
    query = f"{restaurant_name} restaurant Haarlem"
    params = {
        "query": query,
        "region": "nl",
        "location": "52.3874,4.6462",
        "radius": 30000,
        "key": API_KEY
    }

    data = get_json_or_none(SEARCH_URL, params) or {}
    results = data.get("results") or []

    if results:
        # Only the first search hit is used.
        place = results[0]
        place_id = place.get("place_id")

        df.at[idx, "Google Name"] = place.get("name", "")
        df.at[idx, "Rating"] = place.get("rating", "")
        df.at[idx, "Total Ratings"] = place.get("user_ratings_total", "")
        df.at[idx, "Google Address"] = place.get("formatted_address", "")
        df.at[idx, "Place ID"] = place_id
        df.at[idx, "Types"] = ", ".join(place.get("types", []))

        # Step 2: Place Details API
        if place_id:
            details_params = {
                "place_id": place_id,
                "fields": "price_level,website,formatted_phone_number,opening_hours",
                "key": API_KEY
            }
            details_payload = get_json_or_none(DETAILS_URL, details_params) or {}
            details_data = details_payload.get("result", {})

            df.at[idx, "Price Level"] = map_price_level(details_data.get("price_level"))
            df.at[idx, "Google Website"] = details_data.get("website", "")
            df.at[idx, "Phone Number"] = details_data.get("formatted_phone_number", "")

            opening_hours = details_data.get("opening_hours")
            if opening_hours:
                df.at[idx, "Opening Hours"] = ", ".join(opening_hours.get("weekday_text", []))

        print(f"✅ {restaurant_name} → {place.get('rating', 'N/A')}")

    else:
        print(f"❌ No results for: {query}")
        print("↪️ Response:", data.get("status"), data.get("error_message"))

    time.sleep(PAUSE_BETWEEN_RESTAURANTS)

# Save the enriched data
df.to_csv("haarlem_today_with_full_google_data.csv", index=False)
print("✅ Saved to 'haarlem_today_with_full_google_data.csv'")


✅ Brasserie de Canette → 4.4
✅ Cafe De Lange Heer → 4.5
✅ Blend Haarlem → 4.2
✅ The Governor → 4.3
✅ Restaurant Éclusier → 5
✅ Southern Cross → 4.7
✅ Brasserie van Beinum → 4.2
✅ Restaurant Rood → 4.7
✅ Guus Koffie → 4.8
✅ Toko SamaSama → 4.5
✅ FLFL → 4.8
✅ By LIMA → 4.5
✅ De slagersdochter → 4.5
✅ Friethoes (De Winkel) → 4.7
✅ Rigatoni → 4.6
✅ Adamo → 4.6
✅ Mano → 4.9
✅ Club Cantina → 4.6
✅ Kraantje Lek → 4.2
✅ Nolita → 4.4
✅ Restaurant Locael Centraal → 4.2
✅ Restaurant Locael Bloemendaal → 3.9
✅ Teds Haarlem → 4.2
✅ Menu Corridor → 4.4
✅ Café Colette → 4.4
✅ Restaurant Metzo → 4.5
✅ Museumcafé Thuys → 4.5
✅ Kus van de Cactus → 4.5
✅ The Harlem Social Club → 4.7
✅ Frenchie Restaurant → 4.5
✅ Restaurant Fris → 4.6
✅ Friethuis La Petite → 4.7
✅ Bambu Kitchen & Bar → 4.3
✅ Republiek Bloemendaal → 3.8
✅ Five Brothers Fat → 4.6
✅ Bistrobar Indonesia → 4.3
✅ Maita → 4.4
✅ Olivers Haarlem → 4.7
✅ Houtbaar → 4.9
✅ kaldi → 4.7
✅ Pip Deli → 4.8
✅ Red Orchids → 4.5
✅ Meneer Fans → 4.4
✅ Mooie B

In [None]:
import re
import pandas as pd
import requests
import time
from sentence_transformers import SentenceTransformer, util
from cuisine_keywords import cuisine_keywords  # External Python file containing the keyword dictionary
from unidecode import unidecode  # Normalize accented characters

# === CONFIG ===
import os

# The Google Maps Platform key is read from the environment so it is never
# stored in (or committed with) the notebook. Export GOOGLE_MAPS_API_KEY
# before starting the kernel.
# NOTE(review): a literal key used to be hardcoded on this line; treat that
# key as leaked and rotate it.
API_KEY = os.environ.get("GOOGLE_MAPS_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "GOOGLE_MAPS_API_KEY is not set; export it before running this notebook."
    )

SEARCH_URL = "https://maps.googleapis.com/maps/api/place/textsearch/json"
DETAILS_URL = "https://maps.googleapis.com/maps/api/place/details/json"
# NOTE(review): these are absolute paths, while the first cell reads the same
# file name from the working directory — confirm which location is intended.
CSV_INPUT = "/mnt/data/haarlem_today_restaurants.csv"
CSV_OUTPUT = "/mnt/data/haarlem_today_enriched_v2.csv"

# === Load model and cuisine labels ===
# The cuisine labels are the keys of the imported keyword dictionary; they are
# embedded once here so the main loop only has to embed each restaurant's text.
model = SentenceTransformer("all-mpnet-base-v2")
CUISINES = list(cuisine_keywords.keys())
cuisine_embeddings = model.encode(CUISINES, convert_to_tensor=True)

# === Read input file ===
df = pd.read_csv(CSV_INPUT)

# Prepare new columns. Each starts out as an empty string so that restaurants
# which cannot be classified still produce a blank cell in the output CSV.
CLASSIFICATION_COLUMNS = [
    "Cuisine BERT",
    "Cuisine Keywords",
    "Keyword Score",
    "BERT Score",
    "Final Cuisine",
    "Certainty",
    "Final Score",
    "Google Type Cuisine",
    "Keyword Hits",
]
for column in CLASSIFICATION_COLUMNS:
    df[column] = ""

# === Request and scoring settings ===
REQUEST_TIMEOUT = 10  # seconds; requests waits indefinitely when no timeout is given
PAUSE_BETWEEN_RESTAURANTS = 1.5  # seconds between restaurants, to respect API limits
# Same geographic bias as the enrichment cell, so both cells resolve a name to
# a place around Haarlem rather than a same-named place elsewhere.
SEARCH_BIAS = {"region": "nl", "location": "52.3874,4.6462", "radius": 30000}
BERT_WEIGHT = 0.6
KEYWORD_WEIGHT = 0.4
GOOGLE_TYPE_BONUS = 0.1  # added when Google's own place type names the cuisine
HIGH_CERTAINTY = 0.75
MEDIUM_CERTAINTY = 0.5

# Compile every keyword pattern once instead of once per restaurant.
KEYWORD_PATTERNS = {
    cuisine: [
        (kw, weight, re.compile(rf"\b{re.escape(unidecode(kw.lower()))}\b"))
        for kw, weight in keywords.items()
    ]
    for cuisine, keywords in cuisine_keywords.items()
}
# Sum of all keyword weights per cuisine; used to normalise keyword scores.
KEYWORD_MAX = {
    cuisine: sum(keywords.values()) for cuisine, keywords in cuisine_keywords.items()
}


def get_json(url, params):
    """GET ``url`` with ``params`` and return the decoded JSON body.

    Raises requests.RequestException on network errors, timeouts and HTTP
    error statuses, and ValueError when the body is not valid JSON.
    """
    response = requests.get(url, params=params, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.json()


def classify_restaurant(name):
    """Look up ``name`` on Google Places and guess its cuisine.

    The editorial summary, review texts and the name itself are combined into
    one text, which is scored against every cuisine label by embedding
    similarity and by weighted keyword hits; the two scores are fused and the
    best cuisine wins.

    Returns a dict mapping output column name to value, or None when the text
    search yields no results or there is no text to classify.
    """
    search = get_json(
        SEARCH_URL,
        {"query": f"{name} restaurant Haarlem", **SEARCH_BIAS, "key": API_KEY},
    )
    results = search.get("results")
    if not results:
        return None
    # Only the first search hit is used.
    place_id = results[0]["place_id"]

    details = get_json(
        DETAILS_URL,
        {
            "place_id": place_id,
            "fields": "editorial_summary,reviews,types",
            "key": API_KEY,
        },
    ).get("result", {})

    texts = []
    if "editorial_summary" in details:
        texts.append(details["editorial_summary"].get("overview", ""))
    if "reviews" in details:
        texts += [review["text"] for review in details["reviews"] if "text" in review]

    # Lower-case and strip accents so the text matches the normalised keywords.
    combined_text = unidecode(" ".join(texts + [name]).lower().strip())
    if not combined_text:
        return None

    # === BERT Matching ===
    embedding = model.encode(combined_text, convert_to_tensor=True)
    scores = util.cos_sim(embedding, cuisine_embeddings)[0]

    # === Keyword Matching with regex ===
    match_scores = {}
    match_hits = {}
    for cuisine, patterns in KEYWORD_PATTERNS.items():
        hits = [
            (kw, weight)
            for kw, weight, pattern in patterns
            if pattern.search(combined_text)
        ]
        total_score = sum(weight for _, weight in hits)
        if total_score > 0:
            match_scores[cuisine] = total_score
            match_hits[cuisine] = [kw for kw, _ in hits]

    # === Google Type Cuisine Guess ===
    # A type such as "<word>_restaurant" counts when its capitalised first
    # word is one of the cuisine labels; the first such type wins.
    google_cuisine_guess = None
    for place_type in details.get("types", []):
        if "restaurant" in place_type and "_" in place_type:
            guess = place_type.split("_")[0].capitalize()
            if guess in CUISINES:
                google_cuisine_guess = guess
                break

    # === Score Fusion ===
    cuisine_scores = {}
    for position, cuisine in enumerate(CUISINES):
        bert_score = float(scores[position])
        max_possible = KEYWORD_MAX[cuisine]
        kw_norm = match_scores.get(cuisine, 0) / max_possible if max_possible else 0

        fusion_score = (bert_score * BERT_WEIGHT) + (kw_norm * KEYWORD_WEIGHT)
        if cuisine == google_cuisine_guess:
            fusion_score += GOOGLE_TYPE_BONUS
        cuisine_scores[cuisine] = fusion_score

    # === Final Cuisine Selection ===
    final_cuisine = max(cuisine_scores, key=cuisine_scores.get)
    final_score = round(cuisine_scores[final_cuisine], 4)
    if final_score > HIGH_CERTAINTY:
        certainty = "HIGH"
    elif final_score > MEDIUM_CERTAINTY:
        certainty = "MEDIUM"
    else:
        certainty = "LOW"

    top_bert_idx = int(scores.argmax())
    has_keywords = final_cuisine in match_scores

    return {
        "Google Type Cuisine": google_cuisine_guess if google_cuisine_guess else "None",
        # Debug info: the best cuisine by embedding similarity alone.
        "Cuisine BERT": CUISINES[top_bert_idx],
        "BERT Score": round(float(scores[top_bert_idx]), 10),
        # Keyword details are reported for the winning cuisine only.
        "Cuisine Keywords": (
            f"{final_cuisine} ({match_scores[final_cuisine]})" if has_keywords else "None"
        ),
        "Keyword Score": match_scores[final_cuisine] if has_keywords else 0,
        "Keyword Hits": ", ".join(match_hits.get(final_cuisine, [])) if has_keywords else "",
        "Final Cuisine": final_cuisine,
        "Certainty": certainty,
        "Final Score": final_score,
    }


# === Main loop ===
for idx, row in df.iterrows():
    raw_name = row.get("Name", "")
    # str(NaN) is "nan", which is truthy; test for missing values explicitly so
    # rows without a name are not sent to the API as "nan restaurant Haarlem".
    if pd.isna(raw_name):
        continue
    name = str(raw_name).strip()
    if not name:
        continue

    # Best-effort per row: a failure skips this restaurant but is reported
    # instead of being silently swallowed.
    try:
        outcome = classify_restaurant(name)
    except requests.RequestException as exc:
        # Only the exception type is printed: requests' error messages embed
        # the full request URL, whose query string contains the API key.
        print(f"⚠️ Skipping {name}: request failed ({type(exc).__name__})")
        outcome = None
    except Exception as exc:
        print(f"⚠️ Skipping {name}: {type(exc).__name__}: {exc}")
        outcome = None

    if outcome:
        for column, value in outcome.items():
            df.at[idx, column] = value

    # Pause after every attempted lookup, including failed ones.
    time.sleep(PAUSE_BETWEEN_RESTAURANTS)

# === Save results ===
# Written once, before any display attempt, so the file exists even if
# displaying the frame fails.
df.to_csv(CSV_OUTPUT, index=False)
print(f"\n✅ Saved to {CSV_OUTPUT}")

# NOTE(review): ace_tools is not imported with the rest of this cell's
# dependencies and presumably only exists in the hosted sandbox this cell was
# written in — confirm. Fall back to a plain preview when it is unavailable.
try:
    import ace_tools as tools
except ImportError:
    print(df.head())
else:
    tools.display_dataframe_to_user(name="Cuisine Classification Results", dataframe=df)