# In [6]:
import os
import re
import time

import pandas as pd
import requests
from sentence_transformers import SentenceTransformer, util
from unidecode import unidecode  # Normalize accented characters

from cuisine_keywords import cuisine_keywords  # External Python file containing the keyword dictionary

# === CONFIG ===
# The Google API key is taken from the GOOGLE_MAPS_API_KEY environment variable
# when that is set and non-empty, so a key no longer has to live in the source.
# NOTE(review): the literal fallback below is the key that was already
# committed here; it is kept only so existing runs behave the same. It should
# be rotated and the fallback deleted.
API_KEY = os.environ.get("GOOGLE_MAPS_API_KEY") or "AIzaSyBPXacrTdwVOZ8vS6nlywLkD3D49Yipz2Q"

# Google Places web-service endpoints used by the enrichment loop.
SEARCH_URL = "https://maps.googleapis.com/maps/api/place/textsearch/json"
DETAILS_URL = "https://maps.googleapis.com/maps/api/place/details/json"

# Input CSV that is read, and output CSV that is written.
CSV_INPUT = "haarlem_today_restaurants.csv"
CSV_OUTPUT = "haarlem_today_fully_enriched.csv"

# === Load model and cuisine labels ===
print("Loading BERT model...")
model = SentenceTransformer("all-mpnet-base-v2")
# One label per key of the keyword dictionary. The position of a label in this
# list is also its position in the label embeddings encoded just below.
CUISINES = [*cuisine_keywords]
cuisine_embeddings = model.encode(CUISINES, convert_to_tensor=True)

# === Read input file ===
print(f"Reading input file: {CSV_INPUT}")
df = pd.read_csv(CSV_INPUT)

# Create every output column up front, filled with empty strings, so each row
# has a value in them even if it is later skipped. Columns are assigned in the
# order listed: classification results first, Google data second.
classification_columns = (
    "Cuisine BERT",
    "Cuisine Keywords",
    "Keyword Score",
    "BERT Score",
    "Final Cuisine",
    "Certainty",
    "Final Score",
    "Google Type Cuisine",
    "Keyword Hits",
)
google_columns = (
    "Google Name",
    "Rating",
    "Total Ratings",
    "Google Address",
    "Place ID",
    "Types",
    "Price Level",
    "Google Website",
    "Phone Number",
    "Opening Hours",
)
for column in classification_columns + google_columns:
    df[column] = ""

def map_price_level(level):
    """Translate a numeric price level into a run of dollar signs.

    Levels 1 through 4 become "$" through "$$$$". Any other value,
    including ``None`` and 0, yields an empty string.
    """
    symbols = {tier: "$" * tier for tier in range(1, 5)}
    return symbols.get(level, "")

# === Main loop ===
# For every row with a usable "Name": search Google Places for the restaurant,
# fetch its details, then choose a cuisine by fusing embedding similarity,
# weighted keyword hits and Google's own place types. Results are written back
# into df in place; a failure on one row is reported and the loop moves on.

# Seconds to wait for a Google response. Without a timeout a stalled
# connection would block the whole run indefinitely.
REQUEST_TIMEOUT = 15

# Loop-invariant keyword data, built once here instead of once per row:
# cuisine -> [(original keyword, weight, compiled whole-word pattern), ...]
keyword_patterns = {
    cuisine: [
        (kw, weight, re.compile(rf"\b{re.escape(unidecode(kw.lower()))}\b"))
        for kw, weight in keywords.items()
    ]
    for cuisine, keywords in cuisine_keywords.items()
}
# cuisine -> sum of all its keyword weights, used to normalise keyword scores.
max_keyword_scores = {
    cuisine: sum(keywords.values())
    for cuisine, keywords in cuisine_keywords.items()
}

print("Starting data enrichment process...")
for idx, row in df.iterrows():
    raw_name = row.get("Name")
    # Test for a missing value directly rather than comparing the text with
    # "nan", which would also skip a restaurant that is literally named "Nan".
    if pd.isna(raw_name):
        continue
    name = str(raw_name).strip()
    if not name:
        continue

    print(f"Processing {idx+1}/{len(df)}: {name}")

    query = f"{name} restaurant Haarlem"
    search_params = {
        "query": query,
        "region": "nl",
        "location": "52.3874,4.6462",
        "radius": 30000,
        "key": API_KEY
    }

    try:
        # === STEP 1: Get basic place data ===
        res = requests.get(SEARCH_URL, params=search_params, timeout=REQUEST_TIMEOUT)
        data = res.json()
        if not data.get("results"):
            print(f"❌ No results for: {query}")
            print(f"↪️ Response: {data.get('status')} {data.get('error_message', '')}")
            continue

        # Only the first search result is used.
        place = data["results"][0]
        place_id = place["place_id"]

        # Store basic Google data
        df.at[idx, "Google Name"] = place.get("name", "")
        df.at[idx, "Rating"] = place.get("rating", "")
        df.at[idx, "Total Ratings"] = place.get("user_ratings_total", "")
        df.at[idx, "Google Address"] = place.get("formatted_address", "")
        df.at[idx, "Place ID"] = place_id
        df.at[idx, "Types"] = ", ".join(place.get("types", []))

        # === STEP 2: Get detailed place data ===
        details_params = {
            "place_id": place_id,
            "fields": "editorial_summary,reviews,types,price_level,website,formatted_phone_number,opening_hours",
            "key": API_KEY
        }

        res = requests.get(DETAILS_URL, params=details_params, timeout=REQUEST_TIMEOUT)
        details = res.json().get("result", {})

        # Store additional Google data
        raw_price = details.get("price_level")
        df.at[idx, "Price Level"] = map_price_level(raw_price)
        df.at[idx, "Google Website"] = details.get("website", "")
        df.at[idx, "Phone Number"] = details.get("formatted_phone_number", "")
        if details.get("opening_hours"):
            df.at[idx, "Opening Hours"] = ", ".join(details["opening_hours"].get("weekday_text", []))

        # === STEP 3: Cuisine Classification ===
        # The text to classify is the editorial overview, the review texts
        # and the restaurant name, lower-cased and with accents stripped.
        texts = []
        if "editorial_summary" in details:
            texts.append(details["editorial_summary"].get("overview", ""))
        if "reviews" in details:
            texts += [r.get("text", "") for r in details["reviews"] if "text" in r]

        combined_text = " ".join(texts + [name]).lower().strip()
        combined_text = unidecode(combined_text)  # Normalize accents

        if not combined_text:
            continue

        # BERT Matching: cosine similarity of the text against every label.
        embedding = model.encode(combined_text, convert_to_tensor=True)
        scores = util.cos_sim(embedding, cuisine_embeddings)[0]

        # Keyword Matching with the precompiled whole-word patterns. Only
        # cuisines with a positive total are recorded.
        match_scores = {}
        match_hits = {}

        for cuisine, entries in keyword_patterns.items():
            total_score = 0
            hits = []
            for kw, weight, pattern in entries:
                if pattern.search(combined_text):
                    total_score += weight
                    hits.append(kw)
            if total_score > 0:
                match_scores[cuisine] = total_score
                match_hits[cuisine] = hits

        # Google Type Cuisine Guess: take the part before the first "_" of a
        # type such as "<something>_restaurant" and accept it only if,
        # capitalised, it is one of the known cuisine labels.
        google_types = details.get("types", [])
        google_cuisine_guess = None
        for t in google_types:
            if "restaurant" in t and "_" in t:
                guess = t.split("_")[0].capitalize()
                if guess in cuisine_keywords:
                    google_cuisine_guess = guess
                    break
        df.at[idx, "Google Type Cuisine"] = google_cuisine_guess if google_cuisine_guess else "None"

        # Score Fusion: 60% embedding similarity, 40% normalised keyword
        # score, plus a flat 0.1 bonus when Google's types agree.
        cuisine_scores = {}
        for bert_idx, cuisine in enumerate(CUISINES):
            bert_score = float(scores[bert_idx])

            kw_score = match_scores.get(cuisine, 0)
            max_possible = max_keyword_scores[cuisine]
            kw_norm = kw_score / max_possible if max_possible else 0

            fusion_score = (bert_score * 0.6) + (kw_norm * 0.4)
            if cuisine == google_cuisine_guess:
                fusion_score += 0.1
            cuisine_scores[cuisine] = fusion_score

        # Final Cuisine Selection
        final_cuisine = max(cuisine_scores, key=cuisine_scores.get)
        final_score = round(cuisine_scores[final_cuisine], 4)
        certainty = "HIGH" if final_score > 0.75 else "MEDIUM" if final_score > 0.5 else "LOW"

        # Save debug info: the best label by embedding similarity alone,
        # which may differ from the fused final cuisine.
        top_bert_idx = int(scores.argmax())
        df.at[idx, "Cuisine BERT"] = CUISINES[top_bert_idx]
        df.at[idx, "BERT Score"] = round(float(scores[top_bert_idx]), 10)

        if final_cuisine in match_scores:
            df.at[idx, "Cuisine Keywords"] = f"{final_cuisine} ({match_scores[final_cuisine]})"
            df.at[idx, "Keyword Score"] = match_scores[final_cuisine]
            df.at[idx, "Keyword Hits"] = ", ".join(match_hits.get(final_cuisine, []))
        else:
            df.at[idx, "Cuisine Keywords"] = "None"
            df.at[idx, "Keyword Score"] = 0
            df.at[idx, "Keyword Hits"] = ""

        df.at[idx, "Final Cuisine"] = final_cuisine
        df.at[idx, "Certainty"] = certainty
        df.at[idx, "Final Score"] = final_score

        print(f"✅ {name} → Rating: {place.get('rating', 'N/A')}, Cuisine: {final_cuisine} ({certainty})")

    except Exception as e:
        # Per-row boundary: report the failure and carry on with the next row
        # so one bad response does not abort the whole run.
        print(f"❌ Error processing {name}: {str(e)}")
    finally:
        # Be kind to Google's API rate limits. This sits in `finally` so the
        # pause also happens after a failed or empty lookup; previously those
        # paths skipped it and the next request fired immediately.
        time.sleep(2)

# === Save results ===
# Write the DataFrame to CSV_OUTPUT; index=False leaves the DataFrame index
# out of the file.
print(f"Saving enriched data to {CSV_OUTPUT}")
df.to_csv(CSV_OUTPUT, index=False)
print(f"\n✅ Saved to {CSV_OUTPUT}")

# Optional display step: show the DataFrame through ace_tools when that module
# can be imported; on ImportError print a note and skip the display.
try:
    import ace_tools as tools
    tools.display_dataframe_to_user(name="Restaurant Enrichment Results", dataframe=df)
except ImportError:
    print("Note: ace_tools not available, skipping dataframe display")

# SyntaxError: invalid syntax. Perhaps you forgot a comma? (cuisine_keywords.py, line 362)