In [1]:
import os


def load_api_keys(env_var="GOOGLE_PLACES_API_KEYS", environ=None):
    """Read a comma-separated list of API keys from an environment variable.

    Args:
        env_var: Name of the variable holding the keys, e.g. "key1,key2,key3".
        environ: Mapping to read from; defaults to ``os.environ``.

    Returns:
        List of non-empty, whitespace-stripped keys (empty if the variable
        is unset or blank).
    """
    source = os.environ if environ is None else environ
    raw = source.get(env_var, "") or ""
    return [part.strip() for part in raw.split(",") if part.strip()]


# SECURITY: keys must never be committed in the notebook. Any key that was
# previously hardcoded here should be treated as leaked and rotated.
# Set e.g.  export GOOGLE_PLACES_API_KEYS="key1,key2"  before starting the kernel.
API_KEYS = load_api_keys()


In [None]:
# -------------------- CONFIG --------------------

# Input CSV read by build_dict_from_csv: first column is the location; columns whose
# name starts with "activity_" hold the activities (fallback: columns 2-6).
CSV_PATH = "destination_activities_generalised.csv"  # your CSV with: location, activity_1..activity_5

TOP_N_ACTIVITIES        = None   # None = keep every non-empty activity per location; or set e.g. 3 to keep the first 3
TARGET_REVIEWS_PER_QUERY = 100   # a query is marked done once it has collected at least this many reviews
MAX_REVIEWS_PER_QUERY    = 200   # hard cap per query; reviews from a place are truncated to stay under it

PAGE_SIZE               = 20     # sent as "pageSize" in the text search body (max allowed by API v1)
NEXT_PAGE_DELAY_SEC     = 2.0    # pause before a search request that uses a stored nextPageToken
REQUESTS_PER_SECOND     = 2.0    # _throttle() spaces HTTP calls 1/REQUESTS_PER_SECOND seconds apart
DETAILS_SLEEP_SEC       = 0.15   # extra pause after each place-details lookup
MAX_RETRIES             = 4      # retry budget of the HTTP helpers for 429 / 5xx responses

# -------------------- IMPORTS --------------------
import time, json, math, random, requests
import pandas as pd
from typing import List, Dict, Any, Optional, Tuple

# -------------------- BUILD QUERIES --------------------
def build_dict_from_csv(csv_path: str, top_n: Optional[int]) -> Dict[str, List[str]]:
    """Load the destination CSV and map each location to its list of activities.

    The first column is taken as the location. Activity columns are those whose
    name starts with "activity_" (case-insensitive); if none match, columns 2-6
    are used instead.

    Args:
        csv_path: Path of the CSV file to read.
        top_n: Keep only the first ``top_n`` non-empty activities per location;
            ``None`` keeps all of them.

    Returns:
        Dict of location -> list of stripped, non-empty activity strings.
        Rows with a missing or blank location are skipped. If a location
        appears on several rows, the last row wins.

    Raises:
        ValueError: If the CSV has fewer than 6 columns.
    """
    df = pd.read_csv(csv_path)
    cols = list(df.columns)
    if len(cols) < 6:
        raise ValueError("CSV must have at least 6 columns: location + 5 activities.")
    location_col = cols[0]
    activity_cols = [c for c in cols if c.lower().startswith("activity_")] or cols[1:6]

    acts_by_loc: Dict[str, List[str]] = {}
    for _, row in df.iterrows():
        raw_loc = row[location_col]
        # A missing location would otherwise be stringified to "nan" and
        # generate bogus queries such as "museum in nan".
        if pd.isna(raw_loc):
            continue
        loc = str(raw_loc).strip()
        if not loc:
            continue

        acts: List[str] = []
        for c in activity_cols:
            v = row.get(c, None)
            if pd.notna(v):
                s = str(v).strip()
                if s:
                    acts.append(s)
        if top_n is not None:
            acts = acts[:top_n]
        acts_by_loc[loc] = acts
    return acts_by_loc

def build_queries_from_dict(acts_by_loc: Dict[str, List[str]]) -> List[str]:
    """Expand a location -> activities mapping into "<activity> in <location>" search strings.

    Queries are emitted in mapping order, and within each location in the
    order the activities are listed.
    """
    queries: List[str] = []
    for place, activities in acts_by_loc.items():
        queries.extend(f"{activity} in {place}" for activity in activities)
    return queries

# -------------------- KEY ROTATION & THROTTLE --------------------
class KeyRotator:
    """Hands out API keys in round-robin order.

    Attributes are kept public: ``keys`` is the list passed in (stored by
    reference, not copied) and ``i`` is the index of the key that the next
    call to ``next()`` will return.
    """

    def __init__(self, keys: List[str]):
        if not keys:
            raise ValueError("Provide at least one API key.")
        self.keys = keys
        self.i = 0

    def next(self) -> str:
        """Return the current key and advance the cursor, wrapping at the end."""
        current, self.i = self.keys[self.i], (self.i + 1) % len(self.keys)
        return current

# Shared module-level rotator; each rotator.next() call hands out the next key from API_KEYS.
rotator = KeyRotator(API_KEYS)
# Timestamp written by _throttle() after its most recent wait (0.0 until the first call).
_last_request_ts = 0.0

def _throttle():
    """Sleep as needed so calls are at least 1/REQUESTS_PER_SECOND seconds apart.

    REQUESTS_PER_SECOND is floored at 0.1, so the spacing never exceeds 10 s.
    Updates the module-level ``_last_request_ts`` with the time at which the
    caller is released.
    """
    global _last_request_ts
    interval = 1.0 / max(0.1, REQUESTS_PER_SECOND)
    # Use the monotonic clock: wall-clock time (time.time) can jump when the
    # system clock is adjusted, which would either skip the wait or stall it.
    wait = _last_request_ts + interval - time.monotonic()
    if wait > 0:
        time.sleep(wait)
    _last_request_ts = time.monotonic()

# -------------------- API v1 CALLS (your working pattern) --------------------
# Text search endpoint; called with POST by places_search_text_v1.
SEARCH_URL  = "https://places.googleapis.com/v1/places:searchText"
# Place details endpoint template; {place_id} is filled in by place_details_v1 and fetched with GET.
DETAILS_URL = "https://places.googleapis.com/v1/places/{place_id}"

# HTTP statuses treated as transient and retried with exponential backoff.
_RETRYABLE_STATUS = (429, 500, 502, 503, 504)


def _request_json(method: str, url: str, headers: Dict[str,str],
                  json_body: Optional[Dict[str,Any]] = None) -> Dict[str,Any]:
    """Send one throttled HTTP request and return the decoded JSON body.

    Makes up to MAX_RETRIES + 1 attempts. A 429/5xx response is retried after
    a backoff that starts at 1 s and doubles up to 20 s. Any other non-200
    response fails immediately.

    Raises:
        RuntimeError: On a non-retryable status, or when the final attempt
            still does not return 200. An error body is never returned to
            the caller as if it were a successful result.
    """
    delay = 1.0
    total_attempts = max(1, MAX_RETRIES + 1)
    for attempt in range(1, total_attempts + 1):
        _throttle()
        r = requests.request(method, url, headers=headers, json=json_body, timeout=30)
        if r.status_code == 200:
            return r.json()

        # Prefer the structured error body for the message; fall back to raw text.
        try:
            msg = r.json()
        except ValueError:
            msg = r.text

        if r.status_code not in _RETRYABLE_STATUS:
            raise RuntimeError(f"{method} {url} failed ({r.status_code}): {msg}")
        if attempt == total_attempts:
            raise RuntimeError(f"{method} {url} failed after retries: {r.status_code}: {msg}")

        time.sleep(delay)
        delay = min(delay * 2, 20)

    # Unreachable: the loop always returns or raises on its last iteration.
    raise RuntimeError(f"{method} {url} failed after retries")


def _post_json(url: str, headers: Dict[str,str], json_body: Dict[str,Any]) -> Dict[str,Any]:
    """POST ``json_body`` to ``url`` with retry/backoff on 429 / 5xx; return the JSON response."""
    return _request_json("POST", url, headers, json_body)


def _get_json(url: str, headers: Dict[str,str]) -> Dict[str,Any]:
    """GET ``url`` with retry/backoff on 429 / 5xx; return the JSON response."""
    return _request_json("GET", url, headers)

def places_search_text_v1(query: str, key: str, page_token: Optional[str] = None) -> Dict[str,Any]:
    """Run one text search request and return the decoded response.

    Args:
        query: Free-text query sent as "textQuery".
        key: API key sent in the "X-Goog-Api-Key" header.
        page_token: Token from a previous response; only added to the body
            when truthy.

    Returns:
        Whatever ``_post_json`` returns for the search endpoint.
    """
    # nextPageToken has to be part of the field mask to be returned.
    field_mask = ",".join((
        "places.id",
        "places.displayName",
        "places.formattedAddress",
        "nextPageToken",
    ))
    request_headers = {
        "Content-Type": "application/json",
        "X-Goog-Api-Key": key,
        "X-Goog-FieldMask": field_mask,
    }
    payload: Dict[str,Any] = {"textQuery": query, "pageSize": PAGE_SIZE}
    if page_token:
        payload["pageToken"] = page_token
    return _post_json(SEARCH_URL, request_headers, payload)

def place_details_v1(place_id: str, key: str) -> Dict[str,Any]:
    """
    Fetch the details of one place, with a field mask that names each review subfield.

    Args:
        place_id: Identifier substituted into DETAILS_URL.
        key: API key sent in the "X-Goog-Api-Key" header.

    Returns:
        Whatever ``_get_json`` returns for the details endpoint.
    """
    # Request only what the notebook uses; nested review fields are listed one by one.
    requested_fields = (
        "id", "displayName", "formattedAddress", "rating", "userRatingCount",
        "reviews.rating",
        "reviews.text",                            # {text, languageCode}
        "reviews.originalText",                    # {text, languageCode}
        "reviews.authorAttribution.displayName",
        "reviews.publishTime",
        "reviews.relativePublishTimeDescription",
    )
    request_headers = {
        "Content-Type": "application/json",
        "X-Goog-Api-Key": key,
        "X-Goog-FieldMask": ",".join(requested_fields),
    }
    return _get_json(DETAILS_URL.format(place_id=place_id), request_headers)

# -------------------- RESUMABLE IN-MEMORY STATE --------------------
def init_state_new(queries: List[str], prev_state: Optional[Dict[str,Any]] = None) -> Dict[str,Any]:
    """Return a progress state that has an entry for every query in ``queries``.

    With a truthy ``prev_state`` the result is a shallow copy of it, so the
    nested "queries" dict and "completed" list are shared with the caller's
    object. Otherwise a fresh, empty state is created. Queries that already
    have an entry keep it untouched.
    """
    if prev_state:
        state = prev_state.copy()
    else:
        state = {"queries": {}, "completed": []}

    per_query = state["queries"]
    for query in queries:
        if query in per_query:
            continue
        per_query[query] = {
            "nextPageToken": None,
            "places_done": {},          # place_id -> True
            "reviews_collected": 0,
            "done": False
        }
    return state

# -------------------- ONE QUERY (one page per pass) --------------------
def _mark_query_done(query: str, state: Dict[str,Any]) -> None:
    """Flag ``query`` as done and record it once in ``state["completed"]``."""
    state["queries"][query]["done"] = True
    if query not in state["completed"]:
        state["completed"].append(query)


def _review_text_and_language(review: Dict[str,Any]) -> Tuple[Any, Optional[str]]:
    """Return ``(text, language_code)`` for one review dict.

    "text" may be a dict ({text, languageCode}) or a plain value. When it
    yields no text, the text of "originalText" is used instead. The language
    from "text" is preferred over the one from "originalText". String text
    has newlines replaced by spaces and is stripped.
    """
    txt_obj = review.get("text")
    if isinstance(txt_obj, dict):
        review_text = (txt_obj.get("text") or "")
        lang_from_text = txt_obj.get("languageCode")
    else:
        review_text = (txt_obj or "")
        lang_from_text = None

    lang_from_orig = None
    orig_obj = review.get("originalText")
    if isinstance(orig_obj, dict):
        lang_from_orig = orig_obj.get("languageCode")
        if not review_text:
            review_text = (orig_obj.get("text") or "")

    if isinstance(review_text, str):
        review_text = review_text.replace("\n", " ").strip()
    return review_text, (lang_from_text or lang_from_orig)


def run_one_query_new(query: str, state: Dict[str,Any], verbose: bool = True) -> List[Dict[str,Any]]:
    """Process one search-results page for ``query`` and return its review rows.

    Fetches a single page of places (continuing from the stored
    nextPageToken, if any), looks up details for each place not yet
    processed, and turns the reviews into flat row dicts.

    Progress (``places_done``, ``reviews_collected``, ``nextPageToken``,
    ``done``) is written to ``state`` only after the whole page has been
    processed. If a request raises part-way through, ``state`` is left
    exactly as it was, so a later run repeats this page instead of skipping
    places whose rows were never returned.

    Args:
        query: Key into ``state["queries"]`` and the text sent to the search.
        state: Progress dict as produced by ``init_state_new``; updated in place.
        verbose: Print one progress line per pass.

    Returns:
        List of row dicts (one per review) collected during this pass.
    """
    q = state["queries"][query]
    rows: List[Dict[str,Any]] = []

    if q["done"] or q["reviews_collected"] >= MAX_REVIEWS_PER_QUERY:
        _mark_query_done(query, state)
        if verbose: print(f"[{query}] already done (collected={q['reviews_collected']}).")
        return rows

    if q["nextPageToken"]:
        time.sleep(NEXT_PAGE_DELAY_SEC)

    resp = places_search_text_v1(query, rotator.next(), page_token=q["nextPageToken"])
    places = resp.get("places", []) or []
    next_token = resp.get("nextPageToken")

    if verbose:
        print(f"[{query}] places={len(places)}, nextPageToken={bool(next_token)}")

    # Optional priority: most-rated first.
    # NOTE(review): this only reorders anything if the search response carries
    # userRatingCount - confirm the search field mask actually requests it.
    places.sort(key=lambda p: p.get("userRatingCount", 0), reverse=True)

    # Work on local copies; they are committed to `q` after the loop.
    collected = q["reviews_collected"]
    newly_done: List[str] = []
    finished = False

    for p in places:
        if collected >= MAX_REVIEWS_PER_QUERY:
            finished = True
            break

        pid = p.get("id")
        if not pid or q["places_done"].get(pid) or pid in newly_done:
            continue

        det = place_details_v1(pid, rotator.next())
        reviews = det.get("reviews", []) or []

        name = (det.get("displayName") or {}).get("text", p.get("displayName", {}).get("text",""))
        addr = det.get("formattedAddress", p.get("formattedAddress",""))
        rating = det.get("rating")
        count  = det.get("userRatingCount")

        # Never exceed the hard cap, even if this place has more reviews.
        remain = MAX_REVIEWS_PER_QUERY - collected
        to_take = reviews[:remain]

        for r in to_take:
            review_text, review_language = _review_text_and_language(r)
            rows.append({
                "query": query,
                "place_id": pid,
                "place_name": name,
                "formatted_address": addr,
                "rating": rating,
                "user_ratings_total": count,
                "review_author_name": ((r.get("authorAttribution") or {}).get("displayName")),
                "review_rating": r.get("rating"),
                "review_relative_time": r.get("relativePublishTimeDescription"),
                "review_text": review_text,
                "review_language": review_language,
                "publishTime": r.get("publishTime"),
            })

        collected += len(to_take)
        newly_done.append(pid)

        time.sleep(DETAILS_SLEEP_SEC)

        if collected >= TARGET_REVIEWS_PER_QUERY:
            finished = True
            break

    # Commit progress only now that the page was processed without error.
    for pid in newly_done:
        q["places_done"][pid] = True
    q["reviews_collected"] = collected
    q["nextPageToken"] = next_token

    # No further page means there is nothing left to fetch for this query.
    if finished or not next_token:
        _mark_query_done(query, state)

    if verbose and not rows:
        print(f"[{query}] no reviews appended this pass.")
    return rows

# -------------------- BATCH RUNNER --------------------
def run_all_queries_new(
    prev_state: Optional[Dict[str,Any]] = None,
    max_queries_per_run: Optional[int] = 50
) -> Tuple[pd.DataFrame, Dict[str,Any]]:
    """Run one pass over the pending queries and return the new rows plus state.

    Queries are built from CSV_PATH / TOP_N_ACTIVITIES. Only queries not yet
    marked done are processed, limited to the first ``max_queries_per_run``
    (``None`` = no limit). Each processed query advances by one results page.

    If a query raises, or the kernel is interrupted, the loop stops, the
    error is printed, and the rows collected so far are still returned
    together with the state, so a long paid run is not lost to one failure.

    Args:
        prev_state: State returned by an earlier call, to resume from it.
        max_queries_per_run: Upper bound on queries handled in this call.

    Returns:
        ``(df_batch, state)``: the reviews gathered in this call and the
        updated progress state.
    """
    acts_by_loc = build_dict_from_csv(CSV_PATH, TOP_N_ACTIVITIES)
    queries = build_queries_from_dict(acts_by_loc)
    state = init_state_new(queries, prev_state)

    pending = [q for q in queries if not state["queries"][q]["done"]]
    if max_queries_per_run is not None:
        pending = pending[:max_queries_per_run]

    new_rows: List[Dict[str,Any]] = []
    for idx, q in enumerate(pending, 1):
        print(f"[{idx}/{len(pending)}] {q}")
        try:
            new_rows.extend(run_one_query_new(q, state, verbose=True))
        except (Exception, KeyboardInterrupt) as exc:
            # Stop here but keep what was already collected; re-run with the
            # returned state to continue.
            print(
                f"[{q}] run stopped early ({type(exc).__name__}: {exc}); "
                f"returning {len(new_rows)} rows collected so far."
            )
            break

    df_batch = pd.DataFrame(new_rows, columns=[
        "query","place_id","place_name","formatted_address",
        "rating","user_ratings_total","review_author_name","review_rating",
        "review_relative_time","review_text","review_language","publishTime"
    ])
    return df_batch, state

#-------------- PROGRESS ----------------------

def summarize_progress(state: dict, target: int = 100) -> pd.DataFrame:
    """
    Build a DataFrame showing progress for every query in `state`.

    Assumes query format: "<activity> in <location>". The query is split on
    its LAST " in ", so an activity containing " in " stays intact; queries
    without " in " get None for both activity and location.

    Args:
        state: Progress dict with a "queries" mapping (missing -> treated as empty).
        target: Review count used to compute "remaining_to_target".

    Returns:
        One row per query, sorted with unfinished queries first, then by
        fewest reviews remaining, then by most reviews collected. An empty
        state yields an empty frame that still has all columns.
    """
    columns = [
        "query", "location", "activity", "reviews_collected", "done",
        "has_next_page", "places_processed", "remaining_to_target",
    ]
    rows = []
    qmap = state.get("queries", {})
    for q, s in qmap.items():
        activity, location = None, None
        if " in " in q:
            activity, location = q.rsplit(" in ", 1)
        collected = s.get("reviews_collected", 0)
        rows.append({
            "query": q,
            "location": location,
            "activity": activity,
            "reviews_collected": collected,
            "done": bool(s.get("done", False)),
            "has_next_page": bool(s.get("nextPageToken")),
            "places_processed": len(s.get("places_done", {})),
            "remaining_to_target": max(0, target - int(collected)),
        })
    # Explicit columns keep sort_values from raising KeyError when there are no rows.
    df = pd.DataFrame(rows, columns=columns).sort_values(
        ["done", "remaining_to_target", "reviews_collected"],
        ascending=[True, True, False]
    ).reset_index(drop=True)

    return df


In [11]:
# First pass: start from a fresh state (prev_state=None) and process at most 50 pending queries.
df_batch, state = run_all_queries_new(prev_state=None, max_queries_per_run=50)
# Preview the first rows of the collected reviews.
df_batch.head()

[1/50] immersive experience in London
[immersive experience in London] places=20, nextPageToken=True
[2/50] theatre in London
[theatre in London] places=20, nextPageToken=True
[3/50] boat cruise in London
[boat cruise in London] places=20, nextPageToken=True
[4/50] museum in London
[museum in London] places=20, nextPageToken=True
[5/50] market in London
[market in London] places=20, nextPageToken=True
[6/50] boat cruise in Paris
[boat cruise in Paris] places=20, nextPageToken=True
[7/50] landmark in Paris
[landmark in Paris] places=20, nextPageToken=True
[8/50] museum in Paris
[museum in Paris] places=20, nextPageToken=True
[9/50] neighborhood in Paris
[neighborhood in Paris] places=2, nextPageToken=False
[neighborhood in Paris] no reviews appended this pass.
[10/50] park/garden in Paris
[park/garden in Paris] places=20, nextPageToken=True
[11/50] architecture in Barcelona
[architecture in Barcelona] places=20, nextPageToken=True
[12/50] water sport in Barcelona
[water sport in Barcelo

Unnamed: 0,query,place_id,place_name,formatted_address,rating,user_ratings_total,review_author_name,review_rating,review_relative_time,review_text,review_language,publishTime
0,immersive experience in London,ChIJW4IlshQDdkgR7CWekBzGqJI,The War of The Worlds: The Immersive Experience,"56 Leadenhall St, London EC3A 2BJ, UK",4.8,3088,paula-lou Salkeld (Pezzy),4,3 weeks ago,As massive war of the worlds fans we weren't s...,en,2025-08-04T12:00:55.142807609Z
1,immersive experience in London,ChIJW4IlshQDdkgR7CWekBzGqJI,The War of The Worlds: The Immersive Experience,"56 Leadenhall St, London EC3A 2BJ, UK",4.8,3088,Sarah Harleyquinn,5,4 months ago,Having always liked War Of the Worlds my partn...,en,2025-04-21T08:53:41.700071Z
2,immersive experience in London,ChIJW4IlshQDdkgR7CWekBzGqJI,The War of The Worlds: The Immersive Experience,"56 Leadenhall St, London EC3A 2BJ, UK",4.8,3088,Hannah Wild,5,4 months ago,"Wow, what a fun experience! I didn't know any...",en,2025-04-21T07:27:28.218635Z
3,immersive experience in London,ChIJW4IlshQDdkgR7CWekBzGqJI,The War of The Worlds: The Immersive Experience,"56 Leadenhall St, London EC3A 2BJ, UK",4.8,3088,Tim Oldland,5,a week ago,Absolutely fantastic experience! I’ve been a f...,en,2025-08-16T07:57:11.543664413Z
4,immersive experience in London,ChIJW4IlshQDdkgR7CWekBzGqJI,The War of The Worlds: The Immersive Experience,"56 Leadenhall St, London EC3A 2BJ, UK",4.8,3088,Velda Mazzara,5,5 months ago,"The experience was really good, could not faul...",en,2025-03-19T09:53:41.343420Z


In [7]:
# Show the first 20 rows of the batch.
df_batch.head(20)

Unnamed: 0,query,place_id,place_name,formatted_address,rating,user_ratings_total,review_author_name,review_rating,review_relative_time,review_text,review_language,publishTime
0,immersive experience in London,ChIJW4IlshQDdkgR7CWekBzGqJI,The War of The Worlds: The Immersive Experience,"56 Leadenhall St, London EC3A 2BJ, UK",4.8,3088,paula-lou Salkeld (Pezzy),4,3 weeks ago,As massive war of the worlds fans we weren't s...,en,2025-08-04T12:00:55.142807609Z
1,immersive experience in London,ChIJW4IlshQDdkgR7CWekBzGqJI,The War of The Worlds: The Immersive Experience,"56 Leadenhall St, London EC3A 2BJ, UK",4.8,3088,Sarah Harleyquinn,5,4 months ago,Having always liked War Of the Worlds my partn...,en,2025-04-21T08:53:41.700071Z
2,immersive experience in London,ChIJW4IlshQDdkgR7CWekBzGqJI,The War of The Worlds: The Immersive Experience,"56 Leadenhall St, London EC3A 2BJ, UK",4.8,3088,Hannah Wild,5,4 months ago,"Wow, what a fun experience! I didn't know any...",en,2025-04-21T07:27:28.218635Z
3,immersive experience in London,ChIJW4IlshQDdkgR7CWekBzGqJI,The War of The Worlds: The Immersive Experience,"56 Leadenhall St, London EC3A 2BJ, UK",4.8,3088,Tim Oldland,5,a week ago,Absolutely fantastic experience! I’ve been a f...,en,2025-08-16T07:57:11.543664413Z
4,immersive experience in London,ChIJW4IlshQDdkgR7CWekBzGqJI,The War of The Worlds: The Immersive Experience,"56 Leadenhall St, London EC3A 2BJ, UK",4.8,3088,Velda Mazzara,5,5 months ago,"The experience was really good, could not faul...",en,2025-03-19T09:53:41.343420Z
5,immersive experience in London,ChIJW0sjYeEFdkgRdwCty-McgJs,"Immersive Gamebox - Southbank, London","Arch 3, 83 Scoresby St, London SE1 0XN, UK",4.9,4532,Jez Neeks,5,a week ago,"This is good fun for couples, families, friend...",en,2025-08-17T18:07:52.134500844Z
6,immersive experience in London,ChIJW0sjYeEFdkgRdwCty-McgJs,"Immersive Gamebox - Southbank, London","Arch 3, 83 Scoresby St, London SE1 0XN, UK",4.9,4532,Helena Hughes,5,a week ago,Such a fun experience! Sam was so kind and hel...,en,2025-08-19T18:14:56.705732603Z
7,immersive experience in London,ChIJW0sjYeEFdkgRdwCty-McgJs,"Immersive Gamebox - Southbank, London","Arch 3, 83 Scoresby St, London SE1 0XN, UK",4.9,4532,Holly Berndsen,5,2 months ago,SO MUCH FUN! I booked this for my boyfriend’s ...,en,2025-06-27T02:42:24.617526914Z
8,immersive experience in London,ChIJW0sjYeEFdkgRdwCty-McgJs,"Immersive Gamebox - Southbank, London","Arch 3, 83 Scoresby St, London SE1 0XN, UK",4.9,4532,Jade Bonifacio,5,a month ago,"Pricey, but the kids really, really enjoyed th...",en,2025-07-23T17:17:23.086481040Z
9,immersive experience in London,ChIJW0sjYeEFdkgRdwCty-McgJs,"Immersive Gamebox - Southbank, London","Arch 3, 83 Scoresby St, London SE1 0XN, UK",4.9,4532,Harmz B,5,6 months ago,We visited Immersive Gamebox in Southbank for ...,en,2025-02-03T02:48:22.659003Z


In [9]:
# Save the batch next to the notebook. A relative path keeps the notebook
# runnable on other machines, and the .csv extension matches the file format.
# index=False drops the positional row index, which carries no information.
df_batch.to_csv("London_immersive.csv", index=False)

In [15]:
# Number of review rows in the batch.
len(df_batch)

3720

In [None]:
# Resume from the previous state: queries already marked done are skipped and
# the remaining ones advance by one more results page.
df_batch2, state = run_all_queries_new(prev_state=state, max_queries_per_run=200)

# `print_progress_summary` is not defined in this notebook; the progress helper
# defined above is `summarize_progress`, which returns a DataFrame that the
# notebook renders as the cell output.
summarize_progress(state, target=TARGET_REVIEWS_PER_QUERY)