In [2]:
import requests
import pandas as pd
import json
import time
import random
from datetime import datetime

# 設定輸入 CSV 檔案名稱
INPUT_CSV = "user_list.csv"
KEI_CSV = "KEI_list.csv"
OUTPUT_CSV_REVIEWS = "user_reviews.csv"
OUTPUT_CSV_LOCATIONS = "user_locations.csv"

# 讀取 CSV 取得 user_id
df_users = pd.read_csv(INPUT_CSV) 
user_ids = df_users['user_id'].tolist()

# 讀取 KEI 
df_kei = pd.read_csv(KEI_CSV)
kei_list = df_kei['kei'].tolist()

# 設定請求次數上限
MAX_PAGE_REQUESTS = 5

# 儲存結果的 DataFrame
reviews_data = []
locations_data = {}

# 爬取所有 user_id
for user_id in user_ids:
    kei = random.choice(kei_list)  # 每位 user_id 開始時隨機選擇 KEI
    token = ""  # 第一個請求 token 為空
    request_count = 0

    while request_count < MAX_PAGE_REQUESTS:
        # 建立請求 URL
        url = (f"https://www.google.com/locationhistory/preview/mas?authuser=0&hl=zh-TW&gl=tw&pb="
               f"!1s{user_id}!2m3!1s{kei}!7e81!15i14416!6m2!4b1!7b1!9m0!17m28!1m6!1m2!1i0!2i0!2m2!1i530!2i306"
               f"!1m6!1m2!1i1870!2i0!2m2!1i1920!2i306!1m6!1m2!1i0!2i0!2m2!1i1920!2i20!1m6!1m2!1i0!2i286!2m2!1i1920"
               f"!2i306!18m9!1m3!1d46090.09471573684!2d121.5320757!3d25.0371489!2m0!3m2!1i1920!2i306!4f13.1!41m15!1i20!"
               f"2m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!3s{token}!7m2!1m1!1e1")

        response = requests.get(url)
        if response.status_code != 200:
            print(f"[ERROR] 無法取得 {user_id} 的資料，跳過...")
            break

        # 移除開頭的 ')]}'
        raw_content = response.text
        if raw_content.startswith(")]}'"):
            raw_content = raw_content[4:]
        
        data = json.loads(raw_content)

        for review in data[45][0]:
            try:
                # 檢查是否有台北標準時間
                if "台北標準時間" not in str(review[4][13][1][1]):
                    continue
                
                gmap_location = review[1][0][4] if len(review[1][0]) > 4 else ""
                location_id = review[1][0][2] if len(review[1][0]) > 2 else ""
                review_id = review[1][0][1]
                timestamp = review[2][1][2] // 1000000
                formatted_date = datetime.utcfromtimestamp(timestamp).strftime('%Y-%m-%d')
                rating = review[2][2][0][0]
                language = review[2][2][14][0] if len(review[2][2]) > 14 else ""
                comment = review[2][2][15][0][0] if len(review[2][2]) > 15 else ""
                translated = review[2][2][15][1][0] if len(review[2][2]) > 15 else ""
                
                reviews_data.append([user_id, gmap_location, location_id, review_id, formatted_date, rating, language, comment, translated])
                
                # 更新景點統計
                if location_id and location_id not in locations_data:
                    locations_data[location_id] = {
                        "gmap_location": gmap_location,
                        "count": 0,
                    }
                if location_id:
                    locations_data[location_id]["count"] += 1
            except Exception as e:
                print(f"[ERROR] 解析評論失敗: {e}")
                continue


  formatted_date = datetime.utcfromtimestamp(timestamp).strftime('%Y-%m-%d')


[ERROR] 無法取得 102593444098371563490 的資料，跳過...
[ERROR] 無法取得 110113342761671225928 的資料，跳過...
[ERROR] 無法取得 103308312894993937064 的資料，跳過...
[ERROR] 無法取得 112799207508959677048 的資料，跳過...
[ERROR] 無法取得 102371826996902429012 的資料，跳過...
[ERROR] 無法取得 102481634779662002158 的資料，跳過...
[ERROR] 無法取得 113567067201975349034 的資料，跳過...
[ERROR] 無法取得 112998935518815021950 的資料，跳過...
[ERROR] 無法取得 117234750600340065368 的資料，跳過...


In [3]:
print(url)

https://www.google.com/locationhistory/preview/mas?authuser=0&hl=zh-TW&gl=tw&pb=!1s117234750600340065368!2m3!1sk8euZ-hso8_V7w-f78egAQ!7e81!15i14416!6m2!4b1!7b1!9m0!17m28!1m6!1m2!1i0!2i0!2m2!1i530!2i306!1m6!1m2!1i1870!2i0!2m2!1i1920!2i306!1m6!1m2!1i0!2i0!2m2!1i1920!2i20!1m6!1m2!1i0!2i286!2m2!1i1920!2i306!18m9!1m3!1d46090.09471573684!2d121.5320757!3d25.0371489!2m0!3m2!1i1920!2i306!4f13.1!41m15!1i20!2m9!2b1!3b1!5b1!7b1!12m4!1b1!2b1!4m1!1e1!3s!7m2!1m1!1e1
