In [27]:
!pip install google-play-scraper
!pip install requests beautifulsoup4 pandas matplotlib textblob langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
     981.5/981.5 kB 12.1 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... done
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993223 sha256=5af16ba4e098b297e13d3e8bc3e39023132decfc2390ac8425eff8f56137f538
  Stored in directory: /root/.cache/pip/wheels/0a/f2/b2/e5ca405801e05eb7c8ed5b3b4bcf1fcabcd6272c167640072e
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [48]:
import time
from datetime import datetime

import pandas as pd
from google_play_scraper import reviews, Sort
from langdetect import DetectorFactory, detect

# Scrape Google Play reviews for one app, newest first, keeping only reviews
# that are at least `min_words` words long and posted on/after `cutoff_date`.
# Each kept review is tagged with a detected language, and the result is
# written to a CSV file and exposed as `df_google`.

# --- Configuration ----------------------------------------------------------
app_id = "jp.go.cas.mpa"
total_reviews = 5400                 # stop once this many reviews are kept
batch_size = 200                     # reviews requested per call
cutoff_date = datetime(2024, 1, 1)   # reviews older than this are discarded
min_words = 5                        # shorter reviews are discarded
max_empty_batches = 3                # consecutive batches with nothing kept
max_retries = 5                      # consecutive failed fetches before giving up

# langdetect is non-deterministic by default; seeding makes the detected
# language of short/ambiguous texts reproducible across runs.
DetectorFactory.seed = 0

# --- State ------------------------------------------------------------------
all_reviews = []
collected_reviews = 0
continuation_token = None
seen_ids = set()
# Counts consecutive batches that contributed no kept review (because every
# review was a duplicate, too short, or older than the cutoff).
duplicate_batches = 0
failed_attempts = 0

while collected_reviews < total_reviews:
    # Only the network call is retried. If it raises, the tuple assignment
    # never happens, so `continuation_token` still points at the same page.
    try:
        batch, continuation_token = reviews(
            app_id,
            lang="en",
            country="US",
            count=batch_size,
            sort=Sort.NEWEST,
            continuation_token=continuation_token,
        )
    except Exception as e:
        failed_attempts += 1
        if failed_attempts >= max_retries:
            # Bounded retries: a persistent failure must not loop forever.
            print(f"Error occurred: {e}. Giving up after {max_retries} failed attempts.")
            break
        print(f"Error occurred: {e}. Retrying in 5 seconds...")
        time.sleep(5)
        continue
    failed_attempts = 0

    if not batch:
        print("No new reviews retrieved. Stopping execution.")
        break

    new_batch = []
    for review in batch:
        review_id = review.get("reviewId")
        if review_id in seen_ids:
            continue
        seen_ids.add(review_id)

        # Guard against missing/None fields so one malformed review cannot
        # abort the whole batch.
        text = review.get("content") or ""
        posted_at = review.get("at")
        if posted_at is None or posted_at < cutoff_date:
            continue
        if len(text.split()) < min_words:
            continue

        try:
            lang = detect(text)
        except Exception:
            # Best effort: detection fails on texts without usable features.
            # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
            lang = "unknown"

        new_batch.append({
            "content": text,
            "score": review["score"],
            "date": posted_at.strftime("%Y-%m-%d"),
            "lang": lang,
        })

    if not new_batch:
        duplicate_batches += 1
        if duplicate_batches >= max_empty_batches:
            print("Too many batches without new usable reviews. Stopping.")
            break
    else:
        all_reviews.extend(new_batch)
        collected_reviews = len(all_reviews)
        duplicate_batches = 0

    print(f"✅ {collected_reviews} / {total_reviews} reviews collected...")

    # Results are requested newest-first, so once an entire batch predates the
    # cutoff, further pages are expected to be older still.
    if all(r.get("at") is not None and r["at"] < cutoff_date for r in batch):
        print("Reached reviews older than the cutoff date. Stopping.")
        break

    if not continuation_token:
        print("No more pages available. Stopping.")
        break

    time.sleep(1)  # be polite to the endpoint between pages

# Explicit columns keep the CSV header intact even when nothing was collected.
df_google = pd.DataFrame(all_reviews, columns=["content", "score", "date", "lang"])
file_name = "googleplay_reviews.csv"
df_google.to_csv(file_name, index=False, encoding="utf-8")

print(len(df_google))
# Must be the last expression of the cell for the notebook to render it.
df_google.head()


✅ 21 / 5400 reviews collected...
✅ 33 / 5400 reviews collected...
✅ 48 / 5400 reviews collected...
✅ 64 / 5400 reviews collected...
✅ 80 / 5400 reviews collected...
✅ 94 / 5400 reviews collected...
✅ 111 / 5400 reviews collected...
✅ 134 / 5400 reviews collected...
✅ 146 / 5400 reviews collected...
✅ 161 / 5400 reviews collected...
✅ 164 / 5400 reviews collected...
✅ 164 / 5400 reviews collected...
✅ 164 / 5400 reviews collected...
Too many duplicate batches. Stopping.
164


In [21]:
# Number of rows currently held in the scraped-reviews DataFrame.
df_google.shape[0]

3000