## 📦 필요 라이브러리 설치

In [None]:
!pip -q install google-api-python-client tqdm

## ✅ 분석 코드

In [None]:
# ------------------ 1) 설치 & 설정 ------------------

from googleapiclient.discovery import build
from tqdm import tqdm
import json, time, os

# ▶︎▶︎▶︎ MUST BE CONFIGURED ◀︎◀︎◀︎
# The API key is read from the environment so the secret is never stored in the
# notebook itself. Any key that was previously hard-coded here should be
# treated as leaked and revoked/rotated in the Google Cloud console.
YOUTUBE_API_KEY = os.environ.get("YOUTUBE_API_KEY", "")
if not YOUTUBE_API_KEY:
    raise RuntimeError(
        "Set the YOUTUBE_API_KEY environment variable before running this cell."
    )
KEYWORDS       = ["스웨디시 젤리", "CU 티라미수", "GS25 푸딩"]  # search keywords to collect for
TARGET_COUNT   = 3000      # target number of comments to collect
MAX_VIDEOS     = 50        # maximum number of videos searched per keyword
MAX_PAGES      = 5         # maximum comment pages (100 comments each) per video

# Shared YouTube Data API v3 client used by the helper functions below.
yt = build("youtube", "v3", developerKey=YOUTUBE_API_KEY)

# ------------------ 2) 유틸 함수 ------------------
def search_video_ids(keyword, max_results=MAX_VIDEOS):
    """Return a list of video IDs matching *keyword*.

    Pages through ``search().list`` ordered by relevance, asking for at most
    50 results per request and never more than are still needed to reach
    *max_results*. Stops early when the API reports no further page.
    """
    found = []
    page_token = None
    while True:
        remaining = max_results - len(found)
        if remaining <= 0:
            break
        request = yt.search().list(
            part="id",
            q=keyword,
            type="video",
            order="relevance",
            maxResults=min(50, remaining),
            pageToken=page_token,
        )
        response = request.execute()
        for item in response["items"]:
            found.append(item["id"]["videoId"])
        page_token = response.get("nextPageToken")
        if not page_token:
            break
    return found

from googleapiclient.errors import HttpError

def _http_error_reason(err):
    """Best-effort extraction of the API error reason from an HttpError.

    Never raises: if the error body is not the expected JSON structure, falls
    back to the HTTP status (when available) or an empty string.
    """
    try:
        payload = json.loads(err.content.decode())
        # `or [{}]` also covers an explicitly empty "errors" list.
        details = payload.get("error", {}).get("errors") or [{}]
        reason = details[0].get("reason", "")
    except (AttributeError, IndexError, KeyError, TypeError, ValueError):
        # ValueError covers both JSONDecodeError and UnicodeDecodeError.
        reason = ""
    if reason:
        return reason
    status = getattr(getattr(err, "resp", None), "status", None)
    return f"HTTP {status}" if status is not None else ""


def fetch_comments(video_id, max_pages=MAX_PAGES):
    """Return the stripped top-level comment texts of a video.

    Reads up to *max_pages* pages of 100 comment threads each. If the API
    raises an ``HttpError`` (e.g. comments disabled, video private/deleted),
    the video is skipped with a printed warning; comments gathered before the
    error are still returned.
    """
    comments, token = [], None
    try:
        for _ in range(max_pages):
            resp = yt.commentThreads().list(
                part="snippet", videoId=video_id,
                maxResults=100, pageToken=token
            ).execute()
            comments.extend(
                it["snippet"]["topLevelComment"]["snippet"]["textOriginal"].strip()
                for it in resp.get("items", [])
            )
            token = resp.get("nextPageToken")
            if not token:
                break
    except HttpError as e:
        # 403 (comments disabled), 404 (private/deleted) etc. are skipped.
        # Reason parsing is best-effort so a malformed error body cannot
        # abort the whole collection run.
        err_reason = _http_error_reason(e)
        print(f"⚠️  skip video {video_id} — {err_reason}")
    return comments


# ------------------ 3) 간단한 noise 필터 ------------------
def is_noise(text:str) -> bool:
    """Return True for comments that are too short, promotional, or a simple
    "where can I buy it" question.

    A comment is noise when any of the following holds:
    - it has fewer than 5 characters;
    - it contains a link ("http") or a subscribe request ("구독");
    - it ends with "?" and mentions "어디" (where) or "팔아요" (sell).
    """
    if len(text) < 5:
        return True

    promo_markers = ("http", "구독")
    if any(marker in text for marker in promo_markers):
        return True

    purchase_words = ("어디", "팔아요")
    asks_where_to_buy = any(word in text for word in purchase_words)
    return text.endswith("?") and asks_where_to_buy

# ------------------ 4) (옵션) 최소 가공 형태 JSONL 레코드 ------------------
def to_jsonl(text:str, noise=None) -> str:
    """Serialize one comment as a single-line JSON training record.

    The record has two keys: ``textInput`` (the comment) and ``output`` (a
    JSON-encoded string holding the label skeleton). Only ``is_noise`` is
    filled in; the other label fields are left as ``null`` placeholders.

    Args:
        text: The comment text.
        noise: Optional explicit noise flag. When omitted (``None``), the flag
            is computed with ``is_noise(text)`` so the label always matches
            the filter, instead of being hard-coded to ``True``.

    Returns:
        A JSON string without raw newlines, suitable as one JSONL line.
    """
    if noise is None:
        noise = is_noise(text)
    rec = {
        "textInput": text,
        "output": json.dumps({
            "attributes": None,
            "meta": None,
            "is_noise": bool(noise),
            "overall_sentiment": None,
        }, ensure_ascii=False),
    }
    return json.dumps(rec, ensure_ascii=False)

# ------------------ 5) 메인 파이프라인 ------------------
all_comments, collected = [], 0
seen_videos = set()   # a video can match several keywords; fetch it only once
pbar = tqdm(total=TARGET_COUNT, desc="Collecting")

try:
    for kw in KEYWORDS:
        if collected >= TARGET_COUNT:
            break
        for vid in search_video_ids(kw):
            if collected >= TARGET_COUNT:
                break
            if vid in seen_videos:
                # Skipping avoids duplicate records and repeated API calls.
                continue
            seen_videos.add(vid)
            for c in fetch_comments(vid):
                if collected >= TARGET_COUNT:
                    break
                if is_noise(c):   # noise is not stored (could be stored if desired)
                    continue
                all_comments.append(to_jsonl(c))
                collected += 1
                pbar.update(1)
finally:
    # Close the progress bar even if a search/fetch call raises.
    pbar.close()

print(f"✅ 수집 완료: {collected}개 댓글")

# ------------------ 6) Save to file & download ------------------
FNAME = "youtube_reviews_3000.jsonl"
with open(FNAME, "w", encoding="utf8") as f:
    # One record per line, each terminated by a newline (JSONL convention),
    # so the file can be appended to or concatenated safely.
    f.writelines(line + "\n" for line in all_comments)

print(f"📁 저장 완료 → {FNAME}")

# Download to the local machine when running in Colab; elsewhere the file is
# simply left on disk instead of crashing on the missing module.
try:
    from google.colab import files
except ImportError:
    files = None
if files is not None:
    files.download(FNAME)



Collecting:  81%|████████  | 2416/3000 [04:33<01:06,  8.85it/s] 

Collecting:   0%|          | 1/3000 [00:00<29:30,  1.69it/s][A
Collecting:   5%|▌         | 150/3000 [00:01<00:16, 170.40it/s][A
Collecting:  11%|█         | 322/3000 [00:01<00:07, 365.10it/s][A
Collecting:  13%|█▎        | 388/3000 [00:01<00:10, 240.79it/s][A
Collecting:  22%|██▏       | 660/3000 [00:01<00:04, 469.11it/s][A
Collecting:  24%|██▍       | 735/3000 [00:02<00:06, 329.70it/s][A
Collecting:  34%|███▍      | 1015/3000 [00:02<00:03, 566.45it/s][A
Collecting:  37%|███▋      | 1115/3000 [00:03<00:04, 389.75it/s][A
Collecting:  46%|████▌     | 1365/3000 [00:03<00:03, 524.04it/s][A
Collecting:  48%|████▊     | 1455/3000 [00:03<00:03, 404.97it/s][A
Collecting:  57%|█████▋    | 1724/3000 [00:04<00:02, 633.53it/s][A
Collecting:  61%|██████▏   | 1843/3000 [00:04<00:02, 571.16it/s][A
Collecting:  65%|██████▍   | 1938/3000 [00:05<00:03, 305.75it/s][A


⚠️  skip video XtXgsq0An8E — commentsDisabled



Collecting:  82%|████████▏ | 2447/3000 [00:05<00:01, 452.22it/s][A
Collecting:  84%|████████▍ | 2518/3000 [00:06<00:01, 359.97it/s][A
Collecting:  88%|████████▊ | 2651/3000 [00:06<00:00, 425.18it/s][A
Collecting:  90%|█████████ | 2714/3000 [00:07<00:01, 232.12it/s][A
Collecting:  95%|█████████▌| 2850/3000 [00:07<00:00, 289.91it/s][A
Collecting:  97%|█████████▋| 2915/3000 [00:07<00:00, 305.23it/s][A
Collecting: 100%|██████████| 3000/3000 [00:08<00:00, 360.95it/s]

✅ 수집 완료: 3000개 댓글
📁 저장 완료 → youtube_reviews_3000.jsonl





<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>