<a href="https://colab.research.google.com/github/ahmadizza/AMS-MID-CS-GROUP-G/blob/main/Ambil_Data_Youtube_dengan_API.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import csv
import os
from datetime import datetime

import googleapiclient.discovery

# Read the API key from the environment so a real credential never has to be
# saved in the notebook; the placeholder is used when the variable is unset.
api_key = os.environ.get("YOUTUBE_API_KEY", "your-api-key")

# Search phrases; each one is scraped into its own CSV file.
keywords = [
    "Program barak militer",
    "Barak Militer Dedi Mulyadi",
    "Barak militer anak",
    "Barak militer siswa",
    "Siswa nakal Jawa barat"
]

used_video_ids = set()  # Video IDs already scraped, shared across keywords

def _parse_published_at(published_at):
    """Parse a ``publishedAt`` string into a naive ``datetime``.

    Accepts ``YYYY-MM-DDTHH:MM:SSZ`` with or without fractional seconds.
    Raises ``ValueError`` when neither form matches.
    """
    for fmt in ("%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S.%fZ"):
        try:
            return datetime.strptime(published_at, fmt)
        except ValueError:
            continue
    raise ValueError(f"Unrecognized publishedAt timestamp: {published_at!r}")


def search_and_get_comments(search_query, max_videos=10, max_comments_per_video=250, start_date=None, end_date=None, csv_filename="youtube_comments.csv"):
    """Search YouTube for ``search_query`` and write top-level comments to a CSV file.

    Videos whose IDs are already in the module-level ``used_video_ids`` set are
    skipped, and the videos selected by this call are added to that set. If no
    unseen video is found, a message is printed and no file is written.

    Args:
        search_query: Search phrase; also stored in the ``keyword`` column.
        max_videos: Stop selecting videos once this many unseen ones are found.
        max_comments_per_video: Stop writing comments for a video once this
            many have been written.
        start_date: Optional ``YYYY-MM-DD`` string; comments published before
            this calendar date are dropped.
        end_date: Optional ``YYYY-MM-DD`` string; comments published after
            this calendar date are dropped. Comments posted at any time during
            the end date itself are kept.
        csv_filename: Output path; an existing file is overwritten.

    Returns:
        None. Results are written to ``csv_filename`` and a summary is printed.

    Raises:
        ValueError: If ``start_date`` or ``end_date`` is not ``YYYY-MM-DD``.
        Errors from the search request propagate. Errors while fetching the
        comments of one video are printed and the remaining videos continue.
    """
    youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=api_key)

    search_response = youtube.search().list(
        part="snippet",
        q=search_query,
        type="video",
        # Over-fetch so already-used videos can be filtered out. The request
        # size grows with max_videos but is capped at 50 (the API maximum).
        maxResults=min(50, max(30, max_videos))
    ).execute()

    # Keep only videos that no earlier call has used
    video_titles_and_ids = {}
    for item in search_response.get('items', []):
        if item['id']['kind'] != 'youtube#video':
            continue
        video_id = item['id']['videoId']
        if video_id in used_video_ids:
            continue
        video_titles_and_ids[video_id] = item['snippet']['title']
        if len(video_titles_and_ids) >= max_videos:
            break

    used_video_ids.update(video_titles_and_ids.keys())  # Mark these videos as used

    if not video_titles_and_ids:
        print(f"No new videos found for keyword '{search_query}'. Skipping.")
        return

    total_comments = 0
    skipped_malformed = 0
    unique_comments = set()

    # Compare calendar dates so that both bounds are inclusive; comparing
    # against midnight of end_date would drop every comment posted that day.
    start_day = datetime.strptime(start_date, "%Y-%m-%d").date() if start_date else None
    end_day = datetime.strptime(end_date, "%Y-%m-%d").date() if end_date else None

    with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["keyword", "video_id", "video_title", "comment_text", "published_at"])  # Header

        for video_id, video_title in video_titles_and_ids.items():
            try:
                next_page_token = None
                comments_written = 0

                while True:
                    response = youtube.commentThreads().list(
                        part="snippet",
                        videoId=video_id,
                        maxResults=100,
                        pageToken=next_page_token
                    ).execute()

                    for item in response.get('items', []):
                        try:
                            snippet = item['snippet']['topLevelComment']['snippet']
                            comment_text = snippet['textDisplay']
                            published_at = snippet['publishedAt']
                            comment_day = _parse_published_at(published_at).date()
                        except (KeyError, TypeError, ValueError):
                            # One malformed item must not abort the whole video
                            skipped_malformed += 1
                            continue

                        if (start_day and comment_day < start_day) or (end_day and comment_day > end_day):
                            continue

                        # De-duplicate on video ID + comment text
                        unique_id = f"{video_id}_{comment_text}"
                        if unique_id in unique_comments:
                            continue
                        unique_comments.add(unique_id)

                        writer.writerow([search_query, video_id, video_title, comment_text, published_at])
                        total_comments += 1
                        comments_written += 1

                        if comments_written >= max_comments_per_video:
                            break

                    next_page_token = response.get("nextPageToken")
                    if not next_page_token or comments_written >= max_comments_per_video:
                        break

            except Exception as e:
                # Best effort: report the failure and move on to the next video
                print(f"Error retrieving comments for video ID {video_id}: {e}")

    print(f"CSV file '{csv_filename}' created for keyword '{search_query}'.")
    print(f"Total unique comments retrieved: {total_comments}")
    print(f"Number of videos fetched: {len(video_titles_and_ids)}")
    if skipped_malformed:
        print(f"Skipped {skipped_malformed} comment(s) with missing fields or unparseable timestamps.")

# Scrape every keyword over the same date window
start_date = "2025-05-02"
end_date = "2025-05-28"

for keyword in keywords:
    # One output file per keyword; spaces in the keyword become underscores
    csv_file = f"youtube_comments_{keyword.replace(' ', '_')}.csv"
    search_and_get_comments(
        keyword,
        max_videos=10,
        max_comments_per_video=250,
        start_date=start_date,
        end_date=end_date,
        csv_filename=csv_file,
    )


CSV file 'youtube_comments_Program_barak_militer.csv' created for keyword 'Program barak militer'.
Total unique comments retrieved: 1245
Number of videos fetched: 10
CSV file 'youtube_comments_Barak_Militer_Dedi_Mulyadi.csv' created for keyword 'Barak Militer Dedi Mulyadi'.
Total unique comments retrieved: 955
Number of videos fetched: 10
CSV file 'youtube_comments_Barak_militer_anak.csv' created for keyword 'Barak militer anak'.
Total unique comments retrieved: 1313
Number of videos fetched: 10
CSV file 'youtube_comments_Barak_militer_siswa.csv' created for keyword 'Barak militer siswa'.
Total unique comments retrieved: 1012
Number of videos fetched: 10
CSV file 'youtube_comments_Siswa_nakal_Jawa_barat.csv' created for keyword 'Siswa nakal Jawa barat'.
Total unique comments retrieved: 1010
Number of videos fetched: 10


In [None]:
import csv
import glob

def collect_comments(file_names):
    """Index the rows of the given comment CSV files by video ID and comment text.

    Args:
        file_names: Iterable of CSV paths whose header includes ``video_id``,
            ``video_title``, ``comment_text`` and ``published_at``.

    Returns:
        A dict keyed by ``"<video_id>_<comment_text stripped>"``. Each value is
        ``{'count': int, 'details': [(file_name, video_id, video_title,
        comment_text, published_at), ...]}`` with one tuple per occurrence.
    """
    collected = {}
    for file_name in file_names:
        # newline='' lets the csv module handle line breaks inside quoted
        # comment text itself instead of having them translated on read.
        with open(file_name, mode='r', newline='', encoding='utf-8') as file:
            for row in csv.DictReader(file):
                # A short row yields None for missing columns; treat as empty text
                comment_text = row['comment_text'] or ''
                comment_id = f"{row['video_id']}_{comment_text.strip()}"
                entry = collected.setdefault(comment_id, {'count': 0, 'details': []})
                entry['count'] += 1
                entry['details'].append((file_name, row['video_id'], row['video_title'], row['comment_text'], row['published_at']))
    return collected


# Read every scraped CSV file; sorted so the report is reproducible
all_comments = collect_comments(sorted(glob.glob("youtube_comments_*.csv")))

# Keep the comments that occur more than once across all rows read
duplicates = {k: v for k, v in all_comments.items() if v['count'] > 1}

# Write the duplicate report to a new CSV file
with open('duplicate_comments_report.csv', mode='w', newline='', encoding='utf-8') as report_file:
    writer = csv.writer(report_file)
    writer.writerow(['video_id', 'comment_text', 'file_1', 'file_2', 'published_at'])

    for comment_id, data in duplicates.items():
        # Report the files of the first two occurrences of the duplicate
        details = data['details']
        writer.writerow([
            details[0][1],  # video_id
            details[0][3],  # comment_text
            details[0][0],  # file_1
            details[1][0],  # file_2
            details[0][4]   # published_at
        ])

print(f"Duplicate check finished. Found {len(duplicates)} duplicate comments across files.")
print("Report saved as 'duplicate_comments_report.csv'.")


Duplicate check finished. Found 0 duplicate comments across files.
Report saved as 'duplicate_comments_report.csv'.


In [None]:
import csv
import glob

# Name of the combined output file
output_file = "all_youtube_comments_combined.csv"

# Find every matching file; sorted so the merged row order is reproducible
csv_files = sorted(glob.glob("youtube_comments_*.csv"))


def merge_csv_files(input_paths, output_path):
    """Concatenate CSV files into ``output_path``, writing the header once.

    The header of the first non-empty input is written once; the first line of
    every later input is treated as its header and dropped. Inputs with no
    lines at all are skipped.

    Args:
        input_paths: Iterable of CSV paths to merge, in order.
        output_path: Destination path; an existing file is overwritten.

    Returns:
        The number of input files that contributed to the output.
    """
    merged = 0
    header_written = False
    with open(output_path, mode='w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile)
        for file_name in input_paths:
            # newline='' keeps line breaks inside quoted fields intact
            with open(file_name, mode='r', newline='', encoding='utf-8') as infile:
                reader = csv.reader(infile)
                headers = next(reader, None)
                if headers is None:
                    continue  # completely empty file: nothing to copy

                # Write the header only once
                if not header_written:
                    writer.writerow(headers)
                    header_written = True

                writer.writerows(reader)
                merged += 1
    return merged


if not csv_files:
    print("No youtube_comments_*.csv files found.")
else:
    merged_count = merge_csv_files(csv_files, output_file)
    print(f"All CSV files combined into '{output_file}'. Total files merged: {merged_count}")


All CSV files combined into 'all_youtube_comments_combined.csv'. Total files merged: 5


In [None]:
import pandas as pd

# Load the combined comments CSV into a DataFrame
df = pd.read_csv("all_youtube_comments_combined.csv")
# Show the (rows, columns) size of the loaded data
df.shape

(5535, 5)

In [None]:
# Show the column labels of df
df.columns

Index(['keyword', 'video_id', 'video_title', 'comment_text', 'published_at'], dtype='object')

In [None]:
# Count the missing values in each column of df
df.isnull().sum()

Unnamed: 0,0
keyword,0
video_id,0
video_title,0
comment_text,0
published_at,0
