In [1]:
# Install required library
!pip install google-api-python-client pandas

from googleapiclient.discovery import build
import pandas as pd
import re

# Set up YouTube API credentials
API_KEY = "API"  # Replace with your API key
youtube = build("youtube", "v3", developerKey=API_KEY)


# Function to clean comments
def clean_comment(comment):
    """
    Clean a comment by removing HTML tags and unnecessary whitespace.
    """
    comment = re.sub(r'<[^>]+>', '', comment).strip()  # Remove HTML tags and trim
    return comment if comment else None  # Return None if the comment is empty

# Function to get comments from a YouTube video
def get_video_comments(video_id):
    comments = []
    next_page_token = None

    while True:
        request = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            pageToken=next_page_token,
            maxResults=100
        )
        response = request.execute()

        for item in response.get("items", []):
            # Extract and clean the comment
            raw_comment = item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
            cleaned_comment = clean_comment(raw_comment)
            if cleaned_comment:  # Include only cleaned, non-empty comments
                comments.append(cleaned_comment)

        next_page_token = response.get("nextPageToken", None)
        if not next_page_token:
            break

    print(f"Collected {len(comments)} comments from video ID: {video_id}")
    return comments

# Save comments to a CSV file (without IDs)
def save_comments_to_csv(video_ids, output_file):
    all_comments = []

    for video_id in video_ids:
        print(f"Collecting comments for video: {video_id}")
        comments = get_video_comments(video_id)

        all_comments.extend(comments)

    # Debugging: Ensure there are comments collected
    print("Number of comments:", len(all_comments))

    # Save comments to a CSV file
    df = pd.DataFrame({"comment": all_comments})
    df.to_csv(output_file, index=False, encoding="utf-8")

    print(f"Saved {len(all_comments)} comments to {output_file}")


# Example usage
video_ids = ["aZN_ZgR_j_8", "heUQx35qXio", "nBq60wwkg7k", "mQrZuW-ooUs", "_WLs1E5tRUc", "JdwCcU1njMA", "xBimuKlO_8Q", "rwkstgOjQ5I", "gobeJXbFtBk", "33YwDyW31DE",
             "jVYAjmWo-C0", "1vzxMcXTwSI", "EI7TiCOKj-s", "K0ZN233sp1c", "QA9QQCvbvyU", "0xXLw1Q9HF4", "odNH2op25RM", "pDKNB8WA47I", "VOxVjF1kr-U", "E3mSYrbqml4",
             "jilktmDiBvY", "EV4J8J6RlOE", "0SDrhZ-jBSw", "XYZNdcQUKWU", "4ZdQyk7Pq_s", "Cxi9npzQqbQ", "NJbZICrq_9A", "63dhJUDsI6U", "55tJji52K8k", "ulT6T8KKRI0",
             "KdtrPVgkfas", "YCkahuYfynU", "tqMaLCwYuiw", "e6HIRBwLGeM", "EHe-YpY0WG0", "0PLHVzteHRY", "IBLgV3gdwtA", "15C8mfeY290", "N6nGRCUWkIM", "rgxmIqZ4fMw",
             "bvWe5_EvteM", "NX6iHRYFZaw", "-LhS8D4nqng", "qRb_72NgFLo", "94HekzWPmCc", "pdU9kzI-7Hg", "2ET4VgCpJ2k", "0VVkPxN_C0Y", "ZrW3oFRqkdg", "A6yZVDJ6rIY",
             "vMirkVh4I-w", "4-tLhqZZC6A", "H89jbSTH2p8", "2QW34US0QrA", "J-bfaC8baeg", "912mc32Ej9g", "8a7eUHR73To", "tQfrzYUiong", "p6XKfa9IpR4", "jG8xm3Uuoa0",
             "BZvxmonSf4c", "QIWX4oNn3Y0", "Q3a2bZkWAIY", "LV7im5BBMYQ", "SobAzfneQU0", "riKowdciniU", "72WgjWOLOH4", "r0o64vcdl3g", "kXc9L-agT9Q", "vUuzMYbdxAo",
             "-C9K4f5ss3M", "x7JiIoI5tfg", "tRkIaCSXjo0", "clr4vID8eVY", "-7wGu2GxMSw", "R8U-DJSNEUY", "pbSG3zLuvuU", "rzYt9pSSZLc", "lv5GMUAZOBM", "pqbyeO8yj4g",
             "2xtw_arzhoI", "P5W_OuKnvBQ", "yVm4vEe_vlE"]  # Replace with YouTube video IDs
save_comments_to_csv(video_ids, "doublej_music_comments.csv")


Collecting comments for video: aZN_ZgR_j_8
Collected 100 comments from video ID: aZN_ZgR_j_8
Collecting comments for video: heUQx35qXio
Collected 51 comments from video ID: heUQx35qXio
Collecting comments for video: nBq60wwkg7k
Collected 15 comments from video ID: nBq60wwkg7k
Collecting comments for video: mQrZuW-ooUs
Collected 5 comments from video ID: mQrZuW-ooUs
Collecting comments for video: _WLs1E5tRUc
Collected 4 comments from video ID: _WLs1E5tRUc
Collecting comments for video: JdwCcU1njMA
Collected 4 comments from video ID: JdwCcU1njMA
Collecting comments for video: xBimuKlO_8Q
Collected 3 comments from video ID: xBimuKlO_8Q
Collecting comments for video: rwkstgOjQ5I
Collected 3 comments from video ID: rwkstgOjQ5I
Collecting comments for video: gobeJXbFtBk
Collected 7 comments from video ID: gobeJXbFtBk
Collecting comments for video: 33YwDyW31DE
Collected 3 comments from video ID: 33YwDyW31DE
Collecting comments for video: jVYAjmWo-C0
Collected 8 comments from video ID: jVYAjmW

In [2]:
import re
import pandas as pd

# Function to clean comments (keeping only Burmese and emojis)
def clean_burmese_comments(comment):
    """
    Clean a comment by keeping only Burmese characters and emojis.
    Removes English, Japanese, Chinese, or any other non-Burmese text.
    """
    # Regex pattern for Burmese characters (including mixed Burmese text) and emojis
    burmese_pattern = re.compile(r'[\u1000-\u109F\s\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F\U0001F780-\U0001F7FF\U0001F800-\U0001F8FF\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F]')

    # Find all Burmese characters and emojis
    cleaned_comment = "".join(re.findall(burmese_pattern, comment))

    # Return None if the comment becomes empty after cleaning
    return cleaned_comment.strip() if cleaned_comment.strip() else None

# Load the CSV file with saved comments
def load_comments_from_csv(input_file):
    df = pd.read_csv(input_file)
    return df['comment'].tolist()  # Assuming the column is named 'comment'

# Filter the comments to keep only Burmese and emojis
def filter_burmese_comments(comments):
    # Clean and filter out None, empty, or whitespace-only comments
    return [clean_burmese_comments(comment) for comment in comments if clean_burmese_comments(comment)]

# Save filtered comments to a new CSV file (with ID column)
def save_filtered_comments_to_csv(filtered_comments, output_file):
    # Ensure only valid (non-empty) comments are included
    if len(filtered_comments) == 0:
        print("No valid comments to save.")
        return

    # Generate IDs for the filtered comments
    comment_ids = list(range(1, len(filtered_comments) + 1))  # Generating IDs

    # Create the DataFrame with valid comments and IDs
    df = pd.DataFrame({"ID": comment_ids, "comment": filtered_comments})

    # Save the DataFrame to a CSV file
    df.to_csv(output_file, index=False, encoding="utf-8")
    print(f"Saved {len(filtered_comments)} filtered comments with IDs to {output_file}")

# Main processing function
def process_comments(input_file, output_file):
    # Load comments from the CSV
    comments = load_comments_from_csv(input_file)

    # Filter comments (only Burmese and emojis)
    filtered_comments = filter_burmese_comments(comments)

    # Save filtered comments to new CSV with IDs
    save_filtered_comments_to_csv(filtered_comments, output_file)

# Example usage
input_file = "doublej_music_comments.csv"  # Replace with your file path
output_file = "doublej_filtered_comments_with_id.csv"  # Output file path
process_comments(input_file, output_file)



Saved 8632 filtered comments with IDs to doublej_filtered_comments_with_id.csv
