In [None]:
import os

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import seaborn as sns
from googleapiclient.discovery import build
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Download the NLTK resources used for tokenization and stopword removal.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Indonesian stopword list, stored as a set for fast membership tests.
stop_words = set(stopwords.words('indonesian'))

# Load the IndoBERT model and tokenizer and wrap them in a classification pipeline.
# NOTE(review): the load warning recorded in this notebook's output says the
# classifier weights of this checkpoint are newly initialized, i.e. the
# classification head is untrained. Predictions are therefore not meaningful
# until the model is fine-tuned (or replaced by a checkpoint fine-tuned for
# sentiment) -- confirm which checkpoint is intended.
model_name = "indobenchmark/indobert-large-p2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
sentiment_pipeline = pipeline(
    "sentiment-analysis", model=model, tokenizer=tokenizer)

# Video ID and API key.
video_id = "mb2Jw47oVl8"  # YouTube video ID

# The API key is a credential and must not be stored in the notebook; it is
# read from the environment instead. Any key that was previously committed
# here should be treated as leaked and revoked.
apikey = os.environ.get("YOUTUBE_API_KEY")
if not apikey:
    raise RuntimeError(
        "Set the YOUTUBE_API_KEY environment variable before running this notebook."
    )


def _comment_row(snippet):
    """Flatten one comment snippet into [publishedAt, author, text, likeCount]."""
    return [
        snippet['publishedAt'],
        snippet['authorDisplayName'],
        snippet['textDisplay'],
        snippet['likeCount'],
    ]


def video_comments(video_id, apikey):
    """
    Fetch comments from a YouTube video.

    Walks every page of the video's comment threads and collects each
    top-level comment followed by the replies embedded in that thread.

    Args:
        video_id (str): YouTube video ID.
        apikey (str): YouTube API key.
    Returns:
        list: One ``[publishedAt, authorDisplayName, textDisplay, likeCount]``
        list per comment or reply, in the order the API returned them.

    NOTE(review): only the replies embedded in each thread resource are
    collected; the API documentation indicates this may be a subset of all
    replies -- confirm whether ``comments.list`` with ``parentId`` is needed.
    """
    rows = []
    youtube = build('youtube', 'v3', developerKey=apikey)

    params = {
        'part': 'snippet,replies',
        'videoId': video_id,
        # Request the largest page the API allows instead of the default page
        # size, which cuts the number of round trips.
        'maxResults': 100,
        # Ask for plain text so HTML tags and entities do not leak into the
        # tokens produced by downstream text processing.
        'textFormat': 'plainText',
    }

    while True:
        response = youtube.commentThreads().list(**params).execute()

        for item in response.get('items', []):
            rows.append(_comment_row(item['snippet']['topLevelComment']['snippet']))

            # 'replies' is absent when a thread has no replies.
            for reply in item.get('replies', {}).get('comments', []):
                rows.append(_comment_row(reply['snippet']))

        next_token = response.get('nextPageToken')
        if not next_token:
            break
        params['pageToken'] = next_token

    return rows


def analyze_sentiment_indo(comment):
    """
    Analyze sentiment using the IndoBERT classification pipeline.

    Args:
        comment (str): Text to analyze.
    Returns:
        str: Sentiment label ('positif', 'negatif', or 'netral').
        'LABEL_1' maps to 'positif', 'LABEL_0' to 'negatif', and any other
        label to 'netral'. Empty or whitespace-only text is reported as
        'netral' without calling the model.

    NOTE(review): the LABEL_0/LABEL_1 mapping assumes the loaded checkpoint
    was trained with that label order -- confirm against the model config.
    """
    # Preprocessing can strip a comment down to nothing (for example when it
    # contained only stopwords or non-alphabetic tokens); there is nothing
    # for the model to classify in that case.
    if comment is None or not comment.strip():
        return "netral"

    # truncation=True clips inputs that exceed the model's maximum sequence
    # length instead of letting the forward pass fail on long comments.
    result = sentiment_pipeline(comment, truncation=True)[0]

    label_names = {"LABEL_1": "positif", "LABEL_0": "negatif"}
    return label_names.get(result['label'], "netral")


def preprocess_text_nltk(text):
    """
    Normalize text and drop stopwords using NLTK.

    The text is lowercased and tokenized; only purely alphabetic tokens that
    are not in ``stop_words`` are kept.

    Args:
        text (str): Original text.
    Returns:
        str: The kept tokens joined by single spaces.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        # Skip numbers, punctuation and mixed tokens.
        if not token.isalpha():
            continue
        # Skip stopwords.
        if token in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)


# Fetch every comment (and embedded reply) for the configured video.
comments = video_comments(video_id, apikey)

# Build the frame, clean the comment text, then label the cleaned text.
# Later keyword arguments of assign() see the columns set by earlier ones,
# so 'sentiment' is computed from the already-cleaned 'textDisplay'.
df = pd.DataFrame(
    comments,
    columns=['publishedAt', 'authorDisplayName', 'textDisplay', 'likeCount'],
).assign(
    textDisplay=lambda frame: frame['textDisplay'].apply(preprocess_text_nltk),
    sentiment=lambda frame: frame['textDisplay'].apply(analyze_sentiment_indo),
)
df

# Save to CSV (disabled; uncomment to export the labelled comments)
# output_path = 'dataset/youtube_comments.csv'
# df.to_csv(output_path, index=False)
# print(f"Data successfully saved to '{output_path}'")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-large-p2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


Unnamed: 0,publishedAt,authorDisplayName,textDisplay,likeCount,sentiment
0,2024-12-27T01:39:01Z,@yakubchudory,harapan rakyat indonesia harapan dunia dibawah...,0,netral
1,2024-12-26T16:29:40Z,@faizrohmat,ibuter elu,0,netral
2,2024-12-21T22:42:30Z,@YunusKayameKayame,a https amp,0,netral
3,2024-12-21T05:19:14Z,@sahabatabbahofficial834,mentri negri yordan cantik nyaaa,0,netral
4,2024-12-17T02:35:00Z,@Suryati-wl3mh,amiiinnn terima kasih prsiden pa prabowo pidat...,0,netral
...,...,...,...,...,...
8894,2024-10-20T05:19:29Z,@nurkhamid1943,lanjutkan yg pro rakyat,8,netral
8895,2024-10-20T05:18:34Z,@sofiaSari-x7w,engk beras tolong sya perkerjaan suami sya becak,4,netral
8896,2024-10-20T06:05:25Z,@isyaharema877,tegur pamong desanya rt,0,netral
8897,2024-10-20T05:17:41Z,@Sangpujangga1908,jendral,1,netral


In [None]:
from wordcloud import WordCloud

### Build the word cloud ###
# Concatenate all cleaned comments into one string for the generator.
text = ' '.join(df['textDisplay'])
cloud_options = dict(
    width=800,
    height=400,
    background_color='white',
    colormap='viridis',
    max_words=100,
)
wordcloud = WordCloud(**cloud_options).generate(text)

# Plot the word cloud
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')  # Hide the axes
ax.set_title('Word Cloud Paslon', fontsize=16)
plt.show()

In [None]:
### Build the bar plot ###
# Count the comments per sentiment label.
sentiment_counts = (
    df['sentiment']
    .value_counts()
    .rename_axis('sentiment')
    .reset_index(name='count')
)

ax = sns.barplot(
    data=sentiment_counts,
    x='sentiment',
    y='count',
    hue='sentiment',
    dodge=False,
)

sns.despine()
ax.set_title('Sentimen Positif vs Negatif vs Netral', fontsize=16)
ax.set_xlabel('Sentimen', fontsize=12)
ax.set_ylabel('Total Komentar', fontsize=12)
# hue duplicates the x axis, so replace the legend with an empty one.
ax.legend([], [], frameon=False)
plt.show()