In [1]:
import pandas as pd
import re
import matplotlib.pyplot as plt
from transformers import pipeline
from wordcloud import WordCloud
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fetch the NLTK resources used by the preprocessing step (tokenizer + stop words).
# Recent NLTK releases load the word_tokenize data from 'punkt_tab' instead of
# 'punkt'; requesting both keeps the notebook runnable on either version.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\zakyf\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zakyf\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Read the comment sheet; header=None makes pandas treat the first row as data,
# and the two columns are named explicitly here.
df = pd.read_excel(
    'Youtube_Comment_Gibran.xlsx',
    header=None,
    names=['username', 'comment'],
)

In [4]:
# Preprocessing function
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)  # deletes ASCII punctuation
_STOP_WORDS_ID = None  # Indonesian stop-word set, built on first use


def preprocess_text(text):
    """Normalise one raw comment into a space-joined string of content words.

    Steps: drop @mentions, lowercase, delete ASCII punctuation, tokenize with
    NLTK, then keep only alphabetic tokens that are not Indonesian stop words.

    Args:
        text: The raw comment. ``None`` and float NaN (an empty spreadsheet
            cell) are treated as an empty comment; any other non-string value
            is converted with ``str()``.

    Returns:
        The cleaned comment as a single string, or ``''`` if nothing survives.
    """
    global _STOP_WORDS_ID

    # A missing cell would otherwise be stringified into the literal word "nan"
    # and end up in the word counts and the sentiment model input.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Remove username mentions (e.g., @username). Dots and hyphens are included
    # so handles such as @first.last or @name-ab12 are removed in full rather
    # than leaving their tail behind as a word.
    text = re.sub(r'@[\w.\-]+', '', text)
    text = text.lower().translate(_PUNCT_TABLE)

    # Load the stop-word list once instead of re-reading it for every comment.
    if _STOP_WORDS_ID is None:
        _STOP_WORDS_ID = set(stopwords.words('indonesian'))

    tokens = [
        word for word in word_tokenize(text)
        if word.isalpha() and word not in _STOP_WORDS_ID
    ]
    return ' '.join(tokens)

# Clean every raw comment and store the result alongside the original text
df['cleaned_comment'] = [preprocess_text(raw) for raw in df['comment']]

In [5]:
# Build the sentiment-analysis pipeline backed by the named model checkpoint
sentiment_analyzer = pipeline(
    task='sentiment-analysis',
    model='w11wo/indonesian-roberta-base-sentiment-classifier',
)




Device set to use cpu


In [6]:
# Function to classify sentiment
def classify_sentiment(text):
    """Label one cleaned comment as 'positif', 'negatif' or 'netral'.

    Args:
        text: The cleaned comment. Non-string values and blank strings are
            not sent to the model.

    Returns:
        'positif' or 'negatif' when the model reports 'positive' / 'negative';
        'netral' for any other label, for empty input, and as a best-effort
        fallback when the model call fails (the error is printed).
    """
    # Nothing to classify: missing values (e.g. NaN) or blank strings.
    if not isinstance(text, str) or not text.strip():
        return 'netral'

    label_map = {'positive': 'positif', 'negative': 'negatif'}
    try:
        # Let the tokenizer cap the input at 512 model tokens. Cutting at 512
        # whitespace-separated words is not equivalent: one word can expand to
        # several subword tokens, so long comments could still overflow the
        # model and silently fall through to the error fallback below.
        result = sentiment_analyzer(text, truncation=True, max_length=512)[0]
        return label_map.get(result['label'], 'netral')
    except Exception as e:
        print(f"Error processing text: {text[:50]}... Error: {e}")
        return 'netral'  # Fallback for any errors

# Classify every cleaned comment and store the label in a new column
df['sentiment'] = [classify_sentiment(cleaned) for cleaned in df['cleaned_comment']]

In [7]:
# Save results to CSV
df[['username', 'comment', 'sentiment']].to_csv('sentiment_results.csv', index=False)

# Visualization: Sentiment distribution
sentiment_counts = df['sentiment'].value_counts()
# value_counts() orders rows by frequency, so a positional colour list would
# give a sentiment a different colour whenever the ranking changes. Look the
# colour up by label instead so each sentiment is always drawn the same way.
sentiment_colors = {'negatif': 'red', 'netral': 'blue', 'positif': 'green'}
bar_colors = [sentiment_colors.get(label, 'gray') for label in sentiment_counts.index]
plt.figure(figsize=(8, 6))
sentiment_counts.plot(kind='bar', color=bar_colors)
plt.title('Distribusi Sentimen Komentar YouTube')
plt.xlabel('Sentimen')
plt.ylabel('Jumlah Komentar')
plt.savefig('sentiment_distribution.png')
plt.close()

In [8]:
# Word cloud for negative comments
negative_comments = ' '.join(df.loc[df['sentiment'] == 'negatif', 'cleaned_comment'])
if negative_comments.strip():  # Check if there are negative comments
    # The word placement is randomised; fixing random_state makes the saved
    # image identical from one run of the notebook to the next.
    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        min_font_size=10,
        random_state=42,
    ).generate(negative_comments)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Word Cloud Komentar Negatif')
    plt.savefig('negative_wordcloud.png')
    plt.close()

# Most common words
all_words = ' '.join(df['cleaned_comment']).split()
word_freq = Counter(all_words).most_common(10)
# If no words survived cleaning, word_freq is empty and zip(*word_freq) cannot
# be unpacked into two names; skip the chart in that case instead of raising.
if word_freq:
    words, counts = zip(*word_freq)
    plt.figure(figsize=(10, 5))
    plt.bar(words, counts, color='orange')
    plt.title('10 Kata Paling Sering Muncul')
    plt.xlabel('Kata')
    plt.ylabel('Frekuensi')
    plt.xticks(rotation=45)
    plt.savefig('word_frequency.png')
    plt.close()

# Text summary: label counts first, then one "word: count" line per frequent word
print("Ringkasan Analisis Sentimen:", sentiment_counts, sep="\n")
print("\n10 Kata Paling Sering Muncul:")
for word, freq in word_freq:
    print(f"{word}: {freq}")

Ringkasan Analisis Sentimen:
sentiment
negatif    620
netral     295
positif    283
Name: count, dtype: int64

10 Kata Paling Sering Muncul:
yg: 192
wapres: 155
like: 114
ai: 109
nya: 105
indonesia: 101
mas: 98
ga: 91
video: 88
aja: 83
