"""
# Sentiment Analysis on YouTube Public Policy Comments
**Naive Bayes Classifier for Indonesian Comments**
"""

# In [None]:
# %% [1] Import Libraries
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

# Fetch the NLTK data used later in this script: the Punkt sentence/word
# tokenizer models (for word_tokenize) and the stopword corpora.
nltk.download('punkt')
# Recent NLTK releases load the tokenizer from the 'punkt_tab' resource instead
# of the pickled 'punkt' models; without it word_tokenize raises LookupError.
nltk.download('punkt_tab')
nltk.download('stopwords')

# In [None]:

# %% [2] Load Comments Data
# Read only the comment text. usecols keeps pandas from parsing and holding
# every other column of the export just to throw it away.
comments_df = pd.read_csv('yt_comment_sample.csv', usecols=['text'])
print(f"Total Comments: {len(comments_df)}")


# In [None]:
# %% [3] Text Cleaning
def clean_comment(text):
    """Normalize a raw comment into lowercase, letters-only, single-spaced text.

    Args:
        text: The raw comment. Missing values (``None``/NaN) yield ``""``;
            any other non-string value is converted with ``str`` first.

    Returns:
        The comment lowercased, with URLs, punctuation, emojis, digits and
        underscores removed and every whitespace run collapsed to one space.
    """
    if pd.isna(text):
        return ""

    # A numeric-looking cell can arrive as int/float; calling .lower() on it
    # directly would raise AttributeError.
    text = str(text).lower()
    # 'http\S+' already matches https links, so no separate alternative is needed.
    text = re.sub(r'http\S+|www\S+', ' ', text)
    # Substitute a space (not nothing) so words separated only by punctuation,
    # e.g. "bagus,tapi", are not fused into a single token.
    text = re.sub(r'[^\w\s]|[\d_]', ' ', text)
    return re.sub(r'\s+', ' ', text).strip()

# Derive the normalized-text column by passing each raw comment through clean_comment.
comments_df['cleaned_text'] = comments_df['text'].map(clean_comment)


# In [None]:
# %% [4] Tokenization & Stopword Removal
# Combined Indonesian + English stopword vocabulary, held in a set for fast membership tests.
stop_words = set(stopwords.words('indonesian')).union(stopwords.words('english'))

def tokenize(text):
    """Split *text* into word tokens and drop stopwords.

    Returns the surviving tokens re-joined with single spaces (a string,
    not a list).
    """
    kept = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

# Stopword-filtered version of each cleaned comment, stored as a space-joined string.
comments_df['processed_text'] = comments_df['cleaned_text'].map(tokenize)

# In [None]:
# %% [5] Load Labeled Data (PREPARE THIS FIRST)
# Format: CSV with columns [text, sentiment] 
# sentiment values: 1 (positive), 0 (neutral), -1 (negative)
# Hand-labeled training set; must provide 'text' and 'sentiment' columns.
labeled_data = pd.read_csv('labeled_comments.csv')  # Replace with your labeled data
# Show how many examples each sentiment class has.
print("Label Distribution:\n", labeled_data.loc[:, 'sentiment'].value_counts())


# In [None]:
# %% [6] Build and Train Model
# Keep only rows that have both a comment and a label: TfidfVectorizer raises
# on NaN documents and NaN is not a usable class label, so a single blank cell
# in the labeled CSV would otherwise abort training.
_labeled_rows = labeled_data.dropna(subset=['text', 'sentiment'])
X = _labeled_rows['text'].astype(str)  # Original (uncleaned) text
y = _labeled_rows['sentiment']

# Hold out 20% of the labeled rows for evaluation; the seed is fixed so the
# split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# TF-IDF features (vocabulary capped at 5000 terms) feeding a multinomial
# Naive Bayes classifier.
model = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('nb', MultinomialNB())
])

model.fit(X_train, y_train)

# Evaluate on the held-out split
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


# In [None]:
# %% [7] Predict Sentiment for All Comments
# Missing comments load as NaN, which the TF-IDF step rejects with a
# ValueError. Treat them as empty strings (the same convention clean_comment
# uses) so one blank row does not abort the whole prediction run.
comments_df['sentiment'] = model.predict(
    comments_df['text'].fillna('').astype(str)  # Predict on raw text
)

# Save results
comments_df[['text', 'sentiment']].to_csv('comment_sentiments.csv', index=False)
print("Results saved to comment_sentiments.csv")


# In [None]:
# %% [8] Quick Analysis
# Tally how many comments were assigned each predicted label.
sentiment_counts = comments_df['sentiment'].value_counts()
print("\nSentiment Distribution:", sentiment_counts, sep="\n")

# Show up to three example comments for the positive (1) and negative (-1) classes.
print(
    "\nSample Positive Comments:",
    comments_df.loc[comments_df['sentiment'] == 1, 'text'].head(3).values,
    sep="\n",
)
print(
    "\nSample Negative Comments:",
    comments_df.loc[comments_df['sentiment'] == -1, 'text'].head(3).values,
    sep="\n",
)