The last step in data generation is labeling the sentiment of each comment as positive, negative, or neutral.

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

# Make sure the VADER lexicon is available locally before building the analyzer below.
# nltk reports the package as up-to-date when a current copy is already installed.
nltk.download('vader_lexicon')


def _label_sentiment(score, threshold=0.05):
    """Map a compound score to 'positive', 'negative', or 'neutral'.

    Scores at or above ``threshold`` are positive, scores at or below
    ``-threshold`` are negative, and everything strictly in between is neutral.
    """
    if score >= threshold:
        return 'positive'
    if score <= -threshold:
        return 'negative'
    return 'neutral'


def analyze_sentiment(df, analyzer=None):
    """Score and label the sentiment of every usable comment in ``df``.

    Rows whose ``body`` is missing or equal to the ``'[deleted]'`` placeholder
    are dropped. Each remaining body is scored with the analyzer's compound
    polarity score and labelled via ``_label_sentiment``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``body`` column holding the comment text.
    analyzer : object, optional
        Anything exposing ``polarity_scores(text) -> {'compound': float, ...}``.
        Defaults to a freshly constructed ``SentimentIntensityAnalyzer``; pass
        one explicitly to reuse an instance across calls.

    Returns
    -------
    pandas.DataFrame
        A filtered copy of ``df`` with two extra columns: ``vader_score``
        (compound score) and ``sentiment`` (label). ``df`` is not modified.

    Raises
    ------
    KeyError
        If ``df`` has no ``body`` column.
    """
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()

    # Keep only rows that carry real text.
    # NOTE(review): only '[deleted]' is filtered; if the data source uses other
    # placeholders (e.g. '[removed]'), confirm whether they should be dropped too.
    has_text = df['body'].notna() & (df['body'] != '[deleted]')
    df_clean = df[has_text].copy()

    # Coerce to str for scoring only, so a non-string body (e.g. a purely
    # numeric comment) is scored instead of crashing the analyzer. The stored
    # 'body' column is left untouched.
    df_clean['vader_score'] = df_clean['body'].astype(str).map(
        lambda text: analyzer.polarity_scores(text)['compound']
    )
    df_clean['sentiment'] = df_clean['vader_score'].map(_label_sentiment)

    # A bare expression inside a function is discarded, so print the class
    # distribution explicitly to actually show it.
    print(df_clean['sentiment'].value_counts())

    return df_clean

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/alex/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [10]:
import pandas as pd

# Read the raw comments, label them, then persist the labelled frame.
comments = pd.read_csv("../data/comments.csv")

# Keep the labelled frame under its own name so it can be inspected after the cell runs.
labeled_comments = analyze_sentiment(comments)
labeled_comments.to_csv("../data/labeled_comments.csv", index=False)