This notebook does the sentiment analysis about the Reddit comment, using the nltk vader lexicon.

### Load the data

In [3]:
import pandas as pd

In [22]:
df = pd.read_csv('reddit_comments.csv', encoding='utf-8', index_col=0, sep=';')

In [23]:
df.head()

Unnamed: 0,comments
0,Idk what ya’ll are mad about. I’m pretty excit...
1,Can't wait to see a crippled Levi fighting din...
2,Can't wait for when Bellen Kristein would figh...
3,Honestly speaking I wouldn't mind reading it (...
4,"Beren is stunning, the ending is bad.\nReasons..."


### Clean the data

In [25]:
import re
import nltk
from nltk.corpus import stopwords
import string

nltk.download('stopwords')

def preprocess_text(text):
    # Convert text to lowercase
    processed_text = text.lower()
    # Remove URLs and user mentions
    processed_text = re.sub(r"http\S+|www\S+|https\S+|\/\/t|co\/|\@\w+", '', processed_text, flags=re.MULTILINE)
    # Remove punctuation
    processed_text = processed_text.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers
    processed_text = re.sub(r'\d+', '', processed_text)
    # Tokenize the text
    words = processed_text.split()
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]
    # Join the filtered words back into a string
    processed_text = ' '.join(filtered_words)
    return processed_text

# Apply the preprocess_text() function to each comment in the 'comments' column
df['processed_comments'] = df['comments'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to /Users/apple/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
df.head()

Unnamed: 0,comments,processed_comments
0,Idk what ya’ll are mad about. I’m pretty excit...,idk ya’ll mad i’m pretty excited beren next ge...
1,Can't wait to see a crippled Levi fighting din...,cant wait see crippled levi fighting dinosaurs...
2,Can't wait for when Bellen Kristein would figh...,cant wait bellen kristein would fight jack unc...
3,Honestly speaking I wouldn't mind reading it (...,honestly speaking wouldnt mind reading beren n...
4,"Beren is stunning, the ending is bad.\nReasons...",beren stunning ending bad reasons alliance def...


### Sentiment Analysis

In [27]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [28]:
vader = SentimentIntensityAnalyzer()

In [29]:
def get_sentiment_scores(comment):
    # Calculate sentiment scores for the comment
    sentiment_scores = vader.polarity_scores(comment)
    return sentiment_scores['pos'], sentiment_scores['neg'], sentiment_scores['neu'], sentiment_scores['compound']

# Add new columns to the DataFrame
df['pos_score'], df['neg_score'], df['neu_score'], df['compound_score'] = zip(*df['comments'].map(get_sentiment_scores))

In [30]:
df.head()

Unnamed: 0,comments,processed_comments,pos_score,neg_score,neu_score,compound_score
0,Idk what ya’ll are mad about. I’m pretty excit...,idk ya’ll mad i’m pretty excited beren next ge...,0.292,0.24,0.469,0.25
1,Can't wait to see a crippled Levi fighting din...,cant wait see crippled levi fighting dinosaurs...,0.0,0.172,0.828,-0.3612
2,Can't wait for when Bellen Kristein would figh...,cant wait bellen kristein would fight jack unc...,0.0,0.191,0.809,-0.3818
3,Honestly speaking I wouldn't mind reading it (...,honestly speaking wouldnt mind reading beren n...,0.25,0.0,0.75,0.4588
4,"Beren is stunning, the ending is bad.\nReasons...",beren stunning ending bad reasons alliance def...,0.058,0.229,0.713,-0.9584


The data frame above shows positive score, negative score, neutral score, and the compound score for each comment.

In [31]:
# add a new column sentiment, if the compound score >= 0.05, return pos, and if score > -0.05, return neg
df['sentiment'] = df['compound_score'].apply(lambda score: 'pos' if score >= 0.05 else ('neu' if score > -0.05 else 'neg'))

In [32]:
df.head()

Unnamed: 0,comments,processed_comments,pos_score,neg_score,neu_score,compound_score,sentiment
0,Idk what ya’ll are mad about. I’m pretty excit...,idk ya’ll mad i’m pretty excited beren next ge...,0.292,0.24,0.469,0.25,pos
1,Can't wait to see a crippled Levi fighting din...,cant wait see crippled levi fighting dinosaurs...,0.0,0.172,0.828,-0.3612,neg
2,Can't wait for when Bellen Kristein would figh...,cant wait bellen kristein would fight jack unc...,0.0,0.191,0.809,-0.3818,neg
3,Honestly speaking I wouldn't mind reading it (...,honestly speaking wouldnt mind reading beren n...,0.25,0.0,0.75,0.4588,pos
4,"Beren is stunning, the ending is bad.\nReasons...",beren stunning ending bad reasons alliance def...,0.058,0.229,0.713,-0.9584,neg


In [33]:
# Count the number of occurrences of each sentiment label
sentiment_counts = df['sentiment'].value_counts()

# Create a new DataFrame with the counts
sentiment_counts_df = pd.DataFrame(sentiment_counts.items(), columns=['Sentiment', 'Count'])

# Display the DataFrame with the counts
print(sentiment_counts_df)

  Sentiment  Count
0       pos     91
1       neg     82
2       neu     22
