## Test VADER

In [1]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import sys

# Fetch the 'vader_lexicon' NLTK package before the sentiment analyzer is created.
# When the package is already present locally, NLTK just reports it as up-to-date.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/bru/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [2]:
# Add ../scripts (relative to the working directory) to the import search path;
# content_tagging is presumably a project module that lives there — verify the path.
sys.path.append('../scripts')
import content_tagging

In [3]:
# Load the cleaned films table; the path is relative to the working directory.
# NOTE(review): the frame displayed further down has 'Unnamed: 0' and 'Unnamed: 0.1'
# columns, which look like index columns written into the CSV — confirm whether they
# should be dropped or read with index_col.
df = pd.read_csv('../data/clean/clean_films_id.csv')

# print(df.head())

In [4]:
# Run the project's tagging helper over the frame and show the result; it presumably
# adds the 'content_tags' column visible at the end of the displayed output.
# NOTE(review): df is reassigned from itself, so re-running this cell feeds the
# already-tagged frame back in — confirm assign_content_tags is safe to apply twice.
df = content_tagging.assign_content_tags(df)
display(df)

Unnamed: 0.1,Unnamed: 0,title,original_title,genres,director,release_year,runtime,budget,revenue,popularity,...,tmdb_votes,imdb_rating,imdb_votes,language,tmdb_id,imdb_id,doesthedog_id,events,profit,content_tags
0,0,Traffic in Souls,Traffic in Souls,"crime, drama",George Loane Tucker,1913,88,5700,1800000,2.3,...,19,5.9,751,English,96128,tt0003471,268281,,1794300,
1,1,The Birth of a Nation,The Birth of a Nation,"drama, history, war",D.W. Griffith,1915,193,100000,11000000,13.8,...,520,6.1,26938,English,618,tt0004972,68194,"sexual assault, blood or gore, falling deaths,...",10900000,"Sexual Violence and Abuse, Horror and Supernat..."
2,2,The Cheat,The Cheat,drama,Cecil B. DeMille,1915,59,17311,137365,6.6,...,64,6.5,2892,English,70368,tt0005078,47532,,120054,
3,3,Intolerance: Love's Struggle Throughout the Ages,Intolerance: Love's Struggle Throughout the Ages,"drama, history",D.W. Griffith,1916,197,385907,1750000,7.4,...,329,7.7,17121,English,3059,tt0006864,46705,"kids dying, parents dying, sexual assault, blo...",1364093,"Sexual Violence and Abuse, Horror and Supernat..."
4,4,"20,000 Leagues Under the Sea","20,000 Leagues Under the Sea","adventure, drama, action, science fiction",Stuart Paton,1916,99,200000,8000000,7.9,...,52,6.1,2066,English,30266,tt0006333,226637,"shaving or cutting, blood or gore, animals (be...",7800000,"Horror and Supernatural, Physical Violence, An..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8939,8939,Imaginary,Imaginary,"horror, mystery, thriller",Jeff Wadlow,2024,104,13000000,43787034,57.3,...,502,4.7,15700,English,1125311,tt26658104,1071201,"jump scares, people being burned alive, spider...",30787034,"Sexual Violence and Abuse, Horror and Supernat..."
8940,8940,Kill,किल,"action, crime, thriller, drama",Nikhil Nagesh Bhat,2024,105,4800000,5600000,150.0,...,268,7.6,33309,Hindi,1160018,tt28259207,684649,"kids dying, people being burned alive, parents...",800000,"Horror and Supernatural, Physical Violence, Me..."
8941,8941,The Apprentice,The Apprentice,"drama, history",Ali Abbasi,2024,122,16000000,12013393,63.5,...,260,7.1,25086,English,1182047,tt8368368,1224057,"flashing lights or images, shaving or cutting,...",-3986607,"Sexual Violence and Abuse, Horror and Supernat..."
8942,8942,The Wild Robot,The Wild Robot,"animation, action, science fiction, family",Chris Sanders,2024,102,78000000,321836235,1147.2,...,3366,8.3,93319,English,1184918,tt29623480,1059280,"flashing lights or images, parents dying, fing...",243836235,"Sexual Violence and Abuse, Horror and Supernat..."


In [5]:
# Create one shared VADER analyzer; later cells call sia.polarity_scores() on each keyword.
sia = SentimentIntensityAnalyzer()

Analyze sentiment for each keyword separately

In [None]:
# Normalise 'content_tags' so every value is a string: missing tags (NaN) become ''.
df['content_tags'] = df['content_tags'].fillna('').astype(str)

# Turn each comma-separated tag string into a clean list of keywords.
#  * every fragment is stripped, so 'A, B' yields ['A', 'B'] rather than ['A', ' B']
#    (otherwise 'B' and ' B' count as two different keywords when grouping later);
#  * empty fragments are dropped, so a film without tags gets [] instead of [''] —
#    a blank keyword would be scored as 0.0 and dilute every aggregate built on it.
df['keywords'] = df['content_tags'].apply(
    lambda tags: [tag.strip() for tag in tags.split(',') if tag.strip()]
)

def analyze_keywords_sentiment(keywords):
    """Return the VADER compound score of every keyword, in input order.

    Each keyword is stripped of surrounding whitespace and scored with the
    module-level ``sia`` analyzer; only the 'compound' value is kept.
    """
    compound_scores = []
    for raw_keyword in keywords:
        polarity = sia.polarity_scores(raw_keyword.strip())
        compound_scores.append(polarity['compound'])
    return compound_scores

# Score every keyword of every row; each row gets a list of compound scores that is
# parallel to its 'keywords' list.
df['sentiment_scores'] = df['keywords'].apply(analyze_keywords_sentiment)

# Per-row mean of the keyword scores. Rows with nothing to average get 0.0 (a float,
# so the column keeps a single numeric type).
df['average_compound'] = df['sentiment_scores'].apply(
    lambda scores: sum(scores) / len(scores) if scores else 0.0
)

# Preview the new columns with the notebook's rich display, like the other cells,
# instead of the plain-text print() rendering.
display(df.head())


In [None]:
# Show the frame again, now carrying the 'keywords', 'sentiment_scores' and
# 'average_compound' columns computed in the previous cell.
display(df)

In [None]:
# Gather the per-keyword scores of every row into one flat list
all_sentiment_scores = []
for row_scores in df['sentiment_scores']:
    all_sentiment_scores.extend(row_scores)

# Mean over all collected scores, falling back to 0 when nothing was collected
if all_sentiment_scores:
    overall_average_sentiment = sum(all_sentiment_scores) / len(all_sentiment_scores)
else:
    overall_average_sentiment = 0

# Report the result with four decimal places
print(f"Overall Average Sentiment Score: {overall_average_sentiment:.4f}")


Distribution of Average Compound Sentiment Scores

In [None]:
# Histogram (20 bins, KDE overlay) of the per-row average compound score,
# drawn on an explicit Axes rather than through the pyplot state machine
dist_fig, dist_ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['average_compound'], bins=20, kde=True, color='skyblue', ax=dist_ax)
dist_ax.set_title('Distribution of Average Compound Sentiment Scores', fontsize=16)
dist_ax.set_xlabel('Average Compound Sentiment Score', fontsize=14)
dist_ax.set_ylabel('Frequency', fontsize=14)
dist_ax.grid(True)
plt.show()


Bar Chart of Keywords and Their Average Sentiment Scores

In [None]:
# Build the ten keywords with the highest mean 'average_compound'.
# The value averaged for a keyword is the row-level 'average_compound' of every row
# that carries that keyword (not the keyword's own VADER score).
# NOTE(review): confirm that this row-level metric is the one intended for the chart.
keyword_sentiments = (
    df[['keywords', 'average_compound']]  # only the two columns needed, so explode stays cheap
    .explode('keywords')  # one row per (film, keyword) pair
    .dropna(subset=['keywords'])  # rows whose keyword list was empty explode to NaN
    .assign(keywords=lambda pairs: pairs['keywords'].str.strip())  # 'B' and ' B' are the same tag
    .loc[lambda pairs: pairs['keywords'] != '']  # drop the blank keyword of untagged rows
    .groupby('keywords')['average_compound']  # group by the normalised keyword
    .mean()  # mean row-level score per keyword
    .sort_values(ascending=False)  # highest first
    .head(10)  # keep the top 10
)

# Horizontal bar chart: one bar per keyword, bar length = mean score
plt.figure(figsize=(12, 8))
sns.barplot(x=keyword_sentiments.values, y=keyword_sentiments.index, palette='viridis')
plt.title('Top 10 Keywords by Average Sentiment Score', fontsize=16)
plt.xlabel('Average Sentiment Score', fontsize=14)
plt.ylabel('Keywords', fontsize=14)
plt.show()


Heatmap of Sentiment Scores Across Rows and Keywords

In [None]:
# Long-form table with one row per (film, keyword) pair and that keyword's own score.
# 'keywords' and 'sentiment_scores' are parallel lists, so both are exploded in a single
# call: exploding them one after the other would repeat each score list once per keyword
# and pair keywords with the wrong scores. (Multi-column explode needs pandas >= 1.3.)
heatmap_data = (
    df.explode(['keywords', 'sentiment_scores'])
    .dropna(subset=['keywords'])  # rows whose keyword list was empty explode to NaN
)
heatmap_data = heatmap_data.assign(
    keywords=heatmap_data['keywords'].str.strip(),  # 'B' and ' B' are the same tag
    sentiment=heatmap_data['sentiment_scores'].astype(float),  # explode leaves object dtype
)
heatmap_data = heatmap_data[heatmap_data['keywords'] != '']  # blank keyword of untagged rows

# Pivot to a rows x keywords matrix. The index is deliberately NOT reset after the
# explode, so it still identifies the originating row of df and each heatmap row
# collects all keywords of one film.
heatmap_pivot = heatmap_data.pivot_table(
    index=heatmap_data.index,
    columns='keywords',
    values='sentiment',
    aggfunc='mean',
)

# Plot the heatmap (cells without a value are keywords the row does not have)
plt.figure(figsize=(15, 10))
sns.heatmap(heatmap_pivot, cmap='coolwarm', annot=False, cbar=True)
plt.title('Heatmap of Sentiment Scores Across Rows and Keywords', fontsize=16)
plt.xlabel('Keywords', fontsize=14)
plt.ylabel('Row Index', fontsize=14)
plt.show()


Sentiment Trendline Across Rows

In [None]:
# Line plot of the per-row average compound score against the row index,
# drawn on an explicit Axes rather than through the pyplot state machine
trend_fig, trend_ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=df['average_compound'], color='purple', label='Sentiment Trend', ax=trend_ax)
trend_ax.set_title('Sentiment Trendline Across Rows', fontsize=16)
trend_ax.set_xlabel('Row Index', fontsize=14)
trend_ax.set_ylabel('Average Compound Sentiment Score', fontsize=14)
trend_ax.legend()
trend_ax.grid(True)
plt.show()


In [None]:
# Space-join every non-missing 'events' value into one long string
event_strings = df['events'].dropna().astype(str)
events_text = " ".join(event_strings)

# Same for 'content_tags'
content_tag_strings = df['content_tags'].dropna().astype(str)
content_tags_text = " ".join(content_tag_strings)

def generate_wordcloud(text, title):
    """Build a word cloud from ``text`` and show it in a new figure titled ``title``.

    The cloud is 800x400 on a white background and coloured with matplotlib's
    'spring' colormap; the title is drawn in hot pink and the axes are hidden.
    """
    cloud_options = dict(
        width=800,
        height=400,
        background_color='white',
        colormap='spring',
    )
    cloud = WordCloud(**cloud_options).generate(text)

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title, fontsize=16, color='hotpink')
    plt.show()

# Render one word cloud per text source: first 'events', then 'content_tags'
for cloud_text, cloud_title in (
    (events_text, 'Word Cloud for Events'),
    (content_tags_text, 'Word Cloud for Content Tags'),
):
    generate_wordcloud(cloud_text, cloud_title)
