# Sentiment Analysis - **Letterboxd DF**

In [1]:
import ast
import re

import nltk
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
# Download the NLTK resources used for the analysis: the VADER lexicon
# (required by nltk's SentimentIntensityAnalyzer) and the stopword corpus
# (presumably consumed by the sentiment_utils helpers - TODO confirm).
nltk.download('vader_lexicon')
nltk.download('stopwords')

Import the reusable sentiment-analysis helpers from the `../utils` folder.

In [3]:
# Put the shared utils folder on the import path so the local helper module
# can be imported. The path is relative to the notebook's working directory.
import sys
sys.path.append('../utils')
import sentiment_utils

In [None]:
# Load the films CSV and preview the first rows.
films = pd.read_csv('../data/clean/letterboxd_clean_films.csv')
films.head()

In [5]:
# Build the working frame without the three columns that are dropped here;
# `drop` returns a new DataFrame, so `films` itself is left untouched.
cleaned_films = films.drop(columns=['topics', 'doesthedog_id', 'letterboxd_id'])

In [None]:
# Preview the working frame after the column drop.
cleaned_films.head()


In [7]:
# Discard every row that has a missing value in any remaining column.
cleaned_films = cleaned_films.dropna(axis=0, how='any')

In [None]:
# Column dtypes and non-null counts after dropping incomplete rows.
cleaned_films.info()

In [None]:
# Render the working frame (pandas truncates long frames when displaying).
display(cleaned_films)

#### Preprocess Text Data
- Clean the text by removing irrelevant characters, stopwords, and converting text to lowercase.
- Handle missing values in columns like genres, events, summary, etc.
- Tokenize the text and split genres by commas if necessary.

In [10]:
def clean_title(title):
    """Normalise a free-text value by stripping punctuation and extra whitespace.

    Every character that is neither a word character nor whitespace is
    removed. In Python 3 ``\\w`` is Unicode-aware for ``str`` patterns, so
    accented letters (e.g. "é", "ñ") are kept without listing them explicitly.
    Whitespace is collapsed *after* the punctuation is removed, so a separator
    such as " - " does not leave a double space or a trailing blank behind.

    Parameters
    ----------
    title : Any
        Value to clean; only ``str`` inputs are processed.

    Returns
    -------
    str or None
        The cleaned text, or ``None`` when ``title`` is not a string.
    """
    if not isinstance(title, str):
        return None

    # Remove punctuation/symbols first ...
    without_symbols = re.sub(r'[^\w\s]', '', title)
    # ... then collapse the whitespace runs that removal may have created.
    return re.sub(r'\s+', ' ', without_symbols).strip()


# Run clean_title over both free-text columns. Assigning through .loc avoids
# pandas' SettingWithCopyWarning.
for text_column in ('title', 'summary'):
    cleaned_films.loc[:, text_column] = cleaned_films[text_column].apply(clean_title)

In [None]:
# Inspect the frame after the title/summary text has been cleaned.
display(cleaned_films)

### Sentiment Analysis

#### 1. Exploding Columns with Multiple Values

In [None]:
# Explode the multi-valued 'genres' column via the shared helper (expected to
# yield one row per genre - see sentiment_utils) and preview the result.
films_exploded_genres = sentiment_utils.explode_column(cleaned_films, 'genres')
films_exploded_genres.loc[:, ['title', 'genres']].head()

In [None]:
# Explode the multi-valued 'language' column via the shared helper (expected
# to yield one row per language - see sentiment_utils) and preview the result.
films_exploded_languages = sentiment_utils.explode_column(cleaned_films, 'language')
films_exploded_languages.loc[:, ['title', 'language']].head()

In [None]:
# Explode the multi-valued 'countries' column via the shared helper (expected
# to yield one row per country - see sentiment_utils) and preview the result.
films_exploded_countries = sentiment_utils.explode_column(cleaned_films, 'countries')
films_exploded_countries.loc[:, ['title', 'countries']].head()

In [None]:
# Explode the multi-valued 'events' column via the shared helper (expected to
# yield one row per event - see sentiment_utils) and preview the result.
films_exploded_events = sentiment_utils.explode_column(cleaned_films, 'events')
films_exploded_events.loc[:, ['title', 'events']].head()

In [None]:
# 'themes' appears to be stored in the CSV as the string form of a Python
# list. Parse it with ast.literal_eval, which only accepts Python literals,
# rather than eval(), which would execute any code embedded in the data.
# Non-string values (e.g. already-parsed lists) pass through unchanged, so
# re-running this cell is harmless.
cleaned_films['themes'] = cleaned_films['themes'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

# One row per list element of 'themes'.
films_exploded_themes = cleaned_films.explode('themes')

films_exploded_themes[['title', 'themes']].head()

#### 2. Sentiment Analysis on Exploded DFs

In [21]:
# Score the 'title', 'summary' and 'tagline' text of the genre-exploded frame.
# The helper's result replaces the frame; it is expected to add the
# sentiment_* columns that later cells reference (see sentiment_utils).
films_exploded_genres = sentiment_utils.add_sentiment_columns(films_exploded_genres, ['title', 'summary', 'tagline'])

In [22]:
# Score the 'title', 'summary' and 'tagline' text of the language-exploded
# frame; the helper's result replaces the frame.
films_exploded_languages = sentiment_utils.add_sentiment_columns(films_exploded_languages, ['title', 'summary', 'tagline'])

In [23]:
# Score the 'title', 'summary' and 'tagline' text of the country-exploded
# frame; the helper's result replaces the frame.
films_exploded_countries = sentiment_utils.add_sentiment_columns(films_exploded_countries, ['title', 'summary', 'tagline'])

In [None]:
# Score the 'title', 'summary' and 'tagline' text of the event-exploded
# frame; the helper's result replaces the frame.
films_exploded_events = sentiment_utils.add_sentiment_columns(films_exploded_events, ['title', 'summary', 'tagline'])

In [None]:
# Score the 'title', 'summary' and 'tagline' text of the theme-exploded
# frame; the helper's result replaces the frame.
films_exploded_themes = sentiment_utils.add_sentiment_columns(films_exploded_themes, ['title', 'summary', 'tagline'])

#### 3. Combine Sentiment Scores for Each Exploded DataFrame

In [None]:
# Per-text sentiment columns that are combined into one overall score.
# This list is reused by the following cells.
sentiment_columns = ['sentiment_title', 'sentiment_summary', 'sentiment_tagline']

# Combine the per-text scores for the genre-exploded frame, then preview the
# result as a rich table (last expression) instead of a plain-text print.
films_exploded_genres = sentiment_utils.calculate_overall_sentiment(films_exploded_genres, sentiment_columns)
films_exploded_genres[['title', 'genres', 'overall_sentiment']].head()

In [None]:
# Combine the per-text scores for the language-exploded frame, then preview
# the result as a rich table (last expression) instead of a plain-text print.
films_exploded_languages = sentiment_utils.calculate_overall_sentiment(films_exploded_languages, sentiment_columns)
films_exploded_languages[['title', 'language', 'overall_sentiment']].head()

In [None]:
# Combine the per-text scores for the country-exploded frame, then preview
# the result as a rich table (last expression) instead of a plain-text print.
films_exploded_countries = sentiment_utils.calculate_overall_sentiment(films_exploded_countries, sentiment_columns)
films_exploded_countries[['title', 'countries', 'overall_sentiment']].head()

In [None]:
# Combine the per-text scores for the event-exploded frame, then preview the
# result as a rich table (last expression) instead of a plain-text print.
films_exploded_events = sentiment_utils.calculate_overall_sentiment(films_exploded_events, sentiment_columns)
films_exploded_events[['title', 'events', 'overall_sentiment']].head()

In [None]:
# Combine the per-text scores for the theme-exploded frame, then preview the
# result as a rich table (last expression) instead of a plain-text print.
films_exploded_themes = sentiment_utils.calculate_overall_sentiment(films_exploded_themes, sentiment_columns)
films_exploded_themes[['title', 'themes', 'overall_sentiment']].head()

#### 4. Visualizing Sentiment vs. Rating

#### **Sentiment vs. Rating for Genres**

In [None]:
# Distribution of the overall sentiment score per genre. The title and x-axis
# label name the category actually plotted (genres).
sentiment_utils.plot_sentiment_distribution(
    films_exploded_genres,
    category_column='genres',
    sentiment_column='overall_sentiment',
    title='Sentiment Distribution by Genre',
    xaxis_title='Genre',
    yaxis_title='Overall Sentiment'
)

#### **Sentiment vs. Rating for Languages**

In [None]:
# Distribution of the overall sentiment score per language.
sentiment_utils.plot_sentiment_distribution(
    films_exploded_languages, 
    category_column='language', 
    sentiment_column='overall_sentiment', 
    title='Sentiment Distribution by Language',
    xaxis_title='Language', 
    yaxis_title='Overall Sentiment'
)

#### **Sentiment vs. Rating for Events**

In [None]:


# Distribution of the overall sentiment score per event.
sentiment_utils.plot_sentiment_distribution(
    films_exploded_events, 
    category_column='events', 
    sentiment_column='overall_sentiment', 
    title='Sentiment Distribution by Events',
    xaxis_title='Event', 
    yaxis_title='Overall Sentiment'
)

#### **Sentiment vs. Rating for Themes**

In [None]:
# Distribution of the overall sentiment score per theme.
sentiment_utils.plot_sentiment_distribution(
    films_exploded_themes, 
    category_column='themes', 
    sentiment_column='overall_sentiment', 
    title='Sentiment Distribution by Themes',
    xaxis_title='Theme', 
    yaxis_title='Overall Sentiment'
)

To understand how themes, events, genres, and the presence of content warnings (`has_warnings`) relate to a film's rating (`letterboxd_rating`), we can use statistical or machine-learning techniques to analyze the relationships between these variables and the rating.

In [None]:
# Rebuild the exploded frames from copies of cleaned_films for the rating
# analysis. This rebinds the films_exploded_* names, so the sentiment columns
# added to the earlier frames are not carried over.
# NOTE(review): 'themes' values were converted from strings in an earlier
# cell - confirm explode_column_from_string also accepts non-string values.
films_exploded_themes = sentiment_utils.explode_column_from_string(cleaned_films.copy(), 'themes')
films_exploded_genres = sentiment_utils.explode_column_from_string(cleaned_films.copy(), 'genres')
films_exploded_events = sentiment_utils.explode_column_from_string(cleaned_films.copy(), 'events')

# Rich previews of all three frames instead of plain-text print() output.
display(
    films_exploded_themes[['title', 'themes']].head(),
    films_exploded_genres[['title', 'genres']].head(),
    films_exploded_events[['title', 'events']].head(),
)

#### Encoding Categorical Variables

In [None]:
# One-hot encode the exploded category column of each frame.
films_exploded_themes_encoded = pd.get_dummies(films_exploded_themes, columns=['themes'])
films_exploded_genres_encoded = pd.get_dummies(films_exploded_genres, columns=['genres'])
films_exploded_events_encoded = pd.get_dummies(films_exploded_events, columns=['events'])

# Cast 'has_warnings' to integers.
# NOTE(review): only the themes frame is cast; the genres and events frames
# keep their original 'has_warnings' dtype - confirm this is intended.
films_exploded_themes_encoded['has_warnings'] = (
    films_exploded_themes_encoded['has_warnings'].astype(int)
)

# Inner-join the three encoded frames on the film-identifying columns.
# NOTE(review): the frames presumably share their other columns too, in which
# case pandas suffixes the overlapping names (_x / _y) - verify the resulting
# column names before selecting from the merged frame.
merge_keys = ['title', 'release_year', 'letterboxd_rating']
films_exploded_merged = (
    films_exploded_themes_encoded
    .merge(films_exploded_genres_encoded, how='inner', on=merge_keys)
    .merge(films_exploded_events_encoded, how='inner', on=merge_keys)
)

films_exploded_merged.head(2)

#### Correlation and Feature Importance

In [None]:
# Names of the numeric-dtype columns in the merged frame.
numeric_columns = films_exploded_merged.select_dtypes(include=['number']).columns

# Numbered listing (1-based) of those columns.
print("Numerical Features in the Dataset:")
for position, column_name in enumerate(numeric_columns, start=1):
    print(f"{position}. {column_name}")

In [None]:

# Features included in the correlation analysis.
numeric_columns = ['release_year', 'letterboxd_rating', 'runtime']

# Pairwise correlation of the selected features.
correlation_matrix = films_exploded_merged[numeric_columns].corr()

# Both axes carry the same feature labels, placed at positions 0..n-1.
feature_labels = correlation_matrix.columns
tick_positions = list(range(len(feature_labels)))

heatmap = go.Heatmap(
    z=correlation_matrix.values,
    x=feature_labels,
    y=feature_labels,
    colorscale='Viridis',
    colorbar=dict(title="Correlation"),
)
fig = go.Figure(data=heatmap)

fig.update_layout(
    title="Correlation Matrix of Numerical Features",
    xaxis_title="Features",
    yaxis_title="Features",
    xaxis=dict(tickmode='array', tickvals=tick_positions, ticktext=feature_labels),
    yaxis=dict(tickmode='array', tickvals=tick_positions, ticktext=feature_labels),
    height=800,
)

fig.show()