# Sentiment Analysis - **Letterboxd DF**

In [1]:
import ast
import re

import nltk
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [2]:
# Fetch the NLTK data packages used by this notebook.
# vader_lexicon is the lexicon behind the SentimentIntensityAnalyzer imported above.
nltk.download('vader_lexicon')
# NOTE(review): stopwords is not referenced in any visible cell — presumably it is
# consumed inside sentiment_utils; confirm, or drop this download.
nltk.download('stopwords')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/bru/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to /home/bru/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Import the reusable sentiment-analysis helpers (`sentiment_utils`) from the `../utils` folder.

In [3]:
import sys
sys.path.append('../utils')
import sentiment_utils

In [4]:
# Read the cleaned Letterboxd export and preview its first rows.
films_csv_path = '../data/clean/letterboxd_clean_films.csv'
films = pd.read_csv(films_csv_path)
films.head()

Unnamed: 0,letterboxd_id,title,release_year,tagline,summary,runtime,letterboxd_rating,genres,language,countries,themes,director,topics,doesthedog_id,events,has_warnings
0,1000001,Barbie,2023,She's everything. He's just Ken.,Barbie and Ken are having the time of their li...,114,3.86,"Comedy, Adventure",English,"UK, USA","['Humanity and the world around us', 'Crude hu...",Greta Gerwig,,381345.0,,False
1,1000002,Parasite,2019,Act like you own the place.,"All unemployed, Ki-taek's family takes peculia...",133,4.56,"Comedy, Thriller, Drama","Korean, English, German, Korean",South Korea,"['Humanity and the world around us', 'Intense ...",Bong Joon-ho,,19408.0,,False
2,1000003,Everything Everywhere All at Once,2022,The universe is so much bigger than you realize.,An aging Chinese immigrant is swept up in an i...,140,4.3,"Science Fiction, Adventure, Comedy, Action","English, Cantonese, Chinese, English",USA,"['Humanity and the world around us', 'Moving r...","Daniel Scheinert, Daniel Kwan",,121671.0,,False
3,1000004,Fight Club,1999,Mischief. Mayhem. Soap.,A ticking-time-bomb insomniac and a slippery s...,139,4.27,Drama,English,"Germany, USA","['Intense violence and sexual transgression', ...",David Fincher,,9593.0,,False
4,1000005,La La Land,2016,Here's to the fools who dream.,"Mia, an aspiring actress, serves lattes to mov...",129,4.09,"Drama, Comedy, Music, Romance",English,"Hong Kong, USA","['Song and dance', 'Humanity and the world aro...",Damien Chazelle,167176222260266269339363,12823.0,"flashing lights or images, shower scenes, sad ...",True


In [5]:
# Build the working frame without the identifier/topic columns.
# drop() returns a new DataFrame, so `films` itself is left untouched.
cleaned_films = films.drop(columns=['topics', 'doesthedog_id', 'letterboxd_id'])

In [6]:
# Preview the working frame after the column drop.
cleaned_films.head()


Unnamed: 0,title,release_year,tagline,summary,runtime,letterboxd_rating,genres,language,countries,themes,director,events,has_warnings
0,Barbie,2023,She's everything. He's just Ken.,Barbie and Ken are having the time of their li...,114,3.86,"Comedy, Adventure",English,"UK, USA","['Humanity and the world around us', 'Crude hu...",Greta Gerwig,,False
1,Parasite,2019,Act like you own the place.,"All unemployed, Ki-taek's family takes peculia...",133,4.56,"Comedy, Thriller, Drama","Korean, English, German, Korean",South Korea,"['Humanity and the world around us', 'Intense ...",Bong Joon-ho,,False
2,Everything Everywhere All at Once,2022,The universe is so much bigger than you realize.,An aging Chinese immigrant is swept up in an i...,140,4.3,"Science Fiction, Adventure, Comedy, Action","English, Cantonese, Chinese, English",USA,"['Humanity and the world around us', 'Moving r...","Daniel Scheinert, Daniel Kwan",,False
3,Fight Club,1999,Mischief. Mayhem. Soap.,A ticking-time-bomb insomniac and a slippery s...,139,4.27,Drama,English,"Germany, USA","['Intense violence and sexual transgression', ...",David Fincher,,False
4,La La Land,2016,Here's to the fools who dream.,"Mia, an aspiring actress, serves lattes to mov...",129,4.09,"Drama, Comedy, Music, Romance",English,"Hong Kong, USA","['Song and dance', 'Humanity and the world aro...",Damien Chazelle,"flashing lights or images, shower scenes, sad ...",True


In [7]:
# Keep only films that have a value in every remaining column.
# NOTE(review): this is a very aggressive filter — the recorded info() output reports
# 4203 rows while index labels run up to 18449, and the earlier preview shows `events`
# empty for the first four films. Consider dropna(subset=[...]) restricted to the
# columns each analysis actually needs.
cleaned_films = cleaned_films.dropna()

In [8]:
# Check row count, non-null counts and dtypes after dropping incomplete rows.
cleaned_films.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4203 entries, 4 to 18449
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   title              4203 non-null   object 
 1   release_year       4203 non-null   int64  
 2   tagline            4203 non-null   object 
 3   summary            4203 non-null   object 
 4   runtime            4203 non-null   int64  
 5   letterboxd_rating  4203 non-null   float64
 6   genres             4203 non-null   object 
 7   language           4203 non-null   object 
 8   countries          4203 non-null   object 
 9   themes             4203 non-null   object 
 10  director           4203 non-null   object 
 11  events             4203 non-null   object 
dtypes: bool(1), float64(1), int64(2), object(9)
memory usage: 431.0+ KB


In [9]:
# Show the filtered frame (pandas truncates the rendering to the first/last rows).
display(cleaned_films)

Unnamed: 0,title,release_year,tagline,summary,runtime,letterboxd_rating,genres,language,countries,themes,director,events,has_warnings
4,La La Land,2016,Here's to the fools who dream.,"Mia, an aspiring actress, serves lattes to mov...",129,4.09,"Drama, Comedy, Music, Romance",English,"Hong Kong, USA","['Song and dance', 'Humanity and the world aro...",Damien Chazelle,"flashing lights or images, shower scenes, sad ...",True
11,Whiplash,2014,The road to greatness can take you to the edge.,"Under the direction of a ruthless instructor, ...",107,4.43,"Drama, Music",English,USA,"['Moving relationship stories', 'Student comin...",Damien Chazelle,"finger or toe mutilation, spitting, car crashe...",True
32,Once Upon a Time in Hollywood,2019,"In this town, it can all change… like that","Los Angeles, 1969. TV star Rick Dalton, a stru...",162,3.76,"Drama, Thriller, Comedy","English, English, Italian, Spanish","China, UK, USA","['Humanity and the world around us', 'Fascinat...",Quentin Tarantino,"people being burned alive, spitting, blood or ...",True
39,Glass Onion,2022,"When the game ends, the mystery begins.",World-famous detective Benoit Blanc heads to G...,140,3.45,"Comedy, Crime, Mystery",English,USA,"['Thrillers and murder mysteries', 'Intriguing...",Rian Johnson,"flashing lights or images, car crashes, people...",True
68,Coco,2017,The celebration of a lifetime,Despite his family’s baffling generations-old ...,105,4.12,"Adventure, Animation, Music, Family","English, English, Spanish",USA,"['Moving relationship stories', 'Song and danc...",Lee Unkrich,"parents dying, spitting, ghosts, child abuse, ...",True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
18425,The Triangle,2001,"In the Bermuda Triangle, nothing stays lost fo...",This made-for-TV movie follows a group of frie...,92,2.89,"Thriller, Horror, TV Movie",English,"USA, Canada, Barbados","['Horror, the undead and monster classics', 'T...",Lewis Teague,"kids dying, parents dying, shaving or cutting,...",True
18427,CAT,2022,Drugs. Deceit. Danger.,"Living under an alias, a former police informa...",360,3.50,"Crime, Drama",Hindi,India,"['Crime, drugs and gangsters', 'Intense politi...",Balwinder Singh Janjua,"people being burned alive, flashing lights or ...",True
18429,Wraith,2017,There's Something in My Room,After living in an old mansion for almost 10 y...,99,2.61,"Mystery, Thriller, Horror",English,USA,"['Faith and religion', 'Terrifying, haunted, a...",Michael O. Sajbel,"people being burned alive, spitting, shaky cam...",True
18436,Tulsa,2020,Big changes come in small packages,A desperate marine biker’s life is turned upsi...,120,2.97,"Comedy, Drama",English,USA,"['Faith and religion', 'Moving relationship st...","Gloria Stella, Scott Pryor","kids dying, parents dying, car crashes, people...",True


#### Preprocess Text Data
- Clean the text by removing irrelevant characters, stopwords, and converting text to lowercase.
- Handle missing values in columns like genres, events, summary, etc.
- Tokenize the text and split genres by commas if necessary.

In [10]:
def clean_title(title):
    """Strip punctuation/symbols from a text value and normalise its whitespace.

    Despite its name, the function is generic: it works on any text value.

    Args:
        title: Value to clean. Only ``str`` instances are processed.

    Returns:
        The cleaned string (letters, digits, underscores and single spaces
        only, with no leading/trailing whitespace), or ``None`` when ``title``
        is not a string (for example a float NaN coming from pandas).
    """
    if not isinstance(title, str):
        return None

    # Delete every character that is neither a word character nor whitespace.
    # For str patterns \w is Unicode-aware, so accented letters are preserved
    # without having to enumerate them in the character class.
    title = re.sub(r'[^\w\s]', '', title)

    # Normalise whitespace only AFTER the deletion above: doing it first left
    # double spaces ("a - b" -> "a  b") and stray edge spaces ("- a" -> " a").
    return re.sub(r'\s+', ' ', title).strip()


# Clean both free-text columns with clean_title; assigning through .loc
# avoids pandas' SettingWithCopyWarning.
for text_column in ('title', 'summary'):
    cleaned_films.loc[:, text_column] = cleaned_films[text_column].apply(clean_title)

In [11]:
# Inspect the frame after cleaning `title` and `summary` (punctuation is now removed).
display(cleaned_films)

Unnamed: 0,title,release_year,tagline,summary,runtime,letterboxd_rating,genres,language,countries,themes,director,events,has_warnings
4,La La Land,2016,Here's to the fools who dream.,Mia an aspiring actress serves lattes to movie...,129,4.09,"Drama, Comedy, Music, Romance",English,"Hong Kong, USA","['Song and dance', 'Humanity and the world aro...",Damien Chazelle,"flashing lights or images, shower scenes, sad ...",True
11,Whiplash,2014,The road to greatness can take you to the edge.,Under the direction of a ruthless instructor a...,107,4.43,"Drama, Music",English,USA,"['Moving relationship stories', 'Student comin...",Damien Chazelle,"finger or toe mutilation, spitting, car crashe...",True
32,Once Upon a Time in Hollywood,2019,"In this town, it can all change… like that",Los Angeles 1969 TV star Rick Dalton a struggl...,162,3.76,"Drama, Thriller, Comedy","English, English, Italian, Spanish","China, UK, USA","['Humanity and the world around us', 'Fascinat...",Quentin Tarantino,"people being burned alive, spitting, blood or ...",True
39,Glass Onion,2022,"When the game ends, the mystery begins.",Worldfamous detective Benoit Blanc heads to Gr...,140,3.45,"Comedy, Crime, Mystery",English,USA,"['Thrillers and murder mysteries', 'Intriguing...",Rian Johnson,"flashing lights or images, car crashes, people...",True
68,Coco,2017,The celebration of a lifetime,Despite his familys baffling generationsold ba...,105,4.12,"Adventure, Animation, Music, Family","English, English, Spanish",USA,"['Moving relationship stories', 'Song and danc...",Lee Unkrich,"parents dying, spitting, ghosts, child abuse, ...",True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
18425,The Triangle,2001,"In the Bermuda Triangle, nothing stays lost fo...",This madeforTV movie follows a group of friend...,92,2.89,"Thriller, Horror, TV Movie",English,"USA, Canada, Barbados","['Horror, the undead and monster classics', 'T...",Lewis Teague,"kids dying, parents dying, shaving or cutting,...",True
18427,CAT,2022,Drugs. Deceit. Danger.,Living under an alias a former police informan...,360,3.50,"Crime, Drama",Hindi,India,"['Crime, drugs and gangsters', 'Intense politi...",Balwinder Singh Janjua,"people being burned alive, flashing lights or ...",True
18429,Wraith,2017,There's Something in My Room,After living in an old mansion for almost 10 y...,99,2.61,"Mystery, Thriller, Horror",English,USA,"['Faith and religion', 'Terrifying, haunted, a...",Michael O. Sajbel,"people being burned alive, spitting, shaky cam...",True
18436,Tulsa,2020,Big changes come in small packages,A desperate marine bikers life is turned upsid...,120,2.97,"Comedy, Drama",English,USA,"['Faith and religion', 'Moving relationship st...","Gloria Stella, Scott Pryor","kids dying, parents dying, car crashes, people...",True


### Sentiment Analysis

#### 1. Exploding Columns with Multiple Values

In [12]:
# One row per (film, genre): the recorded output repeats the film's index label per genre.
# NOTE(review): later outputs show `genres`/`language`/`countries` of cleaned_films-derived
# frames holding lists instead of strings, which suggests explode_column rewrites the
# column on the frame it is given — confirm in sentiment_utils and pass a copy if so.
films_exploded_genres = sentiment_utils.explode_column(cleaned_films, 'genres')
films_exploded_genres[['title', 'genres']].head()

Unnamed: 0,title,genres
4,La La Land,Drama
4,La La Land,Comedy
4,La La Land,Music
4,La La Land,Romance
11,Whiplash,Drama


In [13]:
# One row per (film, language).
# NOTE(review): the recorded output lists "English" twice for the same film, so the
# source column contains repeated languages — consider de-duplicating before exploding.
films_exploded_languages = sentiment_utils.explode_column(cleaned_films, 'language')
films_exploded_languages[['title', 'language']].head()

Unnamed: 0,title,language
4,La La Land,English
11,Whiplash,English
32,Once Upon a Time in Hollywood,English
32,Once Upon a Time in Hollywood,English
32,Once Upon a Time in Hollywood,Italian


In [14]:
# One row per (film, production country).
films_exploded_countries = sentiment_utils.explode_column(cleaned_films, 'countries')
films_exploded_countries[['title', 'countries']].head()

Unnamed: 0,title,countries
4,La La Land,Hong Kong
4,La La Land,USA
11,Whiplash,USA
32,Once Upon a Time in Hollywood,China
32,Once Upon a Time in Hollywood,UK


In [15]:
# One row per (film, content-warning event).
films_exploded_events = sentiment_utils.explode_column(cleaned_films, 'events')
films_exploded_events[['title', 'events']].head()

Unnamed: 0,title,events
4,La La Land,flashing lights or images
4,La La Land,shower scenes
4,La La Land,sad endings
4,La La Land,misophonia
4,La La Land,babies/unborn


In [16]:
# `themes` is stored in the CSV as the text of a Python list, so it has to be parsed
# before it can be exploded. ast.literal_eval accepts only Python literals, whereas
# eval() would execute any expression that happens to be in the data file.
# Values that are no longer strings (cell re-run) are passed through unchanged.
cleaned_films['themes'] = cleaned_films['themes'].apply(
    lambda raw_themes: ast.literal_eval(raw_themes) if isinstance(raw_themes, str) else raw_themes
)

# One row per (film, theme); the film's index label is repeated for each theme.
films_exploded_themes = cleaned_films.explode('themes')

films_exploded_themes[['title', 'themes']].head()

Unnamed: 0,title,themes
4,La La Land,Song and dance
4,La La Land,Humanity and the world around us
4,La La Land,Moving relationship stories
4,La La Land,Dazzling vocal performances and musicals
4,La La Land,Captivating relationships and charming romance


#### 2. Sentiment Analysis on Exploded DFs

In [21]:
# Score the title, summary and tagline text of every genre-exploded row.
# Presumably this adds one sentiment_<column> column per text column — later cells read
# sentiment_title / sentiment_summary / sentiment_tagline; confirm in sentiment_utils.
films_exploded_genres = sentiment_utils.add_sentiment_columns(films_exploded_genres, ['title', 'summary', 'tagline'])

In [22]:
# Score the title, summary and tagline text of every language-exploded row.
films_exploded_languages = sentiment_utils.add_sentiment_columns(films_exploded_languages, ['title', 'summary', 'tagline'])

In [23]:
# Score the title, summary and tagline text of every country-exploded row.
films_exploded_countries = sentiment_utils.add_sentiment_columns(films_exploded_countries, ['title', 'summary', 'tagline'])

In [None]:
# Score the title, summary and tagline text of every event-exploded row.
# NOTE(review): this cell has no execution count in the saved notebook, and the later
# events cells fail because only `sentiment_title` exists on this frame — it looks like
# the run was interrupted part-way. Re-run to completion (Restart & Run All) before saving.
films_exploded_events = sentiment_utils.add_sentiment_columns(films_exploded_events, ['title', 'summary', 'tagline'])

In [None]:
# Score the title, summary and tagline text of every theme-exploded row.
# NOTE(review): this cell has no execution count in the saved notebook, and the later
# themes cells fail because no sentiment_* column exists on this frame — it must be run
# to completion (Restart & Run All) before the cells below can work.
films_exploded_themes = sentiment_utils.add_sentiment_columns(films_exploded_themes, ['title', 'summary', 'tagline'])

#### 3. Combine Sentiment Scores for Each Exploded DataFrame

In [26]:
# Names of the per-text sentiment columns handed to calculate_overall_sentiment.
sentiment_columns = [f'sentiment_{text_column}' for text_column in ('title', 'summary', 'tagline')]

# Add `overall_sentiment` to the genre-exploded frame and preview the result.
films_exploded_genres = sentiment_utils.calculate_overall_sentiment(films_exploded_genres, sentiment_columns)
print(films_exploded_genres[['title', 'genres', 'overall_sentiment']].head())

         title    genres  overall_sentiment
4   La La Land     Drama           0.203733
4   La La Land    Comedy           0.203733
4   La La Land     Music           0.203733
4   La La Land   Romance           0.203733
11    Whiplash     Drama           0.263533


In [27]:
# Add `overall_sentiment` to the language-exploded frame and preview the result.
films_exploded_languages = sentiment_utils.calculate_overall_sentiment(films_exploded_languages, sentiment_columns)
print(films_exploded_languages[['title', 'language', 'overall_sentiment']].head())

                            title  language  overall_sentiment
4                      La La Land   English           0.203733
11                       Whiplash   English           0.263533
32  Once Upon a Time in Hollywood   English           0.389533
32  Once Upon a Time in Hollywood   English           0.389533
32  Once Upon a Time in Hollywood   Italian           0.389533


In [28]:
# Add `overall_sentiment` to the country-exploded frame and preview the result.
films_exploded_countries = sentiment_utils.calculate_overall_sentiment(films_exploded_countries, sentiment_columns)
print(films_exploded_countries[['title', 'countries', 'overall_sentiment']].head())

                            title  countries  overall_sentiment
4                      La La Land  Hong Kong           0.203733
4                      La La Land        USA           0.203733
11                       Whiplash        USA           0.263533
32  Once Upon a Time in Hollywood      China           0.389533
32  Once Upon a Time in Hollywood         UK           0.389533


In [29]:
# Add `overall_sentiment` to the event-exploded frame and preview the result.
# NOTE(review): the saved run raised KeyError here because sentiment_summary and
# sentiment_tagline are missing — the add_sentiment_columns cell for events must
# finish first.
films_exploded_events = sentiment_utils.calculate_overall_sentiment(films_exploded_events, sentiment_columns)
print(films_exploded_events[['title', 'events', 'overall_sentiment']].head())

KeyError: "['sentiment_summary', 'sentiment_tagline'] not in index"

In [30]:
# Add `overall_sentiment` to the theme-exploded frame and preview the result.
# NOTE(review): the saved run raised KeyError here because none of the sentiment_*
# columns exist — the add_sentiment_columns cell for themes must be run first.
films_exploded_themes = sentiment_utils.calculate_overall_sentiment(films_exploded_themes, sentiment_columns)
print(films_exploded_themes[['title', 'themes', 'overall_sentiment']].head())

KeyError: "None of [Index(['sentiment_title', 'sentiment_summary', 'sentiment_tagline'], dtype='object')] are in the [columns]"

#### 4. Visualizing Sentiment vs. Rating

#### **Sentiment vs. Rating for Genres**

In [31]:
# Distribution of the overall sentiment score within each genre.
# (Title typo and the copy-pasted 'Theme' axis label are corrected to 'Genre'.)
sentiment_utils.plot_sentiment_distribution(
    films_exploded_genres,
    category_column='genres',
    sentiment_column='overall_sentiment',
    title='Sentiment Distribution by Genre',
    xaxis_title='Genre',
    yaxis_title='Overall Sentiment'
)

#### **Sentiment vs. Rating for Languages**

In [33]:
# Distribution of the overall sentiment score within each language.
sentiment_utils.plot_sentiment_distribution(
    films_exploded_languages, 
    category_column='language', 
    sentiment_column='overall_sentiment', 
    title='Sentiment Distribution by Language',
    xaxis_title='Language', 
    yaxis_title='Overall Sentiment'
)

#### **Sentiment vs. Rating for Events**

In [34]:


# Distribution of the overall sentiment score within each content-warning event.
# NOTE(review): the saved run raised ValueError because `overall_sentiment` was never
# created on films_exploded_events (its calculate_overall_sentiment cell failed).
sentiment_utils.plot_sentiment_distribution(
    films_exploded_events, 
    category_column='events', 
    sentiment_column='overall_sentiment', 
    title='Sentiment Distribution by Events',
    xaxis_title='Event', 
    yaxis_title='Overall Sentiment'
)

ValueError: Value of 'y' is not the name of a column in 'data_frame'. Expected one of ['title', 'release_year', 'tagline', 'summary', 'runtime', 'letterboxd_rating', 'genres', 'language', 'countries', 'themes', 'director', 'events', 'has_warnings', 'sentiment_title'] but received: overall_sentiment

#### **Sentiment vs. Rating for Themes**

In [36]:
# Distribution of the overall sentiment score within each theme.
# NOTE(review): the saved run raised ValueError because `overall_sentiment` was never
# created on films_exploded_themes (its calculate_overall_sentiment cell failed).
sentiment_utils.plot_sentiment_distribution(
    films_exploded_themes, 
    category_column='themes', 
    sentiment_column='overall_sentiment', 
    title='Sentiment Distribution by Themes',
    xaxis_title='Theme', 
    yaxis_title='Overall Sentiment'
)

ValueError: Value of 'y' is not the name of a column in 'data_frame'. Expected one of ['title', 'release_year', 'tagline', 'summary', 'runtime', 'letterboxd_rating', 'genres', 'language', 'countries', 'themes', 'director', 'events', 'has_warnings'] but received: overall_sentiment

To understand how factors like themes, events, genres, and whether a title has warnings (has_warnings) impact the rating (letterboxd_rating), we can use statistical or machine learning techniques to analyze the relationships between these variables and the letterboxd_rating.

In [38]:
# Rebuild the exploded frames for the rating analysis, each from a fresh copy of cleaned_films.
# This rebinds films_exploded_themes/genres/events, so the frames carrying the sentiment
# columns from the previous section are no longer reachable under these names.
# NOTE(review): `themes` was already converted from strings to lists in an earlier cell,
# so the "from_string" helper receives lists here — confirm it handles both forms.
films_exploded_themes = sentiment_utils.explode_column_from_string(cleaned_films.copy(), 'themes')
films_exploded_genres = sentiment_utils.explode_column_from_string(cleaned_films.copy(), 'genres')
films_exploded_events = sentiment_utils.explode_column_from_string(cleaned_films.copy(), 'events')

# Plain-text previews of each exploded frame.
print(films_exploded_themes[['title', 'themes']].head())
print(films_exploded_genres[['title', 'genres']].head())
print(films_exploded_events[['title', 'events']].head())

        title                                          themes
4  La La Land                                  Song and dance
4  La La Land                Humanity and the world around us
4  La La Land                     Moving relationship stories
4  La La Land        Dazzling vocal performances and musicals
4  La La Land  Captivating relationships and charming romance
         title    genres
4   La La Land     Drama
4   La La Land    Comedy
4   La La Land     Music
4   La La Land   Romance
11    Whiplash     Drama
        title                     events
4  La La Land  flashing lights or images
4  La La Land              shower scenes
4  La La Land                sad endings
4  La La Land                 misophonia
4  La La Land              babies/unborn


#### Encoding Categorical Variables

In [39]:
# One-hot encoding the exploded themes, genres, and events columns
# (each exploded row gets exactly one indicator set within its own prefix).
films_exploded_themes_encoded = pd.get_dummies(films_exploded_themes, columns=['themes'])
films_exploded_genres_encoded = pd.get_dummies(films_exploded_genres, columns=['genres'])
films_exploded_events_encoded = pd.get_dummies(films_exploded_events, columns=['events'])

# Encode the 'has_warnings' column (True/False to 1/0)
# NOTE(review): only the themes frame is converted; the genres and events frames keep
# the boolean column, so the merged result carries mixed dtypes for the same field.
films_exploded_themes_encoded['has_warnings'] = films_exploded_themes_encoded['has_warnings'].astype(int)

# Merge the exploded dataframes (themes, genres, events)
# NOTE(review): every frame repeats the key (title, release_year, letterboxd_rating) once
# per exploded value, so these inner merges are many-to-many: each film ends up with
# one row per theme x genre x event combination. The shared non-key columns are also
# duplicated with _x/_y suffixes (see tagline_x, runtime_x in the recorded output).
# Consider aggregating the indicators to one row per film before joining.
films_exploded_merged = pd.merge(films_exploded_themes_encoded, films_exploded_genres_encoded, how='inner', on=['title', 'release_year', 'letterboxd_rating'])
films_exploded_merged = pd.merge(films_exploded_merged, films_exploded_events_encoded, how='inner', on=['title', 'release_year', 'letterboxd_rating'])

films_exploded_merged.head(2)

Unnamed: 0,title,release_year,tagline_x,summary_x,runtime_x,letterboxd_rating,genres_x,language_x,countries_x,director_x,...,events_spiders,events_spitting,events_stalking,events_suicide attempts,events_suicide threats,events_teeth damage,events_torture,events_unconscious,events_underwater scenes,events_vomiting
0,La La Land,2016,Here's to the fools who dream.,Mia an aspiring actress serves lattes to movie...,129,4.09,"[Drama, Comedy, Music, Romance]",[English],"[Hong Kong, USA]",Damien Chazelle,...,False,False,False,False,False,False,False,False,False,False
1,La La Land,2016,Here's to the fools who dream.,Mia an aspiring actress serves lattes to movie...,129,4.09,"[Drama, Comedy, Music, Romance]",[English],"[Hong Kong, USA]",Damien Chazelle,...,False,False,False,False,False,False,False,False,False,False


#### Correlation and Feature Importance

In [40]:
# Collect the names of all numeric-dtype columns in the merged frame.
numeric_columns = films_exploded_merged.select_dtypes(include='number').columns

# Print them as a numbered list, counting from 1.
print("Numerical Features in the Dataset:")
for position, column_name in enumerate(numeric_columns, start=1):
    print(f"{position}. {column_name}")

Numerical Features in the Dataset:
1. release_year
2. runtime_x
3. letterboxd_rating
5. runtime_y
6. runtime


In [41]:

# Numeric columns to correlate.
numeric_columns = ['release_year', 'letterboxd_rating', 'runtime']

# films_exploded_merged repeats each film once per theme/genre/event combination,
# so correlating it directly would weight every film by how many combinations it
# has. Collapse back to one row per film (the merge keys) before computing corr().
films_for_correlation = films_exploded_merged.drop_duplicates(
    subset=['title', 'release_year', 'letterboxd_rating']
)
correlation_matrix = films_for_correlation[numeric_columns].corr()
feature_labels = correlation_matrix.columns.tolist()

fig = go.Figure(data=go.Heatmap(
    z=correlation_matrix.values,
    x=feature_labels,
    y=feature_labels,
    colorscale='Viridis',
    # Pin the colour scale to the full correlation range so colours are comparable
    # between runs instead of being stretched to the observed min/max.
    zmin=-1,
    zmax=1,
    colorbar=dict(title="Correlation"),
))

fig.update_layout(
    title="Correlation Matrix of Numerical Features",
    xaxis_title="Features",
    yaxis_title="Features",
    xaxis=dict(tickmode='array', tickvals=list(range(len(feature_labels))), ticktext=feature_labels),
    yaxis=dict(tickmode='array', tickvals=list(range(len(feature_labels))), ticktext=feature_labels),
    height=800,
)

fig.show()