# Sentiment Analysis - **TMDb DF**

In [22]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re
import plotly.express as px
import plotly.graph_objects as go

In [23]:
nltk.download('vader_lexicon')
nltk.download('stopwords')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/bru/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to /home/bru/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Import reusable functions for sentiment analysis from ./utils folder

In [24]:
import sys
sys.path.append('../utils')
import sentiment_utils

In [25]:
films = pd.read_csv('../data/clean/tmdb_clean_films.csv')
films.head()

Unnamed: 0,tmdb_id,imdb_id,doesthedog_id,title,original_title,genres,director,release_year,runtime,budget,...,tmdb_rating,tmdb_votes,imdb_rating,imdb_votes,language,countries,overview,tagline,events,has_warnings
0,5,tt0113101,62268.0,Four Rooms,Four Rooms,comedy,"Quentin Tarantino, Robert Rodriguez, Alexandre...",1995,98,4000000,...,5.8,2628,6.7,112798,English,USA,It's Ted the Bellhop's first night on the job....,Twelve outrageous guests. Four scandalous requ...,"blood or gore, needles or syringes are used, d...",True
1,6,tt0107286,236737.0,Judgment Night,Judgment Night,"action, crime, thriller",Stephen Hopkins,1993,109,21000000,...,6.5,331,6.6,19361,English,USA,"Four young friends, while taking a shortcut en...",Don't move. Don't whisper. Don't even breathe.,"car crashes, drownings, people getting hit by ...",True
2,11,tt0076759,27949.0,Star Wars,Star Wars,"adventure, action, science fiction",George Lucas,1977,121,11000000,...,8.2,20622,8.6,1482739,English,USA,Princess Leia is captured and held hostage by ...,"A long time ago in a galaxy far, far away...","people being burned alive, flashing lights or ...",True
3,12,tt0266543,9602.0,Finding Nemo,Finding Nemo,"animation, family",Andrew Stanton,2003,100,94000000,...,7.8,19241,8.2,1139333,English,USA,"Nemo, an adventurous young clownfish, is unexp...",There are 3.7 trillion fish in the ocean. They...,"kids dying, jump scares, parents dying, spitti...",True
4,13,tt0109830,9641.0,Forrest Gump,Forrest Gump,"comedy, drama, romance",Robert Zemeckis,1994,142,55000000,...,8.5,27494,8.8,2326538,English,USA,A man with a low IQ has accomplished great thi...,The world will never be the same once you've s...,"parents dying, shower scenes, shaving or cutti...",True


In [26]:
cleaned_films = films.copy()
cleaned_films.drop(columns=['doesthedog_id', 'tmdb_id', 'imdb_id', 'original_title', 'imdb_votes', 'tmdb_votes'], inplace=True)
cleaned_films.rename(columns={'overview':'summary'}, inplace=True)


In [27]:
cleaned_films.head()


Unnamed: 0,title,genres,director,release_year,runtime,budget,revenue,profit,popularity,tmdb_rating,imdb_rating,language,countries,summary,tagline,events,has_warnings
0,Four Rooms,comedy,"Quentin Tarantino, Robert Rodriguez, Alexandre...",1995,98,4000000,4257354,257354,21.3,5.8,6.7,English,USA,It's Ted the Bellhop's first night on the job....,Twelve outrageous guests. Four scandalous requ...,"blood or gore, needles or syringes are used, d...",True
1,Judgment Night,"action, crime, thriller",Stephen Hopkins,1993,109,21000000,12136938,-8863062,8.9,6.5,6.6,English,USA,"Four young friends, while taking a shortcut en...",Don't move. Don't whisper. Don't even breathe.,"car crashes, drownings, people getting hit by ...",True
2,Star Wars,"adventure, action, science fiction",George Lucas,1977,121,11000000,775398007,764398007,98.8,8.2,8.6,English,USA,Princess Leia is captured and held hostage by ...,"A long time ago in a galaxy far, far away...","people being burned alive, flashing lights or ...",True
3,Finding Nemo,"animation, family",Andrew Stanton,2003,100,94000000,940335536,846335536,125.7,7.8,8.2,English,USA,"Nemo, an adventurous young clownfish, is unexp...",There are 3.7 trillion fish in the ocean. They...,"kids dying, jump scares, parents dying, spitti...",True
4,Forrest Gump,"comedy, drama, romance",Robert Zemeckis,1994,142,55000000,677387716,622387716,134.8,8.5,8.8,English,USA,A man with a low IQ has accomplished great thi...,The world will never be the same once you've s...,"parents dying, shower scenes, shaving or cutti...",True


In [28]:
cleaned_films = cleaned_films.dropna()

In [29]:
cleaned_films.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5301 entries, 0 to 9783
Data columns (total 17 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         5301 non-null   object 
 1   genres        5301 non-null   object 
 2   director      5301 non-null   object 
 3   release_year  5301 non-null   int64  
 4   runtime       5301 non-null   int64  
 5   budget        5301 non-null   int64  
 6   revenue       5301 non-null   int64  
 7   profit        5301 non-null   int64  
 8   popularity    5301 non-null   float64
 9   tmdb_rating   5301 non-null   float64
 10  imdb_rating   5301 non-null   float64
 11  language      5301 non-null   object 
 12  countries     5301 non-null   object 
 13  summary       5301 non-null   object 
 14  tagline       5301 non-null   object 
 15  events        5301 non-null   object 
dtypes: bool(1), float64(3), int64(5), object(8)
memory usage: 709.2+ KB


In [30]:
display(cleaned_films)

Unnamed: 0,title,genres,director,release_year,runtime,budget,revenue,profit,popularity,tmdb_rating,imdb_rating,language,countries,summary,tagline,events,has_warnings
0,Four Rooms,comedy,"Quentin Tarantino, Robert Rodriguez, Alexandre...",1995,98,4000000,4257354,257354,21.3,5.8,6.7,English,USA,It's Ted the Bellhop's first night on the job....,Twelve outrageous guests. Four scandalous requ...,"blood or gore, needles or syringes are used, d...",True
1,Judgment Night,"action, crime, thriller",Stephen Hopkins,1993,109,21000000,12136938,-8863062,8.9,6.5,6.6,English,USA,"Four young friends, while taking a shortcut en...",Don't move. Don't whisper. Don't even breathe.,"car crashes, drownings, people getting hit by ...",True
2,Star Wars,"adventure, action, science fiction",George Lucas,1977,121,11000000,775398007,764398007,98.8,8.2,8.6,English,USA,Princess Leia is captured and held hostage by ...,"A long time ago in a galaxy far, far away...","people being burned alive, flashing lights or ...",True
3,Finding Nemo,"animation, family",Andrew Stanton,2003,100,94000000,940335536,846335536,125.7,7.8,8.2,English,USA,"Nemo, an adventurous young clownfish, is unexp...",There are 3.7 trillion fish in the ocean. They...,"kids dying, jump scares, parents dying, spitti...",True
4,Forrest Gump,"comedy, drama, romance",Robert Zemeckis,1994,142,55000000,677387716,622387716,134.8,8.5,8.8,English,USA,A man with a low IQ has accomplished great thi...,The world will never be the same once you've s...,"parents dying, shower scenes, shaving or cutti...",True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9775,The Mouse Trap,"horror, thriller",Jamie Bailey,2024,80,800000,5737,-794263,70.3,4.3,2.5,English,Canada,"It's Alex's 21st Birthday, but she's stuck at ...",This is not the funhouse.,"jump scares, flashing lights or images, blood ...",True
9776,Longlegs,"horror, thriller, crime",Osgood Perkins,2024,101,10000000,126388179,116388179,153.9,6.6,6.7,English,"Canada, USA",FBI Agent Lee Harker is a gifted new recruit a...,Every year there's another,"kids dying, jump scares, flashing lights or im...",True
9781,Moana 2,"animation, adventure, family, comedy","David G. Derrick Jr., Dana Ledoux Miller, Jaso...",2024,100,150000000,600055655,450055655,4485.0,6.8,7.1,English,"Canada, USA",After receiving an unexpected call from her wa...,The ocean is calling them back.,"flashing lights or images, ghosts, bugs, restr...",True
9782,Sound of Hope: The Story of Possum Trot,drama,Joshua Weigel,2024,130,8500000,11721425,3221425,58.5,6.7,7.1,English,USA,"Led by Donna and Reverend W.C. Martin, 22 fami...",The fight for kids begins.,"hate speech, child abuse, minority misrepresen...",True


#### Preprocess Text Data
- Clean the text by removing irrelevant characters, stopwords, and converting text to lowercase.
- Handle missing values in columns like genres, events, summary, etc.
- Tokenize the text and split genres by commas if necessary.

In [31]:
def clean_title(title):
    if isinstance(title, str):  
        title = title.strip() 
        title = re.sub(r'\s+', ' ', title)  
        title = re.sub(
            r'[^\w\sàáâäãåçèéêëìíîïñòóôöõùúûüýÿÀÁÂÄÃÅÇÈÉÊËÌÍÎÏÑÒÓÔÖÕÙÚÛÜÝ]', '', title
        )
        return title
    return None 


# Apply cleaning function to relevant columns using .loc to avoid SettingWithCopyWarning
cleaned_films.loc[:, 'title'] = cleaned_films['title'].apply(clean_title)
cleaned_films.loc[:, 'summary'] = cleaned_films['summary'].apply(clean_title)

In [32]:
display(cleaned_films)

Unnamed: 0,title,genres,director,release_year,runtime,budget,revenue,profit,popularity,tmdb_rating,imdb_rating,language,countries,summary,tagline,events,has_warnings
0,Four Rooms,comedy,"Quentin Tarantino, Robert Rodriguez, Alexandre...",1995,98,4000000,4257354,257354,21.3,5.8,6.7,English,USA,Its Ted the Bellhops first night on the joband...,Twelve outrageous guests. Four scandalous requ...,"blood or gore, needles or syringes are used, d...",True
1,Judgment Night,"action, crime, thriller",Stephen Hopkins,1993,109,21000000,12136938,-8863062,8.9,6.5,6.6,English,USA,Four young friends while taking a shortcut en ...,Don't move. Don't whisper. Don't even breathe.,"car crashes, drownings, people getting hit by ...",True
2,Star Wars,"adventure, action, science fiction",George Lucas,1977,121,11000000,775398007,764398007,98.8,8.2,8.6,English,USA,Princess Leia is captured and held hostage by ...,"A long time ago in a galaxy far, far away...","people being burned alive, flashing lights or ...",True
3,Finding Nemo,"animation, family",Andrew Stanton,2003,100,94000000,940335536,846335536,125.7,7.8,8.2,English,USA,Nemo an adventurous young clownfish is unexpec...,There are 3.7 trillion fish in the ocean. They...,"kids dying, jump scares, parents dying, spitti...",True
4,Forrest Gump,"comedy, drama, romance",Robert Zemeckis,1994,142,55000000,677387716,622387716,134.8,8.5,8.8,English,USA,A man with a low IQ has accomplished great thi...,The world will never be the same once you've s...,"parents dying, shower scenes, shaving or cutti...",True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9775,The Mouse Trap,"horror, thriller",Jamie Bailey,2024,80,800000,5737,-794263,70.3,4.3,2.5,English,Canada,Its Alexs 21st Birthday but shes stuck at the ...,This is not the funhouse.,"jump scares, flashing lights or images, blood ...",True
9776,Longlegs,"horror, thriller, crime",Osgood Perkins,2024,101,10000000,126388179,116388179,153.9,6.6,6.7,English,"Canada, USA",FBI Agent Lee Harker is a gifted new recruit a...,Every year there's another,"kids dying, jump scares, flashing lights or im...",True
9781,Moana 2,"animation, adventure, family, comedy","David G. Derrick Jr., Dana Ledoux Miller, Jaso...",2024,100,150000000,600055655,450055655,4485.0,6.8,7.1,English,"Canada, USA",After receiving an unexpected call from her wa...,The ocean is calling them back.,"flashing lights or images, ghosts, bugs, restr...",True
9782,Sound of Hope The Story of Possum Trot,drama,Joshua Weigel,2024,130,8500000,11721425,3221425,58.5,6.7,7.1,English,USA,Led by Donna and Reverend WC Martin 22 familie...,The fight for kids begins.,"hate speech, child abuse, minority misrepresen...",True


### Sentiment Analysis

1. Exploding Columns with Multiple Values

In [33]:
films_exploded_genres = sentiment_utils.explode_column(cleaned_films, 'genres')
films_exploded_genres[['title', 'genres']].head()

Unnamed: 0,title,genres
0,Four Rooms,comedy
1,Judgment Night,action
1,Judgment Night,crime
1,Judgment Night,thriller
2,Star Wars,adventure


In [34]:
films_exploded_languages = sentiment_utils.explode_column(cleaned_films, 'language')
films_exploded_languages[['title', 'language']].head()

Unnamed: 0,title,language
0,Four Rooms,English
1,Judgment Night,English
2,Star Wars,English
3,Finding Nemo,English
4,Forrest Gump,English


In [35]:
films_exploded_countries = sentiment_utils.explode_column(cleaned_films, 'countries')
films_exploded_countries[['title', 'countries']].head()

Unnamed: 0,title,countries
0,Four Rooms,USA
1,Judgment Night,USA
2,Star Wars,USA
3,Finding Nemo,USA
4,Forrest Gump,USA


In [36]:
films_exploded_events = sentiment_utils.explode_column(cleaned_films, 'events')
films_exploded_events[['title', 'events']].head()

Unnamed: 0,title,events
0,Four Rooms,blood or gore
0,Four Rooms,needles or syringes are used
0,Four Rooms,drug use
0,Four Rooms,sexual content
0,Four Rooms,vomiting


#### 2. Sentiment Analysis on Exploded DFs

In [38]:
# calculate the sentiment score for a text
# films_exploded_genres = sentiment_utils.add_sentiment_columns(films_exploded_genres, ['title', 'summary', 'tagline'])

In [39]:
# films_exploded_languages = sentiment_utils.add_sentiment_columns(films_exploded_languages, ['title', 'summary', 'tagline'])

In [40]:
# films_exploded_countries = sentiment_utils.add_sentiment_columns(films_exploded_countries, ['title', 'summary', 'tagline'])

In [41]:
# films_exploded_events = sentiment_utils.add_sentiment_columns(films_exploded_events, ['title', 'summary', 'tagline'])

In [42]:
# # # back up dfs to save time exploding columns
# films_exploded_genres.to_csv('../data/local/raw/tmdb/films_exploded_genres.csv', index=False)
# films_exploded_languages.to_csv('../data/local/raw/tmdb/films_exploded_languages.csv', index=False)
# films_exploded_countries.to_csv('../data/local/raw/tmdb/films_exploded_countries.csv', index=False)
# films_exploded_events.to_csv('../data/local/raw/tmdb/films_exploded_events.csv', index=False)

#### 3. Combine Sentiment Scores for Each Exploded DataFrame

In [43]:
films_exploded_genres = pd.read_csv('../data/local/raw/tmdb/films_exploded_genres.csv')
films_exploded_languages = pd.read_csv('../data/local/raw/tmdb/films_exploded_languages.csv')
films_exploded_countries = pd.read_csv('../data/local/raw/tmdb/films_exploded_countries.csv')
films_exploded_events = pd.read_csv('../data/local/raw/tmdb/films_exploded_events.csv')

In [44]:
# define sentiment columns to average
sentiment_columns = ['sentiment_title', 'sentiment_summary', 'sentiment_tagline']

# apply function
films_exploded_genres = sentiment_utils.calculate_overall_sentiment(films_exploded_genres, sentiment_columns)
print(films_exploded_genres[['title', 'genres', 'overall_sentiment']].head())

            title     genres  overall_sentiment
0      Four Rooms     comedy          -0.371200
1  Judgment Night     action          -0.257233
2  Judgment Night      crime          -0.257233
3  Judgment Night   thriller          -0.257233
4       Star Wars  adventure           0.113500


In [45]:
films_exploded_languages = sentiment_utils.calculate_overall_sentiment(films_exploded_languages, sentiment_columns)
print(films_exploded_languages[['title', 'language', 'overall_sentiment']].head())

            title language  overall_sentiment
0      Four Rooms  English          -0.371200
1  Judgment Night  English          -0.257233
2       Star Wars  English           0.113500
3    Finding Nemo  English           0.051033
4    Forrest Gump  English           0.310200


In [46]:
films_exploded_countries = sentiment_utils.calculate_overall_sentiment(films_exploded_countries, sentiment_columns)
print(films_exploded_countries[['title', 'countries', 'overall_sentiment']].head())

            title countries  overall_sentiment
0      Four Rooms       USA          -0.371200
1  Judgment Night       USA          -0.257233
2       Star Wars       USA           0.113500
3    Finding Nemo       USA           0.051033
4    Forrest Gump       USA           0.310200


In [47]:
films_exploded_events = sentiment_utils.calculate_overall_sentiment(films_exploded_events, sentiment_columns)
print(films_exploded_events[['title', 'events', 'overall_sentiment']].head())

        title                         events  overall_sentiment
0  Four Rooms                  blood or gore            -0.3712
1  Four Rooms   needles or syringes are used            -0.3712
2  Four Rooms                       drug use            -0.3712
3  Four Rooms                 sexual content            -0.3712
4  Four Rooms                       vomiting            -0.3712


#### 4. Visualizing Sentiment vs. Rating

#### **Sentiment vs. Rating for Genres**

In [49]:
sentiment_utils.plot_sentiment_distribution(
    films_exploded_genres, 
    category_column='genres', 
    sentiment_column='overall_sentiment', 
    title='Sentiment Distribution by GEnre',
    xaxis_title='Theme', 
    yaxis_title='Overall Sentiment'
)

#### **Sentiment vs. Rating for Languages**

In [50]:
# Usage examples:
sentiment_utils.plot_sentiment_distribution(
    films_exploded_languages, 
    category_column='language', 
    sentiment_column='overall_sentiment', 
    title='Sentiment Distribution by Language',
    xaxis_title='Language', 
    yaxis_title='Overall Sentiment'
)

#### **Sentiment vs. Rating for Events**

In [51]:


sentiment_utils.plot_sentiment_distribution(
    films_exploded_events, 
    category_column='events', 
    sentiment_column='overall_sentiment', 
    title='Sentiment Distribution by Events',
    xaxis_title='Event', 
    yaxis_title='Overall Sentiment'
)

To understand how factors like themes, events, genres, and whether a title has warnings (has_warnings) impact the rating (letterboxd_rating), we can use statistical or machine learning techniques to analyze the relationships between these variables and the letterboxd_rating.

In [52]:
# films_exploded_themes = sentiment_utils.explode_column_from_string(cleaned_films.copy(), 'themes')
films_exploded_genres = sentiment_utils.explode_column_from_string(cleaned_films.copy(), 'genres')
films_exploded_events = sentiment_utils.explode_column_from_string(cleaned_films.copy(), 'events')

# print(films_exploded_themes[['title', 'themes']].head())
print(films_exploded_genres[['title', 'genres']].head())
print(films_exploded_events[['title', 'events']].head())

            title     genres
0      Four Rooms     comedy
1  Judgment Night     action
1  Judgment Night      crime
1  Judgment Night   thriller
2       Star Wars  adventure
        title                         events
0  Four Rooms                  blood or gore
0  Four Rooms   needles or syringes are used
0  Four Rooms                       drug use
0  Four Rooms                 sexual content
0  Four Rooms                       vomiting


#### Encoding Categorical Variables

In [None]:
# One-hot encoding the exploded themes, genres, and events columns
# films_exploded_themes_encoded = pd.get_dummies(films_exploded_themes, columns=['themes'])
films_exploded_genres_encoded = pd.get_dummies(films_exploded_genres, columns=['genres'])
films_exploded_events_encoded = pd.get_dummies(films_exploded_events, columns=['events'])

# Encode the 'has_warnings' column (True/False to 1/0)
# films_exploded_themes_encoded['has_warnings'] = films_exploded_themes_encoded['has_warnings'].astype(int)

# Merge the exploded dataframes (themes, genres, events)
films_exploded_merged = pd.merge(films_exploded_themes_encoded, films_exploded_genres_encoded, how='inner', on=['title', 'release_year', 'letterboxd_rating'])
films_exploded_merged = pd.merge(films_exploded_merged, films_exploded_events_encoded, how='inner', on=['title', 'release_year', 'letterboxd_rating'])

films_exploded_merged.head(2)

#### Correlation and Feature Importance

In [53]:
# Select only numeric columns
numeric_columns = films_exploded_merged.select_dtypes(include=['number']).columns

# Print all the numerical columns
print("Numerical Features in the Dataset:")
for i, col in enumerate(numeric_columns, 1):
    print(f"{i}. {col}")

NameError: name 'films_exploded_merged' is not defined

In [None]:

# numeric columns for correlation
numeric_columns = ['release_year', 'letterboxd_rating', 'runtime']

correlation_matrix = films_exploded_merged[numeric_columns].corr()

fig = go.Figure(data=go.Heatmap(
    z=correlation_matrix.values, 
    x=correlation_matrix.columns,  
    y=correlation_matrix.columns, 
    colorscale='Viridis', 
    colorbar=dict(title="Correlation"), 
))

fig.update_layout(
    title="Correlation Matrix of Numerical Features",
    xaxis_title="Features",
    yaxis_title="Features",
    xaxis=dict(tickmode='array', tickvals=list(range(len(correlation_matrix.columns))), ticktext=correlation_matrix.columns),
    yaxis=dict(tickmode='array', tickvals=list(range(len(correlation_matrix.columns))), ticktext=correlation_matrix.columns),
    height=800,
)

fig.show()