NLP on Letterboxd df
- Check word frequency in several columns -> 'tagline', 'summary', 'genres', 'themes', 'events'
- Based on correlations, rate from 0 to 5
- Tie back to whether a title has warnings or not
- Figure out how it impacts sentiment on movies
- Sentiment-o-meter

The Data
- `cleaned_films` contains 4203 titles and 13 columns.
- There are no null values.

| **Column Name**        | **Data Type**   | **Description**                                                                 |
|------------------------|-----------------|----------------------------------------------------------------------------------|
| **title**              | object          | The title of the movie.                                                          |
| **release_year**       | int64           | The year the movie was released.                                                 |
| **tagline**            | object          | The movie's tagline (promotional phrase).                                        |
| **summary**            | object          | A brief description of the movie's plot.                                         |
| **runtime**            | int64           | The total runtime of the movie in minutes.                                       |
| **letterboxd_rating**  | float64         | The movie's average rating on Letterboxd.                                        |
| **genres**             | object          | A list of genres the movie belongs to (e.g., Drama, Comedy).                     |
| **language**           | object          | The languages the movie was produced in.                                         |
| **countries**          | object          | The countries where the movie was made or released.                              |
| **themes**             | object          | The central themes explored in the movie (e.g., Love, War, Friendship).          |
| **director**           | object          | The director(s) of the movie.                                                   |
| **events**             | object          | Key events or warnings in the movie (e.g., violence, strong language).           |
| **has_warnings**       | bool            | A boolean indicating if the movie contains warnings for sensitive content.       |


In [None]:
import pandas as pd
import re
from collections import Counter
import nltk 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import plotly.express as px
from sklearn.feature_extraction.text import CountVectorizer
from scipy.stats import spearmanr

In [None]:
# Fetch the NLTK resources this notebook relies on:
#   - 'punkt_tab': tokenizer models used by word_tokenize
#   - 'stopwords': corpus read by stopwords.words('english') in preprocess_text
# Without the stopwords corpus, a fresh environment raises LookupError on the
# first preprocessing call.
nltk.download('punkt_tab')
nltk.download('stopwords')

In [None]:

# Load the cleaned Letterboxd export and preview its first two rows.
films = pd.read_csv(filepath_or_buffer='../data/clean/letterboxd_clean_films.csv')
films.head(n=2)

In [4]:
# Build the working frame without the topic/identifier columns; `films`
# itself is left untouched because drop() returns a new DataFrame.
cleaned_films = films.drop(columns=['topics', 'doesthedog_id', 'letterboxd_id'])

In [5]:
# Discard every row that has a missing value in any column.
# The surviving rows keep their original index labels (no reset).
cleaned_films = cleaned_films.dropna(axis=0, how='any')

In [None]:
# Show the cleaned DataFrame as this cell's output.
# NOTE(review): `display` is not imported in this file; presumably the
# IPython/Jupyter built-in — confirm if this is ever run as a plain script.
display(cleaned_films)


### 1. Data Preprocessing

**Tagline Word Count**

In [None]:
# Lazily-built cache of English stopwords. Loading the list and turning it
# into a set once avoids repeating that work for every row of every column
# that preprocess_text is applied to.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stopwords as a frozenset, loading them once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text):
    """Turn raw text into a list of lowercase, stopword-free word tokens.

    Steps, in order:
      1. Lowercase the text.
      2. Delete every character that is not a-z or whitespace. Characters
         are removed, not replaced by a space, so "sci-fi" becomes "scifi"
         and "don't" becomes "dont"; digits and non-ASCII letters are
         dropped as well.
      3. Tokenize with NLTK's word_tokenize.
      4. Drop tokens found in NLTK's English stopword list.

    Args:
        text: The string to process. Must be a str (it is lowercased with
            str.lower), so missing values have to be removed beforehand.

    Returns:
        list[str]: The remaining tokens, in their original order.
    """
    lowered = text.lower()
    letters_only = re.sub(r'[^a-z\s]', '', lowered)
    stop_words = _english_stop_words()
    return [token for token in word_tokenize(letters_only) if token not in stop_words]

# Tokenize every tagline and store the token lists next to the raw text.
cleaned_films['processed_tagline'] = cleaned_films['tagline'].apply(preprocess_text)

# Flatten the per-film token lists into one corpus-wide list.
all_tokens = []
for tokens in cleaned_films['processed_tagline']:
    all_tokens.extend(tokens)

# Tally occurrences and keep the 50 most frequent words.
word_freq = Counter(all_tokens)
most_common_words_tagline = word_freq.most_common(50)

print('Most common words in taglines:')
print(most_common_words_tagline)

In [None]:
# Split the (word, count) pairs into parallel sequences for the two axes.
words, counts = zip(*most_common_words_tagline)

# Bar chart of the most frequent tagline words, coloured by frequency,
# with the x tick labels rotated by -45 degrees.
fig = px.bar(
    x=words,
    y=counts,
    color=counts,
    title='Most Common Words in Taglines',
    labels={'x': 'Words', 'y': 'Frequency'},
).update_layout(xaxis_tickangle=-45)
fig.show()

**Summary Word Count**

In [None]:
# Tokenize every summary and store the token lists next to the raw text.
cleaned_films['processed_summary'] = cleaned_films['summary'].apply(preprocess_text)

# Flatten the per-film token lists into one corpus-wide list.
all_tokens = []
for tokens in cleaned_films['processed_summary']:
    all_tokens.extend(tokens)

# Tally occurrences and keep the 50 most frequent words.
word_freq = Counter(all_tokens)
most_common_words_summary = word_freq.most_common(50)

print('Most common words in summaries:')
print(most_common_words_summary)

In [None]:
# Split the (word, count) pairs into parallel sequences for the two axes.
words, counts = zip(*most_common_words_summary)

# Bar chart of the most frequent summary words, coloured by frequency,
# with the x tick labels rotated by -45 degrees.
fig = px.bar(
    x=words,
    y=counts,
    color=counts,
    title='Most Common Words in Summaries',
    labels={'x': 'Words', 'y': 'Frequency'},
).update_layout(xaxis_tickangle=-45)
fig.show()

**Genre Word Count**

In [None]:
# Tokenize every genres entry and store the token lists next to the raw text.
cleaned_films['processed_genres'] = cleaned_films['genres'].apply(preprocess_text)

# Flatten the per-film token lists into one corpus-wide list.
all_tokens = []
for tokens in cleaned_films['processed_genres']:
    all_tokens.extend(tokens)

# Tally occurrences and keep the 50 most frequent words.
word_freq = Counter(all_tokens)
most_common_words_genres = word_freq.most_common(50)

print('Most common words in Genres:')
print(most_common_words_genres)

In [None]:
# Split the (word, count) pairs into parallel sequences for the two axes.
words, counts = zip(*most_common_words_genres)

# Bar chart of the most frequent genre words, coloured by frequency,
# with the x tick labels rotated by -45 degrees.
fig = px.bar(
    x=words,
    y=counts,
    color=counts,
    title='Most Common Words in Genres',
    labels={'x': 'Words', 'y': 'Frequency'},
).update_layout(xaxis_tickangle=-45)
fig.show()


**Themes Word Count**

In [None]:
# Tokenize every themes entry and store the token lists next to the raw text.
cleaned_films['processed_themes'] = cleaned_films['themes'].apply(preprocess_text)

# Flatten the per-film token lists into one corpus-wide list.
all_tokens = []
for tokens in cleaned_films['processed_themes']:
    all_tokens.extend(tokens)

# Tally occurrences and keep the 50 most frequent words.
word_freq = Counter(all_tokens)
most_common_words_themes = word_freq.most_common(50)

print('Most common words in Themes:')
print(most_common_words_themes)

In [None]:
 
# Split the (word, count) pairs into parallel sequences for the two axes.
words, counts = zip(*most_common_words_themes)

# Bar chart of the most frequent theme words, coloured by frequency,
# with the x tick labels rotated by -45 degrees.
fig = px.bar(
    x=words,
    y=counts,
    color=counts,
    title='Most Common Words in Themes',
    labels={'x': 'Words', 'y': 'Frequency'},
).update_layout(xaxis_tickangle=-45)
fig.show()


**Events Word Count**

In [None]:
# Tokenize every events entry and store the token lists next to the raw text.
cleaned_films['processed_events'] = cleaned_films['events'].apply(preprocess_text)

# Flatten the per-film token lists into one corpus-wide list.
all_tokens = []
for tokens in cleaned_films['processed_events']:
    all_tokens.extend(tokens)

# Tally occurrences and keep the 50 most frequent words.
word_freq = Counter(all_tokens)
most_common_words_events = word_freq.most_common(50)

print('Most common words in Events:')
print(most_common_words_events)

In [None]:
# Split the (word, count) pairs into parallel sequences for the two axes.
words, counts = zip(*most_common_words_events)

# Bar chart of the most frequent event words, coloured by frequency,
# with the x tick labels rotated by -45 degrees.
fig = px.bar(
    x=words,
    y=counts,
    color=counts,
    title='Most Common Words in Events',
    labels={'x': 'Words', 'y': 'Frequency'},
).update_layout(xaxis_tickangle=-45)
fig.show()

In [17]:
# Independent (deep) copy for the correlation analysis, so later changes
# to `df` do not touch `cleaned_films`.
df = cleaned_films.copy(deep=True)

In [None]:
# Relate tagline vocabulary to ratings: count each word per film, then rank
# the words by the strength of their Spearman correlation with the rating.

# CountVectorizer expects one string per document, but 'processed_tagline'
# holds token lists (output of preprocess_text), so re-join the tokens with
# spaces before vectorizing.
tagline_docs = df['processed_tagline'].str.join(' ')
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(tagline_docs)

# Convert word counts to DataFrame: one row per film, one column per word
word_count_df = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names_out())

# The rating column is 'letterboxd_rating' (there is no 'rating' column).
# Plain arrays make the pairing explicitly positional: word_count_df has a
# fresh 0..n-1 index while df keeps the row labels left over from dropna().
ratings = df['letterboxd_rating'].to_numpy()

# Compute correlations between word frequencies and ratings
correlations = {}
for word in word_count_df.columns:
    corr, _ = spearmanr(word_count_df[word].to_numpy(), ratings)
    # spearmanr returns NaN for a constant input; NaN also breaks sorting
    if pd.notna(corr):
        correlations[word] = corr

# Sort by absolute strength so strong negative correlations rank high too
sorted_correlations = sorted(
    correlations.items(), key=lambda item: abs(item[1]), reverse=True
)

# Display top correlated words
top_correlated_words = sorted_correlations[:10]
print("Top 10 Words Correlated with Ratings:")
for word, corr in top_correlated_words:
    print(f"{word}: {corr:.2f}")

# Plot the top 10 correlated words using Plotly
words, corrs = zip(*top_correlated_words)
fig = px.bar(x=words, y=corrs, title='Top 10 Words Correlated with Ratings',
             labels={'x': 'Words', 'y': 'Spearman Correlation'}, color=corrs)
fig.update_layout(xaxis_tickangle=-45)
fig.show()


In [None]:
# NOTE(review): this cell repeats the tagline/rating correlation analysis of
# the cell before it. It was possibly meant for another text column (e.g.
# 'processed_summary') — confirm the intent or delete the duplicate.

# CountVectorizer expects one string per document, but 'processed_tagline'
# holds token lists (output of preprocess_text), so re-join the tokens with
# spaces before vectorizing.
tagline_docs = df['processed_tagline'].str.join(' ')
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(tagline_docs)

# Convert word counts to DataFrame: one row per film, one column per word
word_count_df = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names_out())

# The rating column is 'letterboxd_rating' (there is no 'rating' column).
# Plain arrays make the pairing explicitly positional: word_count_df has a
# fresh 0..n-1 index while df keeps the row labels left over from dropna().
ratings = df['letterboxd_rating'].to_numpy()

# Compute correlations between word frequencies and ratings
correlations = {}
for word in word_count_df.columns:
    corr, _ = spearmanr(word_count_df[word].to_numpy(), ratings)
    # spearmanr returns NaN for a constant input; NaN also breaks sorting
    if pd.notna(corr):
        correlations[word] = corr

# Sort by absolute strength so strong negative correlations rank high too
sorted_correlations = sorted(
    correlations.items(), key=lambda item: abs(item[1]), reverse=True
)

# Display top correlated words
top_correlated_words = sorted_correlations[:10]
print("Top 10 Words Correlated with Ratings:")
for word, corr in top_correlated_words:
    print(f"{word}: {corr:.2f}")

# Plot the top 10 correlated words using Plotly
words, corrs = zip(*top_correlated_words)
fig = px.bar(x=words, y=corrs, title='Top 10 Words Correlated with Ratings',
             labels={'x': 'Words', 'y': 'Spearman Correlation'}, color=corrs)
fig.update_layout(xaxis_tickangle=-45)
fig.show()