Combine relevant data both from tmdb and letterboxd dataframes

In [18]:
import pandas as pd
from collections import Counter
import plotly.express as px
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [19]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/bru/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/bru/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/bru/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [20]:
tmdb = pd.read_csv('../data/clean/tmdb_clean_films.csv')
letterboxd = pd.read_csv('../data/clean/letterboxd_clean_films.csv')

In [21]:
# drop columns
tmdb.drop(columns=['tmdb_id', 'imdb_id', 'original_title', 'imdb_votes', 'tmdb_votes'], inplace=True)
tmdb.rename(columns={'overview':'summary'}, inplace=True)
tmdb.dropna(inplace=True)

In [22]:
tmdb.head(3)

Unnamed: 0,doesthedog_id,title,genres,director,release_year,runtime,budget,revenue,profit,popularity,tmdb_rating,imdb_rating,language,countries,summary,tagline,events,has_warnings
0,62268.0,Four Rooms,comedy,"Quentin Tarantino, Robert Rodriguez, Alexandre...",1995,98,4000000,4257354,257354,21.3,5.8,6.7,English,USA,It's Ted the Bellhop's first night on the job....,Twelve outrageous guests. Four scandalous requ...,"blood or gore, needles or syringes are used, d...",True
1,236737.0,Judgment Night,"action, crime, thriller",Stephen Hopkins,1993,109,21000000,12136938,-8863062,8.9,6.5,6.6,English,USA,"Four young friends, while taking a shortcut en...",Don't move. Don't whisper. Don't even breathe.,"car crashes, drownings, people getting hit by ...",True
2,27949.0,Star Wars,"adventure, action, science fiction",George Lucas,1977,121,11000000,775398007,764398007,98.8,8.2,8.6,English,USA,Princess Leia is captured and held hostage by ...,"A long time ago in a galaxy far, far away...","people being burned alive, flashing lights or ...",True


In [23]:
# drop columns
letterboxd.drop(columns=['letterboxd_id', 'topics'], inplace=True)
letterboxd.dropna(inplace=True)

In [24]:
letterboxd.head(3)

Unnamed: 0,title,release_year,tagline,summary,runtime,letterboxd_rating,genres,language,countries,themes,director,doesthedog_id,events,has_warnings
4,La La Land,2016,Here's to the fools who dream.,"Mia, an aspiring actress, serves lattes to mov...",129,4.09,"Drama, Comedy, Music, Romance",English,"Hong Kong, USA","['Song and dance', 'Humanity and the world aro...",Damien Chazelle,12823.0,"flashing lights or images, shower scenes, sad ...",True
11,Whiplash,2014,The road to greatness can take you to the edge.,"Under the direction of a ruthless instructor, ...",107,4.43,"Drama, Music",English,USA,"['Moving relationship stories', 'Student comin...",Damien Chazelle,12593.0,"finger or toe mutilation, spitting, car crashe...",True
32,Once Upon a Time in Hollywood,2019,"In this town, it can all change… like that","Los Angeles, 1969. TV star Rick Dalton, a stru...",162,3.76,"Drama, Thriller, Comedy","English, English, Italian, Spanish","China, UK, USA","['Humanity and the world around us', 'Fascinat...",Quentin Tarantino,20150.0,"people being burned alive, spitting, blood or ...",True


In [25]:
matching_rows = letterboxd[letterboxd[['title', 'doesthedog_id']].apply(tuple, axis=1).isin(tmdb[['title', 'doesthedog_id']].apply(tuple, axis=1))]

matching_count = matching_rows.shape[0]

print(f'Number of matching rows: {matching_count}')


Number of matching rows: 8


Step 2: Text Preprocessing
To run NLP tasks like frequency analysis, you'll need to preprocess the text data.

2.1 Text Preprocessing Function
Write a function to clean and preprocess the text data. This usually involves:

Tokenizing
Removing stopwords
Lowercasing the text
Removing punctuation and special characters
Lemmatizing or stemming

In [26]:
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()
    
    text = ''.join([char for char in text if char not in string.punctuation])
    
    tokens = nltk.word_tokenize(text)
    
    stop_words = set(stopwords.words('english'))
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalpha() and word not in stop_words]
    
    return tokens

In [27]:
# Apply text preprocessing to both dataframes
letterboxd['processed_tagline'] = letterboxd['tagline'].apply(preprocess_text)
letterboxd['processed_summary'] = letterboxd['summary'].apply(preprocess_text)
tmdb['processed_tagline'] = tmdb['tagline'].apply(preprocess_text)
tmdb['processed_summary'] = tmdb['summary'].apply(preprocess_text)

Word Frequency Analysis

In [28]:
def analyze_most_common_words(df, text_column, top_n=50):
    processed_column = f"processed_{text_column}"

    # Flatten the tokens and calculate frequencies
    all_tokens = [word for tokens in df[processed_column] for word in tokens]
    word_freq = Counter(all_tokens)
    most_common_words = word_freq.most_common(top_n)
    
    # Prepare data for Plotly
    words, counts = zip(*most_common_words)
    total_word_count = sum(counts)
    percentages = [round((count / total_word_count) * 100, 2) for count in counts]
    
    # Return as a DataFrame for plotting
    plot_df = pd.DataFrame({
        'Words': words,
        'Counts': counts,
        'Percentage': percentages
    })
    
    return plot_df

In [29]:
letterboxd_tagline_freq = analyze_most_common_words(letterboxd, 'tagline')
letterboxd_summary_freq = analyze_most_common_words(letterboxd, 'summary')

tmdb_tagline_freq = analyze_most_common_words(tmdb, 'tagline')
tmdb_summary_freq = analyze_most_common_words(tmdb, 'summary')

Visualize the Word Frequencies with Plotly

In [30]:
def plot_word_frequencies(plot_df, title):
    fig = px.bar(plot_df,
                 x='Words', 
                 y='Counts', 
                 title=title,
                 labels={'Counts': 'Frequency', 'Words': 'Words'},
                 color='Counts',
                 color_continuous_scale='Matter',
                 hover_data={'Words': True, 'Counts': True, 'Percentage': True})
    fig.update_layout(xaxis_tickangle=-45)
    fig.show()

In [31]:
plot_word_frequencies(letterboxd_tagline_freq, 'Most Common Words in Letterboxd Taglines')
plot_word_frequencies(letterboxd_summary_freq, 'Most Common Words in Letterboxd Summaries')

plot_word_frequencies(tmdb_tagline_freq, 'Most Common Words in TMDB Taglines')
plot_word_frequencies(tmdb_summary_freq, 'Most Common Words in TMDB Summaries')


Correlation Analysis

In [32]:
# Example: Create a DataFrame with word frequencies and numerical values for correlation analysis
def prepare_correlation_data(df, word_frequencies, numerical_columns):
    # Merge word frequency data (counts or percentages) with numerical data
    correlation_df = df[numerical_columns].copy()
    correlation_df = correlation_df.reset_index(drop=True)  # Ensure matching indices

    # Merge the word frequencies
    word_freq_df = word_frequencies.reset_index(drop=True)
    correlation_df = correlation_df.join(word_freq_df)

    return correlation_df

Normalize Letterboxd Ratings to a 0-10 Scale

Since Letterboxd ratings are on a scale from 0 to 5, and IMDB ratings and popularity are on a 0-10 scale, we'll normalize the letterboxd_rating to match the IMDB and popularity scales. This will make the comparisons between the two datasets more meaningful.

In [33]:
# Normalize the Letterboxd ratings to a 0-10 scale
letterboxd['normalized_letterboxd_rating'] = letterboxd['letterboxd_rating'] * 2


Select the Relevant Columns for Correlation

In [35]:
# Select the relevant columns for correlation analysis
letterboxd_columns = ['normalized_letterboxd_rating']
tmdb_columns = ['imdb_rating', 'popularity', 'budget', 'revenue', 'profit']

# Prepare the dataframes by selecting only the relevant columns
letterboxd_data = letterboxd[letterboxd_columns]
tmdb_data = tmdb[tmdb_columns]


Handle Missing Data

Make sure both dataframes have no missing values in the relevant numerical columns. If necessary, drop rows with missing values or handle them appropriately (e.g., impute).

In [36]:
# Drop rows with missing values in the relevant columns
letterboxd_data.dropna(inplace=True)
tmdb_data.dropna(inplace=True)




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



 Correlation Analysis

 Now we can compute the correlation matrix for both dataframes. We will compare the correlations of the normalized letterboxd_rating with other columns like imdb_rating, popularity, and others, and do the same for tmdb.

In [37]:
# Compute the correlation matrix for both datasets
letterboxd_corr = letterboxd_data.corr()
tmdb_corr = tmdb_data.corr()

# Display the correlation matrices
print("Letterboxd Correlation Matrix:")
print(letterboxd_corr)

print("\nTMDB Correlation Matrix:")
print(tmdb_corr)


Letterboxd Correlation Matrix:
                              normalized_letterboxd_rating
normalized_letterboxd_rating                           1.0

TMDB Correlation Matrix:
             imdb_rating  popularity    budget   revenue    profit
imdb_rating     1.000000    0.036424  0.023779  0.177492  0.203187
popularity      0.036424    1.000000  0.128096  0.139374  0.128507
budget          0.023779    0.128096  1.000000  0.714400  0.561274
revenue         0.177492    0.139374  0.714400  1.000000  0.980098
profit          0.203187    0.128507  0.561274  0.980098  1.000000


Visualization with Plotly

In [38]:

# Function to plot the correlation matrix as a heatmap
def plot_correlation_matrix(corr_df, title):
    fig = px.imshow(corr_df, 
                    title=title, 
                    labels={'x': 'Features', 'y': 'Features'},
                    color_continuous_scale='Viridis', 
                    color_continuous_midpoint=0)
    fig.show()

# Plot the correlation matrices for both datasets
plot_correlation_matrix(letterboxd_corr, "Letterboxd Correlation Matrix")
plot_correlation_matrix(tmdb_corr, "TMDB Correlation Matrix")


Most Common Words

In [42]:
import re


def preprocess_text(text):
    """
    Preprocess the text by removing non-alphanumeric characters, 
    converting to lowercase, and splitting into words.
    """
    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r'\W+', ' ', text)  # Remove non-alphanumeric characters
    words = text.split()  # Split text into words
    return words


In [43]:
def count_most_common_words(df, column_name, top_n=50):
    """
    Count the most common words in a specified column and return the top N most common words.
    """
    # Preprocess the text in the column
    df['processed_' + column_name] = df[column_name].apply(preprocess_text)
    
    # Flatten the list of words
    all_words = [word for words in df['processed_' + column_name] for word in words]
    
    # Count the most common words
    word_counts = Counter(all_words)
    
    # Get the top N most common words
    most_common_words = word_counts.most_common(top_n)
    
    # Return the most common words and their counts
    return most_common_words


In [45]:
# Count the most common words for each column in letterboxd
letterboxd_genres = count_most_common_words(letterboxd, 'genres', top_n=50)
letterboxd_tagline = count_most_common_words(letterboxd, 'tagline', top_n=50)
letterboxd_summary = count_most_common_words(letterboxd, 'summary', top_n=50)
letterboxd_events = count_most_common_words(letterboxd, 'events', top_n=50)

# Count the most common words for each column in tmdb
tmdb_genres = count_most_common_words(tmdb, 'genres', top_n=50)
tmdb_tagline = count_most_common_words(tmdb, 'tagline', top_n=50)
tmdb_summary = count_most_common_words(tmdb, 'summary', top_n=50)
tmdb_events = count_most_common_words(tmdb, 'events', top_n=50)
# tmdb_themes = count_most_common_words(tmdb, 'themes', top_n=50)


In [46]:
# Convert the results to DataFrames for easier comparison
def convert_to_df(most_common_words, column_name):
    words, counts = zip(*most_common_words)
    return pd.DataFrame({
        'Word': words,
        'Count': counts,
        'Column': column_name
    })

# Create DataFrames for each comparison column
letterboxd_genres_df = convert_to_df(letterboxd_genres, 'Genres')
letterboxd_tagline_df = convert_to_df(letterboxd_tagline, 'Tagline')
letterboxd_summary_df = convert_to_df(letterboxd_summary, 'Summary')
letterboxd_events_df = convert_to_df(letterboxd_events, 'Events')

tmdb_genres_df = convert_to_df(tmdb_genres, 'Genres')
tmdb_tagline_df = convert_to_df(tmdb_tagline, 'Tagline')
tmdb_summary_df = convert_to_df(tmdb_summary, 'Summary')
tmdb_events_df = convert_to_df(tmdb_events, 'Events')
# tmdb_themes_df = convert_to_df(tmdb_themes, 'Themes')

# Combine all the dataframes into one for easy comparison
letterboxd_df = pd.concat([letterboxd_genres_df, letterboxd_tagline_df, letterboxd_summary_df, letterboxd_events_df])
tmdb_df = pd.concat([tmdb_genres_df, tmdb_tagline_df, tmdb_summary_df, tmdb_events_df])

# Reset indices for clarity
letterboxd_df.reset_index(drop=True, inplace=True)
tmdb_df.reset_index(drop=True, inplace=True)


In [48]:
# Function to add the source column to each dataframe
def add_source_column(df, source_name):
    df['Source'] = source_name
    return df

# Create a bar chart for comparison
def plot_common_words_comparison(df1, df2, column_name1, column_name2):
    # Add source column to differentiate between dataframes
    df1 = add_source_column(df1, column_name1)
    df2 = add_source_column(df2, column_name2)
    
    # Concatenate the two dataframes
    combined_df = pd.concat([df1, df2])
    
    # Create the bar chart with color representing the source
    fig = px.bar(combined_df,
                 x='Word', 
                 y='Count', 
                 color='Source',  # Use the 'Source' column for coloring
                 title=f'Comparison of Most Common Words in {column_name1} and {column_name2}',
                 labels={'Count': 'Word Frequency', 'Word': 'Words'},
                 barmode='group')
    
    # Rotate the x-axis labels for better visibility
    fig.update_layout(xaxis_tickangle=-45)
    
    # Show the plot
    fig.show()

# Plot comparisons for each set of columns (genres, tagline, summary, events, themes)
plot_common_words_comparison(letterboxd_genres_df, tmdb_genres_df, 'Letterboxd Genres', 'TMDB Genres')
plot_common_words_comparison(letterboxd_tagline_df, tmdb_tagline_df, 'Letterboxd Tagline', 'TMDB Tagline')
plot_common_words_comparison(letterboxd_summary_df, tmdb_summary_df, 'Letterboxd Summary', 'TMDB Summary')
plot_common_words_comparison(letterboxd_events_df, tmdb_events_df, 'Letterboxd Events', 'TMDB Events')



Sentiment Analysis

In [None]:
from textblob import TextBlob

# Function to perform sentiment analysis on a list of words
def sentiment_analysis(words):
    sentiment_scores = []
    for word in words:
        # Get the sentiment polarity score of each word
        sentiment_scores.append(TextBlob(word).sentiment.polarity)
    return sentiment_scores


Calculate Spearman's Correlation

Combine and Flatten the Data

In [49]:
import pandas as pd
from scipy.stats import spearmanr
import plotly.express as px
from textblob import TextBlob

# Function to perform sentiment analysis on a list of words
def sentiment_analysis(words):
    sentiment_scores = []
    for word in words:
        # Get the sentiment polarity score of each word
        sentiment_scores.append(TextBlob(word).sentiment.polarity)
    return sentiment_scores

# Function to calculate and return Spearman's correlation
def calculate_spearman_corr(df, word_column, target_column):
    # Perform sentiment analysis on the word column
    sentiment_scores = sentiment_analysis(df[word_column])
    
    # Add sentiment scores as a new column
    df['sentiment'] = sentiment_scores
    
    # Calculate Spearman correlation between sentiment and target column
    correlation, _ = spearmanr(df['sentiment'], df[target_column])
    return correlation

# Prepare dataframes with the necessary columns
# Here we assume that `letterboxd` and `tmdb` are your dataframes

# Function to run for each column (e.g., 'tagline', 'summary', 'genres', 'events')
def prepare_for_corr_and_plot(df1, df2, column_name, target_column1, target_column2):
    # Calculate Spearman's correlation for Letterboxd data
    letterboxd_corr = calculate_spearman_corr(df1, column_name, target_column1)
    
    # Calculate Spearman's correlation for TMDB data
    tmdb_corr = calculate_spearman_corr(df2, column_name, target_column2)
    
    # Prepare the result for plotting
    result_df = pd.DataFrame({
        'Word': [column_name],
        'Letterboxd Correlation': [letterboxd_corr],
        'TMDB Correlation': [tmdb_corr],
    })
    
    return result_df

# Prepare an empty dataframe for the correlation results
corr_results = pd.DataFrame()

# List of columns to analyze
columns_to_analyze = ['tagline', 'summary', 'genres', 'events']

# Define target columns for each dataset
letterboxd_target = 'letterboxd_rating'
tmdb_target_columns = ['imdb_rating', 'popularity', 'profit']

# Loop through each column and calculate the correlations
for column in columns_to_analyze:
    # Run correlation calculation for each target column in TMDB (we'll do this for all targets)
    for tmdb_target in tmdb_target_columns:
        corr_result = prepare_for_corr_and_plot(letterboxd, tmdb, column, letterboxd_target, tmdb_target)
        corr_results = pd.concat([corr_results, corr_result])

# Plot the comparison of correlations
fig = px.bar(corr_results,
             x='Word',
             y=['Letterboxd Correlation', 'TMDB Correlation'],
             title='Spearman Correlation of Sentiment with Ratings, Popularity, and Profit',
             labels={'Letterboxd Correlation': 'Letterboxd Sentiment Correlation',
                     'TMDB Correlation': 'TMDB Sentiment Correlation'},
             barmode='group')

fig.show()
