In [51]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [52]:
cd /content/drive/MyDrive

/content/drive/MyDrive


In [53]:
repo = 'ada-2024-project-adarable/src'

In [54]:
cd {repo}

/content/drive/MyDrive/ada-2024-project-adarable/src


In [55]:
import pandas as pd
import re
import ast
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Download necessary NLTK data (run once if needed)
nltk.download('punkt')        # For tokenization
nltk.download('stopwords')    # For stopwords
nltk.download('wordnet')      # For lemmatization

# Initialize necessary NLP tools
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()  # For Lemmatization
stemmer = PorterStemmer()         # For Stemming (optional)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [93]:
movies = pd.read_csv('../data/MovieSummaries/movie.metadata.tsv', sep='\t', header=None)
characters = pd.read_csv('../data/MovieSummaries/character.metadata.tsv', sep = '\t', header = None)

name_clusters = pd.read_csv('../data/MovieSummaries/name.clusters.txt', sep = '\t', header = None)
summaries = pd.read_csv('../data/MovieSummaries/plot_summaries.txt', sep = '\t', header = None)
tv_tropes = pd.read_csv('../data/MovieSummaries/tvtropes.clusters.txt', sep = '\t', header = None)

In [94]:
movies.sample(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8
21830,2556838,/m/07mvc9,Chisum,1970-06-24,6000000.0,111.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/03btsm8"": ""Action/Adventure"", ""/m/0hfjk"":..."
74783,24158484,/m/07kfly_,The Stoolie,1974,,90.0,{},"{""/m/09c7w0"": ""United States of America""}","{""/m/0lsxr"": ""Crime Fiction""}"


In [95]:
characters.sample(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
390150,12151283,/m/02vrglp,1977-02-25,,1926-05-25,M,1.854,,Claude Akins,50.0,/m/0cvcvm_,,/m/07jfxx
296451,17366863,/m/043p2cq,2001-11-06,,1958-07-01,M,1.78,,Tim Abell,43.0,/m/04m5d2x,,/m/0307gs


In [96]:
name_clusters.sample(2)

Unnamed: 0,0,1
375,Barnyard Dawg,/m/0hyq5x5
810,Hannibal Lecter,/m/0gcxyv0


In [97]:
summaries.sample(2)

Unnamed: 0,0,1
42149,3668661,The movie involves Diane Shepherd who is an i...
31529,6758985,"It is 1940. When the movie begins, film star V..."


In [98]:
tv_tropes.sample(2)

Unnamed: 0,0,1
163,crazy_survivalist,"{""char"": ""Marvin Boggs"", ""movie"": ""Red"", ""id"":..."
466,the_editor,"{""char"": ""Ben Bradlee"", ""movie"": ""All the Pres..."


In [99]:
movies.columns = ['wikipedia_movie_id', 'freebase_movie_id', 'movie_name', 'movie_release_date', 'movie_box_office_revenue', 'movie_runtime', 'movie_languages', 'movie_countries', 'movie_genres']
name_clusters.columns = ['name', 'freebase_movie_id']
summaries.columns = ['wikipedia_movie_id', 'plot_summary']
characters.columns = ['wikipedia_movie_id', 'freebase_movie_id', 'movie_release_date', 'character_name', 'actor_birth', 'actor_gender', 'actor_height', 'actor_ethnicity', 'actor_name', 'actor_age', 'freebase_character_map', 'freebase_character_id', 'freebase_actor_id']


In [100]:
# We check for duplicates
print("number of duplicated according to Wikipedia id : ", movies['wikipedia_movie_id'].duplicated().sum())
print("number of duplicated according to freebase movie id : ", movies['freebase_movie_id'].duplicated().sum())
print("number of duplicated according to title of the movie : ", movies['movie_name'].duplicated().sum())

# We investigate further to see why some movies have the same name
movies[movies['movie_name'].duplicated(keep=False)].sort_values('movie_name')

# We see that the movies with the same name have not the same release date, so we can keep them as they are. They are likely representing different version of the same movie.

number of duplicated according to Wikipedia id :  0
number of duplicated according to freebase movie id :  0
number of duplicated according to title of the movie :  6263


Unnamed: 0,wikipedia_movie_id,freebase_movie_id,movie_name,movie_release_date,movie_box_office_revenue,movie_runtime,movie_languages,movie_countries,movie_genres
77290,8422241,/m/0272wbj,100 Days,1991,,161.0,"{""/m/03k50"": ""Hindi Language""}","{""/m/03rk0"": ""India""}","{""/m/03npn"": ""Horror"", ""/m/0c3351"": ""Suspense""..."
18992,15498803,/m/03mc7x8,100 Days,2001-02-01,,100.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/07s9rl0"": ""Drama""}"
12205,31239628,/m/0gj9wpb,100% Love,2011-04,,140.0,"{""/m/09s02"": ""Telugu language""}",{},"{""/m/06cvj"": ""Romantic comedy""}"
69188,34757217,/m/0j3df_d,100% Love,2012-01-20,,,"{""/m/01c7y"": ""Bengali Language""}","{""/m/03rk0"": ""India""}","{""/m/02l7c8"": ""Romance Film"", ""/m/05p553"": ""Co..."
17891,33940717,/m/0j626hk,12,2003-06-27,,124.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/05p553"": ""Comedy film"", ""/m/04t36"": ""Musi..."
...,...,...,...,...,...,...,...,...,...
79,11353896,/m/02r8pmz,Zindagi,1976,,,"{""/m/03k50"": ""Hindi Language""}","{""/m/03rk0"": ""India""}","{""/m/01chg"": ""Bollywood""}"
75600,33215343,/m/0h67l5w,Zindagi,1940,,120.0,"{""/m/03k50"": ""Hindi Language""}","{""/m/03rk0"": ""India""}","{""/m/0hqxf"": ""Family Film"", ""/m/07s9rl0"": ""Dra..."
40871,11353680,/m/02r8p2s,Zindagi,1964,,,"{""/m/03k50"": ""Hindi Language""}","{""/m/03rk0"": ""India""}","{""/m/0hqxf"": ""Family Film"", ""/m/07s9rl0"": ""Dra..."
74801,31543513,/m/0glpvqt,Zoetrope,,,73.0,{},{},"{""/m/02n4kr"": ""Mystery"", ""/m/01hmnh"": ""Fantasy""}"


In [101]:
summaries=summaries.dropna()

In [102]:
characters=characters.dropna(subset=['wikipedia_movie_id', 'character_name'])

Compare size of character, movie, and movie summary datasets.
Keep only the movies which index is common between the 3 sets

In [103]:
print("Number of movies:", movies.shape[0])
print("Number of summaries:", summaries.shape[0])
n_movie_in_characters = characters.drop_duplicates(subset='wikipedia_movie_id', keep='first')
print("Number of movies where we know the characters:", n_movie_in_characters.shape[0])

Number of movies: 81741
Number of summaries: 42303
Number of movies where we know the characters: 32571


In [104]:
common_index = movies['wikipedia_movie_id'].isin(summaries['wikipedia_movie_id']) & movies['wikipedia_movie_id'].isin(characters['wikipedia_movie_id'])
filtered_movies = movies[common_index]

In [105]:
print(filtered_movies.shape[0])

23068


In [106]:
tv_tropes.columns = ['trope', 'details']
tv_tropes['details'] = tv_tropes['details'].apply(eval)
tv_tropes = tv_tropes.join(pd.json_normalize(tv_tropes['details'])).drop(columns=['details'])

In [107]:
tv_tropes.sample(2)

Unnamed: 0,trope,char,movie,id,actor
83,casanova,Dorian Gray,The League of Extraordinary Gentlemen,/m/0dkcy1f,Stuart Townsend
111,corrupt_corporate_executive,Carter Burke,Aliens,/m/0jt8n6,Paul Reiser


In [108]:
tv_tropes.columns = [
    'trope',
    'character_name',          # Change `char` to `character_name`
    'movie_name',              # Change `movie` to `movie_name`
    'freebase_movie_id',       # Change `id` to `freebase_movie_id`
    'actor_name'               # Change `actor` to `actor_name`
]

In [109]:
trope_counts = tv_tropes['trope'].value_counts().reset_index()
trope_counts.columns = ['trope', 'count']

In [110]:
# Display the most common tropes
print("Most Common Tropes:")
print(trope_counts.head(5))  # Display top 10 for quick inspection

Most Common Tropes:
                         trope  count
0            crazy_jealous_guy     25
1  corrupt_corporate_executive     23
2                 byronic_hero     17
3              psycho_for_hire     16
4            father_to_his_men     15


In [111]:
#!pip install scipy

In [112]:
#!pip install --upgrade sympy transformers

In [113]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import pandas as pd

# Load the model and encode tropes
model = SentenceTransformer('all-MiniLM-L6-v2')
tropes = tv_tropes['trope'].tolist()  # Assuming tv_tropes is a DataFrame with a 'trope' column
trope_embeddings = model.encode(tropes)

# Set the number of clusters (archetypes) you want
num_clusters = 10  # Adjust this based on how many distinct categories you want

# Perform K-means clustering
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
labels = kmeans.fit_predict(trope_embeddings)

# Assign labels to each trope
tv_tropes['category'] = labels

# Remove duplicates within each category
category_summary = tv_tropes.groupby('category')['trope'].apply(lambda x: list(set(x)))

# Display the cleaned-up representative tropes for each category
print(category_summary)


category
0                               [doormat, ditz, klutz]
1    [tranquil_fury, gentleman_thief, granola_perso...
2    [playful_hacker, adventurer_archaeologist, ego...
3    [coward, jerk_jock, fastest_gun_in_the_west, a...
4    [stupid_crooks, corrupt_corporate_executive, m...
5    [henpecked_husband, big_man_on_campus, self_ma...
6    [officer_and_a_gentleman, drill_sargeant_nasty...
7    [grumpy_old_man, hitman_with_a_heart, crazy_je...
8    [brainless_beauty, dumb_blonde, dumb_muscle, p...
9    [dean_bitterman, storyteller, loser_protagonis...
Name: trope, dtype: object


### Category Interpretation

1. **Category 0**: `[doormat, ditz, klutz]`
   - **Interpretation**: This category likely represents **"awkward or submissive characters"**. Characters in this category might be shy, prone to accidents, or easy to push around, which could include "sidekick" or "comic relief" roles.

   => Category 0: Awkward

2. **Category 1**: `[tranquil_fury, gentleman_thief, granola_person]`
   - **Interpretation**: This group seems to represent **"calm but driven characters with a moral code"**. They could include characters who are calm on the surface but have a strong underlying purpose or anger, like a "gentle avenger" or "principled rebel."

   => Category 1: Principled


3. **Category 2**: `[playful_hacker, adventurer_archaeologist, egotistical_genius]`
   - **Interpretation**: This category likely captures **"adventurous, clever, and sometimes arrogant characters"**. These are individuals who are both intelligent and risk-takers, often fitting roles like "brilliant explorers" or "quirky geniuses."

   => Category 2: Adventurous


4. **Category 3**: `[coward, jerk_jock, fastest_gun_in_the_west, arrogant_kungfu_guy]`
   - **Interpretation**: This category appears to represent **"overconfident and sometimes antagonistic characters"**. These could be characters who are cocky, competitive, or even bullies, fitting stereotypes like "school bullies" or "rival fighters."

   => Category 3: Arrogant

5. **Category 4**: `[stupid_crooks, corrupt_corporate_executive, morally_bankrupt_banker]`
   - **Interpretation**: This category likely includes **"villainous or morally corrupt characters"**. Characters here may be criminals or unethical businesspeople, fitting the "greedy villains" or "corrupt officials" archetype.

   => Category 4: Villainous

6. **Category 5**: `[henpecked_husband, big_man_on_campus, self_made_man]`
   - **Interpretation**: This category might represent **"socially or traditionally defined men"**. These are characters who play into traditional roles or social expectations, like the "overbearing husband" or the "popular guy."

   => Category 5: Traditional

7. **Category 6**: `[officer_and_a_gentleman, drill_sargeant_nasty, tough_love_teacher]`
   - **Interpretation**: This category includes **"tough but disciplined authority figures"**. These are characters who might be stern or strict but often have a moral or mentoring purpose, such as "strict mentors" or "authoritarian leaders."

   => Category 6: Disciplinarian

8. **Category 7**: `[grumpy_old_man, hitman_with_a_heart, crazy_jealous_guy]`
   - **Interpretation**: This category likely captures **"tough or emotionally complex characters"**. Characters here might be gruff or rough around the edges but have softer or more complicated motives, like "reluctant protectors" or "emotionally intense characters."

   => Category 7: Complex

9. **Category 8**: `[brainless_beauty, dumb_blonde, dumb_muscle, pretty_boy]`
   - **Interpretation**: This category appears to represent **"attractive but stereotypically 'dim-witted' characters"**. These characters may rely on their looks more than intellect, often fitting roles like "eye candy" or "muscle without brains."

   => Category 8: Attractive

10. **Category 9**: `[dean_bitterman, storyteller, loser_protagonist]`
    - **Interpretation**: This group could include **"underdog or unconventional protagonists"**. These characters might not be typical heroes but often have their own unique appeal, like "unlikely heroes" or "underdogs with a story to tell."
  
   => Category 9: Underdog

In [114]:
import numpy as np

# Calculate centroids
centroids = kmeans.cluster_centers_

# Find the closest trope to each centroid
cluster_labels = []
for i, centroid in enumerate(centroids):
    distances = np.linalg.norm(trope_embeddings - centroid, axis=1)
    closest_index = np.argmin(distances)
    cluster_labels.append(tropes[closest_index])

for i, label in enumerate(cluster_labels):
    print(f"Archetype {i}: {label}")

Archetype 0: ditz
Archetype 1: byronic_hero
Archetype 2: bounty_hunter
Archetype 3: arrogant_kungfu_guy
Archetype 4: corrupt_corporate_executive
Archetype 5: father_to_his_men
Archetype 6: psycho_for_hire
Archetype 7: crazy_jealous_guy
Archetype 8: brainless_beauty
Archetype 9: stoner


We choose high-level categories (e.g., Awkward, Principled, Villainous) to provide a broad view of character types, making it easier to analyze trends across numerous movies efficiently. These categories capture general personality or role traits, allowing us to group characters without overwhelming detail.

In [115]:
# Define the categories
categories = [
    "Awkward",       # Category 0
    "Principled",    # Category 1
    "Adventurous",   # Category 2
    "Arrogant",      # Category 3
    "Villainous",    # Category 4
    "Traditional",   # Category 5
    "Disciplinarian",# Category 6
    "Complex",       # Category 7
    "Attractive",    # Category 8
    "Underdog"       # Category 9
]

print(categories)

['Awkward', 'Principled', 'Adventurous', 'Arrogant', 'Villainous', 'Traditional', 'Disciplinarian', 'Complex', 'Attractive', 'Underdog']


Add trope to each character. This will be an additional element for sentiment analysis.

In [116]:
merged_characters = characters.merge(tv_tropes[['character_name', 'trope', 'actor_name']],
                                      on=['character_name', 'actor_name'],
                                      how='left',
                                      indicator=True)

In [117]:
merged_characters.sample(2)

Unnamed: 0,wikipedia_movie_id,freebase_movie_id,movie_release_date,character_name,actor_birth,actor_gender,actor_height,actor_ethnicity,actor_name,actor_age,freebase_character_map,freebase_character_id,freebase_actor_id,trope,_merge
174504,17637638,/m/0465lhs,2008-05-30,Neel Singh,1970-12-03,M,1.79,/m/0dryh9k,Jimmy Shergill,37.0,/m/0h36mt1,/m/0h36mt4,/m/075r_k,,left_only
136734,10030059,/m/02p_byk,1980-08,John McVicar,1944-03-01,M,1.676,,Roger Daltrey,,/m/02vbxg7,/m/0c1ml9n,/m/01k_0fp,,left_only


To find entries in the tv_tropes DataFrame that don't have a corresponding entry in the characters

In [118]:
# Count entries in characters that do not have trope values
count_without_tropes = merged_characters[merged_characters['trope'].isnull()].shape[0]

# Display the count
print(f"Number of characters without trope values: {count_without_tropes}")


Number of characters without trope values: 192207


Since we have a lot of characters without trope values, we cannot rely only on the trope, we can use the summaries as well.

In [119]:
characters = merged_characters

In [83]:
import nltk

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

We will add the number of mentions in the plot summary of each character. Detecting character mentions enhances sentiment analysis by providing personalized insights and contextual understanding of how audiences feel about specific characters in the narrative. If it has no mention, then we cannot extract the category of the character from the plot.

In [120]:
characters['mentions']=int(0)

In [121]:
!pip install fuzzywuzzy



In [128]:
# Check unique entries in filtered_movies
unique_filtered_movies = filtered_movies['wikipedia_movie_id'].nunique()
total_filtered_movies = filtered_movies.shape[0]

# Check unique entries in summaries
unique_summaries = summaries['wikipedia_movie_id'].nunique()
total_summaries = summaries.shape[0]

print(f"Unique entries in filtered_movies: {unique_filtered_movies}, Total entries: {total_filtered_movies}")
print(f"Unique entries in summaries: {unique_summaries}, Total entries: {total_summaries}")

Unique entries in filtered_movies: 23068, Total entries: 23068
Unique entries in summaries: 42303, Total entries: 42303


All entries in both DataFrames are unique. We can merge on 'wikipedia_movie_id'

In [132]:
characters_with_tropes = characters[characters['trope'].notna()]

# 2. Identify movie IDs with summaries
movie_ids_with_summaries = set(summaries['wikipedia_movie_id'])

# 3. Keep characters that have either a trope or a summary
characters_with_summaries = characters[characters['wikipedia_movie_id'].isin(movie_ids_with_summaries)]

combined_characters = pd.concat([characters_with_tropes, characters_with_summaries]).drop_duplicates()
final_characters = combined_characters.drop_duplicates(subset='character_name')

print("Number of charcaters:", characters.shape[0])
print("Number of characters with either a trope or a summary:", final_characters.shape[0])

Number of charcaters: 192928
Number of characters with either a trope or a summary: 96235


In [133]:
character_counts_per_movie = final_characters.groupby('wikipedia_movie_id').size().reset_index(name='character_count')

# Display character counts per movie
print("\nNumber of characters per movie:")
print(character_counts_per_movie)


Number of characters per movie:
       wikipedia_movie_id  character_count
0                    3217               10
1                    3746               11
2                    3837               13
3                    3947                8
4                    4227                1
...                   ...              ...
20220            36724042                1
20221            36814246                5
20222            36956792               23
20223            37373877                2
20224            37501922                2

[20225 rows x 2 columns]


We can see numerous characters per movie. We only to primarily keep the main character actors. We will use the number of mentions to consider only the main characters, for this we will use name entity recognition and fuzzy matching.

In [127]:
import pandas as pd
import spacy
from fuzzywuzzy import fuzz
from collections import Counter
from concurrent.futures import ProcessPoolExecutor
import re

# Load the spaCy English model for NER
nlp = spacy.load("en_core_web_sm")

def process_movie(movie_data):
    """Process a single movie to count character mentions with enhanced techniques."""
    plot = movie_data['plot_summary']
    char_names = movie_data['character_names']

    # Use spaCy for Named Entity Recognition
    doc = nlp(plot)
    detected_names = [ent.text.strip().lower() for ent in doc.ents if ent.label_ == "PERSON"]

    # Count detected names
    name_counts = Counter(detected_names)

    # Initialize mention counts
    mentions = {name: 0 for name in char_names}

    # Count mentions for each character name using exact matches and fuzzy matching
    for name in char_names:
        normalized_name = name.strip().lower()
        mentions[name] = name_counts.get(normalized_name, 0)

        # Fuzzy matching for name variations
        for detected_name in detected_names:
            if fuzz.ratio(normalized_name, detected_name) > 80:  # Adjust threshold as needed
                mentions[name] += 1

    return mentions

def count_character_mentions(filtered_movies, summaries, characters):
    """Counts character mentions across multiple movies in parallel."""
    movie_data = []
    for i in filtered_movies.index[:200]:
        plot = summaries[summaries['wikipedia_movie_id'] == filtered_movies.loc[i]['wikipedia_movie_id']]['plot_summary'].values[0]
        char_names = characters[characters['wikipedia_movie_id'] == filtered_movies.loc[i]['wikipedia_movie_id']]['character_name'].tolist()
        movie_data.append({'plot_summary': plot, 'character_names': char_names})

    # Use ProcessPoolExecutor to parallelize the processing
    with ProcessPoolExecutor() as executor:
        results = list(executor.map(process_movie, movie_data))

    # Aggregate results back into the characters DataFrame
    for i, mentions in enumerate(results):
        for char_name, count in mentions.items():
            characters.loc[characters['character_name'] == char_name, 'mentions'] += count

    return characters

characters['mentions'] = 0

# Run the mention counting function
characters = count_character_mentions(filtered_movies, summaries, characters)

print(characters[['character_name', 'mentions']].head(10))

KeyboardInterrupt: 

In [None]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from transformers import pipeline

nltk.download('vader_lexicon')

We will perform sentiment analysis on movie plots using two methods: VADER and a transformer model. It processes the plots in chunks for the transformer and aggregates sentiment scores. Results for each character in the movies are collected into a DataFrame.

1. **Data Preparation**:
   - Merged character data with TV tropes data, ensuring to include all character entries and their associated tropes where available.
   - Added the number of mentions in the plot summary based on named entity recognition

2. **Sentiment Analysis Setup**:
   - Initialized VADER and a transformer-based sentiment analysis model (DistilBERT) to evaluate sentiment on plot summaries and character tropes.
   - Implemented a function to handle long texts by splitting them into manageable chunks for analysis.

3. **Character Sentiment Analysis**:
   - Iterated through filtered movies to extract plot summaries and character names.
   - Counted the number of mentions for each character in the plot.
   - Conducted sentiment analysis on the plot summaries **only** for characters with mentions greater than zero.

4. **Trope Sentiment Evaluation**:
   - For characters with associated tropes, performed sentiment analysis on those tropes **only** if they exist.
   - Included sentiment labels and scores for both the plot and the tropes in the results.

We will have better sentiment analysis models in the future.

In [None]:
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from transformers import pipeline

# Initialize VADER and the transformer-based sentiment analyzer with a specific model
sia = SentimentIntensityAnalyzer()
transformer_model = "distilbert-base-uncased-finetuned-sst-2-english"
transformer_sia = pipeline("sentiment-analysis", model=transformer_model)

# Initialize an empty list to store results
results = []

# Function to analyze sentiment for long texts
def analyze_long_text(text):
    max_length = 512
    chunks = [text[i:i + max_length] for i in range(0, len(text), max_length)]
    sentiments = []

    for chunk in chunks:
        if chunk:  # Only analyze non-empty chunks
            sentiment = transformer_sia(chunk)[0]  # Get the first result
            sentiments.append(sentiment)

    if sentiments:
        avg_score = sum([sentiment['score'] for sentiment in sentiments]) / len(sentiments)
        avg_label = 'POSITIVE' if avg_score > 0.5 else 'NEGATIVE' if avg_score < 0.5 else 'NEUTRAL'
    else:
        avg_score = 0
        avg_label = 'NEUTRAL'

    return avg_label, avg_score

# Iterate over the filtered movies
for i in filtered_movies.index[:50]:  # Limit to first 50 movies for demonstration
    # Get the plot summary for the current movie
    plot = str(summaries[summaries['wikipedia_movie_id'] == filtered_movies.loc[i]['wikipedia_movie_id']]['plot_summary'].values[0])

    # Get the character names for the current movie
    char_names = characters[characters['wikipedia_movie_id'] == filtered_movies.loc[i]['wikipedia_movie_id']]['character_name']

    # Perform sentiment analysis using VADER
    vader_sentiment = sia.polarity_scores(plot)
    vader_sentiment_label = 'positive' if vader_sentiment['compound'] > 0 else 'negative' if vader_sentiment['compound'] < 0 else 'neutral'

    # Store results for each character in the current movie
    for char_name in char_names:
        mentions = characters.loc[characters['character_name'] == char_name, 'mentions'].values[0]

        # Initialize trope sentiment variables
        trope_sentiment_label = 'N/A'
        trope_sentiment_score = None

        # Check for trope sentiment
        trope = tv_tropes.loc[tv_tropes['character_name'] == char_name, 'trope'].values

        if len(trope) > 0:  # If trope exists
            trope_text = trope[0]  # Assuming you want the first trope
            trope_sentiment_label, trope_sentiment_score = analyze_long_text(trope_text)

        if mentions > 0:
            # Perform sentiment analysis on the plot if mentions > 0
            transformer_sentiment_label, transformer_sentiment_score = analyze_long_text(plot)
            results.append({
                'character_name': char_name,
                'movie_name': filtered_movies.loc[i]['movie_name'],
                'plot': plot,  # Include the plot in the results
                'mentions': mentions,  # Include mentions count
                'vader_sentiment_label': vader_sentiment_label,
                'vader_sentiment_score': vader_sentiment['compound'],
                'transformer_sentiment_label': transformer_sentiment_label,
                'transformer_sentiment_score': transformer_sentiment_score,
                'trope': trope[0] if trope.size > 0 else None,  # Include the trope
                'trope_sentiment_label': trope_sentiment_label,  # Include trope sentiment
                'trope_sentiment_score': trope_sentiment_score  # Include trope sentiment score
            })
        else:
            # If no mentions, just store the character name and movie name with N/A for sentiment
            results.append({
                'character_name': char_name,
                'movie_name': filtered_movies.loc[i]['movie_name'],
                'plot': plot,  # Include the plot in the results
                'mentions': mentions,  # Include mentions count
                'vader_sentiment_label': 'N/A',
                'vader_sentiment_score': None,
                'transformer_sentiment_label': 'N/A',
                'transformer_sentiment_score': None,
                'trope': trope[0] if trope.size > 0 else None,  # Include the trope
                'trope_sentiment_label': trope_sentiment_label,  # Include trope sentiment
                'trope_sentiment_score': trope_sentiment_score  # Include trope sentiment score
            })

sentiment_df = pd.DataFrame(results)

We will then see how we are going to merge these values, we will keep only those who have either a trope or if their name have been mentioned in the plot summary.

In [None]:
filtered_sentiment_df = sentiment_df[
    ((sentiment_df['mentions'] > 0) &
    ((sentiment_df['vader_sentiment_label'] != 'N/A') |
    (sentiment_df['transformer_sentiment_label'] != 'N/A'))) |
    (sentiment_df['trope_sentiment_label'] != 'N/A')
]

In [None]:
filtered_sentiment_df.head(2)

In [None]:
# Filter to show only rows with a valid trope sentiment
trope_filtered_df = filtered_sentiment_df[filtered_sentiment_df['trope_sentiment_label'] != 'N/A']

# Display the first two rows of the filtered DataFrame
trope_filtered_df.head(2)