In [None]:
# Colab only: mount Google Drive under /content/drive so files stored there can be accessed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
cd /content/drive/MyDrive

In [3]:
# Name of the project folder; the following cell changes the working directory into it.
repo = 'ada-2024-project-adarable'

In [None]:
cd {repo}

In [None]:
import pandas as pd
import re
import ast
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Fetch the NLTK resources used in this notebook (only needs to run once per environment):
# tokenizer models, stop-word lists and the WordNet data used for lemmatization.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Shared NLP helpers
stop_words = set(stopwords.words('english'))  # English stop words as a set
lemmatizer = WordNetLemmatizer()              # lemmatization
stemmer = PorterStemmer()                     # stemming (optional alternative)

In [16]:
def _read_tsv(path):
    """Load a header-less, tab-separated file into a DataFrame."""
    return pd.read_csv(path, sep='\t', header=None)


# Raw MovieSummaries tables; the files have no header row, so columns are numbered for now
movies = _read_tsv('../data/MovieSummaries/movie.metadata.tsv')
characters = _read_tsv('../data/MovieSummaries/character.metadata.tsv')

name_clusters = _read_tsv('../data/MovieSummaries/name.clusters.txt')
summaries = _read_tsv('../data/MovieSummaries/plot_summaries.txt')
tv_tropes = _read_tsv('../data/MovieSummaries/tvtropes.clusters.txt')

In [None]:
# Preview two randomly chosen rows of the movie metadata
movies.sample(n=2)

In [None]:
# Preview two randomly chosen rows of the character metadata
characters.sample(n=2)

In [None]:
# Preview two randomly chosen rows of the name clusters
name_clusters.sample(n=2)

In [None]:
# Preview two randomly chosen plot summaries
summaries.sample(n=2)

In [None]:
# Preview two randomly chosen rows of the trope clusters
tv_tropes.sample(n=2)

In [17]:
# Give the header-less tables readable column names (one name per positional column).
movies.columns = [
    'wikipedia_movie_id',
    'freebase_movie_id',
    'movie_name',
    'movie_release_date',
    'movie_box_office_revenue',
    'movie_runtime',
    'movie_languages',
    'movie_countries',
    'movie_genres',
]

# NOTE(review): the second column of name.clusters.txt may be the Freebase
# character/actor map id rather than a movie id — confirm against the dataset README.
name_clusters.columns = ['name', 'freebase_movie_id']

summaries.columns = ['wikipedia_movie_id', 'plot_summary']

characters.columns = [
    'wikipedia_movie_id',
    'freebase_movie_id',
    'movie_release_date',
    'character_name',
    'actor_birth',
    'actor_gender',
    'actor_height',
    'actor_ethnicity',
    'actor_name',
    'actor_age',
    'freebase_character_map',
    'freebase_character_id',
    'freebase_actor_id',
]


In [None]:
# Count repeated values in each column that could identify a movie
duplicate_checks = (
    ("number of duplicated according to Wikipedia id : ", 'wikipedia_movie_id'),
    ("number of duplicated according to freebase movie id : ", 'freebase_movie_id'),
    ("number of duplicated according to title of the movie : ", 'movie_name'),
)
for message, column in duplicate_checks:
    print(message, movies[column].duplicated().sum())

# Look at every movie whose title is shared with at least one other movie.
# Movies with the same name do not have the same release date, so they are kept
# as they are; they likely represent different versions of the same movie.
shares_title = movies['movie_name'].duplicated(keep=False)
movies[shares_title].sort_values('movie_name')

In [18]:
# Discard every summary row that has a missing value in any column
summaries = summaries.dropna(axis=0, how='any')

In [19]:
# Keep only character rows that have both a movie id and a character name
required_columns = ['wikipedia_movie_id', 'character_name']
characters = characters.dropna(subset=required_columns)

Compare the sizes of the character, movie, and plot-summary datasets.
Keep only the movies whose Wikipedia ID appears in all three sets.

In [None]:
# Report how many movies each table covers
print("Number of movies:", len(movies))
print("Number of summaries:", len(summaries))
# One row per distinct movie id found in the character table
n_movie_in_characters = characters.drop_duplicates(subset='wikipedia_movie_id', keep='first')
print("Number of movies where we know the characters:", len(n_movie_in_characters))

In [21]:
# Boolean mask over `movies`: True where the movie has both a summary and at least one character
movie_ids = movies['wikipedia_movie_id']
has_summary = movie_ids.isin(summaries['wikipedia_movie_id'])
has_characters = movie_ids.isin(characters['wikipedia_movie_id'])
common_index = has_summary & has_characters
filtered_movies = movies.loc[common_index]

In [None]:
# Number of movies present in all three tables
print(len(filtered_movies))

In [22]:
tv_tropes.columns = ['trope', 'details']
# Each `details` cell holds a dict written out as text. Parse it with
# ast.literal_eval, which only accepts Python literals, instead of eval, which
# would execute any code that happened to be in the data file.
tv_tropes['details'] = tv_tropes['details'].apply(ast.literal_eval)
# Expand the parsed dicts into one column per key and drop the raw column
tv_tropes = tv_tropes.join(pd.json_normalize(tv_tropes['details'])).drop(columns=['details'])

In [None]:
# Preview two randomly chosen rows of the expanded trope table
tv_tropes.sample(n=2)

In [23]:
# Rename the expanded trope columns by name rather than by position, so a
# different key order in the source file cannot silently mislabel a column.
# NOTE(review): `id` may be the Freebase character/actor map id rather than a
# movie id — confirm against the dataset README before joining on it.
tv_tropes = tv_tropes.rename(columns={
    'char': 'character_name',
    'movie': 'movie_name',
    'id': 'freebase_movie_id',
    'actor': 'actor_name',
})

In [25]:
# Frequency table of tropes, most common first, with columns `trope` and `count`
trope_counts = (
    tv_tropes['trope']
    .value_counts()
    .rename_axis('trope')
    .reset_index(name='count')
)

In [None]:
# Show the five most frequent tropes for a quick inspection
print("Most Common Tropes:")
top_tropes = trope_counts.head(5)
print(top_tropes)

Add trope to each character. This will be an additional element for sentiment analysis.

In [166]:
# Left join: every character row is kept, and `trope` is filled only where
# both the character name and the actor name match a tv_tropes entry.
trope_lookup = tv_tropes[['character_name', 'trope', 'actor_name']]
merged_characters = pd.merge(
    characters,
    trope_lookup,
    how='left',
    on=['character_name', 'actor_name'],
)

In [None]:
# Preview two randomly chosen rows of the merged character table
merged_characters.sample(n=2)

Find the entries in the `characters` DataFrame that have no corresponding trope in the `tv_tropes` DataFrame.

In [None]:
# Diagnostic only: left-merge on the character name and use the merge indicator
# to count the characters for which no trope was found. The result is stored in
# its own variable so that it does not overwrite `merged_characters` (the
# name + actor merge), which a later cell assigns to `characters`.
trope_match_check = characters.merge(tv_tropes[['character_name', 'trope']],
                                     on='character_name',
                                     how='left',
                                     indicator=True)

# Rows tagged 'left_only' found no tv_tropes entry with the same character name
count_without_tropes = int((trope_match_check['_merge'] == 'left_only').sum())

# Display the count
print(f"Number of characters without trope values: {count_without_tropes}")


In [169]:
# Continue with the trope-augmented frame. This binds a second name to the same
# object (no copy is made), so later in-place edits made through `characters`
# are also visible through `merged_characters`.
characters = merged_characters

In [None]:
import nltk

# NLTK data for tokenization, part-of-speech tagging and named-entity chunking
for resource in ('punkt', 'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words'):
    nltk.download(resource)

We will add, for each character, the number of times it is mentioned in the plot summary. Detecting character mentions enhances sentiment analysis by providing personalized insights and contextual understanding of how audiences feel about specific characters in the narrative. If a character is never mentioned, we cannot extract any sentiment about it from the plot.

In [None]:
# Every character starts with zero plot mentions
characters['mentions'] = 0

In [171]:
# The NER helpers are imported here because no earlier cell brings them into scope.
from nltk import ne_chunk, pos_tag
from nltk.tree import Tree

# Only the first N_MENTION_MOVIES movies are processed
# (presumably to keep the NER pass short — raise it for a full run).
N_MENTION_MOVIES = 200


def _extract_entity_names(plot):
    """Return the named entities NLTK finds in `plot`, each as a list of lower-cased words."""
    entities = []
    for node in ne_chunk(pos_tag(word_tokenize(plot))):
        # Only the Tree nodes of the chunker output are entities; other nodes are skipped.
        if isinstance(node, Tree):
            entities.append([token.lower() for token, _tag in node.leaves()])
    return entities


def _group_mentions(entities):
    """Group entity mentions that appear to refer to the same person.

    Only one- and two-word names are considered; longer names are skipped.
    Returns a dict mapping the (first_name, last_name) of the first mention of
    each group to the number of mentions collected in that group. Matching is
    by substring, as in the original implementation.
    """
    groups = {}
    for parts in entities:
        if len(parts) == 1:
            first_name, last_name = parts[0], ""
        elif len(parts) == 2:
            first_name, last_name = parts
        else:
            # Skip this name only; the remaining names must still be processed.
            continue

        match_found = False
        for key in groups:
            if last_name == "":
                # A lone name is ambiguous, so it is counted in every group it fits.
                if first_name in key[0] or first_name in key[1]:
                    groups[key] += 1
                    match_found = True
            elif (first_name in key[0] or last_name in key[1]
                  or first_name in key[1] or last_name in key[0]):
                groups[key] += 1
                match_found = True
                break

        if not match_found:
            groups[(first_name, last_name)] = 1
    return groups


def _mentions_for(char_name, groups):
    """Return the size of the first group sharing a word with `char_name`, or 0 if none does."""
    words = char_name.strip().lower().split()
    for key, count in groups.items():
        if any(word in key for word in words):
            return count
    return 0


for movie_idx in filtered_movies.index[:N_MENTION_MOVIES]:
    movie_id = filtered_movies.loc[movie_idx, 'wikipedia_movie_id']

    # Take the summary text itself; str() of the whole Series would yield its
    # truncated printed representation instead of the plot.
    plot_rows = summaries.loc[summaries['wikipedia_movie_id'] == movie_id, 'plot_summary']
    plot = str(plot_rows.iloc[0]) if not plot_rows.empty else ""

    # Detect and group names only when the summary has at least two words
    name_groups = _group_mentions(_extract_entity_names(plot)) if len(plot.split()) >= 2 else {}

    # Restrict every write to the rows of the current movie, so that same-named
    # characters in other movies keep their own counts.
    in_movie = characters['wikipedia_movie_id'] == movie_id
    for char_name in characters.loc[in_movie, 'character_name'].unique():
        is_character = in_movie & (characters['character_name'] == char_name)
        characters.loc[is_character, 'mentions'] = _mentions_for(char_name, name_groups)

In [None]:
# Sentiment-analysis dependencies: NLTK's VADER analyzer and the `transformers` pipeline helper
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from transformers import pipeline

# Lexicon resource for the VADER analyzer
nltk.download('vader_lexicon')

We will perform sentiment analysis on movie plots using two methods: VADER and a transformer model. For the transformer, each plot is processed in chunks and the chunk-level sentiment scores are aggregated. The results for each character in the movies are collected into a DataFrame.

1. **Data Preparation**:
   - Merged character data with TV tropes data, ensuring to include all character entries and their associated tropes where available.
   - Added the number of mentions in the plot summary based on named entity recognition

2. **Sentiment Analysis Setup**:
   - Initialized VADER and a transformer-based sentiment analysis model (DistilBERT) to evaluate sentiment on plot summaries and character tropes.
   - Implemented a function to handle long texts by splitting them into manageable chunks for analysis.

3. **Character Sentiment Analysis**:
   - Iterated through filtered movies to extract plot summaries and character names.
   - Counted the number of mentions for each character in the plot.
   - Conducted sentiment analysis on the plot summaries **only** for characters with mentions greater than zero.

4. **Trope Sentiment Evaluation**:
   - For characters with associated tropes, performed sentiment analysis on those tropes **only** if they exist.
   - Included sentiment labels and scores for both the plot and the tropes in the results.

We will have better sentiment analysis models in the future.

In [None]:
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from transformers import pipeline

# Sentiment back-ends: VADER and a transformer classifier loaded from a fixed checkpoint
transformer_model = "distilbert-base-uncased-finetuned-sst-2-english"
sia = SentimentIntensityAnalyzer()
transformer_sia = pipeline(task="sentiment-analysis", model=transformer_model)

# Collects one dict per (movie, character) pair
results = list()

# Function to analyze sentiment for long texts
def analyze_long_text(text, max_length=512, classifier=None):
    """Classify the overall sentiment of a text of arbitrary length.

    The text is cut into consecutive slices of at most `max_length` characters,
    each slice is classified separately, and the slice results are combined.

    A classifier reports the confidence of whichever label it predicted, so the
    raw scores of differently-labelled slices cannot be averaged directly.
    Each result is first converted to a positive-class probability
    (`score` for a POSITIVE slice, `1 - score` otherwise) and those are averaged.

    Parameters
    ----------
    text : str
        Text to analyze. Empty or non-string input yields ('NEUTRAL', 0).
    max_length : int, optional
        Maximum number of characters per slice (default 512).
    classifier : callable, optional
        Called as `classifier(chunk)`; must return a sequence whose first item
        is a dict with 'label' and 'score'. Defaults to the module-level
        `transformer_sia` pipeline. Labels other than 'POSITIVE' (any casing)
        are treated as negative.

    Returns
    -------
    (label, score) : tuple
        `label` is 'POSITIVE', 'NEGATIVE' or 'NEUTRAL'; `score` is the averaged
        confidence in that label (0.5 for a tie, 0 when nothing was analyzed).
    """
    if classifier is None:
        classifier = transformer_sia

    if not isinstance(text, str) or not text:
        return 'NEUTRAL', 0

    positive_probabilities = []
    for start in range(0, len(text), max_length):
        result = classifier(text[start:start + max_length])[0]  # first (only) result
        confidence = result['score']
        if str(result['label']).upper() == 'POSITIVE':
            positive_probabilities.append(confidence)
        else:
            positive_probabilities.append(1.0 - confidence)

    positive_probability = sum(positive_probabilities) / len(positive_probabilities)
    if positive_probability > 0.5:
        return 'POSITIVE', positive_probability
    if positive_probability < 0.5:
        return 'NEGATIVE', 1.0 - positive_probability
    return 'NEUTRAL', positive_probability

# Iterate over the filtered movies and build one result row per (movie, character)
for movie_idx in filtered_movies.index[:50]:  # Limit to first 50 movies for demonstration
    movie = filtered_movies.loc[movie_idx]
    movie_id = movie['wikipedia_movie_id']
    movie_name = movie['movie_name']

    # Get the plot summary for the current movie
    plot = str(summaries.loc[summaries['wikipedia_movie_id'] == movie_id, 'plot_summary'].values[0])

    # Perform sentiment analysis using VADER (once per movie)
    vader_sentiment = sia.polarity_scores(plot)
    vader_compound = vader_sentiment['compound']
    if vader_compound > 0:
        vader_sentiment_label = 'positive'
    elif vader_compound < 0:
        vader_sentiment_label = 'negative'
    else:
        vader_sentiment_label = 'neutral'

    # The transformer result depends only on the plot, so it is computed at
    # most once per movie, the first time a mentioned character needs it.
    plot_sentiment = None

    # Characters of the current movie, one row per character name. The trope
    # merge can repeat a character, which would otherwise yield duplicate rows.
    movie_characters = (
        characters.loc[characters['wikipedia_movie_id'] == movie_id]
        .drop_duplicates(subset='character_name')
    )

    for char_name, mentions in zip(movie_characters['character_name'], movie_characters['mentions']):
        # Trope sentiment: use the first trope recorded under this character name, if any
        trope_matches = tv_tropes.loc[tv_tropes['character_name'] == char_name, 'trope'].values
        if len(trope_matches) > 0:
            trope = trope_matches[0]
            trope_sentiment_label, trope_sentiment_score = analyze_long_text(trope)
        else:
            trope = None
            trope_sentiment_label, trope_sentiment_score = 'N/A', None

        # Plot sentiment is only reported for characters mentioned in the plot
        if mentions > 0:
            if plot_sentiment is None:
                plot_sentiment = analyze_long_text(plot)
            transformer_sentiment_label, transformer_sentiment_score = plot_sentiment
            character_vader_label, character_vader_score = vader_sentiment_label, vader_compound
        else:
            transformer_sentiment_label, transformer_sentiment_score = 'N/A', None
            character_vader_label, character_vader_score = 'N/A', None

        results.append({
            'character_name': char_name,
            'movie_name': movie_name,
            'plot': plot,  # Include the plot in the results
            'mentions': mentions,  # Include mentions count
            'vader_sentiment_label': character_vader_label,
            'vader_sentiment_score': character_vader_score,
            'transformer_sentiment_label': transformer_sentiment_label,
            'transformer_sentiment_score': transformer_sentiment_score,
            'trope': trope,  # Include the trope
            'trope_sentiment_label': trope_sentiment_label,  # Include trope sentiment
            'trope_sentiment_score': trope_sentiment_score  # Include trope sentiment score
        })

sentiment_df = pd.DataFrame(results)

We will then decide how to merge these values. For now, we keep only the characters that either have a trope or are mentioned by name in the plot summary.

In [192]:
# Keep a row when the character is mentioned and has a plot sentiment from
# either analyzer, or when the character has a trope sentiment.
is_mentioned = sentiment_df['mentions'] > 0
has_vader = sentiment_df['vader_sentiment_label'] != 'N/A'
has_transformer = sentiment_df['transformer_sentiment_label'] != 'N/A'
has_trope_sentiment = sentiment_df['trope_sentiment_label'] != 'N/A'

keep_row = (is_mentioned & (has_vader | has_transformer)) | has_trope_sentiment
filtered_sentiment_df = sentiment_df[keep_row]

In [None]:
# Preview the first two retained rows
filtered_sentiment_df.head(n=2)

In [None]:
# Keep only the rows that carry a trope sentiment
with_trope_sentiment = filtered_sentiment_df['trope_sentiment_label'].ne('N/A')
trope_filtered_df = filtered_sentiment_df[with_trope_sentiment]

# Preview the first two of those rows
trope_filtered_df.head(2)