In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
cd /content/drive/MyDrive/ada-2024-project-adarable/src

/content/drive/MyDrive/ada-2024-project-adarable/src


In [4]:
import pandas as pd
import numpy as np
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline
from joblib import Parallel, delayed

# Introduction: Plot Structure Analysis for Movie Summaries

The goal of this notebook is to **process movie plot summaries to identify their underlying plot structures**. By categorizing each summary according to distinct narrative patterns, we aim to gain insights into common plot structures and explore potential correlations with financial success.

To achieve this, we experimented with **two different approaches**:

1. **Clustering**: We used unsupervised clustering (KMeans) on plot summaries to explore any emergent plot structure patterns.

2. **Large Language Model (LLM) Classification**: Using a predefined set of 15 plot structure categories, we use a LLM to classify each summary. This classification approach uses zero-shot prompting to assign each summary to a category.

# Importing the data

In [None]:
movies = pd.read_csv('../data/processed/merged_movies.csv')
summaries = pd.read_csv('../data/processed/summaries_preprocessed.csv')

In [None]:
movies.head()

Unnamed: 0,wikipedia_movie_id,freebase_movie_id,movie_name,movie_release_date,movie_box_office_revenue,movie_runtime,movie_languages,movie_countries,movie_genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,18998739,/m/04jcqvw,The Sorcerer's Apprentice,2002,63143812.0,86.0,"{""/m/02h40lc"": ""English Language""}","{""/m/0hzlz"": ""South Africa""}","{""/m/0hqxf"": ""Family Film"", ""/m/01hmnh"": ""Fant..."
2,10408933,/m/02qc0j7,Alexander's Ragtime Band,1938-08-16,3600000.0,106.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/04t36"": ""Musical"", ""/m/01z4y"": ""Comedy"", ..."
3,171005,/m/016ywb,Henry V,1989-11-08,10161099.0,137.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/04xvh5"": ""Costume drama"", ""/m/082gq"": ""Wa..."
4,77856,/m/0kcn7,Mary Poppins,1964-08-27,102272727.0,139.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/0hj3myq"": ""Children's/Family"", ""/m/04t36""..."


In [None]:
summaries.head()

Unnamed: 0,wikipedia_movie_id,plot_summary,clean_plot_summary,tokenized_plot_summary,filtered_tokens,lemmatized_tokens
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha...",shlykov a hardworking taxi driver and lyosha a...,"['shlykov', 'a', 'hardworking', 'taxi', 'drive...","['shlykov', 'hardworking', 'taxi', 'driver', '...","['shlykov', 'hardworking', 'taxi', 'driver', '..."
1,31186339,The nation of Panem consists of a wealthy Capi...,the nation of panem consists of a wealthy capi...,"['the', 'nation', 'of', 'panem', 'consists', '...","['nation', 'panem', 'consists', 'wealthy', 'ca...","['nation', 'panem', 'consists', 'wealthy', 'ca..."
2,20663735,Poovalli Induchoodan is sentenced for six yea...,poovalli induchoodan is sentenced for six year...,"['poovalli', 'induchoodan', 'is', 'sentenced',...","['poovalli', 'induchoodan', 'sentenced', 'six'...","['poovalli', 'induchoodan', 'sentenced', 'six'..."
3,2231378,"The Lemon Drop Kid , a New York City swindler,...",the lemon drop kid a new york city swindler is...,"['the', 'lemon', 'drop', 'kid', 'a', 'new', 'y...","['lemon', 'drop', 'kid', 'new', 'york', 'city'...","['lemon', 'drop', 'kid', 'new', 'york', 'city'..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...,seventhday adventist church pastor michael cha...,"['seventhday', 'adventist', 'church', 'pastor'...","['seventhday', 'adventist', 'church', 'pastor'...","['seventhday', 'adventist', 'church', 'pastor'..."


We keep only the summaries of the movies we have. We filter thanks to the `wikipedia_movie_id` feature

In [1]:
common_summaries = summaries[summaries['wikipedia_movie_id'].isin(movies['wikipedia_movie_id'])]

NameError: name 'summaries' is not defined

# 1. Clustering

## 1.1 Clustering only with the plot summaries

First, we transform the plot summaries into a numerical format for clustering by applying **TF-IDF (Term Frequency-Inverse Document Frequency) vectorization**. TF-IDF highlights important words in each summary by reducing the weight of common terms and increasing the importance of unique terms.

In [None]:
# Text Vectorization with TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=3000, stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(common_summaries['clean_plot_summary'])

# # Optional Step: Adding Genre Data (if available) to Enrich Clustering
# # Encode genre into numerical format and normalize it
# if 'movie_genres' in summaries.columns:
#     encoder = LabelEncoder()
#     summaries['genre_encoded'] = encoder.fit_transform(summaries['movie_genres'].fillna(''))
#     genre_embedding = np.expand_dims(summaries['genre_encoded'] / summaries['genre_encoded'].max(), axis=1)
#     combined_matrix = np.hstack((tfidf_matrix.toarray(), genre_embedding))
# else:
#     combined_matrix = tfidf_matrix.toarray()
combined_matrix = tfidf_matrix.toarray()

# # Step 3: Dimensionality Reduction (optional)
# pca = PCA(n_components=50)  # Reduce dimensions to speed up clustering
# reduced_matrix = pca.fit_transform(combined_matrix)

Now, we use **KMeans clustering** to group the plot summaries based on their TF-IDF representations. This step aims to identify distinct plot structure patterns by clustering similar summaries together.

**Parameters**:
- **n_clusters=10**: Specifies the number of clusters, initially set to 10. This value can be adjusted depending on the optimal number of clusters.
- **random_state=0**: Ensures reproducibility of clustering results.

The clustering labels are added to the dataset, allowing us to analyze plot structure patterns within each identified cluster.

To determine the optimal number of clusters, we plotted the **silhouette score** for cluster values ranging from 5 to 30. Typically, an ideal number of clusters shows a **peak in the silhouette score**.

However, in our plot, the silhouette score continually increases as the number of clusters increases.

Given these results, we will proceed with **15 clusters**. This number provides a balance between interpretability and granularity, allowing us to capture a range of plot structures without creating an excessive number of small, indistinct clusters.

In [None]:
from sklearn.metrics import silhouette_score
silhouette_scores = []
for i in range(5, 30):
    kmeans = KMeans(n_clusters=i, random_state=0)
    labels = kmeans.fit_predict(combined_matrix)
    score = silhouette_score(combined_matrix, labels)
    silhouette_scores.append(score)

plt.plot(range(5, 30), silhouette_scores, marker='o')
plt.xlabel("Number of Clusters")
plt.ylabel("Silhouette Score")
plt.title("Silhouette Score for Optimal Cluster Number")
plt.show()

KeyboardInterrupt: 

In [None]:
# Clustering with KMeans
n_clusters = 15 # Number of clusters to experiment with, can be adjusted
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
common_summaries['plot_structure_cluster'] = kmeans.fit_predict(combined_matrix)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  common_summaries['plot_structure_cluster'] = kmeans.fit_predict(combined_matrix)


In [None]:
common_summaries['plot_structure_cluster'].value_counts()

Unnamed: 0_level_0,count
plot_structure_cluster,Unnamed: 1_level_1
2,2593
10,1224
7,1210
6,953
12,922
14,420
4,243
8,213
11,144
5,141


The **distribution of plot summaries across clusters** shows that the clustering algorithm has created some clusters with a significantly higher number of summaries than others. The top three clusters (2, 10, and 7) collectively hold a large portion of the summaries, indicating that certain plot structures may be more common. We have to dive more in the clusters.

**Top Terms per Cluster**

To gain a better understanding of each cluster's underlying plot structure, we examine the **top terms** associated with each cluster. By averaging the **TF-IDF values** of terms within each cluster, we identify the 10 most representative words for each group. This allows us to interpret the clusters.

In [None]:
# Cluster Interpretation
# Get the top terms per cluster by averaging the TF-IDF values of the terms in each cluster
terms = tfidf_vectorizer.get_feature_names_out()
cluster_centers = kmeans.cluster_centers_
top_terms_per_cluster = []

for i in range(n_clusters):
    top_terms_idx = cluster_centers[i].argsort()[-10:]  # Top 10 terms per cluster
    top_terms_per_cluster.append([terms[idx] for idx in top_terms_idx])

# Display top terms per cluster for interpretation
for i, terms in enumerate(top_terms_per_cluster):
    print(f"Cluster {i+1} top terms: {', '.join(terms)}")



Cluster 1 top terms: finds, home, mary, tells, charley, jane, peters, jerry, ray, peter
Cluster 2 top terms: kevin, plane, money, david, kate, base, agent, maxs, bond, max
Cluster 3 top terms: wife, relationship, man, woman, young, story, new, life, love, film
Cluster 4 top terms: father, home, annie, charlie, alice, children, son, money, car, chris
Cluster 5 top terms: duke, palace, kingdom, kings, princess, castle, prince, queen, mary, king
Cluster 6 top terms: wife, love, greg, father, family, new, life, tells, jacks, jack
Cluster 7 top terms: killed, earth, captain, mission, men, soldiers, crew, army, ship, war
Cluster 8 top terms: man, killer, case, money, car, gang, murder, charlie, harry, police
Cluster 9 top terms: home, new, family, car, father, sams, michaels, jake, sam, michael
Cluster 10 top terms: goes, sarah, henry, family, new, jerry, film, father, house, tom
Cluster 11 top terms: tells, home, johnny, town, sarah, mr, alex, george, ben, house
Cluster 12 top terms: man, c

### Interpretation of Cluster Themes

Here’s an interpretation of each cluster based on the top terms:

- **Cluster 1**: character-driven plots centered around relationships.
- **Cluster 2**: action or espionage themes, maybe involving military or spy-related narratives.
- **Cluster 3**: romantic or life stories focused on relationships.
- **Cluster 4**: Plots around family reliationships.
- **Cluster 5**: royalty and historical or fantasy settings with themes of nobility.
- **Cluster 6**: domestic life or family-centered stories.
- **Cluster 7**: military or wartime narratives with themes of heroism or survival.
- **Cluster 8**: crime or thriller themes, often involving investigation or criminal pursuits.
- **Cluster 9**: domestic or personal stories with a focus on family.
- **Cluster 10**: character-centered family narratives with elements of drama or personal growth.
- **Cluster 11**: small-town settings or domestic life with local interpersonal relationships.
- **Cluster 12**: crime or drama narratives involving family or authority figures.
- **Cluster 13**: family dynamics or coming-of-age themes.
- **Cluster 14**: domestic drama with possible criminal elements.
- **Cluster 15**: school or educational settings with themes of growth, learning, or competition.

Each cluster reveals distinct themes and settings. While this analysis helps to identify common elements within each group, **we are not fully satisfied with this approach** as it appears to capture **genre and themes more than specific plot structures**.

Since our goal is to identify different types of plot structures, clustering based solely on keywords may lack the depth needed to capture narrative progression and plot dynamics. Consequently, we explore alternative methods, such as leveraging large language models or deeper natural language processing techniques, to classify plot structures more accurately.

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline


# Define model cache directory
cache_dir = "../LLM_cache"

# Create the cache directory if it does not exist
os.makedirs(cache_dir, exist_ok=True)

model_name = "google/flan-t5-large"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, cache_dir=cache_dir)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)

# Initialize the classifier pipeline
classifier = pipeline("text2text-generation", model=model, tokenizer=tokenizer, framework="pt")


# Define the complete list of categories
predefined_categories = [
    "Hero’s Journey and Transformation",
    "Quest for Vengeance or Justice",
    "Coming of Age and Self-Discovery",
    "Survival or Escape",
    "Rise and Fall of a Protagonist",
    "Love and Relationship Dynamics",
    "Comedy of Errors or Misadventure",
    "Crime and Underworld Exploration",
    "Power Struggle and Betrayal",
    "Mystery and Conspiracy Unveiling",
    "Tragedy and Inevitability",
    "Conflict with Supernatural or Unknown Forces",
    "Comedy in Domestic Life",
    "Social Rebellion or Fight Against Oppression",
    "Fantasy or Science Fiction Quest"
]

# Long prompt
# prompt = """
# Classify the following plot summary into one of these categories:
# 1. Hero’s Journey and Transformation: The protagonist undergoes personal growth, often starting as an ordinary individual who faces challenges, gains allies, overcomes obstacles, and returns transformed.
# 2. Quest for Vengeance or Justice: A revenge-driven plot where the protagonist seeks retribution or justice for a past wrong or injustice.
# 3. Coming of Age and Self-Discovery: The protagonist matures or gains self-awareness, often overcoming personal or societal obstacles.
# 4. Survival or Escape: The story revolves around characters trying to survive dangerous situations or escape captivity.
# 5. Rise and Fall of a Protagonist: The protagonist experiences a rise to power or success, followed by a tragic or inevitable downfall.
# 6. Love and Relationship Dynamics: Focuses on romantic or family relationships, often dealing with misunderstandings, unions, reconciliations, or unfulfilled love.
# 7. Comedy of Errors or Misadventure: Characters experience humorous, unintended consequences or misadventures while pursuing a goal.
# 8. Crime and Underworld Exploration: The story explores criminal activities or the underworld, often involving heists, gang conflicts, or undercover missions.
# 9. Power Struggle and Betrayal: Focuses on conflicts for power or leadership, with betrayal as a central theme, often involving shifting alliances.
# 10. Mystery and Conspiracy Unveiling: The protagonist uncovers a hidden conspiracy, solves puzzles, or discovers hidden truths.
# 11. Tragedy and Inevitability: A character-driven plot where the protagonist faces an inevitable negative outcome, often due to a flaw or external betrayal.
# 12. Conflict with Supernatural or Unknown Forces: The protagonist encounters supernatural entities, unknown forces, or sci-fi elements that pose existential challenges.
# 13. Comedy in Domestic Life: Focuses on the humor and challenges of family life, with everyday misunderstandings and domestic issues driving the plot.
# 14. Social Rebellion or Fight Against Oppression: The protagonist challenges societal norms or oppressive systems, leading to personal or collective change.
# 15. Fantasy or Science Fiction Quest: Centers on a journey or quest in a fantastical or sci-fi setting, involving world-building, encounters with non-human entities, and mythical or technological challenges.

# Choose only one category from the list.

# Summary: "{}"
# Classification (choose one):
# """

# Short prompt
prompt = """
Classify the following plot summary into one of these categories. Example:
- Summary: "A young man seeks revenge for his family's murder."
  Classification: Quest for Vengeance or Justice
- Summary: "A young girl learns about herself through challenges."
  Classification: Coming of Age and Self-Discovery
Choose only one category name exactly as written.
1. Hero’s Journey and Transformation: Personal growth and overcoming challenges.
2. Quest for Vengeance or Justice: Seeking retribution or justice.
3. Coming of Age and Self-Discovery: Maturation or self-awareness.
4. Survival or Escape: Danger and resilience.
5. Rise and Fall of a Protagonist: Success followed by a tragic downfall.
6. Love and Relationship Dynamics: Focus on romantic or family relationships.
7. Comedy of Errors or Misadventure: Humorous, unintended consequences.
8. Crime and Underworld Exploration: Criminal activities, heists, or gang conflicts.
9. Power Struggle and Betrayal: Conflicts for power, often with betrayal.
10. Mystery and Conspiracy Unveiling: Discovering hidden truths or solving puzzles.
11. Tragedy and Inevitability: Inevitable negative outcomes due to fate or flaw.
12. Conflict with Supernatural or Unknown Forces: Supernatural or sci-fi challenges.
13. Comedy in Domestic Life: Family life and misunderstandings.
14. Social Rebellion or Fight Against Oppression: Challenging societal norms.
15. Fantasy or Science Fiction Quest: Journey in a fantastical or sci-fi setting.

Choose only one of these category.

Summary: "{}"
Classification (choose one):
"""

# Function to classify each plot summary
def classify_summary(summary):
    input_prompt = prompt.format(summary)
    response = classifier(input_prompt, max_length=15)  # Limit response length to avoid extra text
    print('response : ', response)
    classification = response[0]['generated_text'].strip()
    print('classification : ', classification)
    return classification
    # # Validate that response matches one of the categories
    # if classification in predefined_categories:
    #     return classification
    # else:
    #     return "Uncategorized"  # Handle cases where the response doesn't match any category

# # Apply classification to dataset
# summaries['plot_structure'] = summaries['plot_summary'].apply(classify_summary)

# # Save results to CSV if needed
# summaries.to_csv('/mnt/data/classified_summaries_with_plot_structures.csv', index=False)




In [None]:
cache_dir = "../LLM_summarizer_cache"

# Create the cache directory if it does not exist
os.makedirs(cache_dir, exist_ok=True)

summarizer = pipeline("summarization", model="facebook/bart-large-cnn", cache_dir=cache_dir)

def truncate_text(text, max_words=400):
    # Split and truncate to the first max_words
    words = text.split()
    if len(words) > max_words:
        return " ".join(words[:max_words])
    return text

def summarize_text(text):
    max_len = 250
    word_count = len(text.split())
    if word_count < max_len:
        return text
    min_len = max(50, int(0.2 * word_count)) # At least 20% of original length or 50 tokens min
    if min_len > max_len:
        min_len -= 50
    summary = summarizer(text, max_length=max_len, min_length=min_len, do_sample=False)[0]['summary_text']
    return summary

def summarize_text_with_retry(text, retries=3):
    truncated_text = truncate_text(text)
    for _ in range(retries):
        try:
            summary = summarizer(truncated_text, max_length=250, min_length=80, do_sample=False)
            return summary[0]['summary_text'] if summary else ""
        except IndexError:
            continue
    return "Summarization failed after retries."



In [None]:
test_sample = common_summaries.sample(10)
test_sample['summarized'] = test_sample['plot_summary'].apply(summarize_text)

Token indices sequence length is longer than the specified maximum sequence length for this model (1667 > 1024). Running this sequence through the model will result in indexing errors


IndexError: index out of range in self

In [None]:
test_sample['plot_structure'] = test_sample['summarized'].apply(classify_summary)

Token indices sequence length is longer than the specified maximum sequence length for this model (561 > 512). Running this sequence through the model will result in indexing errors


response :  [{'generated_text': 'Crime and Underworld Exploration'}]
classification :  Crime and Underworld Exploration
response :  [{'generated_text': 'Comedy'}]
classification :  Comedy
response :  [{'generated_text': 'Love and Relationship Dynamics'}]
classification :  Love and Relationship Dynamics
response :  [{'generated_text': 'Crime and Underworld Exploration'}]
classification :  Crime and Underworld Exploration
response :  [{'generated_text': 'Crime and Underworld Exploration'}]
classification :  Crime and Underworld Exploration
response :  [{'generated_text': 'Comedy'}]
classification :  Comedy
response :  [{'generated_text': 'Drama'}]
classification :  Drama
response :  [{'generated_text': 'Drama'}]
classification :  Drama
response :  [{'generated_text': 'Comedy'}]
classification :  Comedy
response :  [{'generated_text': 'Love and Relationship Dynamics'}]
classification :  Love and Relationship Dynamics


In [None]:
merged_sample = pd.merge(test_sample, movies, on='wikipedia_movie_id', how='left')
merged_sample['clean_plot_summary', 'plot_structure', 'movie_genre']

Unnamed: 0,wikipedia_movie_id,plot_summary,clean_plot_summary,tokenized_plot_summary,filtered_tokens,lemmatized_tokens,plot_structure_cluster,summarized_plot,plot_structure,freebase_movie_id,movie_name,movie_release_date,movie_box_office_revenue,movie_runtime,movie_languages,movie_countries,movie_genres
0,5727920,"In Brooklyn, New York from November 1988 throu...",in brooklyn new york from november through ear...,"['in', 'brooklyn', 'new', 'york', 'from', 'nov...","['brooklyn', 'new', 'york', 'november', 'early...","['brooklyn', 'new', 'york', 'november', 'early...",7,bobby green is manager of a successful el car...,Crime and Underworld Exploration,/m/0f1jm9,We Own the Night,2007-05-25,54926886.0,116.0,"{""/m/06b_j"": ""Russian Language"", ""/m/02h40lc"":...","{""/m/09c7w0"": ""United States of America""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/01jfsb"": ""Th..."
1,10837755,Gavin Grey is a 1950s star athlete known by t...,gavin grey is a s star athlete known by the mo...,"['gavin', 'grey', 'is', 'a', 's', 'star', 'ath...","['gavin', 'grey', 'star', 'athlete', 'known', ...","['gavin', 'grey', 'star', 'athlete', 'known', ...",14,gavin grey is a star athlete known by the mon...,Comedy,/m/02qrl46,Everybody's All-American,1988-11-04,12638294.0,127.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01z02hx"": ""Sports"", ""/m/068d7h"": ""Romanti..."
2,1365502,Stone asks Gannon to speak before a night-scho...,stone asks gannon to speak before a nightschoo...,"['stone', 'asks', 'gannon', 'to', 'speak', 'be...","['stone', 'asks', 'gannon', 'speak', 'nightsch...","['stone', 'asks', 'gannon', 'speak', 'nightsch...",14,stone asks gannon to speak before a nightschoo...,Love and Relationship Dynamics,/m/04x0d0,Teacher's Pet,1958-04-01,6491350.0,120.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/06cvj"": ""Romantic comedy"", ""/m/02l7c8"": ""..."
3,612656,The film begins with the funeral of one of the...,the film begins with the funeral of one of the...,"['the', 'film', 'begins', 'with', 'the', 'fune...","['film', 'begins', 'funeral', 'one', 'three', ...","['film', 'begin', 'funeral', 'one', 'three', '...",0,the film begins with the funeral of one of the...,Crime and Underworld Exploration,/m/02wjh2,The Funeral,1996-08-28,1412799.0,99.0,"{""/m/02bjrlw"": ""Italian Language"", ""/m/02h40lc...","{""/m/09c7w0"": ""United States of America""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/0gw5w78"": ""G..."
4,5144797,Detective Lucas McCarthy finally catches the ...,detective lucas mccarthy finally catches the s...,"['detective', 'lucas', 'mccarthy', 'finally', ...","['detective', 'lucas', 'mccarthy', 'finally', ...","['detective', 'lucas', 'mccarthy', 'finally', ...",1,detective lucas mccarthy finally catches the s...,Crime and Underworld Exploration,/m/0d4vg5,The Horror Show,1989,1738897.0,94.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/03npn"": ""Horror"", ""/m/01585b"": ""Slasher""}"
5,76366,It is February 1929 in the city of Chicago. Tw...,it is february in the city of chicago two frie...,"['it', 'is', 'february', 'in', 'the', 'city', ...","['february', 'city', 'chicago', 'two', 'friend...","['february', 'city', 'chicago', 'two', 'friend...",11,it is february in the city of chicago two fri...,Comedy,/m/0k4f3,Some Like It Hot,1959-03-29,25000000.0,122.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/06cvj"": ""Romantic comedy"", ""/m/0gsy3b"": ""..."
6,14872132,"The film is set amongst a group of gungnyeo, o...",the film is set amongst a group of gungnyeo or...,"['the', 'film', 'is', 'set', 'amongst', 'a', '...","['film', 'set', 'amongst', 'group', 'gungnyeo'...","['film', 'set', 'amongst', 'group', 'gungnyeo'...",4,The film is set amongst a group of gungnyeo o...,Drama,/m/03g_qht,Shadows in the Palace,2007,9723970.0,112.0,"{""/m/02hwhyv"": ""Korean Language""}","{""/m/06qd3"": ""South Korea""}","{""/m/01jfsb"": ""Thriller"", ""/m/02n4kr"": ""Myster..."
7,14415654,Waterland follows the story of a mentally angu...,waterland follows the story of a mentally angu...,"['waterland', 'follows', 'the', 'story', 'of',...","['waterland', 'follows', 'story', 'mentally', ...","['waterland', 'follows', 'story', 'mentally', ...",14,waterland follows the story of a mentally angu...,Drama,/m/03d2tx2,Waterland,1992-08-21,1100218.0,94.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America"", ""/m/...","{""/m/07s9rl0"": ""Drama"", ""/m/0219x_"": ""Indie"", ..."
8,1188061,Rudy Robles is told by his mother to pick up ...,rudy robles is told by his mother to pick up h...,"['rudy', 'robles', 'is', 'told', 'by', 'his', ...","['rudy', 'robles', 'told', 'mother', 'pick', '...","['rudy', 'roble', 'told', 'mother', 'pick', 'c...",7,rudy robles is told by his mother to pick up ...,Comedy,/m/04fpr4,Born in East L.A.,1987-08-21,17355263.0,85.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/0gf28"": ""Parody"", ""/m/0hj3mz5"": ""Comedy o..."
9,3037303,Wealthy American Jervis Pendleton III has a c...,wealthy american jervis pendleton iii has a ch...,"['wealthy', 'american', 'jervis', 'pendleton',...","['wealthy', 'american', 'jervis', 'pendleton',...","['wealthy', 'american', 'jervis', 'pendleton',...",2,wealthy american jervis pendleton iii has a ch...,Love and Relationship Dynamics,/m/08mb3r,Daddy Long Legs,1955-05-05,2500000.0,127.0,"{""/m/064_8sq"": ""French Language"", ""/m/02h40lc""...","{""/m/09c7w0"": ""United States of America""}","{""/m/02l7c8"": ""Romance Film"", ""/m/07s9rl0"": ""D..."


In [None]:
# prompt: print only the columns plot_structure, clean_summary and movie_genre of merged_sample

merged_sample[['plot_structure', 'clean_plot_summary', 'summarized_plot', 'movie_genres']]

Unnamed: 0,plot_structure,clean_plot_summary,summarized_plot,movie_genres
0,Crime and Underworld Exploration,in brooklyn new york from november through ear...,bobby green is manager of a successful el car...,"{""/m/0lsxr"": ""Crime Fiction"", ""/m/01jfsb"": ""Th..."
1,Comedy,gavin grey is a s star athlete known by the mo...,gavin grey is a star athlete known by the mon...,"{""/m/01z02hx"": ""Sports"", ""/m/068d7h"": ""Romanti..."
2,Love and Relationship Dynamics,stone asks gannon to speak before a nightschoo...,stone asks gannon to speak before a nightschoo...,"{""/m/06cvj"": ""Romantic comedy"", ""/m/02l7c8"": ""..."
3,Crime and Underworld Exploration,the film begins with the funeral of one of the...,the film begins with the funeral of one of the...,"{""/m/0lsxr"": ""Crime Fiction"", ""/m/0gw5w78"": ""G..."
4,Crime and Underworld Exploration,detective lucas mccarthy finally catches the s...,detective lucas mccarthy finally catches the s...,"{""/m/03npn"": ""Horror"", ""/m/01585b"": ""Slasher""}"
5,Comedy,it is february in the city of chicago two frie...,it is february in the city of chicago two fri...,"{""/m/06cvj"": ""Romantic comedy"", ""/m/0gsy3b"": ""..."
6,Drama,the film is set amongst a group of gungnyeo or...,The film is set amongst a group of gungnyeo o...,"{""/m/01jfsb"": ""Thriller"", ""/m/02n4kr"": ""Myster..."
7,Drama,waterland follows the story of a mentally angu...,waterland follows the story of a mentally angu...,"{""/m/07s9rl0"": ""Drama"", ""/m/0219x_"": ""Indie"", ..."
8,Comedy,rudy robles is told by his mother to pick up h...,rudy robles is told by his mother to pick up ...,"{""/m/0gf28"": ""Parody"", ""/m/0hj3mz5"": ""Comedy o..."
9,Love and Relationship Dynamics,wealthy american jervis pendleton iii has a ch...,wealthy american jervis pendleton iii has a ch...,"{""/m/02l7c8"": ""Romance Film"", ""/m/07s9rl0"": ""D..."


In [None]:
test_sample_2['plot_structure'].value_counts()

Unnamed: 0_level_0,count
plot_structure,Unnamed: 1_level_1
Love and Relationship Dynamics,2
Crime and Underworld Exploration,1
1.,1
Quest for Vengeance or Justice,1


In [None]:
merged_sample = pd.merge(test_sample, movies, on='wikipedia_movie_id', how='left')
merged_sample

Unnamed: 0,wikipedia_movie_id,plot_summary,clean_plot_summary,tokenized_plot_summary,filtered_tokens,lemmatized_tokens,plot_structure_cluster,plot_structure,freebase_movie_id,movie_name,movie_release_date,movie_box_office_revenue,movie_runtime,movie_languages,movie_countries,movie_genres
0,460442,"Flik, an individualist and would-be inventor,...",flik an individualist and wouldbe inventor liv...,"['flik', 'an', 'individualist', 'and', 'wouldb...","['flik', 'individualist', 'wouldbe', 'inventor...","['flik', 'individualist', 'wouldbe', 'inventor...",10,Comedy,/m/02c7k4,A Bug's Life,1998-11-14,363398565.0,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/03k9fj"": ""Adventure"", ""/m/0hj3myq"": ""Chil..."
1,155785,14-year-old Max Goof is the son of Goofy Goof...,yearold max goof is the son of goofy goof the ...,"['yearold', 'max', 'goof', 'is', 'the', 'son',...","['yearold', 'max', 'goof', 'son', 'goofy', 'go...","['yearold', 'max', 'goof', 'son', 'goofy', 'go...",1,Comedy,/m/014d_k,A Goofy Movie,1995-04-07,35348597.0,78.0,"{""/m/02h40lc"": ""English Language"", ""/m/06nm1"":...","{""/m/09c7w0"": ""United States of America""}","{""/m/03k9fj"": ""Adventure"", ""/m/0hj3myq"": ""Chil..."
2,23396344,A young woman on a train eyes a nervous man. I...,a young woman on a train eyes a nervous man in...,"['a', 'young', 'woman', 'on', 'a', 'train', 'e...","['young', 'woman', 'train', 'eyes', 'nervous',...","['young', 'woman', 'train', 'eye', 'nervous', ...",10,Fantasy or Science Fiction,/m/06wb73s,Blood: The Last Vampire,2009-04-02,5731143.0,86.0,"{""/m/02h40lc"": ""English Language""}","{""/m/0f8l9c"": ""France"", ""/m/03h64"": ""Hong Kong...","{""/m/01jfsb"": ""Thriller"", ""/m/03npn"": ""Horror""..."
3,10051590,"The film portrays MacArthur's life from 1942,...",the film portrays macarthurs life from before ...,"['the', 'film', 'portrays', 'macarthurs', 'lif...","['film', 'portrays', 'macarthurs', 'life', 'ba...","['film', 'portrays', 'macarthur', 'life', 'bat...",6,1.,/m/02q07br,MacArthur,1977-07,16320000.0,130.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/03bxz7"": ""Biographical film"", ""/m/017fp"":..."
4,1848371,Refined actress Lauren Ames finally has a cha...,refined actress lauren ames finally has a chan...,"['refined', 'actress', 'lauren', 'ames', 'fina...","['refined', 'actress', 'lauren', 'ames', 'fina...","['refined', 'actress', 'lauren', 'ames', 'fina...",8,Drama,/m/060rs5,Outrageous Fortune,1987-01-30,65864741.0,100.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/0556j8"": ""Buddy film"", ""/m/0hj3l_y"": ""Act..."
5,3858746,"As a cub, Alex the Lion was called Alakay and ...",as a cub alex the lion was called alakay and w...,"['as', 'a', 'cub', 'alex', 'the', 'lion', 'was...","['cub', 'alex', 'lion', 'called', 'alakay', 's...","['cub', 'alex', 'lion', 'called', 'alakay', 's...",10,1.,/m/0b3n61,Madagascar: Escape 2 Africa,2008-10-30,602308178.0,89.0,"{""/m/02bjrlw"": ""Italian Language"", ""/m/02h40lc...","{""/m/09c7w0"": ""United States of America""}","{""/m/03k9fj"": ""Adventure"", ""/m/0hj3myq"": ""Chil..."
6,1728156,Newly ordained Padre Amaro arrives in Los Reye...,newly ordained padre amaro arrives in los reye...,"['newly', 'ordained', 'padre', 'amaro', 'arriv...","['newly', 'ordained', 'padre', 'amaro', 'arriv...","['newly', 'ordained', 'padre', 'amaro', 'arriv...",12,Love and Relationship Dynamics,/m/05rl7b,The Crime of Father Amaro,2002-08-16,26996738.0,120.0,"{""/m/06nm1"": ""Spanish Language""}","{""/m/0b90_r"": ""Mexico"", ""/m/06mkj"": ""Spain"", ""...","{""/m/02l7c8"": ""Romance Film"", ""/m/07s9rl0"": ""D..."
7,11357777,"In 1965, Arlo Guthrie has attempted to avoid ...",in arlo guthrie has attempted to avoid the dra...,"['in', 'arlo', 'guthrie', 'has', 'attempted', ...","['arlo', 'guthrie', 'attempted', 'avoid', 'dra...","['arlo', 'guthrie', 'attempted', 'avoid', 'dra...",0,Comedy,/m/02r8w4b,Alice's Restaurant,1969-08-20,6300000.0,111.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01t_vv"": ""Comedy-drama"", ""/m/09n5t_"": ""Am..."
8,4572965,"Shortly after the events of Hostel, Paxton is...",shortly after the events of hostel paxton is s...,"['shortly', 'after', 'the', 'events', 'of', 'h...","['shortly', 'events', 'hostel', 'paxton', 'suf...","['shortly', 'event', 'hostel', 'paxton', 'suff...",10,Quest for Vengeance or Justice,/m/0c9jj9,Hostel 2,2007-06-07,35619521.0,94.0,"{""/m/01wgr"": ""Czech Language"", ""/m/02bjrlw"": ""...","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/01q03"": ""Cult"", ..."
9,1675523,"Sean Vetter and Demetrius Hicks , who are for...",sean vetter and demetrius hicks who are former...,"['sean', 'vetter', 'and', 'demetrius', 'hicks'...","['sean', 'vetter', 'demetrius', 'hicks', 'form...","['sean', 'vetter', 'demetrius', 'hick', 'forme...",7,Quest for Vengeance or Justice,/m/05mj4d,A Man Apart,2003-04-04,44350926.0,110.0,"{""/m/064_8sq"": ""French Language"", ""/m/02h40lc""...","{""/m/09c7w0"": ""United States of America"", ""/m/...","{""/m/0lsxr"": ""Crime Fiction"", ""/m/01jfsb"": ""Th..."


In [None]:
# see what is the longest summary in common_summaries
max_len = max(common_summaries['clean_plot_summary'].apply(lambda x: len(x.split())))
print(max_len)

# print stats about len of summaries
print(common_summaries['clean_plot_summary'].apply(lambda x: len(x.split())).describe())

3515
count    8486.000000
mean      495.188310
std       351.862229
min        13.000000
25%       185.000000
50%       476.000000
75%       713.000000
max      3515.000000
Name: clean_plot_summary, dtype: float64


In [None]:
test_sample

In [None]:
batch_size = 500
output_dir = '../data/classified_summaries_batches'
os.makedirs(output_dir, exist_ok=True)

sample_summaries = common_summaries.sample(1500)

# Process each batch and save to checkpoint files
for i in range(0, len(summaries), batch_size):
    batch = sample_ummaries.iloc[i:i + batch_size].copy()  # Work with a copy to avoid modifying the original DataFrame

    # Parallel processing of classification within the batch
    batch['plot_structure'] = Parallel(n_jobs=4)(delayed(classify_summary)(summary) for summary in batch['clean_plot_summary'])

    # Save the processed batch to a CSV file as a checkpoint
    batch.to_csv(f"{output_dir}/classified_summaries_batch_{i}.csv", index=False)



In [None]:
# Optional: Combine all batch files into a single file after processing
all_batches = [pd.read_csv(f"{output_dir}/classified_summaries_batch_{i}.csv") for i in range(0, len(summaries), batch_size)]
classified_summaries = pd.concat(all_batches, ignore_index=True)
classified_summaries.to_csv('../data/processed/classified_summaries_with_plot_structures.csv', index=False)