In [40]:
import pandas as pd
import heapq


### Load and Explore 

In [41]:

df = pd.read_csv("../data/bonus_task.csv")
print(df.head())
print(df.columns)


   Unnamed: 0                                          Title  \
0           0                        I tre volti della paura   
1           1  Dungeons & Dragons: The Book of Vile Darkness   
2           2                     The Shop Around the Corner   
3           3                             Mr. Holland's Opus   
4           4                                       Scarface   

                                            Synopsis  \
0  Note: this synopsis is for the orginal Italian...   
1  Two thousand years ago, Nhagruul the Foul, a s...   
2  Matuschek's, a gift store in Budapest, is the ...   
3  Glenn Holland, not a morning person by anyone'...   
4  In May 1980, a Cuban man named Tony Montana (A...   

                                                 Tag  
0          cult, horror, gothic, murder, atmospheric  
1                                           violence  
2                                           romantic  
3             inspiring, romantic, stupid, feel-good  
4  

### Preprocess for Multi-label Format

In [42]:
# Clean up and parse multi-label tags
df['labels'] = df['Tag'].apply(lambda x: [t.strip() for t in x.split(',')])
df['labels'] 
label_set = sorted({tag for labels in df['labels'] for tag in labels})
print(label_set)

['absurd', 'action', 'adult comedy', 'allegory', 'alternate history', 'alternate reality', 'anti war', 'atmospheric', 'autobiographical', 'avant garde', 'blaxploitation', 'bleak', 'boring', 'brainwashing', 'christian film', 'claustrophobic', 'clever', 'comedy', 'comic', 'cruelty', 'cult', 'cute', 'dark', 'depressing', 'dramatic', 'entertaining', 'fantasy', 'feel-good', 'flashback', 'good versus evil', 'gothic', 'grindhouse film', 'haunting', 'historical', 'historical fiction', 'home movie', 'horror', 'humor', 'insanity', 'inspiring', 'intrigue', 'magical realism', 'melodrama', 'murder', 'mystery', 'neo noir', 'non fiction', 'paranormal', 'philosophical', 'plot twist', 'pornographic', 'prank', 'psychedelic', 'psychological', 'queer', 'realism', 'revenge', 'romantic', 'sadist', 'satire', 'sci-fi', 'sentimental', 'storytelling', 'stupid', 'suicidal', 'suspenseful', 'thought-provoking', 'tragedy', 'violence', 'western', 'whimsical']


In [43]:
df['labels'] .value_counts()

labels
[murder]                                                                  1004
[romantic]                                                                 731
[violence]                                                                 584
[psychedelic]                                                              437
[flashback]                                                                332
                                                                          ... 
[absurd, alternate reality]                                                  1
[violence, comedy, neo noir]                                                 1
[comedy, boring, bleak, cult, psychedelic, autobiographical, romantic]       1
[cult, psychedelic, romantic, flashback]                                     1
[cult, horror, gothic, murder, atmospheric]                                  1
Name: count, Length: 5604, dtype: int64

In [45]:
# Build a label “definition” for RAG retrieval:

# Richer descriptions or examples to label_knowledge
label_knowledge = {
        "absurd": "Absurd films feature illogical, surreal, or nonsensical elements. Example: Monty Python's The Meaning of Life.",
        "action": "Action films emphasize physical feats, fights, chases, and stunts. Example: Die Hard.",
        "adult comedy": "Adult comedy contains mature humor, often with sexual or risqué themes. Example: American Pie.",
        "allegory": "Allegorical films use symbolic figures and actions to convey deeper meanings. Example: Animal Farm.",
        "alternate history": "Alternate history explores 'what if' scenarios diverging from real historical events. Example: The Man in the High Castle.",
        "alternate reality": "Alternate reality stories take place in worlds different from our own. Example: The Matrix.",
        "anti war": "Anti-war films critique or oppose war and its consequences. Example: Apocalypse Now.",
        "atmospheric": "Atmospheric films focus on mood, visuals, and tension. Example: Blade Runner.",
        "autobiographical": "Autobiographical films are based on the creator's own life. Example: 8½ by Federico Fellini.",
        "avant garde": "Avant-garde films experiment with unconventional techniques and narratives. Example: Un Chien Andalou.",
        "blaxploitation": "Blaxploitation films feature Black actors and urban settings, often with social commentary. Example: Shaft.",
        "bleak": "Bleak films have a grim, hopeless, or depressing tone. Example: Requiem for a Dream.",
        "boring": "Boring films are perceived as dull or unengaging. Example: Some viewers consider Solaris (1972) slow and boring.",
        "brainwashing": "Brainwashing themes involve manipulation and control of minds. Example: The Manchurian Candidate.",
        "christian film": "Christian films focus on Christian themes, values, or stories. Example: The Passion of the Christ.",
        "claustrophobic": "Claustrophobic films evoke a sense of confinement or restricted space. Example: Buried.",
        "clever": "Clever films feature smart plots, witty dialogue, or inventive storytelling. Example: The Usual Suspects.",
        "comedy": "Comedy films aim to amuse and entertain through humor. Example: Airplane!.",
        "comic": "Comic films are lighthearted and often based on comic books or strips. Example: Scott Pilgrim vs. the World.",
        "cruelty": "Cruelty involves suffering or abuse intentionally inflicted. Example: Saw.",
        "cult": "Cult films are often strange or niche and attract small, loyal audiences. Example: The Rocky Horror Picture Show.",
        "cute": "Cute films are charming, endearing, or visually appealing. Example: My Neighbor Totoro.",
        "dark": "Dark films explore grim, disturbing, or morally ambiguous themes. Example: Se7en.",
        "depressing": "Depressing films evoke sadness or despair. Example: Grave of the Fireflies.",
        "dramatic": "Dramatic stories focus on emotional, social, or moral conflict. Example: The Godfather.",
        "entertaining": "Entertaining films are enjoyable and engaging for audiences. Example: Guardians of the Galaxy.",
        "fantasy": "Fantasy films feature magical, supernatural, or imaginary elements. Example: The Lord of the Rings.",
        "feel-good": "Feel-good stories are emotionally satisfying and heartwarming. Example: Amélie.",
        "flashback": "Flashback films use scenes set in earlier times to provide context. Example: Citizen Kane.",
        "good versus evil": "Good versus evil stories center on the struggle between opposing moral forces. Example: Star Wars.",
        "gothic": "Gothic stories mix horror with romance or mystery. Example: Crimson Peak.",
        "grindhouse film": "Grindhouse films are low-budget, exploitative, and often sensational. Example: Planet Terror.",
        "haunting": "Haunting films leave a lingering emotional or psychological impact. Example: The Others.",
        "historical": "Historical films are set in or based on real past events. Example: Schindler's List.",
        "historical fiction": "Historical fiction blends real history with fictional elements. Example: Titanic.",
        "home movie": "Home movies are amateur films, often documenting personal events. Example: The Blair Witch Project (styled as found footage).",
        "horror": "Horror involves fear, shock, and the supernatural. Example: The Exorcist.",
        "humor": "Humor is the quality of being amusing or comical. Example: Dumb and Dumber.",
        "insanity": "Insanity themes explore madness, mental illness, or psychological breakdown. Example: Black Swan.",
        "inspiring": "Inspiring films uplift and motivate the viewer. Example: The Pursuit of Happyness.",
        "intrigue": "Intrigue films involve suspense, secrets, and complex plots. Example: Tinker Tailor Soldier Spy.",
        "magical realism": "Magical realism blends realistic settings with magical elements. Example: Pan's Labyrinth.",
        "melodrama": "Melodramas emphasize exaggerated emotions and interpersonal conflicts. Example: Terms of Endearment.",
        "murder": "Murder-themed plots involve killings, crime, and investigations. Example: Zodiac.",
        "mystery": "Mystery films revolve around solving puzzles or crimes. Example: Knives Out.",
        "neo noir": "Neo-noir updates classic film noir themes with modern sensibilities. Example: Drive.",
        "non fiction": "Non-fiction films depict real events, people, or facts. Example: Bowling for Columbine.",
        "paranormal": "Paranormal films involve supernatural phenomena beyond scientific explanation. Example: Paranormal Activity.",
        "philosophical": "Philosophical films explore deep questions about existence, reality, or ethics. Example: Waking Life.",
        "plot twist": "Plot twist films feature unexpected changes in the storyline. Example: The Sixth Sense.",
        "pornographic": "Pornographic films depict explicit sexual content. Example: Deep Throat.",
        "prank": "Prank films involve practical jokes or trickery. Example: Jackass: The Movie.",
        "psychedelic": "Psychedelic films use surreal visuals and sounds to evoke altered states. Example: 2001: A Space Odyssey.",
        "psychological": "Psychological films focus on mental states, emotions, and mind games. Example: Fight Club.",
        "queer": "Queer films explore LGBTQ+ themes or characters. Example: Moonlight.",
        "realism": "Realism strives for authentic, true-to-life representation. Example: Bicycle Thieves.",
        "revenge": "Revenge films center on characters seeking retribution. Example: Oldboy.",
        "romantic": "Romantic plots explore love and relationships. Example: The Notebook.",
        "sadist": "Sadist themes involve deriving pleasure from inflicting pain. Example: The Night Porter.",
        "satire": "Satire uses humor, irony, or exaggeration to criticize or mock. Example: Dr. Strangelove.",
        "science fiction": "Science fiction explores futuristic, technological, or extraterrestrial concepts. Example: Interstellar.",
        "self-reflexive": "Self-reflexive films comment on their own creation or the filmmaking process. Example: Adaptation.",
        "shocking": "Shocking films aim to surprise or disturb the audience. Example: Audition.",
        "slapstick": "Slapstick comedy relies on exaggerated physical humor. Example: The Pink Panther.",
        "slow": "Slow films have a deliberate, unhurried pace. Example: 2001: A Space Odyssey.",
        "spoof": "Spoof films parody or lampoon other genres or works. Example: Spaceballs.",
        "stupid": "Stupid usually implies silly or exaggerated comedy. Example: Dumb and Dumber.",
        "stylized": "Stylized films use distinctive visual or narrative techniques. Example: Sin City.",
        "surreal": "Surreal films feature dreamlike, bizarre, or illogical elements. Example: Eraserhead.",
        "suspense": "Suspense films build tension and uncertainty. Example: Rear Window.",
        "thriller": "Thrillers are fast-paced, exciting, and often involve danger. Example: North by Northwest.",
        "tragic": "Tragic films end in disaster or evoke pity and sorrow. Example: Romeo + Juliet.",
        "twist ending": "Twist ending films conclude with an unexpected turn. Example: The Usual Suspects.",
        "uplifting": "Uplifting films inspire hope and positivity. Example: Billy Elliot.",
        "violence": "Violent stories include physical force and conflict. Example: John Wick.",
        "weird": "Weird films are unusual, eccentric, or unconventional. Example: Being John Malkovich."
}


### Encode with Sentence-Transformers

In [49]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Try to load the model with a longer timeout and local cache
try:
    # embedder = SentenceTransformer("all-MiniLM-L6-v2")
    embedder = SentenceTransformer("all-mpnet-base-v2")

    
except Exception as e:
    print("Model download failed, please check your internet connection or try again later.")
    raise e

# Encode label definitions
label_embeddings = {
    label: embedder.encode(definition)
    for label, definition in label_knowledge.items()
}


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


### Predict Labels with RAG-Like Retrieval

In [50]:
def predict_labels_rag(text, threshold=0.4):
    text_emb = embedder.encode(text)
    scores = {
        label: cosine_similarity([text_emb], [label_emb])[0][0]
        for label, label_emb in label_embeddings.items()
    }
    top_k = 3
    top_labels = heapq.nlargest(top_k, scores.items(), key=lambda x: x[1])
    return [label for label, score in top_labels]



### Evaluate Using MultiLabelBinarizer

In [51]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score

mlb = MultiLabelBinarizer(classes=list(label_knowledge.keys()))
y_true = mlb.fit_transform(df['labels'])
y_pred = mlb.transform(df['Synopsis'].apply(predict_labels_rag))

print("Micro F1 score:", f1_score(y_true, y_pred, average='micro'))
print("Macro F1 score:", f1_score(y_true, y_pred, average='macro'))




Micro F1 score: 0.1295249198484407
Macro F1 score: 0.07047765312887279


In [52]:
predict_labels_rag("Note: this synopsis is for the orginal Italian release with the segments in this certain order.Boris Karloff introduces three horror tales of the " \
"macabre and the supernatural known as the 'Three Faces of Fear'.THE TELEPHONERosy (Michele Mercier) is an attractive, high-priced Parisian call-girl who returns to her" \
" spacious, basement apartment after an evening out when she immediately gets beset by a series of strange phone calls. The caller soon identified himself as Frank, her" \
" ex-pimp who has recently escaped from prison. Rosy is terrified for it was her testimony that landed the man in jail. Looking for solace, Rosy phones her lesbian lover" \
" Mary (Lynda Alfonsi). The two women have been estranged for some time, but Rosy is certain that she is the only one who can help her. Mary agrees to come over that " \
"night. Seconds later, Frank calls again, promising that no matter who she calls for protection, he will have his revenge. Unknown to Rosy, Mary is the caller " \
"impersonating Frank. Marry arrives at Rosy's apartment soon after, and does her best to calm Rosy's nerves. She gives the panic-struck woman a tranquillizer "
"and puts her to bed.Later that night as Rosy sleeps, Mary gets up out of bed, and pens a note of confession: she was the one making the strange phone calls when " \
"she learned of Franks escape from prison. Knowing that Rosy would call on her for help, she explains that she felt it was her way of coming back into her life " \
"after their breakup. While she is busy writing, she fails to notice an intruder in the apartment. This time it is Frank, for real. He creeps up behind Mary and " \
"strangles her to death with one of Rosys nylon stockings. The sound of the struggle awaken Rosy and she gasps in fright. The murderous pimp realizes that he just " \
"killed the wrong woman, and slowly makes his way to Rosy's bed. However, earlier that night, Rosy had placed a butcher knife under her pillow at Mary's suggestion." \
" Rosy seizes the knife and stabs Frank with it as he's beginning to strangle her. Rosy drops the knife and breaks down in hysteria, surrounded by the two corpses " \
"of her former lovers.THE WURDALAKIn 19th Century Russia, Vladimir D'Urfe is a young nobleman on a long trip. During the course of his journey, he finds a beheaded " \
"corpse with a knife plunged into its heart. He withdraws the blade and takes it as a souvenir.Later that night, Vladimir stops at a small rural cottage to ask for " \
"shelter. He notices several daggers hanging up on one of the walls, and a vacant space that happens to fit the one he has discovered. Vladimir is surprised by the " \
"entrance of Giorgio (Glauco Onorato), who explains that the knife belongs to his father, who has not been seen for five days. Giorgio offers a room to the young " \
"count, and subsequently introduces him to the rest of the family: his wife (Rika Dialina), their young son Ivan, Giorgio's younger brother Pietro (Massimo Righi),"
" and sister Sdenka (Susy Anderson). It subsequently transpires that they are eagerly anticipating the arrival of their father, Gorcha, as well as the reason for his" \
" absence: he has gone to do battle with the outlaw and dreaded wurdalak Ali Beg. Vladimir is confused by the term, and Sdenka explains that a wurdalak is a walking " \
"cadaver who feeds on the blood of the living, preferably close friends and family members. Giorgio and Pietro are certain that the corpse Vladimir had discovered is " )

['avant garde', 'shocking', 'horror']

### Add to Gradio Demo

In [53]:
import gradio as gr

def rag_demo_predict(text):
    predicted = predict_labels_rag(text)
    return ", ".join(predicted)

gr.Interface(fn=rag_demo_predict, inputs="textbox", outputs="textbox", title="Multi-Label RAG Classifier").launch()


* Running on local URL:  http://127.0.0.1:7864
* To create a public link, set `share=True` in `launch()`.


