### Load and Explore 

In [1]:
import pandas as pd

df = pd.read_csv("../data/bonus_task.csv")
print(df.head())
print(df.columns)


   Unnamed: 0                                          Title  \
0           0                        I tre volti della paura   
1           1  Dungeons & Dragons: The Book of Vile Darkness   
2           2                     The Shop Around the Corner   
3           3                             Mr. Holland's Opus   
4           4                                       Scarface   

                                            Synopsis  \
0  Note: this synopsis is for the orginal Italian...   
1  Two thousand years ago, Nhagruul the Foul, a s...   
2  Matuschek's, a gift store in Budapest, is the ...   
3  Glenn Holland, not a morning person by anyone'...   
4  In May 1980, a Cuban man named Tony Montana (A...   

                                                 Tag  
0          cult, horror, gothic, murder, atmospheric  
1                                           violence  
2                                           romantic  
3             inspiring, romantic, stupid, feel-good  
4  

### Preprocess for Multi-label Format

In [None]:
# Clean up and parse multi-label tags
df['labels'] = df['Tag'].apply(lambda x: [t.strip() for t in x.split(',')])
df['labels'] 
label_set = sorted({tag for labels in df['labels'] for tag in labels})
print(label_set)

['absurd', 'action', 'adult comedy', 'allegory', 'alternate history', 'alternate reality', 'anti war', 'atmospheric', 'autobiographical', 'avant garde', 'blaxploitation', 'bleak', 'boring', 'brainwashing', 'christian film', 'claustrophobic', 'clever', 'comedy', 'comic', 'cruelty', 'cult', 'cute', 'dark', 'depressing', 'dramatic', 'entertaining', 'fantasy', 'feel-good', 'flashback', 'good versus evil', 'gothic', 'grindhouse film', 'haunting', 'historical', 'historical fiction', 'home movie', 'horror', 'humor', 'insanity', 'inspiring', 'intrigue', 'magical realism', 'melodrama', 'murder', 'mystery', 'neo noir', 'non fiction', 'paranormal', 'philosophical', 'plot twist', 'pornographic', 'prank', 'psychedelic', 'psychological', 'queer', 'realism', 'revenge', 'romantic', 'sadist', 'satire', 'sci-fi', 'sentimental', 'storytelling', 'stupid', 'suicidal', 'suspenseful', 'thought-provoking', 'tragedy', 'violence', 'western', 'whimsical']


In [9]:
df['labels'] .value_counts()

labels
[murder]                                                                  1004
[romantic]                                                                 731
[violence]                                                                 584
[psychedelic]                                                              437
[flashback]                                                                332
                                                                          ... 
[absurd, alternate reality]                                                  1
[violence, comedy, neo noir]                                                 1
[comedy, boring, bleak, cult, psychedelic, autobiographical, romantic]       1
[cult, psychedelic, romantic, flashback]                                     1
[cult, horror, gothic, murder, atmospheric]                                  1
Name: count, Length: 5604, dtype: int64

In [16]:
# Build a label “definition” for RAG retrieval:
label_knowledge = {
    "absurd": "Absurd films feature illogical, surreal, or nonsensical elements.",
    "action": "Action films emphasize physical feats, fights, chases, and stunts.",
    "adult comedy": "Adult comedy contains mature humor, often with sexual or risqué themes.",
    "allegory": "Allegorical films use symbolic figures and actions to convey deeper meanings.",
    "alternate history": "Alternate history explores 'what if' scenarios diverging from real historical events.",
    "alternate reality": "Alternate reality stories take place in worlds different from our own.",
    "anti war": "Anti-war films critique or oppose war and its consequences.",
    "atmospheric": "Atmospheric films focus on mood, visuals, and tension.",
    "autobiographical": "Autobiographical films are based on the creator's own life.",
    "avant garde": "Avant-garde films experiment with unconventional techniques and narratives.",
    "blaxploitation": "Blaxploitation films feature Black actors and urban settings, often with social commentary.",
    "bleak": "Bleak films have a grim, hopeless, or depressing tone.",
    "boring": "Boring films are perceived as dull or unengaging.",
    "brainwashing": "Brainwashing themes involve manipulation and control of minds.",
    "christian film": "Christian films focus on Christian themes, values, or stories.",
    "claustrophobic": "Claustrophobic films evoke a sense of confinement or restricted space.",
    "clever": "Clever films feature smart plots, witty dialogue, or inventive storytelling.",
    "comedy": "Comedy films aim to amuse and entertain through humor.",
    "comic": "Comic films are lighthearted and often based on comic books or strips.",
    "cruelty": "Cruelty involves suffering or abuse intentionally inflicted.",
    "cult": "A cult film has a passionate fanbase, often obscure or unconventional.",
    "cute": "Cute films are charming, endearing, or visually appealing.",
    "dark": "Dark films explore grim, disturbing, or morally ambiguous themes.",
    "depressing": "Depressing films evoke sadness or despair.",
    "dramatic": "Dramatic stories focus on emotional, social, or moral conflict.",
    "entertaining": "Entertaining films are enjoyable and engaging for audiences.",
    "fantasy": "Fantasy films feature magical, supernatural, or imaginary elements.",
    "feel-good": "Feel-good stories are emotionally satisfying and heartwarming.",
    "flashback": "Flashback films use scenes set in earlier times to provide context.",
    "good versus evil": "Good versus evil stories center on the struggle between opposing moral forces.",
    "gothic": "Gothic stories mix horror with romance or mystery.",
    "grindhouse film": "Grindhouse films are low-budget, exploitative, and often sensational.",
    "haunting": "Haunting films leave a lingering emotional or psychological impact.",
    "historical": "Historical films are set in or based on real past events.",
    "historical fiction": "Historical fiction blends real history with fictional elements.",
    "home movie": "Home movies are amateur films, often documenting personal events.",
    "horror": "Horror involves fear, shock, and the supernatural.",
    "humor": "Humor is the quality of being amusing or comical.",
    "insanity": "Insanity themes explore madness, mental illness, or psychological breakdown.",
    "inspiring": "Inspiring films uplift and motivate the viewer.",
    "intrigue": "Intrigue films involve suspense, secrets, and complex plots.",
    "magical realism": "Magical realism blends realistic settings with magical elements.",
    "melodrama": "Melodramas emphasize exaggerated emotions and interpersonal conflicts.",
    "murder": "Murder-themed plots involve killings, crime, and investigations.",
    "mystery": "Mystery films revolve around solving puzzles or crimes.",
    "neo noir": "Neo-noir updates classic film noir themes with modern sensibilities.",
    "non fiction": "Non-fiction films depict real events, people, or facts.",
    "paranormal": "Paranormal films involve supernatural phenomena beyond scientific explanation.",
    "philosophical": "Philosophical films explore deep questions about existence, reality, or ethics.",
    "plot twist": "Plot twist films feature unexpected changes in the storyline.",
    "pornographic": "Pornographic films depict explicit sexual content.",
    "prank": "Prank films involve practical jokes or trickery.",
    "psychedelic": "Psychedelic films use surreal visuals and sounds to evoke altered states.",
    "psychological": "Psychological films focus on mental states, emotions, and mind games.",
    "queer": "Queer films explore LGBTQ+ themes or characters.",
    "realism": "Realism strives for authentic, true-to-life representation.",
    "revenge": "Revenge films center on characters seeking retribution.",
    "romantic": "Romantic plots explore love and relationships.",
    "sadist": "Sadist themes involve deriving pleasure from inflicting pain.",
    "satire": "Satire uses humor, irony, or exaggeration to criticize or mock.",
    "science fiction": "Science fiction explores futuristic, technological, or extraterrestrial concepts.",
    "self-reflexive": "Self-reflexive films comment on their own creation or the filmmaking process.",
    "shocking": "Shocking films aim to surprise or disturb the audience.",
    "slapstick": "Slapstick comedy relies on exaggerated physical humor.",
    "slow": "Slow films have a deliberate, unhurried pace.",
    "spoof": "Spoof films parody or lampoon other genres or works.",
    "stupid": "Stupid usually implies silly or exaggerated comedy.",
    "stylized": "Stylized films use distinctive visual or narrative techniques.",
    "surreal": "Surreal films feature dreamlike, bizarre, or illogical elements.",
    "suspense": "Suspense films build tension and uncertainty.",
    "thriller": "Thrillers are fast-paced, exciting, and often involve danger.",
    "tragic": "Tragic films end in disaster or evoke pity and sorrow.",
    "twist ending": "Twist ending films conclude with an unexpected turn.",
    "uplifting": "Uplifting films inspire hope and positivity.",
    "violence": "Violent stories include physical force and conflict.",
    "weird": "Weird films are unusual, eccentric, or unconventional."
}


### Encode with Sentence-Transformers

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Try to load the model with a longer timeout and local cache
try:
    embedder = SentenceTransformer("paraphrase-MiniLM-L3-v2")
    # embedder = SentenceTransformer("all-MiniLM-L6-v2", cache_folder="./model_cache")
except Exception as e:
    print("An error occured while loading the model.")
    raise e

# Encode label definitions
label_embeddings = {
    label: embedder.encode(definition)
    for label, definition in label_knowledge.items()
}


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


### Predict Labels with RAG-Like Retrieval

In [None]:
def predict_labels_rag(text, threshold=0.4):
    text_emb = embedder.encode(text)
    scores = {
        label: cosine_similarity([text_emb], [label_emb])[0][0]
        for label, label_emb in label_embeddings.items()
    }
    return [label for label, score in scores.items() if score >= threshold]


### Evaluate Using MultiLabelBinarizer

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score

mlb = MultiLabelBinarizer(classes=list(label_knowledge.keys()))
y_true = mlb.fit_transform(df['labels'])
y_pred = mlb.transform(df['Synopsis'].apply(predict_labels_rag))

print("Micro F1 score:", f1_score(y_true, y_pred, average='micro'))
print("Macro F1 score:", f1_score(y_true, y_pred, average='macro'))




NameError: name 'predict_labels_rag' is not defined

### Add to Gradio Demo

In [22]:
import gradio as gr

def rag_demo_predict(text):
    predicted = predict_labels_rag(text)
    return ", ".join(predicted)

gr.Interface(fn=rag_demo_predict, inputs="textbox", outputs="textbox", title="Multi-Label RAG Classifier").launch()


* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


