Training the model (distilbert-base-uncased) on emotion-labelled data. Warning: this takes a long time (3-4 hours); later cells simply load the trained model that I saved initially.

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import numpy as np
import evaluate

# 1. Load the dataset
dataset = load_dataset("dair-ai/emotion")

# 2. Load tokenizer & model
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Take the class names from the dataset's label feature instead of hard-coding
# the class count, and store them in the model config so the saved model
# carries human-readable labels rather than generic ones.
label_names = dataset["train"].features["label"].names
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id,
)

# 3. Tokenization function
def preprocess_function(examples):
    """Tokenize the 'text' field of a batch of examples.

    Every example is truncated and padded to a fixed length of 128 tokens.

    Args:
        examples: Batch mapping that provides the texts under the 'text' key.

    Returns:
        The tokenizer output for the batch.
    """
    texts = examples['text']
    return tokenizer(
        texts,
        max_length=128,
        padding='max_length',
        truncation=True,
    )

# Run the tokenizer over the whole dataset, one batch at a time
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Pick the splits used for training and for the final evaluation
train_dataset, eval_dataset = tokenized_datasets['train'], tokenized_datasets['validation']

# 4. Prepare the metric
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Pair of (logits, labels) as handed over by the Trainer.

    Returns:
        The result of the accuracy metric, using the argmax over the last
        logits axis as the predicted class.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return accuracy.compute(
        predictions=predicted_classes,
        references=references,
    )

# 5. Training arguments (without evaluation_strategy and load_best_model_at_end)
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
)

# 6. Define the trainer
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

# 7. Train
trainer.train()

# 8. Manual evaluation once training has finished
eval_results = trainer.evaluate()
print(f"Eval Results: {eval_results}")

# 9. Save the model
trainer.save_model("./emotion_lyrics_model")



(Added after getting bad results in the final chatbot-integration step.) Fine-tuning the trained model for chatbot integration with question-answer pairs containing adjectives that users might type into the chat interface.
The data comes from the file keyword_emotion_dataset.csv, which is available on GitHub.

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch

# === CONFIG ===
MODEL_PATH = "./emotion_lyrics_model"
FINETUNED_MODEL_PATH = "./emotion_lyrics_model_finetuned"
DATASET_PATH = "keyword_emotion_dataset.csv"
# NOTE(review): the list position is used as the class id, so the order
# presumably has to match the class ids the model was trained with - confirm.
LABELS = ["sadness", "joy", "love", "anger", "fear", "surprise"]

# === Load dataset ===
df = pd.read_csv(DATASET_PATH)
print(df.head())
print(df.columns)

label_to_id = {label: i for i, label in enumerate(LABELS)}
df['label_id'] = df['emotion'].map(label_to_id)

# Series.map yields NaN for values that are not in the mapping, which would
# silently turn the label column into floats. Fail early with a clear message.
unmapped = df.loc[df['label_id'].isna(), 'emotion'].unique().tolist()
if unmapped:
    raise ValueError(
        f"Unknown emotion labels in {DATASET_PATH}: {unmapped}; expected one of {LABELS}"
    )
df['label_id'] = df['label_id'].astype('int64')

# Convert to HuggingFace Dataset, rename 'label_id' to 'labels'
dataset = Dataset.from_pandas(df[['keyword', 'label_id']].rename(columns={'label_id': 'labels'}))

# === Load tokenizer and model ===
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)

# === Preprocessing function ===
def preprocess(batch):
    """Tokenize the 'keyword' field of a batch.

    Inputs are truncated to at most 64 tokens and padded by the tokenizer.

    Args:
        batch: Batch mapping that provides the texts under the 'keyword' key.

    Returns:
        The tokenizer output for the batch.
    """
    keywords = batch['keyword']
    return tokenizer(
        keywords,
        max_length=64,
        padding=True,
        truncation=True,
    )

# Tokenize the whole dataset in batches
dataset = dataset.map(preprocess, batched=True)

# No column rename is needed here: the label column is already called
# 'labels' when the Dataset is built.

# Expose only the model inputs and the labels, as PyTorch tensors
dataset.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'labels'],
)

# === Training args ===
training_args = TrainingArguments(
    output_dir=FINETUNED_MODEL_PATH,
    overwrite_output_dir=True,
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    logging_steps=10,
    save_steps=50,
    save_total_limit=2,
)

# === Trainer ===
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
)

# === Train and save model ===
trainer.train()
trainer.save_model(FINETUNED_MODEL_PATH)

print(f"Fine-tuning complete. Model saved to {FINETUNED_MODEL_PATH}")


Creating and vectorizing a small database of songs to recommend to users. (Because processing locally took too long, I switched to the Pinecone cloud service for this step.)

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone
from tqdm import tqdm

import os

# === CONFIGURATION ===
CSV_PATH = "songs.csv"
# Secrets must not live in the notebook: the API key is read from the
# PINECONE_API_KEY environment variable. The key that used to be hard-coded
# here has been exposed and should be revoked and replaced.
# NOTE(review): when the variable is unset this is None; the Pinecone client is
# expected to report the missing key when it is created - confirm.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_ENV = "us-east-1"
INDEX_NAME = "genai"
EMBEDDING_DIM = 384  # assuming all-MiniLM-L6-v2
BATCH_SIZE = 100
MAX_METADATA_LENGTH = 1000  # Max characters of lyrics kept in the metadata

# === Emotion model ===
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the locally saved emotion classifier together with its tokenizer
model_path = "./emotion_lyrics_model"
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_path,
)
emotion_model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=model_path,
)

# Maps "label_<class id>" keys to emotion names; the position in the tuple
# is the class id.
label_mapping = {
    f"label_{class_id}": emotion_name
    for class_id, emotion_name in enumerate(
        ("sadness", "joy", "love", "anger", "fear", "surprise")
    )
}

def predict_emotion(text):
    """Classify a text and return the name of the predicted emotion.

    Args:
        text: Text to classify; it is truncated by the tokenizer if needed.

    Returns:
        The emotion name looked up in ``label_mapping`` for the class with
        the highest logit.

    Raises:
        KeyError: If the predicted class id has no entry in ``label_mapping``.
    """
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    # Inference only, so gradient tracking is switched off
    with torch.no_grad():
        scores = emotion_model(**encoded).logits
    best_class = scores.argmax(dim=1).item()
    mapping_key = f"label_{best_class}"
    return label_mapping[mapping_key]

def truncate_lyrics(lyrics, max_length=None):
    """Shorten lyrics so they can be stored as vector metadata.

    Args:
        lyrics: Lyrics text. Falsy values (``None``, ``""``) are returned
            unchanged.
        max_length: Number of characters to keep before the ``"..."`` marker
            is appended. Defaults to ``MAX_METADATA_LENGTH`` when omitted.

    Returns:
        ``lyrics`` unchanged when it is falsy or not longer than the limit;
        otherwise its first ``max_length`` characters followed by ``"..."``.
    """
    limit = MAX_METADATA_LENGTH if max_length is None else max_length
    if lyrics and len(lyrics) > limit:
        return lyrics[:limit] + "..."
    return lyrics

# === Init Pinecone ===
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(INDEX_NAME)

# === Load dataset ===
df = pd.read_csv(CSV_PATH)
print("Data preview:", df.head(3))

# === Init embedder ===
embedder = SentenceTransformer("all-MiniLM-L6-v2")


def _upsert_batch(batch):
    """Send one batch of vectors to the index.

    The upload is best-effort: a failure is reported and the function returns
    False instead of raising, so the remaining songs are still processed.
    """
    try:
        index.upsert(vectors=batch)
    except Exception as e:
        print(f"Error upserting batch of {len(batch)} vectors: {e}")
        return False
    return True


# === Build and upsert vectors ===
vectors = []
failed_vectors = 0

for i, row in tqdm(df.iterrows(), total=len(df)):
    raw_lyrics = row['Lyrics']

    # Empty CSV cells are read as NaN; str(NaN) would be the literal text
    # "nan" and would be embedded and indexed as if it were real lyrics.
    if pd.isna(raw_lyrics):
        continue

    lyrics = str(raw_lyrics)
    if not lyrics.strip():
        continue

    title = str(row['Name'])
    artist = str(row['Artist'])

    # Shorten the lyrics so the metadata stays within the configured limit
    lyrics_trunc = truncate_lyrics(lyrics)

    # Only the model calls are guarded here, so the message below really
    # refers to an embedding/classification problem for this song.
    try:
        embedding = embedder.encode(lyrics).tolist()
        emotion = predict_emotion(lyrics)
    except Exception as e:
        print(f"Error embedding {title}: {e}")
        continue

    vectors.append({
        "id": f"song-{i}",
        "values": embedding,
        "metadata": {
            "title": title,
            "artist": artist,
            "lyrics": lyrics_trunc,
            "emotion": emotion
        }
    })

    # Upsert in batches; the batch is always reset afterwards so that a failed
    # upload cannot keep growing and failing on every following song.
    if len(vectors) >= BATCH_SIZE:
        if not _upsert_batch(vectors):
            failed_vectors += len(vectors)
        vectors = []

# Upload remaining
if vectors:
    if not _upsert_batch(vectors):
        failed_vectors += len(vectors)

if failed_vectors:
    print(f"Warning: {failed_vectors} vectors could not be uploaded.")

print("✅ Upload finished!")

# === Check index status ===
stats = index.describe_index_stats()
print("Total indexed vectors:", stats['total_vector_count'])


In [None]:
# Install the Pinecone client package; it is needed by the cells that import `pinecone`.
! pip install pinecone

Collecting pinecone
  Downloading pinecone-7.3.0-py3-none-any.whl.metadata (9.5 kB)
Collecting pinecone-plugin-assistant<2.0.0,>=1.6.0 (from pinecone)
  Downloading pinecone_plugin_assistant-1.7.0-py3-none-any.whl.metadata (28 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Downloading pinecone-7.3.0-py3-none-any.whl (587 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.6/587.6 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_assistant-1.7.0-py3-none-any.whl (239 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m240.0/240.0 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_interface-0.0.7-py3-none-any.whl (6.2 kB)
Installing collected packages: pinecone-plugin-interface, pinecone-plugin-assistant, pinecone
Successfully installed pinecone-7.3.0 pinecone-plugin-assistant-1.7.0 pinecone-p

Fusing everything together via a RAG Pipeline and adding a Chatbot interface

In [None]:
import gradio as gr
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import random
from huggingface_hub import login

import os

# Secrets must not live in the notebook: the Hugging Face token is read from
# the HF_TOKEN environment variable. The token that used to be hard-coded here
# has been exposed and should be revoked and replaced.
_hf_token = os.environ.get("HF_TOKEN")
if _hf_token:
    # Log in only when a token is configured, so the cell does not stop at an
    # interactive prompt when none is set.
    login(token=_hf_token)

import os

# === CONFIG ===
# Secrets must not live in the notebook: the API key is read from the
# PINECONE_API_KEY environment variable. The key that used to be hard-coded
# here has been exposed and should be revoked and replaced.
# NOTE(review): when the variable is unset this is None; the Pinecone client is
# expected to report the missing key when it is created - confirm.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_ENV = "us-east-1"
INDEX_NAME = "genai"

EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
EMOTION_MODEL_PATH = "ankyber/emotion-lyrics"

# === Load Emotion Classifier ===
# Tokenizer and classifier come from the same model location
tokenizer = AutoTokenizer.from_pretrained(EMOTION_MODEL_PATH)
emotion_model = AutoModelForSequenceClassification.from_pretrained(EMOTION_MODEL_PATH)

# === Init Pinecone & Embedder ===
# Sentence embedder used to turn the user's text into a query vector
embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)
# Handle to the vector index that holds the songs
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(INDEX_NAME)
# Class id -> emotion name; the position in the list is the class id.
label_mapping = dict(enumerate([
    "sadness",
    "joy",
    "love",
    "anger",
    "fear",
    "surprise",
]))

# === Friendly Response Templates ===
# For every emotion name: the intro sentences one of which is shown above the
# recommended songs.
emotion_responses = dict(
    sadness=[
        "I'm here for you. Maybe one of these songs can keep you company. 💙",
        "Here are some tracks that might match your mood. I hope they help a little.",
    ],
    joy=[
        "Love the energy! These should keep the vibe going. ✨",
        "Yay, happy vibes! Try these songs!",
    ],
    anger=[
        "Let it out — these songs might help channel it. 🔥",
        "Here's a few that match that fire you're feeling.",
    ],
    fear=[
        "That’s okay — music can be comforting. Try these. 🤍",
        "Some songs for the quiet moments when you're feeling anxious.",
    ],
    love=[
        "Feeling romantic? These might fit the mood. 💕",
        "Songs for when your heart’s full. Enjoy!",
    ],
    surprise=[
        "A surprise feeling needs a surprising song, right? 😉",
        "Wasn’t expecting that? Maybe these songs will match!",
    ],
)

# === Emotion Classifier ===
def predict_emotion(text):
    """Classify a text and return the name of the predicted emotion.

    Args:
        text: Text to classify; it is truncated by the tokenizer if needed.

    Returns:
        The emotion name stored in ``label_mapping`` for the class with the
        highest logit, or ``"unknown"`` when that class id has no entry.
    """
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    # Inference only, so gradient tracking is switched off
    with torch.no_grad():
        scores = emotion_model(**encoded).logits
    best_class = scores.argmax(dim=1).item()
    return label_mapping.get(best_class, "unknown")

# === Chatbot Function ===
def chatbot(user_input):
    """Recommend songs that match the emotion detected in the user's text.

    The text is classified with ``predict_emotion``, embedded, and used to
    query the vector index for the three closest songs whose stored emotion
    equals the detected one.

    Args:
        user_input: Free text typed by the user (a mood or some lyrics).

    Returns:
        A formatted reply: a hint when the input is empty, a "nothing found"
        message when the query has no matches, otherwise an intro line
        followed by one entry per recommended song.
    """
    # Nothing to classify or embed: answer directly instead of running the
    # models and querying the index with an empty text.
    if not user_input or not user_input.strip():
        return "Please tell me a mood or paste some lyrics first."

    user_input_clean = user_input.lower().strip()
    emotion = predict_emotion(user_input_clean)

    # Embed and query Pinecone filtered by detected emotion
    query_vector = embedder.encode(user_input).tolist()
    result = index.query(
        vector=query_vector,
        top_k=3,
        include_metadata=True,
        filter={"emotion": emotion}
    )

    matches = result['matches']
    if not matches:
        return "😕 I couldn’t find any songs that match your input."

    # The intro is chosen only once it is known that it will be shown
    response_intro = random.choice(emotion_responses.get(emotion, ["Here are some tracks for you!"]))

    responses = [response_intro, ""]
    for match in matches:
        meta = match['metadata']
        responses.append(
            f"🎵 *{meta.get('title', 'Unknown')}* by *{meta.get('artist', 'Unknown')}*\n"
            f"🧠 Emotion: {meta.get('emotion', 'unknown')}\n"
            f"📖 Lyrics preview: {meta.get('lyrics', '')[:200].strip()}...\n"
        )

    # Occasionally (about 30% of replies) append a personal recommendation
    if random.random() < 0.3:
        responses.append("💡 *P.S.: My GF’s favorite artist is Conan Gray. Maybe give him a listen too?* 💜")

    return "\n---\n".join(responses)

# === Launch Gradio Interface ===
# Single text box in, plain text out; every submission is answered by `chatbot`.
iface = gr.Interface(
    title="🎤 Song Recommender & Emotion Detector",
    description="Get songs that match your vibe — or find the emotion behind your lyrics.",
    fn=chatbot,
    inputs=gr.Textbox(
        placeholder="Tell me a mood or paste some lyrics...",
        lines=3,
    ),
    outputs="text",
)

# Start the web app
iface.launch()


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/837 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://af234b8a3b7e077574.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


