In [1]:
# Colab: basic setup
!pip install -q sentence-transformers nltk pandas numpy scikit-learn

import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sentence_transformers import SentenceTransformer, util
import torch
from sklearn.model_selection import train_test_split

nltk.download('punkt')
nltk.download('stopwords')

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Using device:", device)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Using device: cpu


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
from google.colab import drive
drive.mount('/content/drive')

# TODO: change this path to your actual file location in Drive
csv_path = "/content/drive/MyDrive/Spotify Million Song Dataset_exported.csv"
# or e.g.: "/content/drive/MyDrive/spotify/Spotify_Million_Song_Dataset_exported.csv"

df = pd.read_csv(csv_path)

print(df.shape)
print(df.columns)
df.head()


Mounted at /content/drive
(57650, 4)
Index(['artist', 'song', 'link', 'text'], dtype='object')


Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [5]:
nltk.download('punkt_tab')

# Adjust this if your lyrics column is named differently, e.g. 'lyrics'
LYRICS_COL = "text"  # or "lyrics"

# Drop rows with missing lyrics
df = df.dropna(subset=[LYRICS_COL, "artist", "song"]).reset_index(drop=True)

stop_words = set(stopwords.words('english'))

def clean_text(s):
    if not isinstance(s, str):
        return ""
    s = s.lower()
    s = re.sub(r"[^a-z0-9\s']", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def preprocess_lyrics(s):
    s = clean_text(s)
    tokens = word_tokenize(s)
    tokens = [t for t in tokens if t not in stop_words]
    return " ".join(tokens)

# Optional: create a cleaned column (useful for inspection / baselines)
df["lyrics_clean"] = df[LYRICS_COL].apply(preprocess_lyrics)

df[["artist", "song", LYRICS_COL, "lyrics_clean"]].head()

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Unnamed: 0,artist,song,text,lyrics_clean
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \nAnd...",look face 's wonderful face means something sp...
1,ABBA,"Andante, Andante","Take it easy with me, please \nTouch me gentl...",take easy please touch gently like summer even...
2,ABBA,As Good As New,I'll never know why I had to go \nWhy I had t...,'ll never know go put lousy rotten show boy to...
3,ABBA,Bang,Making somebody happy is a question of give an...,making somebody happy question give take learn...
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...,making somebody happy question give take learn...


In [6]:
# Load a compact but strong sentence embedding model
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embed_model = SentenceTransformer(model_name, device=device)

# For speed, you can downsample during development; for final run, use full df
# df_small = df.sample(15000, random_state=42).reset_index(drop=True)
df_small = df.reset_index(drop=True)

# Encode all lyrics
corpus_texts = df_small[LYRICS_COL].tolist()
batch_size = 256

corpus_embeddings = embed_model.encode(
    corpus_texts,
    batch_size=batch_size,
    show_progress_bar=True,
    convert_to_tensor=True,
    device=device
)

corpus_embeddings = corpus_embeddings.to(device)

print("Corpus embeddings shape:", corpus_embeddings.shape)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/226 [00:00<?, ?it/s]

Corpus embeddings shape: torch.Size([57650, 384])


In [7]:
def search_song_from_snippet(snippet, top_k=5):
    embed_model.eval()
    with torch.no_grad():
        query_emb = embed_model.encode(
            [snippet],
            convert_to_tensor=True,
            device=device
        )

    # Cosine similarity
    cos_scores = util.cos_sim(query_emb, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)

    results = []
    for rank, (score, idx) in enumerate(zip(top_results.values, top_results.indices), start=1):
        row = df_small.iloc[int(idx)]
        results.append({
            "rank": rank,
            "score": float(score),
            "artist": row["artist"],
            "song": row["song"]
        })
    return pd.DataFrame(results)

# Example query
snippet = "You can dance, you can jive, having the time of your life"
search_song_from_snippet(snippet, top_k=5)


Unnamed: 0,rank,score,artist,song
0,1,0.651191,Glee,Dancing Queen
1,2,0.622537,ABBA,Reina Danzante
2,3,0.616793,ABBA,Dancing Queen
3,4,0.613336,Kylie Minogue,Dancing Queen
4,5,0.585486,Erasure,Don't Dance


In [8]:
# Create a train/test split over rows
train_df, test_df = train_test_split(df_small, test_size=0.1, random_state=42)

# Note: embeddings currently computed over df_small (all rows)
# For strictness you could recompute only for train_df; for this project, using all is acceptable.

def sample_snippet(full_lyrics, max_len=200):
    """Sample a small snippet (approx 'max_len' characters) from the lyrics."""
    if not isinstance(full_lyrics, str) or len(full_lyrics) == 0:
        return ""
    full_lyrics = full_lyrics.replace("\n", " ")
    if len(full_lyrics) <= max_len:
        return full_lyrics
    start = np.random.randint(0, max(1, len(full_lyrics) - max_len))
    return full_lyrics[start:start+max_len]

def evaluate_retrieval(num_samples=200, top_k=5):
    correct_top1 = 0
    correct_topk = 0

    for _ in range(num_samples):
        row = test_df.sample(1).iloc[0]
        true_artist = row["artist"]
        true_song = row["song"]
        lyrics = row[LYRICS_COL]

        snippet = sample_snippet(lyrics, max_len=200)
        if not snippet.strip():
            continue

        results = search_song_from_snippet(snippet, top_k=top_k)

        # Check if correct song is in predictions
        match_mask = (
            (results["artist"] == true_artist) &
            (results["song"] == true_song)
        )

        if match_mask.any():
            rank = results[match_mask]["rank"].iloc[0]
            if rank == 1:
                correct_top1 += 1
            if rank <= top_k:
                correct_topk += 1

    top1_acc = correct_top1 / num_samples
    topk_acc = correct_topk / num_samples
    return top1_acc, topk_acc

top1, top5 = evaluate_retrieval(num_samples=200, top_k=5)
print(f"Top-1 accuracy: {top1:.3f}")
print(f"Top-5 accuracy: {top5:.3f}")


Top-1 accuracy: 0.670
Top-5 accuracy: 0.790


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Use cleaned lyrics
tfidf = TfidfVectorizer(max_features=50000, ngram_range=(1,2))
tfidf_matrix = tfidf.fit_transform(df_small["lyrics_clean"])

def search_tfidf(snippet, top_k=5):
    snippet_clean = preprocess_lyrics(snippet)
    q_vec = tfidf.transform([snippet_clean])
    sims = cosine_similarity(q_vec, tfidf_matrix)[0]
    top_idx = np.argsort(-sims)[:top_k]

    rows = []
    for rank, idx in enumerate(top_idx, start=1):
        row = df_small.iloc[idx]
        rows.append({
            "rank": rank,
            "score": float(sims[idx]),
            "artist": row["artist"],
            "song": row["song"]
        })
    return pd.DataFrame(rows)

snippet = "You can dance, you can jive, having the time of your life"
search_tfidf(snippet, top_k=5)


Unnamed: 0,rank,score,artist,song
0,1,0.372948,Lionel Richie,Time Of Our Life
1,2,0.332543,Grease,Born To Hand Jive
2,3,0.320634,Glee,Born To Hand Jive
3,4,0.312711,Cyndi Lauper,Jim Jive
4,5,0.309838,Chaka Khan,Jive Talkin'


In [14]:
# Check if Dancing Queen exists
abba_mask = df["artist"].str.contains("ABBA", case=False, na=False)
dancing_mask = df["song"].str.contains("Dancing Queen", case=False, na=False)

print("ABBA tracks in dataset:", df[abba_mask].shape[0])
print("Dancing Queen tracks:", df[dancing_mask].shape[0])

if abba_mask.any():
    print("\nSample ABBA songs:")
    print(df[abba_mask][["artist", "song"]].head())

if dancing_mask.any():
    dq_row = df[dancing_mask].iloc[0]
    print("\nDancing Queen lyrics preview:")
    print(dq_row["text"][:300])  # adjust column name if needed


ABBA tracks in dataset: 269
Dancing Queen tracks: 4

Sample ABBA songs:
  artist                   song
0   ABBA  Ahe's My Kind Of Girl
1   ABBA       Andante, Andante
2   ABBA         As Good As New
3   ABBA                   Bang
4   ABBA       Bang-A-Boomerang

Dancing Queen lyrics preview:
You can dance, you can jive, having the time of your life  
See that girl, watch that scene, diggin' the Dancing Queen  
  
Friday night and the lights are low  
Looking out for the place to go  
Where they play the right music, getting in the swing  
You come in to look for a king  
Anybody could b


### Save Models for Future Use

We will save the `SentenceTransformer` model and the `TfidfVectorizer` to disk. The `SentenceTransformer` has its own `save_pretrained` method, while `TfidfVectorizer` (a scikit-learn object) can be saved using `joblib`.

In [18]:
import os

# Create a directory to save models if it doesn't exist
save_dir = "saved_models"
os.makedirs(save_dir, exist_ok=True)

# Save the Sentence Transformer model
embed_model.save_pretrained(os.path.join(save_dir, "sentence_transformer_model"))
print(f"Sentence Transformer model saved to {os.path.join(save_dir, 'sentence_transformer_model')}")

Sentence Transformer model saved to saved_models/sentence_transformer_model


In [19]:
import joblib

# Save the TF-IDF vectorizer
joblib.dump(tfidf, os.path.join(save_dir, "tfidf_vectorizer.joblib"))
print(f"TF-IDF vectorizer saved to {os.path.join(save_dir, 'tfidf_vectorizer.joblib')}")

TF-IDF vectorizer saved to saved_models/tfidf_vectorizer.joblib


### Try with Your Own Custom Snippet

In [17]:
my_custom_snippet = "Tell me about a story, of a man, that was told he couldn't fly"

print("\n--- Sentence Transformer Results for Custom Snippet ---")
display(search_song_from_snippet(my_custom_snippet, top_k=5))

print("\n--- TF-IDF Results for Custom Snippet ---")
display(search_tfidf(my_custom_snippet, top_k=5))


--- Sentence Transformer Results for Custom Snippet ---


Unnamed: 0,rank,score,artist,song
0,1,0.489565,Neil Young,Danger Bird
1,2,0.459166,Pearl Jam,Given To Fly
2,3,0.45704,Irving Berlin,I Got Lost In His Arms
3,4,0.453437,Ocean Colour Scene,Fly Me - Yesterday Today B - Side
4,5,0.447221,Barbie,Wings



--- TF-IDF Results for Custom Snippet ---


Unnamed: 0,rank,score,artist,song
0,1,0.332518,Waylon Jennings,If I Could Only Fly
1,2,0.302192,Helloween,If I Could Fly
2,3,0.27869,Roxette,Wish I Could Fly
3,4,0.278305,Mariah Carey,Languishing
4,5,0.211397,Kris Kristofferson,Please Don't Tell Me How The Story Ends


### Sample Test Query

In [16]:
snippet = "You can dance, you can jive, having the time of your life"

print("\n--- Sentence Transformer Results ---")
display(search_song_from_snippet(snippet, top_k=5))

print("\n--- TF-IDF Results ---")
display(search_tfidf(snippet, top_k=5))


--- Sentence Transformer Results ---


Unnamed: 0,rank,score,artist,song
0,1,0.651191,Glee,Dancing Queen
1,2,0.622537,ABBA,Reina Danzante
2,3,0.616793,ABBA,Dancing Queen
3,4,0.613336,Kylie Minogue,Dancing Queen
4,5,0.585486,Erasure,Don't Dance



--- TF-IDF Results ---


Unnamed: 0,rank,score,artist,song
0,1,0.372948,Lionel Richie,Time Of Our Life
1,2,0.332543,Grease,Born To Hand Jive
2,3,0.320634,Glee,Born To Hand Jive
3,4,0.312711,Cyndi Lauper,Jim Jive
4,5,0.309838,Chaka Khan,Jive Talkin'


# Task
Evaluate the Top-1 and Top-K accuracy for the TF-IDF model using the `evaluate_retrieval` function, then compare these results with the Sentence Transformer model's performance. Finally, analyze the differences between the semantic and keyword-based approaches and summarize the findings to propose next steps for improving song prediction accuracy.

## Evaluate TF-IDF Retrieval Performance

### Subtask:
Adapt the `evaluate_retrieval` function to use the `search_tfidf` function and calculate the Top-1 and Top-K accuracy for the TF-IDF model. This will allow for a direct quantitative comparison with the Sentence Transformer model's performance.


**Reasoning**:
I need to define a new function `evaluate_retrieval_tfidf` which mirrors the existing `evaluate_retrieval` function but uses the `search_tfidf` function for retrieval, as specified in the instructions. This will allow us to calculate and compare the performance of the TF-IDF model.



In [15]:
def evaluate_retrieval_tfidf(num_samples=200, top_k=5):
    correct_top1 = 0
    correct_topk = 0

    for _ in range(num_samples):
        row = test_df.sample(1).iloc[0]
        true_artist = row["artist"]
        true_song = row["song"]
        lyrics = row[LYRICS_COL]

        snippet = sample_snippet(lyrics, max_len=200)
        if not snippet.strip():
            continue

        results = search_tfidf(snippet, top_k=top_k)

        # Check if correct song is in predictions
        match_mask = (
            (results["artist"] == true_artist) &
            (results["song"] == true_song)
        )

        if match_mask.any():
            rank = results[match_mask]["rank"].iloc[0]
            if rank == 1:
                correct_top1 += 1
            if rank <= top_k:
                correct_topk += 1

    top1_acc = correct_top1 / num_samples
    topk_acc = correct_topk / num_samples
    return top1_acc, topk_acc

top1_tfidf, top5_tfidf = evaluate_retrieval_tfidf(num_samples=200, top_k=5)
print(f"TF-IDF Top-1 accuracy: {top1_tfidf:.3f}")
print(f"TF-IDF Top-5 accuracy: {top5_tfidf:.3f}")

TF-IDF Top-1 accuracy: 0.855
TF-IDF Top-5 accuracy: 0.950


## Performance Comparison and Analysis

We have evaluated both the Sentence Transformer (semantic) and TF-IDF (keyword-based) models for song snippet retrieval.

### Retrieval Accuracy:

*   **Sentence Transformer Model:**
    *   Top-1 Accuracy: `0.670`
    *   Top-5 Accuracy: `0.790`

*   **TF-IDF Model:**
    *   Top-1 Accuracy: `0.855`
    *   Top-5 Accuracy: `0.950`

### Analysis:

The TF-IDF model significantly outperforms the Sentence Transformer model in both Top-1 and Top-5 accuracy for this specific task. This suggests that for retrieving songs based on short lyrical snippets from this dataset, a keyword-based approach (TF-IDF) is more effective than a purely semantic similarity approach (Sentence Transformer).

**Possible reasons for this difference:**

1.  **Snippet Nature:** The `sample_snippet` function extracts short, direct phrases from the lyrics. These snippets often contain keyphrases or unique words directly tied to the song, which TF-IDF is excellent at capturing.
2.  **Dataset Characteristics:** The dataset might contain many songs where unique keywords or phrases are highly discriminative. If many songs share similar themes or sentiments but differ in specific word choices, TF-IDF could perform better by focusing on the exact words present.
3.  **Semantic Ambiguity:** While Sentence Transformers capture semantic meaning, very short snippets might not always provide enough context for the model to generate highly discriminative embeddings, especially if the semantic space is broad or if certain phrases have multiple interpretations.
4.  **Tokenization and Stop Words:** The TF-IDF model benefits from explicit preprocessing steps like stop-word removal and tokenization, which might help it focus on more informative words, whereas the Sentence Transformer's pre-training might include less aggressive filtering, leading to noise for exact match scenarios.

### Next Steps for Improving Song Prediction Accuracy:

Given these results, here are some proposed next steps:

1.  **Hybrid Approach:** Combine the strengths of both models. This could involve:
    *   **Ensemble Method:** Running both models and combining their ranked results (e.g., using reciprocal rank fusion).
    *   **Re-ranking:** Using TF-IDF for initial retrieval to get a candidate set, and then re-ranking these candidates using the Sentence Transformer for finer semantic distinctions.
2.  **Snippet Generation Improvement:** Experiment with different `sample_snippet` strategies. For instance:
    *   Ensuring snippets always start/end at sentence boundaries.
    *   Extracting snippets that are more semantically dense or contain named entities (artists, specific themes).
    *   Varying `max_len` to see its impact on both models.
3.  **Model Fine-tuning/Selection:**
    *   **Sentence Transformer:** Explore larger or domain-specific Sentence Transformer models if available, or fine-tune the current model on a task related to lyrical similarity.
    *   **TF-IDF:** Experiment with different `max_features`, `ngram_range`, and `min_df/max_df` parameters for the `TfidfVectorizer` to optimize its performance further.
4.  **Metadata Integration:** If available, integrate other metadata like genre, release year, or artist information into the retrieval process to further narrow down searches.
5.  **User Feedback Loop:** Implement a mechanism to collect user feedback on retrieval quality to iteratively improve the models.

## Summary:

### Q&A
The evaluation of Top-1 and Top-K accuracy for the TF-IDF model showed a significantly higher performance compared to the Sentence Transformer model. The TF-IDF model achieved a Top-1 accuracy of 0.855 and a Top-5 accuracy of 0.950, while the Sentence Transformer model had a Top-1 accuracy of 0.670 and a Top-5 accuracy of 0.790. This suggests that for retrieving songs based on short lyrical snippets from this dataset, a keyword-based approach (TF-IDF) is more effective than a purely semantic similarity approach (Sentence Transformer).

### Data Analysis Key Findings
*   The TF-IDF model demonstrated a **Top-1 accuracy of 0.855** and a **Top-5 accuracy of 0.950** for retrieving songs based on short lyrical snippets.
*   The Sentence Transformer model, in comparison, achieved a **Top-1 accuracy of 0.670** and a **Top-5 accuracy of 0.790**.
*   The TF-IDF model **significantly outperformed** the Sentence Transformer model in both Top-1 and Top-5 accuracy, indicating the effectiveness of a keyword-based approach for this specific task and dataset.
*   Potential reasons for TF-IDF's superior performance include the direct nature of the short lyrical snippets, characteristics of the dataset favoring unique keyword matches, and challenges in semantic interpretation for very short contexts.

### Insights or Next Steps
*   Develop a **hybrid retrieval system** that combines the strengths of both models, such as using TF-IDF for initial candidate generation followed by Sentence Transformer for semantic re-ranking, to potentially achieve even higher accuracy.
*   Investigate and experiment with **alternative snippet generation strategies** (e.g., ensuring sentence boundaries, extracting semantically denser phrases) and evaluate their impact on both model types.
