In [None]:
! pip install sentence-transformers

In [None]:
import kagglehub

# Fetch the song-lyrics dataset through kagglehub. The returned `path` is
# what the loading cell below lists to find the data file.
path = kagglehub.dataset_download(
    "nikhilnayak123/5-million-song-lyrics-dataset"
)
print("Path to dataset files:", path)

In [None]:
import os
import pandas as pd
import numpy as np
import torch
import pickle
import os
from tqdm import tqdm

# Pick the data file inside the downloaded dataset directory. os.listdir()
# returns entries in arbitrary order, so sort them and prefer a CSV instead of
# trusting whatever happens to be listed first.
dataset_entries = sorted(os.listdir(path))
if not dataset_entries:
    raise FileNotFoundError(f"No files found in dataset directory: {path}")
csv_entries = [name for name in dataset_entries if name.lower().endswith('.csv')]
data_file = os.path.join(path, (csv_entries or dataset_entries)[0])

df = pd.read_csv(data_file)

# Take up to 300,000 random samples. DataFrame.sample raises when n exceeds
# the number of rows (sampling without replacement), so cap n at len(df).
df_sample = df.sample(n=min(300000, len(df)), random_state=42)

print(f"Sampled dataset shape: {df_sample.shape}")

In [None]:
# Preview the first five rows of the full (unsampled) frame.
df.head(n=5)

In [None]:
from sentence_transformers import SentenceTransformer, util
!apt install libomp-dev
!pip install --upgrade faiss-cpu

import faiss

In [None]:
# Shared SBERT encoder used by create_embeddings below.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')


def create_embeddings(df_sample, batch_size=32, output_path='lyrics_embeddings.pkl'):
    """Encode the lyrics of every row with SBERT and optionally pickle the result.

    Args:
        df_sample: DataFrame with a 'lyrics' column. If it also has a
            'song_id' column those values are used as ids; otherwise
            positional ids of the form "song_<row position>" are generated.
        batch_size: Number of lyrics passed to the encoder per call.
        output_path: File to pickle {'embeddings': ..., 'song_ids': ...} to.
            A falsy value skips saving.

    Returns:
        (embeddings, song_ids): a tensor with one row per input row, and the
        list of ids in the same order.
    """
    # Get song IDs and lyrics
    if 'song_id' in df_sample.columns:
        song_ids = df_sample['song_id'].tolist()
    else:
        song_ids = [f"song_{i}" for i in range(len(df_sample))]

    # Missing lyrics become empty strings; astype(str) guards against
    # non-string cells (e.g. numbers) that the encoder cannot tokenize.
    lyrics = df_sample['lyrics'].fillna('').astype(str).tolist()

    # Compute embeddings in batches
    all_embeddings = []
    for start in tqdm(range(0, len(lyrics), batch_size)):
        batch = lyrics[start:start + batch_size]
        all_embeddings.append(sbert_model.encode(batch, convert_to_tensor=True))

    if all_embeddings:
        embeddings = torch.cat(all_embeddings)
    else:
        # torch.cat raises on an empty list; return an empty (0, dim) tensor
        # so an empty frame yields an empty result instead of a crash.
        dimension = sbert_model.get_sentence_embedding_dimension()
        embeddings = torch.empty((0, dimension))

    # Save embeddings and song ids
    if output_path:
        print(f"Saving embeddings to {output_path}")
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        with open(output_path, 'wb') as f:
            pickle.dump({
                # Store a CPU copy: a pickled CUDA tensor cannot be loaded
                # on a machine without a GPU. (.cpu() is a no-op on CPU.)
                'embeddings': embeddings.cpu(),
                'song_ids': song_ids
            }, f)

    print(f"Created embeddings with shape: {embeddings.shape}")
    return embeddings, song_ids

In [None]:
# Embed every row of the sample and pickle the embeddings together with
# their song ids to 'lyrics_embeddings_300k.pkl'.
embeddings, song_ids = create_embeddings(df_sample, batch_size=32, output_path='lyrics_embeddings_300k.pkl')

In [None]:
# Create FAISS flat index for search, saves to given index path
def create_faiss_index(embeddings, song_ids, index_path='lyrics_index.faiss'):
    """Build a flat inner-product FAISS index over L2-normalized embeddings.

    The index is written to `index_path`, and `song_ids` is pickled next to it
    as "<index_path without extension>.ids.pkl".

    Args:
        embeddings: 2-D torch tensor, one row per song.
        song_ids: Sequence of ids, one per embedding row, in the same order.
        index_path: Destination file for the FAISS index.

    Returns:
        The `index_path` that was written.

    Raises:
        ValueError: If the number of ids differs from the number of vectors.
    """
    # FAISS needs a C-contiguous float32 array. Copy explicitly: for a CPU
    # tensor .numpy() shares memory with the tensor, and normalize_L2 below
    # works in place, so without a copy the caller's embeddings would be
    # silently normalized as a side effect.
    embeddings_np = np.array(
        embeddings.detach().cpu().numpy(), dtype=np.float32, order='C', copy=True
    )

    # Get dimensions
    num_vectors, dimension = embeddings_np.shape
    if len(song_ids) != num_vectors:
        raise ValueError(
            f"Got {len(song_ids)} song ids for {num_vectors} embedding vectors"
        )

    # Normalize vectors so inner product equals cosine similarity
    faiss.normalize_L2(embeddings_np)

    # Create a flat index and add vectors
    index = faiss.IndexFlatIP(dimension)
    index.add(embeddings_np)

    # Save the FAISS index
    print(f"Saving index to {index_path}")
    faiss.write_index(index, index_path)

    # Save the mapping from index positions to song IDs
    mapping_path = f"{os.path.splitext(index_path)[0]}.ids.pkl"
    print(f"Saving song ID mapping to {mapping_path}")
    with open(mapping_path, 'wb') as f:
        pickle.dump(song_ids, f)

    return index_path

In [None]:
# Build the index from the embeddings computed above and write it, plus the
# id mapping, to disk.
index_path = create_faiss_index(
    embeddings,
    song_ids,
    index_path='lyrics_index.faiss',
)

In [None]:
def simple_search(query, index_path, df, top_k = 20, model=None):
    """Return the songs whose lyric embeddings are most similar to `query`.

    Results are looked up by row position (`df.iloc`), so `df` must be the
    same frame, in the same row order, that the index was built from.

    Args:
        query: Free-text search string.
        index_path: Path of a FAISS index to read.
        df: DataFrame providing 'title', 'artist', 'tag' and 'year' columns.
        top_k: Maximum number of results to return.
        model: Optional already-loaded encoder with an `encode` method. When
            omitted, 'all-MiniLM-L6-v2' is loaded on every call, which is slow
            for repeated queries.

    Returns:
        A list of dicts with keys 'position', 'title', 'artist', 'tag',
        'year' and 'similarity_score', in the order FAISS returned them.
    """
    # Load sbert model (unless the caller supplied one) and index
    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')
    index = faiss.read_index(index_path)

    # Never ask for more neighbours than the index holds: FAISS pads the
    # result with -1 ids in that case, which would inflate the match count.
    k = min(int(top_k), index.ntotal)
    if k <= 0:
        print("Found 0 matches")
        return []

    # Encode the query as a single-row, contiguous float32 matrix (what
    # FAISS expects) and normalize it in place.
    query_embedding = model.encode(query, convert_to_numpy=True)
    query_embedding = np.ascontiguousarray(
        query_embedding.reshape(1, -1), dtype=np.float32
    )
    faiss.normalize_L2(query_embedding)

    # Search
    distances, indices = index.search(query_embedding, k)

    print(f"Found {len(indices[0])} matches")

    # Use positional matching
    results = []
    for score, idx in zip(distances[0], indices[0]):
        idx = int(idx)  # plain int rather than a numpy integer
        if 0 <= idx < len(df):
            song_data = df.iloc[idx]
            results.append({
                'position': idx,
                'title': song_data['title'],
                'artist': song_data['artist'],
                'tag': song_data['tag'],
                'year': song_data['year'],
                'similarity_score': float(score)
            })

    return results

In [None]:
# Example search: retrieve 20 hits for a breakup-themed query.
query = "sad breakup songs about lost love"
results = simple_search(query=query, index_path='lyrics_index.faiss', df=df_sample, top_k=20)

# Print only the first ten of the retrieved hits.
print("\nTop 10 Results:")
for i in range(min(10, len(results))):
    result = results[i]
    print(f"{i+1}. {result['title']} by {result['artist']} ({result['tag']}, {result['year']}) - Score: {result['similarity_score']:.4f}")

In [None]:
# Example search: retrieve 20 hits for a study-music query.
query = "study music"
results = simple_search(query=query, index_path='lyrics_index.faiss', df=df_sample, top_k=20)

# Print only the first ten of the retrieved hits.
print("\nTop 10 Results:")
for i in range(min(10, len(results))):
    result = results[i]
    print(f"{i+1}. {result['title']} by {result['artist']} ({result['tag']}, {result['year']}) - Score: {result['similarity_score']:.4f}")

In [None]:
from google.colab import files

# Download each generated artifact through the Colab file helper, in the
# same order as before.
for artifact_name in (
    'lyrics_embeddings_300k.pkl',  # embeddings file
    'lyrics_index.faiss',          # FAISS index
    'lyrics_index.ids.pkl',        # id mapping
):
    files.download(artifact_name)