# LLM zero shots and few shots title generation

## Data cleaning

In [18]:
# CS 6140 - Machine Learning
#LLM zero-shot vs frew-shot for title generation
# Import necessary libraries
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from collections import Counter

# Part 1 - Data cleaning(the same as seq2seq data cleaning for the purpose of data consistency)

# Load Spotify dataset, remove duplicates
file_path = "spotify_songs.csv"
spotify_df = pd.read_csv(file_path)
spotify_df = spotify_df.drop_duplicates()

# Convert album release date to datetime
spotify_df['track_album_release_date'] = pd.to_datetime(
    spotify_df['track_album_release_date'], errors='coerce'
)

# Drop rows with missing values in key columns
spotify_df_cleaned = spotify_df.dropna(subset=[
    'track_id', 'track_name', 'track_artist',
    'track_album_id', 'track_album_name', 'track_album_release_date'
])

# Filter out invalid values
spotify_df_cleaned = spotify_df_cleaned[
    (spotify_df_cleaned['track_popularity'] >= 0) &
    (spotify_df_cleaned['track_popularity'] <= 100) &
    (spotify_df_cleaned['loudness'] <= 0)
]

# Standardize text columns
spotify_df_cleaned['track_name'] = spotify_df_cleaned['track_name'].str.strip().str.title()
spotify_df_cleaned['track_artist'] = spotify_df_cleaned['track_artist'].str.strip().str.title()
spotify_df_cleaned['playlist_genre'] = spotify_df_cleaned['playlist_genre'].str.strip().str.lower()
spotify_df_cleaned['playlist_subgenre'] = spotify_df_cleaned['playlist_subgenre'].str.strip().str.lower()

# Save cleaned data
spotify_df_cleaned.to_csv("spotify_songs_cleaned.csv", index=False)

In [19]:
# Display cleaned data
spotify_df_cleaned.head()

Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,6f807x0ima9a1j3VPbc7VN,I Don'T Care (With Justin Bieber) - Loud Luxur...,Ed Sheeran,66,2oCs0DGTsRO98Gh5ZSl2Cx,I Don't Care (with Justin Bieber) [Loud Luxury...,2019-06-14,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,6,-2.634,1,0.0583,0.102,0.0,0.0653,0.518,122.036,194754
1,0r7CVbZTWZgbTCYdfa2P31,Memories - Dillon Francis Remix,Maroon 5,67,63rPSO264uRjW1X5E6cWv6,Memories (Dillon Francis Remix),2019-12-13,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,11,-4.969,1,0.0373,0.0724,0.00421,0.357,0.693,99.972,162600
2,1z1Hg7Vb0AhHDiEmnDE79l,All The Time - Don Diablo Remix,Zara Larsson,70,1HoSmj2eLcsrR0vE9gThr4,All the Time (Don Diablo Remix),2019-07-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,1,-3.432,0,0.0742,0.0794,2.3e-05,0.11,0.613,124.008,176616
3,75FpbthrwQmzHlBJLuGdC7,Call You Mine - Keanu Silva Remix,The Chainsmokers,60,1nqYsOef1yKKuGOVchbsk6,Call You Mine - The Remixes,2019-07-19,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,7,-3.778,1,0.102,0.0287,9e-06,0.204,0.277,121.956,169093
4,1e8PAfcKUYoKkxPhrHqw4x,Someone You Loved - Future Humans Remix,Lewis Capaldi,69,7m7vv9wlQ4i0LFuJiE2zsQ,Someone You Loved (Future Humans Remix),2019-03-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,1,-4.672,1,0.0359,0.0803,0.0,0.0833,0.725,123.976,189052


## Zero Shot

In [6]:
Mistral_API_Key="473aIIXwkwsZFT9RWpzkyqoFVrR8AcNw"

In [24]:
import pandas as pd
import numpy as np
import os
import random
import requests
import json

# Function to create playlist prompt using your existing spotify_df_cleaned
def create_zero_shot_prompt(playlist_df):
    """
    Create a prompt for the LLM to generate a playlist title without examples
    
    Args:
        playlist_df: DataFrame containing songs for the playlist
        
    Returns:
        str: Formatted prompt for the LLM
    """
    # Get a sample of songs to include in the prompt (avoid token limits)
    sample_size = min(5, len(playlist_df))
    sample_songs = playlist_df.sample(sample_size) if len(playlist_df) > sample_size else playlist_df
    
    # Create the prompt
    prompt = f"""Task: Generate a creative, catchy playlist title

Songs in the playlist:
"""
    
    # Add song information
    for i, (_, song) in enumerate(sample_songs.iterrows()):
        prompt += f"{i+1}. \"{song['track_name']}\" by {song['track_artist']}\n"
    
    # Add additional context about the songs
    prompt += "\nAdditional details:\n"
    
    # Calculate average popularity if available
    if 'track_popularity' in playlist_df.columns:
        avg_popularity = playlist_df['track_popularity'].mean()
        prompt += f"- Average track popularity: {avg_popularity:.1f}/100\n"
    
    # Get release date range if available
    if 'track_album_release_date' in playlist_df.columns:
        dates = playlist_df['track_album_release_date'].dropna()
        if not dates.empty:
            min_date = min(dates)
            max_date = max(dates)
            prompt += f"- Release date range: {min_date} to {max_date}\n"
    
    # Note if there are remixes
    has_remixes = any(playlist_df['track_name'].str.contains('Remix', case=False, na=False))
    if has_remixes:
        prompt += "- Contains remix tracks\n"
    
    prompt += "\nGenerate a single, creative playlist title that captures the mood and theme of these songs. The title should be catchy, marketable, and appealing to listeners:"
    
    return prompt

def generate_title_with_mistral(prompt, api_key=Mistral_API_Key, temperature=0.7):
    """
    Generate a playlist title using Mistral Large LLM via API
    
    Args:
        prompt: Text prompt to send to the model
        api_key: API key for the service (defaults to notebook variable Mistral_API_Key)
        temperature: Sampling temperature (higher = more creative)
        
    Returns:
        str: Generated playlist title
    """
    if not api_key:
        raise ValueError("No API key provided.")
    
    url = "https://api.mistral.ai/v1/chat/completions"
    
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    
    data = {
        "model": "mistral-large-latest",  # Use the latest Mistral Large model
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "temperature": temperature,
        "max_tokens": 100  # Enough tokens for a title
    }
    
    try:
        print(f"Sending prompt to Mistral API: {prompt[:100]}...")
        response = requests.post(url, headers=headers, data=json.dumps(data))
        response.raise_for_status()  # Raise exception for HTTP errors
        
        response_data = response.json()
        
        # Extract the generated title from the response
        title = response_data["choices"][0]["message"]["content"].strip()
        
        # Clean up the title - remove quotes if present and any "Title:" prefix
        title = title.strip('"\'')
        if ":" in title and title.split(":", 1)[0].lower() in ["title", "playlist title", "playlist"]:
            title = title.split(":", 1)[1].strip()
            
        return title
    
    except Exception as e:
        print(f"Error calling Mistral API: {str(e)}")
        return f"Error generating title: {str(e)}"

# Function to create sample playlists from your DataFrame
def create_sample_playlists(df, num_playlists=3, songs_per_playlist=5):
    """
    Create sample playlists from the dataset for testing
    
    Args:
        df: DataFrame with song data
        num_playlists: Number of playlists to create
        songs_per_playlist: Number of songs per playlist
    
    Returns:
        list: List of DataFrames, each representing a playlist
    """
    playlists = []
    
    # Method 1: Create random playlists
    for i in range(num_playlists):
        if len(df) > songs_per_playlist:
            playlist_songs = df.sample(songs_per_playlist)
        else:
            playlist_songs = df
            
        playlists.append(playlist_songs)
    
    return playlists

# Main execution code
def generate_playlist_titles(spotify_df, num_playlists=3, songs_per_playlist=5, temperature=0.7):
    """
    Generate titles for multiple playlists using the Mistral API
    
    Args:
        spotify_df: DataFrame containing your Spotify track data
        num_playlists: Number of playlists to generate titles for
        songs_per_playlist: Number of songs in each playlist
        temperature: Creativity level for title generation
        
    Returns:
        pd.DataFrame: Results containing generated playlist titles
    """
    print("Playlist Title Generation with Mistral Large")
    print("-------------------------------------------")

    # Create sample playlists from data
    sample_playlists = create_sample_playlists(spotify_df, 
                                              num_playlists=num_playlists,
                                              songs_per_playlist=songs_per_playlist)

    # Generate titles for each playlist
    results = []

    for i, playlist in enumerate(sample_playlists):
        print(f"\nPlaylist {i+1}:")
        print(f"Sample tracks: {', '.join(playlist['track_name'].iloc[:3].tolist())}")
        
        # Create prompt
        prompt = create_zero_shot_prompt(playlist)
        
        # Generate title
        title = generate_title_with_mistral(prompt, temperature=temperature)
        
        # Create a track list for display
        track_list = [f"{song['track_name']} by {song['track_artist']}" 
                     for _, song in playlist.iterrows()]
        
        # Store result
        results.append({
            'playlist_id': i,
            'generated_title': title,
            'tracks': track_list,
            'num_tracks': len(playlist),
            'average_popularity': playlist['track_popularity'].mean() if 'track_popularity' in playlist.columns else None
        })
        
        print(f"Generated title: {title}")

    # Convert results to DataFrame for analysis
    results_df = pd.DataFrame(results)
    print("\nGeneration results:")
    print(results_df[['playlist_id', 'generated_title', 'num_tracks', 'average_popularity']])
    
    return results_df


In [None]:
#Generate playlist titles
results = generate_playlist_titles(
    spotify_df_cleaned,  # DataFrame with track data
    num_playlists=3,     # Generate 3 different playlists
    songs_per_playlist=5,  # 5 songs per playlist
    temperature=0.8      # Higher temperature for more creative titles
)

# Display the results
print("\nFull results with generated playlist titles:")
for i, row in results.iterrows():
    print(f"\nPlaylist {row['playlist_id']+1}: \"{row['generated_title']}\"")
    print("Tracks:")
    for j, track in enumerate(row['tracks']):
        print(f"  {j+1}. {track}")

Playlist Title Generation with Mistral Large
-------------------------------------------

Playlist 1:
Sample tracks: Runnin (With A$Ap Rocky, A$Ap Ferg & Nicki Minaj), Right Here - Human Nature Radio Mix, The Boy Is Mine
Sending prompt to Mistral API: Task: Generate a creative, catchy playlist title

Songs in the playlist:
1. "Runnin (With A$Ap Rocky...
Generated title: **"Soulful Strides: Chilled Chasers Through Time"**

This title captures the blend of classic and modern tracks, suggesting a smooth and soulful listening experience that spans different eras. It's catchy, marketable, and appeals to listeners looking for a diverse yet cohesive musical journey.

Playlist 2:
Sample tracks: Ya Llego (Captain Planet Remix), No Strings Attached, Sunshinebloom
Sending prompt to Mistral API: Task: Generate a creative, catchy playlist title

Songs in the playlist:
1. "Ya Llego (Captain Plane...
Generated title: **"Tropical Sunshine Remix: A Million Strings of Freedom"**

This title captures the

## Few shots

In [25]:
import pandas as pd
import numpy as np
import requests
import json
import random

def create_few_shot_prompt(playlist_df, num_examples=3):
    """
    Create a prompt for the LLM to generate a playlist title with examples (few-shot learning)
    
    Args:
        playlist_df: DataFrame containing songs for the playlist
        num_examples: Number of few-shot examples to include
        
    Returns:
        str: Formatted prompt with examples for the LLM
    """
    # Get a sample of songs to include in the prompt (avoid token limits)
    sample_size = min(5, len(playlist_df))
    sample_songs = playlist_df.sample(sample_size) if len(playlist_df) > sample_size else playlist_df
    
    # Few-shot examples
    examples = [
        {
            "songs": [
                "Don't Start Now by Dua Lipa",
                "Blinding Lights by The Weeknd",
                "Physical by Dua Lipa",
                "Roses (Imanbek Remix) by SAINt JHN",
                "Midnight Sky by Miley Cyrus"
            ],
            "title": "Neon Disco Revival"
        },
        {
            "songs": [
                "bad guy by Billie Eilish",
                "SICKO MODE by Travis Scott",
                "Old Town Road by Lil Nas X",
                "Shallow by Lady Gaga",
                "Truth Hurts by Lizzo"
            ],
            "title": "Chart Toppers: New Classics"
        },
        {
            "songs": [
                "Dynamite by BTS",
                "Watermelon Sugar by Harry Styles",
                "Rain On Me by Lady Gaga",
                "positions by Ariana Grande",
                "Say So by Doja Cat"
            ],
            "title": "Summer Pop Explosion"
        },
        {
            "songs": [
                "Memories (Dillon Francis Remix) by Maroon 5",
                "In Your Eyes (Remix) by The Weeknd",
                "Dream On Me (Remix) by Ella Henderson",
                "Let Me Down Slowly (Remix) by Alec Benjamin",
                "Someone You Loved (Future Humans Remix) by Lewis Capaldi"
            ],
            "title": "Remix Renaissance"
        },
        {
            "songs": [
                "All The Time (Don Diablo Remix) by Zara Larsson",
                "Call You Mine (Keanu Silva Remix) by The Chainsmokers",
                "Close To Me (Red Triangle Remix) by Ellie Goulding",
                "I Don't Care (With Justin Bieber) (Loud Luxury Remix) by Ed Sheeran",
                "Higher Love (Kygo Remix) by Whitney Houston"
            ],
            "title": "EDM Remix Odyssey"
        }
    ]
    
    # Select a subset of examples
    selected_examples = random.sample(examples, min(num_examples, len(examples)))
    
    # Start building the prompt
    prompt = "Task: Generate a creative, catchy playlist title based on the songs in the playlist.\n\n"
    prompt += "Here are some examples of playlists and good titles for them:\n\n"
    
    # Add the few-shot examples
    for i, example in enumerate(selected_examples):
        prompt += f"Example {i+1}:\n"
        prompt += "Songs:\n"
        for j, song in enumerate(example["songs"]):
            prompt += f"{j+1}. {song}\n"
        prompt += f"Title: {example['title']}\n\n"
    
    # Now add the actual songs we want a title for
    prompt += "Now, generate a creative title for this playlist:\n"
    prompt += "Songs:\n"
    
    # Add song information
    for i, (_, song) in enumerate(sample_songs.iterrows()):
        prompt += f"{i+1}. \"{song['track_name']}\" by {song['track_artist']}\n"
    
    # Add additional context about the songs if available
    if 'track_popularity' in playlist_df.columns:
        avg_popularity = playlist_df['track_popularity'].mean()
        prompt += f"\nAverage track popularity: {avg_popularity:.1f}/100\n"
    
    # Check if there are remixes
    has_remixes = any(playlist_df['track_name'].str.contains('Remix', case=False, na=False))
    if has_remixes:
        prompt += "Note: This playlist contains remix tracks.\n"
    
    prompt += "\nTitle:"
    
    return prompt

def generate_title_with_mistral_few_shot(prompt, api_key=Mistral_API_Key, temperature=0.7):
    """
    Generate a playlist title using Mistral Large LLM via API with few-shot examples
    
    Args:
        prompt: Text prompt to send to the model (including few-shot examples)
        api_key: API key for the service
        temperature: Sampling temperature (higher = more creative)
        
    Returns:
        str: Generated playlist title
    """
    if not api_key:
        raise ValueError("No API key provided.")
    
    url = "https://api.mistral.ai/v1/chat/completions"
    
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    
    data = {
        "model": "mistral-large-latest",
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "temperature": temperature,
        "max_tokens": 50  # Enough for a title
    }
    
    try:
        print(f"Sending few-shot prompt to Mistral API...")
        response = requests.post(url, headers=headers, data=json.dumps(data))
        response.raise_for_status()
        
        response_data = response.json()
        
        # Extract the generated title from the response
        title = response_data["choices"][0]["message"]["content"].strip()
        
        # Clean up the title - remove quotes if present
        title = title.strip('"\'')
        
        return title
    
    except Exception as e:
        print(f"Error calling Mistral API: {str(e)}")
        return f"Error generating title: {str(e)}"

def generate_playlist_titles_few_shot(spotify_df, num_playlists=3, songs_per_playlist=5, temperature=0.7, num_examples=3):
    """
    Generate titles for multiple playlists using the Mistral API with few-shot learning
    
    Args:
        spotify_df: DataFrame containing your Spotify track data
        num_playlists: Number of playlists to generate titles for
        songs_per_playlist: Number of songs in each playlist
        temperature: Creativity level for title generation
        num_examples: Number of few-shot examples to include
        
    Returns:
        pd.DataFrame: Results containing generated playlist titles
    """
    print("Few-Shot Playlist Title Generation with Mistral Large")
    print("---------------------------------------------------")

    # Create sample playlists from data 
    playlists = []
    for i in range(num_playlists):
        if len(spotify_df) > songs_per_playlist:
            playlist_songs = spotify_df.sample(songs_per_playlist)
        else:
            playlist_songs = spotify_df
            
        playlists.append(playlist_songs)

    # Generate titles for each playlist
    results = []

    for i, playlist in enumerate(playlists):
        print(f"\nPlaylist {i+1}:")
        print(f"Sample tracks: {', '.join(playlist['track_name'].iloc[:3].tolist())}")
        
        # Create few-shot prompt
        prompt = create_few_shot_prompt(playlist, num_examples=num_examples)
        
        # Generate title
        title = generate_title_with_mistral_few_shot(prompt, temperature=temperature)
        
        # Create a track list for display
        track_list = [f"{song['track_name']} by {song['track_artist']}" 
                     for _, song in playlist.iterrows()]
        
        # Store result
        results.append({
            'playlist_id': i,
            'generated_title': title,
            'tracks': track_list,
            'num_tracks': len(playlist),
            'average_popularity': playlist['track_popularity'].mean() if 'track_popularity' in playlist.columns else None
        })
        
        print(f"Generated title: {title}")

    # Convert results to DataFrame for analysis
    results_df = pd.DataFrame(results)
    print("\nFew-shot generation results:")
    print(results_df[['playlist_id', 'generated_title', 'num_tracks', 'average_popularity']])
    
    return results_df

In [26]:

# Generate playlist titles with few-shot examples
few_shot_results = generate_playlist_titles_few_shot(
    spotify_df_cleaned,  
    num_playlists=3,     
    songs_per_playlist=5,  
    temperature=0.8, 
    num_examples=3 
)

# Display the results in a readable format
print("\nFull results with few-shot generated playlist titles:")
for i, row in few_shot_results.iterrows():
    print(f"\nPlaylist {row['playlist_id']+1}: \"{row['generated_title']}\"")
    print("Tracks:")
    for j, track in enumerate(row['tracks']):
        print(f"  {j+1}. {track}")


Few-Shot Playlist Title Generation with Mistral Large
---------------------------------------------------

Playlist 1:
Sample tracks: Castles, Man In The Box, I'M Not Sorry
Sending few-shot prompt to Mistral API...
Generated title: **Title: "Eclectic Echoes: Uncharted Vibes"**

This title captures the diverse and lesser-known nature of the tracks, combining them under an intriguing and adventurous theme.

Playlist 2:
Sample tracks: Just Like You, Promise - David Ezra Remix, Cryin'
Sending few-shot prompt to Mistral API...
Error calling Mistral API: 429 Client Error: Too Many Requests for url: https://api.mistral.ai/v1/chat/completions
Generated title: Error generating title: 429 Client Error: Too Many Requests for url: https://api.mistral.ai/v1/chat/completions

Playlist 3:
Sample tracks: Se Joga, Final Song, Rumors
Sending few-shot prompt to Mistral API...
Error calling Mistral API: 429 Client Error: Too Many Requests for url: https://api.mistral.ai/v1/chat/completions
Generated title

In [27]:
# Compare with zero-shot results
if 'results' in globals():
    print("\n=== Comparison: Zero-shot vs Few-shot ===")
    comparison = pd.DataFrame({
        'Playlist': range(1, min(len(results), len(few_shot_results))+1),
        'Zero-shot Title': results['generated_title'][:len(few_shot_results)],
        'Few-shot Title': few_shot_results['generated_title']
    })
    print(comparison)


=== Comparison: Zero-shot vs Few-shot ===
   Playlist                                    Zero-shot Title  \
0         1  **"Soulful Strides: Chilled Chasers Through Ti...   
1         2  **"Tropical Sunshine Remix: A Million Strings ...   
2         3  **"Echoes of Emotion: Beats & Ballads Through ...   

                                      Few-shot Title  
0  **Title: "Eclectic Echoes: Uncharted Vibes"**\...  
1  Error generating title: 429 Client Error: Too ...  
2  Error generating title: 429 Client Error: Too ...  
