<a href="https://colab.research.google.com/github/anniezhang2288/python_notebooks/blob/main/semantically_similar_taylor_swift_lyric_meanings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125923 sha256=f1871364c35a9142c4b7111a21ce2f000d56b6365bacd85555aed825ac6e898b
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence-tra

In [24]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util

# Function to find top 3 semantically similar songs
def find_top_songs(input_phrase, csv_file, top_n=3):
    # Load the dataset with specified encoding
    try:
        df = pd.read_csv(csv_file, encoding='utf-8')
    except UnicodeDecodeError:
        df = pd.read_csv(csv_file, encoding='ISO-8859-1')  # Alternative encoding

    # Pre-trained model for embeddings
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Prepare a dictionary to hold similarity scores
    song_similarity = {}

    # Group by song
    grouped = df.groupby(['track_title', 'album', 'artist'])

    for name, group in grouped:
        # Combine lines into one string per song
        lyrics = ' '.join(group['lyric'].tolist())

        # Encode lyrics and the input phrase
        lyrics_embedding = model.encode(lyrics, convert_to_tensor=True)
        input_embedding = model.encode(input_phrase, convert_to_tensor=True)

        # Calculate similarity
        similarity = util.pytorch_cos_sim(input_embedding, lyrics_embedding)

        # Store the score with song details
        song_similarity[name] = similarity.item()

    # Sort songs by similarity score
    sorted_songs = sorted(song_similarity.items(), key=lambda x: x[1], reverse=True)

    # Return top N songs
    return sorted_songs[:top_n]

# Example usage
input_phrase = "happy"
csv_file = '/content/taylor_swift_lyrics (1).csv'
print("Top 3 songs related to " + input_phrase + ":")
top_songs = find_top_songs(input_phrase, csv_file)
for song, score in top_songs:
    print(f"{song[0]} by {song[2]} from the album '{song[1]}': {score}")


Top 3 songs related to happy:
Back To December by Taylor Swift from the album 'Speak Now': 0.22548998892307281
Cold as You by Taylor Swift from the album 'Taylor Swift': 0.18939611315727234
Stay Stay Stay by Taylor Swift from the album 'Red': 0.17992272973060608


In [23]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util

# Function to find top 3 semantically similar songs and their unique top 3 lines with song line numbers
def find_top_songs_and_unique_lyrics(input_phrase, csv_file, top_n=3):
    # Load the dataset with specified encoding
    try:
        df = pd.read_csv(csv_file, encoding='utf-8')
    except UnicodeDecodeError:
        df = pd.read_csv(csv_file, encoding='ISO-8859-1')  # Alternative encoding

    # Pre-trained model for embeddings
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Prepare a dictionary to hold similarity scores for songs
    song_similarity = {}

    # Group by song
    grouped = df.groupby(['track_title', 'album', 'artist'])

    for name, group in grouped:
        # Reset index for easier row access
        group = group.reset_index(drop=True)

        # Combine lines into one string per song
        lyrics = ' '.join(group['lyric'].tolist())

        # Encode lyrics and the input phrase
        lyrics_embedding = model.encode(lyrics, convert_to_tensor=True)
        input_embedding = model.encode(input_phrase, convert_to_tensor=True)

        # Calculate similarity for the whole song
        song_similarity_score = util.pytorch_cos_sim(input_embedding, lyrics_embedding).item()

        # Calculate similarity for each line
        line_embeddings = model.encode(group['lyric'].tolist(), convert_to_tensor=True)
        line_similarities = util.pytorch_cos_sim(input_embedding, line_embeddings).flatten()

        # Find unique top 3 lines with song line numbers
        top_lines = []
        top_lines_scores = []
        top_line_numbers = []
        line_number = 1  # Initialize line number counter
        for index in line_similarities.argsort(descending=True).tolist():
            if len(top_lines) < 3:
                line = group['lyric'].iloc[index]
                if line not in top_lines:
                    top_lines.append(line)
                    top_lines_scores.append(line_similarities[index].item())
                    top_line_numbers.append(line_number)
                line_number += 1  # Increment line number
            if len(top_lines) == 3:
                break

        # Store the score with song details, top lines, and line numbers
        song_similarity[name] = (song_similarity_score, top_lines, top_lines_scores, top_line_numbers)

    # Sort songs by similarity score
    sorted_songs = sorted(song_similarity.items(), key=lambda x: x[1][0], reverse=True)

    # Return top N songs and their top lines with song line numbers
    return sorted_songs[:top_n]

# Example usage
input_phrase = "happy"
csv_file = '/content/taylor_swift_lyrics (1).csv'
print("Top 3 songs related to '" + input_phrase + "':")
top_songs = find_top_songs_and_unique_lyrics(input_phrase, csv_file)
for song, (score, lines, line_scores, line_numbers) in top_songs:
    print(f"{song[0]} by {song[2]} from the album '{song[1]}': {score}")
    print("  Top 3 unique lines:")
    for line, line_score, line_number in zip(lines, line_scores, line_numbers):
        print(f"    Line {line_number}: {line} (Score: {line_score})")


Top 3 songs related to 'happy':
Back To December by Taylor Swift from the album 'Speak Now': 0.22548998892307281
  Top 3 unique lines:
    Line 1: I miss your tan skin, your sweet smile, so good to me, so right (Score: 0.3266279101371765)
    Line 2: Wishing I'd realized what I had when you were mine (Score: 0.28460580110549927)
    Line 5: Realized I loved you in the fall (Score: 0.28333982825279236)
Cold as You by Taylor Swift from the album 'Taylor Swift': 0.18939611315727234
  Top 3 unique lines:
    Line 1: Every smile you fake is so condescending (Score: 0.31048160791397095)
    Line 2: Of a mess of a dreamer with the nerve to adore you (Score: 0.2875947654247284)
    Line 3: And I stood there loving you and wished them all away (Score: 0.27248454093933105)
Stay Stay Stay by Taylor Swift from the album 'Red': 0.17992272973060608
  Top 3 unique lines:
    Line 1: I've been loving you for quite some time, time, time (Score: 0.2907577157020569)
    Line 5: I'd like to hang out with 