In [1]:
# Idea: split each song into lines (chunks), embed every chunk, score the chunks against a query phrase by cosine similarity, and keep the top X%

In [14]:
import sqlite3
import pandas as pd
import numpy as np
import heapq
import re
from sentence_transformers import SentenceTransformer

In [3]:
# Load the sentence-embedding model by name; it is used below to encode both
# the lyric lines and the query phrase.
model = SentenceTransformer("all-MiniLM-L6-v2")

# connecting to db:

In [4]:
# Open the SQLite database file (the path is relative to the working directory).
conn = sqlite3.connect(r"songs.db")
cursor = conn.cursor()

# NOTE(review): the rows produced by this execute() are not fetched in the visible
# cells, and the same SELECT is issued again through pandas in the next cell —
# confirm whether this call is still needed.
query = "SELECT * FROM songs"
cursor.execute(query)



<sqlite3.Cursor at 0x22cbd54db40>

In [5]:
# Pull the entire songs table into a DataFrame via the open connection.
query = "SELECT * FROM songs"
df = pd.read_sql_query(sql=query, con=conn)

# preprocessing lyrics:

In [6]:
def chunking(lyrics):
    """Split raw scraped lyrics into a list of cleaned, non-empty lines.

    Cleaning steps, in order:
      1. remove the leading "...Contributors...[...]" header artifact,
      2. remove bracketed song-structure markers such as "[Verse 1]",
      3. split on line breaks and drop blank / whitespace-only lines,
      4. remove a trailing "<digits>Embed" footer from the last line, and
         drop that line if nothing is left of it.

    Parameters
    ----------
    lyrics : str
        Raw lyrics text for one song.

    Returns
    -------
    list of str
        The remaining lyric lines. An empty list is returned when the input
        is not a string (e.g. a missing value) or contains no lyric lines.
    """
    # Missing lyrics yield no chunks instead of a TypeError inside re.sub.
    if not isinstance(lyrics, str):
        return []

    # remove start of song artifacts
    stripped = re.sub(r'.*?Contributors.*?\[.*?\]', '', lyrics)
    # remove song structure comments
    stripped = re.sub(r'\[.*?\]', '', stripped)
    # split into lines and discard blank ones
    lines = [line for line in stripped.splitlines() if line.strip()]

    # Nothing left to clean: avoid indexing into an empty list.
    if not lines:
        return []

    # remove end of song artifacts
    lines[-1] = re.sub(r'\d+Embed$', "", lines[-1])
    # The footer may have been the whole last line; do not return an empty chunk.
    if not lines[-1].strip():
        lines.pop()
    return lines


In [7]:
# Take the lyrics of the song stored under row label 6 as a worked example
# and break them into cleaned lines.
lyrics = df.loc[6, "lyrics"]
chunks = chunking(lyrics)
chunks

['What?',
 'Well fuck you...bitch',
 'Now can you bounce wit me, wit me, wit me wit me',
 'Can you bounce wit me, bounce wit me, wit me wit me',
 'Can you bounce wit me, bounce wit me, ge-gi-gi-gi-gi-gi',
 'Can you bounce wit me, bounce wit me, ye-ye-yeah',
 'Uh-huh uh-huh bounce wit me, bounce wit me',
 'Can ya can ya can ya bounce wit me, bounce wit me',
 'Ya-yah-yah, ya-ya-yah-yeah bounce wit me, bounce wit me',
 'Ge-gi, ge-gi-gi-gi-geyeah bounce wit me, bounce wit me',
 'Get it!',
 "Can I hit in the mornin' without givin' you half of my dough",
 'And even worse, if I was broke would you want me',
 "If I couldn't get you finer things like all of them diamond rings",
 'Bitches kill for would you still roll',
 "If we couldn't see the sun risin' off the shore of Thailand",
 "Would you ride then, if I wasn't driving",
 "If I wasn't a eight-figure nigga by the name of Jigga",
 'Would you come around me or would you clown me',
 "If I couldn't flow futuristic, would ya",
 'Put your two lip

# embedding the lyric lines:

In [8]:
# Encode every lyric line into a vector; normalize_embeddings=True asks the
# model to return normalized vectors (see the sentence-transformers docs).
embeddings = model.encode(chunks, normalize_embeddings=True)

In [9]:
def cosine_similarity(x, y):
    """Return the cosine similarity between two 1-D vectors.

    Parameters
    ----------
    x, y : array-like
        Vectors of equal length. Lists and tuples are accepted as well as
        NumPy arrays.

    Returns
    -------
    float or None
        dot(x, y) / (|x| * |y|). ``None`` is returned when the vectors differ
        in length, and ``0.0`` when either vector has zero magnitude (the
        quotient would otherwise be 0/0).
    """
    # Accept plain Python sequences too; ndarray inputs pass through unchanged.
    x = np.asarray(x)
    y = np.asarray(y)

    # Ensure length of x and y are the same
    if len(x) != len(y):
        return None

    # Product of the L2 norms (magnitudes) of x and y
    norm_product = np.sqrt(np.sum(x**2)) * np.sqrt(np.sum(y**2))

    # A zero vector has no direction; report "no similarity" instead of NaN.
    if norm_product == 0:
        return 0.0

    return np.dot(x, y) / norm_product


In [10]:
input_phrase = "ambitious breadwinner"
# Embed the query phrase with the same model used for the lyric lines.
input_vector = model.encode(input_phrase)
# One similarity score per lyric line, in the same order as `embeddings`.
similarities = [
    cosine_similarity(input_vector, line_vector) for line_vector in embeddings
]
    

# find most similar line:

In [11]:
# Position of the highest score (the first one if several lines tie).
max_similarity_index = max(range(len(similarities)), key=similarities.__getitem__)
# The lyric line sitting at that position.
most_similar_line = chunks[max_similarity_index]

print("Most similar line:", most_similar_line)
print("Cosine similarity:", similarities[max_similarity_index])

Most similar line: Ambition makes me so horny
Cosine similarity: 0.42427036


# top n results:

In [15]:
top_n = 3
# Positions of the top_n highest-scoring lines, best first.
top_indices = heapq.nlargest(
    top_n,
    range(len(similarities)),
    key=lambda position: similarities[position],
)
top_lines = [chunks[position] for position in top_indices]
top_scores = (similarities[position] for position in top_indices)
for line, sim in zip(top_lines, top_scores):
    print(f"Similar line: {line}, Cosine similarity: {sim}")

Similar line: Ambition makes me so horny, Cosine similarity: 0.424270361661911
Similar line: Can you afford me? My niggas breadwinners, never corny, Cosine similarity: 0.3952316343784332
Similar line: Who don't love hoes, they get no dough, Cosine similarity: 0.32037052512168884


In [16]:
# Show the positions (indices into `chunks`) of the top-ranked lines.
top_indices

[52, 51, 30]