In [53]:
from glob import glob
from tqdm import tqdm
import pandas as pd

In [28]:
# Locate the per-song description and tag text files of the KGRec-music dataset.
desc_paths = glob("KGRec-dataset/KGRec-music/descriptions/*.txt")
tag_paths = glob("KGRec-dataset/KGRec-music/tags/*.txt")

In [29]:
# Report how many songs have a tag file versus a description file.
for label, paths in (("Tags:", tag_paths), ("Descriptions:", desc_paths)):
    print(label, len(paths))

Tags: 8239
Descriptions: 8640


In [39]:
# Number of songs in KGRec-music; matches the description-file count printed above.
NUM_SONGS = 8640
# File names are "<id>.txt"; ids are assumed to run contiguously from 1 to NUM_SONGS.
song_ids = [f"{song_number}.txt" for song_number in range(1, NUM_SONGS + 1)]

In [40]:
# Confirm how many file names were generated.
len(song_ids)

8640

In [41]:
# Accumulates every tag token read from the tag files (duplicates included).
all_tags = []

In [42]:
# Start from an empty list so re-running this cell does not append duplicates.
all_tags = []
for tag in tag_paths:
    # Explicit encoding: the platform default is not guaranteed to be UTF-8.
    # NOTE(review): assumes the dataset files are UTF-8 encoded — confirm.
    with open(tag, 'r', encoding="utf-8") as f:
        lines = f.readlines()
    # Only files holding exactly one line contribute tokens; empty or
    # multi-line files are skipped.
    if len(lines) == 1:
        all_tags.extend(lines[0].split(" "))

In [43]:
# Number of distinct space-separated tokens collected from the tag files.
len(set(all_tags))

50956

In [72]:
# One dict per song ("id", "tags", "description"), filled by the loop in the next cell.
songs_metadata = []

In [73]:
# Rebuild the list from scratch so re-running this cell never duplicates rows.
songs_metadata = []
for song_id in tqdm(song_ids):
    tag_path = f"KGRec-dataset/KGRec-music/tags/{song_id}"
    desc_path = f"KGRec-dataset/KGRec-music/descriptions/{song_id}"
    try:
        # NOTE(review): assumes the dataset files are UTF-8 encoded — confirm.
        with open(tag_path, 'r', encoding="utf-8") as f:
            # Hyphens are replaced by spaces (presumably so hyphen-joined tags
            # split into separate words when indexed).
            tags = f.read().replace("-", " ")
    except FileNotFoundError:
        # There are fewer tag files than description files, so a missing tag
        # file is expected and maps to an empty tag string. Any other error
        # (permissions, decoding, Ctrl-C) is allowed to surface.
        tags = ""
    with open(desc_path, 'r', encoding="utf-8") as f:
        desc = f.read()
    songs_metadata.append({
        "id": song_id.split(".")[0],  # "123.txt" -> "123"
        "tags": tags,
        "description": desc
    })

100%|██████████| 8640/8640 [00:00<00:00, 38230.28it/s]


In [74]:
# Expect one record per entry in song_ids.
len(songs_metadata)

8640

In [75]:
# Inspect the first record to check the parsed fields.
songs_metadata[0]

{'id': '1',
 'tags': 'mathematical songs with individual pictures indie Experimental Rock sleepy pop 00s Good Stuff alternative Progressive rock shit change zodiac signs Mellow 60s indie rock missing someone time trippy warm downtempo psychedelic cool karen o Love Psychedelic Rock ear sex american american i like rock Top1000 2009 experimental',
 'description': "This ultra experimental number features some unusual contributions from German mathematician Dr. Thorsten Wörman and Yeah Yeah Yeahs vocalist , Karen O. Frontman Wayne Coyne explained : `` Features Thorsten Wörmann doing sound bites explaining some fundamental mathematical equations involving polynomial rings , but also featuring Karen O making clicking gunshot sounds . ''\nThis is one of five songs on Embryonic that are named after zodiac signs .\n"}

In [76]:
# Tabulate the records and promote "id" from a regular column to the row index.
songs_df = pd.DataFrame(songs_metadata).set_index("id")

In [77]:
# Preview the table; the rendered output is truncated to the first and last rows.
songs_df

Unnamed: 0_level_0,tags,description
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,mathematical songs with individual pictures in...,This ultra experimental number features some u...
2,my favorite country new country great song,The third single from Country music artist Cra...
3,beautiful indie sweet pop indie pop violin alt...,Frontman Charlie Fink 's lyrics for Last Night...
4,fucked up shawty awwwwwwwww yeah drake fig dat...,"The sequel to Drake 's Thank Me Later track , ..."
5,Iron Maiden rockhard classic british heavy met...,The `` Black Rain '' is a metaphor for oil .\n...
...,...,...
8636,,Florida Metalcore band Trivium introduce their...
8637,louder is never enough power indie rock indie ...,Frontman Kele Okerke has spoken of how he list...
8638,indie pretty fucking awesome pop female vocals...,"This dynamic track might sound upbeat , but ly..."
8639,christian metal beautiful christian rock xynod...,Vocalist Ryan Clark on bandsonfire.com about t...


In [78]:
from searcharray import SearchArray

In [79]:
# Index the tag text with SearchArray; fts() scores queries against this column.
songs_df['tags_tokenized'] = SearchArray.index(songs_df['tags'])

2024-06-15 10:17:25,834 - searcharray.indexing - INFO - Indexing begins
2024-06-15 10:17:25,837 - searcharray.indexing - INFO - 0 Batch Start tokenization
2024-06-15 10:17:25,837 - searcharray.indexing - INFO - Tokenizing 8640 documents
2024-06-15 10:17:26,101 - searcharray.indexing - INFO - Tokenization -- vstacking
2024-06-15 10:17:26,169 - searcharray.indexing - INFO - Tokenization -- DONE
2024-06-15 10:17:26,175 - searcharray.indexing - INFO - Inverting docs->terms
2024-06-15 10:17:26,235 - searcharray.indexing - INFO - Encoding positions to bit array
2024-06-15 10:17:26,281 - searcharray.indexing - INFO - Indexing from tokenization complete


In [80]:
# Index the description text the same way and store it in its own column.
songs_df['description_tokenized'] = SearchArray.index(songs_df['description'])

2024-06-15 10:17:43,455 - searcharray.indexing - INFO - Indexing begins
2024-06-15 10:17:43,458 - searcharray.indexing - INFO - 0 Batch Start tokenization
2024-06-15 10:17:43,459 - searcharray.indexing - INFO - Tokenizing 8640 documents
2024-06-15 10:17:44,198 - searcharray.indexing - INFO - Tokenization -- vstacking
2024-06-15 10:17:44,271 - searcharray.indexing - INFO - Tokenization -- DONE
2024-06-15 10:17:44,286 - searcharray.indexing - INFO - Inverting docs->terms
2024-06-15 10:17:44,506 - searcharray.indexing - INFO - Encoding positions to bit array
2024-06-15 10:17:44,616 - searcharray.indexing - INFO - Indexing from tokenization complete


In [114]:
def fts(query, column="tags", top_k=15):
    """Full-text search over the indexed song text in ``songs_df``.

    Parameters
    ----------
    query : str
        Search text. It is split on whitespace and the resulting terms are
        passed to the index's ``score`` method as a single list.
    column : str, default "tags"
        Which text field to search; the index is read from
        ``songs_df[f"{column}_tokenized"]``.
    top_k : int, default 15
        Maximum number of hits to return.

    Returns
    -------
    pandas.DataFrame or str
        The ``tags`` and ``description`` columns of the matching songs,
        highest score first, or the string ``"No results found"`` when no
        song scores above zero (or the query contains no terms).

    Raises
    ------
    ValueError
        If ``songs_df`` has no ``"<column>_tokenized"`` column.
    """
    index_column = f"{column}_tokenized"
    if index_column not in songs_df.columns:
        raise ValueError(f"No search index for column {column!r}")

    # split() without an argument drops the empty strings that repeated or
    # surrounding spaces would otherwise produce.
    terms = query.split()
    if not terms:
        return "No results found"

    scores = songs_df[index_column].array.score(terms)
    # Attach scores to the two display columns only, so the index columns
    # are not copied on every query.
    results = songs_df[["tags", "description"]].assign(score=scores)
    # Keep real hits only; zero-score rows must not pad out the top_k.
    hits = results[results["score"] > 0].sort_values("score", ascending=False)
    if hits.empty:
        return "No results found"
    return hits.head(top_k)[["tags", "description"]]

In [122]:
# Example query; scored against the tag index.
fts("death metal")

Unnamed: 0_level_0,tags,description
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1495,aggressive depression Obituary Extreme Metal a...,This is the title track from Obituary 's secon...
8540,no sympathy for fools Blackened Death Metal Ex...,This is Nergal 's testimony of his hatred for ...
55,metalcore metal i had to change my pants after...,This song is about a virgin woman dying while ...
6434,groove metal Enslaved death metal Soulfly 7 of...,"This is a track from Enslaved , the eighth stu..."
878,groovy 5 of 10 stars grindcore atmospheric dea...,Ben Savage of Whitechapel spoke to us about a ...
7092,death metal Awesome metal Metal Core Slipknot ...,"This is a rewritten , re-recorded version of a..."
4546,heavy groovy atmospheric death metal brutal de...,This is the third single from Baptized In Filt...
8329,groove metal USA death metal Awesome metal thr...,Guitarist Rob Arnold told Kerrang !\nmagazine ...
3722,groovy grindcore atmospheric power death metal...,`` Reprogrammed to Hate '' is a great example ...
4412,viking metal swedish October Rust Melodic Deat...,"This song is about Hel , the name of the godde..."
