In [131]:
# Read all tracks

import pandas as pd

# Load the track catalogue (one JSON record per line) and index it by track id.
tracks_df = (
    pd.read_json("tracks.json", lines=True)
      .set_index('track')
)

In [132]:
# Capture sessions

import pandas as pd
import glob

# Stack every per-run event log (one JSON record per line) into a single frame.
df = pd.concat(
    pd.read_json(log_path, lines=True)
    for log_path in glob.glob("../rec_sys_data/*/data.json")
)

# Order each user's events chronologically; the session logic below depends on
# rows being sorted by (user, ts).
df = (
    df.assign(ts=pd.to_datetime(df['timestamp'], unit='ms'))
      .sort_values(['user', 'ts'])
      .reset_index(drop=True)
)

# A row starts a new session when the same user's previous message was 'last'.
# The first row of every user has no predecessor and is therefore False.
previous_message = df.groupby('user')['message'].shift()
df['prev_was_last'] = previous_message.eq('last').fillna(False)

# Running count of session starts per user -> 0-based session number.
session_starts = df.groupby('user')['prev_was_last'].cumsum()
df['session_id'] = session_starts.astype(int)

In [133]:
# Inspect the combined event log with the derived ts / prev_was_last / session_id columns.
df

Unnamed: 0,message,timestamp,user,track,time,latency,recommendation,experiments,ts,prev_was_last,session_id
0,next,2025-05-11 19:33:01.707,1,11543,1.00,0.029562,36336.0,{'HW2': 'C'},2025-05-11 19:33:01.707,False,0
1,next,2025-05-11 19:33:01.751,1,36336,0.64,0.034388,802.0,{'HW2': 'C'},2025-05-11 19:33:01.751,False,0
2,next,2025-05-11 19:33:01.773,1,802,0.63,0.011737,13248.0,{'HW2': 'C'},2025-05-11 19:33:01.773,False,0
3,next,2025-05-11 19:33:01.810,1,13248,0.50,0.014285,9184.0,{'HW2': 'C'},2025-05-11 19:33:01.810,False,0
4,next,2025-05-11 19:33:01.871,1,9184,0.40,0.040028,10986.0,{'HW2': 'C'},2025-05-11 19:33:01.871,False,0
...,...,...,...,...,...,...,...,...,...,...,...
414660,next,2025-05-11 19:27:23.908,9999,3637,0.00,0.001199,3637.0,{'HW2': 'C'},2025-05-11 19:27:23.908,False,1
414661,next,2025-05-11 19:27:23.916,9999,3637,0.00,0.000869,3637.0,{'HW2': 'C'},2025-05-11 19:27:23.916,False,1
414662,next,2025-05-11 19:27:23.934,9999,3637,0.00,0.013034,3637.0,{'HW2': 'C'},2025-05-11 19:27:23.934,False,1
414663,next,2025-05-11 19:27:23.952,9999,3637,0.00,0.006829,3637.0,{'HW2': 'C'},2025-05-11 19:27:23.952,False,1


In [134]:
# Encode track labels in descending order of the max listened time, for consistency.

# Lazy groupby handle; not used in this cell (the name `sessions` is rebound
# later when the training corpus is built).
sessions = df.groupby(['user', 'session_id'])

# Per-track maximum of every numeric column. The flag is passed by keyword:
# the previous `max('time')` did not select the `time` column, it bound the
# string 'time' to `numeric_only` and only worked because the string is truthy.
max_listened = df.groupby('track').max(numeric_only=True)

# Left-join the per-track maximum of `time` onto the catalogue through its
# `track` index; tracks that never appear in the event log get NaN.
tracks_ml_df = tracks_df.join(
    max_listened[['time']].rename(columns={'time': 'max_listened'}),
    on='track',
)
# NaN values sort last, so tracks without events receive the highest labels.
tracks_ml_df = tracks_ml_df.sort_values('max_listened', ascending=False)
tracks_ml_df.insert(0, 'track_label', range(len(tracks_ml_df)))

tracks_ml_df

Unnamed: 0_level_0,track_label,artist,album,title,genre,pop,duration,max_listened
track,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
41164,0,Михаил Бублик,ART-Обстрел I-часть,Сорок тысяч верст,"[1, 47]",-0.500252,282,1.0
13606,1,Bakhtin,Дом,Дом,[10],-0.667521,172,1.0
118,2,Трофимова Елизавета,Номер,Номер,[1],-1.039376,145,1.0
40800,3,Ночные Снайперы,выживут только влюбленные,история,[2],-0.096236,219,1.0
4943,4,Baga,7:00,7:00,[1],-1.094547,183,1.0
...,...,...,...,...,...,...,...,...
25938,49995,H.A.Z.E,Finding You,Finding You (Radio Edit),[36],-0.998239,203,
43840,49996,Олег Алябин,Я живу лишь тобой,Я живу лишь тобой,[1],-0.213198,209,
48072,49997,Группа Виктор,Звезда по имени Солнце,Звезда по имени Солнце,[2],-0.152822,197,
43028,49998,Sister Sin,Black Lotus,Desert Queen,[12],-0.872107,324,


In [142]:
# Build one "sentence" per (user, session): the ids, as strings, of the tracks
# whose `time` is above 0.5, in row order.
# Alternative row filter kept for reference:
#     df['experiments'].apply(lambda x: x['HW2'] == 'T1')
listened = df[df['time'] > 0.5]
tracks_per_session = (
    listened
    .groupby(['user', 'session_id'])['track']
    .apply(lambda ids: [str(track_id) for track_id in ids])
)

# Keep only sessions that contain at least two tracks.
sessions = [s for s in tracks_per_session.to_numpy() if len(s) > 1]
len(sessions)

35580

In [143]:
from gensim.models import Word2Vec, KeyedVectors
import pickle as pkl
from gensim.models.callbacks import CallbackAny2Vec
import pathlib

class Saver(CallbackAny2Vec):
    """Training callback that writes a checkpoint to disk after every epoch.

    Args:
        prefix: Directory that receives the checkpoints. It is created if
            missing, including any missing parent directories.
        keep_kv_only: If True, only the vectors (``model.wv``) are saved, as
            ``epochNN.kv``. Otherwise the whole model is saved as
            ``epochNN.model``.
    """

    def __init__(self, prefix="song2vec_ckpt", keep_kv_only=True):
        self.prefix = pathlib.Path(prefix)
        self.keep_kv_only = keep_kv_only
        self.epoch = 0  # epochs finished since this callback was created
        # parents=True so that a nested prefix (e.g. "runs/v2/ckpt") also works.
        self.prefix.mkdir(parents=True, exist_ok=True)

    def on_epoch_end(self, model):
        """Save a checkpoint for the epoch that just ended and advance the counter."""
        fname = self.prefix / f"epoch{self.epoch:02d}.model"
        if self.keep_kv_only:
            # Report the file that is actually written (.kv), not the .model name.
            fname = fname.with_suffix('.kv')
            model.wv.save(str(fname))
        else:
            model.save(str(fname))  # full model, not just the vectors
        print(f"\u2705  saved checkpoint: {fname}")
        self.epoch += 1

import os

# One checkpoint callback shared by both branches below.
# NOTE(review): the callback numbers epochs from 0, so a resumed run rewrites
# epoch00, epoch01, ... inside the same checkpoint folder.
checkpoint_saver = Saver(prefix="song2vec_ckpt_v2", keep_kv_only=False)

if os.path.isfile("w2v.model"):
    # Resume: train the previously saved model for 25 more epochs.
    w2v = Word2Vec.load('w2v.model')
    # total_examples must describe the corpus handed to train(). The stored
    # w2v.corpus_count reflects the corpus the model was built on, which may
    # differ from the current `sessions`, so count the current corpus instead.
    # NOTE(review): tracks missing from the loaded vocabulary presumably get no
    # vector from this call; run build_vocab(sessions, update=True) first if new
    # tracks must be learned. Confirm against the gensim docs.
    w2v.train(sessions, callbacks=[checkpoint_saver],
              total_examples=len(sessions), epochs=25)
else:
    # Fresh model trained on the session corpus.
    w2v = Word2Vec(sessions, vector_size=128, sg=1,
                   window=10, negative=10, epochs=50,
                   min_count=5, sample=1e-3, workers=12,
                   callbacks=[checkpoint_saver])

# Persist both the full model (needed to resume training) and the bare vectors.
w2v.save('w2v.model')
w2v.wv.save('w2v.kv')
print(w2v.wv)



✅  saved checkpoint: song2vec_ckpt_v2/epoch00.model
✅  saved checkpoint: song2vec_ckpt_v2/epoch01.model
✅  saved checkpoint: song2vec_ckpt_v2/epoch02.model
✅  saved checkpoint: song2vec_ckpt_v2/epoch03.model
✅  saved checkpoint: song2vec_ckpt_v2/epoch04.model
✅  saved checkpoint: song2vec_ckpt_v2/epoch05.model
✅  saved checkpoint: song2vec_ckpt_v2/epoch06.model
✅  saved checkpoint: song2vec_ckpt_v2/epoch07.model
✅  saved checkpoint: song2vec_ckpt_v2/epoch08.model
✅  saved checkpoint: song2vec_ckpt_v2/epoch09.model
✅  saved checkpoint: song2vec_ckpt_v2/epoch10.model
✅  saved checkpoint: song2vec_ckpt_v2/epoch11.model
✅  saved checkpoint: song2vec_ckpt_v2/epoch12.model
✅  saved checkpoint: song2vec_ckpt_v2/epoch13.model
✅  saved checkpoint: song2vec_ckpt_v2/epoch14.model
✅  saved checkpoint: song2vec_ckpt_v2/epoch15.model
✅  saved checkpoint: song2vec_ckpt_v2/epoch16.model
✅  saved checkpoint: song2vec_ckpt_v2/epoch17.model
✅  saved checkpoint: song2vec_ckpt_v2/epoch18.model
✅  saved che

In [None]:
from gensim.models import KeyedVectors

# Load the saved vectors from disk (the file written by `w2v.wv.save('w2v.kv')`).
kv = KeyedVectors.load('w2v.kv')

def show(p, tid):
    """Print one line describing a track: a label, the track id, artist and title.

    Args:
        p: Label printed first, right-aligned to 20 characters.
        tid: Track id, either an int or a numeric string (such as a key of ``kv``).
    """
    # Coerce to int for the lookup, as show_similar does, so that string ids
    # taken from the vector store work as well as plain ints.
    track = tracks_ml_df.loc[int(tid)]
    print(f"{p:>20} {tid:>4} {track['artist']:<20} {track['title']}")

def show_similar(tid, topn=10):
    """Print the `topn` tracks that `kv.most_similar` returns for track `tid`.

    Args:
        tid: Track id (int or numeric string). It is looked up as an int in
            `tracks_ml_df` and as a string key in `kv`.
        topn: Number of neighbours to print.
    """
    query_track = tracks_ml_df.loc[int(tid)]
    print(f"Similar to {query_track['artist']} - {query_track['title']}:")
    for key, score in kv.most_similar(positive=[str(tid)], topn=topn):
        neighbour = tracks_ml_df.loc[int(key)]
        print(f"[{score}] {key}: {neighbour['artist']} - {neighbour['title']}")

# Example query. To list every track id that has a vector: list(kv.key_to_index.keys())
show_similar(34403)

Similar to Boney M. - Sunny (Live):
[0.8901371359825134] 34408: Boney M. - Ma Baker (Live)
[0.8881460428237915] 34407: Boney M. - Gotta Go Home (Live)
[0.8832859992980957] 34406: Boney M. - Belfast (Live)
[0.883190393447876] 42921: Arabesque - Born To Reggae
[0.882602334022522] 34409: Boney M. - Hooray, Hooray, It's a Holi-Holiday (Live)
[0.8715381622314453] 32192: Revoльvers - Яблоки на снегу
[0.8642311096191406] 26557: Моя Мишель - Стюардесс
[0.8581438064575195] 34404: Boney M. - Brown Girl in the Ring (Live)
[0.7969501614570618] 23027: MEDUZA - Paradise
[0.7943751215934753] 44452: Fr David - Pick Up the Phone
