In [1]:
import argparse
import glob
import os
import json

from gensim.models import FastText
from gensim.models.callbacks import CallbackAny2Vec

from tqdm import tqdm



In [3]:
# Build one chronologically ordered track sequence per user from the raw
# JSON-lines log files matched by the glob below.
log_files = glob.glob('../../../logs/*/data.json*')

# user id -> {'tracks': [...], 'timestamps': [...]} (parallel lists, in read order)
user_sequences = {}

for log_path in tqdm(log_files, desc='Processing log files'):
    # Separate names for the path and the handle: the previous version rebound
    # the loop variable to the open file object.
    with open(log_path, 'r') as log_file:
        for line in log_file:
            line = line.strip()
            if not line:
                # json.loads('') raises JSONDecodeError; tolerate blank lines.
                continue

            event = json.loads(line)

            # Drop events whose 'time' field is below 0.9.
            # NOTE(review): presumably 'time' is the listened fraction of the
            # track — confirm against the log producer.
            if event['time'] < 0.9:
                continue

            history = user_sequences.setdefault(
                event['user'], {'tracks': [], 'timestamps': []}
            )
            history['tracks'].append(event['track'])
            history['timestamps'].append(event['timestamp'])

# One "sentence" per user: track ids ordered by (timestamp, track) and
# stringified so they can be used as tokens by the embedding model.
sequences = [
    [str(track) for _, track in sorted(zip(data['timestamps'], data['tracks']))]
    for data in user_sequences.values()
]

Processing log files: 100%|██████████| 44/44 [00:48<00:00,  1.11s/it]


In [9]:
class TrainingProgressBar(CallbackAny2Vec):
    """Training callback that mirrors epoch progress onto a tqdm bar."""

    def __init__(self, total_epochs):
        """Create the 'Training' bar with ``total_epochs`` as its total.

        The bar is opened immediately on construction, not when training
        actually starts.
        """
        self.epoch = 0
        self.pbar = tqdm(desc='Training', total=total_epochs)

    def on_epoch_end(self, model):
        """Record one more finished epoch and advance the bar by one step."""
        self.epoch = self.epoch + 1
        self.pbar.update(1)

    def on_train_end(self, model):
        """Close the progress bar at the end of training."""
        self.pbar.close()

# Single source of truth for the epoch count, so the progress bar total
# cannot drift from the number of epochs actually trained.
EPOCHS = 25

# Train skip-gram (sg=1) FastText embeddings over the per-user track sequences.
# Passing `sentences` to the constructor builds the vocabulary and trains
# immediately, so this statement blocks until training finishes.
model = FastText(
    sentences=sequences,
    vector_size=100,
    window=10,
    min_count=5,   # tokens seen fewer than 5 times are dropped from the vocabulary
    workers=8,
    epochs=EPOCHS,
    sg=1,
    # NOTE(review): 0.001 is well below gensim's default starting learning
    # rate — confirm this is intentional.
    alpha=0.001,
    # NOTE(review): character n-grams of length 3-6 are taken over the
    # stringified track ids — confirm subword sharing between ids is desired.
    min_n=3,
    max_n=6,
    callbacks=[TrainingProgressBar(total_epochs=EPOCHS)]
)

# Persist only the keyed vectors (model.wv), not the full trainable model.
model.wv.save('userbody.ft')


[A

KeyboardInterrupt: 