In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import tqdm

from pathlib import Path

rand = random.Random(199998)

In [8]:
N_DATA_ROWS = 50000000
POSITIVE_TH = 0.8
N_USERS = 10000
N_TRACKS = 50000

In [9]:
def process_file(file: str | Path):
    df = pd.read_json(file, lines=True).sort_values(by='user')

    for i in range(len(df)):
        if df.at[i, 'time'] < POSITIVE_TH:
            continue

        if i > 0 and df.at[i-1, 'user'] == df.at[i, 'user']:
            yield [df.at[i, 'user'], df.at[i-1, 'track'], df.at[i-1, 'time'], df.at[i, 'track'], df.at[i, 'time']]
        else:
            yield [df.at[i, 'user'], N_TRACKS, 0.0, df.at[i, 'track'], df.at[i, 'time']]

In [10]:
data = []
data_files = list(Path('data_9GB').glob('*.json'))

while len(data) < N_DATA_ROWS and len(data_files) > 0:
    file = rand.choice(data_files)
    data_files.remove(file)

    print(f'Processing file {file}...')
    for u, py, pt, y, t in process_file(file):
        data.append((u, py, pt, y, t))
    print(f'{len(data)} rows')

Processing file data_9GB/botify_recommender_10_3.json...
108113 rows
Processing file data_9GB/botify_recommender_10_4.json...
215815 rows
Processing file data_9GB/botify_recommender_3_data.json...
323790 rows
Processing file data_9GB/botify_recommender_10_1.json...
432050 rows
Processing file data_9GB/botify_recommender_2_1.json...
538074 rows
Processing file data_9GB/botify_recommender_1_1.json...
645335 rows
Processing file data_9GB/botify_recommender_3_7.json...
753480 rows
Processing file data_9GB/botify_recommender_40_2.json...
857713 rows
Processing file data_9GB/botify_recommender_4_3.json...
963114 rows
Processing file data_9GB/botify_recommender_10_10.json...
1071299 rows
Processing file data_9GB/als_gcf_data2.json...
1074662 rows
Processing file data_9GB/botify_recommender_4_data.json...
1179978 rows
Processing file data_9GB/botify_recommender_20_8.json...
1288815 rows
Processing file data_9GB/botify_recommender_4_6.json...
1394354 rows
Processing file data_9GB/botify_recomme

In [12]:
data_by_user = {}

for u, py, pt, y, t in data:
    if u not in data_by_user:
        data_by_user[u] = set()
    data_by_user[u].add(y)

for u in data_by_user.keys():
    data_by_user[u] = list(data_by_user[u])

In [13]:
with open('recommendations_kiss.json', 'w') as file:
    for u in sorted(list(data_by_user.keys())):
        file.write(f'{{"user":{u},"tracks":[{",".join(map(str, data_by_user[u]))}]}}\n')