In [1]:
import pandas as pd
from tqdm import tqdm
from json import loads, dumps

In [2]:
# Directory that holds every competition parquet file; change it here if the
# data is moved instead of editing each read below.
DATA_DIR = "../../datagame-2023"


def _read_table(name):
    """Read ``<DATA_DIR>/<name>.parquet`` into a DataFrame."""
    return pd.read_parquet(f"{DATA_DIR}/{name}.parquet")


# Listening logs.
# columns: index, session_id, song_id, unix_played_at, play_status, login_type, listening_order
train_source = _read_table("label_train_source")
train_target = _read_table("label_train_target")
test_source = _read_table("label_test_source")

# Song metadata.
# columns: index, song_id, artist_id, song_length, album_id, language_id, album_month
meta_song = _read_table("meta_song")
# columns: index, song_id, composer_id
meta_song_composer = _read_table("meta_song_composer")
# columns: index, song_id, genre_id
meta_song_genre = _read_table("meta_song_genre")
# columns: index, song_id, lyricist_id
meta_song_lyricist = _read_table("meta_song_lyricist")
# columns: index, song_id, producer_id
meta_song_producer = _read_table("meta_song_producer")
# columns: index, song_id, title_text_id
meta_song_titletext = _read_table("meta_song_titletext")

In [3]:
# Lookup tables keyed by song_id, filled from the metadata frames below.
song_to_genre = dict()
song_to_lyricist = dict()
song_to_artist = dict()
song_to_album = dict()
song_to_language = dict()
# NOTE(review): declared but never populated in this cell — confirm whether
# album_month was meant to be loaded from meta_song as well.
song_to_album_month = dict()

# The progress-bar total must be the length of the frame actually iterated;
# using len(meta_song) here made the bar overflow / stop short.
# NOTE(review): a song_id that appears in several rows keeps only the value
# from its last row — confirm that one genre per song is intended.
for row in tqdm(meta_song_genre.itertuples(), total=len(meta_song_genre)):
    song_to_genre[row.song_id] = row.genre_id

# Same last-row-wins behaviour as the genre table above.
for row in tqdm(meta_song_lyricist.itertuples(), total=len(meta_song_lyricist)):
    song_to_lyricist[row.song_id] = row.lyricist_id

# Missing ids (NaN) are skipped so the dicts only hold real integer ids.
for row in tqdm(meta_song.itertuples(), total=len(meta_song)):
    if not pd.isna(row.artist_id):
        song_to_artist[row.song_id] = int(row.artist_id)

    if not pd.isna(row.album_id):
        song_to_album[row.song_id] = int(row.album_id)

    if not pd.isna(row.language_id):
        song_to_language[row.song_id] = int(row.language_id)

1161955it [00:00, 2004455.29it/s]                            
 46%|████▌     | 475632/1030712 [00:00<00:00, 1746936.17it/s]
100%|██████████| 1030712/1030712 [00:01<00:00, 599710.19it/s]


In [4]:
# Stack the source and target listening rows into one frame (fresh 0..n-1
# index), then order every play chronologically by its timestamp.
combined_train = (
    pd.concat([train_source, train_target], ignore_index=True)
    .sort_values(by=['unix_played_at'])
)

In [5]:
from collections import defaultdict

# Collect, for every song_id, the list of session_ids of the rows in which it
# appears (one entry per row, in combined_train's current row order).
grouped_by_songs = combined_train.groupby('song_id')['session_id'].apply(list)

# Convert the grouped Series into a plain dict: song_id -> [session_id, ...].
# This is a regular dict, not a defaultdict, so use .get(song_id, []) for
# songs that never appear in the training logs.
song_to_sessions = grouped_by_songs.to_dict()

In [6]:
from collections import defaultdict
import json
import os

# song_id -> {'id': song_id, 'contents': text} for every song written to the
# corpus file; songs that are skipped below never get an entry.
song_info = defaultdict(lambda: defaultdict(str))
fields = ['artist', 'album', 'language', 'genre']

# Metadata lookup used for each field name; the field name is also the token
# prefix (e.g. "artist123").
fields_to_dict = {
    'artist': song_to_artist,
    'album': song_to_album,
    'language': song_to_language,
    'genre': song_to_genre,
}

# Songs whose contents string is shorter than this many characters are not
# written to the corpus.
MIN_CONTENT_LENGTH = 3

os.makedirs('corpus', exist_ok=True)

with open("./corpus/sparse_corpus.jsonl", 'w', encoding='utf-8') as jsonl_file:
    for row in tqdm(meta_song.itertuples(), total=len(meta_song)):
        song_id = row.song_id

        # Metadata tokens: "<field><value>" for every field that has a value.
        tokens = []
        for field in fields:
            value = fields_to_dict[field].get(song_id, "")
            if value != "":
                tokens.append(f'{field}{value}')

        # Session tokens: each session id rendered as hex without the leading
        # two characters of hex()'s output ("0x" for non-negative ids).
        tokens.extend(hex(session)[2:] for session in song_to_sessions.get(song_id, []))

        # Every token is non-empty, so a single join yields the same string as
        # joining the two groups separately and gluing them with one space.
        song_content = ' '.join(tokens)

        if len(song_content) < MIN_CONTENT_LENGTH:
            continue

        record = song_info[song_id]
        record['id'] = song_id
        record['contents'] = song_content

        # One JSON object per line (JSONL).
        jsonl_file.write(json.dumps(record) + '\n')

100%|██████████| 1030712/1030712 [00:10<00:00, 95813.95it/s]


In [7]:
len(song_info)

1019117