In [1]:
import pandas as pd

# Load the three labelled interaction tables from the competition data folder.
# Columns noted for every file:
#   index, session_id, song_id, unix_played_at, play_status, login_type, listening_order
train_source, train_target, test_source = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "../../datagame-2023/label_train_source.parquet",
        "../../datagame-2023/label_train_target.parquet",
        "../../datagame-2023/label_test_source.parquet",
    )
)

In [2]:
# Stack every available interaction row (train source, train target, test source)
# into a single frame; the original row indices of each frame are kept as-is.
all_data = pd.concat(
    objs=[train_source, train_target, test_source],
)

In [3]:
# Map each session_id to the list of its song_ids, ordered by listening_order.
session_to_songs = (
    all_data
    .sort_values(by=['session_id', 'listening_order'])
    .groupby('session_id')['song_id']
    .apply(list)
    .to_dict()
)

In [4]:
# `data` is a second name for the same dict object as `session_to_songs`
# (no copy is made), so in-place changes made through `data` are also
# visible through `session_to_songs`.
data = session_to_songs

In [5]:
# Special marker appended to the end of every session's song list, and how many
# copies to append. Five copies match the five look-ahead positions scored when
# the n-gram tables are built, so a context taken from one session can never be
# paired with a target song from the following session in the flattened list.
END_MARKER = 'END'
END_PADDING = 5

for user_songs in data.values():
    # The lists are modified in place, so guard the append: re-running this
    # cell must not stack another five markers onto every session.
    if user_songs[-END_PADDING:] != [END_MARKER] * END_PADDING:
        user_songs.extend([END_MARKER] * END_PADDING)

# Flatten all padded session lists into one long list, in dict iteration order.
all_songs = [song for songs in data.values() for song in songs]

In [6]:
from collections import Counter


class myConditionalFreqDist:
    """Weighted conditional frequency table.

    Stores, for every condition, a mapping ``sample -> accumulated weight``.
    The weight added by each observation is picked from ``self.values`` by
    position index.
    """

    def __init__(self):
        """Create an empty table with the default position weights."""
        # condition -> {sample: accumulated weight}
        self._data = {}
        # Per-position weights (NDCG-style discounts), indexed by `index` in inc().
        self.values = [1, 0.63, 0.5, 0.43, 0.38]

    def __getitem__(self, key):
        """Return the sample->weight dict for `key`, or a new empty dict if absent.

        The empty dict returned for an unknown key is not stored in the table.
        """
        return self._data.get(key, {})

    def __setitem__(self, key, value):
        """Replace the whole sample->weight mapping stored for `key`."""
        self._data[key] = value

    def inc(self, condition, sample, index):
        """Add ``self.values[index]`` to the weight of `sample` under `condition`.

        Raises:
            IndexError: if `index` is outside ``self.values``; the table is
                left unchanged in that case.
        """
        # Resolve the weight first so an invalid index leaves the table untouched.
        weight = self.values[index]
        samples = self._data.setdefault(condition, {})
        samples[sample] = samples.get(sample, 0) + weight

    def most_common(self, condition, n=None):
        """Return ``(sample, weight)`` pairs for `condition`, heaviest first.

        Args:
            condition: key to look up.
            n: maximum number of pairs to return; ``None`` returns all of them.

        Returns:
            A list of pairs, or an empty list when `condition` is unknown.
        """
        samples = self._data.get(condition)
        if samples is None:
            return []
        return Counter(samples).most_common(n)


# Small sanity check of the weighted frequency table.
cfd = myConditionalFreqDist()
for condition, sample, index in (
    ('condition1', 'sample1', 0),
    ('condition1', 'sample2', 1),
    ('condition1', 'sample2', 3),
    ('condition2', 'sample1', 1),
    ('condition2', 'sample1', 2),
    ('condition2', 'sample2', 1),
):
    cfd.inc(condition, sample, index)

# Output: [('sample2', 1.06), ('sample1', 1)]
print(cfd.most_common('condition1'))
# Output: [('sample1', 1.13)]
print(cfd.most_common('condition2', n=1))


[('sample2', 1.06), ('sample1', 1)]
[('sample1', 1.13)]


In [7]:
from tqdm import tqdm

# Build the table keyed by four consecutive songs: each of the next five
# positions that holds a real song is credited with the weight for its offset
# (offset 0 = the song immediately after the context).
cfd_5grams = myConditionalFreqDist()

# Stopping 9 short of the end keeps every look-ahead index (up to i + 8) in range.
for i in tqdm(range(len(all_songs) - 9)):
    context = all_songs[i:i + 4]
    # The context is identical for all five look-ahead offsets, so validate it
    # and build the key once per position instead of once per offset.
    if 'END' in context:
        continue
    # NOTE(review): the key is a plain concatenation of the song ids with no
    # separator; distinct contexts can only collide if ids vary in length -
    # presumably they are fixed-width, confirm against the data.
    key = context[0] + context[1] + context[2] + context[3]
    for rank, target in enumerate(all_songs[i + 4:i + 9]):
        if target != 'END':
            cfd_5grams.inc(key, target, rank)

100%|██████████| 20744361/20744361 [00:57<00:00, 359822.18it/s]


In [8]:
import pickle

# Persist the 4-song-context table to disk. pickle stores a reference to the
# class rather than its code, so the myConditionalFreqDist definition must be
# available wherever this file is loaded back.
with open('cfd_5grams_test', 'wb') as file:
    pickle.dump(cfd_5grams, file)

In [9]:
# Drop the notebook-level reference to the table now that it has been saved,
# so its memory can be reclaimed before the next table is built.
del cfd_5grams

In [10]:
from tqdm import tqdm

# Build the table keyed by three consecutive songs: each of the next five
# positions that holds a real song is credited with the weight for its offset
# (offset 0 = the song immediately after the context).
cfd_4grams = myConditionalFreqDist()

# Stopping 8 short of the end keeps every look-ahead index (up to i + 7) in range.
for i in tqdm(range(len(all_songs) - 8)):
    context = all_songs[i:i + 3]
    # The context is identical for all five look-ahead offsets, so validate it
    # and build the key once per position instead of once per offset.
    if 'END' in context:
        continue
    # NOTE(review): the key is a plain concatenation of the song ids with no
    # separator; distinct contexts can only collide if ids vary in length -
    # presumably they are fixed-width, confirm against the data.
    key = context[0] + context[1] + context[2]
    for rank, target in enumerate(all_songs[i + 3:i + 8]):
        if target != 'END':
            cfd_4grams.inc(key, target, rank)

100%|██████████| 20744362/20744362 [00:51<00:00, 399707.18it/s]


In [11]:
import pickle

# Persist the 3-song-context table to disk. pickle stores a reference to the
# class rather than its code, so the myConditionalFreqDist definition must be
# available wherever this file is loaded back.
with open('cfd_4grams_test', 'wb') as file:
    pickle.dump(cfd_4grams, file)

In [12]:
# Drop the notebook-level reference to the table now that it has been saved,
# so its memory can be reclaimed before the next table is built.
del cfd_4grams

In [13]:
from tqdm import tqdm

# Build the table keyed by two consecutive songs: each of the next five
# positions that holds a real song is credited with the weight for its offset
# (offset 0 = the song immediately after the context).
cfd_3grams = myConditionalFreqDist()

# Stopping 7 short of the end keeps every look-ahead index (up to i + 6) in range.
for i in tqdm(range(len(all_songs) - 7)):
    context = all_songs[i:i + 2]
    # The context is identical for all five look-ahead offsets, so validate it
    # and build the key once per position instead of once per offset.
    if 'END' in context:
        continue
    # NOTE(review): the key is a plain concatenation of the song ids with no
    # separator; distinct contexts can only collide if ids vary in length -
    # presumably they are fixed-width, confirm against the data.
    key = context[0] + context[1]
    for rank, target in enumerate(all_songs[i + 2:i + 7]):
        if target != 'END':
            cfd_3grams.inc(key, target, rank)

100%|██████████| 20744363/20744363 [00:44<00:00, 462385.90it/s]


In [14]:
import pickle

# Persist the 2-song-context table to disk. pickle stores a reference to the
# class rather than its code, so the myConditionalFreqDist definition must be
# available wherever this file is loaded back.
with open('cfd_3grams_test', 'wb') as file:
    pickle.dump(cfd_3grams, file)

In [14]:
# Drop the notebook-level reference to the table now that it has been saved,
# so its memory can be reclaimed.
del cfd_3grams