In [1]:
import pandas as pd 
import numpy as np
import os 
import pickle
from tqdm import trange

In [2]:
# Name of the dataset; the input and output folder names are derived from it.
dataset = 'KKBOX'

# Both folders live directly under the current working directory:
# raw CSV input in "<dataset>_raw", processed output in "<dataset>".
_cwd = os.getcwd()
datafolder = os.path.join(_cwd, dataset + '_raw')
savefolder = os.path.join(_cwd, dataset)

In [3]:
# Read the four raw CSV tables from the raw-data folder, in the same order
# as they are unpacked on the left-hand side.
train, songs, song_extra_info, members = (
    pd.read_csv(os.path.join(datafolder, table + '.csv'))
    for table in ('train', 'songs', 'song_extra_info', 'members')
)

In [4]:
# Select the msno, song_id and source_system_tab columns of the training
# table and rename them to user, item and op respectively.
interaction_columns = {"msno": "user", "song_id": "item", "source_system_tab": "op"}
train_data = train[list(interaction_columns)].rename(columns=interaction_columns)

In [5]:
# Discard every row that has a missing value in any column.
train_data = train_data.dropna(axis=0, how='any')

In [6]:
# Preview the first five interactions.
train_data.head(5)

Unnamed: 0,user,item,op
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,explore
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,my library
2,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,my library
3,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,my library
4,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,explore


In [7]:
# Item attribute table: song id plus artist, genre ids and language, renamed
# to item / att1 / att2 / att3. Songs missing any of these values are dropped.
attribute_columns = {
    "song_id": "item",
    "artist_name": "att1",
    "genre_ids": "att2",
    "language": "att3",
}
item_att = songs[list(attribute_columns)].rename(columns=attribute_columns).dropna()

In [8]:
# Preview the first five rows of the attribute table.
item_att.head(5)

Unnamed: 0,item,att1,att2,att3
0,CXoTN1eb7AI+DntdU1vbcwGRV4SCIDxZu+YD8JP8r4E=,張信哲 (Jeff Chang),465,3.0
1,o0kFgae9QtnYgRkVPqLJwa05zIhRlUjfF7O1tDw0ZDU=,BLACKPINK,444,31.0
2,DwVvVurfpuz+XPuFvucclVQEyPqcpUkHR0ne1RQzPs0=,SUPER JUNIOR,465,31.0
3,dKMBWoZyScdxSkihKG+Vf47nc18N9q4m58+b4e7dSSE=,S.H.E,465,3.0
4,W3bqWd3T+VeHFzHAUfARgW9AvVRaF4N5Yzm4Mr6Eo/o=,貴族精選,726,52.0


In [9]:
# Minimum number of interactions an item needs in order to be kept.
MIN_ITEM_COUNT = 20

# Count how often each item occurs. The index and the value column are named
# explicitly so the layout ('index' = item id, 'item' = occurrence count) is
# the same on pandas 1.x and 2.x; a bare reset_index() after value_counts()
# produces different column names on the two major versions.
item_count = (
    train_data['item']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='item')
)

# Ids of the items that occur fewer than MIN_ITEM_COUNT times.
less_item = item_count.loc[item_count['item'] < MIN_ITEM_COUNT, 'index']

In [10]:
# Remove every interaction whose item is listed in less_item (rare items).
is_rare_item = train_data['item'].isin(less_item)
train_data = train_data[~is_rare_item]

In [11]:
# Shape of the array of distinct remaining items: (number of unique items,).
remaining_items = train_data['item'].unique()
remaining_items.shape

(36790,)

In [12]:
# Split the interaction log into sessions.
# A session is a maximal run of consecutive rows that share the same user
# (first column). Each session is a list of (item, op) tuples taken from the
# second and third columns; runs with fewer than two interactions are dropped.
train_session = []  # all kept sessions
one_session = []  # the session currently being accumulated

# Extract the three columns once as arrays: a per-row DataFrame.iloc lookup
# is very slow when repeated over millions of rows.
session_users = train_data.iloc[:, 0].to_numpy()
session_items = train_data.iloc[:, 1].to_numpy()
session_ops = train_data.iloc[:, 2].to_numpy()

for i in trange(len(train_data)):
    # A change of user closes the session collected so far.
    if i > 0 and session_users[i] != session_users[i - 1]:
        if len(one_session) >= 2:
            train_session.append(one_session)
        one_session = []
    one_session.append((session_items[i], session_ops[i]))  # (item, op)

# The loop only closes a session when the *next* user shows up, so the final
# session must be flushed explicitly; otherwise the last user's data is lost.
if len(one_session) >= 2:
    train_session.append(one_session)

100%|██████████| 6342123/6342123 [09:03<00:00, 11659.92it/s]


In [13]:
# Encode items and ops as consecutive integer ids starting at 1, following the
# order in which Series.unique() returns the distinct values.
item2id = {item: idx for idx, item in enumerate(train_data['item'].unique(), start=1)}
op2id = {op: idx for idx, op in enumerate(train_data['op'].unique(), start=1)}

In [14]:
# Re-encode every session: raw (item, op) pairs become (item id, op id) pairs.
all_sessions = [
    [(item2id[raw_item], op2id[raw_op]) for raw_item, raw_op in session]
    for session in train_session
]

In [15]:
# Positional 90/10 split: the first 90% of the sessions (in their current
# order, no shuffling) form the training set, the remainder the test set.
split_at = int(len(all_sessions) * 0.9)
train_sessions = all_sessions[:split_at]
test_sessions = all_sessions[split_at:]

In [16]:
# Write the train and test sessions to <savefolder>/train.pkl and test.pkl.
# The output folder is created if it does not exist yet, and the files are
# opened in `with` blocks so each handle is flushed and closed right away.
os.makedirs(savefolder, exist_ok=True)

with open(os.path.join(savefolder, 'train.pkl'), 'wb') as train_file:
    pickle.dump(train_sessions, train_file)

with open(os.path.join(savefolder, 'test.pkl'), 'wb') as test_file:
    pickle.dump(test_sessions, test_file)

In [17]:
# Process the knowledge graph

In [18]:
# Give every distinct attribute value (across all attribute columns of
# item_att) its own id. Attribute ids continue right after the largest item id,
# so the two id ranges do not overlap.
att2id = {}
first_att_id = max(item2id.values()) + 1
for att_value in item_att.to_numpy()[:, 1:].ravel():
    if att_value in att2id:
        continue
    att2id[att_value] = first_att_id + len(att2id)
# Next unused id after all attribute ids have been assigned.
now_id = first_att_id + len(att2id)

In [28]:
# Build the knowledge-graph triples [head, tail, relation]:
#   head     - id of the item (rows whose item has no id in item2id are skipped)
#   tail     - id of the attribute value
#   relation - 0-based position of the attribute column within the row
kg = [
    [item2id[row[0]], att2id[att_value], relation_id]
    for row in item_att.to_numpy()
    if row[0] in item2id
    for relation_id, att_value in enumerate(row[1:])
]

In [29]:
# Turn the triple list into an integer DataFrame with one column per triple slot.
kg_columns = ['head', 'tail', 'relation']
kg2id = pd.DataFrame(data=kg, columns=kg_columns, dtype='int')

In [30]:
# Save the triples as CSV (no index column) to <savefolder>/kg2id.
kg_path = os.path.join(savefolder, 'kg2id')
kg2id.to_csv(kg_path, index=False)

In [31]:
# Persist a 3-tuple of dataset sizes to <savefolder>/dataset_info.pkl.
dataset_info = (
    # Number of item ids plus one (item ids start at 1; presumably the extra
    # slot accounts for padding id 0 -- confirm with the consumer).
    len(item2id) + 1,
    # Relation ids start at 0, so max + 1 is the number of relations.
    kg2id['relation'].max() + 1,
    # Largest head (item) id plus one.
    # NOTE(review): if this is meant to be the total entity count it should
    # probably also cover the 'tail' column, whose attribute ids are larger
    # than every item id -- confirm against the code that reads this file.
    kg2id['head'].max() + 1,
)

# Use a `with` block so the file handle is flushed and closed deterministically.
with open(os.path.join(savefolder, 'dataset_info.pkl'), 'wb') as info_file:
    pickle.dump(dataset_info, info_file)

In [27]:
# All padding values are 0