In [1]:
import pandas as pd
import numpy as np
from pandasql import sqldf
from collections import Counter
import ast

In [2]:
# Load the two processed per-platform tables that the triples are built from.
cloud = pd.read_csv(
    "processed_data/prossed_cloud.csv",
    encoding='utf-8',
)
kuwo = pd.read_csv(
    "processed_data/prossed_kuwo.csv",
    encoding='utf-8',
)

In [3]:
# Header-less, tab-separated lookup files; their columns are addressed
# positionally (0 and 1) when the 'name' attribute triples are built.
cloud_id2artist_df = pd.read_csv(
    "id2name/cloud_id2artist",
    sep='\t',
    header=None,
    encoding='utf-8',
)
kuwo_id2artist_df = pd.read_csv(
    "id2name/kuwo_id2artist",
    sep='\t',
    header=None,
    encoding='utf-8',
)

In [4]:
def build_single_relation(df,h,t,r):
    """Return the distinct (head, relation, tail) triples for one relation.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table; must contain the columns named by ``h`` and ``t``.
    h, t : column labels
        Columns holding the head and the tail of each triple.
    r : object
        Relation label written into every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``[h, 'relation', t]``, one row per distinct ``(h, t)`` pair,
        keeping the index labels of the first occurrence of each pair.
    """
    pairs = df.loc[:, [h, t]].drop_duplicates()
    # Slot the constant relation column between head and tail.
    pairs.insert(1, 'relation', [r] * len(pairs))
    return pairs

In [5]:
def build_single_bidirection_relation(df,h,t,r1,r2):
    """Return the distinct triples of a relation in both directions.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table; must contain the columns named by ``h`` and ``t``.
    h, t : column labels
        Columns holding the two endpoints of the relation.
    r1 : object
        Relation label for the ``h -> t`` direction.
    r2 : object
        Relation label for the ``t -> h`` direction.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(forward, backward)`` where ``forward`` has columns
        ``[h, 'relation1', t]`` and ``backward`` has columns
        ``[t, 'relation2', h]``; both have one row per distinct ``(h, t)`` pair.
    """
    pairs = df[[h, t]].drop_duplicates()
    n_pairs = len(pairs)

    # h -> t direction
    forward = pairs.copy()
    forward.insert(1, 'relation1', [r1] * n_pairs)

    # t -> h direction: same pairs with the endpoints swapped
    backward = pairs[[t, h]]
    backward.insert(1, 'relation2', [r2] * n_pairs)

    return forward, backward

In [6]:
def build_song2artist(df):
    """Build the song <-> artist relation triples.

    Each ``artist_id`` cell is parsed with ``ast.literal_eval`` and iterated,
    so it has to be a string holding a Python literal of an iterable
    (e.g. ``"[12, 34]"``); every element is paired with the row's ``song_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``song_id`` and ``artist_id``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(song2artist, artist2song)``: the first has columns
        ``['song_id', 'relation1', 'artist_id']`` with relation ``'sung_by'``,
        the second ``['artist_id', 'relation2', 'song_id']`` with relation
        ``'sings'``. Duplicate (song, artist) pairs are dropped. If no pair is
        produced (for instance an empty ``df``) both frames are empty but still
        carry these columns.
    """
    # Walk the two columns directly instead of materialising a row Series
    # with df.iloc[i] twice per row.
    pairs = [
        [song, artist]
        for song, raw_artists in zip(df['song_id'], df['artist_id'])
        for artist in ast.literal_eval(raw_artists)
    ]
    # Naming the columns at construction keeps the frame two columns wide even
    # when `pairs` is empty; relabelling a zero-column frame afterwards raises.
    re_df = (
        pd.DataFrame(pairs, columns=['song_id', 'artist_id'])
        .drop_duplicates()
        .assign(relation1='sung_by', relation2='sings')
    )
    return re_df[['song_id','relation1','artist_id']], re_df[['artist_id','relation2','song_id']]

In [7]:
def _write_triples(frames, path):
    """Stack the rows of ``frames`` and write them to ``path`` as a header-less TSV."""
    import os  # local import: the notebook's import cell does not provide os

    out_dir = os.path.dirname(path)
    if out_dir:
        # to_csv does not create missing parent directories, so make sure the
        # target directory exists before writing.
        os.makedirs(out_dir, exist_ok=True)
    # The frames carry different column labels, so their raw values are
    # stacked positionally rather than aligned by label.
    stacked = pd.DataFrame(np.concatenate([frame.values for frame in frames]))
    stacked.to_csv(path, header=False, index=False, sep='\t')


def build_relations(df, id2artist_df, df_num):
    """Write the relation and attribute triple files for one dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the columns ``song_id``, ``artist_id``, ``playlist_id``,
        ``album_id``, ``song_duration``, ``song_name`` and ``album_name``.
    id2artist_df : pandas.DataFrame
        Table whose columns ``0`` and ``1`` are used as the head and tail of
        the artist ``'name'`` attribute triples.
    df_num : object
        Suffix interpolated into the two output file names.

    Returns
    -------
    None
        Writes ``triples/rel_triples_{df_num}`` and
        ``triples/attr_triples_{df_num}`` as tab-separated files without
        header or index; the ``triples`` directory is created if missing.
    """
    # Relation triples: song <-> artist, playlist <-> song, album <-> song.
    song2artist, artist2song = build_song2artist(df)
    playlist2song, song2playlist = build_single_bidirection_relation(df,'playlist_id','song_id','collects','collected_by')
    album2song, song2album = build_single_bidirection_relation(df,'album_id','song_id','contains','contained_by')
    _write_triples(
        [song2artist, artist2song, playlist2song, song2playlist, album2song, song2album],
        f"triples/rel_triples_{df_num}",
    )

    # Attribute triples: song duration plus the names of artists, songs and albums.
    song2duration = build_single_relation(df,'song_id','song_duration','duration')
    id2artist = build_single_relation(id2artist_df,0,1,'name')
    id2song = build_single_relation(df,'song_id','song_name','name')
    id2album = build_single_relation(df,'album_id','album_name','name')
    _write_triples(
        [song2duration, id2artist, id2song, id2album],
        f"triples/attr_triples_{df_num}",
    )

In [8]:
# Dataset 1: write triples/rel_triples_1 and triples/attr_triples_1 from the cloud tables.
build_relations(df=cloud, id2artist_df=cloud_id2artist_df, df_num=1)

In [9]:
# Dataset 2: write triples/rel_triples_2 and triples/attr_triples_2 from the kuwo tables.
build_relations(df=kuwo, id2artist_df=kuwo_id2artist_df, df_num=2)