In [189]:
import numpy as np
import pandas as pd
from itertools import combinations
from collections import defaultdict
from sklearn.model_selection import train_test_split

In [190]:
data_path = '/opt/ml/input/data/train/'
train_df = pd.read_csv('/opt/ml/input/data/train/train_ratings.csv')

In [191]:
directors = pd.read_csv(data_path + 'directors.tsv', sep='\t')
genres= pd.read_csv(data_path + 'genres.tsv', sep='\t')
titles = pd.read_csv(data_path + 'titles.tsv', sep='\t')
writers = pd.read_csv(data_path + 'writers.tsv', sep='\t')
years = pd.read_csv(data_path + 'years.tsv', sep='\t')

In [192]:
writers

Unnamed: 0,item,writer
0,1237,nm0000005
1,5147,nm0000005
2,7327,nm0000005
3,2068,nm0000005
4,7396,nm0000005
...,...,...
11301,27397,nm5371819
11302,2987,nm5927607
11303,33463,nm5927607
11304,2987,nm5927608


In [193]:
director_series = directors.groupby('director').apply(lambda r: list(r.item))

In [194]:
rel_num = 1

In [195]:
all_list = []

In [196]:
director_combi = director_series.map(lambda x: all_list.extend(list(combinations(x, 2))))

In [197]:
director_kg = pd.DataFrame(np.zeros((len(all_list), 3), dtype=np.int32), columns=['h', 'r', 't'])

In [198]:
h_list, t_list, r_list = [], [], []

In [199]:
for h, t in all_list:
    h_list.append(h)
    t_list.append(t)
r_list = [rel_num] * len(all_list)

In [200]:
director_kg['h'] = h_list
director_kg['t'] = t_list
director_kg['r'] = r_list

In [201]:
director_kg

Unnamed: 0,h,r,t
0,1237,1,5147
1,1237,1,7327
2,1237,1,2068
3,1237,1,7396
4,1237,1,7820
...,...,...,...
18890,82461,1,101864
18891,84273,1,58376
18892,84273,1,65225
18893,58376,1,65225


In [202]:
# data_path = '/opt/ml/input/data/train/'
# def conversion_kg(data_path:str, attribute:str, rel_num:int = 0):
    
#     attr = pd.read_csv(data_path + attribute + 's.tsv', sep='\t')
#     attr_series = attr.groupby(attribute).apply(lambda full_rel: list(full_rel.item))
    
#     all_list = []
#     attr_combi = attr_series.map(lambda x: all_list.extend(list(combinations(x, 2))))
    
#     attr_kg = pd.DataFrame(np.zeros((len(all_list), 3), dtype=np.int32), columns=['h', 'r', 't'])
#     h_list, t_list, r_list = [], [], []
#     for h, t in all_list:
#         h_list.append(h)
#         t_list.append(t)
#     r_list = [rel_num] * len(all_list)
    
#     attr_kg['h'] = h_list
#     attr_kg['t'] = t_list
#     attr_kg['r'] = r_list
    
#     return attr_kg

In [203]:
data_path = '/opt/ml/input/data/train/'
def conversion_kg(data_path:str, attribute:str, rel_num:int = 0):
    
    attr = pd.read_csv(data_path + attribute + 's.tsv', sep='\t')
    attr_series = attr.groupby(attribute).apply(lambda full_rel: list(full_rel.item))
    
    all_set = set()
    attr_combi = attr_series.map(lambda x: all_set | set(combinations(x, 2)))
    
    attr_kg = pd.DataFrame(np.zeros((len(all_list), 3), dtype=np.int32), columns=['h', 'r', 't'])
    h_list, t_list, r_list = [], [], []
    for h, t in all_list:
        h_list.append(h)
        t_list.append(t)
    r_list = [rel_num] * len(all_list)
    
    attr_kg['h'] = h_list
    attr_kg['t'] = t_list
    attr_kg['r'] = r_list
    
    return attr_kg

In [204]:
rel_dict = {
    'director': 0,
    'genre': 1,
    'writer' : 2,
    'year' : 3
    }

In [205]:
df = pd.DataFrame(columns=['h', 'r', 't'])
for key, value in rel_dict.items():
    df = pd.concat([df, conversion_kg(data_path, key, value)], ignore_index=True)

In [206]:
df

Unnamed: 0,h,r,t
0,1237,0,5147
1,1237,0,7327
2,1237,0,2068
3,1237,0,7396
4,1237,0,7820
...,...,...,...
75575,82461,3,101864
75576,84273,3,58376
75577,84273,3,65225
75578,58376,3,65225


In [207]:
df.to_csv('kg_final.csv', index=False)

### train.txt

In [175]:
train_split, test_split = train_test_split(train_df, test_size=0.20, train_size=0.80, random_state=42)

In [176]:
train = train_split.sort_values(['user', 'time'])
test = test_split.sort_values(['user', 'time'])


In [177]:
def to_string(x):
    tmp = ''
    for i in x.item:
        tmp += str(i) + ' '
    return tmp

In [178]:
train_srs = train.groupby('user').apply(to_string)
test_srs = test.groupby('user').apply(to_string)


In [179]:
train_df = pd.DataFrame(columns=['user', 'item'])
test_df = pd.DataFrame(columns=['user', 'item'])

In [180]:
train_df['user'] = list(train_srs.index)
train_df['item'] = train_srs.values

test_df['user'] = list(test_srs.index)
test_df['item'] = test_srs.values

In [181]:
train_df.to_csv('train.csv', index=False)
test_df.to_csv('test.csv', index=False)

In [182]:
def load_cf_df(filename):
    user = []
    item = []
    user_dict = dict()
    
    df = pd.read_csv(filename)
    for u, items in zip(df.user, df.item):
        inter = [int(i) for i in items.split()]

        if len(inter) > 1:
            user_id, item_ids = u, inter
            item_ids = list(set(item_ids))

            for item_id in item_ids:
                user.append(user_id)
                item.append(item_id)
            user_dict[user_id] = item_ids

    user = np.array(user, dtype=np.int32)
    item = np.array(item, dtype=np.int32)
    return (user, item), user_dict

In [183]:
(user, item), user_dict = load_cf_df('/opt/ml/input/exp_ipynb/test.csv')