In [1]:
import torch

import os
from torch.utils.data import Dataset
from utils import neg_sample, join_attribute, feature_matrix

import pandas as pd
from pathlib import Path

In [2]:
rating_df = pd.read_csv('/opt/ml/input/data/train/'+ 'rating.csv')
attr_path = os.path.join('/opt/ml/input/data/train/','genre.csv')
attr_df = pd.read_csv(attr_path)

len(rating_df['item'].unique())

6807

In [3]:
len(rating_df['user'].unique())

31360

In [4]:
user_watch = rating_df.groupby('user')['item'].apply(list)

In [5]:
attr_df.items
items = set(rating_df['item'])

In [6]:
data = []
for user, item in user_watch.iteritems():
    un_watched = [i for i in items if i not in item]
    data += [[user,i] for i in un_watched]
new_df = pd.DataFrame(data, columns=["user","item"])

len(new_df)
new_df

Unnamed: 0,user,item
0,11,2
1,11,3
2,11,32770
3,11,5
4,11,6
...,...,...
208313044,138493,98243
208313045,138493,32721
208313046,138493,32728
208313047,138493,32743


In [7]:
attr_df = pd.read_csv(attr_path, index_col=0)
attr_df

#joined_rating_df = pd.merge(new_df, attr_df, left_on='item', right_on='item', how='inner')

joined_rating_df = new_df.set_index('item').join(attr_df.set_index('item'), how='left')

joined_rating_df = joined_rating_df.sort_values(by=['user','item'])
joined_rating_df

Unnamed: 0_level_0,user,genre
item,Unnamed: 1_level_1,Unnamed: 2_level_1
2,11,15
3,11,5
4,11,5
5,11,5
6,11,17
...,...,...
118700,138493,4
118900,138493,4
118997,138493,1
119141,138493,17


In [32]:
# user, item을 zero-based index로 mapping
users = list(set(joined_rating_df.loc[:,'user']))
users.sort()
items =  list(set((joined_rating_df.loc[:, 'item'])))
items.sort()
attrs =  list(set((joined_rating_df.loc[:, "genre"])))
attrs.sort()

if len(users)-1 != max(users):
    users_dict = {users[i]: i for i in range(len(users))}
    joined_rating_df['user']  = joined_rating_df['user'].map(lambda x : users_dict[x])
    users = list(set(joined_rating_df.loc[:,'user']))
        
if len(items)-1 != max(items):
    items_dict = {items[i]: i for i in range(len(items))}
    joined_rating_df['item']  = joined_rating_df['item'].map(lambda x : items_dict[x])
    items =  list(set((joined_rating_df.loc[:, 'item'])))

joined_rating_df = joined_rating_df.sort_values(by=['user'])
joined_rating_df.reset_index(drop=True, inplace=True)

print("# of users : {}\n# of items : {}\n# of genres : {}".format(len(users), len(items), len(attrs)))

# of users : 20
# of items : 6807
# of genres : 18


In [33]:
n_user = len(users)
n_item = len(items)

In [34]:
data = joined_rating_df

In [45]:
user_col = torch.tensor(data.loc[:,'user'])
item_col = torch.tensor(data.loc[:,'item'])
genre_col = torch.tensor(data.loc[:,'genre'])
print(user_col)
print(item_col)
print(genre_col)

tensor([ 0,  0,  0,  ..., 19, 19, 19])
tensor([   1,   23, 3458,  ..., 6804, 6754, 6806])
tensor([15,  4, 17,  ...,  1, 17, 17])


In [47]:
offsets = [0, n_user, n_user+n_item]
for col, offset in zip([user_col, item_col, genre_col], offsets):
    col += offset

X = torch.cat([user_col.unsqueeze(1), item_col.unsqueeze(1), genre_col.unsqueeze(1)], dim=1)

In [48]:
print(user_col)
print(item_col)
print(genre_col)
print(X)

tensor([ 0,  0,  0,  ..., 19, 19, 19])
tensor([  41,   63, 3498,  ..., 6844, 6794, 6846])
tensor([13669, 13658, 13671,  ..., 13655, 13671, 13671])
tensor([[    0,    41, 13669],
        [    0,    63, 13658],
        [    0,  3498, 13671],
        ...,
        [   19,  6844, 13655],
        [   19,  6794, 13671],
        [   19,  6846, 13671]])


In [44]:
for col, offset in zip([user_col, item_col, genre_col], offsets):
    col -= offset


print(user_col)
print(item_col)
print(genre_col)

tensor([ 0,  0,  0,  ..., 19, 19, 19])
tensor([   1,   23, 3458,  ..., 6804, 6754, 6806])
tensor([15,  4, 17,  ...,  1, 17, 17])


In [50]:
a = 1,1,1] 
a -= [2,2,2]
a

TypeError: unsupported operand type(s) for -=: 'list' and 'list'

In [5]:
class InferenceDataset(Dataset):
    def __init__(self, rating_dir, attr_dir):
        self.rating_dir = rating_dir
        self.attr_dir = attr_dir
        self.data = None
        self.user_dict = None
        self.item_dict = None
        self.offsets = None
        self.X = None
        
        self.setup()
        
    def setup(self):
        rating_df = pd.read_csv(self.rating_dir)
        attr_df = pd.read_csv(self.attr_dir, index_col = 0)
        
        print("Create inference samples")
        data = self._inference_sample(rating_df)
        data = pd.DataFrame(data, columns=["user","item"])

        print('Join attribute dataframe')
        joined_rating_df = pd.merge(data, attr_df, left_on='item', right_on='item', how='inner')

        users = list(set(joined_rating_df.loc[:,'user']))
        users.sort()
        items =  list(set((joined_rating_df.loc[:, 'item'])))
        items.sort()
        attrs =  list(set((joined_rating_df.loc[:, "genre"])))
        attrs.sort()

        if len(users)-1 != max(users):
            users_dict = {users[i]: i for i in range(len(users))}

            self.user_dict = {v:k for k,v in users_dict.items()}

            joined_rating_df['user']  = joined_rating_df['user'].map(lambda x : users_dict[x])
            users = list(set(joined_rating_df.loc[:,'user']))
                
        if len(items)-1 != max(items):
            items_dict = {items[i]: i for i in range(len(items))}
            
            self.item_dict = {v:k for k,v in items_dict.items()}

            joined_rating_df['item']  = joined_rating_df['item'].map(lambda x : items_dict[x])
            items =  list(set((joined_rating_df.loc[:, 'item'])))

        joined_rating_df = joined_rating_df.sort_values(by=['user'])
        joined_rating_df.reset_index(drop=True, inplace=True)
        self.data = joined_rating_df
        
        self.X = self._feature_matrix()

    def _inference_sample(self,rating_df):
        items = set(rating_df['item'])
        user_rating = rating_df.groupby('user')['item'].apply(list)

        data = []

        cnt = 10
        for user, u_items in user_rating.iteritems():
            un_watched = [i for i in items if i not in u_items]
            data += [[user,i] for i in un_watched]
            cnt -= 1
            if cnt == 0:
                break
        
        return data

    def _feature_matrix(self, attr='genre'):
        #feature matrix X, label tensor y 생성
        user_col = torch.tensor(self.data.loc[:,'user'])
        item_col = torch.tensor(self.data.loc[:,'item'])
        attr_col = torch.tensor(self.data.loc[:,attr])

        n_user = len(set(self.data.loc[:,'user']))
        n_item = len(set(self.data.loc[:,'item']))

        self.offsets = [0, n_user, n_user+n_item]
        for col, offset in zip([user_col, item_col, attr_col], self.offsets):
            col += offset

        X = torch.cat([user_col.unsqueeze(1), item_col.unsqueeze(1), attr_col.unsqueeze(1)], dim=1)

        return X.long()

    def decode_offset(self, X):
        user_idx = X[0].item() - self.offsets[0]
        item_idx = X[1].item() - self.offsets[1]
        #attr_idx = X[2] - self.offset[2]

        user = self.user_dict[user_idx]
        item = self.item_dict[item_idx]
        return user, item

    def __getitem__(self, index):
        return self.X[index]

    def get_users(self):
        return len(set(self.data.loc[:, 'user']))

    def get_items(self):
        return len(set(self.data.loc[:, 'item']))

    def get_attributes(self):
        return len(set(self.data.loc[:, 'genre']))

    def __len__(self):
        return len(self.data)

dataset = InferenceDataset('/opt/ml/input/data/train/rating.csv', '/opt/ml/input/data/train/genre.csv')
X = dataset[0]
dataset.decode_offset(X)

Create inference samples
Join attribute dataframe


(11, 2)

In [4]:
class InferenceDataset(Dataset):
    def __init__(self, args, rating_dir, attr_dir):
        self.args = args
        self.rating_dir = rating_dir
        self.attr_dir = attr_dir
        self.data = None
        self.user_dict = None
        self.item_dict = None
        self.offsets = None
        self.X = None
        
        self.setup()
        
    def setup(self):
        rating_df = pd.read_csv(self.rating_dir)
        attr_df = pd.read_csv(self.attr_dir, index_col = 0)
        
        print("Create inference samples")
        data = self._inference_sample(rating_df)
        data = pd.DataFrame(data, columns=["user","item"])

        print('Join attribute dataframe')
        joined_rating_df = pd.merge(data, attr_df, left_on='item', right_on='item', how='inner')

        users = list(set(joined_rating_df.loc[:,'user']))
        users.sort()
        items =  list(set((joined_rating_df.loc[:, 'item'])))
        items.sort()
        attrs =  list(set((joined_rating_df.loc[:, "genre"])))
        attrs.sort()

        if len(users)-1 != max(users):
            users_dict = {users[i]: i for i in range(len(users))}

            self.user_dict = {v:k for k,v in users_dict.items()}

            joined_rating_df['user']  = joined_rating_df['user'].map(lambda x : users_dict[x])
            users = list(set(joined_rating_df.loc[:,'user']))
                
        if len(items)-1 != max(items):
            items_dict = {items[i]: i for i in range(len(items))}
            
            self.item_dict = {v:k for k,v in items_dict.items()}

            joined_rating_df['item']  = joined_rating_df['item'].map(lambda x : items_dict[x])
            items =  list(set((joined_rating_df.loc[:, 'item'])))

        joined_rating_df = joined_rating_df.sort_values(by=['user'])
        joined_rating_df.reset_index(drop=True, inplace=True)
        self.data = joined_rating_df
        
        self.X = self._feature_matrix(self.args.attr)

    def _inference_sample(self,rating_df, attr_df):
        items = set(rating_df['item'])
        user_rating = rating_df.groupby('user')['item'].apply(list)

        data = []

        cnt = 10
        for user, u_items in user_rating.iteritems():
            un_watched = [i for i in items if i not in u_items]
            data += [[user,i] for i in un_watched]
            cnt -= 1
            if cnt == 0:
                break
        
        return data

    def _feature_matrix(self, attr='genre'):
        #feature matrix X, label tensor y 생성
        user_col = torch.tensor(self.data.loc[:,'user'])
        item_col = torch.tensor(self.data.loc[:,'item'])
        attr_col = torch.tensor(self.data.loc[:,attr])

        n_user = len(set(self.data.loc[:,'user']))
        n_item = len(set(self.data.loc[:,'item']))

        self.offsets = [0, n_user, n_user+n_item]
        for col, offset in zip([user_col, item_col, attr_col], self.offsets):
            col += offset

        X = torch.cat([user_col.unsqueeze(1), item_col.unsqueeze(1), attr_col.unsqueeze(1)], dim=1)

        return X.long()

    def decode_offset(self, X):
        user_idx = X[0].item() - self.offsets[0]
        item_idx = X[1].item() - self.offsets[1]
        #attr_idx = X[2] - self.offset[2]

        user = self.user_dict[user_idx]
        item = self.item_dict[item_idx]
        return user, item

    def __getitem__(self, index):
        return self.X[index]

    def get_users(self):
        return len(set(self.data.loc[:, 'user']))

    def get_items(self):
        return len(set(self.data.loc[:, 'item']))

    def get_attributes(self):
        return len(set(self.data.loc[:, self.args.attr]))

    def __len__(self):
        return len(self.data)

6807

AttributeError: 

In [8]:
rating = []

rating.append([11, 0.7, 247])
rating.append([11, 0.9, 1234])
rating.append([11, 0.8, 879])

rating.append([13, 0.7,247])
rating.append([13 ,0.9,1234])
rating.append([13 ,0.8,879])

rating.append([10, 0.7, 247])
rating.append([10, 0.9, 1234])
rating.append([10, 0.8, 879])

rating


[[11, 0.7, 247],
 [11, 0.9, 1234],
 [11, 0.8, 879],
 [13, 0.7, 247],
 [13, 0.9, 1234],
 [13, 0.8, 879]]

In [24]:
ratings = []
ratings.append([0.7,1])
ratings.append([0.1,2])
ratings.append([0.2,3])
ratings.append([0.3,5])
ratings.append([0.9,4])
ratings.append([0.4,6])
ratings.append([0.8,7])

ratings.sort(key=lambda x:x[0])
ratings

[[0.1, 2], [0.2, 3], [0.3, 5], [0.4, 6], [0.7, 1], [0.8, 7], [0.9, 4]]

In [31]:
info = []

for item in ratings[-3:]:
    info.append([11,item[1]])

info = pd.DataFrame(info, columns=['user','item'])
info.to_csv(out_dir,f"submission.csv",index=False)

In [33]:
df.to_csv("test.csv", index=False)