In [2]:
import torch
import torch.nn as nn
import torch.optim as optim

In [None]:
class WideDeep(nn.Module):
    def __init__(self, input_dims, embedding_dim, mlp_dims, drop_rate=0.1):
        super(WideDeep, self).__init__()
        
        self.total_input_dim = int(sum(input_dims))

        self.global_bias = nn.Parameter(torch.zeros((1,)))
        
        # Wide
        self.linaer = nn.Linear(self.total_input_dim, 1, bias=True)

        # Deep
        self.global_bias = nn.Parameter(torch.zeros((1,)))
        # embedding matrix
        self.embedding = nn.Embedding(self.total_input_dim, embedding_dim)
        # layer input
        self.embedding_dim = len(input_dims) * embedding_dim


        mlp_layers = []
        for i, dim in enumerate(mlp_dims):
            if i == 0:
                mlp_layers.append(nn.Linear(self.embedding_dim, dim))
            else:
                mlp_layers.append(nn.Linear(mlp_dims[i-1], dim))
            mlp_layers.append(nn.ReLU(True))
            mlp_layers.append(nn.Dropout(drop_rate))
        mlp_layers.append(nn.Linear(mlp_dims[-1], 1))
        self.mlp_layers = nn.Sequential(*mlp_layers)


    def wide(self, x):
        # x : cross product
        wide_y = nn.Linear(x, 1, bias=True)

        return wide_y


    def deep(self, x):
        # x : (batch_size, )
        embed_x = self.embedding(x)

        inputs = embed_x.view(-1, self.embedding_dim)
        deep_y = self.mlp_layers(inputs)

        return deep_y

    def forward(self, x):
        
        wide_out = self.wide(x) 
        deep_out = self.deep(x)

        assert wide_out.size() == deep_out.size()
        out = wide_out + deep_out

        # joint wide & deep + global bias
        y_pred = torch.sigmoid(out + self.global_bias)
 
        return y_pred


In [None]:
# device = torch.device('cuda')
# input_dims = [n_user, n_item, n_genre] # hyper parameter
# embedding_dim = 10

# model = WideDeep(input_dims, embedding_dim, mlp_dims=[512, 128, 64]).to(device)
# bce_loss = nn.BCELoss() # Binary Cross Entropy loss
# lr, num_epochs = 0.01, 10
# optimizer = optim.Adam(model.parameters(), lr=lr)

In [63]:
import os
data = "../data/"
os.path.join(data, "train/input")


'../data/train/input'

datasets 구성

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import MultiLabelBinarizer

In [4]:
rating_data = "../data/train/train_ratings.csv"

raw_rating_df = pd.read_csv(rating_data)
raw_rating_df.drop(['time'],axis=1,inplace=True)

raw_rating_df.shape

(5154471, 2)

In [33]:
items = raw_rating_df.groupby("user")["item"].apply(list)

In [45]:
print(torch.tensor([256, 37]))
print(int(sum(torch.tensor([256, 37]))))

tensor([256,  37])
293


In [4]:
ten = torch.tensor([256,37])

In [10]:
to = np.array([1,2,3])
to = np.append(to, ten)

In [12]:
x = torch.tensor([31232, 31233, 31234, 31235, 31236, 31237, 31238, 31239, 31240, 31241,
        31242, 31243, 31244, 31245, 31246, 31247, 31248, 31249, 31250, 31251,
        31252, 31253, 31254, 31255, 31256, 31257, 31258, 31259, 31260, 31261,
        31262, 31263, 31264, 31265, 31266, 31267, 31268, 31269, 31270, 31271,
        31272, 31273, 31274, 31275, 31276, 31277, 31278, 31279, 31280, 31281,
        31282, 31283, 31284, 31285, 31286, 31287, 31288, 31289, 31290, 31291,
        31292, 31293, 31294, 31295, 31296, 31297, 31298, 31299, 31300, 31301,
        31302, 31303, 31304, 31305, 31306, 31307, 31308, 31309, 31310, 31311,
        31312, 31313, 31314, 31315, 31316, 31317, 31318, 31319, 31320, 31321,
        31322, 31323, 31324, 31325, 31326, 31327, 31328, 31329, 31330, 31331,
        31332, 31333, 31334, 31335, 31336, 31337, 31338, 31339, 31340, 31341,
        31342, 31343, 31344, 31345, 31346, 31347, 31348, 31349, 31350, 31351,
        31352, 31353, 31354, 31355, 31356, 31357, 31358, 31359])
y = torch.tensor([1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1,
        0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0,
        1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1,
        1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0,
        1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 1])

In [20]:
x

tensor([31232, 31233, 31234, 31235, 31236, 31237, 31238, 31239, 31240, 31241,
        31242, 31243, 31244, 31245, 31246, 31247, 31248, 31249, 31250, 31251,
        31252, 31253, 31254, 31255, 31256, 31257, 31258, 31259, 31260, 31261,
        31262, 31263, 31264, 31265, 31266, 31267, 31268, 31269, 31270, 31271,
        31272, 31273, 31274, 31275, 31276, 31277, 31278, 31279, 31280, 31281,
        31282, 31283, 31284, 31285, 31286, 31287, 31288, 31289, 31290, 31291,
        31292, 31293, 31294, 31295, 31296, 31297, 31298, 31299, 31300, 31301,
        31302, 31303, 31304, 31305, 31306, 31307, 31308, 31309, 31310, 31311,
        31312, 31313, 31314, 31315, 31316, 31317, 31318, 31319, 31320, 31321,
        31322, 31323, 31324, 31325, 31326, 31327, 31328, 31329, 31330, 31331,
        31332, 31333, 31334, 31335, 31336, 31337, 31338, 31339, 31340, 31341,
        31342, 31343, 31344, 31345, 31346, 31347, 31348, 31349, 31350, 31351,
        31352, 31353, 31354, 31355, 31356, 31357, 31358, 31359])

In [23]:
for i, j in zip(x, y):
    if i == 31232:
        print([int(i),int(j)])

[31232, 1]


In [79]:
seq = np.random.choice(items[11], 1, replace=True)
seq[0]

1255

In [80]:
seq

array([1255])

In [91]:
item_genres[0]

[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]

In [6]:
positive_next = np.random.choice(list(datasets - set(seq)), 1, replace=False)
positive_next

NameError: name 'datasets' is not defined

In [109]:
box = [12]

In [110]:
box.extend(sum_genre[sum_genre["item"]==positive_next[0]].iloc[:,1:].values[0])
box

[12, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0]

In [23]:
sum_genre[sum_genre["item"]==positive_next[0]].iloc[:,1:].values[0]

NameError: name 'positive_next' is not defined

In [22]:
user = 22141
mask_prob = 0.5
seq = np.random.choice(items[user], 1, replace=True)
target = []

token = [user, seq[0]]
seq = np.random.choice(items[user], 1, replace=True)
item_genres = sum_genre[sum_genre["item"]==seq[0]].iloc[:,1:].values.tolist()
token.extend(*item_genres)
prob = np.random.random()
if prob < mask_prob:
    #positive_case
    positive_next = np.random.choice(list(datasets - set(seq)), 1, replace=False)
    token.append(positive_next[0])
    token.extend(sum_genre[sum_genre["item"]==positive_next[0]].iloc[:,1:].values[0])
    target.append(1)

else:
    negative_next = np.random.choice(list(datasets - set(seq)), 1, replace=False)
    token.append(negative_next[0])
    token.extend(sum_genre[sum_genre["item"]==negative_next[0]].iloc[:,1:].values[0])
    target.append(0)

print(token)
print(target)

[22141, 3699, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 48943, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0]
[0]


In [28]:
item_ids = raw_rating_df['item'].unique()
n_items = len(item_ids)
item2idx = pd.Series(data=np.arange(n_items), index=item_ids)
def genre_items_mulithot(genre_data):
    # gnre mulit-hot encoding
    genre_dict = {genre:i for i, genre in enumerate(set(genre_data['genre']))}
    genre_data['genre']  = genre_data['genre'].map(lambda x : genre_dict[x])
    sum_genre = list()
    for item in item_ids:
        sum_genre.append([item, genre_data[genre_data['item']==item]['genre'].values])
    sum_genre = pd.DataFrame(sum_genre , columns=['item', 'genre'])
    
    # Mulit-Labeling
    mlb = MultiLabelBinarizer()
    genre_label = mlb.fit_transform(sum_genre['genre'])
    sum_genre = pd.concat([sum_genre['item'],pd.DataFrame(genre_label, columns=genre_dict)], axis = 1)
    sum_genre = pd.merge(sum_genre, pd.DataFrame({'item': item_ids, 'item_idx': item2idx[item_ids].values}), on='item', how='inner')
    sum_genre.sort_values(['item_idx'], inplace=True)
    del sum_genre['item']
    
    return sum_genre 

In [29]:
genre_data = "../data/train/genres.tsv"

raw_genre_df = pd.read_csv(genre_data, sep='\t')
genre = genre_items_mulithot(raw_genre_df)
genre.head()

Unnamed: 0,Western,War,Action,Romance,Musical,Fantasy,Comedy,Horror,Thriller,Mystery,Animation,Sci-Fi,Drama,Adventure,Children,Documentary,Crime,Film-Noir,item_idx
0,0,0,1,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0
1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,1
2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,2
3,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,3
4,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,4


In [7]:
train_data = pd.concat({k: pd.Series(v) for k, v in train_items.items()}) 
train_data = train_data.reset_index(0) 
train_data.columns = ['user', 'item']

NameError: name 'train_items' is not defined

In [8]:
genre_data = "../data/train/genres.tsv"

raw_genre_df = pd.read_csv(genre_data, sep='\t')

genre_dict = {genre:i for i, genre in enumerate(set(raw_genre_df['genre']))}
raw_genre_df['genre']  = raw_genre_df['genre'].map(lambda x : genre_dict[x])

raw_genre_df

Unnamed: 0,item,genre
0,318,16
1,318,12
2,2571,2
3,2571,11
4,2571,8
...,...,...
15928,109850,12
15929,8605,2
15930,8605,6
15931,3689,6


In [9]:
item_list = set(raw_genre_df['item'].unique())
sum_genre = list()
for item in item_list:
    sum_genre.append([item, raw_genre_df[raw_genre_df['item']==item]['genre'].values])

sum_genre = pd.DataFrame(sum_genre , columns=['item', 'genre'])

In [10]:
mlb = MultiLabelBinarizer()
genre_label = mlb.fit_transform(sum_genre['genre'])

In [11]:
sum_genre

Unnamed: 0,item,genre
0,1,"[13, 10, 14, 6, 5]"
1,32770,[12]
2,2,"[13, 14, 5]"
3,3,"[6, 3]"
4,5,[6]
...,...,...
6802,98243,"[13, 10, 5]"
6803,32721,"[2, 12, 1]"
6804,32728,"[12, 9, 8]"
6805,32743,"[12, 7, 8]"


In [12]:
sum_genre = pd.concat([sum_genre['item'],pd.DataFrame(genre_label, columns=genre_dict)], axis = 1)
sum_genre

Unnamed: 0,item,Western,War,Action,Romance,Musical,Fantasy,Comedy,Horror,Thriller,Mystery,Animation,Sci-Fi,Drama,Adventure,Children,Documentary,Crime,Film-Noir
0,1,0,0,0,0,0,1,1,0,0,0,1,0,0,1,1,0,0,0
1,32770,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,2,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0
3,3,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6802,98243,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0
6803,32721,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
6804,32728,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0
6805,32743,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0


In [13]:
len(sum_genre['item'].unique())

6807

In [14]:
sum_genre[sum_genre['item'] == 1].iloc[0,1:].values.tolist()

[0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0]

In [33]:
token_in = ['user','item','next','label']
token_in.extend(*(sum_genre[sum_genre['item'] == 1].iloc[:,1:].values.tolist()))
token_in

['user',
 'item',
 'next',
 'label',
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1]

In [33]:
print("Create Nagetive instances")
num_negative = 50 # item 수가 50보다 적으면 negative 가 없다
user_group_dfs = list(raw_rating_df.groupby('user')['item'])

user_rating_df = raw_rating_df
user_rating_df['n_item'] = 0
user_rating_df['target'] = 1

Create Nagetive instances


In [None]:

for u, u_items in tqdm(user_group_dfs):
    u_items = set(u_items)
    i_user_neg_item = np.random.choice(list(item_list - u_items), num_negative, replace=False)
     
    # i_user_neg_df = pd.DataFrame({'user': [u]*num_negative,
    #                               'n_item' : i_user_neg_item, 
    #                               'target': [0]*num_negative})
    i_user_neg_df = pd.DataFrame({"n_item" : i_user_neg_item})
  
    user_rating_df[user_rating_df['user'] ==u].iloc[:len(i_user_neg_item), 'n_item']

random, item들을 유저별로 연관기반 규칙으로 left 값이 가장 높은 조합끼리만 뽑는다.
Top1 - 1:1
Top2 - 1:2

Datasets

In [None]:
import os
import pandas as pd

from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import MultiLabelBinarizer


class WDDataset(Dataset):
    def __init__(self, path='../data/', mode='train'):

        # data Path
        if mode == 'train':
            self.data_path = os.path.join(path, 'train/train_rating.csv')
        df = pd.read_csv(self.data_path)

        #genre_data
        self.genre_df = pd.read_csv(os.path.join(path, 'train/genres.tsv'), sep='\t')
        self.genre_mulit = genre_items_mulithot(self.genre_df)

        # for submission
        self.rating_df = df.copy()
        self.train_df = len(df)

        # get unique user and tiems
        user_ids, item_ids = df['user'].unique(), df['item'].unique()

        # get number of users and items
        self.n_users, self_n_items = len(self.user_ids), len(self.item_ids)
        self.n_train, self.n_test = 0, 0
        self.neg_pools = {}

        # user, item indexing
        item2idx = pd.Series(data=np.arange(len(item_ids)), index=item_ids) # item re-indexing
        user2idx = pd.Series(data=np.arange(len(user_ids)), index=user_ids) # user re-indexing (0~num_user-1)

        # dataframe indexing
        df = pd.merge(df, pd.DataFrame({'item': item_ids, 'item_idx': item2idx[item_ids].values}), on='item', how='inner')
        df = pd.merge(df, pd.DataFrame({'user': user_ids, 'user_idx': user2idx[user_ids].values}), on='user', how='inner')
        df.sort_values(['user_idx', 'time'], inplace=True)
        del df['item'], df['user']
        
        self.exist_users = list(df['user_idx'].unique())

        items = df.groupby("user_idx")["item_idx"].apply(list) # user_id : [item list]


    def genre_items_mulithot(self, genre_data):
        # gnre mulit-hot encoding
        genre_dict = {genre:i for i, genre in enumerate(set(genre_data['genre']))}
        genre_data['genre']  = genre_data['genre'].map(lambda x : genre_dict[x])
        

        genre_label = mlb.fit_transform(sum_genre['genre'])

        return genre_data

    def sample_pos_items_for_u(self, u, num):

        return pos_batch

    def sample_next_items_for_u(self, u, num):

        return next_item, rating



    def __getitem__(self, index): 
        user = self.exist_users[idx] # 유저 idx 목록
        pos_item = self.sample_pos_items_for_u(user, 1)[0]
        next_item, rating = self.sample_next_items_for_u(user, 1)[0]
        return user, pos_item, next_item, rating

    def __len__(self):
        return



In [55]:
user_rating_df.groupby("user")["item"].apply(list)

user
11        [4643, 170, 531, 616, 2140, 2722, 2313, 2688, ...
14        [8961, 1396, 471, 2105, 1042, 1947, 1269, 2394...
18        [1952, 1283, 3507, 4280, 51084, 593, 318, 356,...
25        [261, 22, 2161, 3255, 372, 1093, 428, 175, 214...
31        [260, 1196, 1210, 7153, 4993, 5952, 1270, 5855...
                                ...                        
138473    [524, 3354, 1025, 6565, 69757, 2085, 32, 55282...
138475    [1639, 1673, 1148, 246, 2019, 1267, 1172, 1235...
138486    [2694, 1994, 2723, 441, 2288, 637, 2013, 2423,...
138492    [2115, 908, 58, 2700, 2599, 1500, 1358, 1288, ...
138493    [3174, 2872, 48780, 2662, 2840, 1566, 2857, 20...
Name: item, Length: 31360, dtype: object

In [None]:
user_rating_df = raw_rating_df
user_rating_df['n_item'] = 0
user_rating_df['target'] = 1
for user in user_group_dfs['user'].unique():
    u_neg_items = set(user_neg_dfs[user_neg_dfs['user']==user]['item'])
    for neg_item in u_neg_items: # negative items


In [16]:
user_neg_dfs[user_neg_dfs['item']==837]

Unnamed: 0,user,item,target
3,11,837,0
44,482,837,0
26,935,837,0
32,1906,837,0
47,2081,837,0
...,...,...,...
29,136864,837,0
1,136968,837,0
13,137129,837,0
37,137552,837,0


In [14]:
set(user_neg_dfs[user_neg_dfs['user']==11]['item'])

{837,
 999,
 1124,
 1304,
 1355,
 1994,
 2391,
 2433,
 2709,
 2967,
 3068,
 3117,
 3253,
 3637,
 3984,
 4002,
 4068,
 4969,
 5826,
 6031,
 6077,
 6101,
 6264,
 6820,
 7013,
 7162,
 7573,
 8199,
 8831,
 25750,
 26467,
 27584,
 27788,
 27816,
 44864,
 46970,
 48322,
 49200,
 53121,
 53468,
 54648,
 58351,
 59440,
 77330,
 81910,
 84954,
 85056,
 89745,
 93740,
 97024}

In [149]:
sum_genre_user = pd.concat([raw_rating_df, user_neg_dfs], axis=0, sort=False)
sum_genre_user = pd.merge(sum_genre_user, sum_genre, on = 'item')
sum_genre_user

Unnamed: 0,user,item,target,Documentary,Animation,Horror,Fantasy,Sci-Fi,War,Thriller,...,Comedy,Romance,Musical,Crime,Mystery,Action,Western,Drama,Children,Film-Noir
0,11,4643,1,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,1,0,0
1,189,4643,1,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,1,0,0
2,294,4643,1,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,1,0,0
3,383,4643,1,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,1,0,0
4,421,4643,1,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6722466,135336,102880,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
6722467,136172,102880,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
6722468,136770,102880,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
6722469,137678,102880,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0


In [64]:
sum_genre_user

Unnamed: 0,user,item,target,Documentary,Animation,Horror,Fantasy,Sci-Fi,War,Thriller,...,Comedy,Romance,Musical,Crime,Mystery,Action,Western,Drama,Children,Film-Noir
0,11,4643,1,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,1,0,0
1,189,4643,1,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,1,0,0
2,294,4643,1,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,1,0,0
3,383,4643,1,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,1,0,0
4,421,4643,1,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6722466,135336,102880,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
6722467,136172,102880,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
6722468,136770,102880,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
6722469,137678,102880,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0


In [None]:
{user : [item]} 


In [65]:
sum_genre_user[sum_genre_user['user']==11]
# positive set
# negative set apply.lambda

Unnamed: 0,user,item,target,Documentary,Animation,Horror,Fantasy,Sci-Fi,War,Thriller,...,Comedy,Romance,Musical,Crime,Mystery,Action,Western,Drama,Children,Film-Noir
0,11,4643,1,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,1,0,0
1364,11,170,1,0,0,0,0,0,0,1,...,0,0,0,1,0,1,0,0,0,0
2686,11,531,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
3875,11,616,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
5188,11,2140,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6478732,11,66547,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,1,0,0
6520581,11,550,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
6614147,11,60948,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
6624692,11,6816,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [3]:
sum_genre_user = pd.read_csv("../data/train/sum_genre_user.csv")

sum_genre_user[sum_genre_user['user'] == 11]

Unnamed: 0,user,item,target,Documentary,Animation,Horror,Fantasy,Sci-Fi,War,Thriller,...,Comedy,Romance,Musical,Crime,Mystery,Action,Western,Drama,Children,Film-Noir
0,11,4643,1,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,1,0,0
1364,11,170,1,0,0,0,0,0,0,1,...,0,0,0,1,0,1,0,0,0,0
2686,11,531,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
3875,11,616,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
5188,11,2140,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6478732,11,66547,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,1,0,0
6520581,11,550,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
6614147,11,60948,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
6624692,11,6816,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [4]:
sum_genre_user['user'].unique()

array([    11,    189,    294, ...,  85909, 136897, 116859])