## item_feature vector 생성
- tf-idf를 사용하여 side information을 적용한 각 영화의 feature vector를 생성해주었습니다.
    - side information : title, genre
- 그렇게 생성한 item_feature vector를 이용하여, 각 유저가 시청한 영화들의 기록을 이용하여 user_feature vector도 생성하였습니다.
- item_feature.npy와 user_feature.npy를 저장합니다.
    - item_feature.npy와 user_feature.npy는 이 노트북과 같은 경로에 생성됩니다.
    - recvae_dae_vae_inference(+Content_Based).py 파일과 item_feature.npy, user_feature.npy이 같은 경로 안에 존재해야 합니다.

In [62]:
import argparse
import time
import os
import re

import torch
import torch.nn as nn
import torch.nn.functional as F
import bottleneck as bn
import adabound
import wandb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from scipy import sparse

In [63]:
## 각종 파라미터 세팅
parser = argparse.ArgumentParser(description='PyTorch Variational Autoencoders for Collaborative Filtering')


parser.add_argument('--data', type=str, default='/opt/ml/input/data/train/',
                    help='Movielens dataset location')
parser.add_argument('--lr', type=float, default=1e-4,
                    help='initial learning rate')
parser.add_argument('--wd', type=float, default=0.00,
                    help='weight decay coefficient')
parser.add_argument('--batch_size', type=int, default=500,
                    help='batch size')
parser.add_argument('--epochs', type=int, default=50, #원래 20
                    help='upper epoch limit')
parser.add_argument('--total_anneal_steps', type=int, default=200000,
                    help='the total number of gradient updates for annealing')
parser.add_argument('--anneal_cap', type=float, default=0.2,
                    help='largest annealing parameter')
parser.add_argument('--seed', type=int, default=1111,
                    help='random seed')
parser.add_argument('--cuda', action='store_true',
                    help='use CUDA')
parser.add_argument('--log_interval', type=int, default=100, metavar='N',
                    help='report interval')
parser.add_argument('--dae_save', type=str, default='multi_dae_model.pt',
                    help='path to save the final model')
parser.add_argument('--vae_save', type=str, default='multi_vae_model.pt',
                    help='path to save the final model')
args = parser.parse_args([])

In [64]:
# Set the random seed manually for reproductibility.
torch.manual_seed(args.seed)

#만약 GPU가 사용가능한 환경이라면 GPU를 사용
if torch.cuda.is_available():
    args.cuda = True

device = torch.device("cuda" if args.cuda else "cpu")
print(device)

cuda


### 데이터 전처리
- item_feature를 생성하기 위해 raw_data를 처리해야 합니다.
- Multi-VAE 노트북의 데이터 전처리 과정을 그대로 사용했습니다.

In [65]:
def get_count(tp, id):
    playcount_groupbyid = tp[[id]].groupby(id, as_index=False)
    count = playcount_groupbyid.size()

    return count

In [66]:
def filter_triplets(tp, min_uc=5, min_sc=0):
    if min_sc > 0:
        itemcount = get_count(tp, 'item')
        tp = tp[tp['item'].isin(itemcount.index[itemcount >= min_sc])]

    if min_uc > 0:
        usercount = get_count(tp, 'user')
        tp = tp[tp['user'].isin(usercount.index[usercount >= min_uc])]

    usercount, itemcount = get_count(tp, 'user'), get_count(tp, 'item')
    return tp, usercount, itemcount

In [67]:
def split_train_test_proportion(data, test_prop=0.2): #원래 0.2
    data_grouped_by_user = data.groupby('user')
    tr_list, te_list = list(), list()

    np.random.seed(98765)

    for _, group in data_grouped_by_user:
        n_items_u = len(group)

        if n_items_u >= 5:
            idx = np.zeros(n_items_u, dtype='bool')
            idx[np.random.choice(n_items_u, size=int(test_prop * n_items_u), replace=False).astype('int64')] = True

            tr_list.append(group[np.logical_not(idx)])
            te_list.append(group[idx])

        else:
            tr_list.append(group)

    data_tr = pd.concat(tr_list)
    data_te = pd.concat(te_list)

    return data_tr, data_te

In [68]:
def numerize(tp, profile2id, show2id):
    uid = tp['user'].apply(lambda x: profile2id[x])
    sid = tp['item'].apply(lambda x: show2id[x])
    return pd.DataFrame(data={'uid': uid, 'sid': sid}, columns=['uid', 'sid'])

In [69]:
# Load Data
print("Load and Preprocess Movielens dataset")
DATA_DIR = args.data
raw_data = pd.read_csv(os.path.join(DATA_DIR, 'train_ratings.csv'), header=0)
print("원본 데이터\n", raw_data)

Load and Preprocess Movielens dataset
원본 데이터
            user   item        time
0            11   4643  1230782529
1            11    170  1230782534
2            11    531  1230782539
3            11    616  1230782542
4            11   2140  1230782563
...         ...    ...         ...
5154466  138493  44022  1260209449
5154467  138493   4958  1260209482
5154468  138493  68319  1260209720
5154469  138493  40819  1260209726
5154470  138493  27311  1260209807

[5154471 rows x 3 columns]


In [70]:
raw_data[raw_data['user']==27968]

Unnamed: 0,user,item,time
1059595,27968,2997,1146358272
1059596,27968,912,1146358274
1059597,27968,223,1146358278
1059598,27968,2959,1146358280
1059599,27968,1221,1146358286
...,...,...,...
1059763,27968,916,1146360616
1059764,27968,1704,1146360751
1059765,27968,3071,1146360755
1059766,27968,2183,1146360829


In [71]:
# Filter Data
raw_data, user_activity, item_popularity = filter_triplets(raw_data, min_uc=5, min_sc=0)
#제공된 훈련데이터의 유저는 모두 5개 이상의 리뷰가 있습니다.
print("5번 이상의 리뷰가 있는 유저들로만 구성된 데이터\n",raw_data)

print("유저별 리뷰수\n",user_activity)
print("아이템별 리뷰수\n",item_popularity)

5번 이상의 리뷰가 있는 유저들로만 구성된 데이터
            user   item        time
0            11   4643  1230782529
1            11    170  1230782534
2            11    531  1230782539
3            11    616  1230782542
4            11   2140  1230782563
...         ...    ...         ...
5154466  138493  44022  1260209449
5154467  138493   4958  1260209482
5154468  138493  68319  1260209720
5154469  138493  40819  1260209726
5154470  138493  27311  1260209807

[5154471 rows x 3 columns]
유저별 리뷰수
 user
11        376
14        180
18         77
25         91
31        154
         ... 
138473     63
138475    124
138486    137
138492     68
138493    314
Length: 31360, dtype: int64
아이템별 리뷰수
 item
1         12217
2          3364
3           734
4            43
5           590
          ...  
118700       54
118900       60
118997       52
119141      122
119145       78
Length: 6807, dtype: int64


In [72]:
# Shuffle User Indices
unique_uid = user_activity.index
print("(BEFORE) unique_uid:", unique_uid)
np.random.seed(98765)
idx_perm = np.random.permutation(unique_uid.size)
unique_uid = unique_uid[idx_perm]
print("(AFTER) unique_uid:",unique_uid)

n_users = unique_uid.size #31360
n_heldout_users = 3136 #3000

(BEFORE) unique_uid: Int64Index([    11,     14,     18,     25,     31,     35,     43,     50,
                58,     60,
            ...
            138459, 138461, 138470, 138471, 138472, 138473, 138475, 138486,
            138492, 138493],
           dtype='int64', name='user', length=31360)
(AFTER) unique_uid: Int64Index([ 27968,  67764,   2581,  82969, 137831,  48639,  97870,  40424,
             46835,  79570,
            ...
            114284,   9009,  21165,  33920,  22054, 135379, 125855,  41891,
             15720,  17029],
           dtype='int64', name='user', length=31360)


In [73]:
# Split Train/Validation/Test User Indices
tr_users = unique_uid[:(n_users - n_heldout_users * 2)]
vd_users = unique_uid[(n_users - n_heldout_users * 2): (n_users - n_heldout_users)]
te_users = unique_uid[(n_users - n_heldout_users):]

#주의: 데이터의 수가 아닌 사용자의 수입니다.
print("훈련 데이터에 사용될 사용자 수:", len(tr_users))
print("검증 데이터에 사용될 사용자 수:", len(vd_users))
print("테스트 데이터에 사용될 사용자 수:", len(te_users))

훈련 데이터에 사용될 사용자 수: 25088
검증 데이터에 사용될 사용자 수: 3136
테스트 데이터에 사용될 사용자 수: 3136


In [74]:
##훈련 데이터에 해당하는 아이템들
#Train에는 전체 데이터를 사용합니다.
train_plays = raw_data.loc[raw_data['user'].isin(tr_users)]

##아이템 ID
unique_sid = pd.unique(train_plays['item'])

show2id = dict((sid, i) for (i, sid) in enumerate(unique_sid))
profile2id = dict((pid, i) for (i, pid) in enumerate(unique_uid))

pro_dir = os.path.join(DATA_DIR, 'pro_sg')

if not os.path.exists(pro_dir):
    os.makedirs(pro_dir)

with open(os.path.join(pro_dir, 'unique_sid.txt'), 'w') as f:
    for sid in unique_sid:
        f.write('%s\n' % sid)

In [75]:
#Validation과 Test에는 input으로 사용될 tr 데이터와 정답을 확인하기 위한 te 데이터로 분리되었습니다.
vad_plays = raw_data.loc[raw_data['user'].isin(vd_users)]
vad_plays = vad_plays.loc[vad_plays['item'].isin(unique_sid)]
vad_plays_tr, vad_plays_te = split_train_test_proportion(vad_plays)

test_plays = raw_data.loc[raw_data['user'].isin(te_users)]
test_plays = test_plays.loc[test_plays['item'].isin(unique_sid)]
test_plays_tr, test_plays_te = split_train_test_proportion(test_plays)

In [76]:
train_data = numerize(train_plays, profile2id, show2id)
train_data.to_csv(os.path.join(pro_dir, 'train.csv'), index=False)


vad_data_tr = numerize(vad_plays_tr, profile2id, show2id)
vad_data_tr.to_csv(os.path.join(pro_dir, 'validation_tr.csv'), index=False)

vad_data_te = numerize(vad_plays_te, profile2id, show2id)
vad_data_te.to_csv(os.path.join(pro_dir, 'validation_te.csv'), index=False)

test_data_tr = numerize(test_plays_tr, profile2id, show2id)
test_data_tr.to_csv(os.path.join(pro_dir, 'test_tr.csv'), index=False)

test_data_te = numerize(test_plays_te, profile2id, show2id)
test_data_te.to_csv(os.path.join(pro_dir, 'test_te.csv'), index=False)

print("Done!")

Done!


In [77]:
print(raw_data['user'].nunique())
numer_raw = numerize(raw_data, profile2id, show2id)
print(raw_data)
print(numer_raw)

31360
           user   item        time
0            11   4643  1230782529
1            11    170  1230782534
2            11    531  1230782539
3            11    616  1230782542
4            11   2140  1230782563
...         ...    ...         ...
5154466  138493  44022  1260209449
5154467  138493   4958  1260209482
5154468  138493  68319  1260209720
5154469  138493  40819  1260209726
5154470  138493  27311  1260209807

[5154471 rows x 3 columns]
           uid   sid
0        11825     0
1        11825     1
2        11825     2
3        11825     3
4        11825     4
...        ...   ...
5154466  10783   477
5154467  10783  1325
5154468  10783   331
5154469  10783   558
5154470  10783  1922

[5154471 rows x 2 columns]


### Genre information 처리

In [78]:
gen = pd.read_csv("/opt/ml/input/data/train/genres.tsv", delimiter='\t')
gen.head()

Unnamed: 0,item,genre
0,318,Crime
1,318,Drama
2,2571,Action
3,2571,Sci-Fi
4,2571,Thriller


In [79]:
def gen_numerize(tp, show2id):
    sid = tp['item'].apply(lambda x: show2id[x])
    return sid

In [80]:
gen['item'] = gen_numerize(gen, show2id)
gen.head()

Unnamed: 0,item,genre
0,198,Crime
1,198,Drama
2,82,Action
3,82,Sci-Fi
4,82,Thriller


In [81]:
gen.replace('Sci-Fi','science_fiction',inplace=True)
gen.replace('Film-Noir','Film_Noir',inplace=True)
gen

Unnamed: 0,item,genre
0,198,Crime
1,198,Drama
2,82,Action
3,82,science_fiction
4,82,Thriller
...,...,...
15928,6763,Drama
15929,5046,Action
15930,5046,Comedy
15931,5508,Comedy


In [82]:
numer_raw.columns = ['user', 'item']
print(numer_raw['user'].nunique())
numer_raw

31360


Unnamed: 0,user,item
0,11825,0
1,11825,1
2,11825,2
3,11825,3
4,11825,4
...,...,...
5154466,10783,477
5154467,10783,1325
5154468,10783,331
5154469,10783,558


In [83]:
temp_mat = pd.DataFrame.merge(numer_raw, gen, on='item')
print(temp_mat['user'].nunique())
temp_mat

31360


Unnamed: 0,user,item,genre
0,11825,0,Action
1,11825,0,Adventure
2,11825,0,Drama
3,11825,0,science_fiction
4,30224,0,Action
...,...,...,...
14126319,17505,6798,Adventure
14126320,17505,6798,science_fiction
14126321,8540,6798,Action
14126322,8540,6798,Adventure


In [84]:
temp_mat[temp_mat.sort_values('user')['user'] == 0]['item'].nunique()

  temp_mat[temp_mat.sort_values('user')['user'] == 0]['item'].nunique()


173

### Title information 처리

In [85]:
title = pd.read_csv("/opt/ml/input/data/train/titles.tsv", delimiter='\t')

In [86]:
print(title)

        item                                              title
0        318                   Shawshank Redemption, The (1994)
1       2571                                 Matrix, The (1999)
2       2959                                  Fight Club (1999)
3        296                                Pulp Fiction (1994)
4        356                                Forrest Gump (1994)
...      ...                                                ...
6802   73106  American Pie Presents: The Book of Love (Ameri...
6803  109850                              Need for Speed (2014)
6804    8605                                      Taxi 3 (2003)
6805    3689                    Porky's II: The Next Day (1983)
6806    8130                         Girl Next Door, The (1999)

[6807 rows x 2 columns]


In [87]:
def title_numerize(tp, show2id):
    sid = tp['item'].apply(lambda x: show2id[x])
    return sid

In [88]:
title['item'] = title_numerize(title, show2id)
print(title)

      item                                              title
0      198                   Shawshank Redemption, The (1994)
1       82                                 Matrix, The (1999)
2      260                                  Fight Club (1999)
3      264                                Pulp Fiction (1994)
4      265                                Forrest Gump (1994)
...    ...                                                ...
6802  3396  American Pie Presents: The Book of Love (Ameri...
6803  6763                              Need for Speed (2014)
6804  5046                                      Taxi 3 (2003)
6805  5508                    Porky's II: The Next Day (1983)
6806  5531                         Girl Next Door, The (1999)

[6807 rows x 2 columns]


In [89]:
nltk.download('stopwords')
nltk.download('punkt')

stops = stopwords.words('english')

[nltk_data] Downloading package stopwords to /opt/ml/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /opt/ml/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [90]:
new_title = []

for item, text in title.values:
    new_text = ''
    if re.search(r'\([0-9]{4}\)', text[-6:]):
        new_text = text[:-6]
    else:
        new_text = text[:text.rfind('(')]

    new_text = re.compile("[,:]").sub("", new_text)
    # filtered_tokens = [token for token in word_tokenize(new_text) if token.lower() not in stops]
    # new_title.append([item, ' '.join(filtered_tokens)])
    new_title.append([item, new_text])

new_title_df = pd.DataFrame(new_title, columns = ['item', 'title'])

In [91]:
new_title_df

Unnamed: 0,item,title
0,198,Shawshank Redemption The
1,82,Matrix The
2,260,Fight Club
3,264,Pulp Fiction
4,265,Forrest Gump
...,...,...
6802,3396,American Pie Presents The Book of Love (Americ...
6803,6763,Need for Speed
6804,5046,Taxi 3
6805,5508,Porky's II The Next Day


In [92]:
user_item_df = pd.DataFrame.merge(temp_mat, new_title_df, on='item')

In [93]:
user_item_df['item'].nunique()

6807

In [94]:
user_item_df

Unnamed: 0,user,item,genre,title
0,11825,0,Action,Planet of the Apes
1,11825,0,Adventure,Planet of the Apes
2,11825,0,Drama,Planet of the Apes
3,11825,0,science_fiction,Planet of the Apes
4,30224,0,Action,Planet of the Apes
...,...,...,...,...
14126319,17505,6798,Adventure,After Earth
14126320,17505,6798,science_fiction,After Earth
14126321,8540,6798,Action,After Earth
14126322,8540,6798,Adventure,After Earth


### directors information 처리(최종 item_feature에는 사용 X)

In [95]:
directors  = pd.read_csv("/opt/ml/input/data/train/directors.tsv", delimiter='\t')

In [96]:
def director_numerize(tp, show2id):
    sid = tp['item'].apply(lambda x: show2id[x])
    return sid

In [97]:
directors['item'] = director_numerize(directors, show2id)
print(directors)

      item   director
0      917  nm0000005
1     1859  nm0000005
2     1786  nm0000005
3     3694  nm0000005
4     2756  nm0000005
...    ...        ...
5900  5530  nm2879822
5901  2769  nm2879822
5902  3731  nm2879822
5903  1681  nm9054338
5904  1937  nm9054338

[5905 rows x 2 columns]


In [98]:
user_item_df = pd.DataFrame.merge(user_item_df, directors, how='outer',on = 'item')

In [99]:
user_item_df = user_item_df.sort_values('user').append({'user': 31360, 'item' : 0, 'genre' : '0', 'title' : '0', 'director':'0'}, ignore_index=True)
user_item_df

Unnamed: 0,user,item,genre,title,director
0,0,753,War,Bridge on the River Kwai The,nm0000180
1,0,2218,Crime,Big Sleep The,nm0001328
2,0,2435,Thriller,Rebecca,nm0000033
3,0,3977,Thriller,Getaway The,nm0001603
4,0,1071,Drama,Gandhi,nm0000277
...,...,...,...,...,...
16135953,31359,962,Comedy,O Brother Where Art Thou?,nm0001054
16135954,31359,494,Comedy,Bend It Like Beckham,nm0149446
16135955,31359,2650,Thriller,Third Man The,nm0715346
16135956,31359,1235,Romance,Forgetting Sarah Marshall,nm0831557


In [100]:
user_item_df['user'].nunique()

31361

In [101]:
user_item_df['director'].isnull().sum().sum()

726676

### user_feature를 생성하기 위한 user별 시청 기록 생성

In [102]:
user_item_df

Unnamed: 0,user,item,genre,title,director
0,0,753,War,Bridge on the River Kwai The,nm0000180
1,0,2218,Crime,Big Sleep The,nm0001328
2,0,2435,Thriller,Rebecca,nm0000033
3,0,3977,Thriller,Getaway The,nm0001603
4,0,1071,Drama,Gandhi,nm0000277
...,...,...,...,...,...
16135953,31359,962,Comedy,O Brother Where Art Thou?,nm0001054
16135954,31359,494,Comedy,Bend It Like Beckham,nm0149446
16135955,31359,2650,Thriller,Third Man The,nm0715346
16135956,31359,1235,Romance,Forgetting Sarah Marshall,nm0831557


In [103]:
user_item = np.array(user_item_df)
user_item

array([[0, 753, 'War', 'Bridge on the River Kwai The ', 'nm0000180'],
       [0, 2218, 'Crime', 'Big Sleep The ', 'nm0001328'],
       [0, 2435, 'Thriller', 'Rebecca ', 'nm0000033'],
       ...,
       [31359, 2650, 'Thriller', 'Third Man The ', 'nm0715346'],
       [31359, 1235, 'Romance', 'Forgetting Sarah Marshall ',
        'nm0831557'],
       [31360, 0, '0', '0', '0']], dtype=object)

In [104]:
user_item_list = []
temp_item_list = []

prev_user = 0
prev_item =''

for user, item, genre, title, director in user_item:
    cur_user = user
    if prev_user != cur_user:
        user_item_list.append(temp_item_list)
        temp_item_list = []
        prev_item =''
    
    if prev_item != item:
        temp_item_list.extend([item])

    prev_user = cur_user
    prev_item = item


In [105]:
len(user_item_list[0])

344

### item_feature 생성 과정

In [106]:
item_user_df = pd.DataFrame.merge(new_title_df, gen, how='outer', on ='item')
item_user_df = pd.DataFrame.merge(item_user_df, directors, how='outer', on='item')
item_user_df = item_user_df.sort_values('item').append({'item':6807,'title':'0','genre':'0','director':'0'},ignore_index=True)
item_user_df

Unnamed: 0,item,title,genre,director
0,0,Planet of the Apes,Drama,nm0000318
1,0,Planet of the Apes,Adventure,nm0000318
2,0,Planet of the Apes,science_fiction,nm0000318
3,0,Planet of the Apes,Action,nm0000318
4,1,Hackers,Thriller,nm0812200
...,...,...,...,...
17096,6804,Rebound The,Romance,
17097,6804,Rebound The,Comedy,
17098,6805,Selma,Drama,
17099,6806,Dickie Roberts Former Child Star,Comedy,nm0918873


In [107]:
item_user = np.array(item_user_df.sort_values('item'))
item_user

array([[0, 'Planet of the Apes ', 'Drama', 'nm0000318'],
       [0, 'Planet of the Apes ', 'Adventure', 'nm0000318'],
       [0, 'Planet of the Apes ', 'science_fiction', 'nm0000318'],
       ...,
       [6805, 'Selma ', 'Drama', nan],
       [6806, 'Dickie Roberts Former Child Star ', 'Comedy', 'nm0918873'],
       [6807, '0', '0', '0']], dtype=object)

In [108]:
item_genre_list = []
item_title_list = []
item_director_list = []
item_list = [0]

temp_movie_list = []
temp_genre_list = []
temp_title_list = []
temp_director_list = []

prev_item = 0
prev_title = ''
prev_director = ''
prev_genre = ''
for item, title, genre, director in item_user:
    cur_item = item
    if prev_item != cur_item:
        item_genre_list.append(temp_genre_list)
        item_title_list.append(temp_title_list)
        item_director_list.append(temp_director_list)

        temp_genre_list = []
        temp_director_list = []
        temp_title_list = []

        prev_director = ''
        prev_genre = ''
        prev_title = ''
    
    if prev_genre != genre: 
        temp_genre_list.extend([genre])
    
    if prev_director != director:
        temp_director_list.extend([director])
    if prev_title != title:
        temp_title_list.extend([title])
    
    prev_item = cur_item
    prev_director = director
    prev_title = title
    prev_genre = genre


In [109]:
literal_gen_item = []
for i in item_genre_list:
    literal_gen_item.extend([" ".join(np.array(i).flatten())])

In [110]:
item_genre_list_df = pd.DataFrame(literal_gen_item)
item_genre_list_df

Unnamed: 0,0
0,Drama Adventure science_fiction Action
1,Thriller Adventure Action Crime
2,Drama Children
3,Animation Children
4,Fantasy Adventure
...,...
6802,Drama Thriller
6803,Romance Comedy
6804,Comedy Romance
6805,Drama


In [112]:
literal_title_item = []
flat_literal_title_item = []
for i in item_title_list:
    literal_title_item.extend(["".join(np.array(i).flatten())])


In [113]:
item_title_list_df = pd.DataFrame(literal_title_item)
item_title_list_df

Unnamed: 0,0
0,Planet of the Apes
1,Hackers
2,Secret Garden The
3,Aristocats The
4,Dark Crystal The
...,...
6802,Poison Ivy
6803,Other Woman The
6804,Rebound The
6805,Selma


In [114]:
literal_director_item = []
for i in item_director_list:
    try:
        #print(newlist)
        newlist = [x for x in i if str(x) != 'nan']
        literal_director_item.extend([" ".join(np.array(newlist))])
    except:
        print(i,"에러!", i[0] != i[0])

In [115]:
item_director_list_df = pd.DataFrame(literal_director_item)
item_director_list_df

Unnamed: 0,0
0,nm0000318
1,nm0812200
2,nm0002140
3,nm0718627
4,nm0000568 nm0001345 nm0000568 nm0001345
...,...
6802,
6803,nm0001024
6804,
6805,


### tf-idf를 이용하여 item_feature vector 생성

In [116]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_genre = TfidfVectorizer()
tfidf_title = TfidfVectorizer()
tfidf_director = TfidfVectorizer()

In [117]:
item_gen_feature = tfidf_genre.fit_transform(literal_gen_item)
item_title_feature = tfidf_title.fit_transform(literal_title_item)
item_director_feature = tfidf_director.fit_transform(literal_director_item)

In [118]:
print(item_gen_feature.shape)
print(item_title_feature.shape)
print(item_director_feature.shape)

(6807, 18)
(6807, 7353)
(6807, 1340)


In [119]:
item_gen_feature.toarray()

array([[0.48900021, 0.5497373 , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.47777828, 0.53712153, 0.        , ..., 0.44899869, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [120]:
item_feature = np.concatenate((item_gen_feature.toarray(), item_title_feature.toarray()), axis=1)

In [121]:
#user_feature 선언 해줘야할 부분
print(len(user_item_list))
print(item_feature.shape)

31360
(6807, 7371)


### item_feature와 user별 시청 기록을 이용하여 user_feature 생성

In [123]:
from tqdm import tqdm

user_feature = []

for user, itemlist in tqdm(enumerate(user_item_list)):
    
    user_feature.append(np.mean(item_feature[itemlist],axis=0))


31360it [05:30, 95.00it/s] 


In [124]:
user_feature = np.array(user_feature)
user_feature.shape

(31360, 7371)

In [125]:
print(user_feature.shape)
print(item_feature.shape)


(31360, 7371)
(6807, 7371)


In [126]:
print(type(user_feature), type(item_feature))

<class 'numpy.ndarray'> <class 'numpy.ndarray'>


### item_feature, user_feature 저장

In [127]:
np.save('user_feature.npy',user_feature)
np.save('item_feature.npy',item_feature)