# `train_ratings.csv` 로드해서 `MostPopular` 추천 결과 생성하기

In [15]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from scipy.sparse import csr_matrix

In [16]:
train_df = pd.read_csv('../data/train/train_ratings.csv') # full training data

In [21]:
def generate_rating_matrix_valid(user_seq, num_users, num_items):
    """Build a sparse user-by-item interaction matrix for validation.

    For every user (row index = position in ``user_seq``) each item in the
    user's sequence, except the last two, contributes a value of 1 at
    ``(user, item)``.
    NOTE(review): the last two items are presumably held out as the
    validation/test targets -- confirm against the caller.

    Args:
        user_seq: iterable of per-user item-id sequences.
        num_users: number of rows of the resulting matrix.
        num_items: number of columns of the resulting matrix.

    Returns:
        A ``csr_matrix`` of shape ``(num_users, num_items)``. The matrix is
        also printed before being returned.
    """
    # Flatten to (user index, item id) coordinates, skipping the final two
    # items of every sequence.
    pairs = [
        (uid, item)
        for uid, history in enumerate(user_seq)
        for item in history[:-2]
    ]

    user_idx = np.array([uid for uid, _ in pairs])
    item_idx = np.array([item for _, item in pairs])
    ones = np.array([1] * len(pairs))

    rating_matrix = csr_matrix(
        (ones, (user_idx, item_idx)), shape=(num_users, num_items)
    )
    print(rating_matrix)
    return rating_matrix

In [23]:
# Collect every user's item sequence and the set of all item ids, then build
# the validation interaction matrix.
# NOTE(review): `lines` is not defined in the visible cells; it is presumably
# one item-id sequence per user -- confirm against the cell that creates it.
user_seq = []
item_set = set()

for items in lines:  # e.g. [4643, 170, 531, ...]
    user_seq.append(items)  # [[4643, 170, 531, ...], [541, 140, 71, ...], ...]
    # Update in place: `item_set | set(items)` rebuilt the entire accumulated
    # set on every iteration, which is needlessly quadratic.
    item_set.update(items)

# Largest item id occurring in any sequence. This is an upper bound for the
# column index, not the number of distinct items.
max_item = max(item_set)
num_users = len(lines)  # one matrix row per sequence
# Column count of the matrix. `max_item + 1` is the minimum that makes
# `max_item` a valid column index.
# NOTE(review): the additional +1 presumably reserves one more id (e.g. a
# mask token in sequential-recommender code) -- confirm.
num_items = max_item + 2
print(num_users, num_items)
valid_rating_matrix = generate_rating_matrix_valid(user_seq, num_users, num_items)

31360 119147
  (0, 1)	1
  (0, 19)	1
  (0, 32)	1
  (0, 39)	1
  (0, 110)	1
  (0, 150)	1
  (0, 153)	1
  (0, 158)	1
  (0, 160)	1
  (0, 165)	1
  (0, 170)	1
  (0, 172)	1
  (0, 173)	1
  (0, 185)	1
  (0, 208)	1
  (0, 231)	1
  (0, 253)	1
  (0, 256)	1
  (0, 260)	1
  (0, 296)	1
  (0, 316)	1
  (0, 318)	1
  (0, 344)	1
  (0, 356)	1
  (0, 364)	1
  :	:
  (31359, 49651)	1
  (31359, 50601)	1
  (31359, 50872)	1
  (31359, 51086)	1
  (31359, 51662)	1
  (31359, 51884)	1
  (31359, 52579)	1
  (31359, 52975)	1
  (31359, 53123)	1
  (31359, 53322)	1
  (31359, 53996)	1
  (31359, 55269)	1
  (31359, 55814)	1
  (31359, 58879)	1
  (31359, 59315)	1
  (31359, 59784)	1
  (31359, 60069)	1
  (31359, 60816)	1
  (31359, 61160)	1
  (31359, 65682)	1
  (31359, 66762)	1
  (31359, 68319)	1
  (31359, 68954)	1
  (31359, 69526)	1
  (31359, 70286)	1


## 모든 사용자에게 동일하게 Most Popular 아이템 10개를 추천 결과로 제공

In [3]:
# Recommend the same top-K most popular items to every user.
top_k = 10

# Derive the user array and the user count from the same source so the two
# can never disagree.
unique_users = train_df['user'].unique()
num_users = len(unique_users)

# value_counts() sorts by descending frequency; head() is explicitly
# positional, unlike `[:10]` on an integer-labelled index.
top_items = train_df['item'].value_counts().head(top_k).index.values  # predictions

# Repeat each user once per item actually found. With fewer than `top_k`
# distinct items, a hard-coded repeat(10) combined with zip() would silently
# pair users with the wrong items.
users = unique_users.repeat(len(top_items))
mp_items = np.tile(top_items, num_users)  # same prediction for all users

# Column-wise construction raises on any length mismatch instead of truncating.
test_df = pd.DataFrame({'user': users, 'item': mp_items})
test_df.to_csv("output/most_popular_submission.csv", index=False)

In [4]:
# Show the generated (user, item) recommendation frame as the cell output.
test_df

Unnamed: 0,user,item
0,11,2571
1,11,2959
2,11,296
3,11,318
4,11,356
...,...,...
313595,138493,4993
313596,138493,7153
313597,138493,5952
313598,138493,593
