In [118]:
from collections import defaultdict
import implicit
import pandas as pd
import numpy as np
from scipy import sparse
from tqdm import tqdm
import ast

In [11]:
def get_count(tp, id):
    # group by 오브젝트 index=True로 해야 id 가 index값으로 들어감
    playcount_groupbyid = tp[[id]].groupby(id, as_index=True)
    # 이걸하면 sql의 count 와 동일해짐
    count = playcount_groupbyid.size() 
    return count

In [12]:
def filter_triplets(tp, min_uc=5, min_sc=0):
    if min_sc > 0:
        itemcount = get_count(tp, 'item')
        tp = tp[tp['item'].isin(itemcount.index[itemcount >= min_sc])]
    
    if min_uc > 0:
        usercount = get_count(tp, 'user')
        tp = tp[tp['user'].isin(usercount.index[usercount >= min_uc])]

    usercount, itemcount = get_count(tp, 'user'), get_count(tp, 'item') 
    return tp, usercount, itemcount

In [17]:
def numerize(tp, profile2id, show2id):
    uid = tp['user'].apply(lambda x: profile2id[x])
    sid = tp['item'].apply(lambda x: show2id[x])
    return pd.DataFrame(data={'uid': uid, 'sid': sid}, columns=['uid', 'sid'])

In [15]:
raw_data = pd.read_csv('/opt/ml/input/data/train/train_ratings.csv', header=0)
raw_data, user_activity, item_popularity = filter_triplets(raw_data, min_uc=5, min_sc=0)

unique_uid = raw_data['user'].unique()
unique_sid = raw_data['item'].unique()

profile2id = dict((pid, i) for (i, pid) in enumerate(unique_uid))
show2id = dict((sid, i) for (i, sid) in enumerate(unique_sid))

In [66]:
raw = numerize(raw_data, profile2id, show2id)
n_users = len(unique_uid)
n_items = len(unique_sid)

rows, cols = raw['uid'], raw['sid']
data = sparse.csr_matrix((np.ones_like(rows),
                            (rows, cols)), dtype='float64',
                            shape=(n_users, n_items))

In [67]:
user_item_matrix = data.toarray() 

In [74]:
id2profile= dict(map(reversed,profile2id.items()))
id2show= dict(map(reversed,show2id.items())) 

In [239]:
model = implicit.nearest_neighbours.BM25Recommender(K=10, K1=0.6, B=0.45, num_threads=0)
model.fit(data)
k = 10
users = unique_uid.repeat(k)
test_df = pd.DataFrame(users, columns=['user'])
test_df['item']=0

index = 0
for user in tqdm(unique_uid):
    uid = profile2id[user]
    recommendations = model.recommend(uid, data[uid])[0]
    recommendations = np.vectorize(id2show.get)(recommendations)
    for i in range(k):
        test_df.loc[index + i, 'item'] = recommendations[i]
    index += k
test_df.to_csv(f'/opt/ml/input/submission/lmf.csv', index=False)

  0%|          | 0/6807 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:29<00:00, 1074.18it/s]
