In [11]:
import torch
from tqdm import tqdm
import pandas as pd
import numpy as np

In [2]:
def preprocess(df) :
    print("preprocessing..")
    df = df.sort_values(['user', 'time'], ascending = [True, True])

    movies = df['item'].unique()
    movie_to_id = dict(zip(movies, range(len(movies))))
    id_to_movie = {v: k for k, v in movie_to_id.items()}
    
    df['item'] = df['item'].apply(lambda x : movie_to_id[x])
    popular_items = list(df['item'].value_counts()[:100].keys())
    print("Complete!")
    return df, movie_to_id, id_to_movie, popular_items

In [3]:
class StaticsBasedModel() :
    def __init__(self, data) :
        item_num = len(data['item'].unique())
        self.appearance = torch.zeros((item_num, item_num))
        self.appearance_ratio = torch.zeros((item_num, item_num))
        self.data = data
        self.users = data['user'].unique()

    def count_appearance(self) :
        print("count appearance ratio..")
        for user in tqdm(self.users) :
            user_item = self.data[self.data.user == user]['item'].values
            
            prev_item = user_item[0]
            self.appearance[prev_item][user_item[1]] += 1
            for index, item in enumerate(user_item[1:-1], start=1) :
                next_item = user_item[index+1]
                self.appearance[item][prev_item] += 1
                self.appearance[item][next_item] += 1
                prev_item = item
            self.appearance[user_item[-1]][prev_item] += 1

    def cal_ratio(self) :
        appearance_sum_a0 = torch.sum(self.appearance, axis = 0)
        appearance_sum_a1 = torch.sum(self.appearance, axis = 1)

        for item_index in range(appearance_sum_a0.size()[0]) :
            self.appearance_ratio[:, item_index] = \
                torch.div(self.appearance[:, item_index], appearance_sum_a0[item_index]) * 0.5 + \
                torch.div(self.appearance[:, item_index], appearance_sum_a1[item_index]) * 0.5

In [4]:
class MarcovChainModel(StaticsBasedModel) :
    def __init__(self, data) :
        super(MarcovChainModel, self).__init__(data)

    def cal_ratio(self) :
        appearance_sum = torch.sum(self.appearance, axis = 0)

        for item_index in range(appearance_sum.size()[0]) :
            self.appearance_ratio[:, item_index] = \
                torch.div(self.appearance[:, item_index], appearance_sum[item_index])

In [2]:
import pandas as pd
data_dir = '/opt/ml/movie-recommendation/data/train/'

df = pd.read_csv(data_dir+'train_ratings.csv')
df

Unnamed: 0,user,item,time
0,11,4643,1230782529
1,11,170,1230782534
2,11,531,1230782539
3,11,616,1230782542
4,11,2140,1230782563
...,...,...,...
5154466,138493,44022,1260209449
5154467,138493,4958,1260209482
5154468,138493,68319,1260209720
5154469,138493,40819,1260209726


In [57]:
df, movie_to_id, id_to_movie, popular_items = preprocess(df)

preprocessing..
Complete!


In [289]:
model = MarcovChainModel(df)

model.count_appearance()
torch.save(model.appearance, data_dir+'appearance.pt')

model.cal_ratio()
torch.save(model.appearance_ratio, data_dir+'appearance_ratio.pt')

count appearance ratio..


100%|██████████| 31360/31360 [06:01<00:00, 86.74it/s] 


torch.Size([6807, 6807])

In [58]:
appearance = torch.load(data_dir+'appearance.pt')
appearance_ratio = torch.load(data_dir+'appearance_ratio.pt')
total_num = torch.sum(appearance) / 2
probs = appearance_ratio + torch.diag(torch.sum(appearance, axis=0) / total_num)

user_item_sequence = list(df.groupby(by='user')['item'])

In [66]:
window_size = 10
sub_u, sub_i = [], []

for user, item_sequence in tqdm(user_item_sequence):
    item_sequence = np.array(item_sequence)
    
    chain_prob = (probs[item_sequence[0], item_sequence[0]]).clone()
    window_index = 0

    scores = []
    for index, item in enumerate(item_sequence[:-1]):
        if window_size == 0:
            scores.append(appearance_ratio[item].mul(chain_prob).unsqueeze(1))
            chain_prob = chain_prob.div(
                probs[item_sequence[window_index], item_sequence[window_index]]
            )
            window_index += 1
            chain_prob = chain_prob.mul(
                probs[item_sequence[window_index], item_sequence[window_index]]
            )
            continue

        next_item = item_sequence[index+1]

        if index >= window_size :
            scores.append(appearance_ratio[item].mul(chain_prob).unsqueeze(1))
            chain_prob = chain_prob.div(
                probs[item_sequence[window_index], item_sequence[window_index]]
            )
            chain_prob = chain_prob.div(
                probs[item_sequence[window_index], item_sequence[window_index+1]]
            )
            window_index += 1
            chain_prob = chain_prob.mul(
                probs[item_sequence[window_index], item_sequence[window_index]]
            )
        
        chain_prob = chain_prob.mul(probs[item, next_item])
    scores.append(appearance_ratio[item_sequence[-1]].mul(chain_prob).unsqueeze(1))

    scores = torch.cat(scores, axis=1)
    score = torch.sum(scores, axis=1)
    ranking = torch.topk(score, len(score))[1]

    pred = []
    for item_id in ranking :
        if item_id in item_sequence :
            continue
        movie = id_to_movie[int(item_id)]
        sub_u.append(user)
        sub_i.append(movie)
        pred.append(movie)
        if len(pred) == 10 :
            break

100%|██████████| 31360/31360 [14:08<00:00, 36.97it/s]


In [67]:
submission = {"user" : sub_u, "item" : sub_i}
submission_df = pd.DataFrame(submission)
submission_df.to_csv(f'/opt/ml/movie-recommendation/BPR/output/chain-10.csv', index=False)

In [1]:
import torch

In [2]:
torch.randn((1024,1980,10)).transpose(1,2).size()

torch.Size([1024, 10, 1980])