In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from typing import List, Dict
import dataclasses
import os
from abc import ABC, abstractmethod
from sklearn.metrics import mean_squared_error
from collections import defaultdict, Counter
from mlxtend.frequent_patterns import apriori, association_rules
np.random.random(0)

array([], dtype=float64)

In [4]:
m_cols = ['movie_id', 'title', 'genre']
movies = pd.read_csv('../data/movies.dat', names=m_cols, sep='::', encoding='latin-1', engine='python')
movies['genre'] = movies.genre.apply(lambda x:x.split('|'))
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]


In [5]:
t_cols = ['user_id', 'movie_id', 'tag', 'timestamp']
user_tagged_movies = pd.read_csv('../data/tags.dat', names=t_cols, sep='::', engine='python')
user_tagged_movies['tag'] = user_tagged_movies.tag.str.lower()
user_tagged_movies

Unnamed: 0,user_id,movie_id,tag,timestamp
0,15,4973,excellent!,1215184630
1,20,1747,politics,1188263867
2,20,1747,satire,1188263867
3,20,2424,chick flick 212,1188263835
4,20,2424,hanks,1188263835
...,...,...,...,...
95575,71556,1377,gothic,1188263571
95576,71556,2424,chick flick,1188263606
95577,71556,3033,comedy,1188263626
95578,71556,3081,gothic,1188263565


In [6]:
movies_tag = user_tagged_movies.groupby('movie_id').agg({'tag': list})
movies = movies.merge(movies_tag, on='movie_id', how='left')
movies

Unnamed: 0,movie_id,title,genre,tag
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]","[pixar, pixar, pixar, animation, pixar, animat..."
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]","[for children, game, animals, joe johnston, ro..."
2,3,Grumpier Old Men (1995),"[Comedy, Romance]","[funniest movies, comedinha de velhinhos engra..."
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",[girl movie]
4,5,Father of the Bride Part II (1995),[Comedy],"[steve martin, pregnancy, remake, steve martin..."
...,...,...,...,...
10676,65088,Bedtime Stories (2008),"[Adventure, Children, Comedy]",
10677,65091,Manhattan Melodrama (1934),"[Crime, Drama, Romance]",
10678,65126,Choke (2008),"[Comedy, Drama]","[chuck palahniuk, based on book]"
10679,65130,Revolutionary Road (2008),"[Drama, Romance]",[toplist08]


In [7]:
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('../data/ratings.dat', names=r_cols, sep='::', engine='python')
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,122,5.0,838985046
1,1,185,5.0,838983525
2,1,231,5.0,838983392
3,1,292,5.0,838983421
4,1,316,5.0,838983392


In [8]:
valid_user_ids = sorted(ratings.user_id.unique())[:1000]
ratings = ratings[ratings.user_id.isin(valid_user_ids)]
ratings

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,122,5.0,838985046
1,1,185,5.0,838983525
2,1,231,5.0,838983392
3,1,292,5.0,838983421
4,1,316,5.0,838983392
...,...,...,...,...
132825,1053,33794,5.0,1134008301
132826,1053,34162,5.0,1134007983
132827,1053,34319,3.5,1134007773
132828,1053,35836,5.0,1134008021


In [9]:
movielens = ratings.merge(movies, on='movie_id')
movielens.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genre,tag
0,1,122,5.0,838985046,Boomerang (1992),"[Comedy, Romance]","[dating, nudity (topless - brief), can't remem..."
1,1,185,5.0,838983525,"Net, The (1995)","[Action, Crime, Thriller]","[computers, computers, internet, irwin winkler..."
2,1,231,5.0,838983392,Dumb & Dumber (1994),[Comedy],"[jeff daniels, jim carrey, stupid, jim carrey,..."
3,1,292,5.0,838983421,Outbreak (1995),"[Action, Drama, Sci-Fi, Thriller]","[biology, gross, disease, futuristmovies.com, ..."
4,1,316,5.0,838983392,Stargate (1994),"[Action, Adventure, Sci-Fi]","[egypt, space, time travel, time travel, alien..."


In [10]:
movielens['timestamp_rank'] = movielens.groupby('user_id')['timestamp'].rank(ascending=False, method='first')
movielens_train = movielens[movielens['timestamp_rank'] > 5]
movielens_test = movielens[movielens['timestamp_rank'] <= 5]

In [11]:
movielens_test

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genre,tag,timestamp_rank
0,1,122,5.0,838985046,Boomerang (1992),"[Comedy, Romance]","[dating, nudity (topless - brief), can't remem...",1.0
8,1,362,5.0,838984885,"Jungle Book, The (1994)","[Adventure, Children, Romance]","[5, animated classic, bad remake, adapted from...",3.0
13,1,466,5.0,838984679,Hot Shots! Part Deux (1993),"[Action, Comedy, War]","[charlie sheen, comedy, parody, zaz, charlie s...",4.0
15,1,520,5.0,838984679,Robin Hood: Men in Tights (1993),[Comedy],"[parody, can't remember, very funny!, mel broo...",5.0
21,1,616,5.0,838984941,"Aristocats, The (1970)","[Animation, Children]","[disney, disney, disney animated feature, fran...",2.0
...,...,...,...,...,...,...,...,...
132693,1053,457,4.5,1134008458,"Fugitive, The (1993)",[Thriller],"[tommy lee jones, chase, tv series, excellent ...",3.0
132718,1053,1242,5.0,1134008464,Glory (1989),"[Action, Drama, War]","[action, drama, war, c, historical lackluster,...",1.0
132741,1053,2028,5.0,1134008444,Saving Private Ryan (1998),"[Action, Drama, War]","[world war ii, speilberg, steven spielberg, gf...",4.0
132750,1053,2501,5.0,1134008462,October Sky (1999),[Drama],"[liz should see, space program, true story, ae...",2.0


In [12]:
@dataclasses.dataclass(frozen=True)
class Dataset:
    train: pd.DataFrame     # 学習用のデータ
    test:  pd.DataFrame     # テスト用のデータ
    test_user2item: Dict[int, List[int]]        # ランキング指標のテストデータセット，キーがユーザIDで値がユーザが高評価したアイテムIDのセット
    item_content: pd.DataFrame      # アイテムのコンテンツ情報

In [13]:
class DataLoader:
    def __init__(self, num_users: int=100, num_test_items: int=5, data_path: str='../data/') -> None:
        self.num_users = num_users
        self.num_test_items = num_test_items
        self.data_path = data_path
    
    def load(self) -> Dataset:
        ratings, movie_content = self._load()
        movielens_train, movielens_test = self._split_data(ratings)

        # ランキング用の評価データは，各ユーザの評価値が4以上の映画のみを正解とする
        # キーはユーザID，値はユーザが高評価したアイテムIDのリスト
        movielens_test_user2items = (
            movielens_test[movielens_test.rating >= 4].groupby('user_id').agg({'movie_id': list})['movie_id'].to_dict()
        )
        return Dataset(movielens_train, movielens_test, movielens_test_user2items, movie_content)
    
    def _split_data(self, movielens: pd.DataFrame) -> (pd.DataFrame, pd.DataFrame):
        # 学習用とテスト用にデータを分割する
        # 各ユーザの直近の映画5件を評価用に使い，それ以外を学習用とする
        # まず，それぞれのユーザが評価した映画の順序を計算する
        # 直近付与した映画から順番を付与していく
        movielens['rating_order'] = movielens.groupby('user_id')['timestamp'].rank(ascending=False, method='first')
        movielens_train = movielens[movielens['rating_order'] > self.num_test_items]
        movielens_test = movielens[movielens['rating_order'] <= self.num_test_items]
        return movielens_train, movielens_test
    
    def _load(self) -> (pd.DataFrame, pd.DataFrame):
        # 映画情報読み込み
        m_cols = ['movie_id', 'title', 'genre']
        movies = pd.read_csv(os.path.join(self.data_path, 'movies.dat'), names=m_cols, sep='::', encoding='latin-1', engine='python')
        movies['genre'] = movies.genre.apply(lambda x:x.split('|'))

        # タグ情報読み込み
        t_cols = ['user_id', 'movie_id', 'tag', 'timestamp']
        user_tagged_movies = pd.read_csv(os.path.join(self.data_path, 'tags.dat'), names=t_cols, sep='::', engine='python')
        user_tagged_movies['tag'] = user_tagged_movies.tag.str.lower()
        
        # タグ情報を統合する
        movies_tag = user_tagged_movies.groupby('movie_id').agg({'tag': list})
        movies = movies.merge(movies_tag, on='movie_id', how='left')

        # 評価情報読み込み
        r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
        ratings = pd.read_csv(os.path.join(self.data_path, 'ratings.dat'), names=r_cols, sep='::', engine='python')

        # 評価情報を絞る
        valid_user_ids = sorted(ratings.user_id.unique())[:self.num_users]
        ratings = ratings[ratings.user_id <= max(valid_user_ids)]
        
        # 上記情報を統合する
        movielens_ratings = ratings.merge(movies, on='movie_id')
        return movielens_ratings, movies

In [14]:
@dataclasses.dataclass(frozen=True)
class RecommendResult:
    rating: pd.DataFrame        # テストデータセットの予測評価値，RMSEの評価
    user2items: Dict[int, List[int]]        # キーはユーザID，値ははおすすめアイテムIDのリスト，ランキング指標の評価

In [15]:
@dataclasses.dataclass(frozen=True)
class Metrics:
    rmse: float    # RMSE
    precision_at_k: float    # Precision@k
    recall_at_k: float       # Recall@k

    def __repr__(self) -> None:
        return f'rmse={self.rmse:.3f}, precision@k={self.precision_at_k:.3f}, recall@k={self.recall_at_k:.3f}'

In [16]:
class MetricsCalclator:
    def calc(self, true_rating: List[float], pred_rating: List[float], true_user2items: Dict[int, List[int]], pred_user2items: Dict[int, List[int]], k: int) -> Metrics:
        rmse = self._calc_rmse(true_rating, pred_rating)
        precision_at_k = self._calc_precision_at_k(true_user2items, pred_user2items, k)
        recall_at_k = self._calc_recall_at_k(true_user2items, pred_user2items, k)
        return Metrics(rmse, precision_at_k, recall_at_k)
    
    def _precision_at_k(self, true_items: List[int], pred_items: List[int], k: int) -> float:
        if k == 0: return 0.0
        p_at_k = (len(set(true_items)  & set(pred_items[:k])) / k)
        return p_at_k
    
    def _recall_at_k(self, true_items: List[int], pred_items: List[int], k: int) -> float:
        if true_items is None or k == 0: return 0.0
        r_at_k = (len(set(true_items)  & set(pred_items[:k])) / len(true_items))
        return r_at_k
    
    def _calc_rmse(self, true_rating: List[float], pred_rating: List[float]) -> float:
        return np.sqrt(mean_squared_error(true_rating, pred_rating))
    
    def _calc_precision_at_k(self, true_user2items: Dict[int, List[int]], pred_user2items: Dict[int, List[int]], k: int) -> float:
        scores = []
        for user_id in true_user2items.keys():
            p_at_k = self._precision_at_k(true_user2items[user_id], pred_user2items[user_id], k)
            scores.append(p_at_k)
        return np.mean(scores)
    
    def _calc_recall_at_k(self, true_user2items: Dict[int, List[int]], pred_user2items: Dict[int, List[int]], k: int) -> float:
        scores = []
        for user_id in true_user2items.keys():
            r_at_k = self._recall_at_k(true_user2items[user_id], pred_user2items[user_id], k)
            scores.append(r_at_k)
        return np.mean(scores)

In [17]:
class BaseRecommender(ABC):
    @abstractmethod
    def recommend(self, dataset: Dataset, **kwargs) -> RecommendResult:
        pass

    def run_sample(self) -> None:
        # Movielensのデータを取得
        movielens = DataLoader(num_users=1000, num_test_items=5, data_path='../data/').load()

        # 推薦結果
        recommend_results = self.recommend(movielens)

        # 推薦結果の評価
        metrics = MetricsCalclator().calc(
            movielens.test.rating.tolist(),
            recommend_results.rating.tolist(),
            movielens.test_user2item,
            recommend_results.user2items,
            k=10
        )
        print(metrics)

In [18]:
# ランダムレコメンド
class RandomRecommender(BaseRecommender):
    def recommend(self, dataset: Dataset, **kwargs) -> RecommendResult:
        # ユーザIDとアイテムIDに対して0始まりのインデックスを振る
        unique_user_ids = sorted(dataset.train.user_id.unique())
        unique_movie_ids = sorted(dataset.train.movie_id.unique())
        # 下の処理の意味
        # 例：unique_user_ids = [1, 8, 13, 14, 23]なら，range(len(unique_user_ids)) = [0, 1, 2, 3, 4]となり，zip()で[(1, 0), (8, 1), (13, 2), (14, 3), (23, 4)]となる
        user_id2index = dict(zip(unique_user_ids, range(len(unique_user_ids))))
        movie_id2index = dict(zip(unique_movie_ids, range(len(unique_movie_ids))))

        # ユーザ×アイテムの行列で，各セルの予測評価値は0.5-5.0の一様乱数とする
        # np.random.uniform()の第3引数はサイズ指定，行がユーザで列がアイテム（映画）に対応する
        pred_matrix = np.random.uniform(0.5, 5.0, (len(unique_user_ids), len(unique_movie_ids)))

        # RMSE評価用にテストデータに出てくるユーザとアイテムの予測評価値を格納する
        movie_rating_predict = dataset.test.copy()

        pred_results = []
        for i, row in dataset.test.iterrows():
            user_id = row['user_id']
            # テストデータのアイテムIDが学習用に登場していない場合も乱数を格納する
            if row['movie_id'] not in movie_id2index:
                pred_results.append(np.random.uniform(0.5, 5.0))
                continue

            # テストデータに現れるユーザIDとアイテムIDのインデックスを取得し評価値行列の値を取得する
            user_index = user_id2index[row['user_id']]
            movie_index = movie_id2index[row['movie_id']]
            pred_score = pred_matrix[user_index, movie_index]
            pred_results.append(pred_score)

        movie_rating_predict['rating_pred'] = pred_results

        # ランキング評価用のデータ作成
        # 各ユーザに対するおすすめ映画は，そのユーザがまだ評価していない映画の中からランダムに10作品とする
        # キーはユーザIDで値はおすすめのアイテムのIDのリスト
        pred_user2items = defaultdict(list)
        # ユーザがすでに評価した映画を取得する
        user_evaluated_movies = dataset.train.groupby('user_id').agg({'movie_id': list})['movie_id'].to_dict()

        for user_id in unique_user_ids:
            user_index = user_id2index[user_id]
            movie_indexes = np.argsort(pred_matrix[user_index, :])
            for movie_id in movie_indexes:
                movie_id = unique_movie_ids[movie_id]
                if movie_id not in user_evaluated_movies[user_id]:
                    pred_user2items[user_id].append(movie_id)
                if len(pred_user2items[user_id]) == 10:
                    break
        return RecommendResult(movie_rating_predict.rating_pred, pred_user2items)

RandomRecommender().run_sample()

rmse=1.901, precision@k=0.000, recall@k=0.001


In [19]:
# 評価値が高い映画の確認
# 評価数が少ない映画は信頼性が低いため，評価数に閾値を設ける
movie_stats = movielens_train.groupby(['movie_id', 'title']).agg({'rating': [np.size, np.mean]})
threshold_flg = movie_stats['rating']['size'] >= 100
movie_sorted_by_rating = movie_stats[threshold_flg].sort_values(by=('rating', 'mean'), ascending=False).head()
movie_sorted_by_rating

  movie_stats = movielens_train.groupby(['movie_id', 'title']).agg({'rating': [np.size, np.mean]})


Unnamed: 0_level_0,Unnamed: 1_level_0,rating,rating
Unnamed: 0_level_1,Unnamed: 1_level_1,size,mean
movie_id,title,Unnamed: 2_level_2,Unnamed: 3_level_2
318,"Shawshank Redemption, The (1994)",423,4.492908
50,"Usual Suspects, The (1995)",332,4.459337
912,Casablanca (1942),163,4.444785
904,Rear Window (1954),129,4.44186
2019,Seven Samurai (Shichinin no samurai) (1954),104,4.408654


In [20]:
class PopularityRecommender(BaseRecommender):
    def recommend(self, dataset: Dataset, **kwargs) -> RecommendResult:
        # 評価数の閾値
        minimum_num_rating = kwargs.get('monimum_num_rating', 100)
        # 各アイテムの平均評価値を計算し，その平均評価値を予測値として利用する
        movie_rating_average = dataset.train.groupby('movie_id').agg({'rating': np.mean})
        # テストデータに予測値を格納する
        # テストデータのみに存在するアイテムの予測評価値は0とする
        movie_rating_predict = dataset.test.merge(movie_rating_average, on='movie_id', how='left', suffixes=('_test', '_pred')).fillna(0)
        
        # 各ユーザに対するおすすめの映画は，そのユーザがまだ評価していない映画の中から評価値が高いものを10作品とする
        # 評価件数には閾値を設ける
        pred_user2items = defaultdict(list)
        # ユーザがすでに評価した映画を辞書型として取得する
        user_watched_movies = dataset.train.groupby('user_id').agg({'movie_id': list})['movie_id'].to_dict()
        # それぞれの映画の評価件数と平均評価値を取得する
        movie_stats = dataset.train.groupby('movie_id').agg({'rating': [np.size, np.mean]})
        threshold_flg = movie_stats['rating']['size'] >= minimum_num_rating
        # 評価件数が閾値以上で，かつ平均評価値が高い順に並べ替え，映画IDのリストを取得する
        movie_sorted_by_rating = (movie_stats[threshold_flg].sort_values(by=[('rating', 'mean')], ascending=False).index.tolist())
        # ユーザごとにおすすめ映画を選択する
        for user_id in dataset.train.user_id.unique():
            for movie_id in movie_sorted_by_rating:
                if movie_id not in user_watched_movies[user_id]:
                    pred_user2items[user_id].append(movie_id)
                    if len(pred_user2items[user_id]) == 10:
                        break
        return RecommendResult(movie_rating_predict.rating_pred, pred_user2items)
    
PopularityRecommender().run_sample()

rmse=1.089, precision@k=0.008, recall@k=0.025


  movie_rating_average = dataset.train.groupby('movie_id').agg({'rating': np.mean})
  movie_stats = dataset.train.groupby('movie_id').agg({'rating': [np.size, np.mean]})


In [26]:
class AssociationRecommender(BaseRecommender):
    def recommend(self, dataset: Dataset, **kwargs) -> RecommendResult:
        # 評価値の閾値
        min_support = kwargs.get('min_support', 0.1)
        min_threshold = kwargs.get('min_threshold', 1)

        # ユーザ×映画の行列形式に変更
        user_movie_matrix = dataset.train.pivot(index='user_id', columns='movie_id', values='rating')

        # ライブラリ使用のために，4以上の評価値は1，4未満の評価値は0
        user_movie_matrix[user_movie_matrix.isnull()] = 0
        user_movie_matrix[user_movie_matrix < 4] = 0
        user_movie_matrix[user_movie_matrix >= 4] = 1

        # 支持度が高い映画
        freq_movies = apriori(user_movie_matrix, min_support=min_support, use_colnames=True)

        # アソシエーションルールの計算（リフト値の高い順に表示）
        rules = association_rules(freq_movies, metric='lift', min_threshold=min_threshold)

        # アソシエーションルールに従って，各ユーザにまだ評価していない映画を10本推薦する
        pred_user2items = defaultdict(list)
        user_evaluated_movies = dataset.train.groupby('user_id').agg({'movie_id': list})['movie_id'].to_dict()
        # 学習用データで評価値が4以上のものだけ取得する
        movielens_train_high_rating = dataset.train[dataset.train.rating >= 4]

        for user_id, data in movielens_train_high_rating.groupby('user_id'):
            # ユーザが直近評価した5つの映画を取得
            input_data = data.sort_values('timestamp')['movie_id'].tolist()[:5]
            # それらの映画が条件部に1本でも含まれているアソシエーションルールを抽出
            matched_flags = rules.antecedents.apply(lambda x: len(set(input_data) & x)) >= 1

            # アソシエーションルールの帰結部の映画をリストに格納し登場頻度順に並べ替え，ユーザがまだ評価していなければ推薦リストに追加する
            consequent_movies = []
            for i, row in rules[matched_flags].sort_values('lift', ascending=False).iterrows():
                consequent_movies.extend(row['consequents'])
            # 登場頻度をカウント
            counter = Counter(consequent_movies)
            for movie_id, movie_cnt in counter.most_common():
                if movie_id not in user_evaluated_movies[user_id]:
                    pred_user2items[user_id].append(movie_id)
                if len(pred_user2items[user_id]) == 10:
                    break
        # アソシエーションルールによる評価値の予測は難しいため，RMSEは計算しない
        return RecommendResult(dataset.test.rating, pred_user2items)
    
AssociationRecommender().run_sample()



rmse=0.000, precision@k=0.012, recall@k=0.040
