<a href="https://colab.research.google.com/github/andylee50609/data-course-sample/blob/main/Week3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

安裝套件

In [1]:
pip install scikit-surprise



下載資料

In [2]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz

--2022-01-09 12:37:04--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15499476 (15M) [application/octet-stream]
Saving to: ‘All_Beauty.csv.1’


2022-01-09 12:37:05 (41.6 MB/s) - ‘All_Beauty.csv.1’ saved [15499476/15499476]

--2022-01-09 12:37:05--  http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10329961 (9.9M) [application/octet-stream]
Saving to: ‘meta_All_Beauty.json.gz.1’


2022-01-09 12:37:05 (32.9 MB/s) - ‘meta_All_Beauty.json.gz.1’ saved [10329961/10329961]



匯入套件

In [3]:
import pandas as pd
import numpy as np
from itertools import combinations
from collections import defaultdict
import time
from surprise import Reader
from surprise import Dataset
from surprise import KNNBasic

資料讀取

In [4]:
# Product metadata (gzip-compressed JSON lines); keep only the columns used later.
metadata = (
    pd.read_json('meta_All_Beauty.json.gz', lines=True)
    .loc[:, ['asin', 'title', 'description', 'rank', 'brand']]
)

# Ratings CSV has no header row: item id, reviewer id, score, unix timestamp.
# Add a datetime column and keep only recent ratings (from 2017-09-01 onward),
# which are assumed to be more representative.
ratings = (
    pd.read_csv(
        'All_Beauty.csv',
        header=None,
        names=['asin', 'reviewerID', 'overall', 'unixReviewTime'],
    )
    .assign(DATE=lambda frame: pd.to_datetime(frame['unixReviewTime'], unit='s'))
    .loc[lambda frame: frame['DATE'] >= "2017-09-01"]
)

資料前處理

In [5]:
# ======== Data Clean ========

# Per-item popularity (number of reviews) and mean score.
# NOTE(review): `ratings` still contains the September 2018 rows that are later
# used as the test set, so these statistics (and `rule_based_recom` below) see
# test-period data. Consider computing them from the training split only.
tmp = (
    ratings.groupby("asin")["overall"]
    .agg(reviewNum="size", meanScore="mean")
    .reset_index()
)
metadata = pd.merge(metadata, tmp, on="asin", how="inner").fillna(0)
ratings = pd.merge(ratings, tmp, on="asin", how="inner").fillna(0)

# Derive the product sub-category from the text after the first "in " of `rank`,
# un-escaping "&amp;" and dropping any opening parenthesis.
metadata['sub_category'] = (
    metadata['rank'].str.split("in ").str[1]
    .replace(r'&amp;', '&', regex=True)
    .replace(r'\(', '', regex=True)
)
metadata = metadata.drop(columns=["rank"])

# Strip punctuation (# ( ) , - . ' *) from brand names in one pass, using a raw
# string so the backslash is not an invalid escape sequence; brands that end up
# empty become NaN.
metadata['brand'] = (
    metadata['brand']
    .str.replace(r"[#(),\-.'*]", '', regex=True)
    .replace('', np.nan)
)

# Rule-based fallback list: items with at least 300 reviews, best mean score first.
rule_based_recom = metadata[metadata["reviewNum"]>=300][["asin","meanScore"]].sort_values("meanScore", ascending = False).drop_duplicates()

# ========Testing/Training Set========

# Training data: ratings made before 2018-09-01.
ratings_trainings = ratings[ratings['DATE'] < '2018-09-01']

# Test data: ratings made during September 2018. The upper bound is exclusive
# (< 2018-10-01) so that every timestamp on 2018-09-30 is included, not only
# the one at exactly midnight.
ratings_testings = ratings[
    (ratings['DATE'] >= '2018-09-01') &
    (ratings['DATE'] < '2018-10-01')]

# Map each test-period reviewer to the list of items they rated:
# {reviewerID: [asin, ...]}
ratings_testings_by_user = (
    ratings_testings.groupby('reviewerID')['asin'].agg(list).to_dict()
)
users = list(ratings_testings_by_user.keys())


# ======== Structured features (sub_category, brand) ========
# One-hot encode the sub-category.
# NOTE(review): `columns` is documented for DataFrame input; with a Series it
# presumably has no effect -- confirm and drop if so.
dummy_subCat = pd.get_dummies(metadata['sub_category'] , columns = ['subcat'])

# One-hot encode the brand.
dummy_brand = pd.get_dummies(metadata['brand'] , columns = ['brand'])

# ======== Unstructured features (title, description) ========

# Flatten `description` from a list of strings to a single string. Values that
# are already strings are kept as-is (so re-running the cell is harmless) and
# anything else (e.g. the 0 left by an earlier fillna) becomes an empty string.
metadata['description'] = metadata['description'].apply(
    lambda x: ' '.join(x) if isinstance(x, (list, tuple)) else (x if isinstance(x, str) else '')
)

# Combine title and description, separated by a space so the last word of the
# title is not fused with the first word of the description, then lower-case.
metadata['title_description'] = metadata['title'] + ' ' + metadata['description']
metadata['title_description'] = metadata['title_description'].str.lower()

產生推薦：CF(User Based)

In [6]:
def recommender_user_based(training_data, rule_based, users=[], k=10):
    """Recommend up to ``k`` items per user with user-based collaborative filtering.

    Users are compared with a cosine similarity computed over the items both of
    them rated. For each requested user, the most similar users are visited in
    descending similarity order and their items are taken best-rated first,
    skipping items the target user already rated.

    Parameters
    ----------
    training_data : pandas.DataFrame
        Must provide the columns ``reviewerID``, ``asin`` and ``overall``.
        When a (user, item) pair occurs more than once the last row wins.
    rule_based : bool
        When true, users who end up with no recommendation receive the first
        ``k`` items of the module-level ``rule_based_recom`` frame instead.
    users : list
        User ids to produce recommendations for.
    k : int
        Maximum number of items per user.

    Returns
    -------
    dict
        ``{user: [asin, ...]}`` with at most ``k`` items per user; the list is
        empty for users without usable neighbours (unless ``rule_based``).
    """
    # Filtering knobs: users with fewer than `user_rating_threshold` ratings are
    # dropped to shrink the pairwise computation. Set the switch to False to
    # keep every user.
    remove_obscure_user = True
    user_rating_threshold = 3

    # user -> {item: rating}
    user_to_items = defaultdict(dict)
    for user, item, rating in zip(
            training_data['reviewerID'], training_data['asin'], training_data['overall']):
        user_to_items[user][item] = float(rating)

    if remove_obscure_user:
        user_to_items = {
            user: items for user, items in user_to_items.items()
            if len(items) >= user_rating_threshold
        }

    # Transpose into item -> {user: rating}
    item_to_users = defaultdict(dict)
    for user, items in user_to_items.items():
        for item, rating in items.items():
            item_to_users[item][user] = rating

    # Accumulate [sum(xy), sum(xx), sum(yy)] for every pair of users that rated
    # a common item. Only the product sum(xx) * sum(yy) is used afterwards, so
    # storing the same triple in both directions is fine.
    pair_sums = defaultdict(lambda: defaultdict(lambda: [0, 0, 0]))
    for user_ratings in item_to_users.values():
        if len(user_ratings) < 2:
            continue
        for user1, user2 in combinations(user_ratings.keys(), 2):
            xy = user_ratings[user1] * user_ratings[user2]
            xx = user_ratings[user1] ** 2
            yy = user_ratings[user2] ** 2
            for sums in (pair_sums[user1][user2], pair_sums[user2][user1]):
                sums[0] += xy
                sums[1] += xx
                sums[2] += yy

    # user -> [(other_user, similarity), ...] sorted by descending similarity.
    user_similarity = {}
    for src_user, neighbours in pair_sums.items():
        scored = []
        for dst_user, (xy, xx, yy) in neighbours.items():
            div = (xx * yy) ** 0.5
            if div == 0:
                continue
            similarity = xy / div
            if similarity < 0:
                continue
            scored.append((dst_user, similarity))
        # list.sort is stable, so equally similar users keep their encounter order.
        scored.sort(key=lambda pair: pair[1], reverse=True)
        user_similarity[src_user] = scored

    recommendation = {}
    for user in users:
        recommended_items = []
        if user in user_similarity:
            # Items already rated by the user or already recommended.
            seen = set(user_to_items[user])
            for sim_user, _ in user_similarity[user]:
                # Highest-rated items of the similar user come first; sorting
                # ascending would recommend the items they liked least.
                ranked_items = sorted(
                    user_to_items[sim_user].items(),
                    key=lambda item: item[1],
                    reverse=True,
                )
                for item, _ in ranked_items:
                    if len(recommended_items) >= k:
                        break
                    if item not in seen:
                        recommended_items.append(item)
                        seen.add(item)
                if len(recommended_items) >= k:
                    break
        recommendation[user] = recommended_items

    # Rule-based fallback for users who received nothing from CF.
    if rule_based:
        fallback = list(rule_based_recom["asin"][:k])
        for user, items in recommendation.items():
            if not items:
                recommendation[user] = list(fallback)

    return recommendation

# User-based CF recommendations for every test-period user, with the
# rule-based fallback switched on.
ratings_by_user_based = recommender_user_based(
    training_data=ratings_trainings,
    rule_based=True,
    users=users,
)

產生推薦：CF(Item Based)

In [7]:
def recommender_item_based(training_data, rule_based, users=[], k=10):
    """Recommend up to ``k`` items per user with item-based collaborative filtering.

    Items are compared with a cosine similarity computed over the users who
    rated both of them. For each requested user, the items they rated are
    visited in a fixed order and, for each, its most similar items are appended
    (most similar first), skipping items the user already rated.

    Parameters
    ----------
    training_data : pandas.DataFrame
        Must provide the columns ``reviewerID``, ``asin`` and ``overall``.
        When a (user, item) pair occurs more than once the last row wins.
    rule_based : bool
        When true, users who end up with no recommendation receive the first
        ``k`` items of the module-level ``rule_based_recom`` frame instead.
    users : list
        User ids to produce recommendations for.
    k : int
        Maximum number of items per user.

    Returns
    -------
    dict
        ``{user: [asin, ...]}`` with at most ``k`` items per user.
    """
    # item -> {user: rating}
    item_to_users = defaultdict(dict)
    for user, item, rating in zip(
            training_data['reviewerID'], training_data['asin'], training_data['overall']):
        item_to_users[item][user] = float(rating)

    # Transpose into user -> {item: rating}
    user_to_items = defaultdict(dict)
    for item, rating_users in item_to_users.items():
        for user, rating in rating_users.items():
            user_to_items[user][item] = rating

    # Accumulate [sum(xy), sum(xx), sum(yy)] for every pair of items rated by a
    # common user. Only the product sum(xx) * sum(yy) is used afterwards, so
    # storing the same triple in both directions is fine.
    pair_sums = defaultdict(lambda: defaultdict(lambda: [0, 0, 0]))
    for items in user_to_items.values():
        if len(items) < 2:
            continue
        for i1, i2 in combinations(items.keys(), 2):
            xy = items[i1] * items[i2]
            xx = items[i1] ** 2
            yy = items[i2] ** 2
            for sums in (pair_sums[i1][i2], pair_sums[i2][i1]):
                sums[0] += xy
                sums[1] += xx
                sums[2] += yy

    # item -> [(other_item, similarity), ...] sorted by descending similarity.
    item_similarity = {}
    for src_item, neighbours in pair_sums.items():
        scored = []
        for dst_item, (xy, xx, yy) in neighbours.items():
            div = (xx * yy) ** 0.5
            if div == 0:
                continue
            similarity = xy / div
            if similarity < 0:
                continue
            scored.append((dst_item, similarity))
        # list.sort is stable, so equally similar items keep their encounter order.
        scored.sort(key=lambda pair: pair[1], reverse=True)
        item_similarity[src_item] = scored

    recommendation = {}
    for user in users:
        # .get avoids inserting empty entries for unknown users into the defaultdict.
        rated_items = user_to_items.get(user, {})
        seen = set(rated_items)
        items = []
        # Walk the rated items in dict (insertion) order rather than through a
        # set: set order of strings varies between interpreter runs, which made
        # the recommendations non-reproducible.
        for item in rated_items:
            for sim_item, _ in item_similarity.get(item, ()):
                if len(items) >= k:
                    break
                if sim_item not in seen:
                    items.append(sim_item)
                    seen.add(sim_item)
            if len(items) >= k:
                break
        recommendation[user] = items

    # Rule-based fallback for users who received nothing from CF.
    if rule_based:
        fallback = list(rule_based_recom["asin"][:k])
        for user, items in recommendation.items():
            if not items:
                recommendation[user] = list(fallback)

    return recommendation

# Item-based CF recommendations for every test-period user, with the
# rule-based fallback switched on.
ratings_by_item_based = recommender_item_based(
    training_data=ratings_trainings,
    rule_based=True,
    users=users,
)

產生推薦：CF(surprise)

In [8]:
def recommender_surprise(training_data, rule_based, users=[], k=10, user_based=False, algo=KNNBasic):
    """Recommend up to ``k`` items per user from a Surprise KNN model.

    The model is fitted with cosine similarity on the latest rating of every
    (user, item) pair. For each requested user, the items they rated are
    visited most-recent first and the ``k`` nearest neighbours of each are
    appended, skipping items the user already rated.

    Parameters
    ----------
    training_data : pandas.DataFrame
        Must provide the columns ``reviewerID``, ``asin``, ``overall`` and ``DATE``.
    rule_based : bool
        When true, users who end up with no recommendation receive the first
        ``k`` items of the module-level ``rule_based_recom`` frame instead.
    users : list
        User ids to produce recommendations for.
    k : int
        Number of neighbours requested per rated item and maximum list length.
    user_based : bool
        Forwarded to Surprise as ``sim_options['user_based']``.
        NOTE(review): the neighbour lookup below always passes *item* inner ids,
        so ``user_based=True`` looks unsupported here -- confirm before using it.
    algo : class
        Surprise algorithm class, instantiated with ``sim_options``.

    Returns
    -------
    dict
        ``{user: [asin, ...]}`` with at most ``k`` items per user.
    """
    # Keep only the most recent rating of each (user, item) pair.
    training_data = (
        training_data
        .sort_values("DATE", ascending=False)
        .groupby(['reviewerID', 'asin']).head(1)
    )

    reader = Reader(rating_scale=(0, 5))
    training_data = training_data[['reviewerID', 'asin', 'overall']]
    data = Dataset.load_from_df(training_data, reader=reader)

    sim_options = {
        'name': 'cosine',
        'user_based': user_based  # False: similarities are computed between items
    }
    algo_impl = algo(sim_options=sim_options)
    trainset = data.build_full_trainset()
    algo_impl.fit(trainset)

    # Group once instead of scanning the whole frame for every user. Each list
    # keeps the frame's row order (most recent rating first).
    items_by_user = training_data.groupby('reviewerID')['asin'].agg(list).to_dict()

    recommendation = {}
    for user in users:
        rated_items = items_by_user.get(user, [])
        items_user_rated = set(rated_items)
        recommend_item_list = []
        recommend_item_set = set()
        # Iterate the ordered list, not the set: set order of strings varies
        # between interpreter runs, which made the output non-reproducible.
        for item in rated_items:
            iid = algo_impl.trainset.to_inner_iid(item)
            for sim_item_iid in algo_impl.get_neighbors(iid, k):
                item_raw_id = algo_impl.trainset.to_raw_iid(sim_item_iid)
                if item_raw_id not in items_user_rated and item_raw_id not in recommend_item_set:
                    recommend_item_list.append(item_raw_id)
                    recommend_item_set.add(item_raw_id)

            if len(recommend_item_list) >= k:
                recommend_item_list = recommend_item_list[:k]
                break
        recommendation[user] = recommend_item_list

    # Rule-based fallback for users who received nothing from CF.
    if rule_based:
        fallback = list(rule_based_recom["asin"][:k])
        for user, items in recommendation.items():
            if not items:
                recommendation[user] = list(fallback)

    return recommendation

# Surprise KNN recommendations (default: item-based) for every test-period
# user, with the rule-based fallback switched on.
ratings_by_surprise = recommender_surprise(
    training_data=ratings_trainings,
    rule_based=True,
    users=users,
)

Computing the cosine similarity matrix...
Done computing similarity matrix.


結果評估

In [9]:
def evaluate(ratings_testings_by_user={}, ratings_by_user={}, method=None):
    '''
    Score recommendations against the items users actually rated in the test period.

    * ratings_testings_by_user: dict {user: [asin, ...]} of items really rated
      in the test period
    * ratings_by_user: dict {user: [asin, ...]} of recommended items learned
      from the training data
    * method: str, currently unused
    * returns score: float, the number of distinct recommended items that
      appear in a user's test list, summed over users and divided by the total
      number of test entries; 0.0 when there are no test entries
    '''
    hits = 0
    n_test_entries = 0
    for user, purchased in ratings_testings_by_user.items():
        # Count the denominator from the argument itself instead of reading a
        # module-level frame, so the function works on any input it is given.
        n_test_entries += len(purchased)
        hits += len(set(ratings_by_user.get(user, ())) & set(purchased))

    if n_test_entries == 0:
        return 0.0
    return hits / n_test_entries


print(evaluate(ratings_testings_by_user, ratings_by_user_based))

# Hand-written version, recommendation accuracy: 0 % (Training data: 2017/9-2018/8)
# Hand-written version, recommendation accuracy: 0 % (Training data: 2016/9-2018/8)

# With rule-based fallback, recommendation accuracy: 9.83% (Training data: 2017/9-2018/8)
# With rule-based fallback, recommendation accuracy: 1.35% (Training data: 2016/9-2018/8)

print(evaluate(ratings_testings_by_user, ratings_by_item_based))

# Hand-written version, recommendation accuracy: 0 % (Training data: 2017/9-2018/8)
# Hand-written version, recommendation accuracy: 0 % (Training data: 2016/9-2018/8)

# With rule-based fallback, recommendation accuracy: 9.66% (Training data: 2017/9-2018/8)
# With rule-based fallback, recommendation accuracy: 1.35% (Training data: 2016/9-2018/8)

print(evaluate(ratings_testings_by_user, ratings_by_surprise))

# Surprise only, recommendation accuracy: 0.17 % (Training data: 2017/9-2018/8)
# Surprise only, recommendation accuracy: 0 % (Training data: 2016/9-2018/8)

# With rule-based fallback, recommendation accuracy: 9.83% (Training data: 2017/9-2018/8)
# With rule-based fallback, recommendation accuracy: 1.35% (Training data: 2016/9-2018/8)
    

0.09830508474576272
0.09661016949152543
0.09830508474576272
