<a href="https://colab.research.google.com/github/andylee50609/data-course-sample/blob/main/Week1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

【參數宣告】

In [2]:
import pandas as pd
import numpy as np

【資料讀取】

In [3]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz

--2021-12-26 06:37:54--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15499476 (15M) [application/octet-stream]
Saving to: ‘All_Beauty.csv’


2021-12-26 06:37:54 (39.6 MB/s) - ‘All_Beauty.csv’ saved [15499476/15499476]

--2021-12-26 06:37:55--  http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10329961 (9.9M) [application/octet-stream]
Saving to: ‘meta_All_Beauty.json.gz’


2021-12-26 06:37:55 (34.4 MB/s) - ‘meta_All_Beauty.json.gz’ saved [10329961/10329961]



In [4]:
metadata = pd.read_json('meta_All_Beauty.json.gz', lines = True)
ratings = pd.read_csv('All_Beauty.csv', names=['asin', 'reviewerID', 'overall', 'unixReviewTime'], header = None)
ratings['DATE'] = pd.to_datetime(ratings['unixReviewTime'], unit='s')

【資料前處理】

In [5]:
# 訓練資料(20180901前的交易)
ratings_trainings = ratings[(ratings['DATE'] < '2018-09-01')]

# 測試資料(20180901-20180930的交易)
ratings_testings = ratings[
    (ratings['DATE'] >= '2018-09-01') & 
    (ratings['DATE'] <= '2018-09-30')]

# groupby[reviewerID],將結果存成list並建成字典
ratings_testings_by_user = ratings_testings.groupby('reviewerID').agg(list).reset_index()[['reviewerID', 'asin']].to_dict('records')
ratings_testings_by_user = { rating['reviewerID']: rating['asin'] for rating in ratings_testings_by_user }
users = list(ratings_testings_by_user.keys())

【特徵組合】

In [6]:
# ====商品銷售排行====

# 紀錄銷售排名資訊
metadata["rank"] = metadata["rank"].str.split(" ").str[0].str.replace(",","").fillna(1000000000).astype(np.int64)
ratings_trainings = pd.merge(ratings_trainings, metadata[["asin","rank"]], on="asin", how ="left")

# ====商品討論熱度====

# 紀錄評論者回覆次數
tmp = ratings_trainings[["asin", "reviewerID"]].groupby("asin").count().rename(columns = {"reviewerID":"reviewerCnt_total"})
ratings_trainings = pd.merge(ratings_trainings, tmp, on="asin", how ="left").fillna(1000000000)

# 紀錄評論者回覆次數(1個月內)
tmp = ratings_trainings[(ratings_trainings['DATE'] >= '2018-08-01')][["asin", "reviewerID"]].groupby("asin").count().rename(columns = {"reviewerID":"reviewerCnt_1M"})
ratings_trainings = pd.merge(ratings_trainings, tmp, on="asin", how ="left").fillna(0)

# 紀錄評論者回覆次數(3個月內)
tmp = ratings_trainings[(ratings_trainings['DATE'] >= '2018-06-01')][["asin", "reviewerID"]].groupby("asin").count().rename(columns = {"reviewerID":"reviewerCnt_3M"})
ratings_trainings = pd.merge(ratings_trainings, tmp, on="asin", how ="left").fillna(0)

# ====商品評論分數====

# 紀錄評論分數平均
tmp = ratings_trainings[["asin", "overall"]].groupby("asin").mean().rename(columns = {"overall":"overallAvg_total"})
ratings_trainings = pd.merge(ratings_trainings, tmp, on="asin", how ="left").fillna(0)

# 紀錄評論分數平均(1個月內)
tmp = ratings_trainings[(ratings_trainings['DATE'] >= '2018-08-01')][["asin", "overall"]].groupby("asin").mean().rename(columns = {"overall":"overallAvg_1M"})
ratings_trainings = pd.merge(ratings_trainings, tmp, on="asin", how ="left").fillna(0)

# 紀錄評論分數平均(3個月內)
tmp = ratings_trainings[(ratings_trainings['DATE'] >= '2018-06-01')][["asin", "overall"]].groupby("asin").mean().rename(columns = {"overall":"overallAvg_3M"})
ratings_trainings = pd.merge(ratings_trainings, tmp, on="asin", how ="left").fillna(0)

【推薦方法】

In [7]:
def recommender(training_data, users=[], k=10):
    '''
    * training_data: dataframe 輸入的訓練資料集（2018-09-01 以前資料）
    * users: [] 需要被推薦的使用者
    * k: int 每個使用者需要推薦的商品數
    * recommendations: dict
      {
          使用者一： [推薦商品一, 推薦商品二, ...],
          使用者二： [...], ...
      }
    '''    
    recommendations = {}        
    training_data = training_data.drop(columns = ["reviewerID","overall","unixReviewTime","DATE"]).drop_duplicates().reset_index(drop=True)
    
    revcnt_total_per = np.percentile(training_data["reviewerCnt_total"].sort_values(ascending=False),95)
    revcnt_1M_per = np.percentile(training_data["reviewerCnt_1M"].sort_values(ascending=False),95)
    revcnt_3M_per = np.percentile(training_data["reviewerCnt_3M"].sort_values(ascending=False),95)
    
    # ===============================
    # Rule1：利用商品銷售排行進行推薦
    # ===============================
    dataSort = training_data.sort_values(by = ["rank"], ascending=True)
    dataSort["ranking"] = dataSort["rank"].rank(method='dense',ascending=True)
    dataSort.reset_index(drop=True, inplace=True)
    rule_1= dataSort['asin'].tolist()[:k]
    
    # =================================
    # Rule2：利用商品熱度及評價進行推薦
    # ================================= 
    dataSort = training_data[training_data["reviewerCnt_total"] >= revcnt_total_per]
    dataSort = dataSort[dataSort["overallAvg_total"] >= 4]
    dataSort["ranking_overallAvg"] = dataSort["overallAvg_total"].rank(method='dense',ascending=False)
    dataSort["ranking_reviewerCnt"] = dataSort["reviewerCnt_total"].rank(method='dense',ascending=False)
    dataSort["ranking"] = dataSort["ranking_overallAvg"] + dataSort["ranking_reviewerCnt"]
    dataSort = dataSort.sort_values("ranking")
    dataSort.reset_index(drop=True, inplace=True)
    rule_2= dataSort['asin'].tolist()[:k]    

    # =================================
    # Rule3：利用商品熱度及評價進行推薦(1個月內)
    # ================================= 
    dataSort = training_data[training_data["reviewerCnt_1M"] >= revcnt_1M_per]
    dataSort = dataSort[dataSort["overallAvg_1M"] >= 4]
    dataSort["ranking_overallAvg"] = dataSort["overallAvg_1M"].rank(method='dense',ascending=False)
    dataSort["ranking_reviewerCnt"] = dataSort["reviewerCnt_1M"].rank(method='dense',ascending=False)
    dataSort["ranking"] = dataSort["ranking_overallAvg"] + dataSort["ranking_reviewerCnt"]
    dataSort = dataSort.sort_values("ranking")
    dataSort.reset_index(drop=True, inplace=True)
    rule_3= dataSort['asin'].tolist()[:k]   

    # =================================
    # Rule4：利用商品熱度及評價進行推薦(3個月內)
    # ================================= 
    dataSort = training_data[training_data["reviewerCnt_3M"] >= revcnt_3M_per]
    dataSort = dataSort[dataSort["overallAvg_3M"] >= 4]
    dataSort["ranking_overallAvg"] = dataSort["overallAvg_3M"].rank(method='dense',ascending=False)
    dataSort["ranking_reviewerCnt"] = dataSort["reviewerCnt_3M"].rank(method='dense',ascending=False)
    dataSort["ranking"] = dataSort["ranking_overallAvg"] + dataSort["ranking_reviewerCnt"]
    dataSort = dataSort.sort_values("ranking")
    dataSort.reset_index(drop=True, inplace=True)
    rule_4= dataSort['asin'].tolist()[:k]   
    
    recommendations ={"rule_1":rule_1,
                      "rule_2":rule_2,
                      "rule_3":rule_3,
                      "rule_4":rule_4}
    
    return recommendations

ratings_by_rule = recommender(ratings_trainings, users)

【評估方法】

In [8]:
def evaluate(ratings_testings_by_user={}, ratings_by_user={}, method=None):
    '''
    * ratings_testings_by_user: dict 真實被購買的商品資料（2018-09-01 以後資料）
    * ratings_by_user: dict 利用訓練資料學習的推薦商品
    * method: str
    * score: float
    '''
    for rule in ratings_by_user:
        total = 0
        for d in ratings_testings_by_user:
            total += len(set(ratings_by_user[rule]) & set(ratings_testings_by_user[d]))
            score = total / len(ratings_testings)
        print(rule + "：" +  str(round(score*100,2)) + "%" )
evaluate(ratings_testings_by_user, ratings_by_rule)
    

rule_1：0.51%
rule_2：0.0%
rule_3：1.69%
rule_4：4.24%
