<a href="https://colab.research.google.com/github/andylee50609/data-course-sample/blob/main/Week2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

【參數宣告】



In [1]:
import pandas as pd
import numpy as np
import string
import re
import nltk
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

【資料讀取】

In [2]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz

--2022-01-03 11:18:59--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15499476 (15M) [application/octet-stream]
Saving to: ‘All_Beauty.csv.1’


2022-01-03 11:19:00 (19.8 MB/s) - ‘All_Beauty.csv.1’ saved [15499476/15499476]

--2022-01-03 11:19:00--  http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10329961 (9.9M) [application/octet-stream]
Saving to: ‘meta_All_Beauty.json.gz.1’


2022-01-03 11:19:01 (15.2 MB/s) - ‘meta_All_Beauty.json.gz.1’ saved [10329961/10329961]



In [3]:
metadata = pd.read_json('meta_All_Beauty.json.gz', lines = True)[['asin', 'title', 'description', 'rank', 'brand']]
ratings = pd.read_csv('All_Beauty.csv', names=['asin', 'reviewerID', 'overall', 'unixReviewTime'], header = None)
ratings['DATE'] = pd.to_datetime(ratings['unixReviewTime'], unit='s')

# 取近期的資料較有參考價值
ratings = ratings[ratings['DATE'] >= "2017-09-01"]

【資料前處理】

In [4]:
# ======== Data Clean ========

# 補充討論熱度及評分資訊
tmp = ratings[["asin","overall"]].groupby("asin").agg({'asin':'size', 'overall':'mean'}).rename(columns={'asin':'reviewNum','overall':'meanScore'}).reset_index()
metadata = pd.merge(metadata, tmp, on="asin", how ="inner").fillna(0)
ratings = pd.merge(ratings, tmp, on="asin", how ="inner").fillna(0)

# 補充商品子類別資訊
metadata['sub_category'] = metadata['rank'].str.split("in ").str[1].replace(r'&amp;','&', regex = True).replace(r'\(','', regex = True)
metadata = metadata.drop(columns = ["rank"])
metadata['brand']= metadata['brand'].str.replace('#','').replace('\(','',regex=True).replace('\)','',regex=True).replace(',','',regex=True).replace('-','',regex=True).replace('\.','',regex=True).replace('\'','',regex=True).replace('\*','',regex=True).replace('', np.nan)
rule_based_recom = metadata[metadata["reviewNum"]>=300][["asin","meanScore"]].sort_values("meanScore", ascending = False).drop_duplicates()


# ========Testing/Training Set========

# 訓練資料(20180901前的交易)
ratings_trainings = ratings[(ratings['DATE'] < '2018-09-01')]

# 測試資料(20180901-20180930的交易)
ratings_testings = ratings[
    (ratings['DATE'] >= '2018-09-01') & 
    (ratings['DATE'] <= '2018-09-30')]

# groupby[reviewerID],將結果存成list並建成字典
ratings_testings_by_user = ratings_testings.groupby('reviewerID').agg(list).reset_index()[['reviewerID', 'asin']].to_dict('records')
ratings_testings_by_user = { rating['reviewerID']: rating['asin'] for rating in ratings_testings_by_user }
users = list(ratings_testings_by_user.keys())


# ======== 結構化資料處理(sub_category、brand) ========
# 將subCat轉成稀疏矩陣
dummy_subCat = pd.get_dummies(metadata['sub_category'] , columns = ['subcat'])

# 將brand轉成稀疏矩陣
dummy_brand = pd.get_dummies(metadata['brand'] , columns = ['brand'])

# ======== 非結構化資料處理(title、description) ========

# 把 description 從 list 轉為 str
metadata['description'] = metadata['description'].apply(lambda x: ' '.join(x))

#將 title 與 description 合併
metadata['title_description'] = metadata['title'] + metadata['description']
metadata['title_description'] = metadata['title_description'].str.lower()

【產生推薦】

In [5]:
# 計算商品用標題所表示的 tfidf 矩陣 (文本向量化)
df = metadata.drop_duplicates('title_description')
tf = TfidfVectorizer(analyzer='word', stop_words=("english"))
tfidf_matrix = tf.fit_transform(df['title_description'])

# 加入結構化資料
tfidf_df = pd.DataFrame(data=tfidf_matrix.toarray(), columns=tf.get_feature_names())
tfidf_df = pd.concat([tfidf_df, dummy_subCat, dummy_brand], axis=1).fillna(0)

# 計算商品間的相似程度
similarity_matrix = cosine_similarity(tfidf_df)
mapping = pd.Series(metadata.index,index = metadata['title_description'])



def recommend_item(item_input, k=2):
    try:
        item_index = mapping[item_input]
        similarity_score = list(enumerate(similarity_matrix[item_index]))
        similarity_score = sorted(similarity_score, key=lambda x: x[1], reverse=True)
        similarity_score = similarity_score[:k]
        item_indices = [i[0] for i in similarity_score]
        return (df['asin'].iloc[item_indices].tolist())
    except:
        return []

# 利用使用者購買過的商品產生推薦
def recommend_items(items, k):
    res = []
    for d in items:
        res.extend(recommend_item(d, k))
    return res


def recommender(training_data, users=[], k=10):
    '''
    * training_data: dataframe 輸入的訓練資料集（2018-09-01 以前資料）
    * users: [] 需要被推薦的使用者
    * k: int 每個使用者需要推薦的商品數
    * recommendations: dict
      {
          使用者一： [推薦商品一, 推薦商品二, ...],
          使用者二： [...], ...
      }
    '''
    recommendations = {}
    ratings_trainings = training_data
    
    for user in users:
      recom_list = recommend_items(metadata[metadata['asin'].isin(ratings_trainings[ratings_trainings['reviewerID'] == user]['asin'].tolist())]['title_description'].tolist(), k)[:k]
      if recom_list:
        recommendations[user] = recom_list
      else:
        recommendations[user] = list(rule_based_recom["asin"])[:k]

    return recommendations

ratings_by_user = recommender(ratings_trainings, users)



【結果評估】

In [6]:
def evaluate(ratings_testings_by_user={}, ratings_by_user={}, method=None):
    '''
    * ratings_testings_by_user: dict 真實被購買的商品資料（2018-09-01 以後資料）
    * ratings_by_user: dict 利用訓練資料學習的推薦商品
    * method: str
    * score: float
    '''
    total = 0
    for d in ratings_testings_by_user:
        if d in ratings_by_user:
            total += len(set(ratings_by_user[d]) & set(ratings_testings_by_user[d]))

    score = total / len(ratings_testings)
    return score

evaluate(ratings_testings_by_user, ratings_by_user)

0.09830508474576272