<a href="https://colab.research.google.com/github/YCPNG/data-course-sample/blob/main/S4_A4_Content_based_Filtering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sample Code

## 基礎建設

In [None]:
import pandas as pd
import gzip, json

def parse(path):
    """Yield one decoded JSON object per line of a gzip-compressed file.

    Args:
        path: Path of a gzip file that holds one JSON document per line.

    Yields:
        The value ``json.loads`` returns for each line, in file order.
    """
    # The context manager closes the handle once the generator is exhausted,
    # closed or garbage-collected; previously the file was never closed.
    with gzip.open(path, 'rb') as g:
        for l in g:
            yield json.loads(l)

def getDF(path):
    """Load every record produced by ``parse(path)`` into a DataFrame.

    Records are numbered 0, 1, 2, ... in the order they are read; that
    number is used as the row label of the resulting frame.
    """
    records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records, orient='index')

## 載入資料

In [None]:
# Download the All_Beauty ratings CSV and the gzip-compressed product metadata
# into the current working directory.
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz

--2022-01-02 15:08:24--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15499476 (15M) [application/octet-stream]
Saving to: ‘All_Beauty.csv’


2022-01-02 15:08:25 (24.6 MB/s) - ‘All_Beauty.csv’ saved [15499476/15499476]

--2022-01-02 15:08:25--  http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10329961 (9.9M) [application/octet-stream]
Saving to: ‘meta_All_Beauty.json.gz’


2022-01-02 15:08:25 (19.1 MB/s) - ‘meta_All_Beauty.json.gz’ saved [10329961/10329961]



In [None]:
# Relative paths: the files are read from the working directory, which is
# where the download cell saves them, instead of assuming Colab's /content.
metadata = getDF('meta_All_Beauty.json.gz')
# Read the identifier columns as text. Left to type inference, all-digit ASINs
# are parsed as numbers and lose any leading zero, after which they can no
# longer match the string ASINs in `metadata`.
ratings = pd.read_csv(
    'All_Beauty.csv',
    names=['asin', 'reviewerID', 'overall', 'unixReviewTime'],
    header=None,
    dtype={'asin': str, 'reviewerID': str},
)

In [None]:
metadata.head(3)

Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes
0,[],,[Loud 'N Clear Personal Sound Amplifier allows...,,Loud 'N Clear&trade; Personal Sound Amplifier,[],,idea village,[],"2,938,573 in Beauty & Personal Care (",[],{'ASIN: ': '6546546450'},All Beauty,,,,6546546450,[],[]
1,[],,[No7 Lift & Luminate Triple Action Serum 50ml ...,,No7 Lift &amp; Luminate Triple Action Serum 50...,"[B01E7LCSL6, B008X5RVME]",,,[],"872,854 in Beauty & Personal Care (",[],"{'Shipping Weight:': '0.3 ounces (', 'ASIN: ':...",All Beauty,"class=""a-bordered a-horizontal-stripes a-spa...",,$44.99,7178680776,[],[]
2,[],,[No7 Stay Perfect Foundation now stays perfect...,,No7 Stay Perfect Foundation Cool Vanilla by No7,[],,No7,[],"956,696 in Beauty & Personal Care (","[B01B8BR0O8, B01B8BR0NO, B014MHXXM8]","{'Shipping Weight:': '3.5 ounces (', 'ASIN: ':...",All Beauty,,,$28.76,7250468162,[],[]


In [None]:
# Interpret `unixReviewTime` as seconds since the Unix epoch and store the
# resulting timestamps in a new `DATE` column.
ratings['DATE'] = pd.to_datetime(ratings['unixReviewTime'], unit='s')

In [None]:
ratings.head(3)

Unnamed: 0,asin,reviewerID,overall,unixReviewTime,DATE
0,143026860,A1V6B6TNIC10QE,1.0,1424304000,2015-02-19
1,143026860,A2F5GHSXFQ0W6J,4.0,1418860800,2014-12-18
2,143026860,A1572GUYS7DGSR,4.0,1407628800,2014-08-10


In [None]:
print('metadata:', metadata.shape, 'ratings:', ratings.shape)

metadata: (32892, 19) ratings: (371345, 5)


## 資料整理

In [None]:
# Clean the metadata: drop rows whose `asin` duplicates an earlier row.
metadata_c = metadata.drop_duplicates('asin')

In [None]:
# Keep only the columns used downstream. The explicit copy makes `metadata_c`
# an independent frame, so the column assignments in the following cells do
# not trigger pandas' SettingWithCopyWarning.
metadata_c = metadata_c[['description', 'title', 'brand', 'rank', 'asin']].copy()
metadata_c.head(3)

Unnamed: 0,description,title,brand,rank,asin
0,[Loud 'N Clear Personal Sound Amplifier allows...,Loud 'N Clear&trade; Personal Sound Amplifier,idea village,"2,938,573 in Beauty & Personal Care (",6546546450
1,[No7 Lift & Luminate Triple Action Serum 50ml ...,No7 Lift &amp; Luminate Triple Action Serum 50...,,"872,854 in Beauty & Personal Care (",7178680776
2,[No7 Stay Perfect Foundation now stays perfect...,No7 Stay Perfect Foundation Cool Vanilla by No7,No7,"956,696 in Beauty & Personal Care (",7250468162


In [None]:
# Turn `rank` into a number and keep its text part as a separate `rank_cat` column.
# `rank` is read from the un-deduplicated `metadata`; the assignments at the
# bottom line the values up with `metadata_c` by index label.
rank = metadata['rank']
# Split on whitespace into one column per token. For a value such as
# "2,938,573 in Beauty & Personal Care (" token 0 is the number and
# tokens 2-5 are "Beauty & Personal Care".
rank_str = rank.str.split(expand = True)
rank_num = rank_str[0]
# NOTE(review): the category is always rebuilt from exactly tokens 2-5, i.e. it
# assumes a four-token category name -- confirm this holds for every row.
rank_cat = rank_str[2] + " " + rank_str[3] + " " + rank_str[4] + " " + rank_str[5]
# Remove the thousands separators before converting to a numeric dtype.
rank_num = pd.to_numeric(rank_num.str.replace(',',''))
metadata_c['rank'] = rank_num
metadata_c['rank_cat'] = rank_cat

In [None]:
# Lower-case every title and remove commas.
# `.str.replace` edits the text inside each string. The previous
# `Series.replace(',', '')` only rewrote cells whose whole value was ',',
# so commas inside titles were left in place.
metadata_c['title'] = metadata_c['title'].str.lower().str.replace(',', '', regex=False)

In [None]:
# Merge each description (a list of text fragments) into one space-separated string.
# Values that are not lists pass through unchanged, so running this cell a
# second time does not split an already merged string into single characters.
metadata_c['description'] = metadata_c['description'].apply(
    lambda parts: ' '.join(parts) if isinstance(parts, list) else parts
)

In [None]:
# Lower-case every description and remove commas.
# `.str.replace` edits the text inside each string. The previous
# `Series.replace(',', '')` only rewrote cells whose whole value was ',',
# so commas inside descriptions were left in place.
metadata_c['description'] = metadata_c['description'].str.lower().str.replace(',', '', regex=False)

In [None]:
# Replace every run of non-word characters in the titles with a single space.
import re

# Raw string literal: '\W' inside a plain string is an invalid escape sequence,
# which Python reports with a warning. The compiled pattern is unchanged.
metadata_c['title'] = [re.sub(r'\W+', ' ', title) for title in metadata_c['title']]
metadata_c['title']

0              loud n clear trade personal sound amplifier
1        no7 lift amp luminate triple action serum 50ml...
2          no7 stay perfect foundation cool vanilla by no7
3        wella koleston perfect hair colour 44 44 mediu...
4        lacto calamine skin balance oil control 120 ml...
                               ...                        
32887     barielle pro textured grip cuticle nipper purple
32888     buy 3 get 1 free salon perfect eye makeup cor...
32889     now d mannose 500 mg 120 veg capsules pack of 3 
32890    12 white feather shuttlecocks birdies badminto...
32891    feshfen scrunchy scrunchies synthetic hair bun...
Name: title, Length: 32488, dtype: object

In [None]:
# Replace every run of non-word characters in the descriptions with a single space.
# Raw string literal: '\W' inside a plain string is an invalid escape sequence,
# which Python reports with a warning. The compiled pattern is unchanged.
metadata_c['description'] = [re.sub(r'\W+', ' ', des) for des in metadata_c['description']]

In [None]:
# Normalise `rank_cat`: lower-case it, then remove '&' and any 'amp;' text.
metadata_c['rank_cat'] = (
    metadata_c['rank_cat']
    .str.lower()
    .str.replace('&', '')
    .str.replace('amp;', '')
)

In [None]:
# Build one text field per product: description, title and brand joined by
# single spaces. This is the text that the vectorizers below consume.
all_text = metadata_c['description'] + " " + metadata_c['title'] + " " + metadata_c['brand']
metadata_c['all_text'] = all_text

In [None]:
# From here on only the product id and its combined text are needed.
metadata_c = metadata_c[['asin', 'all_text']]

In [None]:
metadata_c

Unnamed: 0,asin,all_text
0,6546546450,loud n clear personal sound amplifier allows y...
1,7178680776,no7 lift luminate triple action serum 50ml by ...
2,7250468162,no7 stay perfect foundation now stays perfect ...
3,7367905066,wella koleston perfect hair colour 44 44 medi...
4,7414204790,lacto calamine skin balance daily nourishing l...
...,...,...
32887,B01HIWLLUK,barielle pro textured grip cuticle nipper pur...
32888,B01HJ1K3YK,buy 3 get 1 free salon perfect eye makeup co...
32889,B01HJ84SGM,now d mannose 500 mg 120 veg capsules pack of...
32890,B01HJASD20,brand new and high quality br enables fast vol...


## 資料切分

In [None]:
# Chronological split: ratings dated before 2018-09-01 form the training set,
# ratings dated from 2018-09-01 through 2018-09-30 form the test set.
ratings_trainings = ratings[ratings['DATE'] < '2018-09-01']
ratings_testings = ratings[
    (ratings['DATE'] >= '2018-09-01')
    & (ratings['DATE'] <= '2018-09-30')
]
# Map every reviewer in the test period to the list of ASINs they rated.
ratings_testings_by_user = (
    ratings_testings.groupby('reviewerID')['asin'].agg(list).to_dict()
)
users = list(ratings_testings_by_user.keys())

In [None]:
# Alternative training set: only the three months before the test period
# (2018-06-01 inclusive up to, but excluding, 2018-09-01).
ratings_trainings_3m = ratings[
    (ratings['DATE'] >= '2018-06-01') & 
    (ratings['DATE'] < '2018-09-01')
]

In [None]:
# Remove stop words: vectorize `all_text` with scikit-learn's built-in English
# stop-word list.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer_rmsw = CountVectorizer(stop_words='english')
# NOTE(review): `raw_sklearn_text` is not referenced by any later cell shown in
# this notebook -- confirm whether this step is still needed.
raw_sklearn_text = vectorizer_rmsw.fit_transform(metadata_c['all_text'].values.astype('U'))

## 產生推薦

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Prepare the input for the tf-idf matrix: keep one product per distinct
# `all_text` value (the combined description/title/brand text, not only the title).
df = metadata_c.drop_duplicates('all_text')

In [None]:
df

Unnamed: 0,asin,all_text
0,6546546450,loud n clear personal sound amplifier allows y...
1,7178680776,no7 lift luminate triple action serum 50ml by ...
2,7250468162,no7 stay perfect foundation now stays perfect ...
3,7367905066,wella koleston perfect hair colour 44 44 medi...
4,7414204790,lacto calamine skin balance daily nourishing l...
...,...,...
32887,B01HIWLLUK,barielle pro textured grip cuticle nipper pur...
32888,B01HJ1K3YK,buy 3 get 1 free salon perfect eye makeup co...
32889,B01HJ84SGM,now d mannose 500 mg 120 veg capsules pack of...
32890,B01HJASD20,brand new and high quality br enables fast vol...


In [None]:
# Renumber the rows 0..n-1 so that row labels equal row positions; the old
# labels are kept in a new `index` column, which the next cell deletes.
df = df.reset_index()

In [None]:
# Drop the `index` column left behind by `reset_index()`.
del df['index']

In [None]:
# Word-level tf-idf over `all_text` with English stop words removed. A float
# `min_df` is a document-frequency proportion, so terms found in fewer than
# 0.1% of the products are ignored.
tf = TfidfVectorizer(analyzer='word', stop_words='english', min_df=0.001)
tfidf_matrix = tf.fit_transform(df['all_text'].values.astype('str'))

In [None]:
# Compute the pairwise similarity between products.
from sklearn.metrics.pairwise import cosine_similarity
# Row i / column j holds the cosine similarity between the tf-idf vectors of
# rows i and j of `df`.
similarity_matrix = cosine_similarity(tfidf_matrix)
# Lookup table from ASIN to the product's row label in `df`.
mapping = pd.Series(df.index,index = df['asin'])

In [None]:
# For one item, return the k most similar items.
def recommend_item(item_input, k=2):
    """Return the ASINs of the ``k`` rows most similar to ``item_input``.

    Args:
        item_input: ASIN of the query item; looked up in ``mapping``.
        k: Number of ASINs to return.

    Returns:
        A list of ASINs ordered from highest to lowest similarity score, or
        ``[]`` when ``item_input`` is not a key of ``mapping``. The query item
        is not removed from the ranking, so it can appear in its own result.
    """
    try:
        item_index = mapping[item_input]
    except KeyError:
        # Unknown item: nothing to recommend. Only the failed lookup is
        # handled; the previous bare ``except`` also hid genuine errors and
        # even KeyboardInterrupt.
        return []
    similarity_score = sorted(
        enumerate(similarity_matrix[item_index]),
        key=lambda x: x[1],
        reverse=True,
    )[:k]
    item_indices = [i for i, _ in similarity_score]
    return df['asin'].iloc[item_indices].tolist()

In [None]:
# Generate recommendations from the items a user has already bought.
def recommend_items(items, k):
    """Concatenate ``recommend_item(item, k)`` for every item, in input order."""
    return [asin for item in items for asin in recommend_item(item, k)]

In [None]:
def recommender(training_data, users=[], k=30):
    '''
    Build content-based recommendations for each requested user.

    * training_data: DataFrame of ratings with `reviewerID` and `asin` columns
      (the notebook passes the ratings dated before 2018-09-01)
    * users: [] reviewer IDs that need recommendations
    * k: int number of similar items requested for EACH item a user has rated;
      a user's list can therefore be longer than k and may contain repeats
    * recommendations: dict
      {
          user one: [recommended asin, recommended asin, ...],
          user two: [...], ...
      }
      Users without any rated item present in `metadata_c` map to [].
    '''
    # Collect each requested user's rated items in a single pass instead of
    # filtering the whole training frame once per user.
    wanted = set(users)
    history = {}
    for reviewer, asin in zip(training_data['reviewerID'], training_data['asin']):
        if reviewer in wanted:
            history.setdefault(reviewer, []).append(asin)

    # Only items present in the product metadata can be used as queries; they
    # are passed on in metadata order.
    catalog = metadata_c['asin']
    recommendations = {
        user: recommend_items(catalog[catalog.isin(history.get(user, []))].tolist(), k)
        for user in users
    }

    return recommendations

# Recommendations for the test users built from the full training history.
ratings_by_user = recommender(ratings_trainings, users)

In [None]:
def recommender_3m(training_data, users=[], k=30):
    '''
    Build recommendations from a three-month training window.

    * training_data: DataFrame of ratings with `reviewerID` and `asin` columns
      (intended for the ratings of the three months before 2018-09-01)
    * users: [] reviewer IDs that need recommendations
    * k: int forwarded unchanged to `recommender`
    * recommendations: dict
      {
          user one: [recommended asin, recommended asin, ...],
          user two: [...], ...
      }
    '''
    # This function used to be a line-for-line copy of `recommender`; it now
    # delegates so that there is a single implementation to maintain.
    return recommender(training_data, users, k)

# Recommendations for the test users built from the three-month training
# window. Note that this calls `recommender`, not `recommender_3m`.
ratings_by_user_3m = recommender(ratings_trainings_3m, users)

## 結果評估

In [None]:
def evaluate(ratings_testings_by_user={}, ratings_by_user={}, method=None):
    '''
    Share of test-period purchases that were among the recommendations.

    * ratings_testings_by_user: dict {reviewerID: [asin, ...]} items actually
      bought in the test period
    * ratings_by_user: dict {reviewerID: [asin, ...]} items recommended from
      the training data
    * method: str, accepted but not used
    * score: float, distinct recommended-and-bought items summed over users,
      divided by the total number of test purchases; 0.0 when there are none
    '''
    total = 0
    purchases = 0
    for user, bought in ratings_testings_by_user.items():
        purchases += len(bought)
        if user in ratings_by_user:
            total += len(set(ratings_by_user[user]) & set(bought))

    # The denominator is derived from the argument rather than from the global
    # `ratings_testings` frame, so the score always describes the data that was
    # passed in. NOTE(review): the two counts agree as long as every test row
    # has a reviewerID -- confirm.
    if purchases == 0:
        return 0.0

    score = total / purchases
    return score

evaluate(ratings_testings_by_user, ratings_by_user)

0.005084745762711864

In [None]:
evaluate(ratings_testings_by_user, ratings_by_user_3m)

0.003389830508474576