<a href="https://colab.research.google.com/github/andylee50609/data-course-sample/blob/main/Week4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

【參數宣告】

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

【資料讀取】

In [None]:
# Training data: recipes (with `id` renamed to `recipe_id`), ingredients,
# and the users' favorites.
recipes_df = pd.read_json("recipes.jsonl.gz", lines=True, compression="gzip")
recipes_df = recipes_df.rename(columns={"id": "recipe_id"})
ingredients_df = pd.read_json("ingredients.jsonl.gz", lines=True, compression="gzip")
favorites_df = pd.read_json("favorites.json.gz", compression="gzip")

# Test data: the held-out favorites and the list of user ids to predict for.
# The id file has no header row, so the column name is supplied explicitly.
test_favorites_df = pd.read_json("test_favorites.json.gz", compression="gzip")
test_user_ids = pd.read_csv("test_user_ids.txt.gz", compression="gzip", names=["user_id"])

【資料探索(EDA)】

In [None]:
# Add the number of favorites each recipe received in favorites_df
# (the 2020 favorites, per the original note) as an extra feature.
tmp = (
    favorites_df[["recipe_id", "user_id"]]
    .groupby("recipe_id")
    .count()
    .rename(columns={"user_id": "favorites_count_2020"})
    .reset_index()
)
# Drop a previously merged column first so re-running this cell does not
# produce `favorites_count_2020_x` / `_y` duplicates.
recipes_df = recipes_df.drop(columns=["favorites_count_2020"], errors="ignore")
recipes_df = pd.merge(recipes_df, tmp, how="left", on="recipe_id")
# Recipes nobody favorited have no row in `tmp`; treat them as zero favorites.
recipes_df["favorites_count_2020"] = recipes_df["favorites_count_2020"].fillna(0)

# Correlation heatmap of the numeric features.
# Author's observation: dishes_count ("cooked it" count) is clearly positively
# correlated with favorites_count; photo_steps_count is slightly correlated,
# and the remaining features are essentially uncorrelated.
plt.figure(figsize=(10, 8))
# Restrict to numeric/bool columns explicitly: recent pandas versions no longer
# drop non-numeric columns silently in DataFrame.corr() and raise instead.
feature_corr = recipes_df.select_dtypes(include=["number", "bool"]).corr()
sns.heatmap(feature_corr, annot=True)

# Author's observation: favorites are notably high when dishes_count is 46, 45 or 167.
plt.figure(figsize=(16, 8))
# x/y are passed by keyword: newer seaborn releases reject positional x/y.
# `ci=None` is kept (rather than `errorbar=None`) so older seaborn still works.
sns.barplot(x='dishes_count', y='favorites_count', data=recipes_df, ci=None)

# Author's observation: favorites are notably high when photo_steps_count is 36, 30 or 27.
plt.figure(figsize=(16, 8))
sns.barplot(x='photo_steps_count', y='favorites_count', data=recipes_df, ci=None)

【資料前處理】

In [None]:
# Group the test favorites by user and build a dictionary keyed by user_id
# whose values are the lists of recipe_ids that user favorited.
test_favorite_records = test_favorites_df.groupby('user_id').agg(list).reset_index().to_dict('records')
test_favorite_dict = {row['user_id']: row['recipe_id'] for row in test_favorite_records}

# Outer-join the users seen in the training favorites with the test users to
# check whether every test user already has a favorites history.
# (Author's finding on this dataset: all test users are existing users.)
# Missing user ids are dropped rather than replaced with 0, so no artificial
# user id can accidentally match a real test user.
known_user_ids = favorites_df["user_id"].dropna().drop_duplicates().rename("old_user_id")
tmp = pd.merge(known_user_ids, test_user_ids, how="outer", left_on="old_user_id", right_on="user_id")

is_test_user = tmp["user_id"].notnull()
has_history = tmp["old_user_id"].notnull()
# Test users with no training favorites.
new_cust = tmp[is_test_user & ~has_history]
# Test users that do have training favorites. Requiring both sides to be
# present keeps new users out of this frame.
old_cust = tmp[is_test_user & has_history]

【Rule_Based 規則】

In [None]:
# Signals used by the rule-based ranking. Each one is converted to a dense
# percentile rank in descending order, so the largest (or most recent) value
# receives the smallest rank.
ranking_columns = [
    "favorites_count",       # number of favorites
    "favorites_count_2020",  # number of favorites in 2020
    "dishes_count",          # number of "cooked it" dishes
    "photo_steps_count",     # number of steps with photos
    "published_at",          # publication time
]

# Rank each signal separately and join the results on the recipes' row index.
rank_list = None
for column in ranking_columns:
    tmp = recipes_df[column].rank(method="dense", ascending=False, pct=True).reset_index()
    if rank_list is None:
        rank_list = tmp
    else:
        rank_list = pd.merge(rank_list, tmp, how="left", on="index")

# Attach recipe_id by joining back to recipes_df on the same row index.
recipe_ids = recipes_df.reset_index()[["index", "recipe_id"]]
rank_list = pd.merge(rank_list, recipe_ids, how="left", on="index")

# Sum the per-signal ranks into the final score, accumulating column by
# column in the listed order.
total_rank = rank_list[ranking_columns[0]]
for column in ranking_columns[1:]:
    total_rank = total_rank + rank_list[column]
rank_list["totalRank"] = total_rank

# Lowest combined rank first, i.e. best recipes at the top.
rank_list = rank_list.sort_values("totalRank").reset_index(drop=True)

【進行推薦】

In [None]:
def recommender(users=None, k=3):
    """Recommend the same top-``k`` ranked recipes to every user.

    This is a non-personalised, rule-based recommender: it takes the first
    ``k`` entries of the module-level ``rank_list["recipe_id"]`` and assigns
    them to each user.

    Args:
        users: Iterable of user ids (list, pandas Series, ...). ``None``
            is treated as "no users".
        k: Number of recipes to recommend per user.

    Returns:
        dict mapping each user id to its own list of recommended recipe ids.
    """
    # Compare against None explicitly: `not users` would raise for a pandas
    # Series, whose truth value is ambiguous.
    if users is None:
        return {}

    # The top-k slice is identical for everyone, so compute it once.
    top_recipes = rank_list["recipe_id"][:k].tolist()

    # Give every user an independent copy so that mutating one user's
    # recommendations cannot affect another's.
    return {user: list(top_recipes) for user in users}

# Build recommendations for every user id in the test user list,
# using recommender's default number of recipes per user.
predictions = recommender(test_user_ids['user_id'])

【評估結果】

In [None]:
def evaluate(real_results=None, predicted_results=None):
    """Score recommendations as hits divided by the number of real favorites.

    A "hit" is a recommended recipe that the same user really favorited.

    Args:
        real_results: The ground-truth favorites. Either a DataFrame with
            ``user_id`` and ``recipe_id`` columns (one row per favorite), or
            a dict mapping user id to an iterable of favorited recipe ids.
        predicted_results: dict mapping user id to the recommended recipe ids.

    Returns:
        float: total hits over all predicted users divided by the total
        number of real favorites; ``0.0`` when there are no real favorites.
    """
    if real_results is None:
        real_results = {}
    if predicted_results is None:
        predicted_results = {}

    # Index the ground truth by user once, instead of re-filtering the whole
    # table for every predicted user.
    if hasattr(real_results, "groupby"):
        # DataFrame input: one row per (user_id, recipe_id) favorite.
        favorites_by_user = {
            user: set(recipes)
            for user, recipes in real_results.groupby("user_id")["recipe_id"]
        }
        total_favorites = real_results.shape[0]
    else:
        # Mapping input: {user_id: [recipe_id, ...]}.
        favorites_by_user = {
            user: set(recipes) for user, recipes in real_results.items()
        }
        total_favorites = sum(len(recipes) for recipes in real_results.values())

    # Nothing to hit: avoid dividing by zero.
    if total_favorites == 0:
        return 0.0

    no_favorites = set()
    hits = sum(
        len(set(recommended) & favorites_by_user.get(user, no_favorites))
        for user, recommended in predicted_results.items()
    )
    return hits / total_favorites

# Score the recommendations against the held-out test favorites.
# As the last expression of the cell, the returned score is displayed.
evaluate(test_favorites_df, predictions)