In [1]:
import json
import pandas as pd
import os
import shutil
import numpy as np
import scipy as sp

DATA_DIR = "../data"
DATA_FILE = os.path.join(DATA_DIR, "data.json")
DUMP_FILE = os.path.join(DATA_DIR, "dump.pkl")

store_columns = (
    "id",  # 음식점 고유번호
    "store_name",  # 음식점 이름
    "branch",  # 음식점 지점 여부
    "area",  # 음식점 위치
    "tel",  # 음식점 번호
    "address",  # 음식점 주소
    "latitude",  # 음식점 위도
    "longitude",  # 음식점 경도
    "category",  # 음식점 카테고리
    "review_cnt", # 리뷰 개수
)

review_columns = (
    "id",  # 리뷰 고유번호
    "store",  # 음식점 고유번호
    "user",  # 유저 고유번호
    "score",  # 평점
    "content",  # 리뷰 내용
    "reg_time",  # 리뷰 등록 시간
)

user_columns = (
    "id", # 유저 고유 번호
    "gender", # 유저 성별
    "born_year", # 유저 생년
    "review", # 리뷰 고유 번호
)


def import_data(data_path=DATA_FILE):
    """
    Req. 1-1-1 음식점 데이터 파일을 읽어서 Pandas DataFrame 형태로 저장합니다
    """

    try:
        with open(data_path, encoding="utf-8") as f:
            data = json.loads(f.read())
    except FileNotFoundError as e:
        print(f"`{data_path}` 가 존재하지 않습니다.")
        exit(1)

    stores = []  # 음식점 테이블
    reviews = []  # 리뷰 테이블
    users = [] # 유저 테이블

    for d in data:

        categories = [c["category"] for c in d["category_list"]]
        stores.append(
            [
                d["id"],
                d["name"],
                d["branch"],
                d["area"],
                d["tel"],
                d["address"],
                d["latitude"],
                d["longitude"],
                "|".join(categories),
                d["review_cnt"]
            ]
        )

        for review in d["review_list"]:
            r = review["review_info"]
            u = review["writer_info"]

            reviews.append(
                [r["id"], d["id"], u["id"], r["score"], r["content"], r["reg_time"]]
            )

            users.append(
                [u["id"], u["gender"], u["born_year"], r["id"]]
            )



    store_frame = pd.DataFrame(data=stores, columns=store_columns)
    review_frame = pd.DataFrame(data=reviews, columns=review_columns)
    user_frame = pd.DataFrame(data=users, columns=user_columns)

    return {"stores": store_frame, "reviews": review_frame, "users": user_frame}


def dump_dataframes(dataframes):
    pd.to_pickle(dataframes, DUMP_FILE)


def load_dataframes():
    return pd.read_pickle(DUMP_FILE)


def main():

    print("[*] Parsing data...")
    data = import_data()
    print("[+] Done")

    print("[*] Dumping data...")
    dump_dataframes(data)
    print("[+] Done\n")

    data = load_dataframes()

    term_w = shutil.get_terminal_size()[0] - 1
    separater = "-" * term_w

    print("[음식점]")
    print(f"{separater}\n")
    print(data["stores"].head())
    print(f"\n{separater}\n\n")

    print("[리뷰]")
    print(f"{separater}\n")
    print(data["reviews"].head())
    print(f"\n{separater}\n\n")


if __name__ == "__main__":
    main()


[*] Parsing data...
[+] Done
[*] Dumping data...
[+] Done

[음식점]
-----------------------------------------------------------------------------------------------------------------------

   id     store_name branch  area            tel                address  \
0   1           Agal   None    홍대  010-6689-5886   서울특별시 마포구 동교동 170-13   
1   2         Assisy   None    광주   062-367-0700    광주광역시 서구 농성동 631-33   
2   3  Battered Sole   None   이태원    02-749-6867   서울특별시 용산구 이태원동 118-9   
3   4      Chakyoung   None  달맞이길   051-756-5566  부산광역시 해운대구 중2동 1509-5   
4   5       Delabobo   None   발산역   02-2667-9854      서울특별시 강서구 등촌동 689   

    latitude   longitude   category  review_cnt  
0  37.556862  126.926666   아구찜|포장마차           0  
1  35.150746  126.890062         카페           0  
2  37.535032  126.991664    피쉬앤칩스|펍           0  
3  35.158587  129.175004  레스토랑|카프레제           0  
4  37.559904  126.840512  디저트카페|디저트           0  

--------------------------------------------------------------

In [2]:
import itertools
from collections import Counter
from parse import load_dataframes
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
from scipy.sparse import csr_matrix


font_list = fm.findSystemFonts(fontpaths=None, fontext="ttf")
if any(["notosanscjk" in font.lower() for font in font_list]):
    plt.rcParams["font.family"] = "Noto Sans CJK JP"
else:
    if not any(["malgun" in font.lower() for font in font_list]):
        raise Exception(
            "Font missing, please install Noto Sans CJK or Malgun Gothic. If you're using ubuntu, try `sudo apt install fonts-noto-cjk`"
        )

    plt.rcParams["font.family"] = "Malgun Gothic"

sns.set_palette(sns.color_palette("Spectral"))
plt.rc("xtick", labelsize=6)


data = load_dataframes()

In [3]:
users = data["users"][["id", "review"]]
print(users)

           id  review
0       68632       1
1      389728       2
2       68716       3
3      774353       4
4      115682       5
...       ...     ...
91393   17371    2401
91394  198050    2402
91395  190766    2403
91396  201564    2404
91397  611078    2405

[91398 rows x 2 columns]


In [4]:
reviews = data["reviews"][["id", "store", "user", "score"]]
reviews.rename(columns = {'id' : 'review_id'}, inplace=True)

print(reviews)

       review_id   store    user  score
0              1      15   68632      5
1              2      18  389728      5
2              3      19   68716      4
3              4      37  774353      2
4              5      38  115682      3
...          ...     ...     ...    ...
91393       2401  360499   17371      5
91394       2402  360505  198050      4
91395       2403  360514  190766      5
91396       2404  360514  201564      4
91397       2405  360515  611078      5

[91398 rows x 4 columns]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [5]:
stores = data["stores"][["id", "store_name"]]
print(stores)

            id     store_name
0            1           Agal
1            2         Assisy
2            3  Battered Sole
3            4      Chakyoung
4            5       Delabobo
...        ...            ...
459983  459996         행복부대찌개
459984  459997           행복부페
459985  459998           행복분식
459986  459999           행복분식
459987  460000           행복분식

[459988 rows x 2 columns]


In [6]:
users_and_reviews = pd.merge(left=users, right=reviews, left_on = ["id", "review"], right_on = ["user", "review_id"])[["id", "store", "score"]]
user_and_review = pd.merge(left = users_and_reviews, right = stores, left_on = "store", right_on = "id", how = "left")[["id_x", "store", "score", "store_name"]].reset_index()
user_and_review.rename(columns = {'id_x' : 'user_id'}, inplace=True)
print(user_and_review)

       index  user_id   store  score  store_name
0          0    68632      15      5         써리힐
1          1   389728      18      5    진삼미 샌드위치
2          2    68716      19      4  한옥마을 전주비빔밥
3          3   774353      37      2       007식당
4          4   115682      38      3       010수산
...      ...      ...     ...    ...         ...
91693  91693    17371  360499      5    지구촌한우생고기
91694  91694   198050  360505      4        지군포차
91695  91695   190766  360514      5          지글
91696  91696   201564  360514      4          지글
91697  91697   611078  360515      5          지글

[91698 rows x 5 columns]


In [10]:
# 평점이 5개 이상 있는 음식점 데이터
store_rating = data["stores"][['id', 'store_name', 'review_cnt']]
store_rating = store_rating[store_rating['review_cnt']>5]
print(store_rating)

            id   store_name  review_cnt
148        149  101번지 남산돈까스          11
220        221          10Q          12
549        550         17도씨           6
641        642    1967 바다지음          10
1712      1713          29펍           6
...        ...          ...         ...
359524  359537         중화복춘           9
359614  359627          중화원           6
359615  359628          중화원           8
360451  360464          지구당          26
360454  360467          지구당           6

[2284 rows x 3 columns]


In [13]:
store_review = pd.merge(left = user_and_review, right = store_rating, left_on = 'store', right_on = 'id')[['user_id', 'store', 'score', 'store_name_x']]
store_review.rename(columns = {'store_name_x' : 'store_name'}, inplace=True)
store_review.rename(columns = {'store' : 'store_id'}, inplace=True)
store_review

Unnamed: 0,user_id,store_id,score,store_name
0,293161,250217,4,신발원
1,293161,250217,4,신발원
2,788031,250217,3,신발원
3,352994,250217,1,신발원
4,10859,250217,5,신발원
...,...,...,...,...
24359,745622,360467,4,지구당
24360,280855,360467,5,지구당
24361,128271,360467,5,지구당
24362,811088,360467,4,지구당


피벗 테이블을 이용해 user - store name 매트릭스 구성

In [15]:
ratings_matrix = store_review.pivot_table('score', index = 'user_id', columns='store_name').fillna(0)
print(ratings_matrix)

store_name  101번지 남산돈까스  10Q  17도씨  1967 바다지음  29펍  33하우스  3일한우국밥  4SEED  \
user_id                                                                    
23                  0.0  0.0   0.0        0.0  0.0    0.0     0.0    0.0   
105                 0.0  0.0   0.0        0.0  0.0    0.0     0.0    0.0   
137                 0.0  0.0   0.0        0.0  0.0    0.0     0.0    0.0   
147                 0.0  0.0   0.0        0.0  0.0    0.0     0.0    0.0   
162                 0.0  0.0   0.0        0.0  0.0    0.0     0.0    0.0   
...                 ...  ...   ...        ...  ...    ...     ...    ...   
948691              0.0  0.0   0.0        0.0  0.0    0.0     0.0    0.0   
949019              0.0  0.0   0.0        0.0  0.0    0.0     0.0    0.0   
949596              0.0  0.0   0.0        0.0  0.0    0.0     0.0    0.0   
949670              0.0  0.0   0.0        0.0  0.0    0.0     0.0    0.0   
950224              0.0  0.0   0.0        0.0  0.0    0.0     0.0    0.0   

store_name 

음식점 x 사용자 매트릭스로 변환

In [16]:
ratings_matrix_T = ratings_matrix.transpose()
ratings_matrix_T

user_id,23,105,137,147,162,166,172,182,189,218,...,947927,948202,948319,948416,948690,948691,949019,949596,949670,950224
store_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
101번지 남산돈까스,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10Q,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17도씨,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1967 바다지음,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29펍,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
중앙해장,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
중평떡볶이,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
중화복춘,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
중화원,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


item-user matrix에서 cosine similarity 사용

In [21]:
from sklearn.metrics.pairwise import cosine_similarity
item_sim = cosine_similarity(ratings_matrix_T, ratings_matrix_T)

item_sim_df = pd.DataFrame(data = item_sim, index = ratings_matrix.columns, columns = ratings_matrix.columns)
print(item_sim_df.head(3))

store_name   101번지 남산돈까스  10Q      17도씨  1967 바다지음       29펍  33하우스    3일한우국밥  \
store_name                                                                      
101번지 남산돈까스     1.000000  0.0  0.082747        0.0  0.000000    0.0  0.147723   
10Q             0.000000  1.0  0.000000        0.0  0.000000    0.0  0.000000   
17도씨            0.082747  0.0  1.000000        0.0  0.127071    0.0  0.000000   

store_name   4SEED  4브라더스  5.5닭갈비  ...  중식당 청담  중앙관  중앙닭강정  중앙떡볶이  중앙식당  중앙해장  \
store_name                         ...                                          
101번지 남산돈까스    0.0    0.0     0.0  ...     0.0  0.0    0.0    0.0   0.0   0.0   
10Q            0.0    0.0     0.0  ...     0.0  0.0    0.0    0.0   0.0   0.0   
17도씨           0.0    0.0     0.0  ...     0.0  0.0    0.0    0.0   0.0   0.0   

store_name   중평떡볶이      중화복춘  중화원  지구당  
store_name                              
101번지 남산돈까스    0.0  0.000000  0.0  0.0  
10Q            0.0  0.000000  0.0  0.0  
17도씨           0.0  0.10

음식점 이름 목록

In [22]:
ratings_matrix.columns

Index(['101번지 남산돈까스', '10Q', '17도씨', '1967 바다지음', '29펍', '33하우스', '3일한우국밥',
       '4SEED', '4브라더스', '5.5닭갈비',
       ...
       '중식당 청담', '중앙관', '중앙닭강정', '중앙떡볶이', '중앙식당', '중앙해장', '중평떡볶이', '중화복춘',
       '중화원', '지구당'],
      dtype='object', name='store_name', length=2017)

3일한우국밥과 유사도가 높은 음식점 5개 추천

In [25]:
item_sim_df['3일한우국밥'].sort_values(ascending=False)[1:6]

store_name
다케       0.279822
목동분식     0.252399
대관원      0.250156
사모님짬뽕    0.247739
이조보쌈     0.239397
Name: 3일한우국밥, dtype: float64

중앙관과 유사도가 높은 음식점 5개 추천

In [26]:
item_sim_df['중앙관'].sort_values(ascending=False)[1:6]

store_name
닭발집      0.405906
우리분식     0.253796
놈파스타     0.235320
빈해원      0.223551
방콕야시장    0.219508
Name: 중앙관, dtype: float64

### TOP-N 

TOP-N을 쓰는 이유는 특정 item과 가장 비슷한 n개의 item들만 유사도 계산에 사용하기 위해서임

In [32]:
def predict_rating_topsim(ratings_arr, item_sim_arr, n = 20):
    pred = np.zeros(ratings_arr.shape)
    
    # user-item의 item 개수 만큼 loop
    for col in range(ratings_arr.shape[1]):
        # 유사도가 큰 순으로 n개의 데이터 행렬의 index 반환
        top_n_items = [np.argsort(item_sim_arr[:, col])[:-n-1:-1]]
        # 각 item 별로 전체 사용자들의 예측 평점
        for row in range(ratings_arr.shape[0]):
            pred[row, col] = item_sim_arr[col, :][top_n_items].dot(ratings_arr[row, :][top_n_items].T)
            pred[row, col] /= np.sum(item_sim_arr[col, :][top_n_items])
    return pred

293161 유저에게 음식점 추천

해당 유저가 방문한 음식점 리스트

In [28]:
username = 293161

user_rating_id = ratings_matrix.loc[username, :]
user_rating_id[user_rating_id > 0].sort_values(ascending=False)[:10]

store_name
동복리 해녀촌    5.0
루엘드파리      5.0
우진해장국      5.0
거대         4.0
금산제면소      4.0
마가만두       4.0
명진전복       4.0
봉산옥        4.0
블랙업커피      4.0
삼촌네        4.0
Name: 293161, dtype: float64

user가 방문하지 않은 음식점 추천하기

In [29]:
def not_visited_store(ratings_matrix, user_id):
    # user_id가 방문한 음식점 list
    user_rating = ratings_matrix.loc[user_id, :]
    visited = user_rating[user_rating>0].index.tolist()
    
    # 모든 store list로 만들기
    store_list = ratings_matrix.columns.tolist()
    
    # user가 방문한 음식점은 제외
    not_visited = [i for i in store_list if i not in visited]
    return not_visited

In [34]:
def recomm_store_by_user(pred_df, user_id, not_visited, top_n):
    recomm_store = pred_df.loc[user_id, not_visited].sort_values(ascending=False)[:top_n]
    return recomm_store

user가 방문하지 않은 음식점

In [31]:
not_visit = not_visited_store(ratings_matrix, 293161)
not_visit

['101번지 남산돈까스',
 '10Q',
 '17도씨',
 '1967 바다지음',
 '29펍',
 '33하우스',
 '3일한우국밥',
 '4SEED',
 '4브라더스',
 '5.5닭갈비',
 '60년전통 할매국밥',
 '63뷔페 파빌리온',
 '646테이블',
 '7HUNDRED',
 '88돼지',
 '88버거',
 '88생선구이',
 '8월의양',
 'BRCD',
 'C27',
 'JB파스타',
 'K375',
 'La Cour 1912',
 'PIZZA UP',
 'WE Ccafe',
 'e99',
 '가나안덕',
 '가디록',
 '가림시골밥상',
 '가마마루이',
 '가마솥보은순대',
 '가마솥족발',
 '가메골 손만두',
 '가미우동',
 '가배도',
 '가보정',
 '가비가배',
 '가시식당',
 '가시아방',
 '가야가야',
 '가야밀면',
 '가야성',
 '가양 칼국수',
 '가온',
 '가재골수제비',
 '가쯔레쯔',
 '가타쯔무리',
 '갈릴리농원',
 '갈비1987',
 '갈비씨',
 '갈현동할머니떡볶이',
 '감각의제국',
 '감나무집',
 '감나무집 기사식당',
 '감성타코',
 '감칠',
 '갓덴스시',
 '갓잇',
 '갓포아키',
 '강강술래',
 '강남',
 '강남곱',
 '강남면옥',
 '강남불백',
 '강릉가는길',
 '강릉감자옹심이',
 '강릉불고기',
 '강서면옥',
 '강원도 막국수',
 '강원도손순두부',
 '강촌숯불닭갈비',
 '강한고집',
 '강화국수',
 '강화손칼국수',
 '개군할머니 토종순대국',
 '개금밀면',
 '개돼지 크래프트 브루펍',
 '개미집',
 '개성집',
 '개천식당',
 '갯마을',
 '거멍국수',
 '거북이식당',
 '거인통닭',
 '건강한빵',
 '건도리횟집',
 '건봉국밥',
 '걸리버막창',
 '겁없는토끼부엌',
 '겐로쿠우동',
 '겐텐',
 '겟썸커피',
 '경미네집',
 '경발원',
 '경복궁',
 '경복궁삼계탕',
 '경상도집',
 '경성양육관',
 '경성함바그',
 '경양식192

CF로 추천하기

In [36]:
# 비슷한 item만 추천에 사용
ratings_pred = predict_rating_topsim(ratings_matrix.values, item_sim_df.values, n = 5)

ratings_pred_matrix = pd.DataFrame(data = ratings_pred, index = ratings_matrix.index, columns = ratings_matrix.columns)

# item 기반의 KNN CF로 추천
recomm_store = recomm_store_by_user(ratings_pred_matrix, 293161, not_visit, top_n = 5)
recomm_store = pd.DataFrame(data=recomm_store.values, index=recomm_store.index, columns = ["score"])

  pred[row, col] = item_sim_arr[col, :][top_n_items].dot(ratings_arr[row, :][top_n_items].T)
  pred[row, col] /= np.sum(item_sim_arr[col, :][top_n_items])


Index(['애정훠궈', '강릉감자옹심이', '오뗄두스', '아찌떡볶이', '달콤한 거짓말'], dtype='object', name='store_name')

In [37]:
for name in recomm_store.index:
    print(name)

애정훠궈
강릉감자옹심이
오뗄두스
아찌떡볶이
달콤한 거짓말
