# Data import

In [22]:
import json
import pandas as pd
import os
import shutil
import numpy as np
import scipy as sp

DATA_DIR = "../data"
DATA_FILE = os.path.join(DATA_DIR, "data.json")
DUMP_FILE = os.path.join(DATA_DIR, "dump.pkl")

store_columns = (
    "id",  # 음식점 고유번호
    "store_name",  # 음식점 이름
    "branch",  # 음식점 지점 여부
    "area",  # 음식점 위치
    "tel",  # 음식점 번호
    "address",  # 음식점 주소
    "latitude",  # 음식점 위도
    "longitude",  # 음식점 경도
    "category",  # 음식점 카테고리
    "review_cnt", # 리뷰 개수
)

review_columns = (
    "id",  # 리뷰 고유번호
    "store",  # 음식점 고유번호
    "user",  # 유저 고유번호
    "score",  # 평점
    "content",  # 리뷰 내용
    "reg_time",  # 리뷰 등록 시간
)

user_columns = (
    "id", # 유저 고유 번호
    "gender", # 유저 성별
    "born_year", # 유저 생년
    "review", # 리뷰 고유 번호
)


def import_data(data_path=DATA_FILE):
    """
    Req. 1-1-1 음식점 데이터 파일을 읽어서 Pandas DataFrame 형태로 저장합니다
    """

    try:
        with open(data_path, encoding="utf-8") as f:
            data = json.loads(f.read())
    except FileNotFoundError as e:
        print(f"`{data_path}` 가 존재하지 않습니다.")
        exit(1)

    stores = []  # 음식점 테이블
    reviews = []  # 리뷰 테이블
    users = [] # 유저 테이블

    for d in data:

        categories = [c["category"] for c in d["category_list"]]
        stores.append(
            [
                d["id"],
                d["name"],
                d["branch"],
                d["area"],
                d["tel"],
                d["address"],
                d["latitude"],
                d["longitude"],
                "|".join(categories),
                d["review_cnt"]
            ]
        )

        for review in d["review_list"]:
            r = review["review_info"]
            u = review["writer_info"]

            reviews.append(
                [r["id"], d["id"], u["id"], r["score"], r["content"], r["reg_time"]]
            )

            users.append(
                [u["id"], u["gender"], u["born_year"], r["id"]]
            )



    store_frame = pd.DataFrame(data=stores, columns=store_columns)
    review_frame = pd.DataFrame(data=reviews, columns=review_columns)
    user_frame = pd.DataFrame(data=users, columns=user_columns)

    return {"stores": store_frame, "reviews": review_frame, "users": user_frame}


def dump_dataframes(dataframes):
    pd.to_pickle(dataframes, DUMP_FILE)


def load_dataframes():
    return pd.read_pickle(DUMP_FILE)


def main():

    print("[*] Parsing data...")
    data = import_data()
    print("[+] Done")

    print("[*] Dumping data...")
    dump_dataframes(data)
    print("[+] Done\n")

    data = load_dataframes()

    term_w = shutil.get_terminal_size()[0] - 1
    separater = "-" * term_w

    print("[음식점]")
    print(f"{separater}\n")
    print(data["stores"].head())
    print(f"\n{separater}\n\n")

    print("[리뷰]")
    print(f"{separater}\n")
    print(data["reviews"].head())
    print(f"\n{separater}\n\n")


if __name__ == "__main__":
    main()


[*] Parsing data...
[+] Done
[*] Dumping data...
[+] Done

[음식점]
-----------------------------------------------------------------------------------------------------------------------

   id     store_name branch  area            tel                address  \
0   1           Agal   None    홍대  010-6689-5886   서울특별시 마포구 동교동 170-13   
1   2         Assisy   None    광주   062-367-0700    광주광역시 서구 농성동 631-33   
2   3  Battered Sole   None   이태원    02-749-6867   서울특별시 용산구 이태원동 118-9   
3   4      Chakyoung   None  달맞이길   051-756-5566  부산광역시 해운대구 중2동 1509-5   
4   5       Delabobo   None   발산역   02-2667-9854      서울특별시 강서구 등촌동 689   

    latitude   longitude   category  review_cnt  
0  37.556862  126.926666   아구찜|포장마차           0  
1  35.150746  126.890062         카페           0  
2  37.535032  126.991664    피쉬앤칩스|펍           0  
3  35.158587  129.175004  레스토랑|카프레제           0  
4  37.559904  126.840512  디저트카페|디저트           0  

--------------------------------------------------------------

# Data 

In [23]:
import itertools
from collections import Counter
from parse import load_dataframes
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
from scipy.sparse import csr_matrix


font_list = fm.findSystemFonts(fontpaths=None, fontext="ttf")
if any(["notosanscjk" in font.lower() for font in font_list]):
    plt.rcParams["font.family"] = "Noto Sans CJK JP"
else:
    if not any(["malgun" in font.lower() for font in font_list]):
        raise Exception(
            "Font missing, please install Noto Sans CJK or Malgun Gothic. If you're using ubuntu, try `sudo apt install fonts-noto-cjk`"
        )

    plt.rcParams["font.family"] = "Malgun Gothic"

sns.set_palette(sns.color_palette("Spectral"))
plt.rc("xtick", labelsize=6)

data = load_dataframes()

## User Data

In [24]:
users = data["users"][["id", "review"]]
print(users)

           id  review
0       68632       1
1      389728       2
2       68716       3
3      774353       4
4      115682       5
...       ...     ...
91393   17371    2401
91394  198050    2402
91395  190766    2403
91396  201564    2404
91397  611078    2405

[91398 rows x 2 columns]


## Review Data

In [25]:
reviews = data["reviews"][["id", "store", "user", "score"]]
reviews.rename(columns = {'id' : 'review_id'}, inplace=True)

print(reviews)

       review_id   store    user  score
0              1      15   68632      5
1              2      18  389728      5
2              3      19   68716      4
3              4      37  774353      2
4              5      38  115682      3
...          ...     ...     ...    ...
91393       2401  360499   17371      5
91394       2402  360505  198050      4
91395       2403  360514  190766      5
91396       2404  360514  201564      4
91397       2405  360515  611078      5

[91398 rows x 4 columns]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


## Store Data

In [26]:
stores = data["stores"][["id", "store_name"]]
print(stores)

            id     store_name
0            1           Agal
1            2         Assisy
2            3  Battered Sole
3            4      Chakyoung
4            5       Delabobo
...        ...            ...
459983  459996         행복부대찌개
459984  459997           행복부페
459985  459998           행복분식
459986  459999           행복분식
459987  460000           행복분식

[459988 rows x 2 columns]


## User & Review Merge 

user 별 별점을 부여한 점수와 해당 음식점, 음식점이름 출력

In [27]:
users_and_reviews = pd.merge(left=users, right=reviews, left_on = ["id", "review"], right_on = ["user", "review_id"])[["id", "store", "score"]]
print(users_and_reviews)

           id   store  score
0       68632      15      5
1      389728      18      5
2       68716      19      4
3      774353      37      2
4      115682      38      3
...       ...     ...    ...
91693   17371  360499      5
91694  198050  360505      4
91695  190766  360514      5
91696  201564  360514      4
91697  611078  360515      5

[91698 rows x 3 columns]


In [28]:
user_and_review = pd.merge(left = users_and_reviews, right = stores, left_on = "store", right_on = "id", how = "left")[["id_x", "store", "score", "store_name"]].reset_index()
user_and_review.rename(columns = {'id_x' : 'user_id'}, inplace=True)
print(user_and_review)

       index  user_id   store  score  store_name
0          0    68632      15      5         써리힐
1          1   389728      18      5    진삼미 샌드위치
2          2    68716      19      4  한옥마을 전주비빔밥
3          3   774353      37      2       007식당
4          4   115682      38      3       010수산
...      ...      ...     ...    ...         ...
91693  91693    17371  360499      5    지구촌한우생고기
91694  91694   198050  360505      4        지군포차
91695  91695   190766  360514      5          지글
91696  91696   201564  360514      4          지글
91697  91697   611078  360515      5          지글

[91698 rows x 5 columns]


# User 별 리뷰를 남긴 음식점 번호와 점수

In [29]:
groupby_user =  user_and_review.groupby(user_and_review["user_id"], as_index=False)
user_dict = {}
for k, v in groupby_user:
    user_dict[k] = dict(zip(v['store'], v['score']))
print(user_dict)

{7: {133795: 5, 310892: 4, 319233: 4}, 15: {119631: 4}, 23: {41329: 2, 87281: 5, 126845: 5, 128535: 5, 160821: 4, 191548: 4, 296516: 5, 347776: 5, 351165: 4}, 62: {225715: 3}, 64: {23889: 1}, 74: {134521: 5}, 105: {34302: 4, 45739: 3, 174801: 5, 214836: 4, 238221: 4, 349914: 3}, 137: {91183: 5, 202410: 5}, 147: {16359: 4, 20210: 4, 20471: 5, 20933: 5, 39509: 3, 40955: 5, 44738: 5, 44753: 4, 46526: 5, 47190: 5, 57887: 3, 79476: 4, 88301: 4, 95673: 4, 95927: 5, 109095: 5, 116257: 4, 118133: 4, 120967: 2, 122959: 4, 125458: 5, 129941: 4, 132853: 4, 138588: 3, 139440: 3, 140368: 5, 140427: 3, 173806: 4, 174905: 5, 187049: 3, 192876: 5, 196723: 3, 209344: 3, 211686: 4, 216701: 5, 217529: 5, 224911: 4, 235990: 5, 237676: 1, 243419: 5, 243478: 5, 246124: 5, 248482: 3, 248606: 4, 250253: 5, 251573: 5, 267243: 4, 273085: 1, 276000: 3, 276153: 2, 276715: 5, 295434: 5, 298993: 4, 315930: 3, 316643: 4, 326498: 3, 339715: 3, 340891: 5, 344260: 4, 345799: 5, 346973: 4}, 156: {36353: 5}, 161: {207446

평점을 부여한 유저 리스트

In [30]:
user_list = []
store_set = set()
for u in user_dict:
    user_list.append(u)
    
    for s in user_dict[u]:
        store_set.add(s)
print(user_list)

[7, 15, 23, 62, 64, 74, 105, 137, 147, 156, 161, 162, 166, 172, 178, 182, 189, 218, 221, 249, 279, 313, 335, 345, 347, 369, 396, 431, 435, 438, 448, 454, 465, 473, 483, 496, 527, 531, 535, 541, 559, 640, 658, 702, 725, 744, 746, 775, 813, 820, 848, 885, 911, 942, 950, 955, 959, 970, 972, 1001, 1004, 1005, 1006, 1016, 1028, 1030, 1046, 1070, 1088, 1091, 1102, 1109, 1121, 1146, 1147, 1151, 1190, 1199, 1225, 1227, 1405, 1408, 1409, 1437, 1438, 1451, 1467, 1469, 1493, 1500, 1521, 1557, 1567, 1571, 1581, 1582, 1596, 1597, 1599, 1620, 1639, 1656, 1677, 1701, 1705, 1719, 1727, 1746, 1757, 1761, 1777, 1787, 1795, 1882, 1974, 2032, 2062, 2065, 2071, 2075, 2087, 2101, 2105, 2109, 2112, 2131, 2142, 2146, 2147, 2148, 2158, 2174, 2175, 2182, 2192, 2195, 2213, 2245, 2257, 2269, 2279, 2289, 2294, 2295, 2301, 2306, 2314, 2340, 2341, 2364, 2368, 2379, 2382, 2418, 2419, 2445, 2450, 2458, 2473, 2480, 2542, 2572, 2606, 2610, 2646, 2709, 2731, 2735, 2779, 2798, 2821, 2827, 2836, 2842, 2847, 2849, 2852, 285

평점이 있는 음식점 리스트

In [31]:
store_list = list(store_set)
print(store_list)

[262144, 262147, 262152, 262154, 262158, 15, 18, 262163, 19, 262164, 262165, 262168, 262169, 262170, 262172, 262173, 262174, 262176, 262177, 131105, 262178, 262180, 262181, 262182, 262183, 38, 262185, 37, 262187, 131116, 262190, 262191, 262192, 262193, 49, 50, 131124, 53, 262196, 262195, 56, 262200, 58, 131131, 60, 262207, 131135, 262208, 68, 131140, 262214, 262218, 262219, 131148, 262220, 262222, 131151, 262224, 262225, 262226, 131154, 131156, 77, 86, 262231, 262234, 262235, 262236, 91, 262238, 262239, 262240, 94, 262242, 262243, 262244, 92, 102, 262247, 262249, 131179, 262252, 262253, 110, 262255, 131183, 262257, 262258, 262256, 262260, 262259, 262263, 262265, 262266, 123, 262268, 262270, 262271, 262272, 129, 131, 262276, 133, 134, 262279, 262278, 262281, 262280, 262283, 262284, 262285, 262286, 262287, 262288, 262289, 262290, 262291, 148, 149, 150, 262292, 152, 131220, 154, 151, 262293, 262295, 262304, 131237, 262310, 131241, 173, 131245, 175, 177, 180, 131255, 184, 262329, 262328, 2

## CF 

In [32]:
# CF 추천 시스템
rating_dic = {
    'user_id':[],
    'store_id':[],
    'rating':[]
}

for u in user_dict:
    for s in user_dict[u]:
        u_id = user_list.index(u)
        s_id = store_list.index(s)
        rate = user_dict[u][s]
        
        rating_dic['user_id'].append(u_id)
        rating_dic['store_id'].append(s_id)
        rating_dic['rating'].append(rate)
        
print(rating_dic['user_id'], rating_dic['store_id'], rating_dic['rating'])

[0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 10, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 17, 18, 18, 19, 20, 20, 21, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 25, 25, 25, 26, 27, 28, 29, 30, 31, 32, 32, 32, 32, 33, 33, 34, 35, 35, 35, 35, 36, 36, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 3

In [33]:
df = pd.DataFrame(rating_dic)
df

Unnamed: 0,user_id,store_id,rating
0,0,1109,5
1,0,18309,4
2,0,21170,4
3,1,42320,4
4,2,15548,2
...,...,...,...
91393,18989,26026,1
91394,18989,35225,5
91395,18989,35801,5
91396,18990,28232,5


## KNN 

In [34]:
import surprise

# rating_scale : 평점의 범위
reader = surprise.Reader(rating_scale=(1, 5))
reader

<surprise.reader.Reader at 0x24cf053db50>

In [35]:
col_list = ['user_id', 'store_id', 'rating']
data = surprise.Dataset.load_from_df(df[col_list], reader)

print(col_list)
print(data)

['user_id', 'store_id', 'rating']
<surprise.dataset.DatasetAutoFolds object at 0x0000024CF052DB80>


### Training 

In [36]:
trainset = data.build_full_trainset()
option = {'name':'pearson'}
algo = surprise.KNNBasic(sim_options=option)

algo.fit(trainset)

Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x24cf03ab370>

# Result

사용자 입력 받기

In [37]:
who = input('user id: ')

user id: 7


나와 비슷한 취향을 가진 다른 사용자

In [38]:
index = user_list.index(int(who))
result = algo.get_neighbors(index, k = 20)
print('당신과 비슷한 음식점을 가는 사용자는? : ', result)

당신과 비슷한 음식점을 가는 사용자는? :  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]


비슷한 취향을 가진 사용자들이 방문한 음식점 리스트

In [47]:
for r in result:
    max_rating = data.df[data.df["user_id"]==r]["rating"].max()
    cos_id = data.df[(data.df["rating"]==max_rating)&(data.df["user_id"]==r)]["store_id"].values
    
    store_id_list = []
    for cos_item in cos_id:
        store_id_list.append(stores[stores['id']==store_list[cos_item]]['store_name'].index[0])
    
    for idx in store_id_list:
        print(idx, stores[stores['id']==idx]['store_name'].values[0])

119625 라화방
87276 대찬식당
126839 마담김치찌개
128529 마부마라탕
296503 옥산반점
347763 정씨부엌
225706 소꼴
23886 경성함바그
134515 말뚝곱창
174794 버거킹
91178 더진국
202401 뿅뿅치킨
20469 개미기사식당
20931 개성집
40952 국일반점
44735 그리다
46523 금모래국밥
47187 금수저묵은지김치찜
95922 도스마스
109090 둘둘치킨
125452 리에또
140362 맴섬횟집
174898 버거킹
192868 불로만 숯불바베큐
216692 서가원김밥
217520 서북산농장
235981 수유동면옥
243409 스타벅스
243468 스타벅스
246114 시골손칼국수보리밥
250242 신별주부
251562 신의주 찹쌀순대
276704 에머이
295421 오춘자비어
340878 장원토종닭
345786 젊은날의추억
36350 교동짬뽕
207437 산쪼메
309323 우진정
4049 88선수촌
12390 가마골흑염소
13299 가미온밥상
34242 공차
34715 관훈맨션
37381 교토참치
41701 굴섬
53590 김판쇠 전주우족탕
61131 나이스골뱅이
67827 넷째네양다리
83819 대마족발
90804 더알찬쭈꾸미
102726 동도식육점
117712 라공방
118893 라온
119776 락빌리어드
123968 루빈루
124047 루씨살롱
124226 루트에브리데이
125582 리체식당
136238 맛나분식
140562 맷돌포두부
143720 멘야칸지루
150062 모쿠데판야끼
151199 몰릭
151513 몽구스찜닭
154869 문화식당
166202 바스버거
183101 본때감자탕
194860 브로이하우스
196000 블루베리
199221 빌리엔젤
217557 서브웨이
224450 세븐세븐
235182 수상한감자탕
240707 스마일 찹쌀 꽈배기
241498 스시랑랑
243605 스타벅스
245135 스프링코트
245243 슬로우시티
249379 신동해해물탕
256894 써브웨이
2