In [1]:
import pandas as pd
import numpy as np
import dgl
import torch
from tqdm import tqdm, trange
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init
import torch.optim as optim
from konlpy.tag import *
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
import random
import pickle
import os

Using backend: pytorch


In [2]:
data_dir = '/opt/ml/final-project-level3-recsys-02/data/'

In [4]:
with open(data_dir + 'food.pickle', 'rb') as f :
    raw_df = pickle.load(f)
USE_COLS = ['placeName', 'placeType', 'placeAddress', 'themeKeywords','like', 'menulabel', 'ageLabel', 'ratingLabel',  'visitLabel', 'blogLabel']
raw_df = raw_df[USE_COLS]
raw_df = raw_df[~raw_df.placeType.str.contains('성급')].reset_index().copy()


In [9]:
raw_df['placeID'] = raw_df.apply(lambda x : x['placeName'] + x['placeAddress'], axis = 1)
raw_df['placeID'] = raw_df['placeID'].apply(lambda x : x.replace(" ", ""))

### Place type

In [10]:
p_df = raw_df[['placeID', 'placeType']]
p_df.columns = ['placeID', 'feature']
p_df.head()

Unnamed: 0,placeID,feature
0,밀밭정원서울마포구마포대로16길13,"칼국수,만두"
1,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),바(BAR)
2,동북양꼬치서울영등포구디지털로37길26-1,양꼬치
3,농부쌈밥서울동작구사당로30길19,쌈밥
4,홍당무김밥서울영등포구문래로180영등포센트럴푸르지오시티,김밥


In [11]:
for idx, value in enumerate(p_df.iterrows()):
    if not p_df['feature'][idx]:
        p_df['placeID']

In [12]:
p_df['feature'] = p_df['feature'].apply(lambda x : x.split(','))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  p_df['feature'] = p_df['feature'].apply(lambda x : x.split(','))


In [13]:
p_df = pd.DataFrame([
    [place_id, feature] for place_id, features in p_df.itertuples(index=False)
    for feature in features
    ], columns=p_df.columns)

In [14]:
p_df.head()

Unnamed: 0,placeID,feature
0,밀밭정원서울마포구마포대로16길13,칼국수
1,밀밭정원서울마포구마포대로16길13,만두
2,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),바(BAR)
3,동북양꼬치서울영등포구디지털로37길26-1,양꼬치
4,농부쌈밥서울동작구사당로30길19,쌈밥


#### Place Theme Keywords

In [15]:
okt = Okt()

In [16]:
def prep_nouns(word:str):
    noun = okt.nouns(word)
    if noun:
        return noun[0]
    else:
        return ""

In [17]:
k_df = raw_df[['placeID', 'themeKeywords']]

In [18]:
theme_place = k_df[k_df.themeKeywords.str.len()!=0]['themeKeywords']
theme_place

1                                [술집, 세계맥주, 맥주집, 호프집, 생맥주]
3        [인심좋은, 친절한, 친절하신, 친절하고, 쌈밥, 제육볶음, 오리로스, 부대찌개, ...
6                                              [닭갈비, 닭갈비집]
7                  [심플한, 돼지곱창, 시장, 소곱창, 곱창, 막창, 신선한, 숨어있는]
8        [친절함, 친절하고, 화려한, 친절한, 시장, 소곱창, 양대창, 막창, 곱창, 나들...
                               ...                        
12646     [고급진, 깨끗한, 고급스러운, 안락한, 초밥, 젓갈, 튀김, 횟집, 참치회, 신선한]
12651    [아늑한, 분위기좋은, 토속적인분위기, 김치찌개, 굴보쌈, 한정식, 곱창, 비빔밥,...
12662                                [만두, 아이스크림, 설렁탕, 불고기]
12664                         [닭갈비, 닭갈비집, 주먹밥, 막국수, 새로오픈한]
12665    [고급진, 이국적, 고급스러운, 카레, 팟타이, 태국음식, 쌀국수, 누들, 나들이,...
Name: themeKeywords, Length: 3499, dtype: object

In [19]:
k_df['prepThemeKeywords'] = k_df['themeKeywords'].apply(lambda x : list(map(lambda x : prep_nouns(x), x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  k_df['prepThemeKeywords'] = k_df['themeKeywords'].apply(lambda x : list(map(lambda x : prep_nouns(x), x)))


In [20]:
keyword_list = list(set(k_df[k_df.prepThemeKeywords.str.len()!=0]['prepThemeKeywords'].sum()))

In [21]:
k_df = pd.DataFrame([
    [id, keyword] for id, keywords in k_df[['placeID', 'prepThemeKeywords']].itertuples(index=False)
    for keyword in keywords
], columns=['placeID', 'prepThemeKeywords'])

In [22]:
k_df = k_df[k_df.prepThemeKeywords!=""]
k_df.head()

Unnamed: 0,placeID,prepThemeKeywords
0,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),술집
1,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),세계
2,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),맥주
3,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),호프
4,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),생맥주


#### Like

In [23]:
l_df = pd.DataFrame.from_records(raw_df['like'])
l_df.head()

Unnamed: 0,음식이 맛있어요,재료가 신선해요,친절해요,특별한 메뉴가 있어요,단체모임 하기 좋아요,매장이 청결해요,혼밥하기 좋아요,양이 많아요,가성비가 좋아요,매장이 넓어요,...,건강한 맛이에요,아늑해요,컨셉이 독특해요,샐러드바가 잘 되어있어요,현지 맛에 가까워요,추천을 잘해줘요,라이브공연이 훌륭해요,파티하기 좋아요,반려동물과 가기 좋아요,잡내가 적어요
0,26.0,11.0,10.0,7.0,4.0,3.0,2.0,2.0,2.0,2.0,...,,,,,,,,,,
1,59.0,2.0,62.0,22.0,18.0,40.0,,2.0,12.0,1.0,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,73.0,45.0,42.0,3.0,,10.0,10.0,25.0,57.0,4.0,...,,,,,,,,,,
4,18.0,8.0,17.0,2.0,1.0,6.0,25.0,6.0,18.0,2.0,...,,,,,,,,,,


In [24]:
frequency = np.sum(~l_df.isna(), axis=0)
cond1 = frequency > 1
cond2 = frequency < 9000
like_cols = sorted(frequency[np.where(cond1&cond2)[0]].index.values)

In [25]:
l_df = l_df[like_cols].fillna(0)

In [26]:
total_record = []
for index in tqdm(l_df.index.values):
    topk = np.argsort(l_df.values[index])[::-1]

    cnt = 0
    for t in topk:
        if l_df.values[index, t] == 0 or cnt==5:
            break
        else :
            total_record.append((raw_df['placeID'][index], l_df.columns[t]))
            cnt += 1

l_df = pd.DataFrame.from_records(total_record)
l_df.columns = ['placeID', 'like']

100%|██████████| 12677/12677 [00:00<00:00, 13944.08it/s]


In [27]:
l_df.head(10)

Unnamed: 0,placeID,like
0,밀밭정원서울마포구마포대로16길13,단체모임 하기 좋아요
1,밀밭정원서울마포구마포대로16길13,혼밥하기 좋아요
2,밀밭정원서울마포구마포대로16길13,뷰가 좋아요
3,밀밭정원서울마포구마포대로16길13,매장이 넓어요
4,밀밭정원서울마포구마포대로16길13,화장실이 깨끗해요
5,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),인테리어가 멋져요
6,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),술이 다양해요
7,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),오래 머무르기 좋아요
8,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),대화하기 좋아요
9,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),화장실이 깨끗해요


#### Menu

In [28]:
m_df = raw_df[['placeID', 'menulabel']]
m_df = m_df.dropna()
m_df.head()

Unnamed: 0,placeID,menulabel
0,밀밭정원서울마포구마포대로16길13,30000이하
1,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),30000이상
2,동북양꼬치서울영등포구디지털로37길26-1,30000이하
3,농부쌈밥서울동작구사당로30길19,10000이하
4,홍당무김밥서울영등포구문래로180영등포센트럴푸르지오시티,10000이하


#### Age

In [29]:
a_df = raw_df[['placeID', 'ageLabel']]
a_df.head()

Unnamed: 0,placeID,ageLabel
0,밀밭정원서울마포구마포대로16길13,"[50대, 40대, 60대]"
1,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),"[20대, 30대, 40대]"
2,동북양꼬치서울영등포구디지털로37길26-1,[]
3,농부쌈밥서울동작구사당로30길19,"[20대, 30대, 40대]"
4,홍당무김밥서울영등포구문래로180영등포센트럴푸르지오시티,[]


In [30]:
a_df = pd.DataFrame([
    [id, age] for id, ages in a_df.itertuples(index=False)
    for age in ages
], columns=a_df.columns)

In [31]:
a_df.head()

Unnamed: 0,placeID,ageLabel
0,밀밭정원서울마포구마포대로16길13,50대
1,밀밭정원서울마포구마포대로16길13,40대
2,밀밭정원서울마포구마포대로16길13,60대
3,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),20대
4,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),30대


#### Rating

In [32]:
r_df = raw_df[['placeID', 'ratingLabel']]
r_df.head()

Unnamed: 0,placeID,ratingLabel
0,밀밭정원서울마포구마포대로16길13,4.5이하
1,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),4.5이상
2,동북양꼬치서울영등포구디지털로37길26-1,4.5이하
3,농부쌈밥서울동작구사당로30길19,4.5이하
4,홍당무김밥서울영등포구문래로180영등포센트럴푸르지오시티,4.5이하


#### Visit

In [33]:
v_df = raw_df[['placeID', 'visitLabel']]
v_df.head()

Unnamed: 0,placeID,visitLabel
0,밀밭정원서울마포구마포대로16길13,visitQ2
1,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),visitQ3
2,동북양꼬치서울영등포구디지털로37길26-1,visitQ1
3,농부쌈밥서울동작구사당로30길19,visitQ4
4,홍당무김밥서울영등포구문래로180영등포센트럴푸르지오시티,visitQ3


#### Blog

In [34]:
b_df = raw_df[['placeID', 'blogLabel']]
b_df.head()

Unnamed: 0,placeID,blogLabel
0,밀밭정원서울마포구마포대로16길13,blogQ2
1,식스센스다이닝BAR서울동대문구왕산로2길9(2층방역룸예약),blogQ4
2,동북양꼬치서울영등포구디지털로37길26-1,blogQ1
3,농부쌈밥서울동작구사당로30길19,blogQ4
4,홍당무김밥서울영등포구문래로180영등포센트럴푸르지오시티,blogQ1


## Reamp IDs

In [35]:
def remap_id(id_lst) :
    id_lst.sort()
    id_to_idx, idx_to_id = dict(), dict()
    for index, value in enumerate(id_lst) :
        id_to_idx[value] = index
        idx_to_id[index] = value
    return id_to_idx, idx_to_id 

#### All Features

In [36]:
k_df.rename(columns={'prepThemeKeywords':'feature'}, inplace=True)
l_df.rename(columns={'like':'feature'}, inplace=True)
m_df.rename(columns={'menulabel':'feature'}, inplace=True)
a_df.rename(columns={'ageLabel':'feature'}, inplace=True)
r_df.rename(columns={'ratingLabel':'feature'}, inplace=True)
v_df.rename(columns={'visitLabel':'feature'}, inplace=True)
b_df.rename(columns={'blogLabel':'feature'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  r_df.rename(columns={'ratingLabel':'feature'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  v_df.rename(columns={'visitLabel':'feature'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  b_df.rename(columns={'blogLabel':'feature'}, inplace=True)


In [64]:
# metadata 1개 컬럼으로
meta_df_col = ['placeID', 'menulabel','ageLabel', 'ratingLabel', 'visitLabel','blogLabel']
raw_df['meta'] = raw_df['menulabel']
for col in meta_df_col[2:]:
    raw_df['meta'] += raw_df[col].astype(str)

meta_df = raw_df[['placeID','meta']]
meta_df.rename(columns={'meta':'feature'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta_df.rename(columns={'meta':'feature'}, inplace=True)


In [71]:
all_df = pd.concat([p_df, k_df, l_df, m_df, a_df, r_df, v_df, b_df], axis=0)
plactypelike_df = pd.concat([p_df, k_df, l_df], axis=0)
plactypelikemeta_df = pd.concat([p_df, k_df, l_df, meta_df], axis=0)
# plactype_df = pd.concat([p_df, k_df, l_df], axis=0)

all_df.to_csv(data_dir+'all_df.csv',index=False)
plactypelike_df.to_csv(data_dir+'plactypelike_df.csv',index=False)
plactypelikemeta_df.to_csv(data_dir+'plactypelikemeta_df.csv',index=False)