In [26]:
import os, sys, pickle 
import pandas as pd 
import json
from collections import defaultdict

from sklearn.model_selection import train_test_split

from tqdm.auto import tqdm 

from torch.utils.data import DataLoader, Dataset
from PIL import Image
import glob, os
import numpy as np 

from torchvision import transforms 

from settings import * 
from utils import * 

# !conda install -c conda-forge ipywidgets -y

In [3]:
# Load the four raw JSON dumps referenced by PATH_DICT as DataFrames, in the
# same order as before: photo, item (business), user, review.
photo_info, item_info, user_info, review_info = (
    json2df(PATH_DICT[key])
    for key in ('photo', 'item_paths', 'user_paths', 'review_paths')
)

transformating json to dataframe...: 0it [00:00, ?it/s]

transformating json to dataframe...: 0it [00:00, ?it/s]

transformating json to dataframe...: 0it [00:00, ?it/s]

transformating json to dataframe...: 0it [00:00, ?it/s]

### 전처리 기준

- 사용자: 리뷰를 10회 이상 작성한 사용자를 대상으로 함 (전처리 후 `review_df`에 남은 리뷰 수 기준).
    - 리뷰: 최대 길이를 256으로 설정하고 BertTokenizer (WordPiece)를 사용함.
- 레스토랑
    - 사진: Food에 해당하는 사진만 사용함. 사진을 불러올 때 깨진 파일이 있어 해당 파일을 제거하는 작업을 수행함.


In [4]:
# Build the per-business photo table in one chain:
#   1. inner-join business metadata with photo metadata,
#   2. keep only the columns used downstream,
#   3. keep only photos labelled 'food',
#   4. keep the first remaining photo of each business.
item_df = (
    pd.merge(item_info, photo_info, how='inner')
    .loc[:, ['business_id', 'stars', 'photo_id', 'label']]
    .loc[lambda merged: merged['label'] == 'food']
    .drop_duplicates('business_id')
)

In [5]:
# Keep only reviews whose author appears in the user table, and only the
# columns needed downstream. `.copy()` detaches the result from review_info.
review_df = review_info.loc[
    review_info['user_id'].isin(user_info['user_id']),
    ['user_id', 'business_id', 'text', 'stars'],
].copy()

# Attach each business's food photo; the inner join drops reviews of businesses
# that have none. The join key is spelled out so the merge cannot silently
# change if either frame later gains another shared column.
review_df = pd.merge(
    review_df,
    item_df.loc[:, ['business_id', 'photo_id']],
    how='inner',
    on='business_id',
)

# Turn the photo id into a file name: strip every non-alphanumeric character
# and append '.jpg'. The vectorised `.str.replace` replaces a per-row lambda
# around `re.sub`, which relied on `re` being in scope even though this
# notebook never imports it explicitly.
review_df['photo_id'] = (
    review_df['photo_id'].str.replace('[^0-9a-zA-Z]', '', regex=True) + '.jpg'
)

In [None]:
# NOTE(review): modify_img_path is star-imported and not visible here.
# Presumably it normalises the image locations/names so they line up with the
# sanitised '<id>.jpg' values in review_df.photo_id — confirm against utils.
img_path = modify_img_path(PATH_DICT['photo'])

In [10]:
# Collect the photos that cannot be used. The result is matched against
# review_df.photo_id with `.isin` in the next cell, so it is expected to hold
# photo file names — TODO confirm the exact return type in utils.
drop_img = drop_invalid_image(review_df) # Remove entries whose image file is corrupted.

Loding images:   0%|          | 0/25959 [00:00<?, ?it/s]

The number of drop images is 1.0! Drop image name is Pk878Yndygr4LRUDH7Hg.jpg
The number of drop images is 2.0! Drop image name is feUGw0P5byOq4U40C77tyQ.jpg
The number of drop images is 3.0! Drop image name is N6hL8FQ84A2DznF2S2Lp7g.jpg
The number of drop images is 4.0! Drop image name is YW1WMOkVbdFBrixDnKgoqA.jpg
The number of drop images is 5.0! Drop image name is bf3ymV0YgP7B6rEoriaU2w.jpg
The number of drop images is 6.0! Drop image name is MduVueqYTBlEkXaxrh1ug.jpg
The number of drop images is 7.0! Drop image name is JGpfPj8VEvnq1BXqr3wA.jpg
The number of drop images is 8.0! Drop image name is ytJ4lihJrvyzMMRGWwDNw.jpg
The number of drop images is 9.0! Drop image name is ydm3g1wUWSxJnMPgHk2JhQ.jpg
The number of drop images is 10.0! Drop image name is E7Wpzn1fCnVJ8zKpecPQ.jpg
The number of drop images is 11.0! Drop image name is JoQ5xekjQUkj8rukJIzqgg.jpg
The number of drop images is 12.0! Drop image name is cNkUV0sInfhPy5PP8SHtQ.jpg
The number of drop images is 13.0! Drop image 

In [11]:
# Discard every review whose photo was reported as unusable.
has_broken_photo = review_df.photo_id.isin(drop_img)
drop_idx = review_df.loc[has_broken_photo].index
review_df = review_df.drop(index=drop_idx).reset_index(drop=True)

# Keep only users who still have at least 10 reviews after that cleanup.
reviews_per_user = review_df.user_id.value_counts()
target_user = reviews_per_user.loc[reviews_per_user >= 10].index

review_df = review_df.loc[review_df.user_id.isin(target_user)].reset_index(drop=True)

In [12]:
# 60 / 20 / 20 split: take 60% of the reviews for training, then cut the
# remaining 40% in half for validation and test. Both cuts use the same seed.
d_train, d_holdout = train_test_split(review_df, train_size=0.6, random_state=42)
d_valid, d_test = train_test_split(d_holdout, train_size=0.5, random_state=42)

In [13]:
# Cast both id columns of every split to the categorical dtype so they can be
# integer-encoded in the following cells.
id_dtypes = {'user_id': 'category', 'business_id': 'category'}
d_train, d_valid, d_test = (
    split.astype(id_dtypes) for split in (d_train, d_valid, d_test)
)

In [14]:
# The training split defines the user / business vocabularies that the other
# splits are aligned to.
u_cat, b_cat = (
    d_train[id_col].cat.categories for id_col in ('user_id', 'business_id')
)

In [15]:
# Re-express the validation and test ids in the training vocabularies, so the
# same id maps to the same category in every split. Values that are not in
# the training categories become missing.
for held_out in (d_valid, d_test):
    held_out['user_id'] = held_out['user_id'].cat.set_categories(u_cat)
    held_out['business_id'] = held_out['business_id'].cat.set_categories(b_cat)

In [16]:
# Replace the categorical ids with their integer category codes in all splits.
# pandas encodes a missing category (an id absent from the vocabulary) as -1.
for split in (d_train, d_valid, d_test):
    for id_col in ('user_id', 'business_id'):
        split[id_col] = split[id_col].cat.codes

In [17]:
def _drop_unusable_rows(split):
    """Return `split` without the rows that cannot be used downstream.

    At this point `user_id` / `business_id` hold `.cat.codes` values, and
    pandas encodes "not in the category list" as -1 rather than NaN. A plain
    `dropna()` therefore never removed validation/test rows whose user or
    business is unseen in training, so they are filtered out explicitly here.
    Rows with NaN in any other column are still dropped, and the index is
    renumbered from 0.
    """
    seen_in_train = (split['user_id'] >= 0) & (split['business_id'] >= 0)
    return split.loc[seen_in_train].dropna().reset_index(drop=True)


d_train = _drop_unusable_rows(d_train)
d_valid = _drop_unusable_rows(d_valid)
d_test = _drop_unusable_rows(d_test)

In [18]:
# Cast the encoded id columns of all three splits to the plain `int` dtype.
d_train, d_valid, d_test = [
    frame.astype({'user_id': int, 'business_id': int})
    for frame in (d_train, d_valid, d_test)
]

In [19]:
# Persist the filtered review table and the three encoded splits under
# BASE_DIR/data, in the same order as before.
for frame, rel_path in (
    (review_df, 'data/data_info.pkl'),
    (d_train, 'data/train.pkl'),
    (d_valid, 'data/valid.pkl'),
    (d_test, 'data/test.pkl'),
):
    save_pkl(frame, fname=os.path.join(BASE_DIR, rel_path))

Success pickle file, which name is /workspace/data/data_info.pkl
Success pickle file, which name is /workspace/data/train.pkl
Success pickle file, which name is /workspace/data/valid.pkl
Success pickle file, which name is /workspace/data/test.pkl
