In [116]:
import os, sys, pickle 
import pandas as pd 
import json

from sklearn.model_selection import train_test_split

from tqdm.auto import tqdm 

from settings import * 
from utils import * 

# os.chdir('/workspace')

In [3]:
# Locations of the raw dataset files, all resolved relative to DATA_DIR.
# `img_paths` is the directory named 'photos'; the rest are individual JSON files.
img_paths = os.path.join(DATA_DIR, 'photos')
photo_paths, item_paths, user_paths, review_paths = (
    os.path.join(DATA_DIR, json_name)
    for json_name in (
        'photos.json',
        'yelp_academic_dataset_business.json',
        'yelp_academic_dataset_user.json',
        'yelp_academic_dataset_review.json',
    )
)

In [4]:
# !conda install -c conda-forge ipywidgets -y

In [5]:
def json2df(path):
    """Load a JSON Lines file (one JSON document per line) into a DataFrame.

    Every non-blank line is parsed with ``json.loads`` and the collected
    records are flattened with ``pd.json_normalize``.

    Args:
        path: Path of the file to read.

    Returns:
        pandas.DataFrame with one row per parsed line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    json_data = []
    # Read as UTF-8 explicitly instead of relying on the platform/locale
    # default encoding, which is not UTF-8 on every system.
    with open(path, 'r', encoding='utf-8') as f:
        for line in tqdm(f, desc='transformating json to dataframe...'):
            # A blank line (e.g. a trailing newline at end of file) is not a
            # JSON document; skip it rather than crash in json.loads.
            if not line.strip():
                continue
            json_data.append(json.loads(line))
    return pd.json_normalize(json_data)

In [6]:
# Parse each raw JSON file into its own DataFrame.
# The files are read in the order listed: photos, businesses, users, reviews.
photo_info, item_info, user_info, review_info = map(
    json2df,
    (photo_paths, item_paths, user_paths, review_paths),
)

transformating json to dataframe...: 0it [00:00, ?it/s]

transformating json to dataframe...: 0it [00:00, ?it/s]

transformating json to dataframe...: 0it [00:00, ?it/s]

transformating json to dataframe...: 0it [00:00, ?it/s]

### 전처리 기준

- 사용자: 리뷰를 최소 10회 이상 작성한 사용자를 대상으로 함. (user 데이터의 `review_count` 컬럼이 아니라, food 사진이 있는 레스토랑에 대해 작성된 리뷰 수를 직접 집계한 값 기준)
- 레스토랑
    - 사진: `label`이 `food`인 사진만 사용하며, 레스토랑당 사진 1장만 남김.

In [50]:
# Attach photo metadata to each business.
# The join key is spelled out (and only the needed columns are merged) so the
# join cannot silently become a multi-column join if the two tables ever share
# another column name, and so the wide business table is not carried through
# the merge.
item_df = pd.merge(
    item_info.loc[:, ['business_id', 'stars']],
    photo_info.loc[:, ['business_id', 'photo_id', 'label']],
    how='inner',
    on='business_id',
)
# Keep only the photos whose label is 'food'.
item_df = item_df[item_df.loc[:, 'label'] == 'food']
# One row per business: the first remaining food photo is kept.
item_df = item_df.drop_duplicates('business_id')

In [109]:
# Reviews written by known users, restricted to the columns used downstream.
review_df = review_info.loc[
    review_info['user_id'].isin(user_info['user_id']),
    ['user_id', 'business_id', 'text', 'stars'],
].copy()
# Keep only reviews of businesses present in item_df and attach their photo id.
# The join key is explicit so the merge never falls back to "all shared columns".
review_df = pd.merge(
    review_df,
    item_df.loc[:, ['business_id', 'photo_id']],
    how='inner',
    on='business_id',
)

# Users with at least 10 reviews in the merged data; the per-user counts are
# computed once and reused for both the comparison and the selection.
user_review_counts = review_df['user_id'].value_counts()
target_user = user_review_counts[user_review_counts >= 10].index

review_df = review_df[review_df['user_id'].isin(target_user)].reset_index(drop=True)

In [117]:
# 60% of the reviews go to training; the held-out 40% is then halved into
# validation and test. Both splits use the same fixed seed.
d_train, d_holdout = train_test_split(review_df, train_size=0.6, random_state=42)
d_valid, d_test = train_test_split(d_holdout, train_size=0.5, random_state=42)

In [119]:
# Convert both id columns of every split to pandas categoricals.
d_train, d_valid, d_test = (
    split.astype({'user_id': 'category', 'business_id': 'category'})
    for split in (d_train, d_valid, d_test)
)

In [120]:
# Category vocabularies taken from the training split only.
u_cat = d_train['user_id'].cat.categories
b_cat = d_train['business_id'].cat.categories

In [121]:
# Re-express the validation and test ids in terms of the training categories,
# so a given id maps to the same category in every split.
d_valid = d_valid.assign(
    user_id=d_valid['user_id'].cat.set_categories(u_cat),
    business_id=d_valid['business_id'].cat.set_categories(b_cat),
)

d_test = d_test.assign(
    user_id=d_test['user_id'].cat.set_categories(u_cat),
    business_id=d_test['business_id'].cat.set_categories(b_cat),
)

In [122]:
# Replace each categorical id with its integer category code.
# Note: a value that is not among the categories is encoded as -1, not NaN.
d_train = d_train.assign(
    user_id=d_train['user_id'].cat.codes,
    business_id=d_train['business_id'].cat.codes,
)

d_valid = d_valid.assign(
    user_id=d_valid['user_id'].cat.codes,
    business_id=d_valid['business_id'].cat.codes,
)

d_test = d_test.assign(
    user_id=d_test['user_id'].cat.codes,
    business_id=d_test['business_id'].cat.codes,
)

In [123]:
# Drop rows that have a missing value in any column.
d_train = d_train.dropna()
d_valid = d_valid.dropna()
d_test = d_test.dropna()

# `.cat.codes` encodes an id that is absent from the training categories as -1
# (not NaN), so dropna() alone keeps those rows. Remove them explicitly so no
# split contains the sentinel id -1.
d_train = d_train[(d_train.user_id >= 0) & (d_train.business_id >= 0)]
d_valid = d_valid[(d_valid.user_id >= 0) & (d_valid.business_id >= 0)]
d_test = d_test[(d_test.user_id >= 0) & (d_test.business_id >= 0)]

# Renumber rows 0..n-1 after filtering (assignment instead of inplace=True so
# the filtered frames are never mutated through a view).
d_train = d_train.reset_index(drop=True)
d_valid = d_valid.reset_index(drop=True)
d_test = d_test.reset_index(drop=True)

In [124]:
# Cast the encoded id columns of every split to the builtin int dtype.
d_train, d_valid, d_test = (
    split.astype({'user_id': int, 'business_id': int})
    for split in (d_train, d_valid, d_test)
)

In [127]:
# Persist the filtered review table followed by the three splits, in this order.
for frame, target in (
    (review_df, 'data/data_info.pkl'),
    (d_train, 'data/train.pkl'),
    (d_valid, 'data/valid.pkl'),
    (d_test, 'data/test.pkl'),
):
    save_pkl(frame, fname=target)

Success pickle file, which name is data/data_info.pkl
Success pickle file, which name is data/train.pkl
Success pickle file, which name is data/valid.pkl
Success pickle file, which name is data/test.pkl
