# 1. 레시피 정보와 리뷰 정보를 병합하여 저장

In [1]:
import ast
import os
from datetime import datetime as dt

import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
# Directory that holds the raw recipe/review CSV files read below.
raw_data_dir = "/dev/shm/data/1.raw"

In [3]:
# Read the recipe and review CSV files found under raw_data_dir.
recipe_df, review_df = (
    pd.read_csv(os.path.join(raw_data_dir, csv_name))
    for csv_name in ('recipes_full_240313.csv', 'reviews_full_240313.csv')
)

# Last expression of the cell: show the (rows, columns) shape of both frames.
(recipe_df.shape, review_df.shape)

((1429, 2), (221928, 5))

## 중복 개체 제거

In [4]:
# Remove duplicated users from the reviews: keep the first row seen per uid.
print(f'before drop duplicated users: {len(review_df)}')
is_repeated_uid = review_df.duplicated(subset=['uid'])
review_df = review_df[~is_repeated_uid]
print(f'after drop duplicated users: {len(review_df)}')

before drop duplicated users: 221928
after drop duplicated users: 128555


In [5]:
# Remove duplicated users from the recipes: keep the first row seen per uid.
# The frame being deduplicated must be recipe_df (not review_df), otherwise the
# two counts printed here can never differ.
print(f'before drop duplicated users: {recipe_df.shape[0]}')
recipe_df = recipe_df.drop_duplicates(subset=['uid'])
print(f'after drop duplicated users: {recipe_df.shape[0]}')

before drop duplicated users: 1429
after drop duplicated users: 1429


## 트랜잭션 데이터로 변환

uid, recipe_sno, rating, datetime

In [6]:
# Parse the stringified Python literals stored in the CSV back into objects.
# ast.literal_eval only accepts literals (dict, list, str, numbers, ...), so
# unlike eval it cannot execute arbitrary code embedded in the file contents.
# NOTE(review): assumes both columns contain plain literals only - confirm.
review_df['history'] = review_df['history'].apply(ast.literal_eval)
recipe_df['recipes'] = recipe_df['recipes'].apply(ast.literal_eval)

In [7]:
# Flatten every user's review history into one record per reviewed recipe:
# [uid, recipe_sno, rating, datetime].
review_transactions = []

for _, user_row in tqdm(review_df.iterrows(), total=len(review_df)):
    user_id = user_row['uid']
    review_transactions.extend(
        [user_id, sno, entry['rating'], entry['datetime']]
        for sno, entry in user_row['history'].items()
    )

review_columns = ['uid', 'recipe_sno', 'rating', 'datetime']
review_transaction_df = pd.DataFrame(review_transactions, columns=review_columns)

100%|██████████| 128555/128555 [00:08<00:00, 14344.54it/s]


In [8]:
# Flatten every user's recipe collection into one record per recipe:
# [uid, recipe_sno].
recipe_transactions = []

for _, user_row in tqdm(recipe_df.iterrows(), total=len(recipe_df)):
    user_id = user_row['uid']
    recipe_transactions.extend([user_id, sno] for sno in user_row['recipes'])

recipe_columns = ['uid', 'recipe_sno']
recipe_transaction_df = pd.DataFrame(recipe_transactions, columns=recipe_columns)

100%|██████████| 1429/1429 [00:00<00:00, 12992.78it/s]


In [9]:
# Stack the recipe and review transactions into a single table. The recipe
# rows have no rating/datetime columns, so concat leaves those values missing.
all_transaction_df = pd.concat([recipe_transaction_df, review_transaction_df], axis=0)

# drop_duplicates returns a new DataFrame rather than modifying in place, so
# the result must be assigned for the deduplication to take effect.
all_transaction_df = all_transaction_df.drop_duplicates()

# Last expression of the cell: display the merged, deduplicated table.
all_transaction_df

Unnamed: 0,uid,recipe_sno,rating,datetime
0,gomusin76,6834819,,
1,gomusin76,6834466,,
2,gomusin76,6834339,,
3,gomusin76,6834128,,
4,gomusin76,6834038,,
...,...,...,...,...
418553,72139975,6891816,5.0,2021-09-28 12:28
418554,77296341,6869801,5.0,2023-08-05 22:28
418555,25121000,6891816,5.0,2022-11-02 20:15
418556,38986666,4164229,5.0,2023-11-06 19:35


In [10]:
# Save the merged transactions; the file name carries today's date as yymmdd.
now = dt.now().strftime('%y%m%d')
merged_data_dir = '/dev/shm/data/2.merged'

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(merged_data_dir, exist_ok=True)
all_transaction_df.to_csv(os.path.join(merged_data_dir, f'merged-data-{now}.csv'), index=False)