In [30]:
import os
import pandas as pd
import numpy as np

### 데이터 전처리

In [66]:
# Where the raw dump lives and where the preprocessed training files are written.
data_dir = "../data"
preprocessed_output_dir = "../data/train"

# Read the raw RateBeer review dump into a DataFrame.
raw_ratebeer_path = os.path.join(data_dir, "raw_ratebeer.json")
raw_ratebeer = pd.read_json(raw_ratebeer_path)

### 1. 결측치 제거 (label) reviewscore, beer_id NaN 제거

In [54]:
# Report how many reviews have no reviewscore (the label).
has_reviewscore = raw_ratebeer["reviewscore"].notna()
print((~has_reviewscore).sum())

# Drop those rows and renumber the remaining rows 0..n-1.
raw_ratebeer = raw_ratebeer[has_reviewscore].reset_index(drop=True)

In [55]:
# Report how many reviews have no beer_id.
print(raw_ratebeer["beer_id"].isna().sum())

# Drop those rows and renumber the remaining rows 0..n-1.
raw_ratebeer = raw_ratebeer.dropna(subset=["beer_id"]).reset_index(drop=True)

### 2. 리뷰 개수가 10개 미만인 유저 제거

In [57]:
# Number of reviews written by each user, keyed by profile_name.
review_counts = raw_ratebeer["profile_name"].value_counts()

# Users who wrote at least 10 reviews (same order as value_counts returns them).
over10user_list = review_counts[review_counts >= 10].index.tolist()

# Keep only the reviews written by those users and renumber rows 0..n-1.
# The previous version re-applied the reviewscore NaN filter at this point;
# those rows are already removed in step 1, so that filter was a no-op and
# has been dropped.
raw_ratebeer = raw_ratebeer[raw_ratebeer["profile_name"].isin(over10user_list)].reset_index(drop=True)

### 3. user_id 등록

In [58]:
# Count the distinct users (profile names) left after filtering and print it
# with thousands separators.
n_user = raw_ratebeer["profile_name"].nunique()
print(f"n_user= {n_user:,}")

n_user= 14,701


In [59]:
# Give every distinct profile_name a dense integer id, numbered from 0 in
# order of first appearance in the data.
unique_profile_names = raw_ratebeer["profile_name"].unique()
profile_name2idx = {profile_name: user_id for user_id, profile_name in enumerate(unique_profile_names)}

# Vectorised dictionary lookup instead of a Python-level loop over every row.
raw_ratebeer["user_id"] = raw_ratebeer["profile_name"].map(profile_name2idx)

# Preview the first row to check the new user_id column.
raw_ratebeer.head(1)

Unnamed: 0,reviewscore,reviewtime,reviewtext,appearance,aroma,palate,taste,overall,profile_name,beer_name,beer_id,brewer_id,abv,style,image_url,user_id
0,3.9,2022-05-15T17:26:51.803Z,Tap at Fermentoren CPH. Poured a hazy golden c...,3.0,8.0,4.0,8.0,16.0,jmgreenuk,Hill Farmstead Arthur,131594.0,11233.0,6.0,Saison / Farmhouse / Grisette,https://res.cloudinary.com/ratebeer/image/uplo...,0


### 4. user-password 등록 (임의설정) - varchar

In [60]:
# Disabled placeholder (kept for reference, not executed): would give every
# distinct profile_name a pseudo-random numeric password, zero-padded to
# 8 characters with the NumPy seed fixed at 42, and store it in a "password"
# column.
# np.random.seed(42)
# profile_name2password = {profile_name:str(np.random.randint(low = 0, high = 99999999)).zfill(8) for profile_name in raw_ratebeer.profile_name.unique()}
# raw_ratebeer["password"] = [profile_name2password[profile_name] for profile_name in raw_ratebeer.profile_name]

### 5. gender, birth

In [61]:
####

### 6. text 처리

In [62]:
# raw_ratebeer["reviewtext"].iloc[3138322].replace("......", "")

### 7. type 정정

In [91]:
# Cast the identifier columns to integers, one column at a time and in the
# same order as before.
for id_column in ("beer_id", "brewer_id", "user_id"):
    raw_ratebeer[id_column] = raw_ratebeer[id_column].astype(int)

### 8. training file making

In [92]:
# Count missing values in each column that goes into the training file.
training_columns = ["user_id", "beer_id", "reviewscore", "reviewtime"]
raw_ratebeer[training_columns].isna().sum()

user_id        0
beer_id        0
reviewscore    0
reviewtime     0
dtype: int64

In [93]:
# Select the interaction columns and rename them to the column names used in
# the training file (source column -> training column).
column_renames = {"user_id": "user", "beer_id": "item", "reviewscore": "rating", "reviewtime": "time"}
train_ratings = raw_ratebeer[list(column_renames)].rename(columns=column_renames)

# Preview the first row.
train_ratings.head(1)

Unnamed: 0,user,item,rating,time
0,0,131594,3.9,2022-05-15T17:26:51.803Z


In [96]:
# Create the output directory if it is missing; to_csv does not create parent
# directories and would fail on a fresh checkout.
os.makedirs(preprocessed_output_dir, exist_ok=True)

# Write the ratings table without the row index.
train_ratings.to_csv(os.path.join(preprocessed_output_dir, "train_ratings.csv"), index=False)

### 9. attribute extraction (style)

In [97]:
# One (beer_id, style) row per beer: the first review row seen for each beer_id,
# renumbered 0..n-1.
# The columns are selected with a list. Selecting them with a bare tuple of
# names (["beer_id", "style"] without the extra brackets) is deprecated and
# makes pandas emit a FutureWarning.
style_df = (
    raw_ratebeer.groupby("beer_id")[["beer_id", "style"]]
    .head(1)
    .reset_index(drop=True)
)

  style_df = raw_ratebeer.groupby("beer_id")["beer_id", "style"].head(1)


In [100]:
# Display the per-beer style table (one row per beer_id).
style_df

Unnamed: 0,beer_id,style
0,131594,Saison / Farmhouse / Grisette
1,121536,Specialty Grain - Other
2,12617,Pale Lager - American
3,14940,IPA - English
4,25933,Blonde Ale / Golden Ale
...,...,...
9330,118583,Red Ale - Irish
9331,786,Amber Lager - International / Vienna
9332,51898,Stout - Imperial
9333,5067,Red Ale / International Amber Ale


In [104]:
# Encode each style name as an integer code; `index` holds the distinct style
# names, positioned by their code.
array, index = pd.factorize(style_df["style"])
style_df["style"] = array

# Collect the style codes of each beer into a list keyed by beer_id, then
# write that mapping out as JSON.
item2attributes = style_df.groupby("beer_id")["style"].apply(list)
item2attributes_path = os.path.join(preprocessed_output_dir, "rb_item2attributes.json")
item2attributes.to_json(item2attributes_path)