In [13]:
import os
import pandas as pd
import numpy as np

### 데이터 전처리

In [14]:
data_dir = "../data"
# preprocessed_output_dir = "../data/train"
raw_ratebeer = pd.read_json(os.path.join(data_dir, "raw_ratebeer.json"))

### 1. 결측치 제거 (label) reviewscore, beer_id NaN 제거

In [15]:
# reviewscore
print(raw_ratebeer.reviewscore.isna().sum())

# 인덱스 초기화
raw_ratebeer = raw_ratebeer[~raw_ratebeer.reviewscore.isna()]
raw_ratebeer.index = range(len(raw_ratebeer))

28


In [16]:
# beer_id
print(raw_ratebeer.beer_id.isna().sum())

# 인덱스 초기화
raw_ratebeer = raw_ratebeer[~raw_ratebeer.beer_id.isna()]
raw_ratebeer.index = range(len(raw_ratebeer))

779


### 2. user_id 등록

In [17]:
n_user = raw_ratebeer.profile_name.nunique()
print(f"n_user= {n_user:,}")

n_user= 60,580


In [18]:
profile_name2idx = {profile_name:user_id for user_id, profile_name in enumerate(raw_ratebeer.profile_name.unique())}
raw_ratebeer["user_id"] = [profile_name2idx[profile_name] for profile_name in raw_ratebeer.profile_name]

raw_ratebeer.head(1)

Unnamed: 0,reviewscore,reviewtime,reviewtext,appearance,aroma,palate,taste,overall,profile_name,beer_name,beer_id,brewer_id,abv,style,image_url,user_id
0,3.9,2022-05-15T17:26:51.803Z,Tap at Fermentoren CPH. Poured a hazy golden c...,3.0,8.0,4.0,8.0,16.0,jmgreenuk,Hill Farmstead Arthur,131594.0,11233.0,6.0,Saison / Farmhouse / Grisette,https://res.cloudinary.com/ratebeer/image/uplo...,0


### 3. Text 전처리

In [20]:
counts = 0
for texts in raw_ratebeer["reviewtext"]:
    if "\r" in texts:
        counts +=1
    print(texts)
    break
print(counts)

Tap at Fermentoren CPH. Poured a hazy golden colour with, a lasting frothy white head. The aroma is big funky yeast. The flavour is moderate to strong sour, with a fresh, crisp, acidic, yeasty, tart fruits, light herbal palate. Medium bodied with average carbonation.


### 4. 리뷰 개수가 10개 미만인 유저 제거

In [5]:
# 리뷰 개수 10개 이상인 유저 리스트
over10user_list = (raw_ratebeer["profile_name"].value_counts() >= 10).where(lambda x: x == True).dropna().index.tolist()

# 리뷰 개수 10개 이상인 유저 리스트를 받아와서 해당 유저들로만 구성된 데이터셋으로 업데이트
raw_ratebeer = raw_ratebeer[raw_ratebeer["profile_name"].isin(over10user_list)]

# 인덱스 초기화
raw_ratebeer = raw_ratebeer[~raw_ratebeer.reviewscore.isna()]
raw_ratebeer.index = range(len(raw_ratebeer))

### 4. user-password 등록 (임의설정) - varcha

In [8]:
# np.random.seed(42)
# profile_name2password = {profile_name:str(np.random.randint(low = 0, high = 99999999)).zfill(8) for profile_name in raw_ratebeer.profile_name.unique()}
# raw_ratebeer["password"] = [profile_name2password[profile_name] for profile_name in raw_ratebeer.profile_name]

### 5. gender, birth

In [9]:
####

### 6. text 처리

In [12]:
for texts in raw_ratebeer["reviewtext"]:
    print(texts)
    

0          Tap at Fermentoren CPH. Poured a hazy golden c...
1          Bottle I shared with JB.  Beautiful cloudy str...
2          2019 vintage\nAroma is vinegar, sour apples, f...
3          Golden with a white head - Malt aroma with som...
4          Bouteille.\nRobe orangée, trouble, peu scintil...
                                 ...                        
3137528    A surprisingly nice, smooth Scottish ale with ...
3137529    Good beer, much beter on tap at the Brewery th...
3137530    Good roasted flavor with just a hint of sweetn...
3137531    It's a sweet beer. I think the more I drank it...
3137532    .................................................
Name: reviewtext, Length: 3137533, dtype: object

In [10]:
# raw_ratebeer["reviewtext"].iloc[3138322].replace("......", "")

### 7. type 정정

In [11]:
raw_ratebeer["beer_id"] = raw_ratebeer["beer_id"].astype(int)
raw_ratebeer["brewer_id"] = raw_ratebeer["brewer_id"].astype(int)
raw_ratebeer["user_id"] = raw_ratebeer["user_id"].astype(int)

### 8. Train Test split

In [45]:
n_user = raw_ratebeer.user_id.nunique()
test_ratio = 0.1
n_test_user = int(n_user*test_ratio)

np.random.seed = 42
test_user_list = np.random.choice(range(n_user), size = n_test_user, replace=False)

In [52]:
test_ratebeer = raw_ratebeer[raw_ratebeer["user_id"].isin(test_user_list)].drop("reviewtext", axis=1)
test_ratebeer.to_csv("test_ratebeer_without_text.csv", index = False)

In [50]:
train_ratebeer = raw_ratebeer[~raw_ratebeer["user_id"].isin(test_user_list)].drop("reviewtext", axis=1)
# "train_ratebeer_without_text.csv"
train_ratebeer.to_csv("train_ratebeer_without_text.csv", index = False)

In [51]:
len(train_ratebeer)

2808429

In [None]:
########################################################################
########################################################################
########################################################################
########################################################################

## 데이터 셋 분리 [Testset 분리]
- 총 15000명 가량의 유저를 통해서 
- 한 2000명에 대해서 각 1

In [13]:
raw_ratebeer.value_counts("user_id")

user_id
596      5998
560      5831
595      5516
554      5159
587      5120
         ... 
14203      10
14204      10
14205      10
5874       10
14700      10
Length: 14701, dtype: int64

### 8. training file making

In [12]:
raw_ratebeer[["user_id", "beer_id", "reviewscore", "reviewtime"]].isna().sum()

user_id        0
beer_id        0
reviewscore    0
reviewtime     0
dtype: int64

In [93]:
train_ratings = raw_ratebeer[["user_id", "beer_id", "reviewscore", "reviewtime"]]
train_ratings.columns = ["user", "item", "rating", "time"]
train_ratings.head(1)

Unnamed: 0,user,item,rating,time
0,0,131594,3.9,2022-05-15T17:26:51.803Z


In [96]:
train_ratings.to_csv(os.path.join(preprocessed_output_dir, "train_ratings.csv"), index=False)

### 8. attribute extraction (style)

In [97]:
style_df = raw_ratebeer.groupby("beer_id")["beer_id", "style"].head(1)
style_df.index = range(len(style_df))

  style_df = raw_ratebeer.groupby("beer_id")["beer_id", "style"].head(1)


In [100]:
style_df

Unnamed: 0,beer_id,style
0,131594,Saison / Farmhouse / Grisette
1,121536,Specialty Grain - Other
2,12617,Pale Lager - American
3,14940,IPA - English
4,25933,Blonde Ale / Golden Ale
...,...,...
9330,118583,Red Ale - Irish
9331,786,Amber Lager - International / Vienna
9332,51898,Stout - Imperial
9333,5067,Red Ale / International Amber Ale


In [104]:
array, index = pd.factorize(style_df["style"])
style_df["style"] = array
style_df.groupby("beer_id")["style"].apply(list).to_json(os.path.join(preprocessed_output_dir, "rb_item2attributes.json"))