In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import math
import time
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings(action='ignore')

## dataset

In [2]:
# Load the RateBeer review dump into a DataFrame (path is relative to the notebook).
ratebeer = pd.read_json(path_or_buf="../data/ratebeer_korea.json")

In [3]:
# Peek at the last five rows of the loaded reviews.
ratebeer.tail(n=5)

Unnamed: 0,reviewScore,reviewTime,reviewText,appearance,aroma,palate,taste,overall,profileName,beerName,beerID,brewerID,ABV,style,imageUrl
94316,3.0,2000-06-14 04:58:02.000,"pale clear golden colour with a fine head, low...",4.0,5.0,3.0,6.0,12.0,Bov,Beck's,703,112,4.8,Pale Lager - International / Premium,https://res.cloudinary.com/ratebeer/image/uplo...
94317,4.2,2000-06-10 16:51:58.000,"love that skunky smell ! Actually, my fav eve...",4.0,9.0,4.0,8.0,17.0,sallyclub99,Beck's,703,112,4.8,Pale Lager - International / Premium,https://res.cloudinary.com/ratebeer/image/uplo...
94318,2.7,2000-06-05 08:46:21.000,Much better than American pilsners which isn't...,2.0,5.0,2.0,6.0,12.0,kublai3,Beck's,703,112,4.8,Pale Lager - International / Premium,https://res.cloudinary.com/ratebeer/image/uplo...
94319,3.3,2000-06-03 23:59:02.000,"A decent, light-German beer. Not bad, but then...",3.0,6.0,3.0,6.0,15.0,Aubrey,Beck's,703,112,4.8,Pale Lager - International / Premium,https://res.cloudinary.com/ratebeer/image/uplo...
94320,3.3,2000-05-07 01:07:31.000,Beck's is a decent german beer. It's a little...,3.0,6.0,4.0,7.0,13.0,roland,Beck's,703,112,4.8,Pale Lager - International / Premium,https://res.cloudinary.com/ratebeer/image/uplo...


In [4]:
# Print column dtypes and non-null counts to spot columns with missing values.
ratebeer.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 93623 entries, 0 to 94320
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   reviewScore  93620 non-null  float64
 1   reviewTime   93623 non-null  object 
 2   reviewText   93623 non-null  object 
 3   appearance   92036 non-null  float64
 4   aroma        92036 non-null  float64
 5   palate       92036 non-null  float64
 6   taste        92036 non-null  float64
 7   overall      92036 non-null  float64
 8   profileName  93623 non-null  object 
 9   beerName     93623 non-null  object 
 10  beerID       93623 non-null  int64  
 11  brewerID     93623 non-null  int64  
 12  ABV          93623 non-null  float64
 13  style        93623 non-null  object 
 14  imageUrl     93623 non-null  object 
dtypes: float64(7), int64(2), object(6)
memory usage: 11.4+ MB


In [5]:
# Remove rows whose reviewScore is missing; nulls in other columns are left in place.
ratebeer = ratebeer.dropna(axis="index", how="any", subset=["reviewScore"])

# Re-check the non-null counts after dropping.
ratebeer.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 93620 entries, 0 to 94320
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   reviewScore  93620 non-null  float64
 1   reviewTime   93620 non-null  object 
 2   reviewText   93620 non-null  object 
 3   appearance   92036 non-null  float64
 4   aroma        92036 non-null  float64
 5   palate       92036 non-null  float64
 6   taste        92036 non-null  float64
 7   overall      92036 non-null  float64
 8   profileName  93620 non-null  object 
 9   beerName     93620 non-null  object 
 10  beerID       93620 non-null  int64  
 11  brewerID     93620 non-null  int64  
 12  ABV          93620 non-null  float64
 13  style        93620 non-null  object 
 14  imageUrl     93620 non-null  object 
dtypes: float64(7), int64(2), object(6)
memory usage: 11.4+ MB


### 편의점 유통 맥주 개수

In [6]:
# Number of distinct beers (by beerName) in the whole dataset,
# i.e. the beers sold in Korean convenience stores.
ratebeer["beerName"].unique().size

81

## preprocessing

### reviewScore 기준 상위권

In [18]:
# beerIDs of the four beers with the highest mean reviewScore.
# The column is selected *before* aggregating so only numeric data is averaged:
# calling .mean() on the whole grouped frame also tries to average the text
# columns (reviewText, style, ...), which raises TypeError on pandas >= 2.0
# and is wasted work on older versions.
reviewScore_top = (
    ratebeer.groupby('beerID')[['reviewScore']]
    .mean()
    .sort_values(by='reviewScore', ascending=False)
    .head(4)
    .index
)

In [20]:
# Names of the beers whose beerID is in the top-by-reviewScore set.
ratebeer.loc[ratebeer['beerID'].isin(reviewScore_top), 'beerName'].unique()

array(['Platinum White Ale', 'Paulaner Hefe-Weissbier',
       'Franziskaner Hefe-Weissbier / Weissbier Naturtrub',
       'BrewDog Punk IPA'], dtype=object)

In [36]:
# Image URLs of the beers whose beerID is in the top-by-reviewScore set.
ratebeer.loc[ratebeer['beerID'].isin(reviewScore_top), 'imageUrl'].unique()

array(['https://res.cloudinary.com/ratebeer/image/upload/w_400,c_limit,d_Default_Beer_qqrv7k.png,f_auto/beer_580686',
       'https://res.cloudinary.com/ratebeer/image/upload/w_400,c_limit,d_Default_Beer_qqrv7k.png,f_auto/beer_647',
       'https://res.cloudinary.com/ratebeer/image/upload/w_400,c_limit,d_Default_Beer_qqrv7k.png,f_auto/beer_1088',
       'https://res.cloudinary.com/ratebeer/image/upload/w_400,c_limit,d_Default_Beer_qqrv7k.png,f_auto/beer_135361'],
      dtype=object)

### 리뷰수 기준 상위권

In [28]:
# beerIDs of the four beers with the most reviews
# (counted as non-null beerName entries per beerID).
reviewNum_top = (
    ratebeer.groupby('beerID')[['beerName']]
    .count()
    .sort_values(by='beerName', ascending=False)
    .head(4)
    .index
)

In [31]:
# Names of the most-reviewed beers.
ratebeer.loc[ratebeer['beerID'].isin(reviewNum_top), 'beerName'].unique()

array(['Pilsner Urquell', 'Hoegaarden', 'Heineken', 'Budweiser'],
      dtype=object)

In [32]:
# Image URLs of the most-reviewed beers.
ratebeer.loc[ratebeer['beerID'].isin(reviewNum_top), 'imageUrl'].unique()

array(['https://res.cloudinary.com/ratebeer/image/upload/w_400,c_limit,d_Default_Beer_qqrv7k.png,f_auto/beer_717',
       'https://res.cloudinary.com/ratebeer/image/upload/w_400,c_limit,d_Default_Beer_qqrv7k.png,f_auto/beer_399',
       'https://res.cloudinary.com/ratebeer/image/upload/w_400,c_limit,d_Default_Beer_qqrv7k.png,f_auto/beer_37',
       'https://res.cloudinary.com/ratebeer/image/upload/w_400,c_limit,d_Default_Beer_qqrv7k.png,f_auto/beer_473'],
      dtype=object)

### Overall 기준 상위권

In [33]:
# beerIDs of the four beers with the highest mean `overall` sub-score.
# Select the column before aggregating: .mean() over the whole grouped frame would
# also try to average the text columns, which raises TypeError on pandas >= 2.0.
# Rows with a missing `overall` value are skipped by the mean.
overall_top = (
    ratebeer.groupby('beerID')[['overall']]
    .mean()
    .sort_values(by='overall', ascending=False)
    .head(4)
    .index
)

In [34]:
# Names of the beers whose beerID is in the top-by-overall set.
ratebeer.loc[ratebeer['beerID'].isin(overall_top), 'beerName'].unique()

array(['Platinum White Ale', 'Paulaner Hefe-Weissbier',
       'Franziskaner Hefe-Weissbier / Weissbier Naturtrub',
       'BrewDog Punk IPA'], dtype=object)

In [35]:
# Image URLs of the beers whose beerID is in the top-by-overall set.
ratebeer.loc[ratebeer['beerID'].isin(overall_top), 'imageUrl'].unique()

array(['https://res.cloudinary.com/ratebeer/image/upload/w_400,c_limit,d_Default_Beer_qqrv7k.png,f_auto/beer_580686',
       'https://res.cloudinary.com/ratebeer/image/upload/w_400,c_limit,d_Default_Beer_qqrv7k.png,f_auto/beer_647',
       'https://res.cloudinary.com/ratebeer/image/upload/w_400,c_limit,d_Default_Beer_qqrv7k.png,f_auto/beer_1088',
       'https://res.cloudinary.com/ratebeer/image/upload/w_400,c_limit,d_Default_Beer_qqrv7k.png,f_auto/beer_135361'],
      dtype=object)

### Steam Rating Formula

In [47]:
# Mean reviewScore per beer, kept as a one-column DataFrame indexed by beerID.
# The column is selected before .mean() so the text columns are never averaged
# (doing so raises TypeError on pandas >= 2.0).
avg_rating = ratebeer.groupby('beerID')[['reviewScore']].mean()

In [48]:
# Display the per-beer mean reviewScore table.
avg_rating

Unnamed: 0_level_0,reviewScore
beerID,Unnamed: 1_level_1
37,2.168056
55,3.230782
221,2.497443
251,2.204497
268,2.234644
...,...
580686,4.100000
614833,2.500000
621308,2.790909
632627,2.682927


In [49]:
# Pull each beer's mean reviewScore toward 3.0, with a pull that shrinks as the
# number of reviews grows: a beer with a single review lands exactly on 3.0.
# NOTE(review): the SteamDB rating formula, as published, is
#   rating - (rating - 0.5) * 2 ** (-log10(n + 1))
# whereas this cell uses log2(n) and a pivot of 3.0 — TODO confirm this is intentional.
review_counts = ratebeer.groupby('beerID')[['reviewScore']].count()
shrink_weight = 2 ** (-np.log2(review_counts))  # algebraically 1 / review_counts
score = avg_rating - (avg_rating - 3.0) * shrink_weight
score

Unnamed: 0_level_0,reviewScore
beerID,Unnamed: 1_level_1
37,2.168253
55,3.230406
221,2.498871
251,2.204885
268,2.235030
...,...
580686,3.000000
614833,2.600000
621308,2.797245
632627,2.690660


In [50]:
# beerIDs of the four beers with the highest adjusted score.
streamRating_top = score.sort_values(by='reviewScore', ascending=False).head(4).index

In [51]:
# Names of the beers ranked highest by the adjusted score.
ratebeer.loc[ratebeer['beerID'].isin(streamRating_top), 'beerName'].unique()

array(['Paulaner Hefe-Weissbier', 'Goose Island Goose IPA',
       'Franziskaner Hefe-Weissbier / Weissbier Naturtrub',
       'BrewDog Punk IPA'], dtype=object)

In [52]:
# Image URLs of the beers ranked highest by the adjusted score.
ratebeer.loc[ratebeer['beerID'].isin(streamRating_top), 'imageUrl'].unique()

array(['https://res.cloudinary.com/ratebeer/image/upload/w_400,c_limit,d_Default_Beer_qqrv7k.png,f_auto/beer_647',
       'https://res.cloudinary.com/ratebeer/image/upload/w_400,c_limit,d_Default_Beer_qqrv7k.png,f_auto/beer_814',
       'https://res.cloudinary.com/ratebeer/image/upload/w_400,c_limit,d_Default_Beer_qqrv7k.png,f_auto/beer_1088',
       'https://res.cloudinary.com/ratebeer/image/upload/w_400,c_limit,d_Default_Beer_qqrv7k.png,f_auto/beer_135361'],
      dtype=object)

### Hacker News Formula

In [72]:
# Review count per beer minus one, sorted from most- to least-reviewed.
# NOTE(review): presumably the (P - 1) term of the Hacker News ranking formula — confirm.
(
    ratebeer.groupby('beerID')[['beerName']]
    .count()
    .sort_values(by='beerName', ascending=False)
    - 1
)

Unnamed: 0_level_0,beerName
beerID,Unnamed: 1_level_1
399,4620
473,4428
717,4347
37,4222
742,4075
...,...
252000,7
506847,4
614833,4
480225,1


In [99]:
# Convert reviewTime to whole seconds since the Unix epoch (int64 ndarray).
# Dividing the offset from the epoch by a one-second Timedelta does not depend on the
# resolution pandas stores the parsed datetimes in; casting the raw datetime64 values
# to int64 and floor-dividing by 10**9 is only correct when they are nanoseconds.
(
    (pd.to_datetime(ratebeer['reviewTime']) - pd.Timestamp("1970-01-01"))
    // pd.Timedelta("1s")
).to_numpy()

array([1614430433, 1610129192, 1608214821, ...,  960194781,  960076742,
        957661651])