In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import math
import time
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings(action='ignore')

## dataset

In [2]:
ratebeer = pd.read_json("../data/ratebeer_korea.json")

In [3]:
ratebeer.tail()

Unnamed: 0,reviewScore,reviewTime,reviewText,appearance,aroma,palate,taste,overall,profileName,beerName,beerID,brewerID,ABV,style,imageUrl
94316,3.0,2000-06-14 04:58:02.000,"pale clear golden colour with a fine head, low...",4.0,5.0,3.0,6.0,12.0,Bov,Beck's,703,112,4.8,Pale Lager - International / Premium,https://res.cloudinary.com/ratebeer/image/uplo...
94317,4.2,2000-06-10 16:51:58.000,"love that skunky smell ! Actually, my fav eve...",4.0,9.0,4.0,8.0,17.0,sallyclub99,Beck's,703,112,4.8,Pale Lager - International / Premium,https://res.cloudinary.com/ratebeer/image/uplo...
94318,2.7,2000-06-05 08:46:21.000,Much better than American pilsners which isn't...,2.0,5.0,2.0,6.0,12.0,kublai3,Beck's,703,112,4.8,Pale Lager - International / Premium,https://res.cloudinary.com/ratebeer/image/uplo...
94319,3.3,2000-06-03 23:59:02.000,"A decent, light-German beer. Not bad, but then...",3.0,6.0,3.0,6.0,15.0,Aubrey,Beck's,703,112,4.8,Pale Lager - International / Premium,https://res.cloudinary.com/ratebeer/image/uplo...
94320,3.3,2000-05-07 01:07:31.000,Beck's is a decent german beer. It's a little...,3.0,6.0,4.0,7.0,13.0,roland,Beck's,703,112,4.8,Pale Lager - International / Premium,https://res.cloudinary.com/ratebeer/image/uplo...


In [4]:
ratebeer.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 93623 entries, 0 to 94320
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   reviewScore  93620 non-null  float64
 1   reviewTime   93623 non-null  object 
 2   reviewText   93623 non-null  object 
 3   appearance   92036 non-null  float64
 4   aroma        92036 non-null  float64
 5   palate       92036 non-null  float64
 6   taste        92036 non-null  float64
 7   overall      92036 non-null  float64
 8   profileName  93623 non-null  object 
 9   beerName     93623 non-null  object 
 10  beerID       93623 non-null  int64  
 11  brewerID     93623 non-null  int64  
 12  ABV          93623 non-null  float64
 13  style        93623 non-null  object 
 14  imageUrl     93623 non-null  object 
dtypes: float64(7), int64(2), object(6)
memory usage: 11.4+ MB


In [5]:
# Remove rows whose reviewScore is missing, then renumber the index from 0
ratebeer = (
    ratebeer
    .dropna(subset=['reviewScore'])
    .reset_index(drop=True)
)
ratebeer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93620 entries, 0 to 93619
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   reviewScore  93620 non-null  float64
 1   reviewTime   93620 non-null  object 
 2   reviewText   93620 non-null  object 
 3   appearance   92036 non-null  float64
 4   aroma        92036 non-null  float64
 5   palate       92036 non-null  float64
 6   taste        92036 non-null  float64
 7   overall      92036 non-null  float64
 8   profileName  93620 non-null  object 
 9   beerName     93620 non-null  object 
 10  beerID       93620 non-null  int64  
 11  brewerID     93620 non-null  int64  
 12  ABV          93620 non-null  float64
 13  style        93620 non-null  object 
 14  imageUrl     93620 non-null  object 
dtypes: float64(7), int64(2), object(6)
memory usage: 10.7+ MB


### 편의점 유통 맥주 개수

In [6]:
# Number of distinct Korean convenience-store beers in the full dataset
# (dropna=False keeps the count identical to len(Series.unique()))
ratebeer['beerName'].nunique(dropna=False)

81

## preprocessing

### reviewScore 기준 상위권

In [7]:
# Top-k beers ranked by mean reviewScore
def reviewScore_topk(df, topk):
    """Return the beerIDs of the `topk` beers with the highest mean reviewScore.

    Parameters
    ----------
    df : pandas.DataFrame
        Review table containing at least `beerID` and `reviewScore` columns.
    topk : int
        Number of beers to return.

    Returns
    -------
    pandas.Index
        beerIDs (index name `beerID`) ordered from highest to lowest mean
        reviewScore.
    """
    # Select the score column *before* aggregating. Since pandas 2.0,
    # GroupBy.mean() raises TypeError on text columns (reviewText, beerName, ...)
    # instead of silently dropping them, so averaging the whole frame fails.
    mean_scores = df.groupby(['beerID'])[['reviewScore']].mean()
    ranked = mean_scores.sort_values(by=['reviewScore'], ascending=False)
    return ranked.iloc[:topk, :].index

In [8]:
topk = 4

reviewScore_topk_list = reviewScore_topk(ratebeer, topk)

In [9]:
ratebeer[ratebeer['beerID'].isin(reviewScore_topk_list)]['beerName'].unique()

array(['Platinum White Ale', 'Paulaner Hefe-Weissbier',
       'Franziskaner Hefe-Weissbier / Weissbier Naturtrub',
       'BrewDog Punk IPA'], dtype=object)

In [10]:
ratebeer[ratebeer['beerID'].isin(reviewScore_topk_list)]['imageUrl'].unique()

array(['https://res.cloudinary.com/ratebeer/image/upload/w_400,c_limit,d_Default_Beer_qqrv7k.png,f_auto/beer_580686',
       'https://res.cloudinary.com/ratebeer/image/upload/w_400,c_limit,d_Default_Beer_qqrv7k.png,f_auto/beer_647',
       'https://res.cloudinary.com/ratebeer/image/upload/w_400,c_limit,d_Default_Beer_qqrv7k.png,f_auto/beer_1088',
       'https://res.cloudinary.com/ratebeer/image/upload/w_400,c_limit,d_Default_Beer_qqrv7k.png,f_auto/beer_135361'],
      dtype=object)

### 리뷰수 기준 상위권

In [11]:
# Top-k beers ranked by number of reviews
def reviewNum_topk(df, topk):
    """Return the beerIDs of the `topk` most-reviewed beers.

    The review count of a beer is the number of non-null `beerName` entries
    among its rows. The result is a pandas Index named `beerID`, ordered from
    most to fewest reviews.
    """
    per_beer_counts = df.groupby(['beerID']).count()
    name_counts = per_beer_counts[['beerName']]
    ranked = name_counts.sort_values(by=['beerName'], ascending=False)
    return ranked.iloc[:topk, :].index

In [12]:
topk = 4

reviewNum_topk_list = reviewNum_topk(ratebeer, topk)

In [13]:
ratebeer[ratebeer['beerID'].isin(reviewNum_topk_list)]['beerName'].unique()

array(['Pilsner Urquell', 'Hoegaarden', 'Heineken', 'Budweiser'],
      dtype=object)

In [14]:
ratebeer[ratebeer['beerID'].isin(reviewNum_topk_list)]['imageUrl'].unique()

array(['https://res.cloudinary.com/ratebeer/image/upload/w_400,c_limit,d_Default_Beer_qqrv7k.png,f_auto/beer_717',
       'https://res.cloudinary.com/ratebeer/image/upload/w_400,c_limit,d_Default_Beer_qqrv7k.png,f_auto/beer_399',
       'https://res.cloudinary.com/ratebeer/image/upload/w_400,c_limit,d_Default_Beer_qqrv7k.png,f_auto/beer_37',
       'https://res.cloudinary.com/ratebeer/image/upload/w_400,c_limit,d_Default_Beer_qqrv7k.png,f_auto/beer_473'],
      dtype=object)

### Overall 기준 상위권

In [15]:
# Top-k beers ranked by mean overall rating
def overall_topk(df, topk):
    """Return the beerIDs of the `topk` beers with the highest mean `overall`.

    Parameters
    ----------
    df : pandas.DataFrame
        Review table containing at least `beerID` and `overall` columns.
    topk : int
        Number of beers to return.

    Returns
    -------
    pandas.Index
        beerIDs (index name `beerID`) ordered from highest to lowest mean
        `overall`. Missing `overall` values are skipped by the mean.
    """
    # Restrict to the `overall` column before aggregating. pandas >= 2.0 no
    # longer drops non-numeric columns in GroupBy.mean() and raises TypeError
    # when the frame contains text columns.
    mean_overall = df.groupby(['beerID'])[['overall']].mean()
    ranked = mean_overall.sort_values(by=['overall'], ascending=False)
    return ranked.iloc[:topk, :].index

In [16]:
topk = 4

overall_topk_list = overall_topk(ratebeer, topk)

In [17]:
ratebeer[ratebeer['beerID'].isin(overall_topk_list)]['beerName'].unique()

array(['Platinum White Ale', 'Paulaner Hefe-Weissbier',
       'Franziskaner Hefe-Weissbier / Weissbier Naturtrub',
       'BrewDog Punk IPA'], dtype=object)

In [18]:
ratebeer[ratebeer['beerID'].isin(overall_topk_list)]['imageUrl'].unique()

array(['https://res.cloudinary.com/ratebeer/image/upload/w_400,c_limit,d_Default_Beer_qqrv7k.png,f_auto/beer_580686',
       'https://res.cloudinary.com/ratebeer/image/upload/w_400,c_limit,d_Default_Beer_qqrv7k.png,f_auto/beer_647',
       'https://res.cloudinary.com/ratebeer/image/upload/w_400,c_limit,d_Default_Beer_qqrv7k.png,f_auto/beer_1088',
       'https://res.cloudinary.com/ratebeer/image/upload/w_400,c_limit,d_Default_Beer_qqrv7k.png,f_auto/beer_135361'],
      dtype=object)

### Steam Rating Formula

In [19]:
def steam_Rating(df, topk):
    """Return the `topk` beerIDs ranked by a Steam-style adjusted rating.

    Each beer's mean reviewScore is pulled towards 3.0 by a factor that
    shrinks as the beer collects more reviews:

        score = avg - (avg - 3.0) * 2 ** (-log10(n))

    where `avg` is the beer's mean reviewScore and `n` its number of non-null
    reviewScore entries. With a single review the score is exactly 3.0; with
    many reviews it approaches the plain average.

    Parameters
    ----------
    df : pandas.DataFrame
        Review table containing at least `beerID` and `reviewScore` columns.
    topk : int
        Number of beers to return.

    Returns
    -------
    pandas.Index
        beerIDs (index name `beerID`) ordered from highest to lowest score.
    """
    # Aggregate only the reviewScore column: GroupBy.mean() over the whole
    # frame raises TypeError on text columns in pandas >= 2.0, and counting
    # every column just to read one of them is wasted work.
    per_beer = df.groupby(['beerID'])[['reviewScore']]
    avg_rating = per_beer.mean()
    num_reviews = per_beer.count()

    # Weight of the pull towards 3.0; equals 1 for n == 1 and halves for
    # every tenfold increase in the number of reviews.
    shrink = 2 ** (-1 * np.log10(num_reviews))
    score = avg_rating - ((avg_rating - 3.0) * shrink)

    return score.sort_values(by=['reviewScore'], ascending=False).iloc[:topk, :].index

In [20]:
topk = 4

steamRating_topk_list = steam_Rating(ratebeer, topk)

In [21]:
ratebeer[ratebeer['beerID'].isin(steamRating_topk_list)]['beerName'].unique()

array(['Paulaner Hefe-Weissbier', 'Hoegaarden',
       'Franziskaner Hefe-Weissbier / Weissbier Naturtrub',
       'BrewDog Punk IPA'], dtype=object)

In [22]:
ratebeer[ratebeer['beerID'].isin(steamRating_topk_list)]['imageUrl'].unique()

array(['https://res.cloudinary.com/ratebeer/image/upload/w_400,c_limit,d_Default_Beer_qqrv7k.png,f_auto/beer_647',
       'https://res.cloudinary.com/ratebeer/image/upload/w_400,c_limit,d_Default_Beer_qqrv7k.png,f_auto/beer_399',
       'https://res.cloudinary.com/ratebeer/image/upload/w_400,c_limit,d_Default_Beer_qqrv7k.png,f_auto/beer_1088',
       'https://res.cloudinary.com/ratebeer/image/upload/w_400,c_limit,d_Default_Beer_qqrv7k.png,f_auto/beer_135361'],
      dtype=object)

### Hacker News Formula

In [23]:
def Hacker_News(df, topk, now=None):
    """Return the `topk` beerIDs ranked by a Hacker-News-style score.

    For every beer the score is

        (number_of_reviews - 1) / mean((age_in_days + 2) ** 1.8)

    where the mean runs over that beer's reviews and `age_in_days` is the
    whole number of days between `now` and the review's `reviewTime`.
    Beers with many recent reviews therefore rank highest.

    Parameters
    ----------
    df : pandas.DataFrame
        Review table containing at least `beerID`, `beerName` and `reviewTime`
        columns. The frame is not modified.
    topk : int
        Number of beers to return.
    now : datetime-like, optional
        Reference time used to compute review ages. Defaults to
        `datetime.now()`; pass a fixed value for reproducible rankings.

    Returns
    -------
    pandas.Index
        beerIDs (index name `beerID`) ordered from highest to lowest score.
    """
    if now is None:
        now = datetime.now()

    # Reviews per beer (non-null beerName entries), minus one as in the HN formula
    pageviews = df.groupby(['beerID'])['beerName'].count() - 1

    # Time-decay weight of every review relative to `now`. Kept in a local
    # Series so the caller's DataFrame does not gain an `age` column.
    review_age = ((now - pd.to_datetime(df['reviewTime'])).dt.days + 2) ** 1.8

    # Mean decay weight per beer. Grouping only this Series avoids averaging
    # text columns, which raises TypeError in pandas >= 2.0.
    age = review_age.groupby(df['beerID']).mean()

    score = pageviews / age

    return score.sort_values(ascending = False).iloc[:topk].index

In [24]:
topk = 4

HackerNews_topk_list = Hacker_News(ratebeer, topk)

In [25]:
ratebeer[ratebeer['beerID'].isin(HackerNews_topk_list)]['beerName'].unique()

array(['Pilsner Urquell', 'Leffe Blonde / Blond', 'Hoegaarden',
       'BrewDog Punk IPA'], dtype=object)

In [26]:
ratebeer[ratebeer['beerID'].isin(HackerNews_topk_list)]['imageUrl'].unique()

array(['https://res.cloudinary.com/ratebeer/image/upload/w_400,c_limit,d_Default_Beer_qqrv7k.png,f_auto/beer_717',
       'https://res.cloudinary.com/ratebeer/image/upload/w_400,c_limit,d_Default_Beer_qqrv7k.png,f_auto/beer_2514',
       'https://res.cloudinary.com/ratebeer/image/upload/w_400,c_limit,d_Default_Beer_qqrv7k.png,f_auto/beer_399',
       'https://res.cloudinary.com/ratebeer/image/upload/w_400,c_limit,d_Default_Beer_qqrv7k.png,f_auto/beer_135361'],
      dtype=object)

### popular topk

In [27]:
# Return the index of top-k beerIDs for the chosen ranking method
def popular_topk(df, topk, method='news'):
    """Dispatch to one of the top-k ranking functions by name.

    `method` may be 'reviewScore', 'reviewNum', 'overall' or 'steam'; any
    other value (including the default 'news') uses `Hacker_News`.
    Returns whatever the selected ranking function returns for (df, topk).
    """
    named_rankers = (
        ('reviewScore', reviewScore_topk),
        ('reviewNum', reviewNum_topk),
        ('overall', overall_topk),
        ('steam', steam_Rating),
    )
    for label, ranker in named_rankers:
        if method == label:
            return ranker(df, topk)

    # No named method matched: fall back to the Hacker News ranking.
    return Hacker_News(df, topk)

In [34]:
topk = 4

# Print the top-k beerIDs produced by each ranking method, in a fixed order
for ranking_method in ('reviewScore', 'reviewNum', 'overall', 'steam', 'news'):
    print(popular_topk(ratebeer, topk, method=ranking_method))

Int64Index([580686, 135361, 1088, 647], dtype='int64', name='beerID')
Int64Index([399, 473, 717, 37], dtype='int64', name='beerID')
Int64Index([580686, 135361, 1088, 647], dtype='int64', name='beerID')
Int64Index([135361, 1088, 647, 399], dtype='int64', name='beerID')
Int64Index([135361, 399, 2514, 717], dtype='int64', name='beerID')


In [35]:
# Names of the top-k beers
def topk_beerName(df, topk_list):
    """Return the unique `beerName` values of rows whose `beerID` is in `topk_list`.

    Names appear in order of first occurrence in `df`, not in the order of
    `topk_list`.
    """
    is_selected = df['beerID'].isin(topk_list)
    selected_names = df.loc[is_selected, 'beerName']
    return selected_names.unique()

# Image URLs of the top-k beers
def topk_Image(df, topk_list):
    """Return the unique `imageUrl` values of rows whose `beerID` is in `topk_list`.

    URLs appear in order of first occurrence in `df`, not in the order of
    `topk_list`.
    """
    is_selected = df['beerID'].isin(topk_list)
    selected_urls = df.loc[is_selected, 'imageUrl']
    return selected_urls.unique()

In [36]:
topk_list = popular_topk(ratebeer, topk, method='news')

print(topk_beerName(ratebeer, topk_list))
print(topk_Image(ratebeer, topk_list))

['Pilsner Urquell' 'Leffe Blonde / Blond' 'Hoegaarden' 'BrewDog Punk IPA']
['https://res.cloudinary.com/ratebeer/image/upload/w_400,c_limit,d_Default_Beer_qqrv7k.png,f_auto/beer_717'
 'https://res.cloudinary.com/ratebeer/image/upload/w_400,c_limit,d_Default_Beer_qqrv7k.png,f_auto/beer_2514'
 'https://res.cloudinary.com/ratebeer/image/upload/w_400,c_limit,d_Default_Beer_qqrv7k.png,f_auto/beer_399'
 'https://res.cloudinary.com/ratebeer/image/upload/w_400,c_limit,d_Default_Beer_qqrv7k.png,f_auto/beer_135361']
