### 0. Kaggle을 통해 data 가져오기

In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("ehallmar/beers-breweries-and-beer-reviews")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\kjw02\.cache\kagglehub\datasets\ehallmar\beers-breweries-and-beer-reviews\versions\2


### 1. 필요 라이브러리 설치 및 data 확인

In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, classification_report
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import KFold as SurpriseKFold

In [4]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [5]:
file_list = os.listdir(path)
print(file_list)

['beers.csv', 'breweries.csv', 'reviews.csv']


In [6]:
beer = os.path.join(path, 'beers.csv')
breweries = os.path.join(path, 'breweries.csv')
reviews = os.path.join(path, 'reviews.csv')

In [7]:
beer_df = pd.read_csv(beer)
breweries_df = pd.read_csv(breweries)
reviews_df = pd.read_csv(reviews)

print(f"Beer DataFrame: {beer_df.head()}")
print(f"Breweries DataFrame: {breweries_df.head()}")
print(f"Reviews DataFrame: {reviews_df.head()}")

Beer DataFrame:        id                                  name  brewery_id state country  \
0  202522                        Olde Cogitator        2199    CA      US   
1   82352  Konrads Stout Russian Imperial Stout       18604   NaN      NO   
2  214879                        Scottish Right       44306    IN      US   
3  320009               MegaMeow Imperial Stout        4378    WA      US   
4  246438                       Peaches-N-Cream       44617    PA      US   

                     style availability   abv                   notes retired  
0    English Oatmeal Stout     Rotating   7.3  No notes at this time.       f  
1   Russian Imperial Stout     Rotating  10.4  No notes at this time.       f  
2             Scottish Ale   Year-round   4.0  No notes at this time.       t  
3  American Imperial Stout       Winter   8.7    Every time this year       f  
4       American Cream Ale     Rotating   5.1  No notes at this time.       f  
Breweries DataFrame:       id            

### 2. EDA 및 preprocessing

In [None]:
# beer_df & breweries_df를 brewery_id를 기준으로 병합

merge_df = pd.merge(beer_df, breweries_df, how = 'left', left_on = 'brewery_id', right_on = 'id', suffixes = ('_beer', '_brewery'))

print(f"Merged DataFrame: {merge_df.head()}")

Merged DataFrame:    id_beer                             name_beer  brewery_id state_beer  \
0   202522                        Olde Cogitator        2199         CA   
1    82352  Konrads Stout Russian Imperial Stout       18604        NaN   
2   214879                        Scottish Right       44306         IN   
3   320009               MegaMeow Imperial Stout        4378         WA   
4   246438                       Peaches-N-Cream       44617         PA   

  country_beer                    style availability   abv  \
0           US    English Oatmeal Stout     Rotating   7.3   
1           NO   Russian Imperial Stout     Rotating  10.4   
2           US             Scottish Ale   Year-round   4.0   
3           US  American Imperial Stout       Winter   8.7   
4           US       American Cream Ale     Rotating   5.1   

               notes_beer retired  id_brewery                name_brewery  \
0  No notes at this time.       f        2199         Main Street Brewery   
1  N

In [None]:
# merge의 결과로 봤을 때 state와 country는 beer와 brewery가 각각 동일한 값으로 판단
# beer_df의 값으로 통일

merge_df['state'] = merge_df['state_beer'].fillna(merge_df['state_brewery'])
merge_df['country'] = merge_df['country_beer'].fillna(merge_df['country_brewery'])

# 중복되는 column 제거

merge_df = merge_df.drop(columns=[
    'state_beer', 'state_brewery', 'country_beer', 'country_brewery'
])

# column 이름을 재정리 및 순서 재배치
df = merge_df.rename(columns={
    'id_beer': 'beer_id',
    'name_beer': 'beer_name',
    'name_brewery': 'brewery_name',
    'notes_beer': 'beer_notes',
    'notes_brewery': 'brewery_notes'
})

if 'id_brewery' in df.columns:
    df = df.drop(columns=['id_brewery'])

df = df[['beer_id', 'beer_name', 'brewery_id', 'brewery_name', 'city', 'state', 'country', 'style', 'availability', 'abv', 'types', 'beer_notes', 'brewery_notes', 'retired']]

print(f"Final DataFrame: {df.head()}")

Final DataFrame:    beer_id                             beer_name  brewery_id  \
0   202522                        Olde Cogitator        2199   
1    82352  Konrads Stout Russian Imperial Stout       18604   
2   214879                        Scottish Right       44306   
3   320009               MegaMeow Imperial Stout        4378   
4   246438                       Peaches-N-Cream       44617   

                 brewery_name        city state country  \
0         Main Street Brewery  Pleasanton    CA      US   
1        Lervig Aktiebryggeri   Stavanger   NaN      NO   
2               Byway Brewing     Hammond    IN      US   
3  Georgetown Brewing Company     Seattle    WA      US   
4         Mad Princes Brewing  Doylestown    PA      US   

                     style availability   abv  \
0    English Oatmeal Stout     Rotating   7.3   
1   Russian Imperial Stout     Rotating  10.4   
2             Scottish Ale   Year-round   4.0   
3  American Imperial Stout       Winter   8.7  

In [None]:
# merge한 data의 결측치 확인

df.isna().sum()

beer_id              0
beer_name            0
brewery_id           0
brewery_name         0
city              3034
state            60592
country             20
style                1
availability         0
abv              38797
types                0
beer_notes          46
brewery_notes      533
retired              0
dtype: int64

In [None]:
# 결측치 확인 결과 numericcal data는 abv, 나머지는 categorical data
# abv는 mean으로 대체, 나머지는 'Unknown'/'No notes'로 대체

abv_imputer = SimpleImputer(strategy = 'mean')
df['abv'] = abv_imputer.fit_transform(df[['abv']])

df['style'] = df['style'].fillna('Unknown')
df['country'] = df['country'].fillna('Unknown')
df['state'] = df['state'].fillna('Unknown')
df['city'] = df['city'].fillna('Unknown')
df['beer_notes'] = df['beer_notes'].fillna('No notes')
df['brewery_notes'] = df['brewery_notes'].fillna('No notes')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 358873 entries, 0 to 358872
Data columns (total 14 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   beer_id        358873 non-null  int64  
 1   beer_name      358873 non-null  object 
 2   brewery_id     358873 non-null  int64  
 3   brewery_name   358873 non-null  object 
 4   city           358873 non-null  object 
 5   state          358873 non-null  object 
 6   country        358873 non-null  object 
 7   style          358873 non-null  object 
 8   availability   358873 non-null  object 
 9   abv            358873 non-null  float64
 10  types          358873 non-null  object 
 11  beer_notes     358873 non-null  object 
 12  brewery_notes  358873 non-null  object 
 13  retired        358873 non-null  object 
dtypes: float64(1), int64(2), object(11)
memory usage: 38.3+ MB


In [None]:
# df와 reviews_df를 beer_id 기준으로 병합

full_data = pd.merge(df, reviews_df, on = 'beer_id', how = 'left')

full_data.head()

Unnamed: 0,beer_id,beer_name,brewery_id,brewery_name,city,state,country,style,availability,abv,types,beer_notes,brewery_notes,retired,username,date,text,look,smell,taste,feel,overall,score
0,202522,Olde Cogitator,2199,Main Street Brewery,Pleasanton,CA,US,English Oatmeal Stout,Rotating,7.3,"Brewery, Bar, Eatery",No notes at this time.,No notes at this time.,f,,,,,,,,,
1,82352,Konrads Stout Russian Imperial Stout,18604,Lervig Aktiebryggeri,Stavanger,Unknown,NO,Russian Imperial Stout,Rotating,10.4,Brewery,No notes at this time.,No notes at this time.,f,FZR23,2018-05-28,,4.5,4.5,4.5,4.5,4.5,4.5
2,82352,Konrads Stout Russian Imperial Stout,18604,Lervig Aktiebryggeri,Stavanger,Unknown,NO,Russian Imperial Stout,Rotating,10.4,Brewery,No notes at this time.,No notes at this time.,f,samEBC,2018-02-21,,4.25,4.0,4.0,4.25,4.25,4.09
3,82352,Konrads Stout Russian Imperial Stout,18604,Lervig Aktiebryggeri,Stavanger,Unknown,NO,Russian Imperial Stout,Rotating,10.4,Brewery,No notes at this time.,No notes at this time.,f,iftcoach,2017-10-13,,4.0,4.0,4.0,4.0,4.0,4.0
4,82352,Konrads Stout Russian Imperial Stout,18604,Lervig Aktiebryggeri,Stavanger,Unknown,NO,Russian Imperial Stout,Rotating,10.4,Brewery,No notes at this time.,No notes at this time.,f,parris,2017-09-29,,4.0,4.0,3.5,3.5,3.75,3.7


In [None]:
# review_df의 look, smell, taste, feel, overall에 결측치 존재, mean으로 대체
# 그 외의 결측치 존재하는 데이터들은 각각에 알맞은 값으로 대체

review_imputer = SimpleImputer(strategy = 'mean')
for col in ['look', 'smell', 'taste', 'feel', 'overall', 'score']:
    full_data[col] = review_imputer.fit_transform(full_data[[col]])

full_data['username'] = full_data['username'].fillna('Anonymous')
full_data['date'] = full_data['date'].fillna('Unknown')
full_data['text'] = full_data['text'].fillna('No review text')

In [27]:
full_data.isna().sum()

beer_id          0
beer_name        0
brewery_id       0
brewery_name     0
city             0
state            0
country          0
style            0
availability     0
abv              0
types            0
beer_notes       0
brewery_notes    0
retired          0
username         0
date             0
text             0
look             0
smell            0
taste            0
feel             0
overall          0
score            0
is_like          0
dtype: int64

### 3. Feature Engineering

In [32]:
# CF용 raw target을 생성
y_target_raw = full_data['score']
y_target_binary = (full_data['score'] >= 4.0).astype(int)

full_data['is_like'] = y_target_binary

# User-Specific feature 생성

user_stats = full_data.groupby('username')['score'].agg(
    user_avg_rating = 'mean',
    user_rating_counts = 'count'
).reset_index()

full_data = pd.merge(full_data, user_stats, on = 'username', how = 'left')

# Item-Specific feature 생성

beer_stats = full_data.groupby('beer_id')['score'].agg(
    beer_avg_rating = 'mean',
    beer_rating_counts = 'count'
).reset_index()

full_data = pd.merge(full_data, beer_stats, on = 'beer_id', how = 'left')

# training에 포함되면 안되는 feature 추출
target_leakage_features = ['score', 'look', 'smell', 'taste', 'feel', 'overall']


### 3.1 각 모델별 데이터 셋 생성

In [33]:
# Collaborative Filtering용 데이터 셋 생성

df_for_cf = full_data[['username', 'beer_id', 'score']]

df_for_cf.columns = ['user', 'item', 'rating']
print(f"Data for Collaborative Filtering: {df_for_cf.head()}")


# Content-Based Filtering용 데이터 셋 생성

unique_beer_df = df.drop_duplicates(subset = ['beer_id']).set_index('beer_id')

numeric_features = ['abv']
numeric_transformer = Pipeline(steps = [('scaler', StandardScaler())])

categorical_features = ['style']
categorical_transformer = Pipeline(steps = [('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output = False))])

cb_preprocessor = ColumnTransformer(
    transformers = [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder = 'drop'
)

cb_preprocessor.fit(unique_beer_df)

# Content-Based Filtering용 아이템 특징 행렬 생성
beer_feature_vectors_array = cb_preprocessor.transform(unique_beer_df)
cb_feature_names = cb_preprocessor.get_feature_names_out()
beer_feature_vectors_dict = {
    beer_id: vector 
    for beer_id, vector in zip(unique_beer_df.index, beer_feature_vectors_array)
}

print(f"Content-Based Filtering Beer Feature Vectors: {beer_feature_vectors_array.shape[0]}, features: {beer_feature_vectors_array.shape[1]}")

# Hybrid Filtering용 데이터 셋 생성

meta_original_features = [
    'user_avg_rating',
    'user_rating_counts',
    'beer_avg_rating',
    'beer_rating_counts',
    'abv',
    'style'
]

cols_to_drop = [
    'is_like',
    *target_leakage_features,
    'beer_id',
    'username',
    'beer_name',
    'brewery_id',
    'brewery_name',
    'date', 'text', 'city', 'state', 'country', 'types', 'beer_notes', 'brewery_notes', 'availability', 'retired'
]

X_meta_base_raw = full_data[meta_original_features].copy()
y_meta = full_data['is_like'].copy()

X_meta_base = pd.get_dummies(X_meta_base_raw, columns = ['style'], prefix = 'style')

print(X_meta_base.head())
print(y_meta.head())

Data for Collaborative Filtering:         user    item    rating
0  Anonymous  202522  3.889811
1      FZR23   82352  4.500000
2     samEBC   82352  4.090000
3   iftcoach   82352  4.000000
4     parris   82352  3.700000
Content-Based Filtering Beer Feature Vectors: 358873, features: 114
   user_avg_rating  user_rating_counts  beer_avg_rating  beer_rating_counts  \
0         3.881723               53163         3.889811                   1   
1         3.435173                 375         3.974848                  99   
2         3.883229                3029         3.974848                  99   
3         3.685149                 505         3.974848                  99   
4         4.365046                 864         3.974848                  99   

    abv  style_American Adjunct Lager  style_American Amber / Red Ale  \
0   7.3                         False                           False   
1  10.4                         False                           False   
2  10.4           