1. Load Dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.mixture import GaussianMixture
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix
from joblib import dump

In [2]:
# Read the raw beer-review records from the project's raw-data folder
raw_reviews_path = '../data/raw/beer_reviews.csv'
beer_df = pd.read_csv(raw_reviews_path)

2. Explore Data

In [3]:
# Show every column when rendering frames, then preview the first ten reviews
pd.options.display.max_columns = None
beer_df.head(n=10)

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986
1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213
2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215
3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969
4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883
5,1075,Caldera Brewing Company,1325524659,3.0,3.5,3.5,oline73,Herbed / Spiced Beer,3.0,3.5,Caldera Ginger Beer,4.7,52159
6,1075,Caldera Brewing Company,1318991115,3.5,3.5,3.5,Reidrover,Herbed / Spiced Beer,4.0,4.0,Caldera Ginger Beer,4.7,52159
7,1075,Caldera Brewing Company,1306276018,3.0,2.5,3.5,alpinebryant,Herbed / Spiced Beer,2.0,3.5,Caldera Ginger Beer,4.7,52159
8,1075,Caldera Brewing Company,1290454503,4.0,3.0,3.5,LordAdmNelson,Herbed / Spiced Beer,3.5,4.0,Caldera Ginger Beer,4.7,52159
9,1075,Caldera Brewing Company,1285632924,4.5,3.5,5.0,augustgarage,Herbed / Spiced Beer,4.0,4.0,Caldera Ginger Beer,4.7,52159


In [4]:
# Preview the last ten reviews of the raw frame
beer_df.tail(10)

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
1586604,14359,The Defiant Brewing Company,1288890206,4.0,4.5,4.5,njmoons,Pumpkin Ale,3.5,3.5,The Horseman's Ale,5.2,33061
1586605,14359,The Defiant Brewing Company,1163291143,5.0,5.0,5.0,NyackNicky,Pumpkin Ale,5.0,5.0,The Horseman's Ale,5.2,33061
1586606,14359,The Defiant Brewing Company,1162871808,5.0,4.5,4.0,blitheringidiot,Pumpkin Ale,5.0,5.0,The Horseman's Ale,5.2,33061
1586607,14359,The Defiant Brewing Company,1162865640,5.0,5.0,4.5,PopeDX,Pumpkin Ale,5.0,4.5,The Horseman's Ale,5.2,33061
1586608,14359,The Defiant Brewing Company,1162685856,3.5,4.0,4.0,treehugger02010,Pumpkin Ale,3.5,3.0,The Horseman's Ale,5.2,33061
1586609,14359,The Defiant Brewing Company,1162684892,5.0,4.0,3.5,maddogruss,Pumpkin Ale,4.0,4.0,The Horseman's Ale,5.2,33061
1586610,14359,The Defiant Brewing Company,1161048566,4.0,5.0,2.5,yelterdow,Pumpkin Ale,2.0,4.0,The Horseman's Ale,5.2,33061
1586611,14359,The Defiant Brewing Company,1160702513,4.5,3.5,3.0,TongoRad,Pumpkin Ale,3.5,4.0,The Horseman's Ale,5.2,33061
1586612,14359,The Defiant Brewing Company,1160023044,4.0,4.5,4.5,dherling,Pumpkin Ale,4.5,4.5,The Horseman's Ale,5.2,33061
1586613,14359,The Defiant Brewing Company,1160005319,5.0,4.5,4.5,cbl2,Pumpkin Ale,4.5,4.5,The Horseman's Ale,5.2,33061


In [5]:
# Number of reviews recorded for each brewery name in the raw data
beer_df['brewery_name'].value_counts()

Boston Beer Company (Samuel Adams)    39444
Dogfish Head Brewery                  33839
Stone Brewing Co.                     33066
Sierra Nevada Brewing Co.             28751
Bell's Brewery, Inc.                  25191
                                      ...  
Brauerei Stolz GmbH & Co. KG              1
Hausbrauerei Düll                         1
Browar Grybów                             1
Staro&#269;eský Pivovárek Dobruka        1
Spire Brewery                             1
Name: brewery_name, Length: 5742, dtype: int64

In [6]:
# Dimensions of the raw frame as (rows, columns)
beer_df.shape

(1586614, 13)

In [7]:
# Summary of the raw frame: column dtypes, non-null counts and memory usage
beer_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1586614 entries, 0 to 1586613
Data columns (total 13 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   brewery_id          1586614 non-null  int64  
 1   brewery_name        1586599 non-null  object 
 2   review_time         1586614 non-null  int64  
 3   review_overall      1586614 non-null  float64
 4   review_aroma        1586614 non-null  float64
 5   review_appearance   1586614 non-null  float64
 6   review_profilename  1586266 non-null  object 
 7   beer_style          1586614 non-null  object 
 8   review_palate       1586614 non-null  float64
 9   review_taste        1586614 non-null  float64
 10  beer_name           1586614 non-null  object 
 11  beer_abv            1518829 non-null  float64
 12  beer_beerid         1586614 non-null  int64  
dtypes: float64(6), int64(3), object(4)
memory usage: 157.4+ MB


In [8]:
# Descriptive statistics (count, mean, std, quartiles, min/max) of the numeric columns
beer_df.describe()

Unnamed: 0,brewery_id,review_time,review_overall,review_aroma,review_appearance,review_palate,review_taste,beer_abv,beer_beerid
count,1586614.0,1586614.0,1586614.0,1586614.0,1586614.0,1586614.0,1586614.0,1518829.0,1586614.0
mean,3130.099,1224089000.0,3.815581,3.735636,3.841642,3.743701,3.79286,7.042387,21712.79
std,5578.104,76544270.0,0.7206219,0.6976167,0.6160928,0.6822184,0.7319696,2.322526,21818.34
min,1.0,840672000.0,0.0,1.0,0.0,1.0,1.0,0.01,3.0
25%,143.0,1173224000.0,3.5,3.5,3.5,3.5,3.5,5.2,1717.0
50%,429.0,1239203000.0,4.0,4.0,4.0,4.0,4.0,6.5,13906.0
75%,2372.0,1288568000.0,4.5,4.0,4.0,4.0,4.5,8.5,39441.0
max,28003.0,1326285000.0,5.0,5.0,5.0,5.0,5.0,57.7,77317.0


3. Prepare Data

In [9]:
# Work on an independent (deep) copy so the raw frame stays untouched during preparation
beer_df_cleaned = beer_df.copy(deep=True)

In [10]:
# Drop the columns that are not used for modelling, keeping brewery_name, the
# four review sub-scores and beer_name.
# The result is rebuilt from the untouched `beer_df` and re-assigned instead of
# being dropped with `inplace=True`: an in-place drop raises KeyError when the
# cell is run a second time because the columns are already gone. Re-running
# this cell resets `beer_df_cleaned` to the freshly column-pruned frame.
beer_df_cleaned = beer_df.drop(
    columns=[
        'brewery_id',
        'review_time',
        'review_overall',
        'review_profilename',
        'beer_style',
        'beer_abv',
        'beer_beerid',
    ]
)

In [11]:
# Preview the first rows of the reduced frame
beer_df_cleaned.head()

Unnamed: 0,brewery_name,review_aroma,review_appearance,review_palate,review_taste,beer_name
0,Vecchio Birraio,2.0,2.5,1.5,1.5,Sausa Weizen
1,Vecchio Birraio,2.5,3.0,3.0,3.0,Red Moon
2,Vecchio Birraio,2.5,3.0,3.0,3.0,Black Horse Black Beer
3,Vecchio Birraio,3.0,3.5,2.5,3.0,Sausa Pils
4,Caldera Brewing Company,4.5,4.0,4.0,4.5,Cauldron DIPA


In [12]:
# Count the missing values in each remaining column
beer_df_cleaned.isna().sum()

brewery_name         15
review_aroma          0
review_appearance     0
review_palate         0
review_taste          0
beer_name             0
dtype: int64

In [13]:
# Inspect the rows whose brewery_name is missing
beer_df_cleaned[beer_df_cleaned['brewery_name'].isna()]

Unnamed: 0,brewery_name,review_aroma,review_appearance,review_palate,review_taste,beer_name
651565,,2.5,2.5,1.5,1.5,Engel Tyrolian Bräu WRONG BREWERY SEE SCHWABIS...
659293,,4.5,3.5,4.0,3.5,Engel Bock Dunkel WRONG BREWERY SEE CRAILSHEIMER
659299,,3.0,3.0,4.0,4.0,Engel Gold WRONG BREWERY SEE CRAILSHEIMER
659300,,4.0,3.5,3.5,3.0,Engel Landbier WRONG BREWERY SEE CRAILSHEIMER
659301,,4.0,4.0,4.0,3.5,Engel Keller Hell WRONG BREWERY SEE CRAILSHEIMER
659302,,4.0,3.0,3.0,3.5,Engel Aloisius - WRONG BREWERY SEE CRAILSHEIMER
659303,,3.0,3.0,2.0,3.0,Engel Keller Dunkel WRONG BREWERY SEE CRAILSH...
659304,,4.0,4.0,4.0,4.5,Engel Keller Dunkel WRONG BREWERY SEE CRAILSH...
659305,,3.5,4.0,4.0,4.0,Engel Keller Dunkel WRONG BREWERY SEE CRAILSH...
1391043,,3.5,3.5,4.0,3.5,Hard Hat American Beer


In [14]:
# Label reviews that have no brewery name as 'Unknown'
brewery_missing = beer_df_cleaned['brewery_name'].isna()
beer_df_cleaned.loc[brewery_missing, 'brewery_name'] = 'Unknown'

In [15]:
# Re-count missing values per column after filling brewery_name
beer_df_cleaned.isna().sum()

brewery_name         0
review_aroma         0
review_appearance    0
review_palate        0
review_taste         0
beer_name            0
dtype: int64

In [16]:
# Reviews per brewery after the fill ('Unknown' now counts as a brewery name)
beer_df_cleaned['brewery_name'].value_counts()

Boston Beer Company (Samuel Adams)      39444
Dogfish Head Brewery                    33839
Stone Brewing Co.                       33066
Sierra Nevada Brewing Co.               28751
Bell's Brewery, Inc.                    25191
                                        ...  
MonkSouth (Thirsty Monk South)              1
Stateline Brewery                           1
Northumberland Brewery Limited              1
Harmon Brewing And Tap Room                 1
Konishi Brewing Co. (Shirayuki beer)        1
Name: brewery_name, Length: 5743, dtype: int64

In [17]:
# Count reviews per brewery and collect the names that appear fewer than 10 times
brewery_name_counts = beer_df_cleaned['brewery_name'].value_counts()
breweries_to_filter = brewery_name_counts.loc[brewery_name_counts.lt(10)].index

# Collapse those rarely reviewed breweries into a single 'Others' category
is_rare_brewery = beer_df_cleaned['brewery_name'].isin(breweries_to_filter)
beer_df_cleaned['brewery_name'] = beer_df_cleaned['brewery_name'].mask(is_rare_brewery, 'Others')

In [18]:
# Reviews per brewery after the rare breweries were grouped under 'Others'
beer_df_cleaned['brewery_name'].value_counts()

Boston Beer Company (Samuel Adams)    39444
Dogfish Head Brewery                  33839
Stone Brewing Co.                     33066
Sierra Nevada Brewing Co.             28751
Bell's Brewery, Inc.                  25191
                                      ...  
Slip Point Brewing                       10
Roscoe's Hop House                       10
Kairinmaru Beer                          10
The Cambridge House                      10
Duckstein Brewery                        10
Name: brewery_name, Length: 3239, dtype: int64

In [19]:
# Persist the cleaned frame (without the index column) to the interim data folder
cleaned_csv_path = '../data/interim/beer_reviews_cleaned.csv'
beer_df_cleaned.to_csv(cleaned_csv_path, index=False)

In [20]:
# Separate the target (beer_name) from the predictor columns
y = beer_df_cleaned['beer_name']
X = beer_df_cleaned.drop(columns=['beer_name'])

# Hold out 20% of the rows for testing (80-20 split) with a fixed seed.
# No `stratify` argument is passed, so the split is NOT stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [21]:
# Report the shape of each piece of the train/test split
split_shapes = [
    ("Shape of X_train:", X_train.shape),
    ("Shape of X_test:", X_test.shape),
    ("Shape of y_train:", y_train.shape),
    ("Shape of y_test:", y_test.shape),
]
for split_label, split_shape in split_shapes:
    print(split_label, split_shape)

Shape of X_train: (1269291, 5)
Shape of X_test: (317323, 5)
Shape of y_train: (1269291,)
Shape of y_test: (317323,)


In [22]:
# Numerical features: the four review sub-scores
num_features = [
    'review_aroma',
    'review_appearance',
    'review_palate',
    'review_taste',
]

In [23]:
# Categorical features: only the brewery name
cat_features = [
    'brewery_name',
]

4. Build Pipeline

In [24]:
# Numeric branch: a one-step Pipeline whose 'scaler' step is a StandardScaler
scaling_step = ('scaler', StandardScaler())
num_transformer = Pipeline(steps=[scaling_step])

In [25]:
# Create a Pipeline called cat_transformer with one step that one-hot encodes
# the categorical features, dropping the first level of each feature.
# The encoder is deliberately left at its default sparse output: brewery_name
# still has about 3.2k levels after the rare-brewery grouping, so a dense
# one-hot matrix for the ~1.27M training rows would need roughly 33 GB as
# float64. Omitting `sparse=False` also avoids a keyword that newer
# scikit-learn releases renamed to `sparse_output`.
cat_transformer = Pipeline(
    steps=[
        ('one_hot_encoder', OneHotEncoder(drop='first'))
    ]
)

In [26]:
# Column-wise preprocessing: num_transformer is applied to num_features and
# cat_transformer to cat_features
preprocessor = ColumnTransformer(
    [
        ('num_features', num_transformer, num_features),
        ('cat_features', cat_transformer, cat_features),
    ]
)

In [27]:
# K-nearest-neighbours classifier, constructed with no arguments (library defaults)
knn_classifier = KNeighborsClassifier()

In [30]:
# Create a Pipeline called knn_pipeline with 2 steps: the preprocessor followed
# by the KNeighborsClassifier instance.
# The second step is registered under the name 'clustering' even though it
# holds a classifier; the name is left unchanged because it is the key used to
# address the step (e.g. in named_steps or parameter grids).
knn_pipeline = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('clustering', knn_classifier),
    ]
)

In [None]:
# Fit model: fits the preprocessor on X_train and then the KNN classifier on the
# transformed features, with beer_name (y_train) as the target.
# NOTE(review): this cell has no execution count in the saved notebook — confirm
# it completes on the full training set.
knn_pipeline.fit(X_train, y_train)



In [None]:
# Prediction: predict a beer_name for every row of the held-out test set
y_pred = knn_pipeline.predict(X_test)

In [None]:
# Evaluate: fraction of test rows whose predicted beer_name equals the true one,
# printed to two decimal places
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")