In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pickle

import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split,  GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix

## 1. Load Data

In [3]:
# Parameters: target column plus the categorical / numerical predictor columns
response = 'beer_style'

cat_features = ['brewery_name']
num_features = [
    'review_aroma',
    'review_appearance',
    'review_palate',
    'review_taste',
    'beer_abv',
]

# Full predictor list, categorical columns first
features = [*cat_features, *num_features]

In [4]:
# Load actual data, parsing only the columns used for modelling so the
# unused columns of this ~1.6M-row file are never held in memory
selected_columns = features + [response]
beer_df = pd.read_csv('../data/beer_reviews.csv', usecols=selected_columns)

# `usecols` keeps the file's own column order, so re-select to get the
# features first and the response last
beer_df = beer_df[selected_columns]

In [5]:
beer_df.columns

Index(['brewery_name', 'review_aroma', 'review_appearance', 'review_palate',
       'review_taste', 'beer_abv', 'beer_style'],
      dtype='object')

## 2. EDA 

Examine instances

In [6]:
beer_df.head()

Unnamed: 0,brewery_name,review_aroma,review_appearance,review_palate,review_taste,beer_abv,beer_style
0,Vecchio Birraio,2.0,2.5,1.5,1.5,5.0,Hefeweizen
1,Vecchio Birraio,2.5,3.0,3.0,3.0,6.2,English Strong Ale
2,Vecchio Birraio,2.5,3.0,3.0,3.0,6.5,Foreign / Export Stout
3,Vecchio Birraio,3.0,3.5,2.5,3.0,5.0,German Pilsener
4,Caldera Brewing Company,4.5,4.0,4.0,4.5,7.7,American Double / Imperial IPA


Check unique values for each feature

In [7]:
beer_df.nunique()

brewery_name         5742
review_aroma            9
review_appearance      10
review_palate           9
review_taste            9
beer_abv              530
beer_style            104
dtype: int64

Check shape of dataset

In [8]:
beer_df.shape

(1586614, 7)

Check null values

In [9]:
beer_df.isnull().sum()

brewery_name            15
review_aroma             0
review_appearance        0
review_palate            0
review_taste             0
beer_abv             67785
beer_style               0
dtype: int64

In [10]:
# Rows whose `beer_abv` value is missing
beer_df.loc[beer_df['beer_abv'].isna()]

Unnamed: 0,brewery_name,review_aroma,review_appearance,review_palate,review_taste,beer_abv,beer_style
273,Caldera Brewing Company,3.0,3.0,4.0,3.0,,American Stout
430,Moon River Brewing Company,4.0,4.5,3.5,3.5,,Scotch Ale / Wee Heavy
603,Moon River Brewing Company,3.5,4.0,3.5,3.5,,Scotch Ale / Wee Heavy
733,Caldera Brewing Company,4.0,4.0,4.0,4.0,,American IPA
798,Caldera Brewing Company,4.5,4.0,4.0,4.5,,American Double / Imperial Stout
...,...,...,...,...,...,...,...
1586568,The Defiant Brewing Company,3.5,4.0,4.0,4.0,,Bock
1586587,The Defiant Brewing Company,4.5,4.0,4.5,4.0,,Maibock / Helles Bock
1586596,The Defiant Brewing Company,3.0,5.0,4.0,3.5,,Belgian Strong Pale Ale
1586597,The Defiant Brewing Company,4.5,4.0,4.0,4.0,,Belgian Strong Pale Ale


Examine which classes (beer styles) would no longer be represented if instances with null `beer_abv` values are omitted. NOTE: The approach taken is to omit the instances with null `beer_abv`, since all of the classes are still represented afterwards. Something to explore later is using imputation techniques instead.

In [11]:
# Styles present in the full data but absent once null-`beer_abv` rows are excluded
all_styles = set(beer_df['beer_style'])
styles_with_abv = set(beer_df.loc[beer_df['beer_abv'].notna(), 'beer_style'])
not_in_fulldataset = all_styles.difference(styles_with_abv)

print(not_in_fulldataset)

set()


In [12]:
# Number of distinct response values among rows with a non-null `beer_abv`
beer_df.loc[beer_df['beer_abv'].notna(), response].nunique()

104

Check distribution of response among the instances with null `beer_abv`

In [13]:
# Response counts restricted to the rows whose `beer_abv` is missing
beer_df.loc[beer_df['beer_abv'].isna(), response].value_counts()

American Pale Ale (APA)            5388
American IPA                       4422
American Amber / Red Ale           4011
American Porter                    3827
Hefeweizen                         1980
                                   ... 
Eisbock                              16
Japanese Rice Lager                  13
Euro Strong Lager                     9
Bière de Champagne / Bière Brut       5
Happoshu                              4
Name: beer_style, Length: 103, dtype: int64

Examine basic statistics

In [14]:
beer_df.describe()

Unnamed: 0,review_aroma,review_appearance,review_palate,review_taste,beer_abv
count,1586614.0,1586614.0,1586614.0,1586614.0,1518829.0
mean,3.735636,3.841642,3.743701,3.79286,7.042387
std,0.6976167,0.6160928,0.6822184,0.7319696,2.322526
min,1.0,0.0,1.0,1.0,0.01
25%,3.5,3.5,3.5,3.5,5.2
50%,4.0,4.0,4.0,4.0,6.5
75%,4.0,4.0,4.0,4.5,8.5
max,5.0,5.0,5.0,5.0,57.7


In [15]:
# Correlate the numeric columns only: the frame also holds string columns
# (`brewery_name`, `beer_style`), which pandas >= 2.0 refuses to correlate and
# which older versions drop with a FutureWarning.
beer_df[num_features].corr()

  beer_df.corr()


Unnamed: 0,review_aroma,review_appearance,review_palate,review_taste,beer_abv
review_aroma,1.0,0.561029,0.616947,0.716776,0.33257
review_appearance,0.561029,1.0,0.566634,0.54698,0.263941
review_palate,0.616947,0.566634,1.0,0.734135,0.286711
review_taste,0.716776,0.54698,0.734135,1.0,0.290827
beer_abv,0.33257,0.263941,0.286711,0.290827,1.0


Examine number of instances per `brewery_name`

In [16]:
beer_df['brewery_name'].value_counts()

Boston Beer Company (Samuel Adams)    39444
Dogfish Head Brewery                  33839
Stone Brewing Co.                     33066
Sierra Nevada Brewing Co.             28751
Bell's Brewery, Inc.                  25191
                                      ...  
Brauerei Stolz GmbH & Co. KG              1
Hausbrauerei Düll                         1
Browar Grybów                             1
Staro&#269;eský Pivovárek Dobruka        1
Spire Brewery                             1
Name: brewery_name, Length: 5742, dtype: int64

In [17]:
# How many breweries appear in fewer than 5 rows
beer_df['brewery_name'].value_counts().lt(5).sum()

1718

NOTE: for now, consider `brewery_name` values with fewer than 5 instances to be categorized as `others`

## 3. Data Preprocessing 
Simple and straightforward -- just resolving the issue of null values in `beer_abv`. For the brewery name, null values and brewery names with fewer than 5 instances are changed to `unknown/others`. (Note: the filtering cell below actually applies a threshold of 10 instances, not 5.)

In [18]:
# Drop the instances with null values of `beer_abv`.
# `.copy()` makes the result an independent frame, so the in-place
# `.loc[...] = ...` edits in the following cells do not operate on a slice
# of the original data (pandas' SettingWithCopyWarning).
beer_df = beer_df.loc[beer_df['beer_abv'].notna()].copy()

In [19]:
# Breweries with a missing name are folded into the 'unknown/others' bucket
beer_df['brewery_name'] = beer_df['brewery_name'].fillna('unknown/others')

In [20]:
beer_df.isnull().sum()

brewery_name         0
review_aroma         0
review_appearance    0
review_palate        0
review_taste         0
beer_abv             0
beer_style           0
dtype: int64

In [21]:
# Minimum number of rows a brewery needs in order to keep its own category.
# NOTE(review): the notebook text mentions a cut-off of 5, but this cell uses 10 -- confirm which is intended.
min_brewery_reviews = 10

# Get the names of breweries with counts less than the threshold
brewery_name_counts = beer_df['brewery_name'].value_counts()
breweries_to_filter = brewery_name_counts[brewery_name_counts < min_brewery_reviews].index

# Replace 'brewery_name' values for the filtered breweries with 'unknown/others'
beer_df.loc[beer_df['brewery_name'].isin(breweries_to_filter), 'brewery_name'] = 'unknown/others'

In [22]:
# # Get the names of breweries with counts less than a threshold
# brewery_name_counts = beer_df['brewery_name'].value_counts()
# breweries_to_filter = brewery_name_counts[brewery_name_counts < 10].index

# # Replace 'brewery_name' values for the filtered breweries with 'others'
# beer_df = beer_df[~beer_df['brewery_name'].isin(breweries_to_filter)] 

In [23]:
beer_df['brewery_name'].value_counts()

Boston Beer Company (Samuel Adams)    38812
Dogfish Head Brewery                  33800
Stone Brewing Co.                     33022
Sierra Nevada Brewing Co.             28637
Bell's Brewery, Inc.                  24975
                                      ...  
Cervejaria Bodebrown Ltda.               10
Ostankinskiy Pivovarinniy Zavod          10
Blacksburg Brewing Company               10
Alina S.R.L.                             10
Solvang Brewing Company                  10
Name: brewery_name, Length: 2847, dtype: int64

In [24]:
# Check the range of values
beer_df.describe()

Unnamed: 0,review_aroma,review_appearance,review_palate,review_taste,beer_abv
count,1518829.0,1518829.0,1518829.0,1518829.0,1518829.0
mean,3.746163,3.850344,3.75368,3.804013,7.042387
std,0.695398,0.6143443,0.679385,0.7286724,2.322526
min,1.0,0.0,1.0,1.0,0.01
25%,3.5,3.5,3.5,3.5,5.2
50%,4.0,4.0,4.0,4.0,6.5
75%,4.0,4.0,4.0,4.5,8.5
max,5.0,5.0,5.0,5.0,57.7


Store the brewery names

In [36]:
# Collect the distinct brewery names (after the 'unknown/others' bucketing) in sorted order
sorted_brewery_names = sorted(beer_df['brewery_name'].unique())

# Serialise the list with pickle (imported in the notebook's import cell)
with open('../models/brewery_names.pkl', 'wb') as file:
    pickle.dump(sorted_brewery_names, file)

## 4. Modeling 

Modeling Parameters

In [None]:
# Metric GridSearchCV uses to rank hyperparameters.
# The response has 104 classes, so the plain 'f1' scorer (binary averaging)
# cannot score it; use an explicitly averaged multiclass variant instead.
# 'f1_macro' weights every class equally; 'f1_weighted' / 'f1_micro' are the alternatives.
scoring = 'f1_macro'

# Define search grid.
# Candidate values for a full search would be [1, 3, 5, 7, 9, 11]; that grid was
# immediately overwritten by the single-value grid below, which is the one in effect.
param_grid = {'classifier__n_neighbors': [1]}

# CV
cv = 5

# Set if gridsearch will be performed: NOTE: this should be False if Gridsearch will not be performed
gridsearch = False

#### 4.1 Split Data

In [None]:
# Split the data into training and testing sets
X = beer_df.drop(columns=[response])
y = beer_df[response]

# Stratify based on response so every class keeps the same share in the
# train and test partitions (the response is highly imbalanced)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

#### 4.2 Build Pipeline

In [None]:
n_classes = beer_df[response].nunique()

# Numeric features: standardise so every column contributes on the same scale
# to the kNN distance
num_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Categorical features: one-hot encode.
# - The output is left sparse (the encoder default): a dense one-hot matrix of
#   ~2.8k brewery columns over ~1.2M training rows does not fit in ordinary
#   memory, and the `sparse=` keyword used previously was removed in
#   scikit-learn 1.4.
# - handle_unknown='ignore' keeps predict() from raising on a brewery that was
#   not present in the fitted data (combined with `drop` this needs scikit-learn >= 1.0).
cat_transformer = Pipeline(steps=[
    ('one-hot-encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))
])

# Combine the transformers, routing each column group to its own transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num_cols', num_transformer, num_features),
        ('cat_cols', cat_transformer, cat_features),
    ]
)

# Instantiate a kNN Classifier (n_neighbors is set by the grid search)
knn_classifier = KNeighborsClassifier()

# Make a kNN pipeline: preprocessing followed by the classifier
knn_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', knn_classifier)
    ]
)

#### 4.3a Experiment through Gridsearch
NOTE: Encountered some problems due to highly imbalanced data -- attempt to perform this without CV in subsection `4.3b`.

In [None]:
if gridsearch:
    # Create the grid search over the kNN pipeline built in section 4.2
    # (the name `pipeline` does not exist at notebook level), using the
    # `cv` and `scoring` settings from the modeling-parameters cell
    grid_search = GridSearchCV(knn_pipeline, param_grid, cv=cv, scoring=scoring)

    # Fit the grid search to the training data
    grid_search.fit(X_train, y_train)

In [None]:
if gridsearch:
    # Pull the refitted best pipeline and its winning k out of the search
    best_pipeline = grid_search.best_estimator_
    best_k = grid_search.best_params_['classifier__n_neighbors']
    print(f"Best k: {best_k}")

#### 4.3b Experiment -- different k values

In [None]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report

def train_knn_model(k, X_train, X_test, y_train, y_test, num_features, cat_features):
    """Fit a scaled / one-hot-encoded kNN pipeline and print its test accuracy.

    Parameters
    ----------
    k : int
        Number of neighbours passed to ``KNeighborsClassifier``.
    X_train, X_test : pandas.DataFrame
        Feature frames; must contain the columns listed in ``num_features``
        and ``cat_features`` (columns are selected by name).
    y_train, y_test : array-like
        Labels matching ``X_train`` / ``X_test``.
    num_features : list of str
        Columns standardised with ``StandardScaler``.
    cat_features : list of str
        Columns one-hot encoded with ``OneHotEncoder``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The pipeline fitted on the training data.
    """
    # Create pipeline: per-column-group preprocessing followed by the classifier
    num_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    # handle_unknown='ignore' encodes a category unseen during fit as all
    # zeros instead of raising, so predict() works on breweries that are
    # missing from the training split
    cat_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', num_transformer, num_features),
            ('cat', cat_transformer, cat_features)
        ])

    knn_classifier = KNeighborsClassifier(n_neighbors=k)

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', knn_classifier)
    ])

    # Fit model
    pipeline.fit(X_train, y_train)

    # Prediction
    y_pred = pipeline.predict(X_test)

    # Evaluate
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Accuracy: {accuracy:.2f}")

    return pipeline


In [None]:
from joblib import dump

In [None]:
# Train and evaluate the pipeline with 3 neighbours
knn_pipeline_k3 = train_knn_model(
    k=3,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    num_features=num_features,
    cat_features=cat_features,
)

In [None]:
# Save the fitted k=3 pipeline with joblib.
# NOTE(review): this writes under '../model/' while the brewery-name pickle is written under '../models/' -- confirm which directory is intended.
dump(value=knn_pipeline_k3, filename='../model/knn-k3.joblib')

In [None]:
# Train and evaluate the pipeline with 5 neighbours
knn_pipeline_k5 = train_knn_model(
    k=5,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    num_features=num_features,
    cat_features=cat_features,
)

In [None]:
# Save the fitted k=5 pipeline with joblib.
# NOTE(review): this writes under '../model/' while the brewery-name pickle is written under '../models/' -- confirm which directory is intended.
dump(value=knn_pipeline_k5, filename='../model/knn-k5.joblib')

In [None]:
# Smoke-test the fitted pipeline on the first few training rows only:
# running kNN prediction over the whole training set is very slow and the
# cell would display the entire prediction array.
knn_pipeline_k3.predict(X_train.head())