In [None]:
import os


1. Load Dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.mixture import GaussianMixture
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix
from joblib import dump

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Read the raw beer reviews CSV into a DataFrame
beer_df = pd.read_csv(filepath_or_buffer='../data/raw/beer_reviews.csv')

2. Explore Data

In [None]:
# Lift the column display limit so wide frames are shown in full, then preview the first 10 rows
pd.options.display.max_columns = None
beer_df.head(n=10)

In [None]:
# Preview the last 10 rows of the raw data
beer_df.tail(n=10)

In [None]:
# Dimensions of the raw DataFrame as a (rows, columns) tuple
beer_df.shape

In [None]:
# Print pandas' structural summary of the raw DataFrame
beer_df.info()

In [None]:
# Descriptive statistics for the raw DataFrame (describe() defaults)
beer_df.describe()

3. Prepare Data

In [None]:
# Work on an independent copy so the raw frame stays untouched during cleaning
beer_df_cleaned = beer_df.copy(deep=True)

In [None]:
# Discard the columns this analysis does not use
beer_df_cleaned = beer_df_cleaned.drop(
    columns=[
        'brewery_id',
        'review_time',
        'review_overall',
        'review_profilename',
        'beer_style',
        'beer_beerid',
    ]
)

In [None]:
# Preview the frame after the column drop
beer_df_cleaned.head()

In [None]:
# Count missing values per column
beer_df_cleaned.isna().sum()

In [None]:
# Inspect the rows whose beer_abv is missing
beer_df_cleaned[beer_df_cleaned['beer_abv'].isna()]

In [None]:
# Remove the rows that have no beer_abv value
beer_df_cleaned = beer_df_cleaned.dropna(subset=['beer_abv'])

In [None]:
# Inspect the rows whose brewery_name is missing
beer_df_cleaned[beer_df_cleaned['brewery_name'].isna()]

In [None]:
# Replace missing brewery names with the catch-all label 'Others'
beer_df_cleaned['brewery_name'] = beer_df_cleaned['brewery_name'].fillna('Others')

In [None]:
# Re-count missing values per column after the dropna/fillna steps
beer_df_cleaned.isna().sum()

In [None]:
# Number of rows per brewery name
beer_df_cleaned['brewery_name'].value_counts()

In [None]:
# Tally rows per brewery and collect the names that occur fewer than 10 times
brewery_name_counts = beer_df_cleaned['brewery_name'].value_counts()
breweries_to_filter = brewery_name_counts[brewery_name_counts.lt(10)].index

# Fold those infrequent breweries into the shared 'Others' label
beer_df_cleaned['brewery_name'] = beer_df_cleaned['brewery_name'].mask(
    beer_df_cleaned['brewery_name'].isin(breweries_to_filter), 'Others'
)

In [None]:
# Rows per brewery name after rare breweries were relabelled as 'Others'
beer_df_cleaned['brewery_name'].value_counts()

In [None]:
# (rows, columns) before duplicate removal
beer_df_cleaned.shape

In [None]:
# Number of rows that exactly repeat an earlier row
beer_df_cleaned.duplicated().sum()

In [None]:
# Keep only the first occurrence of each fully identical row
beer_df_cleaned = beer_df_cleaned.drop_duplicates()

In [None]:
# (rows, columns) after duplicate removal
beer_df_cleaned.shape

In [None]:
# Persist the cleaned frame to the interim data folder, without the index column
beer_df_cleaned.to_csv(
    path_or_buf='../data/interim/beer_reviews_cleaned.csv',
    index=False,
)

In [None]:
# Separate the target (beer_name) from the feature columns
y = beer_df_cleaned['beer_name']
X = beer_df_cleaned.drop(columns=['beer_name'])

# Random 80/20 train/test split with a fixed seed; no stratification is applied
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Report the dimensions of each split as a sanity check
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of y_test: {y_test.shape}")

In [None]:
# Numerical feature columns (these are routed to the scaling branch of the preprocessor)
num_features = [
    'review_aroma',
    'review_appearance',
    'review_palate',
    'review_taste',
    'beer_abv',
]

In [None]:
# Categorical feature columns (these are routed to the one-hot encoding branch of the preprocessor)
cat_features = ['brewery_name']

4. Build Pipeline

In [None]:
# Numerical branch: a single-step Pipeline that applies StandardScaler
num_transformer = Pipeline(steps=[('scaler', StandardScaler())])

In [None]:
# Categorical branch: a single-step Pipeline that one-hot encodes with dense
# output and drops the first level of each feature.
#
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name, so try the current keyword first and fall back
# to the legacy one on older releases (an unknown keyword raises TypeError).
# NOTE(review): no handle_unknown is set, so a brewery_name present only in the
# test split would raise at transform time — confirm this is acceptable.
try:
    _one_hot_encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    _one_hot_encoder = OneHotEncoder(sparse=False, drop='first')

cat_transformer = Pipeline(
    steps=[
        ('one_hot_encoder', _one_hot_encoder)
    ]
)

In [None]:
# Route each column group to its branch: num_transformer handles num_features,
# cat_transformer handles cat_features
preprocessor = ColumnTransformer(
    [
        ('num_features', num_transformer, num_features),
        ('cat_features', cat_transformer, cat_features),
    ]
)

In [None]:
# k-nearest-neighbours classifier with k=3; n_jobs=-1 requests all available cores
knn_classifier = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)

In [103]:
# Chain the preprocessor and the k-NN classifier into a single estimator.
# The second step keeps its existing key 'clustering' even though it holds a classifier.
knn_pipeline = Pipeline(
    steps=[('preprocessor', preprocessor), ('clustering', knn_classifier)]
)

In [104]:
# Fit the preprocessing steps and the k-NN classifier on the training split
knn_pipeline.fit(X_train, y_train)



In [105]:
# Predict a beer_name for every row of the held-out test split
y_pred = knn_pipeline.predict(X_test)

In [None]:
# Score the predictions: share of test rows whose beer_name was predicted exactly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.27
