In [None]:
__author__ = "Alina Molnar"
__copyright__ = "Copyright (C) 2020-2021 Alina Molnar"
__license__ = "CC BY-NC"
__version__ = "1.0"

# STEP 1. IMPORT LIBRARIES

In [None]:
import glob
import os
from math import ceil
from pathlib import PureWindowsPath

import h2o
import IPython
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from h2o.estimators import (H2OGradientBoostingEstimator,
                            H2ORandomForestEstimator)
from IPython.display import display
from scipy.stats import spearmanr

## Settings

In [None]:
pd.options.display.max_rows = 999 
pd.options.display.max_columns = 999
sns.set_palette("colorblind")

print(IPython.sys_info())

# STEP 2. DATA UNDERSTANDING. CLEAN, TRANSFORM, PREPROCESS DATA

## 2.1 Collect Initial Data

In [None]:
# Read beer file.
# Use the raw-content address: the ".../blob/..." address serves GitHub's HTML viewer page,
# which pandas cannot parse as an Excel workbook.
beer_all = pd.read_excel("https://raw.githubusercontent.com/alina-molnar/Beer-Recommendation-Project-Proof-of-Concept/main/beer_input/beer_241.xlsx", sheet_name="Sheet1")

## 2.2 Describe Data.

In [None]:
# Write function that takes any dataframe and displays basic details about it.
# Use display() because it automatically adds an empty line between outputs, instead of print() which doesn't.

def inspect_function(df):
    """Display dataframe properties.

    Shows, in order: shape, first rows, column dtypes with non-null counts,
    number of missing values per column, and summary statistics rounded to one decimal.

    Args:
    df (pandas.DataFrame): Dataframe to inspect.

    Returns:
    None. Everything is rendered with display(), except the df.info() report which prints itself.
    """

    display(df.shape)
    display(df.head())
    # df.info() prints its report and returns None, so wrapping it in display()
    # only added a stray "None" to the output.
    df.info()
    display(df.isna().sum())
    display(df.describe().round(1))
    
inspect_function(beer_all)
# Rating is between -1 and 11.

In [None]:
# Standardize appearance. Convert column labels to lowercase.
beer_all.columns = beer_all.columns.str.lower()

# Convert cell values to lowercase if they are strings; every other value is left untouched.
# Series.map applied column by column replaces DataFrame.applymap, which is deprecated since pandas 2.1.
beer_all = beer_all.apply(lambda column: column.map(lambda value: value.lower() if isinstance(value, str) else value))

# Convert Name column from object to string.
beer_all["name"] = beer_all["name"].astype("string")

# Cut alcohol content from end of name and store as separate column.
# The last whitespace-separated token of each name is taken as the ABV.
beer_all["abv"] = [name.rsplit(maxsplit=1)[-1] for name in beer_all["name"]]

# Convert alcohol content to float.
beer_all["abv"] = beer_all["abv"].astype(float)

# Convert object types to category, except Split column.
beer_all[["method", "style", "flavor", "fermentation"]] = beer_all[["method", "style", "flavor", "fermentation"]].astype("category")

In [None]:
# Validation of uniqueness in beer names. Check for duplicates, remove if found.
beer_all.drop_duplicates(subset="name", keep="last", inplace=True)

# Description of numeric variables after standardized appearance and after removal of duplicates.
inspect_function(beer_all)
# At the beginning there were 241 rows and 8 columns, now there are 240 rows and 9 columns.

# Percentage of beer with ratings lower than 5.
under_5_rating = beer_all["rating"] < 5
under_5_rating_percentage = (len(beer_all[under_5_rating])/len(beer_all["rating"]))*100
print(f"Currently {under_5_rating_percentage:.1f}% of total beers are discarded.")

In [None]:
# Description of categorical variables.

def describe_categorical_columns(df, numeric_column):
    """Print summary statistics of a numeric column for each categorical column's subgroups.

    Args:
    df (pandas.DataFrame): Dataframe containing numerical and categorical columns.
    numeric_column (str): Name of the column containing numerical data.

    Returns:
    None. For every column of dtype category, prints one table with the
    count, mean, std, min, 25%, 50%, 75% and max of numeric_column per subgroup,
    rounded to one decimal.
    """
    # Only columns typed as category are used as grouping variables.
    category_columns = list(df.select_dtypes(include=["category"]).columns)
    for grouping_column in category_columns:
        per_subgroup = df.groupby([grouping_column])[numeric_column]
        print(per_subgroup.describe().round(1))

describe_categorical_columns(beer_all, "rating")

In [None]:
# Description of Country column.
country_unique = len(set(beer_all["country"])) 
print(f"There are {country_unique} unique countries.")

## 2.3 Explore Data


### Hypothesis 1: There might be a linear relationship between ratings and alcohol content.

In [None]:
# Plot alcohol content vs. rating by subgroup to check for hidden patterns.

def scatter_sns(df, x_numeric_column, y_numeric_column):
    """Seaborn scatter type subplots of two numeric variables as x and y, grouped by categorical columns.

    Args:
    df (pandas.DataFrame): Dataframe containing numerical and categorical columns.
    x_numeric_column (str): Name of numerical column to be plotted on x-axis.
    y_numeric_column (str): Name of numerical column to be plotted on y-axis.

    Returns:
    None. Shows one figure with a scatter subplot per column of dtype category, used as hue.
    Figure title and axis labels are built from the names of the x and y columns.
    """

    # Store in a list the columns containing categorical data:
    list_categoricals = df.select_dtypes(include=["category"]).columns.tolist()

    # Set the number of columns to 3 because it fits most screens:
    number_of_cols = 3
    # Calculate the number of rows needed to hold one subplot per categorical column:
    number_of_rows = ceil(len(list_categoricals)/number_of_cols)

    # Plot figure and set title:
    fig = plt.figure()
    fig.suptitle(f"relationship between {x_numeric_column} and {y_numeric_column} by each categorical feature".title())

    # Subplot positions start at 1, hence enumerate(..., start=1):
    for position, elem in enumerate(list_categoricals, start=1):
        ax = fig.add_subplot(number_of_rows, number_of_cols, position)
        # Draw on this subplot explicitly instead of relying on pyplot's "current axes" state:
        sns.scatterplot(x=x_numeric_column, y=y_numeric_column, hue=elem, data=df, s=15, ax=ax)
        ax.set_title(elem.title(), fontsize=12, verticalalignment="bottom", y=0.95)
        ax.set(xlabel=x_numeric_column.capitalize(), ylabel=y_numeric_column.capitalize())
        ax.legend(fontsize=5, loc="best")
        # In VS Code the legend is upper left in minimized window and in best location when maximized.
    plt.show()

scatter_sns(beer_all, "abv", "rating")

# Result 1: The scatterplot shows no linear relationship and no pattern between ratings and ABV.
# However, subgroup of lemon flavor with zero or low ABV has higher ratings compared to other groups.

### Hypothesis 2: Some subgroups might have a low number of observations and lead to overfitting the machine learning model.

In [None]:
# Create column to store status of occurrences.
beer_all["occurrence"] = np.nan

# Count occurrences in subgroups of categorical data and return df with subgroups below threshold.
def select_too_few_categorical_observations(df, categorical_columns, occurrence_column, threshold_percentage):
    """Flag rows that belong to a categorical subgroup with too few observations.

    A subgroup is too small when its count is lower than threshold_percentage of the total rows.
    Rows in such a subgroup are marked "too_few" in occurrence_column, all other rows "enough".
    Rows already marked "too_few" before the call keep that mark.

    Args:
    df (pandas.DataFrame): Dataframe containing categorical columns. It is modified in place.
    categorical_columns (list): List of categorical columns.
    occurrence_column (str): Name of column that stores the status; created if it does not exist.
    threshold_percentage (int, float): Percentage of minimum observations from total.

    Returns:
    df (pandas.DataFrame): Selection from original dataframe.
    Rows contain subgroups with counted observations less than threshold.
    """

    # Calculate the number of rows needed to pass the threshold, and print result.
    threshold = len(df)*threshold_percentage/100
    print(f"The threshold is {threshold} observations or more.")

    # Make sure the status column exists and can hold text labels.
    # A column initialised with NaN is float typed, and recent pandas versions
    # refuse to write strings into it.
    if occurrence_column not in df.columns:
        df[occurrence_column] = None
    df[occurrence_column] = df[occurrence_column].astype(object)

    # Count occurrences of each subgroup and mark the rows of subgroups below the threshold:
    for elem in categorical_columns:
        for key, value in df[elem].value_counts().items():
            if value < threshold:
                df.loc[df[elem] == key, occurrence_column] = "too_few"
                print(f"Too few {key} {elem}.")

    # Every record not marked as too_few gets "enough". One vectorised assignment is
    # sufficient; repeating it for each row gave the same result at quadratic cost.
    df.loc[df[occurrence_column] != "too_few", occurrence_column] = "enough"

    # Select rows with too_few observations:
    return df.loc[df[occurrence_column] == "too_few"]

# Define list of categorical features to be checked and set a threshold of 5% from the total.
categoricals = ["method", "style", "flavor", "fermentation"]
too_few_subgroups = select_too_few_categorical_observations(beer_all, categoricals, "occurrence", 5)

# Result 2: Style and Flavor columns have subgroups below the 5% threshold of the total observations.

### Hypothesis 3: Flavor column might have observable variation between its subgroups.

In [None]:
g = sns.barplot(x="flavor", y="rating", data=beer_all)
# Dotted line marks the mean rating of the whole dataset.
g.axhline(y=beer_all["rating"].mean(), linestyle="dotted", color="black")
# The bars are grouped by flavor, so the title names Flavor (it used to say Style).
g.set(xlabel="Flavor", ylabel="Rating", title="Average Rating Of Flavor Subgroups")
g.set_ylim([0, 10])
plt.show()

# Result 3: Flavor subgroups have observable variation between their average.
# The errorbar is bigger on herb subgroup.

### Hypothesis 4: Lemon beers’ high average might not be due to outliers.

In [None]:
g = sns.boxplot(x="flavor", y="rating", data=beer_all)
g.axhline(y=beer_all["rating"].median(), linestyle="dotted", color="black")
g.set(xlabel="Flavor", ylabel="Rating", title="Distribution Of Ratings In Flavor Subgroups")
g.set_ylim([0, 10])
plt.show()

# Result 4: The boxplot shows that the distribution of lemon beers is due to higher ratings overall compared to other subgroups.
# There's no median on the lemon box. Let's find out why.

### Let's investigate what's going on with the lineless box of lemon ratings.

In [None]:
# Check if lemon median equals one of the quantiles, remember median is 0.50 quantile.
lemon_ratings = beer_all[beer_all["flavor"] == "lemon"]["rating"]
print(lemon_ratings.quantile([0.25, 0.50, 0.75]))
# That's it, 0.50 and 0.75 quantile are equal, so that's why the unusual boxplot.
# The 0.50 and 0.75 quantiles both equal 8: about half of the lemon ratings are at or below 8, and so are about three quarters of them.

# Check distribution of lemon beer ratings to see if the quantile explanation matches the graph.
g = sns.kdeplot(x=lemon_ratings)
g.set(title="KDE of Lemon Beer Rating", xlabel="Rating")
g.set_xticks(range(-1, 12))
plt.show()
# There's a peak of observations where the rating is 8.
# More than half of the distribution is on the left side of the 8 mark, so 75% looks plausible.

# Conclusion: The ratings of lemon beers are so much higher than the rest, that their median overlaps with its 75th percentile.

## 2.4 Verify Data Quality

### Data coverage in numerical variables.

In [None]:
# Data coverage in ratings.
unique_ratings = np.unique(beer_all["rating"])
unique_ratings_list = list(unique_ratings)
print(f"Uniques values of ratings are {unique_ratings_list}")

# Data coverage in abv.
unique_abv = np.unique(beer_all["abv"])
unique_abv_list = list(unique_abv)
print(f"Uniques values of alcohol content are {unique_abv_list}")

### Data coverage in categorical variables.

In [None]:
def countplot_sns(df):
    """Count of observations in each subgroup of categorical columns, plotted as bars.

    Args:
    df (pandas.DataFrame): Dataframe containing categorical columns.

    Returns:
    None. Shows one figure with a seaborn countplot per column of dtype category.
    Bars in descending order of counts.
    Each subplot is titled with its column name; axis labels are left empty.
    """

    # Select columns with data type category:
    list_categoricals = df.select_dtypes(include=["category"]).columns.tolist()
    # Set the number of columns to 3 because it fits most screens:
    number_of_cols = 3
    # Calculate the number of rows needed to hold one subplot per categorical column:
    number_of_rows = ceil(len(list_categoricals)/number_of_cols)

    # Plot figure and set title:
    fig = plt.figure()
    fig.suptitle("count of observations in each subgroup of categorical columns".title())
    # Subplot positions start at 1, hence enumerate(..., start=1):
    for position, elem in enumerate(list_categoricals, start=1):
        # Define bar sorting criteria as descending counts:
        desc_order = df[elem].value_counts().index
        ax = fig.add_subplot(number_of_rows, number_of_cols, position)
        # Draw on this subplot explicitly instead of relying on pyplot's "current axes" state:
        sns.countplot(x=elem, data=df, order=desc_order, ax=ax)
        ax.set_title(elem.title(), fontsize=12, verticalalignment="bottom", y=0.95)
        ax.set_xticklabels(desc_order, fontsize=6, rotation=30, horizontalalignment="right", verticalalignment="top")
        ax.set(xlabel="", ylabel="")
    plt.show()

countplot_sns(beer_all)
# Size of subgroups consistent with stores' assortment.

# STEP 3. DATA PREPARATION

## 3.1 Select Data

### 3.1.1 Distribution of ratings.

In [None]:
g = sns.kdeplot(x=beer_all["rating"])
g.set(title="KDE Of Rating", xlabel="Rating")
g.set_xticks(range(-1, 12))
plt.show()
# Curve of ratings KDE is gaussian.

### 3.1.2 Distribution of alcohol content.

In [None]:
# Commercial beer is either regular, alcohol-free, or lemonade mix, and a smooth curve hides these groups.
# Set bandwidth lower than 1 to check if groups show up.  
g = sns.kdeplot(x=beer_all["abv"], label="smoothed curve")
g = sns.kdeplot(x=beer_all["abv"], bw_adjust=0.3, label="focused curve")
g.set(title="KDE Of Alcohol Content", xlabel="Alcohol content")
g.set_xticks(range(0, 11))
g.legend()
plt.show()
# Curve of alcohol content KDE is not gaussian.
# Still, it shows a pattern of three subgroups each with its own gaussian curve.

# Check proportion of alcohol-free beer because it influences the curve of alcohol content.
abv_list = beer_all["abv"].tolist()
abv_zero = (abv_list.count(0)/len(abv_list))*100
print(f"Alcohol-free are {abv_zero:.2f}% of total beer.")

### 3.1.3 Combined distribution of alcohol content and ratings.

In [None]:
# Calculate Spearman's correlation coefficient because it works for non-linear relationship if the variables are monotonic.
spearman_corr, _ = spearmanr(beer_all["abv"], beer_all["rating"])
print(f"Spearman\'s correlation: {spearman_corr:.2f}.")
# Result: -0.11, so if they have any kind of relationship, it is not monotonic.
# The scatterplot between abv and rating is consistent with Spearman's correlation coefficient.

# Plot KDE of alcohol content and rating. Set bandwidth less than 1 because distribution of ABV is not gaussian.
g = sns.kdeplot(data=beer_all, x=beer_all["abv"], y=beer_all["rating"], bw_adjust=0.7)
g.set(title="KDE of alcohol content and rating", xlabel="Alcohol content", ylabel="Rating")
plt.show() 
# There are three zones, so it makes sense to split the dataset into three groups after all cleanup is done.

### 3.1.4 Check country column if it has enough observations for each unique value, otherwise drop the column.

In [None]:
# Count occurrences for each country.
country_count = beer_all["country"].value_counts()

# Count occurrences for each country as percentage.
country_percentage = beer_all["country"].value_counts(normalize=True).mul(100).round(1)

# Collect all results in one dataframe.
country_stats = pd.DataFrame({"observations": country_count, "percentage": country_percentage})
# print(country_stats)
# There are 17 unique countries and 15 of them have each less than 5% of the total observations.

# Drop country column.
# Name the axis through columns=: the positional form drop("country", 1) was
# deprecated in pandas 1.3 and removed in pandas 2.0.
beer_all = beer_all.drop(columns="country")

### 3.1.5 Check which categorical features have high variation across subgroups for building models.

In [None]:
# Plot ratings variation in subgroups across categorical features.

def barplot_sns(df, y_numeric_column, graph_title, categorical_columns=None):
    """Seaborn subplots of bars showing mean of numeric column grouped by categorical columns.
    If list of categorical columns is not provided, uses columns of category type from dataframe.

    Args:
    df (pandas.DataFrame): Dataframe containing numerical and categorical columns.
    y_numeric_column (str): Name of numerical column.
    graph_title (str): Title of graph; it is title-cased before being shown.
    categorical_columns (list, optional): List of categorical columns.

    Returns:
    None. Shows one figure with a bar subplot per categorical column.
    Dotted horizontal line marks the overall mean of the numeric column. Bars in descending order of mean.
    Each subplot is titled with its column name; axis labels are left empty and the y-axis is fixed to 0-10.
    """
    # Check if input contains list of categorical columns. If not, select columns with data type category:
    if categorical_columns is not None:
        list_categoricals = categorical_columns
    else:
        list_categoricals = df.select_dtypes(include=["category"]).columns.tolist()

    # Set the number of columns to 3 because it fits most screens:
    number_of_cols = 3
    # Calculate the number of rows needed to hold one subplot per categorical column:
    number_of_rows = ceil(len(list_categoricals)/number_of_cols)

    # Plot figure and set title:
    fig = plt.figure()
    fig.suptitle(graph_title.title())
    # Subplot positions start at 1, hence enumerate(..., start=1):
    for position, elem in enumerate(list_categoricals, start=1):
        # Define bar sorting criteria as descending mean:
        subgroup_means = df.groupby(elem)[y_numeric_column].mean().reset_index()
        desc_order = list(subgroup_means.sort_values(by=y_numeric_column, ascending=False)[elem])
        ax = fig.add_subplot(number_of_rows, number_of_cols, position)
        # Draw on this subplot explicitly instead of relying on pyplot's "current axes" state:
        sns.barplot(x=elem, y=y_numeric_column, data=df, order=desc_order, dodge=False, ax=ax)
        ax.axhline(y=df[y_numeric_column].mean(), linestyle="dotted", color="black")
        ax.set_title(elem.title(), fontsize=12, verticalalignment="bottom", y=0.95)
        ax.set_xticklabels(desc_order, fontsize=6, rotation=30, horizontalalignment="right", verticalalignment="top")
        ax.set(xlabel="", ylabel="")
        ax.set_ylim([0, 10])

    plt.show()
 
barplot_sns(beer_all, "rating", "average rating of beer subgroups", categoricals)


# The barplot shows Flavor and Style have high variation between the average of their subgroups.
# Method and Fermentation have low variation across their subgroups.
# Errorbars are bigger on subgroups with low number of observations.

### Explain sequence for sorting bars of categoricals by value counts (numerical values) in the previous function. Use subgroups of flavor column as example.

In [None]:
# Start grouping all rows by flavor and select only the rating column, then calculate its mean.
# Reset the index to avoid future errors.
grouped = beer_all.groupby("flavor")["rating"].mean().reset_index()
# Then sort this series by ratings in descending order and select flavor labels.
ordered = grouped.sort_values(by="rating", ascending=False)["flavor"]
# Lastly, turn these grouped and ordered flavor labels into a list ready to use when creating graphs.
stored_in_list = list(ordered)
# In a more concise (and hard to read) line, the flow looks like this:
desc_order = list(beer_all.groupby("flavor")["rating"].mean().reset_index().sort_values(by="rating", ascending=False)["flavor"])

In [None]:
# Abstract representation of ratings distribution. Take a look at median, IQR, whiskers, outliers.

def boxplot_sns(df, y_numeric_column, graph_title, categorical_columns=None):
    """Seaborn subplots with boxplots of subgroups' distribution in categorical columns.
    If list of categorical columns is not provided, uses columns of category type from dataframe.

    Args:
    df (pandas.DataFrame): Dataframe containing numerical and categorical columns.
    y_numeric_column (str): Name of numerical column.
    graph_title (str): Title for graph; it is title-cased before being shown.
    categorical_columns (list, optional): List of categorical columns.

    Returns:
    None. Shows one figure with a boxplot subplot per categorical column.
    Boxes in descending order of median.
    Each subplot is titled with its column name; axis labels are left empty and the y-axis is fixed to 0-10.
    """

    # Check if input contains list of categorical columns. If not, select columns with data type category:
    if categorical_columns is not None:
        list_categoricals = categorical_columns
    else:
        list_categoricals = df.select_dtypes(include=["category"]).columns.tolist()

    # Set the number of columns to 3 because it fits most screens:
    number_of_cols = 3
    # Calculate the number of rows needed to hold one subplot per categorical column:
    number_of_rows = ceil(len(list_categoricals)/number_of_cols)

    # Plot figure and set title:
    fig = plt.figure()
    fig.suptitle(graph_title.title())
    # Subplot positions start at 1, hence enumerate(..., start=1):
    for position, elem in enumerate(list_categoricals, start=1):
        # Define box sorting criteria as descending median:
        subgroup_medians = df.groupby(elem)[y_numeric_column].median().reset_index()
        desc_order = list(subgroup_medians.sort_values(by=y_numeric_column, ascending=False)[elem])
        ax = fig.add_subplot(number_of_rows, number_of_cols, position)
        # Draw on this subplot explicitly instead of relying on pyplot's "current axes" state:
        sns.boxplot(x=elem, y=y_numeric_column, data=df, order=desc_order, ax=ax)
        ax.set_title(elem.title(), fontsize=12, verticalalignment="bottom", y=0.95)
        ax.set_xticklabels(desc_order, fontsize=6, rotation=30, horizontalalignment="right", verticalalignment="top")
        ax.set(xlabel="", ylabel="")
        ax.set_ylim([0, 10])

    plt.show()

boxplot_sns(beer_all, "rating", "Rating distribution of beer subgroups", categoricals)

# The boxplot shows Flavor and Style have high variation between the distributions of their subgroups.
# Method and Fermentation have low variation across their subgroups.
# Errorbars are bigger on subgroups with low number of observations.

## 3.2 Clean data.

In [None]:
# Ratings within two standard deviations of the mean count as normal;
# this rule is used because the dataset is small and has gaussian distribution.
mean_rating = beer_all["rating"].mean()
std_rating = beer_all["rating"].std()
two_std_rating = std_rating * 2

lower_limit_rating, upper_limit_rating = mean_rating - two_std_rating, mean_rating + two_std_rating

print(f"The mean rating is {mean_rating:.2f}.")
print(f"The lower limit of normal ratings is {lower_limit_rating:.2f} and the upper limit is {upper_limit_rating:.2f}.")

# Collect the ratings outside the normal range, in ascending order.
# Use result to train machine learning model and avoid overfitting.
outlier_ratings = sorted(
    rating
    for rating in beer_all["rating"]
    if rating < lower_limit_rating or rating > upper_limit_rating
)
# print(f"These are the rating outliers: {outlier_ratings}")
print(f"There are {len(outlier_ratings)} outliers out of {len(beer_all.rating)} total rating observations.")

## 3.3 Construct Data

### 3.3.1 Derived Attributes.

In [None]:
# Create column for filtration status.
unfiltered_words = ["unfiltered", "kellerbier", "natur", "naturtrubes", "nefiltrata", "nonfiltrata"]
beer_all["filtration"] = beer_all["name"].str.contains("|".join(unfiltered_words))
beer_all["filtration"] = beer_all["filtration"].replace({True: "unfiltered", False: "filtered"})

# Create column for pasteurization status.
unpasteurized_words = ["unpasteurized", "kellerbier", "natur", "naturtrubes", "nepasteurizata", "nonpastorizzata"]
beer_all["pasteurization"] = beer_all["name"].str.contains("|".join(unpasteurized_words))
beer_all["pasteurization"] = beer_all["pasteurization"].replace({True: "unpasteurized", False: "pasteurized"})

# Format the new columns as categoricals.
beer_all[["filtration", "pasteurization"]] = beer_all[["filtration", "pasteurization"]].astype("category")

### 3.3.2 Generated records.

In [None]:
# Create column to bin alcohol content as categorical data.
abv_bins = [0, 0.5, 2.8, 4.4, 5.5, max(beer_all["abv"])]
perception_labels = ["drive", "refresh", "weak", "tasty", "too_strong"]
beer_all["perception"] = pd.cut(beer_all["abv"], bins=abv_bins, labels=perception_labels, include_lowest=True)

## 3.4 Integrate Data

In [None]:
# 3.4.1 Select observations with ratings less than 2*std away from the mean. Use limits calculated at 3.2 and reset index.
outlier_condition = (beer_all.rating < lower_limit_rating) | (beer_all.rating > upper_limit_rating)
beer_2std = beer_all.drop(beer_all[outlier_condition].index)
beer_2std.reset_index(inplace=True, drop=True)
print(f"Dataframe without outliers has {len(beer_2std.rating)} rows.")
# Check this out, below selection by square brackets doesn't work in f-string, must use dot notation.
# print(f"There are {len(beer_2std["rating"])} observations with normal ratings.")

In [None]:
# 3.4.2 Split dataset by alcohol content into three subsets: alcohol-free, light and regular beer.
alc_free_all = beer_all[beer_all["abv"] <= 0.5]
light_all = beer_all[(beer_all["abv"] > 0.5) & (beer_all["abv"] <= 3)]
regular_all = beer_all[beer_all["abv"] > 3]

alc_free_2std = beer_2std[beer_2std["abv"] <= 0.5]
light_2std = beer_2std[(beer_2std["abv"] > 0.5) & (beer_2std["abv"] <= 3)]
regular_2std = beer_2std[beer_2std["abv"] > 3]

print(f"In complete ratings range there are {len(alc_free_all)} alcohol-free, {len(light_all)} light and {len(regular_all)} regular beers.")
print(f"In less than 2*std away ratings there are {len(alc_free_2std)} alcohol-free, {len(light_2std)} light and {len(regular_2std)} regular beers.")

In [None]:
# 3.4.3 Check distribution of ratings in subsets
# Plot distribution of ratings in each subset to check if their curves are gaussian.
def plot_rating_kde_comparison(all_ratings, trimmed_ratings, title):
    """Overlay the rating KDE of a complete subset and of its outlier-free counterpart.

    Args:
    all_ratings (pandas.Series): Ratings of the complete subset, drawn with label "all".
    trimmed_ratings (pandas.Series): Ratings of the same subset without outliers, drawn with label "2_std".
    title (str): Title of the figure.

    Returns:
    None. Shows one figure with both curves, a legend and x ticks from -1 to 11.
    """
    ax = sns.kdeplot(x=all_ratings, label="all")
    # Draw the second curve on the same axes so both distributions can be compared directly.
    sns.kdeplot(x=trimmed_ratings, label="2_std", ax=ax)
    ax.set(title=title, xlabel="Rating")
    ax.set_xticks(range(-1, 12))
    ax.legend()
    plt.show()

# One figure per alcohol-content subset: complete ratings versus ratings without outliers.
for subset_all, subset_2std, subset_title in [
    (alc_free_all, alc_free_2std, "Alcohol-free Beer Ratings"),
    (light_all, light_2std, "Light Beer Ratings"),
    (regular_all, regular_2std, "Regular Beer Ratings"),
]:
    plot_rating_kde_comparison(subset_all["rating"], subset_2std["rating"], subset_title)
# KDE plots of the six subsets prove that ratings keep their gaussian curve even if split by alcohol content criteria.

In [None]:
# Create a dictionary that stores datasets as values and their names as keys.
clean_dataframes = [beer_all, alc_free_all, light_all, regular_all, beer_2std, alc_free_2std, light_2std, regular_2std]
dataframe_names = ["beer_all", "alc_free_all", "light_all", "regular_all", "beer_2std", "alc_free_2std", "light_2std", "regular_2std"]

dataframes_dict = dict(zip(dataframe_names, clean_dataframes))

In [None]:
# 3.4.4 Check feature variation across subgroups when split into the three subsets based on alcohol content.

for key, value in dataframes_dict.items():
    boxplot_sns(value, "rating", f"Distribution of {key}")
    plt.show()

# All subsets have higher variation in Style and Flavor, and no variation in Method.
# Some subsets have a bit of variation in Filtration, Pasteurization, Fermentation and Perception.

## 3.5 Format Data

In [None]:
# Sort df on rating, then on ABV (both ascending).
beer_all.sort_values(["rating", "abv"], ascending=[True, True], inplace=True)

## 3.6 Dataset - output.

In [None]:
# Export dataframes and remove index because beer identification is done through their unique name.
# It's also good to avoid having two columns with indices next time the file is imported.

def export_files_csv(dataframes_dictionary, output_folder):
    """Export dictionary of pandas DataFrames to csv files, one file per entry.
    Must have names as keys and dataframes as values.

    Args:
    dataframes_dictionary (dict): Dictionary of dataframes to be exported.
    output_folder (str): Path to folder where to export result, with or without trailing separator.

    Returns:
    None. Writes <output_folder>/<key>.csv for every entry, without the index column.
    """

    for key, value in dataframes_dictionary.items():
        # os.path.join adds the path separator only when output_folder lacks one,
        # so "out" and "out/" both write into the folder instead of producing "outname.csv".
        output_address = os.path.join(output_folder, str(key) + ".csv")
        value.to_csv(output_address, index=False)

# Path to folder containing clean files.
# It must be a local folder: pandas cannot write csv files to a GitHub web address,
# and glob cannot list files behind one.
# Keep the trailing slash because the path is concatenated with file names and with "*.csv".
clean_files_path = "beer_output/cleaning_output/"
# Create the folder on first run so the export does not fail on a fresh checkout.
os.makedirs(clean_files_path, exist_ok=True)


export_files_csv(dataframes_dict, clean_files_path)

## 3.7 Dataset Description

In [None]:
# Print total number of columns, rows and ratings lower than 5.
# Print description of categorical columns.

for key, value in dataframes_dict.items():
    total_rows = value.shape[0]
    lower_than_five = value[value["rating"] < 5]
    only_lows = lower_than_five.shape[0]
    total_columns = value.shape[1]
    print(f"{key} \nRows: {total_rows} \nNumber of ratings lower than 5: {only_lows} \nColumns: {total_columns} \n")
    describe_categorical_columns(value, "rating")

# STEP 4. MODELING. MACHINE LEARNING WITH H2O

### Initialize H2O

In [None]:
# Assertions are disabled because they are mainly used for error checking and debugging purposes.
# nthreads=-1 means use all CPU on the host
h2o.init(nthreads=-1, enable_assertions=False)

### Collect .csv files.

In [None]:
# Select clean files.
clean_files = glob.glob(clean_files_path + "*.csv")

### Import datasets as dictionaries into H2O.

In [None]:
# Write function to import multiple files at once.

def files_to_h2o_frames(files, response_column):
    """Import files into H2O frames and pair each with its predictors and response.

    Args:
    files (list): List of files to be imported.
    response_column (str): Name of response column.

    Returns:
    dict: One entry per file, keyed by the file name without folder and extension.
    Each value is a dict with keys "frame" (the imported H2O frame),
    "predictors" (list of predictor column names) and "response" (response_column).
    """

    # Predictor sets are chosen by the first marker found in the dataset name.
    predictor_rules = (
        ("beer", ["style", "flavor", "perception", "abv"]),
        ("alc_free", ["style", "flavor"]),
        ("light", ["style", "flavor", "abv"]),
    )
    # Names matching no marker fall back to this set.
    fallback_predictors = ["style", "flavor", "pasteurization", "abv"]

    frames_by_name = {}
    for file_path in files:
        # PureWindowsPath accepts both "\\" and "/" as separators when extracting the stem.
        name = PureWindowsPath(file_path).stem
        frame = h2o.import_file(file_path)
        # Copy the matched list so every dataset owns an independent predictor list.
        predictors = next(
            (list(columns) for marker, columns in predictor_rules if marker in name),
            list(fallback_predictors),
        )
        frames_by_name[name] = {"frame": frame, "predictors": predictors, "response": response_column}
    return frames_by_name

frames_dictionary = files_to_h2o_frames(clean_files, "rating")

# Print the resulting dictionaries to check if they look right.
print(frames_dictionary)

### 4.1 Select Modeling Techniques.

In [None]:
# DRF, GBM - see motives in Readme file.

### 4.2 Generate Test Design.

In [None]:
# The model will learn from the training set and will be assessed on the test set.
# Train 0.7, valid 0.15 and test 0.15 splits were decided manually to ensure they are diverse no matter how few observations there are.
# See note in Readme file.

### 4.3 Build Models.

### 4.3.1 Distributed Random Forest - DRF.

In [None]:
# Write function to generate DRF model and export prediction as pandas dataframe.

def model_h2o_drf(frames, pred_output_folder, mse_output_folder, model_output_folder=None):
    """Build a DRF model in H2O for each dataset and export the model, its predictions and the MSE.

    Args:
    frames (dict): Dictionary keyed by dataset name. Each value is a dictionary with the keys
        "frame" (H2O frame with a "split" column holding "train", "valid" or "test"),
        "predictors" (list of predictor columns) and "response" (name of the response column).
    pred_output_folder (str): Folder, ending with a path separator, where the prediction csv files are written.
    mse_output_folder (str): Folder, ending with a path separator, where "drf_mse.csv" is written.
    model_output_folder (str, optional): Local folder where the MOJO archive of each model is saved.
        Defaults to pred_output_folder.

    Returns:
    None. The results are written to disk:
    zip archive (MOJO) of each model
    csv file with the dataset plus the "predict" column of each model
    csv file with the test-set MSE of all DRF models
    """

    # The MOJO archive is saved on the local filesystem, so the target has to be a folder path.
    if model_output_folder is None:
        model_output_folder = pred_output_folder

    # Names and test-set MSE of the models, collected in the same order.
    model_names = []
    mse_list = []

    for key, dataset in frames.items():
        name = str(key) + "_drf"
        model_names.append(name)

        frame = dataset["frame"]
        predictor_list = dataset["predictors"]
        response_column = dataset["response"]

        # Rows were assigned to a split beforehand, which keeps the sets identical between runs.
        train = frame[frame["split"] == "train"]
        valid = frame[frame["split"] == "valid"]
        test = frame[frame["split"] == "test"]

        # Instantiate model with custom parameters. mtries equals the number of predictors,
        # so every predictor is a candidate at each split.
        model = H2ORandomForestEstimator(
            seed=12,
            categorical_encoding="Enum",
            nfolds=4,
            fold_assignment="random",
            mtries=len(predictor_list),
            nbins=13,
            nbins_top_level=16,
            build_tree_one_node=True,
        )

        # The model id becomes the file name of the exported MOJO archive.
        model.train(
            x=predictor_list,
            y=response_column,
            training_frame=train,
            validation_frame=valid,
            model_id=name + "_model",
        )

        # Export model.
        model.download_mojo(path=model_output_folder, get_genmodel_jar=False)

        # Predict on the whole frame. The result is an H2O frame with one column named "predict".
        prediction = model.predict(frame)

        # Assess the model on the test rows only and read the MSE through the public accessor.
        mse_value = model.model_performance(test).mse()
        mse_list.append(mse_value)

        # Print model name, predictor list and MSE.
        print(f"This is DRF {name} model trained on {predictor_list} and its MSE is {mse_value:.2f}")

        # Add prediction to original H2O frame to help further analysis, then convert to pandas.
        dataset_plus_prediction_pandas = frame.cbind(prediction).as_data_frame()

        # Export prediction dataframe.
        dataset_plus_prediction_pandas.to_csv(pred_output_folder + name + ".csv", index=False)

    # Zip names and MSE values into a pandas dataframe and export it.
    mse_models = pd.DataFrame(zip(model_names, mse_list), columns=["model_name", "mse"])
    mse_models.to_csv(mse_output_folder + "drf_mse.csv", index=False)

### 4.3.2 Gradient Boosting Machine - GBM.

In [None]:
# Write function to generate GBM model and export prediction as pandas dataframe.

def model_h2o_gbm(frames, pred_output_folder, mse_output_folder, model_output_folder=None):
    """Build a GBM model in H2O for each dataset and export the model, its predictions and the MSE.

    Args:
    frames (dict): Dictionary keyed by dataset name. Each value is a dictionary with the keys
        "frame" (H2O frame with a "split" column holding "train", "valid" or "test"),
        "predictors" (list of predictor columns) and "response" (name of the response column).
    pred_output_folder (str): Folder, ending with a path separator, where the prediction csv files are written.
    mse_output_folder (str): Folder, ending with a path separator, where "gbm_mse.csv" is written.
    model_output_folder (str, optional): Local folder where the MOJO archive of each model is saved.
        Defaults to pred_output_folder.

    Returns:
    None. The results are written to disk:
    zip archive (MOJO) of each model
    csv file with the dataset plus the "predict" column of each model
    csv file with the test-set MSE of all GBM models
    """

    # The MOJO archive is saved on the local filesystem, so the target has to be a folder path.
    if model_output_folder is None:
        model_output_folder = pred_output_folder

    # Names and test-set MSE of the models, collected in the same order.
    model_names = []
    mse_list = []

    for key, dataset in frames.items():
        name = str(key) + "_gbm"
        model_names.append(name)

        frame = dataset["frame"]
        predictor_list = dataset["predictors"]
        response_column = dataset["response"]

        # Rows were assigned to a split beforehand, which keeps the sets identical between runs.
        train = frame[frame["split"] == "train"]
        valid = frame[frame["split"] == "valid"]
        test = frame[frame["split"] == "test"]

        # Instantiate model with custom parameters.
        model = H2OGradientBoostingEstimator(
            seed=12,
            categorical_encoding="Enum",
            nfolds=4,
            fold_assignment="random",
            min_rows=1,
            nbins=13,
            nbins_top_level=16,
            distribution="gaussian",
            build_tree_one_node=True,
        )

        # The model id becomes the file name of the exported MOJO archive.
        model.train(
            x=predictor_list,
            y=response_column,
            training_frame=train,
            validation_frame=valid,
            model_id=name + "_model",
        )

        # Export model.
        model.download_mojo(path=model_output_folder, get_genmodel_jar=False)

        # Predict on the whole frame. The result is an H2O frame with one column named "predict".
        prediction = model.predict(frame)

        # Assess the model on the test rows only and read the MSE through the public accessor.
        mse_value = model.model_performance(test).mse()
        mse_list.append(mse_value)

        # Print model name, predictor list and MSE.
        print(f"This is GBM {name} model trained on {predictor_list} and its MSE is {mse_value:.2f}.")

        # Add prediction to original H2O frame to help further analysis, then convert to pandas.
        dataset_plus_prediction_pandas = frame.cbind(prediction).as_data_frame()

        # Export prediction dataframe.
        dataset_plus_prediction_pandas.to_csv(pred_output_folder + name + ".csv", index=False)

    # Zip names and MSE values into a pandas dataframe and export it.
    mse_models = pd.DataFrame(zip(model_names, mse_list), columns=["model_name", "mse"])
    mse_models.to_csv(mse_output_folder + "gbm_mse.csv", index=False)

In [None]:
# Output folders for pandas prediction dataframes resulted from DRF and GBM models.
# The folders are relative to the repository root and mirror the layout of the GitHub repository.
# They must be local, writable folders: files cannot be saved to a GitHub web page URL.
# NOTE(review): the evaluation step reads predictions from beer_output/predictions/, while these folders
# point at beer_output/models/ - confirm which folder the prediction files are meant to live in.
pred_drf_folder = "beer_output/models/drf_models/"
pred_gbm_folder = "beer_output/models/gbm_models/"

# Output folder for MSE of models.
mse_folder = "beer_output/metrics/mse/"

# Call functions that build models and export dataframes.
model_h2o_drf(frames_dictionary, pred_drf_folder, mse_folder)
model_h2o_gbm(frames_dictionary, pred_gbm_folder, mse_folder)

# STEP 5. EVALUATION

## 5.1 Evaluate results.

In [None]:
# Folder containing predictions, relative to the repository root.
# glob only searches the local filesystem, so a web URL here would match no files at all.
predictions_path = "beer_output/predictions/"

# Select files containing predictions. With recursive=True, "**" also matches files in subfolders.
# Sort the matches because glob returns them in arbitrary order.
predictions_files = sorted(glob.glob(predictions_path + "**/*.csv", recursive=True))

# Extract name of files and use them to store dataframes.
predictions_names = [PureWindowsPath(elem).stem for elem in predictions_files]

# Read files into pandas dataframes.
predictions_dataframes = [pd.read_csv(item) for item in predictions_files]

# Create dictionary to store names and dataframes of predictions.
predictions_dict = dict(zip(predictions_names, predictions_dataframes))

### 5.1.1 Assessment of data mining results w.r.t. business success criteria.

In [None]:
# Create function to input dictionary of dataframes and return their recall scores.

def recall_score(input_dict, real_response_column, predict_column, threshold, recall_output_folder):
    """Calculate recall score of predictions and export them to csv file.

    An observation is positive when its real response is below the threshold.
    Recall is the share of positives whose prediction is also below the threshold.
    The name of the response column should be the same in all dataframes. Same applies for predicted column.
    Possible values of recall score are between 0 and 1.
    The recall is recorded as NaN if the total number of true positives plus false negatives is zero.

    Args:
    input_dict (dict): Dictionary of dataframes and their names.
    real_response_column (str): The name of the response column with numerical data as values.
    predict_column(str): The name of the predicted column with numerical data as values.
    threshold (int, float): The threshold that separates outcomes.
    recall_output_folder (str): Folder, ending with a path separator, where "recall_score.csv" is written.

    Returns:
    None. The recall scores are written to a csv file with the columns "model_name" and "recall".
    """

    # Names and recall scores of the models, collected in the same order.
    model_names = []
    recall_list = []

    for key, predictions_df in input_dict.items():
        model_names.append(str(key))

        # Positives are the observations really rated below the threshold.
        is_positive = predictions_df[real_response_column] < threshold

        # Count true positives and false negatives. Both comparisons are written out
        # so that a missing prediction counts as neither.
        tp_count = int((is_positive & (predictions_df[predict_column] < threshold)).sum())
        fn_count = int((is_positive & (predictions_df[predict_column] >= threshold)).sum())

        # Without positives the recall is undefined.
        # np.nan is used because the np.NaN alias no longer exists in NumPy 2.
        positives = tp_count + fn_count
        recall = tp_count / positives if positives else np.nan

        recall_list.append(recall)

    # Zip model names and recall scores into a pandas dataframe.
    recall_models = pd.DataFrame(zip(model_names, recall_list), columns=["model_name", "recall"])

    # Export recall dataframe to csv file.
    recall_models.to_csv(recall_output_folder + "recall_score.csv", index=False)

In [None]:
# Folder where to store recall, relative to the repository root.
# It must be a local, writable folder: a csv file cannot be saved to a GitHub web page URL.
recall_path = "beer_output/metrics/recall/"

# Generate recall scores. Beers rated below 5 are the ones the models should detect.
recall_score(predictions_dict, "rating", "predict", 5, recall_path)

In [None]:
# Import recall score file and MSE files of DRF and GBM models.
# The paths are relative to the repository root. A GitHub "blob" URL serves an HTML page,
# not the csv file itself, so it cannot be parsed by read_csv.
drf_gbm_recall_score = pd.read_csv("beer_output/metrics/recall/recall_score.csv")
drf_mse = pd.read_csv("beer_output/metrics/mse/drf_mse.csv")
gbm_mse = pd.read_csv("beer_output/metrics/mse/gbm_mse.csv")

# Stack the MSE files. Both have the columns "model_name" and "mse" and hold different models,
# so stacking avoids joining on a float column.
merged_mse = pd.concat([drf_mse, gbm_mse], ignore_index=True)

# Merge total MSE with recall score. The outer join keeps models that miss one of the metrics.
mse_recall = merged_mse.merge(drf_gbm_recall_score, on="model_name", how="outer")

In [None]:
# Create column to store the model type, derived from the suffix of the model name.
mse_recall["model_type"] = mse_recall["model_name"].str.contains("drf").map({True: "DRF", False: "GBM"})

# Create a column to store the range of each dataset, derived from the model name.
mse_recall["dataset_range"] = mse_recall["model_name"].str.contains("_2std").map({True: "2-std", False: "all"})

# Sort dataframe by recall score, best models first.
mse_recall = mse_recall.sort_values(["recall", "model_name"], ascending=False)

# Export dataframe. The path is relative to the repository root because a csv file
# cannot be saved to a GitHub web page URL.
mse_recall.to_csv("beer_output/metrics/mse_recall.csv", index=False)

### Recall score.

In [None]:
# Draw MSE and recall of every model on the same axes to compare the two metrics.
fig, ax = plt.subplots()
for metric, legend_label, line_style, marker_shape in (
    ("mse", "MSE", "dotted", "o"),
    ("recall", "Recall", "dashed", "D"),
):
    sns.lineplot(
        x="model_name",
        y=metric,
        data=mse_recall,
        label=legend_label,
        linestyle=line_style,
        marker=marker_shape,
        ax=ax,
    )
ax.set_title("MSE And Recall Score Of Models")
ax.set_xlabel("")
ax.set_ylabel("MSE and recall")
# One tick per model, with slanted labels so that long model names stay readable.
ax.set_xticks(np.arange(len(mse_recall["model_name"])))
plt.xticks(rotation=30, horizontalalignment="right", verticalalignment="top")
ax.legend()
plt.show()
# MSE is lowest on models with maximum recall score, which is to be expected.
# However, MSE doesn't mirror recall across the other models.
# This shows that a lower MSE doesn't necessarily mean a better model.

### 5.1.2 Approved Models.

In [None]:
# Print MSE and recall dataframe.
print(mse_recall)

# Plot recall scores twice: first split by model type, then split by dataset range.
# The dodge parameter is turned off to keep even space between bars when using hue.
for hue_column, chart_title in (
    ("model_type", "recall of models split by type"),
    ("dataset_range", "recall of models split by dataset range"),
):
    sns.barplot(x="model_name", y="recall", data=mse_recall, hue=mse_recall[hue_column], dodge=False)
    plt.xticks(rotation=30, horizontalalignment="right", verticalalignment="top")
    plt.ylabel(mse_recall["recall"].name.capitalize())
    plt.title(chart_title.title())
    plt.legend(loc="upper right")
    plt.show()

# Graph split by model type:
# DRF and GBM have similar results for the same dataset, except for the beer full range dataset where DRF performed better.
# Will use DRF models for predictions on the unseen dataset.

# Graph split by dataset range:
# Full range datasets generated better models than datasets with 2*std away ratings.
# This was to be expected because the goal of the project is to predict outliers, not the bulk of average ratings.
# Will keep outliers when testing the model on the unseen beer dataset.

## 5.2 Review Process.

### False negatives.

In [None]:
# Create function to input dictionary of frames and return dictionary of false negatives in each model:

def false_negatives_dictionary(input_dict, real_response_column, predict_column, threshold):
    """Collect the false negative observations of every model output dataframe.

    A false negative is a row whose real response is below the threshold
    while its prediction is at or above the threshold.

    Args:
    input_dict (dict): Dictionary of dataframes and their names.
    real_response_column (str): The name of the response column containing numerical data as values.
    predict_column(str): The name of the predicted column containing numerical data as values.
    threshold (int, float): The threshold that separates outcomes.

    Returns:
    dict: Dictionary of dataframes with false negatives. Keys are the input names followed by "_fn"
    and values are pandas dataframes. Dataframes without false negatives are left out.
    """

    missed_by_model = {}

    for dataset_label, predictions in input_dict.items():
        really_low = predictions[real_response_column].lt(threshold)
        predicted_high = predictions[predict_column].ge(threshold)
        missed_rows = predictions.loc[really_low & predicted_high]

        # Skip models that detected every low rated observation.
        if missed_rows.empty:
            continue

        missed_by_model[str(dataset_label) + "_fn"] = missed_rows

    return missed_by_model


# Low rated beers (rating below 5) that each model predicted as 5 or above.
false_negatives = false_negatives_dictionary(
    input_dict=predictions_dict,
    real_response_column="rating",
    predict_column="predict",
    threshold=5,
)

### False negatives split by occurrence of observations in subgroups.

In [None]:
def plot_occurrences(fn_dictionary):
    """Show one count plot per dataframe of false negatives, split by the "occurrence" column.

    All subplots are drawn in a single figure laid out as a 3 x 4 grid,
    so at most 12 dataframes can be plotted.
    Every dataframe needs an "occurrence" column; the code looks for the values "too_few" and "enough".

    Args:
    fn_dictionary (dict): Dictionary of dataframes and their names. The names become subplot titles.

    Returns:
    None. The figure is displayed with plt.show().
    """

    # Create figure and set its title:
    fig = plt.figure()
    fig.suptitle("Occurrence of observations in false negatives".title())
    # Start index of subplots:
    i=0

    # Iterate through input dictionary:
    for key, value in fn_dictionary.items():
        dataframe_name = str(key)

        # Add subplots sequentially.
        # Mark the first subplot as i+1 because subplot indices start at 1, and i is initialized at 0.
        ax = fig.add_subplot(3, 4, i+1)
        # Set title of subplot, placed inside the plot area to save vertical space:
        ax.set_title(f"{dataframe_name}", verticalalignment="top", y=0.9)
        
        # Select rows of each subgroup:
        too_few = value[value["occurrence"] == "too_few"]
        enough = value[value["occurrence"] == "enough"]

        # If there are "too_few" rows, plot the whole dataframe with the most frequent subgroup first.
        # Otherwise plot only the "enough" rows. This helps when aligning labels.
        # No ax is passed, so seaborn draws on the current axes, which is the subplot added above.
        if len(too_few["occurrence"]) > 0:
            sns.countplot(x="occurrence", data=value, order=value["occurrence"].value_counts().index)
        else:
            sns.countplot(x="occurrence", data=enough)

        # Count bars to be plotted because it helps with setting bar width:
        bar_number = value["occurrence"].nunique()

        # Set bar width proportional to the number of bars (0.3 for one bar, 0.6 for two).
        # The author's rationale: the plot area is shared between the bars, so a width that grows
        # with the number of bars results in bars that look equally wide in every subplot.
        # NOTE(review): rationale not verifiable from this code alone - confirm on the rendered figure.
        for patch in ax.patches:
            patch.set_width(0.3*bar_number)
        # Show value counts of observations as labels on top of bars:
        ax.bar_label(ax.containers[0])

        # Set only y label to show they are counts.
        # An x label would crowd the figure because of limited space between rows of graphs.
        # The title of the whole figure already says what's on x axis.
        ax.set(xlabel="", ylabel="Count")

        # Calculate middle points of bars and use them to mark location of x-ticks.
        # This is needed because the bars were resized after being drawn:
        midpoints = [patch.get_x() + patch.get_width() / 2 for patch in ax.patches]
        ax.set_xticks(midpoints)
        # Define list of labels, most frequent subgroup first, and set them under x-ticks:
        list_labels = list(value["occurrence"].value_counts().index)
        ax.set_xticklabels(list_labels)

        # Set common y limit for all subplots in order to have the same scale when visualizing them.
        # Counts above 20 would be cut off:
        plt.ylim([0, 20])

        # Increment the index for the next subplot:
        i += 1
    
    # Show all graphs in one figure.
    plt.show()

# Draw the occurrence count plots for the false negatives of every model.
plot_occurrences(fn_dictionary=false_negatives)

# Graphs show that most of the wrong predictions were generated for subgroups that had enough observations.

### 6.1.1 Import and clean unseen file following the same steps as with seen data

In [None]:
# Load the unseen beers. The first row of the file holds the column labels.
beer_unseen_raw = pd.read_csv(
    "C:\\Users\\alina\\OneDrive\\beer_project\\unseen_data\\unseen_input\\beer_unseen_raw.csv",
    header=0,
)

# Standardize appearance: lowercase column labels.
beer_unseen_raw.columns = beer_unseen_raw.columns.str.lower()

# Lowercase every cell that holds a string and leave the other cells untouched.
beer_unseen = beer_unseen_raw.applymap(lambda cell: cell.lower() if type(cell) is str else cell)

# Store beer names with the pandas string dtype instead of object.
beer_unseen["name"] = beer_unseen["name"].astype("string")

# The alcohol content is the last word of the name: cut it out and store it as a separate column.
beer_unseen["abv"] = [beer_name.rsplit(maxsplit=1)[-1] for beer_name in beer_unseen["name"]]

# Alcohol content was cut out as text, so convert it to float.
beer_unseen["abv"] = beer_unseen["abv"].astype(float)

# Beer names must be unique. When a name appears more than once keep only its last row.
beer_unseen.drop_duplicates(subset="name", keep="last", inplace=True)

# Create column for pasteurization status: a beer is unpasteurized when its name contains
# any of the unpasteurized words.
beer_unseen["pasteurization"] = (
    beer_unseen["name"]
    .str.contains("|".join(unpasteurized_words))
    .replace({True: "unpasteurized", False: "pasteurized"})
)

### 6.1.2 Create datasets based on alcohol content

In [None]:
# Split unseen dataset by alcohol content:
# alcohol-free up to 0.5, light above 0.5 and up to 3, regular above 3.
alc_free_unseen = beer_unseen.loc[beer_unseen["abv"].le(0.5)]
light_unseen = beer_unseen.loc[beer_unseen["abv"].gt(0.5) & beer_unseen["abv"].le(3)]
regular_unseen = beer_unseen.loc[beer_unseen["abv"].gt(3)]

# Map each identifier to its dataset, then derive the list of identificators and the list of datasets.
unseen_dict = {
    "alc_free_unseen": alc_free_unseen,
    "light_unseen": light_unseen,
    "regular_unseen": regular_unseen,
}
unseen_names = list(unseen_dict.keys())
clean_unseen = list(unseen_dict.values())

# Export unseen datasets after cleaning.
unseen_clean_path = "C:\\Users\\alina\\OneDrive\\beer_project\\unseen_data\\unseen_output\\unseen_clean\\"
export_files_csv(unseen_dict, unseen_clean_path)

### 6.1.3 Apply ML models on unseen data

In [None]:
# Start H2O.
h2o.init(nthreads=-1, enable_assertions=False)

# Import the cleaned unseen files, always in the order alcohol-free, light, regular.
alc_free_unseen_frame, light_unseen_frame, regular_unseen_frame = [
    h2o.import_file(unseen_file)
    for unseen_file in (
        "C:\\Users\\alina\\OneDrive\\beer_project\\unseen_data\\unseen_output\\unseen_clean\\alc_free_unseen.csv",
        "C:\\Users\\alina\\OneDrive\\beer_project\\unseen_data\\unseen_output\\unseen_clean\\light_unseen.csv",
        "C:\\Users\\alina\\OneDrive\\beer_project\\unseen_data\\unseen_output\\unseen_clean\\regular_unseen.csv",
    )
]

# Import the DRF models trained on the full range datasets in step 4.3.1.
alc_free_imported_model, light_imported_model, regular_imported_model = [
    h2o.import_mojo(mojo_file)
    for mojo_file in (
        "C:\\Users\\alina\\OneDrive\\beer_project\\beer_output\\models\\drf_models\\alc_free_all_drf_model.zip",
        "C:\\Users\\alina\\OneDrive\\beer_project\\beer_output\\models\\drf_models\\light_all_drf_model.zip",
        "C:\\Users\\alina\\OneDrive\\beer_project\\beer_output\\models\\drf_models\\regular_all_drf_model.zip",
    )
]

# Select test rows of each frame.
alc_free_unseen_test, light_unseen_test, regular_unseen_test = [
    unseen_frame[unseen_frame["split"] == "test"]
    for unseen_frame in (alc_free_unseen_frame, light_unseen_frame, regular_unseen_frame)
]

# Generate predictions for all rows of each frame with the model of the same beer category.
alc_free_unseen_prediction, light_unseen_prediction, regular_unseen_prediction = [
    imported_model.predict(unseen_frame)
    for imported_model, unseen_frame in (
        (alc_free_imported_model, alc_free_unseen_frame),
        (light_imported_model, light_unseen_frame),
        (regular_imported_model, regular_unseen_frame),
    )
]

# Calculate performance of each model on its test rows only.
alc_free_unseen_perf, light_unseen_perf, regular_unseen_perf = [
    imported_model.model_performance(test_rows)
    for imported_model, test_rows in (
        (alc_free_imported_model, alc_free_unseen_test),
        (light_imported_model, light_unseen_test),
        (regular_imported_model, regular_unseen_test),
    )
]

# Store model performance as json into a dictionary.
alc_free_perf_dict, light_perf_dict, regular_perf_dict = [
    unseen_perf._metric_json
    for unseen_perf in (alc_free_unseen_perf, light_unseen_perf, regular_unseen_perf)
]

### 6.1.4 Export predictions

In [None]:
# Append the "predict" column to each unseen frame.
alc_free_unseen_plus_pred, light_unseen_plus_pred, regular_unseen_plus_pred = [
    unseen_frame.cbind(unseen_prediction)
    for unseen_frame, unseen_prediction in (
        (alc_free_unseen_frame, alc_free_unseen_prediction),
        (light_unseen_frame, light_unseen_prediction),
        (regular_unseen_frame, regular_unseen_prediction),
    )
]

# Convert the H2O frames to pandas dataframes.
alc_free_unseen_pred_df, light_unseen_pred_df, regular_unseen_pred_df = [
    frame_plus_pred.as_data_frame()
    for frame_plus_pred in (alc_free_unseen_plus_pred, light_unseen_plus_pred, regular_unseen_plus_pred)
]

# Create list of unseen predictions and zip them with dataset identificators.
unseen_predictions = [alc_free_unseen_pred_df, light_unseen_pred_df, regular_unseen_pred_df]
unseen_predictions_dict = dict(zip(unseen_names, unseen_predictions))

# Export each prediction dataframe to its own csv file.
for unseen_pred_df, unseen_pred_file in zip(
    unseen_predictions,
    (
        "C:\\Users\\alina\\OneDrive\\beer_project\\unseen_data\\unseen_output\\unseen_predictions\\alc_free_unseen_pred.csv",
        "C:\\Users\\alina\\OneDrive\\beer_project\\unseen_data\\unseen_output\\unseen_predictions\\light_unseen_pred.csv",
        "C:\\Users\\alina\\OneDrive\\beer_project\\unseen_data\\unseen_output\\unseen_predictions\\regular_unseen_pred.csv",
    ),
):
    unseen_pred_df.to_csv(unseen_pred_file, index=False)

### 6.1.5 Calculate recall

In [None]:
# Write function to calculate and export recall scores.

def calculate_recall(
    dict_of_predictions,
    real_response_column="rating",
    predict_column="predict",
    threshold=5,
    fn_output_folder="C:\\Users\\alina\\OneDrive\\beer_project\\unseen_data\\unseen_output\\unseen_false_negatives\\",
    recall_output_folder="C:\\Users\\alina\\OneDrive\\beer_project\\unseen_data\\unseen_output\\unseen_metrics\\",
):
    """Calculate recall score of predictions and export the scores and the false negatives to csv files.

    An observation is positive when its real response is below the threshold.
    Recall is the share of positives whose prediction is also below the threshold.
    Possible values of recall score are between 0 and 1.
    The recall is recorded as NaN if the total number of true positives plus false negatives is zero.

    Args:
    dict_of_predictions (dict): Dictionary of dataframes and their names.
    real_response_column (str): The name of the response column with numerical data as values.
    predict_column (str): The name of the predicted column with numerical data as values.
    threshold (int, float): The threshold that separates outcomes.
    fn_output_folder (str): Folder, ending with a path separator, where one csv file
        of false negatives per dataset is written.
    recall_output_folder (str): Folder, ending with a path separator, where "recall_unseen.csv" is written.

    Returns:
    None. The false negatives and the recall scores are written to csv files.
    """

    # Names and recall scores, collected in the same order.
    # The names come from the dictionary itself so that every score is labelled with its own dataset.
    model_names = []
    recall_list = []

    for key, value in dict_of_predictions.items():
        name = str(key)
        model_names.append(name)

        # Positives are the observations really rated below the threshold.
        is_positive = value[real_response_column] < threshold
        tp = value[is_positive & (value[predict_column] < threshold)]
        fn = value[is_positive & (value[predict_column] >= threshold)]

        # Export fn dataframe.
        fn.to_csv(fn_output_folder + name + ".csv", index=False)

        # Without positives the recall is undefined.
        # np.nan is used because the np.NaN alias no longer exists in NumPy 2.
        positives = len(tp) + len(fn)
        recall = len(tp) / positives if positives else np.nan

        recall_list.append(recall)

    # Zip names and recall scores into a pandas dataframe.
    recall_df = pd.DataFrame(zip(model_names, recall_list), columns=["model_name", "recall"])

    # Export recall dataframe to csv file.
    recall_df.to_csv(recall_output_folder + "recall_unseen.csv", index=False)

# Score the predictions on the unseen datasets and export the false negatives.
calculate_recall(dict_of_predictions=unseen_predictions_dict)

### 6.1.6 Evaluate results

In [None]:
# Import recall file of unseen datasets.
recall_unseen = pd.read_csv("C:\\Users\\alina\\OneDrive\\beer_project\\unseen_data\\unseen_output\\unseen_metrics\\recall_unseen.csv")

# Sort recall dataframe by score, best first.
recall_unseen = recall_unseen.sort_values("recall", ascending=False)

# Print recall dataframe.
print(recall_unseen)

# Plot recall scores, one bar per unseen dataset.
# There is no legend because the bars are not split by hue, so there is nothing to label.
sns.barplot(x="model_name", y="recall", data=recall_unseen)
plt.title("recall of models on unseen datasets".title())
plt.ylabel("Recall")
# Recall is between 0 and 1, so show the full scale.
plt.ylim([0, 1])
plt.show()
# Graph shows that regular beer is the only one from which low rated beer was detected.
# Alcohol-free and light beers either had a score of 0, or didn't have low rated beer in the dataset.
# It's possible to improve these scores or lack of score by collecting more data.