# Brewery Dataset (Using Polars)

## Dataset Inspection (Bronze)

In [None]:
import polars as pl
import os

In [None]:
# Load brewery_operations.csv
brewery_ds = pl.read_csv('bronze/brewery_data_complete_extended.csv')

In [None]:
brewery_ds.schema

In [None]:
brewery_ds.head()

In [None]:
brewery_ds.describe()

The dataset does not contain any null values.

## Bronze -> Silver

Brew date is stored as a string, transforming to Date object.

In [None]:
# Convert 'Brew_Date' to datetime
brewery_ds_dt = brewery_ds.with_columns(
    pl.col('Brew_Date').str.to_datetime()
)

# Print an example of Brew_Date
brewery_ds_dt.select(pl.col('Brew_Date')).head()

Ingredient ratios are difficult to interpret. Splitting them into three columns that represent the percentage of each ingredient.
According to clarification here: https://www.kaggle.com/datasets/ankurnapa/brewery-operations-and-market-analysis-dataset/discussion/460903
The ratio represents water:grains:hops

In [None]:
# Split 'Ingredient_Ratio' into exactly three parts and cast to float
brewery_ds_dt_s = brewery_ds_dt.with_columns(
    pl.col('Ingredient_Ratio').str.split_exact(n=2, by=":").alias('Ingredients')
)

# Extract ratio values and cast to float
brewery_ds_dt_ss = brewery_ds_dt_s.with_columns([
    pl.col('Ingredients').struct.field('field_0').cast(pl.Float64).alias('Water_Ratio'),
    pl.col('Ingredients').struct.field('field_1').cast(pl.Float64).alias('Grains_Ratio'),
    pl.col('Ingredients').struct.field('field_2').cast(pl.Float64).alias('Hops_Ratio')
])

# Calculate total ratio
brewery_ds_dt_sss = brewery_ds_dt_ss.with_columns(
    (pl.col('Water_Ratio') + pl.col('Grains_Ratio') + pl.col('Hops_Ratio')).alias('Total_Ratio')
)

# Calculate percentages
brewery_ds_dt_ssss = brewery_ds_dt_sss.with_columns([
    (pl.col('Water_Ratio') / pl.col('Total_Ratio') * 100).alias('Water_Percentage'),
    (pl.col('Grains_Ratio') / pl.col('Total_Ratio') * 100).alias('Grains_Percentage'),
    (pl.col('Hops_Ratio') / pl.col('Total_Ratio') * 100).alias('Hops_Percentage')
])

# Drop unnecessary columns
brewery_ds_dt_ss_d = brewery_ds_dt_ssss.drop(['Ingredients', 'Water_Ratio', 'Grains_Ratio', 'Hops_Ratio', 'Total_Ratio'])

print(brewery_ds_dt_ss_d.head())

Adding the losses during each stage of brewing to a total loss is an easy and intuitive transformation to be made.
It is unclear, and unexplained in the Data Card, whether the volume loss is from the initial amount or from each sequential step. 
Operating under the assumption that volume losses are as percentages of the volume from the previous step.

In [None]:
brewery_ds_dt_sst = brewery_ds_dt_ss_d.with_columns(
    (
        100 * (1 - (
            (1 - pl.col('Loss_During_Brewing') / 100) *
            (1 - pl.col('Loss_During_Fermentation') / 100) *
            (1 - pl.col('Loss_During_Bottling_Kegging') / 100)
        ))
    ).alias('Total_Loss_Percentage')
)

print(brewery_ds_dt_sst.select(pl.col('Total_Loss_Percentage')).head())


In [None]:
# Save the dataset
brewery_ds_dt_sst.write_csv('silver/brewery_data.csv')

# Silver -> Gold: Ingredient Ratio effect on Alcohol Content and Quality Score

We'll need the Ingredient Rations (Water, Grains, Hops), the Alcohol Content, Quality Score.
Additional columns we'll include: Fermentation Time, Temperature, pH_Level, Gravity and Beer_Style, Batch_ID for identification

In [2]:
import polars as pl
silver_ds = pl.read_csv('silver/brewery_data.csv')
gold_ds = silver_ds.select([
    'Batch_ID',
    'Beer_Style',
    'Water_Percentage',
    'Grains_Percentage',
    'Hops_Percentage',
    'Fermentation_Time',
    'Temperature',
    'pH_Level',
    'Gravity',
    'Alcohol_Content',
    'Quality_Score'
])

# Calculate total ingredient percentage (as a sanity check)
gold_ds_t = gold_ds.with_columns(
    (
        pl.col('Water_Percentage') +
        pl.col('Grains_Percentage') +
        pl.col('Hops_Percentage')
    ).alias('Total_Ingredient_Percentage')
)

# Drop rows whose total ingredient percentage is not 100
gold_ds_tf = gold_ds_t.filter(pl.col('Total_Ingredient_Percentage') == 100)

# Drop the total ingredient percentage column
gold_ds_tfs = gold_ds_tf.drop('Total_Ingredient_Percentage')

print(gold_ds_tfs.count())
print(gold_ds_tfs.head())


shape: (1, 11)
┌──────────┬────────────┬────────────┬────────────┬───┬──────────┬─────────┬───────────┬───────────┐
│ Batch_ID ┆ Beer_Style ┆ Water_Perc ┆ Grains_Per ┆ … ┆ pH_Level ┆ Gravity ┆ Alcohol_C ┆ Quality_S │
│ ---      ┆ ---        ┆ entage     ┆ centage    ┆   ┆ ---      ┆ ---     ┆ ontent    ┆ core      │
│ u32      ┆ u32        ┆ ---        ┆ ---        ┆   ┆ u32      ┆ u32     ┆ ---       ┆ ---       │
│          ┆            ┆ u32        ┆ u32        ┆   ┆          ┆         ┆ u32       ┆ u32       │
╞══════════╪════════════╪════════════╪════════════╪═══╪══════════╪═════════╪═══════════╪═══════════╡
│ 6394088  ┆ 6394088    ┆ 6394088    ┆ 6394088    ┆ … ┆ 6394088  ┆ 6394088 ┆ 6394088   ┆ 6394088   │
└──────────┴────────────┴────────────┴────────────┴───┴──────────┴─────────┴───────────┴───────────┘
shape: (5, 11)
┌──────────┬────────────┬────────────┬───────────┬───┬──────────┬──────────┬───────────┬───────────┐
│ Batch_ID ┆ Beer_Style ┆ Water_Perc ┆ Grains_Pe ┆ … ┆ pH_Lev

In [24]:
mean_pH = gold_ds_tfs['pH_Level'].mean()
print(f"Mean pH Level: {mean_pH}")

tol = 0.1  
def relative_pH(pH_value):
    if pH_value < mean_pH - tol:
        return 'More Acidic'
    elif pH_value > mean_pH + tol:
        return 'More Alkaline'
    else:
        return 'Close to Mean'


# Add the 'Relative_pH_Level' column using apply()
gold_ds_tfs_p = gold_ds_tfs.with_columns(
    pl.col('pH_Level').map_elements(relative_pH, return_dtype=str).alias('Relative_pH_Level')
)


print(gold_ds_tfs_p.select(['Batch_ID', 'pH_Level', 'Relative_pH_Level']).head(10))


Mean pH Level: 4.999854772201371
shape: (10, 3)
┌──────────┬──────────┬───────────────────┐
│ Batch_ID ┆ pH_Level ┆ Relative_pH_Level │
│ ---      ┆ ---      ┆ ---               │
│ i64      ┆ f64      ┆ str               │
╞══════════╪══════════╪═══════════════════╡
│ 7870796  ┆ 5.289845 ┆ More Alkaline     │
│ 9810411  ┆ 5.275643 ┆ More Alkaline     │
│ 2623342  ┆ 4.778016 ┆ More Acidic       │
│ 6441292  ┆ 5.332881 ┆ More Alkaline     │
│ 8843420  ┆ 4.507213 ┆ More Acidic       │
│ 8178852  ┆ 4.911262 ┆ Close to Mean     │
│ 5607233  ┆ 4.809827 ┆ More Acidic       │
│ 3699698  ┆ 5.312025 ┆ More Alkaline     │
│ 1816588  ┆ 5.473564 ┆ More Alkaline     │
│ 4910947  ┆ 4.685796 ┆ More Acidic       │
└──────────┴──────────┴───────────────────┘


In [25]:
# Save the dataset
gold_ds_tfs_p.write_csv('gold/brewery_ratio_outcomes.csv')