In [2]:
# Import Dependencies
import pandas as pd
from pathlib import Path
import numpy as np
import scipy.stats as stats


In [3]:
# Relative location of the beer profile / ratings data set (.csv)
csv_path = Path("Resources/Beer Profile and Ratings Data Set-RUTHGN.csv")

# Load the CSV contents into a Pandas dataframe
beer_df = pd.read_csv(filepath_or_buffer=csv_path)

In [4]:
# First look at the loaded dataframe: report the row count, print a separator,
# then leave the first five rows as the cell's displayed result
print(f'Total number of rows: {beer_df.shape[0]}', "------------------------", sep="\n")
beer_df.head(5)

Total number of rows: 3197
------------------------


Unnamed: 0,Name,Style,Brewery,Beer Name (Full),Description,ABV,Min IBU,Max IBU,Astringency,Body,...,Fruits,Hoppy,Spices,Malty,review_aroma,review_appearance,review_palate,review_taste,review_overall,number_of_reviews
0,Amber,Altbier,Alaskan Brewing Co.,Alaskan Brewing Co. Alaskan Amber,"Notes:Richly malty and long on the palate, wit...",5.3,25,50,13,32,...,33,57,8,111,3.498994,3.636821,3.556338,3.643863,3.847082,497
1,Double Bag,Altbier,Long Trail Brewing Co.,Long Trail Brewing Co. Double Bag,"Notes:This malty, full-bodied double alt is al...",7.2,25,50,12,57,...,24,35,12,84,3.798337,3.846154,3.904366,4.024948,4.034304,481
2,Long Trail Ale,Altbier,Long Trail Brewing Co.,Long Trail Brewing Co. Long Trail Ale,Notes:Long Trail Ale is a full-bodied amber al...,5.0,25,50,14,37,...,10,54,4,62,3.409814,3.667109,3.600796,3.6313,3.830239,377
3,Doppelsticke,Altbier,Uerige Obergärige Hausbrauerei GmbH / Zum Uerige,Uerige Obergärige Hausbrauerei GmbH / Zum Ueri...,Notes:,8.5,25,50,13,55,...,49,40,16,119,4.148098,4.033967,4.150815,4.205163,4.005435,368
4,Sleigh'r Dark Doüble Alt Ale,Altbier,Ninkasi Brewing Company,Ninkasi Brewing Company Sleigh'r Dark Doüble A...,Notes:Called 'Dark Double Alt' on the label.Se...,7.2,25,50,25,51,...,11,51,20,95,3.625,3.973958,3.734375,3.765625,3.817708,96


In [5]:
# Preview the full column list and each column's data type before performing any transformation.
# `dtypes` is the last expression in the cell, so the notebook displays it as the cell output.
beer_df.dtypes

Name                  object
Style                 object
Brewery               object
Beer Name (Full)      object
Description           object
ABV                  float64
Min IBU                int64
Max IBU                int64
Astringency            int64
Body                   int64
Alcohol                int64
Bitter                 int64
Sweet                  int64
Sour                   int64
Salty                  int64
Fruits                 int64
Hoppy                  int64
Spices                 int64
Malty                  int64
review_aroma         float64
review_appearance    float64
review_palate        float64
review_taste         float64
review_overall       float64
number_of_reviews      int64
dtype: object

In [6]:
# Null check: tally the non-null entries in every column. A column whose tally is
# lower than the dataframe's row count holds missing values that need to be addressed.
beer_df.notna().sum()

# Observation: every column's tally equals the row count (3197), so no rows are incomplete

Name                 3197
Style                3197
Brewery              3197
Beer Name (Full)     3197
Description          3197
ABV                  3197
Min IBU              3197
Max IBU              3197
Astringency          3197
Body                 3197
Alcohol              3197
Bitter               3197
Sweet                3197
Sour                 3197
Salty                3197
Fruits               3197
Hoppy                3197
Spices               3197
Malty                3197
review_aroma         3197
review_appearance    3197
review_palate        3197
review_taste         3197
review_overall       3197
number_of_reviews    3197
dtype: int64

In [7]:
# Survey the distinct "Style" labels (how many there are, then the labels themselves)
# to spot similar styles that could be consolidated during further cleaning.
# dropna=False keeps the count identical to the length of the unique() array printed below.
print(f"Number of styles: {beer_df['Style'].nunique(dropna=False)}")
print("------------------------")
print(beer_df['Style'].unique())

Number of styles: 111
------------------------
['Altbier' 'Barleywine - American' 'Barleywine - English'
 'Bitter - English Extra Special / Strong Bitter (ESB)' 'Bitter - English'
 'Bière de Champagne / Bière Brut' 'Blonde Ale - American'
 'Blonde Ale - Belgian' 'Bock - Doppelbock' 'Bock - Eisbock'
 'Bock - Maibock' 'Bock - Traditional' 'Bock - Weizenbock' 'Braggot'
 'Brett Beer' 'Brown Ale - American' 'Brown Ale - Belgian Dark'
 'Brown Ale - English' 'California Common / Steam Beer' 'Chile Beer'
 'Cream Ale' 'Dubbel' 'Farmhouse Ale - Bière de Garde'
 'Farmhouse Ale - Sahti' 'Farmhouse Ale - Saison' 'Fruit and Field Beer'
 'Gruit / Ancient Herbed Ale' 'Happoshu' 'Herb and Spice Beer'
 'IPA - American' 'IPA - Belgian' 'IPA - Black / Cascadian Dark Ale'
 'IPA - English' 'IPA - Imperial' 'IPA - New England' 'Kvass' 'Kölsch'
 'Lager - Adjunct' 'Lager - American Amber / Red' 'Lager - American'
 'Lager - European / Dortmunder Export' 'Lager - European Dark'
 'Lager - European Pale' 'Lager - 

In [8]:
# Per-style summary statistics (count, mean, std, ...) for the dataframe's columns.
# Group by the 'Style' column label rather than passing the column Series itself;
# `style_summ` remains the GroupBy object so it can be reused for other aggregations.
style_summ = beer_df.groupby('Style')

# End the cell on the expression instead of wrapping it in print(): the notebook then
# renders the summary as a rich table, whereas print() falls back to plain text that
# wraps the wide frame into backslash-continued column chunks.
style_summ.describe()

                                                     ABV                       \
                                                   count       mean       std   
Style                                                                           
Altbier                                             39.0   5.748718  1.139164   
Barleywine - American                               38.0  11.165789  1.313192   
Barleywine - English                                26.0  11.270769  1.944051   
Bitter - English                                    41.0   4.334146  0.558395   
Bitter - English Extra Special / Strong Bitter ...  33.0   5.765152  0.747755   
...                                                  ...        ...       ...   
Wheat Beer - Kristallweizen                         26.0   4.730769  1.426259   
Wheat Beer - Wheatwine                              15.0  10.683333  1.167670   
Wheat Beer - Witbier                                36.0   5.511111  1.462244   
Wild Ale                    

In [None]:
# Isolate the primary beer styles from any substyles (which appear after a hyphen "-" in initial Styles data)

# create new column to the right of existing Style column

# for loop to iterate through the data and append the sub-style found to the right of the "- " (hyphen and space) to the new Sub_Style column
# try/except needed to skip rows that have no sub-style (i.e., do not contain "- ")

# TODO: which function should be used to remove the sub-style from rows in the Style column once it has been separated?

In [None]:
# TODO: histogram to visualize the major beer styles -- by review count, by average rating, or both?

In [28]:
# Sanity check before promoting 'Beer Name (Full)' to the index: the number of
# distinct full beer names must match the dataframe's total row count (3197)
print(beer_df['Beer Name (Full)'].nunique(dropna=False))
print(beer_df.shape[0])

3197
3197


In [30]:
# Set index to the unique, concatenated beer name in column 'Beer Name (Full)'.
# verify_integrity=True makes pandas raise a ValueError if any full beer name is
# duplicated, so the uniqueness this index relies on is enforced on every run
# (e.g. after the source CSV changes) instead of being assumed.
beer_name = beer_df.set_index('Beer Name (Full)', verify_integrity=True)

# Display the first rows of the re-indexed dataframe
beer_name.head()

Unnamed: 0_level_0,Name,Style,Brewery,Description,ABV,Min IBU,Max IBU,Astringency,Body,Alcohol,...,Fruits,Hoppy,Spices,Malty,review_aroma,review_appearance,review_palate,review_taste,review_overall,number_of_reviews
Beer Name (Full),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alaskan Brewing Co. Alaskan Amber,Amber,Altbier,Alaskan Brewing Co.,"Notes:Richly malty and long on the palate, wit...",5.3,25,50,13,32,9,...,33,57,8,111,3.498994,3.636821,3.556338,3.643863,3.847082,497
Long Trail Brewing Co. Double Bag,Double Bag,Altbier,Long Trail Brewing Co.,"Notes:This malty, full-bodied double alt is al...",7.2,25,50,12,57,18,...,24,35,12,84,3.798337,3.846154,3.904366,4.024948,4.034304,481
Long Trail Brewing Co. Long Trail Ale,Long Trail Ale,Altbier,Long Trail Brewing Co.,Notes:Long Trail Ale is a full-bodied amber al...,5.0,25,50,14,37,6,...,10,54,4,62,3.409814,3.667109,3.600796,3.6313,3.830239,377
Uerige Obergärige Hausbrauerei GmbH / Zum Uerige Uerige Doppelsticke,Doppelsticke,Altbier,Uerige Obergärige Hausbrauerei GmbH / Zum Uerige,Notes:,8.5,25,50,13,55,31,...,49,40,16,119,4.148098,4.033967,4.150815,4.205163,4.005435,368
Ninkasi Brewing Company Sleigh'r Dark Doüble Alt Ale,Sleigh'r Dark Doüble Alt Ale,Altbier,Ninkasi Brewing Company,Notes:Called 'Dark Double Alt' on the label.Se...,7.2,25,50,25,51,26,...,11,51,20,95,3.625,3.973958,3.734375,3.765625,3.817708,96


In [32]:
# Top of the review-count ranking: the five beers with the most reviews
(
    beer_name['number_of_reviews']
    .sort_values(ascending=False)
    .head(5)
)

Beer Name (Full)
Dogfish Head Brewery 90 Minute IPA                         3290
Sierra Nevada Brewing Co. Sierra Nevada Celebration Ale    3000
Stone Brewing Co. Stone Ruination IPA                      2704
Russian River Brewing Company Pliny The Elder              2527
Founders Brewing Company Founders Breakfast Stout          2502
Name: number_of_reviews, dtype: int64

In [33]:
# Bottom of the review-count ranking: the first five beers after sorting by
# review count in ascending order
(
    beer_name['number_of_reviews']
    .sort_values(ascending=True)
    .head(5)
)

Beer Name (Full)
Harpoon Brewery Barrel Aged Munich Dark                                1
Great Lakes Brewing Company Coffee Infused Edmund Fitzgerald Porter    1
Telluride Brewing Co. Bridal Veil Rye Pale Ale                         1
Boulevard Brewing Co. Westside Rye                                     1
New Belgium Brewing Trippel Grand Cru                                  1
Name: number_of_reviews, dtype: int64