In [None]:
#Dataset link: https://www.kaggle.com/jtrofe/beer-recipes

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [None]:
df = pd.read_csv('/content/recipeData.csv', encoding="ISO-8859-1")

In [None]:
df.head()

Unnamed: 0,BeerID,Name,URL,Style,StyleID,Size(L),OG,FG,ABV,IBU,...,BoilGravity,Efficiency,MashThickness,SugarScale,BrewMethod,PitchRate,PrimaryTemp,PrimingMethod,PrimingAmount,UserId
0,1,Vanilla Cream Ale,/homebrew/recipe/view/1633/vanilla-cream-ale,Cream Ale,45,21.77,1.055,1.013,5.48,17.65,...,1.038,70.0,,Specific Gravity,All Grain,,17.78,corn sugar,4.5 oz,116.0
1,2,Southern Tier Pumking clone,/homebrew/recipe/view/16367/southern-tier-pumk...,Holiday/Winter Special Spiced Beer,85,20.82,1.083,1.021,8.16,60.65,...,1.07,70.0,,Specific Gravity,All Grain,,,,,955.0
2,3,Zombie Dust Clone - EXTRACT,/homebrew/recipe/view/5920/zombie-dust-clone-e...,American IPA,7,18.93,1.063,1.018,5.91,59.25,...,,70.0,,Specific Gravity,extract,,,,,
3,4,Zombie Dust Clone - ALL GRAIN,/homebrew/recipe/view/5916/zombie-dust-clone-a...,American IPA,7,22.71,1.061,1.017,5.8,54.48,...,,70.0,,Specific Gravity,All Grain,,,,,
4,5,Bakke Brygg Belgisk Blonde 50 l,/homebrew/recipe/view/89534/bakke-brygg-belgis...,Belgian Blond Ale,20,50.0,1.06,1.01,6.48,17.84,...,1.05,72.0,,Specific Gravity,All Grain,,19.0,Sukkerlake,6-7 g sukker/l,18325.0


In [None]:
df.dtypes

BeerID          int64
URL            object
Style          object
StyleID         int64
Size(L)       float64
OG            float64
FG            float64
ABV           float64
IBU           float64
Color         float64
BoilSize      float64
BoilTime        int64
Efficiency    float64
SugarScale     object
BrewMethod     object
dtype: object

#CLEANING THE DATASET

In [None]:
#extracting the percentage of missing data in each column
df = df.drop(['Name'],axis=1)
percent_missing = df.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'column_name': df.columns,'percent_missing': percent_missing})
missing_value_df

Unnamed: 0,column_name,percent_missing
BeerID,BeerID,0.0
URL,URL,0.0
Style,Style,0.806921
StyleID,StyleID,0.0
Size(L),Size(L),0.0
OG,OG,0.0
FG,FG,0.0
ABV,ABV,0.0
IBU,IBU,0.0
Color,Color,0.0


In [None]:
#excluding columns that the percentage of missing data is bigger than 0.9
for i in range(len(missing_value_df['percent_missing'])):
  if missing_value_df['percent_missing'][i] > 0.9:
    column_name = missing_value_df['column_name'][i]
    df = df.drop(labels=column_name,axis=1)

In [None]:
#verfying
percent_missing = df.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'column_name': df.columns,'percent_missing': percent_missing})
missing_value_df

Unnamed: 0,column_name,percent_missing
BeerID,BeerID,0.0
URL,URL,0.0
Style,Style,0.806921
StyleID,StyleID,0.0
Size(L),Size(L),0.0
OG,OG,0.0
FG,FG,0.0
ABV,ABV,0.0
IBU,IBU,0.0
Color,Color,0.0


In [None]:
#just seeing the associations of the id's and the style names
analyse_df = df[['Style','StyleID']].drop_duplicates().dropna().sort_values(by=['StyleID'], ascending=True)
analyse_df

Unnamed: 0,Style,StyleID
246,Altbier,1
1981,Alternative Grain Beer,2
4268,Alternative Sugar Beer,3
48,American Amber Ale,4
509,American Barleywine,5
...,...,...
2638,Wheatwine,172
1733,Wild Specialty Beer,173
139,Winter Seasonal Beer,174
50,Witbier,175


In [None]:
#dropping the URL column (not relevant)
df = df.drop(['URL'],axis=1)

In [None]:
#converting object features to numeric
df = pd.get_dummies(df)

In [None]:
df.head()

Unnamed: 0,BeerID,StyleID,Size(L),OG,FG,ABV,IBU,Color,BoilSize,BoilTime,...,Style_Wild Specialty Beer,Style_Winter Seasonal Beer,Style_Witbier,Style_Wood-Aged Beer,SugarScale_Plato,SugarScale_Specific Gravity,BrewMethod_All Grain,BrewMethod_BIAB,BrewMethod_Partial Mash,BrewMethod_extract
0,1,45,21.77,1.055,1.013,5.48,17.65,4.83,28.39,75,...,0,0,0,0,0,1,1,0,0,0
1,2,85,20.82,1.083,1.021,8.16,60.65,15.64,24.61,60,...,0,0,0,0,0,1,1,0,0,0
2,3,7,18.93,1.063,1.018,5.91,59.25,8.98,22.71,60,...,0,0,0,0,0,1,0,0,0,1
3,4,7,22.71,1.061,1.017,5.8,54.48,8.5,26.5,60,...,0,0,0,0,0,1,1,0,0,0
4,5,20,50.0,1.06,1.01,6.48,17.84,4.57,60.0,90,...,0,0,0,0,0,1,1,0,0,0


In [None]:
#splitting:
x = df.drop(['StyleID'], axis= 1)
y = df['StyleID']

In [None]:
#setting up the ML models (for classification)
knn = KNeighborsClassifier()
lr = LogisticRegression() 
dtree = DecisionTreeClassifier()

In [None]:
#applying Stratified KFold for model comparision
skfold = StratifiedKFold(n_splits=5)

result_knn = cross_val_score(knn,x,y,cv = skfold)
result_lr = cross_val_score(lr,x,y,cv = skfold)
result_dtree = cross_val_score(dtree,x,y,cv = skfold)

In [None]:
#results: (the accuracy mean of each model)
print(f'KNN Result: {result_knn.mean()}')
print(f'LR Result: {result_lr.mean()}')
print(f'Decision Tree Result: {result_dtree.mean()}')

KNN Result: 0.0009206432963701297
LR Result: 0.1616550007277751
Decision Tree Result: 0.9984295408762801
