# Part I: Data Wrangling + EDA

In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_fscore_support

In [19]:
breweries = pd.read_csv("data/breweries.csv")
beers = pd.read_csv("data/beers.csv")

In [20]:
beers.shape

(2410, 8)

In [21]:
breweries.shape

(558, 4)

**Let's find out the number of missing values in each variable of the variables in the `beers` dataset.**

In [22]:
beers.isnull().sum(axis = 0)

Unnamed: 0       0
abv             62
ibu           1005
id               0
name             0
style            5
brewery_id       0
ounces           0
dtype: int64

In [23]:
beers = beers.dropna(subset = ["style", "abv", "ibu"])

In [24]:
#Combine both datasets using a left merge, and create a singular dataframe
beer_df = pd.merge(beers, breweries, how = "left")
beer_df.head()

Unnamed: 0.1,Unnamed: 0,abv,ibu,id,name,style,brewery_id,ounces,city,state
0,14,0.061,60.0,1979,Bitter Bitch,American Pale Ale (APA),177,12.0,,
1,21,0.099,92.0,1036,Lower De Boom,American Barleywine,368,8.4,,
2,22,0.079,45.0,1024,Fireside Chat,Winter Warmer,368,12.0,,
3,24,0.044,42.0,876,Bitter American,American Pale Ale (APA),368,12.0,,
4,25,0.049,17.0,802,Hell or High Watermelon Wheat (2009),Fruit / Vegetable Beer,368,12.0,,


In [25]:
beer_df.describe()

Unnamed: 0.1,Unnamed: 0,abv,ibu,id,brewery_id,ounces
count,1403.0,1403.0,1403.0,1403.0,1403.0,1403.0
mean,1241.128297,0.059919,42.739843,1413.88881,223.375624,13.510264
std,691.675612,0.013585,25.962692,757.572191,150.38751,2.254112
min,14.0,0.027,4.0,1.0,0.0,8.4
25%,681.5,0.05,21.0,771.0,95.5,12.0
50%,1228.0,0.057,35.0,1435.0,198.0,12.0
75%,1864.5,0.068,64.0,2068.5,350.0,16.0
max,2408.0,0.125,138.0,2692.0,546.0,32.0


In [26]:
#Find the counts of the beer "styles" in the data
beer_df["style"].value_counts()

American IPA                          301
American Pale Ale (APA)               153
American Amber / Red Ale               77
American Double / Imperial IPA         75
American Blonde Ale                    61
                                     ... 
English Stout                           1
English Pale Mild Ale                   1
Old Ale                                 1
Roggenbier                              1
American Double / Imperial Pilsner      1
Name: style, Length: 90, dtype: int64

In [38]:
#We limit the beer styles to the top 5 with respect to commonality/popularity
styles = beer_df["style"].value_counts()[:5].index.tolist()
beer_df = beer_df[beer_df["style"].isin(styles)]

# Part II: Prediction Model

In [28]:
#Define split ratio for training and testing
num_training = int(len(beer_df) * 0.8)
num_testing = len(beer_df) - num_training

In [29]:
beer_X = beer_df[["abv", "ibu"]]
beer_Y = np.array(beer_df["style"])

**The code below splits the training and testing data using the ratio defined above.**

In [30]:
beer_train_X = beer_X[:num_training]
beer_test_X = beer_X[num_training:]
beer_train_Y = beer_Y[:num_training]
beer_test_Y = beer_Y[num_training:]

In [31]:
#This function creates a SVM [linear] classifier
def train_SVM(X, Y, kernel = "linear"):
    clf = SVC(kernel = kernel)
    clf.fit(X, Y)
    return clf

Using the training function above, the dataset is trained (using a SVM Linear Classifier). 

In [39]:
beer_clf = train_SVM(beer_train_X, beer_train_Y)

In [41]:
#Generate training and testing predictions using the primitive predict function.
beer_predicted_train_Y = beer_clf.predict(beer_train_X)
beer_predicted_test_Y = beer_clf.predict(beer_test_X)

# Part III: Model Assessment

In [34]:
print(classification_report(beer_train_Y, beer_predicted_train_Y))

                                precision    recall  f1-score   support

      American Amber / Red Ale       0.82      0.45      0.58        69
American Double / Imperial IPA       0.76      0.25      0.37        53
                  American IPA       0.69      0.84      0.76       236
       American Pale Ale (APA)       0.57      0.64      0.60       126

                      accuracy                           0.67       484
                     macro avg       0.71      0.54      0.58       484
                  weighted avg       0.69      0.67      0.65       484



In [35]:
print(classification_report(beer_test_Y, beer_predicted_test_Y))

                                precision    recall  f1-score   support

      American Amber / Red Ale       0.62      0.62      0.62         8
American Double / Imperial IPA       0.78      0.32      0.45        22
                  American IPA       0.70      0.72      0.71        65
       American Pale Ale (APA)       0.55      0.78      0.65        27

                      accuracy                           0.66       122
                     macro avg       0.66      0.61      0.61       122
                  weighted avg       0.68      0.66      0.64       122



In [36]:
confusion_matrix(beer_train_Y, beer_predicted_train_Y)

array([[ 31,   1,  10,  27],
       [  0,  13,  40,   0],
       [  0,   3, 198,  35],
       [  7,   0,  38,  81]])

In [37]:
confusion_matrix(beer_test_Y, beer_predicted_test_Y)

array([[ 5,  0,  2,  1],
       [ 1,  7, 14,  0],
       [ 0,  2, 47, 16],
       [ 2,  0,  4, 21]])