In [82]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import MinMaxScaler

In [83]:
# Load the prepared feature table and outcome table, then discard the
# "Unnamed: 0" column from each (presumably a row index written out when the
# CSVs were saved -- confirm against the prep step).
features = pd.read_csv("../data/prepped/modeling_features.csv")
features = features.drop(columns=["Unnamed: 0"])
labels = pd.read_csv("../data/prepped/modeling_outcome.csv")
labels = labels.drop(columns=["Unnamed: 0"])

In [84]:
# Make the outcome categorical: snap every rating to the nearest 0.25 and
# store it as a string label (e.g. "3.25") so it is treated as a class.
#
# Each value goes through float() first so this cell can be re-run safely.
# Once the column already holds string labels, `i * 4` would repeat the text
# rather than multiply it and round() would raise TypeError; float() parses
# the existing label back to a number, so a second run is a no-op.
#
# Note: round() resolves exact .5 ties to the nearest even integer.
labels["rating_value"] = [
    str(round(float(i) * 4) / 4) for i in labels["rating_value"]
]

In [85]:
# Hold out 25% of the rows as a test set (fixed seed for repeatability); the
# remaining 75% is what the models below are fitted and cross-validated on.
(train_features, test_features,
 train_outcome, test_outcome) = train_test_split(
    features, labels["rating_value"], random_state=42, test_size=0.25
)

# Shuffled 10-fold splitter shared by every grid search in this notebook.
folds = KFold(shuffle=True, random_state=42, n_splits=10)

In [86]:
# Estimator: rescale every feature with MinMaxScaler, then classify with a
# multi-layer perceptron.
pipeline = make_pipeline(MinMaxScaler(), MLPClassifier())

# Hyper-parameter grid. Every entry lists a single value, so the search
# evaluates exactly one candidate across the cross-validation folds.
# Keys use the "<step name>__<parameter>" form to reach into the pipeline.
pipeline_params = {
    "mlpclassifier__activation": ["relu"],
    "mlpclassifier__alpha": [0.001],
    "mlpclassifier__beta_1": [0.07],
    "mlpclassifier__beta_2": [0.999],
    "mlpclassifier__early_stopping": [False],
    "mlpclassifier__hidden_layer_sizes": [100],
    "mlpclassifier__learning_rate": ["constant"],
    "mlpclassifier__random_state": [42],
    "mlpclassifier__solver": ["adam"],
}

# Cross-validate on the training split, then score on the held-out test split.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=pipeline_params,
    cv=folds,
    verbose=True,
)
model.fit(train_features, train_outcome)
score = model.score(test_features, test_outcome)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    7.0s finished


In [92]:
# Report the test-set score and the hyper-parameters selected by the search.
print("model score:", score)
print("best params:", model.best_params_)

# Predict a rating class for every row of the full feature table (training
# and held-out rows alike) and print the positional index of each row that
# was assigned to the "3.0" class.
test = model.predict(features)
for row_position in np.flatnonzero(test == "3.0"):
    print(row_position)

# Raise the column display limit so the wide single-row preview below is
# shown in full.
pd.set_option('display.max_columns', 500)

# Inspect row 68, which the recorded run lists among the "3.0" predictions.
features[68:69]

model score: 0.5853658536585366
best params: {'mlpclassifier__activation': 'relu', 'mlpclassifier__alpha': 0.001, 'mlpclassifier__beta_1': 0.07, 'mlpclassifier__beta_2': 0.999, 'mlpclassifier__early_stopping': False, 'mlpclassifier__hidden_layer_sizes': 100, 'mlpclassifier__learning_rate': 'constant', 'mlpclassifier__random_state': 42, 'mlpclassifier__solver': 'adam'}
68
1371


Unnamed: 0,abv,ibu,diff_g,boil_time,efficiency,ferm_total_weight,ferm_type_base_malt,ferm_type_crystal_malt,ferm_type_roasted_malt,ferm_type_other,ferm_type_extract,ferm_type_sugar,ferm_type_raw,ferm_type_acidulated_malt,ferm_type_fruit,ferm_type_gluten_free_malt,hops_type_pellet,hops_type_leaf_whole,hops_type_plug,other_type_spice,other_type_water_agt,other_type_other,other_type_fining,other_type_flavor,other_type_herb,yeast_attenuation,method_all_grain,method_biab,method_extract,method_partial_mash,yeast_form_dry,yeast_form_liquid,yeast_flocculation_high,yeast_flocculation_low,yeast_flocculation_medium
68,22.05,0.0,0.168,60,70.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,75.0,1,0,0,0,0,1,0,0,1


In [67]:
# k-nearest-neighbours baseline: the same MinMax scaling step followed by a
# KNeighborsClassifier. Note that this rebinds `pipeline`, `pipeline_params`,
# `model` and `score`, replacing the values from the previous model.
pipeline = make_pipeline(MinMaxScaler(), KNeighborsClassifier())

# Single-candidate grid: only k = 10 neighbours is evaluated.
pipeline_params = {"kneighborsclassifier__n_neighbors": [10]}

# Cross-validate on the training split, then score on the held-out test split.
model = GridSearchCV(estimator=pipeline, param_grid=pipeline_params, cv=folds)
model.fit(train_features, train_outcome)
score = model.score(test_features, test_outcome)

In [62]:
# Show the test-set score and the selected parameters for whichever search
# `model` / `score` currently refer to (the kNN search when run in order).
for caption, value in (
    ("model score:", score),
    ("best params:", model.best_params_),
):
    print(caption, value)

model score: 0.5304878048780488
best params: {'kneighborsclassifier__n_neighbors': 10}
