In [36]:
# Load Dataset
import pandas
import re

# Load Kaggle Wine Dataset: https://www.kaggle.com/zynicide/wine-reviews
try:
  wine_reviews = pandas.read_csv("../data/winemag-data-130k-v2.csv")
except:
  wine_reviews = pandas.read_csv("https://drive.google.com/uc?export=download&id=1UFKyzq8aTg-1hgYVxVA0bm7D2mmKqSk9")

# Parse Year from Title
year = []
for title in wine_reviews.title:
    year_match = re.search("(\d{4})", title)
    if year_match:
        year.append(year_match.group(1))
    else:
        year.append(None)

wine_reviews.insert(wine_reviews.shape[1], value=year, column="year")

# Drop incomplete data
wine_reviews.drop(columns="Unnamed: 0", inplace=True)
wine_reviews.dropna(axis=0, inplace=True)

# Correct Data Types
wine_reviews = wine_reviews.astype({
    "country": "category",
    "description": "string",
    "designation": "category",
    "points": "int64",
    "price": "float64",
    "province": "category",
    "region_1": "category",
    "region_2": "category",
    "taster_name": "category",
    "taster_twitter_handle": "category",
    "title": "string",
    "variety": "category",
    "winery": "category",
    "year": "int64",
})

In [37]:
# Transfom and Featurize Data
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import *
from sklearn.pipeline import Pipeline

# Build list of column transformers
col_tf = {}

# Drop Points
col_tf["standard"] = [
    ("points", "drop", make_column_selector("points")),
    ("price", PowerTransformer(method="box-cox"), make_column_selector("price")),
    ("year", MinMaxScaler(), make_column_selector("year")),
    ("taster", "drop", make_column_selector("taster_*")),
]

# Build OneHot and Ordinal Encoders for Catergorial Data
col_tf["OneHotEncoder"] = []
col_tf["OrdinalEncoder"] = []
for feat in wine_reviews.select_dtypes(include=['category']).columns:
    categories = wine_reviews[feat].unique()
    onehot = OneHotEncoder(categories=[categories])
    ordinal = OrdinalEncoder(categories=[categories])
    col_tf["OneHotEncoder"].append((feat, onehot, make_column_selector(feat)))
    col_tf["OrdinalEncoder"].append((feat, ordinal, make_column_selector(feat)))

In [38]:
from sklearn.preprocessing import FunctionTransformer

# Sparse to Dense Transformer
DenseTransformer = FunctionTransformer(
    func = lambda x: x.toarray(),
    accept_sparse=True,
)

# Requires Dense
req_dense = [
    "GaussianProcessClassifier",
    "GaussianProcessRegressor",
    "QuadraticDiscriminantAnalysis",
]

def add_dense_tf(steps):
    if steps[-1][0] in req_dense:
        steps.insert(-1, ("DenseTransformer", DenseTransformer))
    return steps

In [39]:
# Preallocate a list of models to train
models = []

In [40]:
# Try Various Classifers
import re
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier

# One-Hot + Continous Data
classifers = [
    KNeighborsClassifier(),
    LinearSVC(),
    SVC(kernel="rbf"),
    SVC(kernel="poly"),
    SVC(kernel="sigmoid"),
    #GaussianProcessClassifier(),
    AdaBoostClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    #QuadraticDiscriminantAnalysis(),
]

for c in classifers:
    steps = [
        ("Std&OneHot", ColumnTransformer(col_tf["standard"]+col_tf["OneHotEncoder"])),
        (type(c).__name__, c),
    ]
    steps = add_dense_tf(steps)
    p = Pipeline(steps, verbose=True)
    models.append(p)


In [41]:
# Ordinal Classifers
from sklearn.naive_bayes import CategoricalNB

# Build Pipeline - Catergorical Only
steps = [
    ("OrdinalCatOnly", ColumnTransformer(col_tf["OrdinalEncoder"])),
    ("CateCategoricalNB", CategoricalNB()),
]
steps = add_dense_tf(steps)
p = Pipeline(steps, verbose=True)
models.append(p)

In [42]:
# Regression Models
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor

regressors = [
    AdaBoostRegressor(),
    RandomForestRegressor(),
    #GaussianProcessRegressor(),
    LinearRegression(),
    Ridge(),
    ElasticNet(),
    Lasso(),
    KNeighborsRegressor(),
    LinearSVC(),
    DecisionTreeRegressor(),
]

for r in regressors:
    steps = [
        ("Std&OneHot", ColumnTransformer(col_tf["standard"]+col_tf["OneHotEncoder"])),
        (type(r).__name__, r),
    ]
    steps = add_dense_tf(steps)
    p = Pipeline(steps, verbose=True)
    models.append(p)

In [43]:
from copy import deepcopy
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier, KerasRegressor

def input_shape(tf, data):
    tf2 = deepcopy(tf)
    X = tf2.fit_transform(data)
    return X.shape[1]

# Preprocessor and Input Size
preprocess = ColumnTransformer(col_tf["standard"]+col_tf["OneHotEncoder"])
input_dim = input_shape(preprocess, wine_reviews)

# Build Regression Model
def mlp_regression():
    model = keras.Sequential([
        layers.Dense(100, activation="relu", input_dim=input_dim),
        layers.Dense(100, activation="relu"),
        layers.Dense(1, activation="linear"),
    ])
    model.compile(
        keras.optimizers.Adam(),
        loss=keras.losses.mean_squared_error,
        metrics=[
            keras.metrics.mean_squared_error,
            keras.metrics.mean_absolute_error,
        ]
    )
    return model

models.append(Pipeline([
    ("Std&OneHot", preprocess),
    ("KerasRegressor", KerasRegressor(build_fn=mlp_regression, epochs=2)),
   ], verbose=True
))

# Build Regression Model
def mlp_classifer():
    model = keras.Sequential([
        layers.Dense(100, activation="relu", input_dim=input_dim),
        layers.Dense(100, activation="relu"),
        layers.Dense(3, activation="softmax"),
    ])
    model.compile(
        keras.optimizers.Adam(),
        loss=keras.losses.categorical_crossentropy,
        metrics=[
            keras.metrics.categorical_accuracy,
            "accuracy",
        ],
    )
    return model

models.append(Pipeline([
    ("Std&OneHot", preprocess),
    ("KerasClassifier", KerasClassifier(build_fn=mlp_classifer, epochs=5)),
   ], verbose=True
))

In [44]:
# Build Label Pipelines
label_pipe = {}

def point_pipe(name, tf):
    return ColumnTransformer([(name, tf, make_column_selector("points"))])

# Bin all scores
label_pipe["Classifer"] = point_pipe("BinScores",
    KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
)

# Normalize Scores to a Gaussian
label_pipe["Regression"] = point_pipe("NormScores", StandardScaler())


In [45]:
# Enable Verbose and Parallel Models
for m in models:
    for k in m.get_params(deep=True).keys():
        new_param = {}
        if re.match(".*n_job", k):
            new_param[k] = -1
        if re.match(".*verbose", k):
            new_param[k] = True
        if new_param:
            m.set_params(**new_param)


Models to Train

In [46]:
models.reverse()
for idx, m in enumerate(models):
    label = " -> ".join([s[0] for s in m.steps])
    print(f"{idx:3d}: {label}")

  0: Std&OneHot -> KerasClassifier
  1: Std&OneHot -> KerasRegressor
  2: Std&OneHot -> DecisionTreeRegressor
  3: Std&OneHot -> LinearSVC
  4: Std&OneHot -> KNeighborsRegressor
  5: Std&OneHot -> Lasso
  6: Std&OneHot -> ElasticNet
  7: Std&OneHot -> Ridge
  8: Std&OneHot -> LinearRegression
  9: Std&OneHot -> RandomForestRegressor
 10: Std&OneHot -> AdaBoostRegressor
 11: OrdinalCatOnly -> CateCategoricalNB
 12: Std&OneHot -> RandomForestClassifier
 13: Std&OneHot -> DecisionTreeClassifier
 14: Std&OneHot -> AdaBoostClassifier
 15: Std&OneHot -> SVC
 16: Std&OneHot -> SVC
 17: Std&OneHot -> SVC
 18: Std&OneHot -> LinearSVC
 19: Std&OneHot -> KNeighborsClassifier


Train Models

In [47]:
import sys
import re
from sklearn.base import is_classifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.metrics import mean_squared_error, accuracy_score


# Split into Test/Train Sets
train, test = train_test_split(wine_reviews, test_size=0.2, shuffle=True)

for model in models:
    # Transform Labels based on Model Type
    if is_classifier(model) or re.match(".*classifier", model.steps[-1][0], flags=re.IGNORECASE):
        y_pipe = label_pipe["Classifer"]
        model_type = "classifer"
    else:
        y_pipe = label_pipe["Regression"]
        model_type = "regression"

    trainY = y_pipe.fit_transform(train).ravel()
    testY =  y_pipe.transform(test).ravel()

    model_name = model.steps[-1][0]
    print(f"\nTraining: {model_name} - {model_type}")

    try:
        model.fit(train, trainY)
        if model_type is "classifer":
            print(f"Train: {model.score(train, trainY)}")
            print(f"Test: {model.score(test, testY)}")
        else:
            print(f"Train: {mean_squared_error(trainY, model.predict(train))} - MSE")
            print(f"Test: {mean_squared_error(testY, model.predict(test))} - MSE")

    except:
        print(f"Error Training Model")
        print(sys.exc_info())


Training: KerasClassifier - classifer
[Pipeline] ........ (step 1 of 2) Processing Std&OneHot, total=   2.0s
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
[Pipeline] ... (step 2 of 2) Processing KerasClassifier, total=  12.4s
Train: 0.9392753839492798
Test: 0.7918367385864258

Training: KerasRegressor - regression
[Pipeline] ........ (step 1 of 2) Processing Std&OneHot, total=   0.1s
Epoch 1/2
Epoch 2/2
[Pipeline] .... (step 2 of 2) Processing KerasRegressor, total=   5.2s
Train: 0.2612545411890727 - MSE
Test: 0.4949578826285108 - MSE

Training: DecisionTreeRegressor - regression
[Pipeline] ........ (step 1 of 2) Processing Std&OneHot, total=   0.1s
[Pipeline]  (step 2 of 2) Processing DecisionTreeRegressor, total=   3.9s
Train: 0.00023300095302142864 - MSE
Test: 0.7463288368524216 - MSE

Training: LinearSVC - classifer
[Pipeline] ........ (step 1 of 2) Processing Std&OneHot, total=   0.1s
[LibLinear][Pipeline] ......... (step 2 of 2) Processing LinearSVC, total=   0.7s
Train: 0.9

Save Trained Models to Disk

In [63]:
from pathlib import Path

for model in models:
    model_name = model.steps[-1][0]
    model_file = Path("../models", model_name)
    try:
        with open(model_file.with_suffix(".pickle"), "wb") as file:
            pickle.dump(model, file)
    except:
        # Rm empty file
        model_file.with_suffix(".pickle").unlink()

        # Save Keras Model
        model.steps[-1][1].model.save(model_file.with_suffix(".h5"))