In [1]:
# Load Dataset
import pandas
import re

# Load Kaggle Wine Dataset: https://www.kaggle.com/zynicide/wine-reviews
try:
  wine_reviews = pandas.read_csv("../data/winemag-data-130k-v2.csv")
except:
  wine_reviews = pandas.read_csv("https://drive.google.com/uc?export=download&id=1UFKyzq8aTg-1hgYVxVA0bm7D2mmKqSk9")

# Parse Year from Title
year = []
for title in wine_reviews.title:
    year_match = re.search("(\d{4})", title)
    if year_match:
        year.append(year_match.group(1))
    else:
        year.append(None)

wine_reviews.insert(wine_reviews.shape[1], value=year, column="year")

# Drop incomplete data
wine_reviews.drop(columns="Unnamed: 0", inplace=True)
wine_reviews.dropna(axis=0, inplace=True)

# Correct Data Types
wine_reviews = wine_reviews.astype({
    "country": "category",
    "description": "string",
    "designation": "category",
    "points": "int64",
    "price": "float64",
    "province": "category",
    "region_1": "category",
    "region_2": "category",
    "taster_name": "category",
    "taster_twitter_handle": "category",
    "title": "string",
    "variety": "category",
    "winery": "category",
    "year": "int64",
})

In [2]:
# Transfom and Featurize Data
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import *
from sklearn.pipeline import Pipeline

# Build list of column transformers
col_tf = {}

# Drop Points
col_tf["standard"] = [
    ("points", "drop", make_column_selector("points")),
    ("price", PowerTransformer(method="box-cox"), make_column_selector("price")),
    ("year", MinMaxScaler(), make_column_selector("year")),
    ("taster", "drop", make_column_selector("taster_*")),
]

# Build OneHot and Ordinal Encoders for Catergorial Data
col_tf["OneHotEncoder"] = []
col_tf["OrdinalEncoder"] = []
for feat in wine_reviews.select_dtypes(include=['category']).columns:
    categories = wine_reviews[feat].unique()
    onehot = OneHotEncoder(categories=[categories])
    ordinal = OrdinalEncoder(categories=[categories])
    col_tf["OneHotEncoder"].append((feat, onehot, make_column_selector(feat)))
    col_tf["OrdinalEncoder"].append((feat, ordinal, make_column_selector(feat)))

In [3]:
from sklearn.preprocessing import FunctionTransformer

# Sparse to Dense Transformer
DenseTransformer = FunctionTransformer(
    func = lambda x: x.toarray(),
    accept_sparse=True,
)

# Requires Dense
req_dense = [
    "GaussianProcessClassifier",
    "GaussianProcessRegressor",
    "QuadraticDiscriminantAnalysis",
]

def add_dense_tf(steps):
    if steps[-1][0] in req_dense:
        steps.insert(-1, ("DenseTransformer", DenseTransformer))
    return steps

In [4]:
# Preallocate a list of models to train
models = []

In [5]:
# Try Various Classifers
import re
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier

# One-Hot + Continous Data
classifers = [
    KNeighborsClassifier(),
    LinearSVC(),
    SVC(kernel="rbf"),
    SVC(kernel="poly"),
    SVC(kernel="sigmoid"),
    #GaussianProcessClassifier(),
    AdaBoostClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    #QuadraticDiscriminantAnalysis(),
    MLPClassifier(hidden_layer_sizes=(10000, 100,), activation="relu"),
]

for c in classifers:
    steps = [
        ("Std&OneHot", ColumnTransformer(col_tf["standard"]+col_tf["OneHotEncoder"])),
        (type(c).__name__, c),
    ]
    steps = add_dense_tf(steps)
    p = Pipeline(steps, verbose=True)
    models.append(p)

steps[0][1].fit_transform(wine_reviews).shape

(22047, 12451)

In [6]:
# Ordinal Classifers
from sklearn.naive_bayes import CategoricalNB

# Build Pipeline - Catergorical Only
steps = [
    ("OrdinalCatOnly", ColumnTransformer(col_tf["OrdinalEncoder"])),
    ("CateCategoricalNB", CategoricalNB()),
]
steps = add_dense_tf(steps)
p = Pipeline(steps, verbose=True)
models.append(p)

In [7]:
# Regression Models
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor

regressors = [
    AdaBoostRegressor(),
    RandomForestRegressor(),
    #GaussianProcessRegressor(),
    LinearRegression(),
    Ridge(),
    ElasticNet(),
    Lasso(),
    KNeighborsRegressor(),
    MLPRegressor(),
    LinearSVC(),
    DecisionTreeRegressor(),
]

for r in regressors:
    steps = [
        ("Std&OneHot", ColumnTransformer(col_tf["standard"]+col_tf["OneHotEncoder"])),
        (type(r).__name__, r),
    ]
    steps = add_dense_tf(steps)
    p = Pipeline(steps, verbose=True)
    models.append(p)

In [8]:
# Build Label Pipelines
label_pipe = {}

def point_pipe(name, tf):
    return ColumnTransformer([(name, tf, make_column_selector("points"))])

# Bin all scores
label_pipe["Classifer"] = point_pipe("BinScores",
    KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
)

# Normalize Scores to a Gaussian
label_pipe["Regression"] = point_pipe("NormScores", StandardScaler())


In [9]:
# Enable Verbose and Parallel Models
for m in models:
    for k in m.get_params(deep=True).keys():
        new_param = {}
        if re.match(".*n_job", k):
            new_param[k] = -1
        if re.match(".*verbose", k):
            new_param[k] = True
        if new_param:
            m.set_params(**new_param)


Models to Train

In [10]:
for idx, m in enumerate(models):
    label = " -> ".join([s[0] for s in m.steps])
    print(f"{idx:3d}: {label}")

  0: Std&OneHot -> KNeighborsClassifier
  1: Std&OneHot -> LinearSVC
  2: Std&OneHot -> SVC
  3: Std&OneHot -> SVC
  4: Std&OneHot -> SVC
  5: Std&OneHot -> AdaBoostClassifier
  6: Std&OneHot -> DecisionTreeClassifier
  7: Std&OneHot -> RandomForestClassifier
  8: Std&OneHot -> MLPClassifier
  9: OrdinalCatOnly -> CateCategoricalNB
 10: Std&OneHot -> AdaBoostRegressor
 11: Std&OneHot -> RandomForestRegressor
 12: Std&OneHot -> LinearRegression
 13: Std&OneHot -> Ridge
 14: Std&OneHot -> ElasticNet
 15: Std&OneHot -> Lasso
 16: Std&OneHot -> KNeighborsRegressor
 17: Std&OneHot -> MLPRegressor
 18: Std&OneHot -> LinearSVC
 19: Std&OneHot -> DecisionTreeRegressor


Train Models

In [11]:
import sys
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.base import is_classifier

# Split into Test/Train Sets
train, test = train_test_split(wine_reviews, test_size=0.2, shuffle=True)

for model in models:
    # Transform Labels based on Model Type
    if is_classifier(model):
        y_pipe = label_pipe["Classifer"]
    else:
        y_pipe = label_pipe["Regression"]

    trainY = y_pipe.fit_transform(train).ravel()
    testY =  y_pipe.transform(test).ravel()

    print("\nTraining: %s" % model.steps[-1][0])

    try:
        model.fit(train, trainY)
        print(f"Train: {model.score(train, trainY)}")
        print(f"Test: {model.score(test, testY)}")
    except:
        print(f"Error Training Model")
        print(sys.exc_info())


Training: KNeighborsClassifier
[Pipeline] ........ (step 1 of 2) Processing Std&OneHot, total=   2.8s
[Pipeline]  (step 2 of 2) Processing KNeighborsClassifier, total=   0.0s
Train: 0.8459488575154505
Test: 0.7902494331065759

Training: LinearSVC
[Pipeline] ........ (step 1 of 2) Processing Std&OneHot, total=   0.1s
[LibLinear][Pipeline] ......... (step 2 of 2) Processing LinearSVC, total=   0.7s
Train: 0.9263480183704712
Test: 0.8058956916099773

Training: SVC
[Pipeline] ........ (step 1 of 2) Processing Std&OneHot, total=   0.1s
[LibSVM][Pipeline] ............... (step 2 of 2) Processing SVC, total=  20.3s
Train: 0.8267279015705619
Test: 0.8081632653061225

Training: SVC
[Pipeline] ........ (step 1 of 2) Processing Std&OneHot, total=   0.1s
[LibSVM][Pipeline] ............... (step 2 of 2) Processing SVC, total=  22.2s
Train: 0.8562680728014969
Test: 0.8163265306122449

Training: SVC
[Pipeline] ........ (step 1 of 2) Processing Std&OneHot, total=   0.1s
[LibSVM][Pipeline] ...........