In [120]:
# Load Dataset
import pandas
import re

# Load Kaggle Wine Dataset: https://www.kaggle.com/zynicide/wine-reviews
try:
  wine_reviews = pandas.read_csv("../data/winemag-data-130k-v2.csv")
except:
  wine_reviews = pandas.read_csv("https://drive.google.com/uc?export=download&id=1UFKyzq8aTg-1hgYVxVA0bm7D2mmKqSk9")

# Parse Year from Title
year = []
for title in wine_reviews.title:
    year_match = re.search("(\d{4})", title)
    if year_match:
        year.append(year_match.group(1))
    else:
        year.append(None)

wine_reviews.insert(wine_reviews.shape[1], value=year, column="year")

# Drop incomplete data
wine_reviews.drop(columns="Unnamed: 0", inplace=True)
wine_reviews.dropna(axis=0, inplace=True)

# Correct Data Types
wine_reviews = wine_reviews.astype({
    "country": "category",
    "description": "string",
    "designation": "category",
    "points": "int64",
    "price": "float64",
    "province": "category",
    "region_1": "category",
    "region_2": "category",
    "taster_name": "category",
    "taster_twitter_handle": "category",
    "title": "string",
    "variety": "category",
    "winery": "category",
    "year": "int64",
})

In [121]:
# Transfom and Featurize Data
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import *
from sklearn.pipeline import Pipeline

# Build list of column transformers
col_tf = {}

# Drop Points
col_tf["standard"] = [
    ("dropPoints", "drop", make_column_selector("points")),
    ("PowerTransform", PowerTransformer(), make_column_selector("price")),
    ("MinMaxScaler", MinMaxScaler(), make_column_selector("year")),
    ("dropTaster", "drop", make_column_selector("taster_*")),
]

# Build OneHot and Ordinal Encoders for Catergorial Data
col_tf["OneHotEncoder"] = []
col_tf["OrdinalEncoder"] = []
for feat in wine_reviews.select_dtypes(include=['category']).columns:
    categories = wine_reviews[feat].unique()
    onehot = OneHotEncoder(categories=[categories])
    ordinal = OrdinalEncoder(categories=[categories])
    col_tf["OneHotEncoder"].append((f"OneHotVector-{feat}", onehot, make_column_selector(feat)))
    col_tf["OrdinalEncoder"].append((f"OrdinalEncoder-{feat}", ordinal, make_column_selector(feat)))


In [122]:
# Preallocate a list of models to train
models = []

In [123]:
# Try Various Classifers
import re
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier

# One-Hot + Continous Data
classifers = [
    KNeighborsClassifier(),
    LinearSVC(),
    SVC(kernel="rbf"),
    SVC(kernel="poly"),
    SVC(kernel="sigmoid"),
    GaussianProcessClassifier(),
    AdaBoostClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    QuadraticDiscriminantAnalysis(),
    MLPClassifier(hidden_layer_sizes=(10000, 100,), activation="relu"),
]

for c in classifers:
    steps = [
        ("Featurize", ColumnTransformer(col_tf["standard"]+col_tf["OneHotEncoder"])),
        (type(c).__name__, c),
    ]
    p = Pipeline(steps, verbose=True)
    models.append(p)

In [124]:
# Ordinal Classifers
from sklearn.naive_bayes import CategoricalNB

# Restrict to catergorical (Will Nullify Most of col_tf["standard"])
catOnly = ("catOnly", "drop", make_column_selector(dtype_include="category"))

# Build Pipeline
steps = [
    ("Featurize", ColumnTransformer([catOnly]+col_tf["standard"]+col_tf["OrdinalEncoder"])),
    ("CateCategoricalNB", CategoricalNB()),
]
p = Pipeline(steps, verbose=True)
models.append(p)

In [125]:
# Regression Models
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor

regressors = [
    AdaBoostRegressor(),
    RandomForestRegressor(),
    GaussianProcessRegressor(),
    LinearRegression(),
    Ridge(),
    ElasticNet(),
    Lasso(),
    KNeighborsRegressor(),
    MLPRegressor(),
    LinearSVC(),
    DecisionTreeRegressor(),
]

for r in regressors:
    steps = [
        ("Featurize", ColumnTransformer(col_tf["standard"]+col_tf["OneHotEncoder"])),
        (type(r).__name__, r),
    ]
    p = Pipeline(steps, verbose=True)
    models.append(p)

In [126]:
# Build Label Pipelines
label_pipe = {}

def point_pipe(name, tf):
    return ColumnTransformer([(name, tf, make_column_selector("points"))])

# Bin all scores
label_pipe["Classifer"] = point_pipe("BinScores",
    KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
)

# Normalize Scores to a Gaussian
label_pipe["Regression"] = point_pipe("NormScores", StandardScaler())


In [127]:
# Enable Verbose and Parralell Models
for m in models:
    for k in m.get_params(deep=True).keys():
        new_param = {}
        if re.match(".*n_job", k):
            new_param[k] = -1
        if re.match(".*verbose", k):
            new_param[k] = True
        if new_param:
            m.set_params(**new_param)


Train Models

In [None]:
import sys
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.base import is_classifier

# Split into Test/Train Sets
train, test = train_test_split(wine_reviews, test_size=0.2, shuffle=True)

for model in models:
    # Transform Labels based on Model Type
    if is_classifier(model):
        y_pipe = label_pipe["Classifer"]
    else:
        y_pipe = label_pipe["Regression"]

    Y = y_pipe.fit_transform(train).ravel()

    print("Training: %s" % model.steps[-1][0])
    try:
        model.fit(trainX, trainY)
        print(f"Train: {model.score(trainX, trainY)}")
        print(f"Test: {model.score(testX, testY)}")
    except:
        print(f"Error Training Model")
        print(sys.exc_info())

Training: KNeighborsClassifier
[Pipeline] ......... (step 1 of 2) Processing Featurize, total=   1.7s
[Pipeline]  (step 2 of 2) Processing KNeighborsClassifier, total=   0.0s
Train: 0.7726370697964506
Test: 0.6696145124716554
Training: LinearSVC
[Pipeline] ......... (step 1 of 2) Processing Featurize, total=   0.2s
[LibLinear][Pipeline] ......... (step 2 of 2) Processing LinearSVC, total=   1.5s
Train: 0.888926688212281
Test: 0.6947845804988663
Training: SVC
[Pipeline] ......... (step 1 of 2) Processing Featurize, total=   0.2s
[LibSVM][Pipeline] ............... (step 2 of 2) Processing SVC, total=  30.9s
Train: 0.7876056018597267
Test: 0.7090702947845805
Training: SVC
[Pipeline] ......... (step 1 of 2) Processing Featurize, total=   0.2s
[LibSVM][Pipeline] ............... (step 2 of 2) Processing SVC, total=  34.8s
Train: 0.8257073198389748
Test: 0.7160997732426304
Training: SVC
[Pipeline] ......... (step 1 of 2) Processing Featurize, total=   0.2s
[LibSVM]