In [9]:
# Load Dataset
import pandas
try:
  # Prefer the local copy of the dataset when it is present.
  wine_reviews = pandas.read_csv("../data/winemag-data-130k-v2.csv")
except FileNotFoundError:
  # Fall back to the hosted copy only when the local file is missing.
  # Any other failure (malformed CSV, interrupt, ...) is surfaced rather
  # than being silently swallowed by a bare `except`.
  wine_reviews = pandas.read_csv("https://drive.google.com/uc?export=download&id=1UFKyzq8aTg-1hgYVxVA0bm7D2mmKqSk9")


In [10]:
import re

# Raw string: "\d" must reach the regex engine as a digit class instead of
# being parsed as an (invalid) string escape sequence.
year_pattern = re.compile(r"(\d{4})")

# Take the first run of four digits in each title as the year.
# NOTE(review): this also matches four-digit numbers that are not vintages;
# confirm that is acceptable for the analysis.
year = []
for title in wine_reviews.title:
    # Non-string titles (e.g. NaN for a missing value) cannot be searched;
    # treat them as "no year found".
    year_match = year_pattern.search(title) if isinstance(title, str) else None
    year.append(year_match.group(1) if year_match else None)

# Plain assignment appends "year" as the last column on the first run and
# overwrites it on a re-run, whereas DataFrame.insert raises if the column
# already exists.
wine_reviews["year"] = year

In [11]:
# Discard, in place, every row that has a missing value in at least one column
wine_reviews.dropna(axis="index", how="any", inplace=True)

In [12]:
# Mark categorical fields as such.
# (The variable name keeps its original spelling because later cells refer to it.)
catergorical_field = [
    "designation",
    "province",
    "region_1",
    "region_2",
    "taster_name",
    "variety",
    "winery",
    "country",
]

# Convert all listed columns to the pandas "category" dtype in one call;
# columns that are not listed keep their current dtype.
wine_reviews = wine_reviews.astype(dict.fromkeys(catergorical_field, "category"))


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from matplotlib import pyplot

# Train on categorical data only: each categorical column is replaced by its
# integer category code.
feature = pandas.concat([wine_reviews[field].cat.codes for field in catergorical_field], axis=1)
feature.columns = catergorical_field

# Fixed seed so the run is repeatable and every model is scored on the same
# train/test partition.
RANDOM_SEED = 0

# Candidate classifiers to compare. Estimators with internal randomness are
# seeded as well.
# NOTE(review): the saved output stops part-way through the SVC runs and the
# cell has no execution count; some of these estimators may be too expensive
# for this data size - confirm before a full re-run.
models = [
    KNeighborsClassifier(),
    LinearSVC(random_state=RANDOM_SEED),
    SVC(kernel="rbf"),
    SVC(kernel="poly"),
    SVC(kernel="sigmoid"),
    GaussianProcessClassifier(random_state=RANDOM_SEED),
    DecisionTreeClassifier(random_state=RANDOM_SEED),
    RandomForestClassifier(random_state=RANDOM_SEED),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
    MLPClassifier(hidden_layer_sizes=(100,), activation="relu", random_state=RANDOM_SEED),
]

# The labels and the split depend only on the bin count, so build them once
# instead of once per model.
nbin_list = range(3, 5)
splits = {}
for nbins in nbin_list:
    # Quantile-bin the points into at most `nbins` classes (duplicate bin
    # edges are dropped); the bin index is the class label.
    labels = pandas.qcut(wine_reviews["points"], nbins, duplicates='drop').cat.codes

    # Split into Test/Train Sets
    splits[nbins] = train_test_split(feature, labels, test_size=0.2, random_state=RANDOM_SEED)

# Record score
for model in models:
    train_score = []
    test_score = []
    for nbins in nbin_list:
        print(f"Training: {model} - {nbins}\n")
        trainX, testX, trainY, testY = splits[nbins]

        # Train Model (fit() refits from scratch, so reusing the instance is fine)
        model.fit(trainX, trainY)
        train_score.append(model.score(trainX, trainY))
        test_score.append(model.score(testX, testY))

    # One train curve and one test curve per model
    pyplot.plot(nbin_list, train_score, label=f"Train - {model}")
    pyplot.plot(nbin_list, test_score, label=f"Test - {model}")

pyplot.legend()
pyplot.xlabel("Number of Bins for Points")
pyplot.ylabel("Model Performance")


Training: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform') - 3

Training: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform') - 4

Training: LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0) - 3





Training: LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0) - 4





Training: SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False) - 3

Training: SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False) - 4

Training: SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='poly',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False) - 3

Training: SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='poly',
    max_iter=-1, probability=False, random_s

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from matplotlib import pyplot

# Train on categorical data only: each categorical column is replaced by its
# integer category code.
feature = pandas.concat([wine_reviews[field].cat.codes for field in catergorical_field], axis=1)
feature.columns = catergorical_field

# Fixed seed so the sweep is repeatable and every bin count is evaluated on
# the same row partition; otherwise the curve mixes the effect of the bin
# count with random split noise.
RANDOM_SEED = 0

# Build forest
model = RandomForestClassifier(random_state=RANDOM_SEED)

# Record score
train_score = []
test_score = []
nbin_list = range(2, 20)
for nbins in nbin_list:
    # Cut the points range into `nbins` equal-width intervals; the interval
    # index is the class label.
    labels = pandas.cut(wine_reviews["points"], nbins).cat.codes

    # Split into Test/Train Sets
    trainX, testX, trainY, testY = train_test_split(
        feature, labels, test_size=0.2, random_state=RANDOM_SEED
    )

    # Train Model (fit() refits from scratch, so reusing the instance is fine)
    model.fit(trainX, trainY)
    train_score.append(model.score(trainX, trainY))
    test_score.append(model.score(testX, testY))

# Create Plot
pyplot.plot(nbin_list, train_score, label="Train")
pyplot.plot(nbin_list, test_score, label="Test")
pyplot.legend()
pyplot.xlabel("Number of Bins for Points")
pyplot.ylabel("Model Performance")
