In [3]:
import pandas as pd
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt
import random
import pickle

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, KFold, StratifiedKFold
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, f1_score, classification_report, accuracy_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import CategoricalNB

from keras.layers import Dense, Activation
from keras.models import Sequential
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils

In [4]:
rum = pd.read_csv('Data/rum_12_2023.csv')
rum = rum[rum['Number_Reviews'] >= 5]
rum_taste_rating = rum[['Rating', 'Taste_Notes']].dropna()
rum_taste_rating = rum_taste_rating[rum_taste_rating['Taste_Notes'].map(lambda x: len(eval(x)) > 2)].reset_index(drop=True)
rum_taste_rating

Unnamed: 0,Rating,Taste_Notes
0,8.0,"['Vanilla', 'Sweet', 'Caramel', 'Coconut', 'Ba..."
1,7.0,"['Vanilla', 'Sweet', 'Caramel', 'Orange', 'Syn..."
2,7.5,"['Sweet', 'Caramel', 'Vanilla', 'Mild', 'Sugar..."
3,7.5,"['Caramel', 'Sweet', 'Vanilla', 'Woody', 'Alco..."
4,6.0,"['Vanilla', 'Caramel', 'Sweet', 'Spice', 'Cinn..."
...,...,...
5578,7.8,"['Black pepper', 'Smoky', 'Medicinal', 'Grass'..."
5579,7.9,"['Peat', 'Smoky', 'Bacon', 'Sweet', 'Fruity', ..."
5580,8.0,"['Red fruits', 'Menthol', 'Fruity', 'Medicinal..."
5581,7.2,"['Fruity', 'Herbal', 'Alcoholic', 'Spicy', 'Ma..."


In [5]:
rum = pd.read_csv('Data/rum_12_2023.csv')
rum = rum[rum['Number_Reviews'] >= 5]
top10 = list(rum['Country'].value_counts()[:5].index)
#top10.remove('unknown')

rum_taste_country = rum[rum['Country'].isin(top10)][['Country', 'Taste_Notes']].dropna()
rum_taste_country = rum_taste_country[rum_taste_country['Taste_Notes'].map(lambda x: len(eval(x)) > 2)].reset_index(drop=True)
rum_taste_country

Unnamed: 0,Country,Taste_Notes
0,barbados,"['Vanilla', 'Sweet', 'Caramel', 'Coconut', 'Ba..."
1,trinidad,"['Vanilla', 'Caramel', 'Sweet', 'Spice', 'Cinn..."
2,guyana,"['Sweet', 'Vanilla', 'Caramel', 'Round', 'Drie..."
3,jamaica,"['Banana', 'Ester', 'Spice', 'Peppery', 'Tropi..."
4,barbados,"['Woody', 'Vanilla', 'Caramel', 'Barrel', 'Str..."
...,...,...
2802,trinidad,"['Black pepper', 'Smoky', 'Medicinal', 'Grass'..."
2803,guyana,"['Peat', 'Smoky', 'Bacon', 'Sweet', 'Fruity', ..."
2804,trinidad,"['Red fruits', 'Menthol', 'Fruity', 'Medicinal..."
2805,trinidad,"['Fruity', 'Herbal', 'Alcoholic', 'Spicy', 'Ma..."


In [6]:
notes = defaultdict(int)
for t in rum_taste_rating['Taste_Notes']:
    for n in eval(t):
        notes[n] += 1
    
print({k: v for k, v in sorted(dict(notes).items(), key=lambda item: item[1], reverse=True)})
unique_notes = list(set(notes.keys()))

{'Woody': 2157, 'Vanilla': 1908, 'Sweet': 1091, 'Caramel': 1028, 'Spicy': 903, 'Fruity': 778, 'Tropical fruit': 739, 'Spice': 723, 'Dried fruit': 689, 'Roasted': 634, 'Oak': 568, 'Peppery': 535, 'Sugarcane': 521, 'Barrel': 482, 'Alcoholic': 477, 'Banana': 451, 'Ester': 436, 'Pineapple': 423, 'Citrus': 403, 'Dark chocolate': 399, 'Chocolate': 397, 'Honey': 394, 'Dry': 378, 'Leather': 374, 'Smoky': 361, 'Intense': 357, 'Vegetal': 350, 'Licorice': 344, 'Coconut': 297, 'Raisin': 291, 'Tobacco': 272, 'Coffee': 270, 'Floral': 252, 'Nutty': 246, 'Herbal': 242, 'Cinnamon': 238, 'Complex': 233, 'Bitter': 221, 'Fresh': 217, 'Grass': 201, 'Cocoa': 193, 'Mild': 183, 'Minty': 183, 'Burnt sugar': 178, 'Round': 173, 'Caramelized': 172, 'Anise': 168, 'Balanced': 166, 'Salty': 165, 'Rubber': 163, 'Tar': 162, 'Young wood': 156, 'Brown sugar': 152, 'Mango': 151, 'Funky': 150, 'Orange': 146, 'Prunes': 145, 'Oily': 142, 'Creamy': 140, 'Light': 140, 'Agricole': 139, 'Cherry': 139, 'Warm': 137, 'Toffee': 134

In [7]:
taste_output = {}

for note in sorted(unique_notes):
    arr = []
    for i in range(len(rum_taste_country)):
        if note in eval(rum_taste_country['Taste_Notes'][i]):
            arr.append(1)
        else:
            arr.append(0)
    taste_output[note] = arr

taste_matrix_top10 = pd.DataFrame(taste_output)
taste_matrix_top10 = taste_matrix_top10.loc[:, np.sum(taste_matrix_top10, axis=0).values > 3]
taste_matrix_top10

Unnamed: 0,Acidic,Agricole,Alcoholic,Allspice,Almond,Anise,Apple,Apricot,Armagnac,Astringent,...,Velvety,Walnut,Warm,Waxes,Whiskey,White chocolate,Wine,Woody,Yellow fruits,Young wood
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2802,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2803,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2804,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2805,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
taste_output = {}

for note in sorted(unique_notes):
    arr = []
    for i in range(len(rum_taste_rating)):
        if note in eval(rum_taste_rating['Taste_Notes'][i]):
            arr.append(1)
        else:
            arr.append(0)
    taste_output[note] = arr

taste_matrix = pd.DataFrame(taste_output)
taste_matrix = taste_matrix.loc[:, np.sum(taste_matrix, axis=0).values > 5]
taste_matrix = taste_matrix[list(taste_matrix_top10.columns)]
taste_matrix

Unnamed: 0,Acidic,Agricole,Alcoholic,Allspice,Almond,Anise,Apple,Apricot,Armagnac,Astringent,...,Velvety,Walnut,Warm,Waxes,Whiskey,White chocolate,Wine,Woody,Yellow fruits,Young wood
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5578,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5579,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5580,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5581,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
sum(taste_matrix.columns != taste_matrix_top10.columns)

0

In [19]:
taste_count = taste_matrix.sum(axis=0).to_frame(name='Count')
taste_count['Taste'] = taste_count.index
taste_count = taste_count.reset_index(drop=True)
taste_count = taste_count[['Taste', 'Count']]
taste_count.to_csv('Data/taste_counts.csv', index=False)

In [8]:
# split the data
X_train, X_test, Y_train, Y_test = train_test_split(taste_matrix, rum_taste_rating['Rating'], train_size=0.8, random_state=100, shuffle=True)

In [27]:
# basic linear regression
reg = LinearRegression().fit(X_train, Y_train)

lin_pred = reg.predict(X_test)

linear_mse = mean_squared_error(Y_test, lin_pred)
print(linear_mse)

0.43667026362482214


In [28]:
# basic SVR model
svr = SVR().fit(X_train, Y_train)

svr_pred = svr.predict(X_test)

svr_mse = mean_squared_error(Y_test, svr_pred)
print(svr_mse)

0.457000020855131


In [29]:
# SVR grid search

svr_params = {'gamma': [0.001, 0.01, 0.1, 1.0, 2.0, 5.0, 'auto', 'scale'],
              'C': list(np.arange(0.5 , 2.05, .05)),
              'epsilon': [0.001, 0.01, 0.1, 1.0]}

svr_grid = RandomizedSearchCV(SVR(),
                         svr_params,
                         n_iter = 250,
                         scoring = 'neg_mean_squared_error',
                         n_jobs = -1,
                         cv = 5,
                         random_state = 100)

svr_grid.fit(X_train, Y_train)
svr_grid_pred = svr_grid.predict(X_test)
svr_grid_mse = mean_squared_error(Y_test, svr_grid_pred)
print(svr_grid_mse)

0.4489909864803867


In [30]:
# basic lasso
lasso = Lasso(selection="random", random_state=100).fit(X_train, Y_train)

lasso_pred = lasso.predict(X_test)

lasso_mse = mean_squared_error(Y_test, lasso_pred)
print(lasso_mse)

0.6970115538838662


In [31]:
# lasso grid

lasso_params = {'alpha':[i/100000 for i in range(1,10001,2)]}

lasso_grid = RandomizedSearchCV(Lasso(),
                         lasso_params,
                         n_iter= 1000,
                         scoring = 'neg_mean_squared_error',
                         n_jobs = -1,
                         cv = 5,
                         random_state=100)



lasso_grid.fit(X_train, Y_train)
lasso_grid_pred = lasso_grid.predict(X_test)
lasso_grid_mse = mean_squared_error(Y_test, lasso_grid_pred)

print(lasso_grid_mse)

0.43447898718021805


In [32]:
# base random forest regressor
regr = RandomForestRegressor(random_state=100).fit(X_train, Y_train)

regr_pred = regr.predict(X_test)

regr_mse = mean_squared_error(Y_test, regr_pred)
print(regr_mse)

0.5120735983756094


In [36]:
# RF Regressor grid

regr_params = {
    'n_estimators' : list(np.arange(20, 250, 10)),
    'max_depth': [None] + list(np.arange(10, 510, 10)),
    'min_samples_split' : list(np.arange(2, 52, 2)),
    'max_features' : ['sqrt', 'log2', None] + list(np.arange(2, 102, 2)),
    'random_state': [100]
}

regr_grid = RandomizedSearchCV(
    RandomForestRegressor(),
    regr_params,
    n_iter = 1000,
    scoring = 'neg_mean_squared_error',
    n_jobs = -1,
    cv = 5,
    random_state=100
)

regr_grid.fit(X_train, Y_train)
regr_grid_pred = regr_grid.predict(X_test)
regr_grid_mse = mean_squared_error(Y_test, regr_grid_pred)
print(regr_grid_mse)

0.4581266472443143


In [37]:
# base keras NN
model = Sequential()
model.add(Dense(8, activation = 'relu', input_dim = X_train.shape[1]))
model.add(Dense(units = 1))

model.compile(optimizer = 'adam', loss = "mean_squared_error")
model.fit(X_train, Y_train, epochs = 100, verbose=0)

nn_pred = model.predict(X_test)

nn_mse = mean_squared_error(Y_test, nn_pred)
print(nn_mse)

0.6161157679115993


In [38]:
# tuning a random layers with random size
seed = 100
random.seed(seed)

kfold = KFold(n_splits= 5, shuffle=True, random_state=seed)


cv_scores = []
layers_ = []
dims_ = []

for _ in range(50):

    # generate random model params
    num_layers = random.randint(1,6)
    num_dims = [random.randrange(8,16,2) for _ in range(num_layers)]
    layers_.append(num_layers)
    dims_.append(num_dims)
    scores = []
    # run CV
    for train, test in kfold.split(X_train, Y_train):
        # build model
        model = Sequential()
        for i in range(num_layers):
            if i == 1:
                model.add(Dense(num_dims[i], activation = 'relu', input_dim = X_train.shape[1]))
            else:
                model.add(Dense(num_dims[i], activation = 'relu'))
        model.add(Dense(units = 1))
        model.compile(optimizer = 'adam', loss = "mean_squared_error")
        model.fit(X_train.iloc[train], Y_train.iloc[train], epochs = 100, verbose=0)
        score_ = model.evaluate(X_train.iloc[test], Y_train.iloc[test], verbose=0)
        scores.append(score_)
    
    cv_scores.append(np.mean(scores))
    
        
print(f'Best Configuration - Layers: {layers_[np.argmin(cv_scores)]}, Dims: {dims_[np.argmin(cv_scores)]}, 5-Fold CV MSE: {min(cv_scores)}')

Best Configuration - Layers: 1, Dims: [8], 5-Fold CV MSE: 0.6463635444641114


In [50]:
pickle.dump(lasso_grid, open('Models/taste_to_score.sav', 'wb'))

***

In [16]:
# split the data
X_train, X_test, Y_train, Y_test = train_test_split(taste_matrix_top10, rum_taste_country['Country'], train_size=0.8, random_state=100, shuffle=True)

In [40]:
# KNN base
neighbor = KNeighborsClassifier().fit(X_train, Y_train)
neighbor_pred = neighbor.predict(X_test)
neighbor_acc = accuracy_score(Y_test, neighbor_pred)
print(neighbor_acc)
print(classification_report(Y_test, neighbor_pred))

0.5106761565836299
              precision    recall  f1-score   support

    barbados       0.23      0.43      0.30        60
      guyana       0.43      0.35      0.38       115
     jamaica       0.58      0.69      0.63       149
  martinique       0.63      0.54      0.58       144
    trinidad       0.71      0.43      0.53        94

    accuracy                           0.51       562
   macro avg       0.52      0.49      0.49       562
weighted avg       0.55      0.51      0.52       562



In [41]:
knn_params = {
    'n_neighbors': list(range(2,51)),
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [10,20,30,40,50]
    
}

knn_grid = RandomizedSearchCV(KNeighborsClassifier(),
                         knn_params,
                         n_iter= 200,
                         scoring = 'accuracy',
                         n_jobs = -1,
                         cv = 3,
                         random_state=100)

knn_grid.fit(X_train, Y_train)
knn_grid_pred = knn_grid.predict(X_test)
knn_grid_acc = accuracy_score(Y_test, knn_grid_pred)
print(knn_grid_acc)
print(classification_report(Y_test, knn_grid_pred))
print(knn_grid.best_estimator_)

0.599644128113879
              precision    recall  f1-score   support

    barbados       0.41      0.62      0.49        60
      guyana       0.52      0.42      0.46       115
     jamaica       0.64      0.81      0.71       149
  martinique       0.64      0.64      0.64       144
    trinidad       0.83      0.43      0.56        94

    accuracy                           0.60       562
   macro avg       0.61      0.58      0.57       562
weighted avg       0.62      0.60      0.59       562

KNeighborsClassifier(algorithm='ball_tree', leaf_size=20, n_neighbors=43)


In [42]:
# Decision Tree base
dtc = DecisionTreeClassifier(random_state=100).fit(X_train, Y_train)
dtc_pred = dtc.predict(X_test)
dtc_acc = accuracy_score(Y_test, dtc_pred)
print(dtc_acc)
print(classification_report(Y_test, dtc_pred))

0.5177935943060499
              precision    recall  f1-score   support

    barbados       0.28      0.32      0.30        60
      guyana       0.37      0.36      0.36       115
     jamaica       0.63      0.66      0.64       149
  martinique       0.61      0.59      0.60       144
    trinidad       0.54      0.51      0.52        94

    accuracy                           0.52       562
   macro avg       0.49      0.49      0.49       562
weighted avg       0.52      0.52      0.52       562



In [43]:
# DTC Grid

dtc_params = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'splitter': ['best', 'random'],
    'min_samples_split': list(range(2,22,2)),
    'max_depth': [None] + list(range(5,205,5)),
    'max_features': [None, 'auto', 'sqrt', 'log2'] +  list(range(5,205,5)),
    'class_weight': ['balanced', None],
    'random_state': [100]
    
}

dtc_grid = RandomizedSearchCV(DecisionTreeClassifier(),
                         dtc_params,
                         n_iter= 1500,
                         scoring = 'accuracy',
                         n_jobs = -1,
                         cv = 3,
                         random_state=100)

dtc_grid.fit(X_train, Y_train)
dtc_grid_pred = dtc_grid.predict(X_test)
dtc_grid_acc = accuracy_score(Y_test, dtc_grid_pred)
print(dtc_grid_acc)
print(classification_report(Y_test, dtc_grid_pred))
print(dtc_grid.best_estimator_)

0.5480427046263345
              precision    recall  f1-score   support

    barbados       0.32      0.28      0.30        60
      guyana       0.38      0.55      0.45       115
     jamaica       0.70      0.70      0.70       149
  martinique       0.59      0.56      0.57       144
    trinidad       0.80      0.46      0.58        94

    accuracy                           0.55       562
   macro avg       0.56      0.51      0.52       562
weighted avg       0.58      0.55      0.55       562

DecisionTreeClassifier(criterion='log_loss', max_depth=20, max_features=30,
                       min_samples_split=16, random_state=100,
                       splitter='random')


In [44]:
# base Random Forest Classifier

clf = RandomForestClassifier().fit(X_train, Y_train)
clf_pred = clf.predict(X_test)
clf_acc = accuracy_score(Y_test, clf_pred)
print(clf_acc)
print(classification_report(Y_test, clf_pred))

0.6138790035587188
              precision    recall  f1-score   support

    barbados       0.35      0.35      0.35        60
      guyana       0.50      0.56      0.53       115
     jamaica       0.69      0.77      0.73       149
  martinique       0.66      0.64      0.65       144
    trinidad       0.78      0.56      0.65        94

    accuracy                           0.61       562
   macro avg       0.60      0.58      0.58       562
weighted avg       0.62      0.61      0.61       562



In [45]:
clf_params = {
    'n_estimators': list(range(50, 260, 5)),
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': [None] + list(range(5, 205, 5)),
    'min_samples_split': list(range(2, 30, 2)),
    'max_features': [None, 'auto', 'sqrt', 'log2'] +  list(range(5, 205, 5)),
    'class_weight': ['balanced', None],
    'random_state': [100]
    
}

clf_grid = RandomizedSearchCV(RandomForestClassifier(),
                         clf_params,
                         n_iter= 1500,
                         scoring = 'accuracy',
                         n_jobs = -1,
                         cv = 3,
                         random_state=100)

clf_grid.fit(X_train, Y_train)
clf_grid_pred = clf_grid.predict(X_test)
clf_grid_acc = accuracy_score(Y_test, clf_grid_pred)
print(clf_grid_acc)
print(classification_report(Y_test, clf_grid_pred))
print(clf_grid.best_estimator_)

0.6334519572953736
              precision    recall  f1-score   support

    barbados       0.45      0.42      0.43        60
      guyana       0.53      0.50      0.52       115
     jamaica       0.66      0.79      0.72       149
  martinique       0.66      0.71      0.68       144
    trinidad       0.83      0.56      0.67        94

    accuracy                           0.63       562
   macro avg       0.63      0.60      0.61       562
weighted avg       0.64      0.63      0.63       562

RandomForestClassifier(criterion='entropy', max_depth=95, max_features=5,
                       min_samples_split=26, n_estimators=225,
                       random_state=100)


In [17]:
# base NBC
nbc = CategoricalNB().fit(X_train, Y_train)
nbc_pred = nbc.predict(X_test)
nbc_acc = accuracy_score(Y_test, nbc_pred)
print(nbc_acc)
print(classification_report(Y_test, nbc_pred))

0.6405693950177936
              precision    recall  f1-score   support

    barbados       0.40      0.60      0.48        60
      guyana       0.53      0.57      0.55       115
     jamaica       0.81      0.76      0.78       149
  martinique       0.67      0.64      0.65       144
    trinidad       0.75      0.56      0.64        94

    accuracy                           0.64       562
   macro avg       0.63      0.63      0.62       562
weighted avg       0.66      0.64      0.65       562



In [47]:
# NBC grid

nbc_params = {
    'alpha': list(np.arange(0.1, 3.01, 0.01))
}

nbc_grid = GridSearchCV(CategoricalNB(),
                         nbc_params,
                         scoring = 'accuracy',
                         n_jobs = -1,
                         cv = 5)

nbc_grid.fit(X_train, Y_train)
nbc_grid_pred = nbc_grid.predict(X_test)
nbc_grid_acc = accuracy_score(Y_test, nbc_grid_pred)
print(nbc_grid_acc)
print(classification_report(Y_test, nbc_grid_pred))
print(nbc_grid.best_estimator_)

0.6281138790035588
              precision    recall  f1-score   support

    barbados       0.36      0.52      0.42        60
      guyana       0.51      0.56      0.53       115
     jamaica       0.80      0.76      0.78       149
  martinique       0.68      0.64      0.66       144
    trinidad       0.73      0.56      0.63        94

    accuracy                           0.63       562
   macro avg       0.61      0.61      0.61       562
weighted avg       0.65      0.63      0.64       562

CategoricalNB(alpha=0.1)


 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan na

In [48]:
# base keras NN
seed = 100
random.seed(seed)
encoder = LabelEncoder()
encoder.fit(rum_taste_country['Country'])
num_classes = 5

model = Sequential()
model.add(Dense(8, input_dim = X_train.shape[1], activation='relu'))
model.add(Dense(num_classes, activation='softmax'))
# Compile model

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, np_utils.to_categorical(encoder.transform(Y_train)), epochs = 100, verbose=0)
nn_pred = model.predict(X_test)
nn_pred = [encoder.inverse_transform([np.argmax(n)])[0] for n in nn_pred]
nn_acc = accuracy_score(Y_test, nn_pred)
print(nn_acc)
print(classification_report(Y_test, nn_pred))

0.608540925266904
              precision    recall  f1-score   support

    barbados       0.39      0.38      0.39        60
      guyana       0.51      0.54      0.53       115
     jamaica       0.70      0.71      0.71       149
  martinique       0.68      0.69      0.69       144
    trinidad       0.61      0.54      0.57        94

    accuracy                           0.61       562
   macro avg       0.58      0.57      0.58       562
weighted avg       0.61      0.61      0.61       562



In [49]:
# base keras NN
seed = 100
random.seed(seed)
encoder = LabelEncoder()
encoder.fit(rum_taste_country['Country'])
num_classes = 5

model = Sequential()
model.add(Dense(10, input_dim = X_train.shape[1], activation='relu'))
model.add(Dense(5, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))
# Compile model

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, np_utils.to_categorical(encoder.transform(Y_train)), epochs = 100, verbose=0)
nn_pred = model.predict(X_test)
nn_pred = [encoder.inverse_transform([np.argmax(n)])[0] for n in nn_pred]
nn_acc = accuracy_score(Y_test, nn_pred)
print(nn_acc)
print(classification_report(Y_test, nn_pred))

0.5729537366548043
              precision    recall  f1-score   support

    barbados       0.31      0.35      0.33        60
      guyana       0.45      0.48      0.47       115
     jamaica       0.67      0.66      0.67       149
  martinique       0.69      0.65      0.67       144
    trinidad       0.60      0.57      0.59        94

    accuracy                           0.57       562
   macro avg       0.54      0.54      0.54       562
weighted avg       0.58      0.57      0.58       562

