In [1]:
# Import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
import pickle
import nltk
from nltk.corpus import wordnet as wn

In [2]:
# Load the punctuation-cleaned review dataset (path is relative to the notebook's working directory)
df = pd.read_csv(
    "../Resources/winemag-data-130k-cleanedPunctuation.csv"
)

In [3]:
# Keep only the four columns this notebook works with.
# (An earlier experiment that restricted rows to points_group == '90-95' is disabled.)
df_filtered = df.loc[:, ['description', 'variety', 'country', 'winery']]

In [4]:
# Remove rows with a missing value in any kept column, then remove exact duplicate rows
df_filtered = df_filtered.dropna(how='any').drop_duplicates()

In [5]:
# Features are the free-text descriptions; the target is the 'variety' column
X, y = df_filtered['description'], df_filtered['variety']

In [78]:
from sklearn import preprocessing
from tensorflow.keras.utils import to_categorical

# Step 1: Map every variety name to an integer class id
label_encoder = preprocessing.LabelEncoder()
encoded_y = label_encoder.fit_transform(y)

# Step 2: Convert encoded labels to one-hot-encoding
y_categorical = to_categorical(encoded_y)

# Row-aligned table pairing each original label with its integer class id
y_list = pd.DataFrame({'Label': y.values, 'Class': encoded_y})

In [7]:
# Create both text-feature stages up front, then run the descriptions through them
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()

# Stage 1: learn the vocabulary and produce raw token counts per description
X_counts = count_vect.fit_transform(X)
# Stage 2: re-weight the counts with TF-IDF
X_tfidf = tfidf_transformer.fit_transform(X_counts)


In [8]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    y_categorical,
    test_size=0.2,
    random_state=100,
)

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
 y_train.shape

In [None]:
 y_test.shape

In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Derive the layer sizes from the training arrays instead of hard-coding them,
# so the network still matches the data if the vocabulary or label set changes.
n_features = X_train.shape[1]  # width of the TF-IDF feature matrix
n_classes = y_train.shape[1]   # width of the one-hot encoded target

# One hidden layer of 100 ReLU units, then a softmax over all classes
model = Sequential()
model.add(Dense(units=100, activation='relu', input_dim=n_features))
model.add(Dense(units=n_classes, activation='softmax'))

In [10]:
# Configure training: categorical cross-entropy loss, Adam optimizer,
# and accuracy reported alongside the loss
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=['accuracy'],
)

In [12]:
# Train on the training split for three epochs with shuffling enabled;
# verbose=2 prints one summary line per epoch
model.fit(X_train, y_train, epochs=3, shuffle=True, verbose=2)

Train on 95935 samples
Epoch 1/3
95935/95935 - 150s - loss: 2.6273 - accuracy: 0.4257
Epoch 2/3
95935/95935 - 158s - loss: 1.5534 - accuracy: 0.6366
Epoch 3/3
95935/95935 - 165s - loss: 1.1894 - accuracy: 0.7137


<tensorflow.python.keras.callbacks.History at 0x13a4d9d5048>

In [13]:
# Persist the trained network to "dl_v2.h5" in the current working directory
model.save(filepath="dl_v2.h5")

In [14]:
# Load the model
from tensorflow.keras.models import load_model

# Rebuild the network from the saved file, replacing the in-memory `model`
model = load_model(filepath="dl_v2.h5")

In [15]:
# Evaluate the model on the held-out test split (X_test / y_test), not the training data
model_loss, model_accuracy = model.evaluate(
    X_test,
    y_test,
    verbose=2,
)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

23984/23984 - 13s - loss: 1.6025 - accuracy: 0.6044
Loss: 1.6025264290987133, Accuracy: 0.6044029593467712


In [23]:
# Persist the fitted text-preprocessing objects so they can be reloaded for inference
import pickle

# NOTE: the file names are crossed relative to their contents and are kept
# as-is so previously written artifacts keep loading:
#   "tokenizer.sklearn"  holds the fitted CountVectorizer
#   "vectorizer.sklearn" holds the fitted TfidfTransformer
vectorizer_file = "tokenizer.sklearn"
with open(vectorizer_file, 'wb') as fh:  # context manager guarantees the handle is closed
    pickle.dump(count_vect, fh)

tokenizer_file = "vectorizer.sklearn"
with open(tokenizer_file, 'wb') as fh:
    pickle.dump(tfidf_transformer, fh)


In [51]:
# Reusing model
import pickle

# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load artifacts that this notebook wrote itself.
with open(vectorizer_file, 'rb') as fh:  # context manager guarantees the handle is closed
    vectorizer = pickle.load(fh)
with open(tokenizer_file, 'rb') as fh:
    tokenizer = pickle.load(fh)
model = load_model("dl_v2.h5")

# Push the raw text through the same count -> TF-IDF steps used for training
user_input = ['fruity chocolate italy']
X_new = vectorizer.transform(user_input)
X_new = tokenizer.transform(X_new)
# Output of the softmax layer: one score per class for each input row
result = model.predict(X_new)



In [80]:
# Pick the highest-scoring class for each input row.
# `Sequential.predict_classes` was removed in TensorFlow 2.6; taking the argmax
# of the softmax output gives the same ids and needs only one forward pass
# (the original called the model twice).
predicted_ids = np.argmax(model.predict(X_new), axis=-1)
predicted_class = pd.DataFrame(predicted_ids, columns=['Class'])
print(f"Predicted class: {predicted_ids}")

Predicted class: [470]


In [79]:
# Display the Label / Class lookup table built during label encoding
y_list

Unnamed: 0,Label,Class
0,White Blend,684
1,Portuguese Red,447
2,Pinot Gris,434
3,Riesling,476
4,Pinot Noir,438
...,...,...
119914,Riesling,476
119915,Pinot Noir,438
119916,Gewürztraminer,209
119917,Pinot Gris,434


In [90]:
# Translate the predicted class ids back to their labels.
# y_list repeats each (Label, Class) pair once per dataset row, so collapse it
# to unique pairs *before* joining: the join stays small, and two inputs that
# share a predicted class each keep their own output row instead of being
# merged into one by a post-join drop_duplicates().
class_lookup = y_list.drop_duplicates()
result = predicted_class.merge(class_lookup, on='Class', how='left')
result = result['Label']
result

0    Red Blend
Name: Label, dtype: object