In [3]:
# Import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
import pickle
import nltk
from nltk.corpus import wordnet as wn

In [4]:
# Read data
df = pd.read_csv("../Resources/winemag-data-130k-cleanedPunctuation.csv")

In [5]:
# Filter needed columns
# df_filtered = df.loc[df.points_group=='90-95']
df_filtered = df[['description', 'variety','country','winery']]

In [6]:
# Drop NAs and duplicates
df_filtered = df_filtered.dropna(how='any')
df_filtered=df_filtered.drop_duplicates()

In [7]:
X = df_filtered['description']
y = df_filtered['variety']

In [8]:
from sklearn import preprocessing
from tensorflow.keras.utils import to_categorical
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(y)
encoded_y = label_encoder.transform(y)

# Step 2: Convert encoded labels to one-hot-encoding
y_categorical = to_categorical(encoded_y)
y_list=list(zip(y,encoded_y))
y_list=pd.DataFrame(y_list, columns=['Label','Class'])

In [9]:
count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(X)
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_counts)


In [10]:
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y_categorical, test_size=0.2, random_state=100)

In [11]:
X_train.shape

(95935, 40858)

In [12]:
X_test.shape

(23984, 40858)

In [13]:
 y_train.shape

(95935, 701)

In [14]:
 y_test.shape

(23984, 701)

In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential()
model.add(Dense(units=100, activation='relu', input_dim=40858))
model.add(Dense(units=701, activation='softmax'))

In [16]:
model.compile(loss="categorical_crossentropy",
              optimizer="adam", metrics=['accuracy'])

In [17]:
# Use the training data to fit (train) the model
model.fit(
    X_train,
    y_train,
    epochs=3,
    shuffle=True,
    verbose=2
)

InvalidArgumentError: indices[1] = [0,39651] is out of order. Many sparse ops require sorted indices.
    Use `tf.sparse.reorder` to create a correctly ordered copy.

 [Op:SerializeManySparse]

In [None]:
# Save the model
model.save("dl_v2.h5")

In [None]:
# Load the model
from tensorflow.keras.models import load_model
model = load_model("dl_v2.h5")

In [None]:
# Evaluate the model using the training data
model_loss, model_accuracy = model.evaluate(X_test, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
# Save model
import pickle
vectorizer_file = "tokenizer.sklearn"
pickle.dump(count_vect, open(vectorizer_file,'wb'))

tokenizer_file = "vectorizer.sklearn"
pickle.dump(tfidf_transformer, open(tokenizer_file,'wb'))


In [None]:
# Reusing model
import pickle
vectorizer = pickle.load(open(vectorizer_file, 'rb'))
tokenizer = pickle.load(open(tokenizer_file, 'rb'))
model = load_model("dl_v2.h5")

user_input=['fruity chocolate italy']
X_new = vectorizer.transform(user_input)
X_new = tokenizer.transform(X_new)
result = model.predict(X_new)

In [None]:
predicted_class=model.predict_classes(X_new)
predicted_class=pd.DataFrame(predicted_class,columns=['Class'])
print(f"Predicted class: {model.predict_classes(X_new)}")

In [None]:
y_list

In [None]:
result = predicted_class.merge(y_list,on='Class',how='left').drop_duplicates()
result = result.Label
result