# Exploring Wine Review Data

In [17]:
# Import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
import pickle
import nltk
from nltk.corpus import wordnet as wn

In [18]:
# Read the wine review CSV into a DataFrame.
# The path is relative, so it resolves against the notebook's working directory.
df = pd.read_csv("../Resources/winemag-data-130k-cleanedPunctuation.csv")

In [3]:
# Drop the 'Unnamed: 0' column (presumably an index saved by an earlier CSV export).
# errors='ignore' makes the cell safe to re-run: a bare `del df[...]` raises
# KeyError the second time because the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df.head()

Unnamed: 0,country,description,designation,points,price,province,region_one,region_two,taster_name,taster_twitter_handle,title,variety,winery
0,Australia,This wine contains some material over 100 year...,Rare,100,350.0,Victoria,Rutherglen,,Joe Czerwinski,@JoeCz,Chambers Rosewood Vineyards NV Rare Muscat (Ru...,Muscat,Chambers Rosewood Vineyards
1,Italy,Thick as molasses and dark as caramelized brow...,Occhio di Pernice,100,210.0,Tuscany,Vin Santo di Montepulciano,,,,Avignonesi 1995 Occhio di Pernice (Vin Santo ...,Prugnolo Gentile,Avignonesi
2,France,This is a fabulous wine from the greatest Cham...,Brut,100,259.0,Champagne,Champagne,,Roger Voss,@vossroger,Krug 2002 Brut (Champagne),Champagne Blend,Krug
3,Italy,A perfect wine from a classic vintage the 2007...,Masseto,100,460.0,Tuscany,Toscana,,,,Tenuta dell'Ornellaia 2007 Masseto Merlot (Tos...,Merlot,Tenuta dell'Ornellaia
4,Portugal,This is the latest release of what has long be...,Barca-Velha,100,450.0,Douro,,,Roger Voss,@vossroger,Casa Ferreirinha 2008 Barca-Velha Red (Douro),Portuguese Red,Casa Ferreirinha


In [14]:
# Summary statistics for the numeric columns, rounded to whole numbers for readability
df.describe().round()

Unnamed: 0,points,price
count,129970.0,120974.0
mean,88.0,35.0
std,3.0,41.0
min,80.0,4.0
25%,86.0,17.0
50%,88.0,25.0
75%,91.0,42.0
max,100.0,3300.0


# Data Pre-processing

In [19]:
# Keep only the columns used downstream: the review text, the variety label,
# and two descriptive fields
needed_columns = ['description', 'variety', 'country', 'winery']
df_filtered = df.loc[:, needed_columns]

In [20]:
# Remove rows with any missing value, then collapse exact duplicate rows
df_filtered = df_filtered.dropna(how='any').drop_duplicates()
df_filtered.head()

Unnamed: 0,description,variety,country,winery
0,This wine contains some material over 100 year...,Muscat,Australia,Chambers Rosewood Vineyards
1,Thick as molasses and dark as caramelized brow...,Prugnolo Gentile,Italy,Avignonesi
2,This is a fabulous wine from the greatest Cham...,Champagne Blend,France,Krug
3,A perfect wine from a classic vintage the 2007...,Merlot,Italy,Tenuta dell'Ornellaia
4,This is the latest release of what has long be...,Portuguese Red,Portugal,Casa Ferreirinha


In [21]:
# Features are the free-text descriptions; the target is the grape variety
X, y = df_filtered['description'], df_filtered['variety']

In [22]:
# Encode the variety names as integer class ids
from sklearn import preprocessing
from tensorflow.keras.utils import to_categorical
label_encoder = preprocessing.LabelEncoder()
encoded_y = label_encoder.fit_transform(y)

# One-hot encode the class ids
y_categorical = to_categorical(encoded_y)

# Lookup table pairing every row's variety name ('Label') with its class id ('Class')
y_list = pd.DataFrame({'Label': y.tolist(), 'Class': encoded_y})

In [23]:
# Text preprocessing: turn the descriptions into raw term counts, then
# re-weight those counts with TF-IDF.
# NOTE(review): both transformers are fitted on the full corpus before the
# train/test split, so the vocabulary and IDF weights also reflect the test
# rows -- consider fitting on the training split only.
count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(X)
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_counts)

In [24]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    y_categorical,
    test_size=0.2,
    random_state=100,
)

In [12]:
# Dimensions of the training feature matrix
X_train.shape

(95935, 40858)

In [13]:
# Dimensions of the test feature matrix
X_test.shape

(23984, 40858)

In [14]:
 # Dimensions of the one-hot encoded training labels
 y_train.shape

(95935, 701)

In [15]:
 # Dimensions of the one-hot encoded test labels
 y_test.shape

(23984, 701)

# Deep learning 

In [9]:

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Take the layer sizes from the prepared data instead of hard-coding them:
# the vocabulary size and the number of varieties change whenever the input
# data or the preprocessing changes, and stale constants make fit() fail.
n_features = X_train.shape[1]
n_classes = y_train.shape[1]

# One hidden ReLU layer followed by a softmax over the variety classes
model = Sequential()
model.add(Dense(units=100, activation='relu', input_dim=n_features))
model.add(Dense(units=n_classes, activation='softmax'))

In [10]:
# Configure training: Adam optimizer, categorical cross-entropy loss on the
# one-hot labels, and accuracy reported alongside the loss
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=['accuracy'],
)

In [12]:
# Train on the training split for three epochs, reshuffling the rows each epoch
model.fit(x=X_train, y=y_train, epochs=3, shuffle=True, verbose=2)

Train on 95935 samples
Epoch 1/3
95935/95935 - 150s - loss: 2.6273 - accuracy: 0.4257
Epoch 2/3
95935/95935 - 158s - loss: 1.5534 - accuracy: 0.6366
Epoch 3/3
95935/95935 - 165s - loss: 1.1894 - accuracy: 0.7137


<tensorflow.python.keras.callbacks.History at 0x13a4d9d5048>

In [13]:
# Save the trained model to disk so it can be reloaded later without retraining
model.save("dl_v2.h5")

In [15]:
# Evaluate the model on the held-out test data (not the training data)
model_loss, model_accuracy = model.evaluate(X_test, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

23984/23984 - 13s - loss: 1.6025 - accuracy: 0.6044
Loss: 1.6025264290987133, Accuracy: 0.6044029593467712


In [30]:
# Persist the fitted text transformers so new text can be vectorised later
# without refitting.
# NOTE: the variable names and file names are crossed ("tokenizer.sklearn"
# holds the CountVectorizer, "vectorizer.sklearn" holds the TfidfTransformer).
# The paths are kept unchanged because other code may already load them.
import pickle

# `with` closes each file handle (and flushes the data) even if dump() fails;
# the bare open() calls used before were never closed.
vectorizer_file = "tokenizer.sklearn"
with open(vectorizer_file, 'wb') as fh:
    pickle.dump(count_vect, fh)

tokenizer_file = "vectorizer.sklearn"
with open(tokenizer_file, 'wb') as fh:
    pickle.dump(tfidf_transformer, fh)

In [31]:
# Reload the saved network from disk.
# The import lives in this cell because load_model is otherwise only imported
# by a later cell, so a top-to-bottom run would fail here with NameError.
from tensorflow.keras.models import load_model
model = load_model("dl_v2.h5")

In [32]:
# Reusing model: load the fitted text transformers back from disk
import pickle
from tensorflow.keras.models import load_model

# Only unpickle files this notebook wrote itself: pickle.load can execute
# arbitrary code from a tampered file.
# `with` guarantees the file handles are closed after reading.
with open(vectorizer_file, 'rb') as fh:
    vectorizer = pickle.load(fh)  # the fitted CountVectorizer
with open(tokenizer_file, 'rb') as fh:
    tokenizer = pickle.load(fh)  # the fitted TfidfTransformer


In [40]:
# Run a sample description through the same counts -> TF-IDF pipeline used
# for training, then ask the network for per-class probabilities
user_input = ['semisweetchocolate']
X_new = tokenizer.transform(vectorizer.transform(user_input))
result = model.predict(X_new)
result

array([[3.61994695e-04, 1.57652795e-03, 1.19487883e-03, 4.56790985e-06,
        7.17451563e-04, 3.91342037e-04, 1.70623916e-05, 2.43123411e-03,
        4.84761476e-05, 3.92587943e-04, 3.23136890e-04, 4.75713714e-05,
        1.76917051e-03, 3.22498847e-03, 2.36443779e-03, 8.37505737e-04,
        1.09817454e-04, 5.09041711e-04, 1.02509419e-03, 1.10452485e-04,
        1.21345348e-03, 1.32164176e-04, 6.57803437e-04, 9.38081765e-04,
        2.82378111e-04, 1.68485241e-03, 2.48682406e-03, 1.77100796e-04,
        1.91984698e-03, 1.04622450e-03, 4.58072191e-06, 3.13404901e-03,
        2.25078012e-03, 2.89478130e-03, 1.16702577e-04, 1.49583007e-04,
        6.27127651e-04, 4.85585997e-06, 1.62799071e-04, 1.46683794e-03,
        1.19194132e-03, 3.68533947e-05, 7.20304810e-03, 1.43133439e-04,
        1.21478908e-04, 4.19714979e-06, 3.79407675e-06, 1.04182305e-04,
        1.00250077e-03, 1.04489423e-04, 2.34914315e-03, 1.21748797e-03,
        4.65232115e-05, 4.50103425e-06, 4.44115176e-06, 2.898968

In [39]:
# Decode the result
# Sequential.predict_classes is deprecated and removed in newer TensorFlow
# releases; taking the argmax over the softmax output is the replacement and
# works on old and new versions alike. It is computed once and reused instead
# of running the network twice.
class_ids = np.argmax(model.predict(X_new), axis=-1)
predicted_class = pd.DataFrame(class_ids, columns=['Class'])
print(f"Predicted class: {class_ids}")

# Map the class id back to its variety name through the (Label, Class) table;
# that table has one row per training example, hence the drop_duplicates.
result = predicted_class.merge(y_list, on='Class', how='left').drop_duplicates()
result = result.Label
result

Predicted class: [125]


0    Chardonnay
Name: Label, dtype: object