In [2]:
import numpy as np
from gensim.models import Word2Vec
import pandas as pd

In [6]:
df = pd.read_csv('../cleaned_wine_df.csv')

In [7]:
df.head(3)

Unnamed: 0,country,description,points,price,province,region_1,title,variety,winery,year
0,Italy,"Aromas include tropical fruit, broom, brimston...",87,19.0,Sicily & Sardinia,Etna,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,2013
1,Portugal,"This is ripe and fruity, a wine that is smooth...",87,15.0,Douro,,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,2011
2,US,"Tart and snappy, the flavors of lime flesh and...",87,14.0,Oregon,Willamette Valley,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,2013


In [8]:
df.columns

Index(['country', 'description', 'points', 'price', 'province', 'region_1',
       'title', 'variety', 'winery', 'year'],
      dtype='object')

1. Split the data into training and testing sets small enough to fit into memory, also adjust to a 5 point scale

In [9]:
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import text_to_word_sequence

# Function to adjust the scale of 'points' data
def points_to_scale(points, scale=5):
    points_norm = (points - np.min(points)) / (np.max(points) - np.min(points))
    return np.round(points_norm * scale + 1).astype(int)

# Apply transformation to points
df['points'] = points_to_scale(df['points'])

# Function to load data
def load_data(df, percentage_of_data=None):
    sentences = df['description']
    y = df['points']
    
    if percentage_of_data is not None:
        assert(percentage_of_data > 0 and percentage_of_data <= 100)
        len_data = int(percentage_of_data / 100 * len(sentences))
        sentences, y = sentences[:len_data], y[:len_data]
    
    sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size=0.2, random_state=42)
    X_train = [text_to_word_sequence(text) for text in sentences_train]
    X_test = [text_to_word_sequence(text) for text in sentences_test]

    return X_train, y_train.to_numpy(), X_test, y_test.to_numpy()

# Call the load_data function to split and preprocess data
X_train, y_train, X_test, y_test = load_data(df, percentage_of_data=10)

2023-08-05 14:46:35.424696: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-05 14:46:35.459963: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-05 14:46:35.461228: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


2. Train a word2vec model on the sample corpus

In [11]:
# Train a word2vec model on the training data with the following parameters:
# - size: 100
# - window: 5
# - min_count: 3

word2vec = Word2Vec(X_train, vector_size=100, window=5, min_count=3)
wv = word2vec.wv

3. Convert Training Data into something we can feed into an RNN

In [12]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Function to convert a sentence (list of words) into a matrix representing the words in the embedding space
def embed_sentence(word2vec, sentence):
    embedded_sentence = []
    for word in sentence:
        if word in word2vec.wv:
            embedded_sentence.append(word2vec.wv[word])
        
    return np.array(embedded_sentence)

# Function that converts a list of sentences into a list of matrices
def embedding(word2vec, sentences):
    embed = []
    
    for sentence in sentences:
        embedded_sentence = embed_sentence(word2vec, sentence)
        embed.append(embedded_sentence)
        
    return embed

# Embed the training and test sentences
X_train_embed = embedding(word2vec, X_train)
X_test_embed = embedding(word2vec, X_test)


# Pad the training and test embedded sentences
X_train_pad = pad_sequences(X_train_embed, dtype='float32', padding='post', maxlen=200)
X_test_pad = pad_sequences(X_test_embed, dtype='float32', padding='post', maxlen=200)

4. Test that X_train and X_test are numpy arrays with shapes

In [13]:
# TEST ME
for X in [X_train_pad, X_test_pad]:
    assert type(X) == np.ndarray
    assert X.shape[-1] == word2vec.wv.vector_size


assert X_train_pad.shape[0] == len(X_train)
assert X_test_pad.shape[0] == len(X_test)

# Baseline model

It is always good to have a very simple model to test your own model against - to be sure you are doing something better than a very simple algorithm.

❓ **Question** ❓ What is your baseline accuracy? In this case, your baseline can be to predict the label that is the most present in `y_train` (of course, if the dataset is balanced, the baseline accuracy is 1/n where n is the number of classes - 2 here).

In [14]:
import numpy as np

# Calculate the most frequent label
most_frequent_label = np.argmax(np.bincount(y_train))

# Calculate the baseline accuracy
baseline_accuracy = np.mean(y_test == most_frequent_label)

print("Baseline Accuracy:", baseline_accuracy)

Baseline Accuracy: 0.33780276816608995


5. The BASIC model with no transformer power    

In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Masking, LSTM, Dense

def build_rnn_model(input_shape, num_classes):
    model = Sequential()
    model.add(Masking(mask_value=0, input_shape=input_shape))
    model.add(LSTM(20, activation='tanh'))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))

    model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

    return model

6. Check if the model is above the baseline accuracy

In [17]:
# Find the number of unique classes
num_classes = np.unique(y_train).shape[0]

# Ensure that classes start from 0
y_train -= y_train.min()
y_test -= y_test.min()

# Instantiate the model
model = build_rnn_model(input_shape=X_train_pad.shape[1:], num_classes=num_classes)

# Convert the labels to one-hot encoded vectors
from tensorflow.keras.utils import to_categorical

y_train_oh = to_categorical(y_train, num_classes=num_classes)
y_test_oh = to_categorical(y_test, num_classes=num_classes)

# Define Early Stopping callback
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Fit the model
history = model.fit(X_train_pad, y_train_oh, epochs=20, batch_size=32, validation_split=0.2, callbacks=[early_stopping])

Epoch 1/20


2023-08-05 14:58:06.751070: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 591840000 exceeds 10% of free system memory.




2023-08-05 14:58:41.414186: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 148000000 exceeds 10% of free system memory.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


7. Evaluate the model on the test set

In [18]:
# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test_pad, y_test_oh)

print("Test Accuracy:", accuracy)

 3/73 [>.............................] - ETA: 1s - loss: 1.1486 - accuracy: 0.4688

2023-08-05 15:08:07.154057: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 184960000 exceeds 10% of free system memory.


Test Accuracy: 0.5177335739135742


: 