In [4]:
import numpy as np
from gensim.models import Word2Vec
import pandas as pd

In [5]:
wine_df = pd.read_csv('../cleaned_wine_df.csv')

In [6]:
# Function to adjust the scale of 'points' data
def points_to_scale(points, scale=5):
    points_norm = (points - np.min(points)) / (np.max(points) - np.min(points))
    return np.round(points_norm * scale + 1).astype(int)

# Apply transformation to points
wine_df['points'] = points_to_scale(wine_df['points'])

In [7]:
wine_df.tail(3)

Unnamed: 0,country,description,points,price,province,region_1,title,variety,winery,year
115599,France,Well-drained gravel soil gives this wine its c...,4,30.0,Alsace,Alsace,Domaine Gresser 2013 Kritt Gewurztraminer (Als...,Gewürztraminer,Domaine Gresser,2013
115600,France,"A dry style of Pinot Gris, this is crisp with ...",4,32.0,Alsace,Alsace,Domaine Marcel Deiss 2012 Pinot Gris (Alsace),Pinot Gris,Domaine Marcel Deiss,2012
115601,France,"Big, rich and off-dry, this is powered by inte...",4,21.0,Alsace,Alsace,Domaine Schoffit 2012 Lieu-dit Harth Cuvée Car...,Gewürztraminer,Domaine Schoffit,2012


In [8]:
# Select 10% of your data
wine_sample_df = wine_df.sample(frac=0.1)

In [9]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [13]:
import tensorflow as tf
from transformers import TFBertModel, BertTokenizer
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import RMSprop
from sklearn.metrics import r2_score
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split

def build_transformer_model(input_shape):
    bert_model = TFBertModel.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')
    input_ids = Input(shape=input_shape, dtype='int32')
    outputs = bert_model(input_ids)
    pooled_output = outputs.pooler_output  # Access the pooler_output (classification token) directly
    x = Dense(10, activation='relu')(pooled_output)
    output = Dense(1, activation='linear')(x)
    model = Model(inputs=input_ids, outputs=output)
    model.compile(optimizer=RMSprop(), loss='mean_squared_error', metrics=[tf.keras.metrics.RootMeanSquaredError(), tf.keras.metrics.MeanSquaredLogarithmicError()])
    return model

In [11]:
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

# Load the pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

# Extract the descriptions and points from the DataFrame
descriptions = wine_sample_df['description'].values
points = wine_sample_df['points'].values

# Tokenize the descriptions with a smaller max_length
input_ids = tokenizer(descriptions.tolist(), padding='max_length', max_length=128, truncation=True, return_tensors='tf').input_ids

# Split the tokenized descriptions and points into training and testing sets
train_inputs, test_inputs, y_train, y_test = train_test_split(input_ids.numpy(), points, test_size=0.2, random_state=42)

# Define the early stopping callback
early_stopping = EarlyStopping(
    monitor='val_loss', 
    min_delta=0.001, 
    patience=10, 
    verbose=1,
    restore_best_weights=True, 
)

# Build the model
model = build_transformer_model((128,))

# Train the model with the early stopping callback and a smaller batch size
history = model.fit(
    train_inputs, 
    y_train, 
    epochs=50,
    batch_size=16,  # smaller batch size
    validation_split=0.2, 
    callbacks=[early_stopping],
)

# Evaluate the model
evaluation_results = model.evaluate(test_inputs, y_test)
print(f"Test Loss: {evaluation_results[0]}")
print(f"Test Metrics: {evaluation_results[1:]}")

2023-08-05 16:57:33.184670: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: UNKNOWN ERROR (100)
2023-08-05 16:57:34.032236: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 325260288 exceeds 10% of free system memory.
2023-08-05 16:57:34.547929: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 325260288 exceeds 10% of free system memory.
2023-08-05 16:57:35.469808: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 325260288 exceeds 10% of free system memory.
2023-08-05 16:57:43.276019: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 325260288 exceeds 10% of free system memory.
2023-08-05 16:57:45.328231: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 325260288 exceeds 10% of free system memory.
Some layers from the model checkpoint at nlptown/bert-base-multilingual-uncased-sentiment were not used when initializing TFBertModel: ['classifier', 'dropout_37'

TypeError: Inputs to a layer should be tensors. Got 'pooler_output' (of type <class 'str'>) as input for layer 'dense'.

In [None]:
from tensorflow.keras import backend as K

def r_squared(y_true, y_pred):
    SS_res =  K.sum(K.square(y_true - y_pred)) 
    SS_tot = K.sum(K.square(y_true - K.mean(y_true))) 
    return ( 1 - SS_res/(SS_tot + K.epsilon()) )

# In the model.compile(), you can include this in metrics as follows:

model.compile(optimizer=RMSprop(), loss='mean_squared_error', metrics=[r_squared])


In [None]:
# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test_pad, y_test_oh)

print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

In [None]:
from sklearn.metrics import classification_report

# Make predictions
y_pred = model.predict(X_test_pad)

# Convert prediction probabilities to class labels
y_pred_classes = np.argmax(y_pred, axis=1)

# Print the classification report
print(classification_report(y_test, y_pred_classes))