In [3]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install gcsfs

Note: you may need to restart the kernel to use updated packages.


In [5]:
# Shell escape: print the GPU model, driver/CUDA version and current memory usage
!nvidia-smi

Tue Aug 15 13:23:13 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.199.02   Driver Version: 470.199.02   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P8    11W /  70W |    105MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [6]:
from transformers import get_linear_schedule_with_warmup, BertForSequenceClassification, BertTokenizer, BertConfig
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import accuracy_score, f1_score
from sklearn.utils.class_weight import compute_class_weight
from torch.optim.lr_scheduler import StepLR
from torch.optim import AdamW
import torch.optim as optim
import torch.nn as nn
import os
import numpy as np
import time
import pandas as pd
import numpy as np

In [7]:
import torch
# Ask PyTorch to release GPU memory that its caching allocator holds but is not currently using
torch.cuda.empty_cache()

In [9]:
import pandas as pd

# Location of the cleaned wine-review CSV in Google Cloud Storage
csv_path = "gs://vino-verdict/data/cleaned_wine_df.csv"

# Load the whole CSV into a DataFrame; pandas opens the gs:// URL directly
# (this relies on the gcsfs package installed earlier in the notebook)
df = pd.read_csv(csv_path)

In [10]:
# Preview the first three rows to check the columns that were loaded
df.head(3)

Unnamed: 0,country,description,points,price,province,region_1,title,variety,winery,year
0,Italy,"Aromas include tropical fruit, broom, brimston...",87,19.0,Sicily & Sardinia,Etna,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,2013
1,Portugal,"This is ripe and fruity, a wine that is smooth...",87,15.0,Douro,,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,2011
2,US,"Tart and snappy, the flavors of lime flesh and...",87,14.0,Oregon,Willamette Valley,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,2013


In [11]:
# 1. Preprocess the `description` column

# Count missing values in the description and points columns (displayed at the end of the cell)
missing_values = df[['description', 'points']].isnull().sum()

# Drop rows with missing descriptions (if any)
df = df.dropna(subset=['description'])

# 2. Transform the `points` column into categorical labels

# Bin edges for the wine ratings and the label of each bin.
# With include_lowest=True the intervals are [80, 89] -> 'bad' and (89, 100] -> 'good',
# so a score of exactly 80 is kept without having to start the first edge at 79.
bins = [80, 89, 100]
labels = ['bad', 'good']

# Create a new column 'rating_category' with the binned labels
df['rating_category'] = pd.cut(df['points'], bins=bins, labels=labels, include_lowest=True)

# pd.cut yields NaN for scores that are missing or outside [80, 100]. Those rows would
# silently end up without a label, so fail here with a clear message instead of later.
unbinned_rows = int(df['rating_category'].isna().sum())
assert unbinned_rows == 0, f"{unbinned_rows} rows have missing points or points outside [80, 100]"

missing_values, df[['description', 'rating_category']].head()

(description    0
 points         0
 dtype: int64,
                                          description rating_category
 0  Aromas include tropical fruit, broom, brimston...             bad
 1  This is ripe and fruity, a wine that is smooth...             bad
 2  Tart and snappy, the flavors of lime flesh and...             bad
 3  Pineapple rind, lemon pith and orange blossom ...             bad
 4  Much like the regular bottling from 2012, this...             bad)

In [12]:
# Class balance: number of rows in each rating category
df['rating_category'].value_counts()

rating_category
bad     71056
good    44546
Name: count, dtype: int64

In [13]:
from sklearn.model_selection import train_test_split

# Stratified 80/10/10 split on `rating_category`:
# first hold out 20% of the rows, then cut that hold-out in half for validation and test.
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['rating_category'],
)
valid_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
    stratify=temp_df['rating_category'],
)

In [14]:
# (rows, columns) of the train, validation and test splits
train_df.shape, valid_df.shape, test_df.shape

((92481, 11), (11560, 11), (11561, 11))

In [None]:
# Initialize the BERT tokenizer that matches the pretrained checkpoint used for the model
tokenizer = BertTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')


def _encode_descriptions(frame):
    """Tokenize the `description` column of one split.

    Sequences are truncated to at most 128 tokens and padded to the longest sequence
    in the split; the result holds PyTorch tensors (return_tensors='pt').
    """
    return tokenizer(
        list(frame['description']),
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors='pt',
    )


# Tokenize the descriptions from the train, validation, and test sets.
# The validation split is named `valid_df` by the split cell; the previous `val_df`
# reference raised a NameError.
train_encodings = _encode_descriptions(train_df)
val_encodings = _encode_descriptions(valid_df)
test_encodings = _encode_descriptions(test_df)

train_encodings.keys()  # Display the keys to check the tokenization results

In [None]:
def _binary_labels(frame):
    """Return a 1-D int64 tensor with 1 for 'good' wines and 0 for 'bad' wines.

    Labels come from `rating_category`, the same column the splits were stratified on.
    The previous `points > 3` test was true for every row because points are on an
    80-100 scale, and pandas Series has no `.long()` method.
    """
    return torch.tensor((frame['rating_category'] == 'good').to_numpy(), dtype=torch.long)


# The validation split is named `valid_df` by the split cell
train_labels = _binary_labels(train_df)
val_labels = _binary_labels(valid_df)
test_labels = _binary_labels(test_df)

# TensorDataset needs tensors (not NumPy arrays) that share the same first dimension
train_dataset = torch.utils.data.TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
val_dataset = torch.utils.data.TensorDataset(val_encodings['input_ids'], val_encodings['attention_mask'], val_labels)
test_dataset = torch.utils.data.TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)

In [None]:
# Shell escape: re-check GPU state and memory usage before training
!nvidia-smi

In [None]:
# Select the compute device: the GPU ("cuda") when one is available, otherwise the CPU.
# This only creates the device handle; nothing is moved onto it in this cell.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Run configuration: batch size, number of epochs, early-stopping patience
batch_size = 16
epochs = 5
patience = 2

# Per-epoch metric histories, filled in while training
train_losses, val_losses = [], []
val_accuracies, val_f1_scores = [], []

# Early-stopping state and the GCS destination for the saved model weights
best_val_loss = float('inf')
no_improve_epochs = 0
path_to_save = "gs://vino-verdict/models/multilingual-binary-sentiment.bin"

In [1]:
# Load the pretrained BERT checkpoint with a single-logit head for binary classification.
# BCEWithLogitsLoss needs logits and targets of identical shape, and the training loop
# feeds targets shaped (batch, 1), so the head must emit exactly one logit per example.
# The checkpoint ships a 5-class (star rating) head, so ignore_mismatched_sizes=True lets
# the classifier weights be re-initialised instead of raising a size-mismatch error.
model = BertForSequenceClassification.from_pretrained(
    'nlptown/bert-base-multilingual-uncased-sentiment',
    num_labels=1,
    ignore_mismatched_sizes=True,
)

# Batches are moved to `device` during training/evaluation, so the model must live there too
model.to(device)

# Loss function for binary cross entropy
# Use BCEWithLogitsLoss for binary classification with logits (includes sigmoid activation)
loss_function = nn.BCEWithLogitsLoss()

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Learning rate decay: StepLR multiplies the learning rate by gamma (0.5) every
# `step_size` (1) calls to scheduler.step()
scheduler = StepLR(optimizer, step_size=1, gamma=0.5)

# DataLoaders: shuffle only the training data; batch size comes from the hyperparameter cell
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

NameError: name 'AutoModelForSequenceClassification' is not defined

In [None]:
# Fine-tuning loop: gradient accumulation, per-epoch validation, checkpointing on the best
# validation F1, and early stopping on validation loss.
# Assumes the model emits one logit per example (logits shaped (batch, 1)).

# Early stopping variables and metric histories.
# Everything the loop compares against is reset here so re-running the cell starts clean
# (best_val_loss was previously only initialised in another cell).
best_val_f1 = -float('inf')
best_val_loss = float('inf')
no_improve_epochs = 0
patience = 3
epochs = 5
train_losses = []
val_losses = []
val_accuracies = []
val_f1_scores = []
train_accuracies = []
gradient_accumulation_steps = 2

# Idempotent guard: batches are moved to `device`, so the model has to be there as well
model.to(device)

num_train_batches = len(train_loader)

for epoch in range(epochs):
    model.train()
    total_train_loss = 0
    train_predictions, train_true_vals = [], []

    optimizer.zero_grad()  # Reset gradients once at the start of the epoch

    for step, batch in enumerate(train_loader):
        input_ids, attention_mask, labels = tuple(t.to(device) for t in batch)

        outputs = model(input_ids, attention_mask=attention_mask)

        # Targets are reshaped to (batch, 1) to match the logits
        loss = loss_function(outputs.logits, labels.float().unsqueeze(-1))
        total_train_loss += loss.item()  # Track the un-normalised loss for reporting

        # Gradient accumulation
        loss = loss / gradient_accumulation_steps  # Normalize the loss
        loss.backward()

        # Get binary predictions for training data
        batch_train_predictions = torch.sigmoid(outputs.logits) > 0.5
        train_predictions.extend(batch_train_predictions.cpu().numpy())
        train_true_vals.extend(labels.cpu().numpy())

        # Update model parameters every gradient_accumulation_steps, and also on the last
        # batch so a trailing partial accumulation group is applied instead of being
        # discarded by the next epoch's zero_grad()
        is_last_batch = (step + 1) == num_train_batches
        if (step + 1) % gradient_accumulation_steps == 0 or is_last_batch:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Optional gradient clipping
            optimizer.step()
            optimizer.zero_grad()

    # Advance the learning-rate schedule once per epoch (it was defined but never stepped)
    scheduler.step()

    avg_train_loss = total_train_loss / len(train_loader)  # Calculate average training loss
    train_losses.append(avg_train_loss)

    # Each stored prediction is a length-1 array; flatten to plain ints for sklearn
    flat_train_predictions = [int(item) for sublist in train_predictions for item in sublist]
    train_accuracy = accuracy_score(train_true_vals, flat_train_predictions)
    train_accuracies.append(train_accuracy)

    # Validation at the end of each epoch
    model.eval()
    val_loss = 0
    predictions, true_vals = [], []

    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = tuple(t.to(device) for t in batch)
            outputs = model(input_ids, attention_mask=attention_mask)

            val_loss += loss_function(outputs.logits, labels.float().unsqueeze(-1)).item()

            # Get binary predictions
            batch_predictions = torch.sigmoid(outputs.logits) > 0.5
            predictions.extend(batch_predictions.cpu().numpy())
            true_vals.extend(labels.cpu().numpy())

    avg_val_loss = val_loss / len(val_loader)
    val_losses.append(avg_val_loss)

    # Flatten once and reuse for both metrics
    flat_val_predictions = [int(item) for sublist in predictions for item in sublist]
    val_accuracy = accuracy_score(true_vals, flat_val_predictions)
    val_f1 = f1_score(true_vals, flat_val_predictions)

    val_accuracies.append(val_accuracy)
    val_f1_scores.append(val_f1)

    print(f"Epoch {epoch+1}/{epochs} - Training Loss: {avg_train_loss} - Training Accuracy: {train_accuracy} - Validation Loss: {avg_val_loss} - Validation Accuracy: {val_accuracy} - Validation F1: {val_f1}")

    # Save the model if it's the best one seen so far. This runs before the early-stopping
    # check so the epoch that triggers the stop can still be checkpointed.
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        print("Saving the best model...")

        # Save model weights locally, then copy them to GCS
        torch.save(model.state_dict(), "./multilingual-binary-sentiment.bin")
        if os.system(f"gsutil cp ./multilingual-binary-sentiment.bin {path_to_save}") != 0:
            print("Warning: uploading the model weights to GCS failed; the local copy is still available.")

        # Save the model's configuration
        config_path = "./multilingual-binary-sentiment.json"
        model.config.to_json_file(config_path)
        if os.system(f"gsutil cp {config_path} gs://vino-verdict/models/") != 0:
            print("Warning: uploading the model configuration to GCS failed; the local copy is still available.")

    # Early stopping check on validation loss
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        no_improve_epochs = 0
    else:
        no_improve_epochs += 1
        if no_improve_epochs == patience:
            print(f"Early stopping triggered after {epoch + 1} epochs!")
            break


In [None]:
# Run inference on the held-out test set.
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Evaluate the checkpoint with the best validation F1 rather than whatever weights the
# final training epoch left in memory. The training loop writes that checkpoint to this
# local path; if it is absent, the in-memory weights are used unchanged.
best_checkpoint_path = "./multilingual-binary-sentiment.bin"
if os.path.exists(best_checkpoint_path):
    model.load_state_dict(torch.load(best_checkpoint_path, map_location=device))

model.eval()
test_predictions, test_true_vals = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = tuple(t.to(device) for t in batch)
        outputs = model(input_ids, attention_mask=attention_mask)

        # Get binary predictions (sigmoid probability above 0.5 counts as the positive class)
        batch_predictions = torch.sigmoid(outputs.logits) > 0.5
        # Stored as one small array per example; the metrics cell flattens them
        test_predictions.extend(batch_predictions.cpu().numpy())
        test_true_vals.extend(labels.cpu().numpy())

In [None]:
# Score the test-set predictions: precision, recall, accuracy, F1, confusion matrix, per-class report
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report, precision_score, recall_score

# Each stored prediction is a small array; unpack them into one flat list of ints
test_predictions_flat = [int(value) for per_example in test_predictions for value in per_example]

# Scalar metrics
precision = precision_score(test_true_vals, test_predictions_flat)
recall = recall_score(test_true_vals, test_predictions_flat)
accuracy = accuracy_score(test_true_vals, test_predictions_flat)
f1 = f1_score(test_true_vals, test_predictions_flat)

# Tabular summaries
conf_matrix = confusion_matrix(test_true_vals, test_predictions_flat)
class_report = classification_report(
    test_true_vals,
    test_predictions_flat,
    target_names=['bad', 'good'],
)

# Report
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:\n", class_report)

In [None]:
import matplotlib.pyplot as plt

# Recorded histories under descriptive names; one x position per recorded epoch
training_losses = train_losses
validation_losses = val_losses
validation_accuracies = val_accuracies
validation_f1_scores = val_f1_scores
epochs_range = list(range(1, len(train_losses) + 1))

# Single panel with training and validation loss overlaid
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(epochs_range, training_losses, label='Training Loss', color='blue')
ax.plot(epochs_range, validation_losses, label='Validation Loss', color='red')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Training & Validation Loss')
ax.legend()

fig.tight_layout()
plt.show()