In [1]:
pip install torch transformers regex safetensors gcsfs torchvision

[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
pip install transformers torch scikit-learn pandas numpy

[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
!nvidia-smi

Tue Aug 22 11:17:28 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            On   | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P8    17W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
from transformers import get_linear_schedule_with_warmup, BertForSequenceClassification, BertTokenizer, BertConfig
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report, precision_score, recall_score
from sklearn.utils.class_weight import compute_class_weight
from torch.optim.lr_scheduler import StepLR
from torch.optim import AdamW
import torch.optim as optim
import torch.quantization
import torch.nn as nn
import os
import numpy as np
import time
import pandas as pd
import numpy as np

In [5]:
import torch
torch.cuda.empty_cache()

In [6]:
import pandas as pd

# Define the path to the CSV file in GCS
csv_path = "gs://vino-verdict/data/cleaned_wine_df.csv"

# Read the CSV using pandas
df = pd.read_csv(csv_path)

# 1. Preprocess the `description` column

# Check for missing values in the description and points columns
missing_values = df[['description', 'points']].isnull().sum()

# Drop rows with missing descriptions (if any)
df = df.dropna(subset=['description'])

# 2. Transform the `points` column into categorical labels

# Define bins for the wine ratings and labels for each bin
bins = [0, 87, 90, 100]
labels = ['bad', 'good', 'excellent']

# Create a new column 'rating_category' with the binned labels
df['rating_category'] = pd.cut(df['points'], bins=bins, labels=labels, include_lowest=True)

missing_values, df[['description', 'rating_category']].head()

(description    0
 points         0
 dtype: int64,
                                          description rating_category
 0  Aromas include tropical fruit, broom, brimston...             bad
 1  This is ripe and fruity, a wine that is smooth...             bad
 2  Tart and snappy, the flavors of lime flesh and...             bad
 3  Pineapple rind, lemon pith and orange blossom ...             bad
 4  Much like the regular bottling from 2012, this...             bad)

In [7]:
df.head()

Unnamed: 0,country,description,points,price,province,region_1,title,variety,winery,year,rating_category
0,Italy,"Aromas include tropical fruit, broom, brimston...",87,19.0,Sicily & Sardinia,Etna,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,2013,bad
1,Portugal,"This is ripe and fruity, a wine that is smooth...",87,15.0,Douro,,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,2011,bad
2,US,"Tart and snappy, the flavors of lime flesh and...",87,14.0,Oregon,Willamette Valley,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,2013,bad
3,US,"Pineapple rind, lemon pith and orange blossom ...",87,13.0,Michigan,Lake Michigan Shore,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,2013,bad
4,US,"Much like the regular bottling from 2012, this...",87,65.0,Oregon,Willamette Valley,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,2012,bad


In [12]:
df['rating_category'].value_counts()

bad          45642
good         38870
excellent    31090
Name: rating_category, dtype: int64

In [13]:
from sklearn.model_selection import train_test_split

# Splitting the dataframe
train_df, temp_df = train_test_split(df, test_size=0.2, stratify=df['rating_category'], random_state=42)
valid_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['rating_category'], random_state=42)

In [14]:
train_df.shape, valid_df.shape, test_df.shape

((92481, 11), (11560, 11), (11561, 11))

In [15]:
# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the descriptions from the train, validation, and test sets
train_encodings = tokenizer(list(train_df['description']), truncation=True, padding=True, max_length=128, return_tensors='pt')
val_encodings = tokenizer(list(valid_df['description']), truncation=True, padding=True, max_length=128, return_tensors='pt')
test_encodings = tokenizer(list(test_df['description']), truncation=True, padding=True, max_length=128, return_tensors='pt')

train_encodings.keys()  # Display the keys to check the tokenization results

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [16]:
# Convert 'bad', 'good', and 'excellent' to 0, 1, and 2, respectively
train_labels = train_df['rating_category'].replace({'bad': 0, 'good': 1, 'excellent': 2}).to_numpy()
val_labels = valid_df['rating_category'].replace({'bad': 0, 'good': 1, 'excellent': 2}).to_numpy()
test_labels = test_df['rating_category'].replace({'bad': 0, 'good': 1, 'excellent': 2}).to_numpy()

# Convert to PyTorch tensors
train_labels = torch.tensor(train_labels).long()
val_labels = torch.tensor(val_labels).long()
test_labels = torch.tensor(test_labels).long()

# Create datasets
train_dataset = torch.utils.data.TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
val_dataset = torch.utils.data.TensorDataset(val_encodings['input_ids'], val_encodings['attention_mask'], val_labels)
test_dataset = torch.utils.data.TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)

In [17]:
!nvidia-smi

Tue Aug 22 11:22:10 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            On   | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P8    16W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [18]:
# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [19]:
# Training hyperparameters and initialization
batch_size = 32
epochs = 5
patience = 2

# Lists to store metrics
train_losses = []
val_losses = []
val_accuracies = []
val_f1_scores = []

# Early stopping and model saving initializations
best_val_loss = float('inf')
no_improve_epochs = 0
path_to_save = "gs://vino-verdict/models/3_case.bin"

In [23]:
# Load the BERT model for multi-class classification
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', 
    num_labels=3  # For multivariate classification
)
model = model.to(device)

# Loss function for binary cross entropy
# Use BCEWithLogitsLoss for binary classification with logits (includes sigmoid activation)
loss_function = nn.CrossEntropyLoss()

# Optimizer
# Define the optimizer with L2 regularization (weight decay)
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)  # Adjust weight_decay as needed
# Look into perhaps using a Learning Rate Decay
# This starts with a higher learning rate and slowly reduces it

# Define the learning rate scheduler
scheduler = StepLR(optimizer, step_size=1, gamma=0.5)

# DataLoader
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)
val_loader = DataLoader(val_dataset, batch_size=32)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [29]:
from torch.quantization import default_observer, default_qconfig, QConfig

# Create a custom quantization configuration based on the default one
custom_observer = default_observer.with_args(dtype=torch.quint8)
custom_qconfig = QConfig(activation=custom_observer, weight=custom_observer)

# Set the custom configuration as the default for all modules
torch.quantization.default_qconfig = custom_qconfig

# Apply the configuration to our model
model.qconfig = custom_qconfig

# Prepare the model for static quantization. This inserts observers in the model
model_prepared = torch.quantization.prepare(model, inplace=False)

QConfig(activation=functools.partial(functools.partial(<class 'torch.quantization.observer.MinMaxObserver'>, reduce_range=True), dtype=torch.quint8), weight=functools.partial(functools.partial(<class 'torch.quantization.observer.MinMaxObserver'>, reduce_range=True), dtype=torch.quint8))


In [30]:
# Assuming you've already adjusted preprocessing and data loading to produce labels as 0, 1, or 2 
# corresponding to 'bad', 'average', and 'good'

# Early stopping variables
best_val_f1 = -float('inf')
no_improve_epochs = 0
patience = 3
epochs = 5
train_losses = []
val_losses = []
val_accuracies = []
val_f1_scores = []
train_accuracies = []
gradient_accumulation_steps = 2

print("Starting training...\n")

for epoch in range(epochs):
    model.train()
    total_train_loss = 0
    train_predictions, train_true_vals = [], []

    optimizer.zero_grad()

    for step, batch in enumerate(train_loader):
        input_ids, attention_mask, labels = tuple(t.to(device) for t in batch)
        
        outputs = model(input_ids, attention_mask=attention_mask)
        
        loss = loss_function(outputs.logits, labels)  # Ensure this is categorical cross-entropy
        total_train_loss += loss.item()

        # Gradient accumulation
        loss = loss / gradient_accumulation_steps
        loss.backward()

        # Get predictions for training data
        batch_train_predictions = torch.argmax(outputs.logits, dim=1)
        train_predictions.extend(batch_train_predictions.cpu().numpy())
        train_true_vals.extend(labels.cpu().numpy())
        
        if (step + 1) % gradient_accumulation_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            optimizer.zero_grad()
            
        if step % 100 == 0:
            print(f"Epoch {epoch+1} - Step {step}/{len(train_loader)} - Loss: {loss.item():.4f}")

    avg_train_loss = total_train_loss / len(train_loader)
    train_losses.append(avg_train_loss)
    
    train_accuracy = accuracy_score(train_true_vals, train_predictions)
    train_accuracies.append(train_accuracy)

    scheduler.step()

    # Validation
    model.eval()
    val_loss = 0
    predictions, true_vals = [], []
    
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = tuple(t.to(device) for t in batch)
            outputs = model(input_ids, attention_mask=attention_mask)
            
            val_loss += loss_function(outputs.logits, labels).item()
            
            batch_predictions = torch.argmax(outputs.logits, dim=1)
            predictions.extend(batch_predictions.cpu().numpy())
            true_vals.extend(labels.cpu().numpy())
                
    avg_val_loss = val_loss / len(val_loader)
    val_losses.append(avg_val_loss)
    
    val_accuracy = accuracy_score(true_vals, predictions)
    val_f1 = f1_score(true_vals, predictions, average='weighted')  # Adjusted for multiclass
    
    val_accuracies.append(val_accuracy)
    val_f1_scores.append(val_f1)
    
    print(f"\nEpoch {epoch+1}/{epochs} Summary:")
    print(f"Training Loss: {avg_train_loss:.4f} - Training Accuracy: {train_accuracy:.4f}")
    print(f"Validation Loss: {avg_val_loss:.4f} - Validation Accuracy: {val_accuracy:.4f} - Validation F1: {val_f1:.4f}\n")

    # Early stopping
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        no_improve_epochs = 0
    else:
        no_improve_epochs += 1
        if no_improve_epochs == patience:
            print(f"Early stopping triggered after {epoch + 1} epochs!")
            break
    
   # Save the best model
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        print("Saving the best model...\n")

        temp_model_path = "./3_case.bin"
        temp_config_path = "./3_case_config.json"

        # Calibration using the validation data
        with torch.no_grad():
            for batch in val_loader:
                input_ids, attention_mask, _ = tuple(t.to(device) for t in batch)
                _ = model_prepared(input_ids, attention_mask=attention_mask)

        # Convert the model to a quantized version
        model_quantized = torch.quantization.convert(model_prepared, inplace=False)

        # Saving the quantized model
        quantized_model_path = "./3_case_quantized.bin"
        torch.save(model_quantized.state_dict(), quantized_model_path)
        os.system(f"gsutil cp {quantized_model_path} {path_to_save}")

        torch.save(model.state_dict(), temp_model_path)
        os.system(f"gsutil cp {temp_model_path} {path_to_save}")

        model.config.to_json_file(temp_config_path)
        os.system(f"gsutil cp {temp_config_path} gs://vino-verdict/models/")

print("Training completed.")

Starting training...

Epoch 1 - Step 0/2891 - Loss: 0.2777
Epoch 1 - Step 100/2891 - Loss: 0.3302
Epoch 1 - Step 200/2891 - Loss: 0.2496
Epoch 1 - Step 300/2891 - Loss: 0.2851
Epoch 1 - Step 400/2891 - Loss: 0.2425
Epoch 1 - Step 500/2891 - Loss: 0.2423
Epoch 1 - Step 600/2891 - Loss: 0.2628
Epoch 1 - Step 700/2891 - Loss: 0.2765
Epoch 1 - Step 800/2891 - Loss: 0.2261
Epoch 1 - Step 900/2891 - Loss: 0.1639
Epoch 1 - Step 1000/2891 - Loss: 0.2877
Epoch 1 - Step 1100/2891 - Loss: 0.3018
Epoch 1 - Step 1200/2891 - Loss: 0.2879
Epoch 1 - Step 1300/2891 - Loss: 0.2944
Epoch 1 - Step 1400/2891 - Loss: 0.2516
Epoch 1 - Step 1500/2891 - Loss: 0.2618
Epoch 1 - Step 1600/2891 - Loss: 0.2204
Epoch 1 - Step 1700/2891 - Loss: 0.2992
Epoch 1 - Step 1800/2891 - Loss: 0.2387
Epoch 1 - Step 1900/2891 - Loss: 0.2466
Epoch 1 - Step 2000/2891 - Loss: 0.3090
Epoch 1 - Step 2100/2891 - Loss: 0.2378
Epoch 1 - Step 2200/2891 - Loss: 0.2257
Epoch 1 - Step 2300/2891 - Loss: 0.2408
Epoch 1 - Step 2400/2891 - Los

  reduce_range will be deprecated in a future release of PyTorch."


AssertionError: The only supported dtype for nnq.Embedding is torch.quint8

In [None]:
test_loader = DataLoader(test_dataset, batch_size=32)

model.eval()
# If needed: model.load_state_dict(torch.load(temp_model_path))

test_predictions, test_true_vals = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = tuple(t.to(device) for t in batch)
        outputs = model(input_ids, attention_mask=attention_mask)
        
        batch_predictions = torch.argmax(outputs.logits, dim=1)
        test_predictions.extend(batch_predictions.cpu().numpy())
        test_true_vals.extend(labels.cpu().numpy())

# Compute metrics for the original model
original_accuracy = accuracy_score(test_true_vals, test_predictions)
original_f1 = f1_score(test_true_vals, test_predictions, average='weighted')

model_quantized.eval()
# If needed: model_quantized.load_state_dict(torch.load(quantized_model_path))

test_predictions_quantized, test_true_vals_quantized = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = tuple(t.to(device) for t in batch)
        outputs = model_quantized(input_ids, attention_mask=attention_mask)
        
        batch_predictions = torch.argmax(outputs.logits, dim=1)
        test_predictions_quantized.extend(batch_predictions.cpu().numpy())
        test_true_vals_quantized.extend(labels.cpu().numpy())

# Compute metrics for the quantized model
quantized_accuracy = accuracy_score(test_true_vals_quantized, test_predictions_quantized)
quantized_f1 = f1_score(test_true_vals_quantized, test_predictions_quantized, average='weighted')

In [None]:
print(f"Original Model - Accuracy: {original_accuracy:.4f}, F1 Score: {original_f1:.4f}")
print(f"Quantized Model - Accuracy: {quantized_accuracy:.4f}, F1 Score: {quantized_f1:.4f}")

In [None]:
unique_preds = set(test_predictions)
print("Unique predictions:", unique_preds)

In [None]:
from collections import Counter
class_distribution = Counter(test_true_vals)
print("Class distribution in test data:", class_distribution)

In [None]:
# Calculate metrics
accuracy = accuracy_score(test_true_vals, test_predictions)
f1 = f1_score(test_true_vals, test_predictions, average='weighted')
conf_matrix = confusion_matrix(test_true_vals, test_predictions)
class_report = classification_report(test_true_vals, test_predictions, target_names=['bad', 'average', 'excellent'])
precision = precision_score(test_true_vals, test_predictions, average='weighted')
recall = recall_score(test_true_vals, test_predictions, average='weighted')

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:\n", class_report)

In [None]:
import matplotlib.pyplot as plt

# Your data
epochs_range = list(range(1, len(train_losses) + 1))
training_losses = train_losses
validation_accuracies = val_accuracies
validation_f1_scores = val_f1_scores
validation_losses = val_losses

# Plotting
plt.figure(figsize=(15, 5))

# Plot Training and Validation Loss
plt.subplot(1, 3, 1)
plt.plot(epochs_range, train_losses, label='Training Loss', color='blue')
plt.plot(epochs_range, val_losses, label='Validation Loss', color='red')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training & Validation Loss')
plt.legend()

# Plot Training and Validation Accuracy
plt.subplot(1, 3, 2)
plt.plot(epochs_range, train_accuracies, label='Training Accuracy', color='blue')
plt.plot(epochs_range, val_accuracies, label='Validation Accuracy', color='red')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.title('Training & Validation Accuracy')
plt.legend()

# Plot Training and Validation F1 Score
plt.subplot(1, 3, 3)
plt.plot(epochs_range, val_f1_scores, label='Validation F1 Score', color='red')
# You can also add Training F1 Scores if you've computed them
plt.xlabel('Epochs')
plt.ylabel('F1 Score')
plt.title('Validation F1 Score')
plt.legend()

plt.tight_layout()
plt.show()