Loading the CSV Data:

In [1]:
pip install torch transformers regex safetensors gcsfs torchvision

[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
pip install transformers torch scikit-learn pandas numpy

[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
!nvidia-smi

Thu Aug 24 11:07:47 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            On   | 00000000:00:04.0 Off |                    0 |
| N/A   58C    P8    16W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
from transformers import get_linear_schedule_with_warmup, BertForSequenceClassification, AutoTokenizer, BertConfig
from transformers import AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report, precision_score, recall_score
from sklearn.utils.class_weight import compute_class_weight
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
import torch.optim as optim
import torch.quantization
import torch.nn as nn
import os
import numpy as np
import time
import pandas as pd
import numpy as np

In [5]:
import torch
torch.cuda.empty_cache()

In [6]:
# Define the path to the CSV file in GCS
csv_path = "gs://vino-verdict/data/clean_wine_3scale.csv"

# Read the CSV using pandas
wine_df = pd.read_csv(csv_path)

Preprocessing:
We'll transform the points, which are on an 80-100 scale, to a 1-5 scale to match the sentiment model:

In [7]:
# Convert points to a 1-5 scale
wine_df['stars'] = ((wine_df['points'] - 80) / 4).astype(int)

In [8]:
wine_df['stars'] = wine_df['stars'].clip(1, 5)
wine_df['stars'] = wine_df['stars'] - 1

In [9]:
wine_df['stars']

0        1
1        1
2        0
3        1
4        0
        ..
17599    2
17600    2
17601    2
17602    2
17603    3
Name: stars, Length: 17604, dtype: int64

Tokenize the Data:
Use the tokenizer to tokenize the wine descriptions. Make sure to set truncation=True and padding=True to ensure consistent input lengths:

In [10]:
tokenizer = AutoTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')
encodings = tokenizer(list(wine_df['description']), truncation=True, padding=True, max_length=256)

Split the Data:
We'll split the data into training, validation, and testing sets:

In [11]:
input_ids = encodings['input_ids']
attention_mask = encodings['attention_mask']

# Split the data
train_input_ids, temp_input_ids, train_attention_mask, temp_attention_mask, train_labels, temp_labels = train_test_split(
    input_ids, attention_mask, wine_df['stars'], test_size=0.3
)

val_input_ids, test_input_ids, val_attention_mask, test_attention_mask, val_labels, test_labels = train_test_split(
    temp_input_ids, temp_attention_mask, temp_labels, test_size=0.5
)

Create DataLoaders:

In [12]:
# Convert lists to tensors
train_input_ids = torch.tensor(train_input_ids)
train_attention_mask = torch.tensor(train_attention_mask)
train_labels = torch.tensor(train_labels.to_numpy())

val_input_ids = torch.tensor(val_input_ids)
val_attention_mask = torch.tensor(val_attention_mask)
val_labels = torch.tensor(val_labels.to_numpy())

test_input_ids = torch.tensor(test_input_ids)
test_attention_mask = torch.tensor(test_attention_mask)
test_labels = torch.tensor(test_labels.to_numpy())

# Create datasets
train_dataset = TensorDataset(train_input_ids, train_attention_mask, train_labels)
val_dataset = TensorDataset(val_input_ids, val_attention_mask, val_labels)
test_dataset = TensorDataset(test_input_ids, test_attention_mask, test_labels)

# DataLoaders
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

Load the Model:

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
model.to(device)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(105879, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [14]:
print(model.classifier)

Linear(in_features=768, out_features=5, bias=True)


Hyperparameters & Optimizer:

In [15]:
# Hyperparameters
lr = 2e-5
epochs = 3
weight_decay = 0.01

optimizer = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

# Linear learning rate scheduler
scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=0, 
    num_training_steps=len(train_loader)*epochs
)

Training Loop:

In [16]:
# Parameters
path_to_save = "gs://vino-verdict/models/sentiment-bert-binary.bin"
best_val_accuracy = 0  # Initialize best validation accuracy to 0
best_val_loss = float('inf')  # Initialize the best validation loss to infinity
patience = 3  # Number of epochs with no improvement after which training will be stopped
num_epochs_no_improve = 0

In [17]:
for epoch in range(epochs):
    print(f"\nStarting Epoch {epoch+1}/{epochs}")
    
    # Training
    model.train()
    total_train_loss = 0
    for step, batch in enumerate(train_loader):
        if step % 100 == 0:
            print(f"  Training batch {step}/{len(train_loader)}")
        
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        
        model.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_train_loss += loss.item()
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_loader)
    print(f"Epoch {epoch+1} - Training Loss: {avg_train_loss:.3f}")

    # Validation
    model.eval()
    total_val_loss = 0
    val_predictions, true_vals = [], []
    with torch.no_grad():
        for step, batch in enumerate(val_loader):
            if step % 100 == 0:
                print(f"  Validating batch {step}/{len(val_loader)}")
            
            input_ids, attention_mask, labels = batch
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)
            
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            logits = outputs.logits
            total_val_loss += outputs.loss.item()

            logits = logits.detach().cpu().numpy()
            label_ids = labels.to('cpu').numpy()

            val_predictions.append(logits)
            true_vals.append(label_ids)

    avg_val_loss = total_val_loss / len(val_loader)
    val_predictions = np.concatenate(val_predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
    val_accuracy = accuracy_score(true_vals, np.argmax(val_predictions, axis=1))
    print(f"Epoch {epoch+1} - Validation Accuracy: {val_accuracy:.3f} - Validation Loss: {avg_val_loss:.3f}")

    # Save the best model based on validation accuracy
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        print("Best model found based on validation accuracy - Saving to Google Cloud Storage...\n")
        
        # Save the model and its configuration locally
        save_directory = "./sentiment_bert_binary_model"
        model.save_pretrained(save_directory)
        
        # Copy the model and its configuration to Google Cloud Storage
        os.system(f"gsutil cp {save_directory}/pytorch_model.bin {path_to_save}")
        os.system(f"gsutil cp {save_directory}/config.json {path_to_save}/sentiment-bert-binary-config.json")
        
        # Remove the local copies
        os.remove(f"{save_directory}/pytorch_model.bin")
        os.remove(f"{save_directory}/config.json")
        os.rmdir(save_directory)

    # Implement early stopping based on validation loss
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        num_epochs_no_improve = 0
    else:
        num_epochs_no_improve += 1
        if num_epochs_no_improve == patience:
            print(f"Early stopping triggered after {epoch + 1} epochs!")
            break


Starting Epoch 1/3
  Training batch 0/771
  Training batch 100/771
  Training batch 200/771
  Training batch 300/771
  Training batch 400/771
  Training batch 500/771
  Training batch 600/771
  Training batch 700/771
Epoch 1 - Training Loss: 0.797
  Validating batch 0/166
  Validating batch 100/166
Epoch 1 - Validation Accuracy: 0.699 - Validation Loss: 0.688
Best model found based on validation accuracy - Saving to Google Cloud Storage...



Copying file://./sentiment_bert_binary_model/pytorch_model.bin [Content-Type=application/octet-stream]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

/ [1 files][638.5 MiB/638.5 MiB]   32.2 MiB/s                                   
Operation completed over 1 objects/638.5 MiB.                                    
Copying file://./sentiment_bert_binary_model/


Starting Epoch 2/3
  Training batch 0/771
  Training batch 100/771
  Training batch 200/771
  Training batch 300/771
  Training batch 400/771
  Training batch 500/771
  Training batch 600/771
  Training batch 700/771
Epoch 2 - Training Loss: 0.604
  Validating batch 0/166
  Validating batch 100/166
Epoch 2 - Validation Accuracy: 0.723 - Validation Loss: 0.656
Best model found based on validation accuracy - Saving to Google Cloud Storage...



Copying file://./sentiment_bert_binary_model/pytorch_model.bin [Content-Type=application/octet-stream]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

- [1 files][638.5 MiB/638.5 MiB]   32.5 MiB/s                                   
Operation completed over 1 objects/638.5 MiB.                                    
Copying file://./sentiment_bert_binary_model/


Starting Epoch 3/3
  Training batch 0/771
  Training batch 100/771
  Training batch 200/771
  Training batch 300/771
  Training batch 400/771
  Training batch 500/771
  Training batch 600/771
  Training batch 700/771
Epoch 3 - Training Loss: 0.462
  Validating batch 0/166
  Validating batch 100/166
Epoch 3 - Validation Accuracy: 0.714 - Validation Loss: 0.729


Evaluation on Test Data:

In [18]:
model.eval()
test_predictions, true_vals = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = batch  # Corrected this line
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)  # Added this line
        labels = labels.to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask)  # Added attention_mask here
        logits = outputs.logits
        logits = logits.detach().cpu().numpy()
        label_ids = labels.to('cpu').numpy()
        
        test_predictions.append(logits)
        true_vals.append(label_ids)

test_predictions = np.concatenate(test_predictions, axis=0)
true_vals = np.concatenate(true_vals, axis=0)
print("Test Data Metrics:")
print(classification_report(true_vals, np.argmax(test_predictions, axis=1)))

Test Data Metrics:
              precision    recall  f1-score   support

           0       0.78      0.76      0.77       688
           1       0.62      0.63      0.62       794
           2       0.78      0.80      0.79      1036
           3       0.45      0.38      0.41       120
           4       0.00      0.00      0.00         3

    accuracy                           0.72      2641
   macro avg       0.53      0.51      0.52      2641
weighted avg       0.72      0.72      0.72      2641



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
def convert_to_2_scale(arr):
    arr_2_scale = []
    for val in arr:
        if val in [0, 1]:
            arr_2_scale.append(0)  # bad
        else:
            arr_2_scale.append(1)  # average
    return np.array(arr_2_scale)

model.eval()
test_predictions, true_vals = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = batch  # Corrected this line
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)  # Added this line
        labels = labels.to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask)  # Added attention_mask here
        logits = outputs.logits
        logits = logits.detach().cpu().numpy()
        label_ids = labels.to('cpu').numpy()
        
        test_predictions.append(logits)
        true_vals.append(label_ids)

test_predictions = np.concatenate(test_predictions, axis=0)
true_vals = np.concatenate(true_vals, axis=0)

# Convert 5-scale predictions and true_vals to 2-scale
predicted_classes = np.argmax(test_predictions, axis=1)
predicted_classes_2_scale = convert_to_2_scale(predicted_classes)
true_vals_2_scale = convert_to_2_scale(true_vals)

print("Test Data Metrics:")
print(classification_report(true_vals_2_scale, predicted_classes_2_scale))

Test Data Metrics:
              precision    recall  f1-score   support

           0       0.89      0.89      0.89      1482
           1       0.86      0.87      0.86      1159

    accuracy                           0.88      2641
   macro avg       0.88      0.88      0.88      2641
weighted avg       0.88      0.88      0.88      2641

