In [None]:
pip install transformers
pip install gcsfs
!nvidia-smi
from transformers import get_linear_schedule_with_warmup, BertForSequenceClassification, BertTokenizer, BertConfig
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import accuracy_score, f1_score
from sklearn.utils.class_weight import compute_class_weight
from torch.optim.lr_scheduler import StepLR
from torch.optim import AdamW
import torch.optim as optim
import torch.nn as nn
import os
import numpy as np
import time
import pandas as pd
import numpy as np

import torch
# Ask PyTorch to release cached, currently-unused CUDA memory before the run;
# presumably here so re-running the notebook in the same kernel starts from a
# clean GPU state.
torch.cuda.empty_cache()

import pandas as pd

# Location of the cleaned wine-review table in Google Cloud Storage.
# NOTE: pandas needs the `gcsfs` package to resolve gs:// URLs.
csv_path = "gs://vino-verdict/data/cleaned_wine_df.csv"

# Pull the whole table into memory as a DataFrame.
df = pd.read_csv(filepath_or_buffer=csv_path)

# Quick look at the first three rows (only rendered when this is the last
# expression of a notebook cell).
df.head(n=3)

# 1. Preprocess the `description` and `points` columns

# Count missing values in the two columns the model depends on (kept for
# inspection; it is displayed further down).
missing_values = df[['description', 'points']].isnull().sum()

# Drop rows missing either field: a row without text cannot be tokenized, and a
# row without a score would get a NaN label from the binning step below.
df = df.dropna(subset=['description', 'points'])

# 2. Transform the `points` column into categorical labels

# Bin edges and the label for each bin. Because include_lowest=True is passed
# to pd.cut, the intervals are [80, 89] -> 'bad' and (89, 100] -> 'good', so a
# score of exactly 80 is kept without lowering the first edge to 79.
bins = [80, 89, 100]
labels = ['bad', 'good']

# Create a new column 'rating_category' with the binned labels
df['rating_category'] = pd.cut(df['points'], bins=bins, labels=labels, include_lowest=True)

# Scores outside [80, 100] fall into no bin and come back as NaN. Remove them so
# the stratified split and the integer label codes downstream only ever see the
# two real classes.
df = df.dropna(subset=['rating_category'])

missing_values, df[['description', 'rating_category']].head()

df['rating_category'].value_counts()

from sklearn.model_selection import train_test_split

# Stratified 80/10/10 split: 20% is held out first, then halved into validation
# and test. Stratifying on 'rating_category' keeps the class ratio equal across
# all three splits; the fixed random_state makes the split reproducible.
train_df, temp_df = train_test_split(df, test_size=0.2, stratify=df['rating_category'], random_state=42)
valid_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['rating_category'], random_state=42)

# Take explicit copies so adding a column below writes to independent frames
# instead of triggering pandas' SettingWithCopyWarning.
train_df, valid_df, test_df = train_df.copy(), valid_df.copy(), test_df.copy()

# Auxiliary 'binary_label' column: 1 when sentiment_score >= 4, else 0.
# NOTE(review): 'sentiment_score' is not created anywhere in this file and
# 'binary_label' is not read by the training code below (which uses
# 'rating_category'), so the column is only derived when the input data
# actually provides 'sentiment_score' instead of failing with a KeyError.
if 'sentiment_score' in df.columns:
    for split_df in (train_df, valid_df, test_df):
        # Vectorized comparison; NaN compares False and therefore maps to 0,
        # exactly like the row-wise `1 if x >= 4 else 0`.
        split_df['binary_label'] = (split_df['sentiment_score'] >= 4).astype(int)
else:
    print("Column 'sentiment_score' not found; skipping 'binary_label'.")


train_df.shape, valid_df.shape, test_df.shape

# Tokenizer that ships with the pretrained checkpoint fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")


def _encode_descriptions(frame):
    """Tokenize the `description` column of *frame*.

    Every text is truncated or padded to exactly 150 tokens and the result is
    returned as PyTorch tensors (input ids plus attention mask).
    """
    texts = list(frame['description'].values)  # the tokenizer expects a plain list
    return tokenizer.batch_encode_plus(
        texts,
        add_special_tokens=True,
        return_attention_mask=True,
        truncation=True,
        padding='max_length',
        max_length=150,
        return_tensors='pt'
    )


def _category_codes(frame):
    """Return the integer category codes of `rating_category` as a NumPy array."""
    return frame['rating_category'].astype('category').cat.codes.values


# Encode the three splits with identical settings (train, validation, test).
train_encodings, valid_encodings, test_encodings = (
    _encode_descriptions(split) for split in (train_df, valid_df, test_df)
)

# Integer targets for each split.
# NOTE(review): assumes 'rating_category' is already categorical with the same
# category order in every split, so the codes are consistent — confirm.
train_labels, valid_labels, test_labels = (
    _category_codes(split) for split in (train_df, valid_df, test_df)
)

import torch
from torch.utils.data import TensorDataset


def _as_tensors(encodings, label_codes):
    """Return (input_ids, attention_mask, labels) tensors for one split.

    The id/mask tensors are detached copies of the tokenizer output; the label
    codes are converted to a 64-bit integer tensor.
    """
    ids = encodings['input_ids'].clone().detach()
    masks = encodings['attention_mask'].clone().detach()
    targets = torch.tensor(label_codes, dtype=torch.long)
    return ids, masks, targets


# Training split
train_input_ids, train_attention_masks, train_labels = _as_tensors(train_encodings, train_labels)

# Validation split
valid_input_ids, valid_attention_masks, valid_labels = _as_tensors(valid_encodings, valid_labels)

# Test split
test_input_ids, test_attention_masks, test_labels = _as_tensors(test_encodings, test_labels)

# Bundle each split so one index yields an (input_ids, attention_mask, label) triple.
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
valid_dataset = TensorDataset(valid_input_ids, valid_attention_masks, valid_labels)
test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)



from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 32


def _make_loader(dataset, sampler_cls):
    """Wrap *dataset* in a DataLoader that draws batches via *sampler_cls*."""
    return DataLoader(
        dataset,
        sampler=sampler_cls(dataset),
        batch_size=batch_size
    )


# Training batches are drawn in random order; validation and test batches are
# read sequentially so their order is stable from run to run.
train_dataloader = _make_loader(train_dataset, RandomSampler)
validation_dataloader = _make_loader(valid_dataset, SequentialSampler)
test_dataloader = _make_loader(test_dataset, SequentialSampler)

!nvidia-smi

# Use the GPU when one is available; the model and every batch are moved to
# this device below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Training hyperparameters and bookkeeping
batch_size = 32
epochs = 5
gradient_accumulation_steps = 1
max_grad_norm = 1.0
patience = 2  # epochs without validation-loss improvement before stopping
train_losses = []
val_losses = []
val_accuracies = []
val_f1_scores = []
best_val_loss = float('inf')
no_improve_epochs = 0
best_val_f1 = 0
path_to_save = "gs://vino-verdict/models/multilingual-uncased-sentiment.bin"

# Class weights for the loss, computed from the TRAINING labels only so that
# validation/test statistics do not leak into training. The weights are ordered
# by ascending label code, which is the order CrossEntropyLoss expects.
train_label_array = np.asarray(train_labels)
class_weights = compute_class_weight(
    'balanced',
    classes=np.unique(train_label_array),
    y=train_label_array,
)
class_weights = torch.tensor(class_weights).float().to(device)
# Consider using SMOTE during preprocessing instead of the prebuilt class weights

# Set up the data loaders using the previously defined datasets
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
validation_dataloader = DataLoader(valid_dataset, sampler=SequentialSampler(valid_dataset), batch_size=batch_size)

# Load the pretrained encoder. The checkpoint ships with a 5-way (star rating)
# classification head, which cannot be trained against the two 'bad'/'good'
# labels used here; request a 2-way head and let the loader re-initialise the
# mismatched classifier weights instead of raising.
model = AutoModelForSequenceClassification.from_pretrained(
    "nlptown/bert-base-multilingual-uncased-sentiment",
    num_labels=2,
    ignore_mismatched_sizes=True,
)
# The batches are moved to `device`, so the model has to live there as well.
model.to(device)

# Class-weighted cross-entropy over the two logits. It consumes raw logits and
# integer class indices, so no sigmoid/softmax is applied before the loss.
loss_function = nn.CrossEntropyLoss(weight=class_weights)
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.005)

# Learning-rate decay: halve the learning rate once per EPOCH. The scheduler is
# therefore stepped exactly once at the end of each epoch; stepping it per batch
# would shrink the rate to ~0 within a few dozen batches.
scheduler = StepLR(optimizer, step_size=1, gamma=0.5)

# Training loop
for epoch in range(epochs):
    model.train()
    model.zero_grad()
    total_train_loss = 0
    val_losses_epoch = []  # Validation losses of the individual batches this epoch
    start_time = time.time()  # Start time for the epoch

    print(f"Training Epoch {epoch + 1}/{epochs}")
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        input_ids, attention_mask, targets = batch

        # Labels are deliberately NOT passed to the model: the loss is computed
        # here so the class weights are applied.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        loss = loss_function(logits, targets)
        # Track the unscaled loss of every batch so the epoch average is
        # independent of the gradient-accumulation setting.
        total_train_loss += loss.item()
        loss = loss / gradient_accumulation_steps  # Adjust the loss for gradient accumulation
        loss.backward()

        if (step + 1) % 50 == 0:
            print(f"Batch {step + 1} of {len(train_dataloader)}. Loss: {loss.item():.4f}.")

        # Apply the accumulated gradients every `gradient_accumulation_steps`
        # batches, and also on the final batch so no gradients are left unused.
        is_last_batch = (step + 1) == len(train_dataloader)
        if (step + 1) % gradient_accumulation_steps == 0 or is_last_batch:
            # Clip gradients to prevent exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

            optimizer.step()
            model.zero_grad()

        # Log learning rate every 200 batches
        if (step + 1) % 200 == 0:
            current_lr = optimizer.param_groups[0]['lr']
            print(f"Current Learning Rate: {current_lr:.6f}")

    # Calculate average training loss
    avg_train_loss = total_train_loss / len(train_dataloader)

    end_time = time.time()  # End time for the epoch
    epoch_duration = end_time - start_time

    train_losses.append(avg_train_loss)
    print(f"Epoch {epoch + 1}/{epochs} - Training Loss: {avg_train_loss:.4f}")

    # Update the learning rate (once per epoch)
    scheduler.step()

    # Print current learning rate for monitoring
    print(f"Epoch [{epoch+1}/{epochs}] - Learning Rate: {scheduler.get_last_lr()[0]}")

    # Validation loop
    model.eval()
    predictions, true_vals = [], []

    print("Validating...")
    for step, batch in enumerate(validation_dataloader):
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            outputs = model(input_ids=batch[0], attention_mask=batch[1])
            logits = outputs.logits
            # Validation loss drives early stopping
            val_loss = loss_function(logits, batch[2])
        val_losses_epoch.append(val_loss.item())

        # Predicted class = index of the largest logit
        predicted_labels = torch.argmax(logits, dim=1)
        predictions.extend(predicted_labels.cpu().tolist())
        true_vals.extend(batch[2].cpu().tolist())

        # Print validation progress every 50 batches
        if step % 50 == 0:
            print(f"Validation Batch {step} of {len(validation_dataloader)}")

    # Calculate validation metrics
    val_accuracy = accuracy_score(true_vals, predictions)
    val_f1 = f1_score(true_vals, predictions, average='weighted')
    print(f"Validation Accuracy: {val_accuracy:.4f} - F1 Score: {val_f1:.4f}")

    # Append metrics to their respective lists
    val_accuracies.append(val_accuracy)
    val_f1_scores.append(val_f1)

    avg_val_loss = np.mean(val_losses_epoch)
    val_losses.append(avg_val_loss)
    print(f"Epoch {epoch + 1}/{epochs} - Average Validation Loss: {avg_val_loss:.4f}")

    # Save the model if it's the best one seen so far. This runs BEFORE the
    # early-stopping check so that a best-F1 epoch is never lost to the `break`.
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        print("Saving the best model...")

        # Save model weights locally, then copy them to the bucket
        torch.save(model.state_dict(), "./multilingual-uncased-sentiment.bin")
        if os.system(f"gsutil cp ./multilingual-uncased-sentiment.bin {path_to_save}") != 0:
            print("Warning: uploading the model weights with gsutil failed.")

        # Save the model's configuration
        config_path = "./multilingual-uncased-sentiment_config.json"
        model.config.to_json_file(config_path)
        if os.system(f"gsutil cp {config_path} gs://vino-verdict/models/") != 0:
            print("Warning: uploading the model config with gsutil failed.")

    # Early stopping check on the average validation loss
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        no_improve_epochs = 0
    else:
        no_improve_epochs += 1
        if no_improve_epochs >= patience:
            print(f"Early stopping triggered after {epoch + 1} epochs!")
            break

# After training is done, evaluate the model on the test dataset.
# NOTE: this scores the model as it stands after the last executed epoch, not
# the best checkpoint written during training.
model.to(device)  # batches are moved to `device`, so the model must be there too
model.eval()
predictions, true_vals = [], []

test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=batch_size)

for batch in test_dataloader:
    batch = tuple(t.to(device) for t in batch)

    with torch.no_grad():
        outputs = model(input_ids=batch[0], attention_mask=batch[1])

    logits = outputs[0]  # first output element is the logits tensor
    # One predicted class per sample: the index of the largest logit. Thresholding
    # a sigmoid of multi-column logits would instead yield one boolean per
    # column, which does not line up with the one-label-per-sample targets.
    predicted_labels = torch.argmax(logits, dim=1)
    predictions.extend(predicted_labels.cpu().tolist())
    true_vals.extend(batch[2].cpu().tolist())

# `predictions` and `true_vals` are already flat lists of ints (one entry per
# test sample), so no flattening step is needed.

# Calculate test metrics
test_accuracy = accuracy_score(true_vals, predictions)
# 'binary' averaging reports F1 for the positive class (label 1).
test_f1 = f1_score(true_vals, predictions, average='binary')
print(f"Test Accuracy: {test_accuracy:.4f} - F1 Score: {test_f1:.4f}")

# Sanity check on the recorded history: print a literal 5 (presumably the
# configured number of epochs — confirm) followed by the length of each
# per-epoch metric list so they can be compared by eye.
for history_length in (
    5,
    len(val_accuracies),
    len(val_f1_scores),
    len(val_losses),
    len(train_losses),
):
    print(history_length)

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, f1_score

# Relies on `predictions` and `true_vals` produced by the preceding evaluation.

# Confusion matrix: rows are the actual classes, columns the predicted ones.
cm = confusion_matrix(true_vals, predictions)

# Render the matrix as an annotated heatmap with integer cell labels.
plt.figure(figsize=(10, 7))
heatmap_axes = sns.heatmap(cm, annot=True, fmt="d")
heatmap_axes.set_xlabel('Predicted')
heatmap_axes.set_ylabel('Actual')
plt.show()

# Support-weighted F1 over all classes.
f1 = f1_score(true_vals, predictions, average='weighted')
print(f"F1 Score: {f1:.4f}")

# Also print the matrix as a plain list of lists so the raw counts can be
# copied out of the notebook.
cm_list = cm.tolist()
print(cm_list)

import matplotlib.pyplot as plt

# X axis: 1-based index of every epoch that recorded a training loss.
epochs_range = list(range(1, len(train_losses) + 1))

# Readable aliases for the metric histories collected during training.
training_losses, validation_losses = train_losses, val_losses
validation_accuracies, validation_f1_scores = val_accuracies, val_f1_scores

plt.figure(figsize=(10, 5))

# Draw training and validation loss on the same axes for direct comparison.
for loss_history, legend_name, line_color in (
    (train_losses, 'Training Loss', 'blue'),
    (val_losses, 'Validation Loss', 'red'),
):
    plt.plot(epochs_range, loss_history, label=legend_name, color=line_color)

plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training & Validation Loss')
plt.legend()

plt.tight_layout()
plt.show()
