1. Split the data into train, validation, and test sets

In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split

# Load the cleaned wine reviews into a dataframe
wine_df = pd.read_csv('/mnt/data/cleaned_wine_df.csv')

# Derive the binary target only when the CSV does not already carry it.
if 'points_category' not in wine_df.columns:
    # A review is "highly rated" from 90 points upward. Anything lower
    # (including scores below 80 and missing scores, which compare as False)
    # is "not-so-highly rated" instead of silently landing in the top class.
    wine_df['points_category'] = wine_df['points'].ge(90).map(
        {True: 'highly rated', False: 'not-so-highly rated'}
    )

# 80% train, then the remaining 20% is halved into validation and test (10% each).
# Both splits are stratified so every subset keeps the same class balance.
train_df, temp_df = train_test_split(
    wine_df, test_size=0.2, random_state=42, stratify=wine_df['points_category']
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, random_state=42, stratify=temp_df['points_category']
)

# Sanity check: the three subsets must partition the full dataframe.
assert len(train_df) + len(val_df) + len(test_df) == len(wine_df)

train_df.shape, val_df.shape, test_df.shape

2. Tokenize the data using the tokenizer provided by the model

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Tokenizer belonging to the same checkpoint that is fine-tuned further down
tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")


def _encode_descriptions(frame):
    """Tokenize the 'description' column of `frame` into PyTorch tensors.

    Sequences are truncated to 128 tokens and padded to the longest
    sequence in `frame`.
    """
    texts = list(frame['description'])
    return tokenizer(texts, truncation=True, padding=True, max_length=128, return_tensors='pt')


# Same tokenization settings for every split
train_encodings = _encode_descriptions(train_df)
val_encodings = _encode_descriptions(val_df)
test_encodings = _encode_descriptions(test_df)

train_encodings.keys()  # Show which tensors the tokenizer produced

3. Create PyTorch Datasets (the DataLoaders are created in the model setup section below)

In [None]:
def _binary_labels(frame):
    """Return a 1-D int64 tensor: 1 where points_category is 'highly rated', else 0.

    The target is the same `points_category` column the splits were stratified on.
    """
    is_highly_rated = (frame['points_category'] == 'highly rated').astype('int64')
    return torch.tensor(is_highly_rated.to_numpy(), dtype=torch.long)


# TensorDataset only accepts tensors, so the labels are converted up front.
train_labels = _binary_labels(train_df)
val_labels = _binary_labels(val_df)
test_labels = _binary_labels(test_df)

# Each item is an (input_ids, attention_mask, label) triple.
train_dataset = torch.utils.data.TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
val_dataset = torch.utils.data.TensorDataset(val_encodings['input_ids'], val_encodings['attention_mask'], val_labels)
test_dataset = torch.utils.data.TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)

Model Setup and Fine-tuning Guide:
1. Initialize the BERT model for binary classification:

In [None]:
from transformers import BertForSequenceClassification

# Run on the GPU when one is available; the training and evaluation cells
# move every batch to this same device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the BERT model with a single-logit head for binary classification.
# - num_labels=1: BCEWithLogitsLoss needs logits with the same shape as the
#   (batch, 1) targets, i.e. one logit per example.
# - ignore_mismatched_sizes=True: the checkpoint ships a 5-class sentiment head,
#   so the classifier layer is re-initialised instead of raising a size error.
model = BertForSequenceClassification.from_pretrained(
    'nlptown/bert-base-multilingual-uncased-sentiment',
    num_labels=1,
    ignore_mismatched_sizes=True,
)
model = model.to(device)

2. Define the Loss Function, Optimizer, and DataLoader:

In [None]:
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader

# BCEWithLogitsLoss applies the sigmoid internally, so it is fed raw logits.
# Logits and targets must have the same shape.
loss_function = nn.BCEWithLogitsLoss()

# Optimizer: transformers' own AdamW is deprecated upstream in favour of
# torch.optim.AdamW. torch defaults to weight_decay=0.01, so it is set to 0.0
# explicitly to keep weight decay switched off.
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.0)

# DataLoaders: only the training data is shuffled
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
val_loader = DataLoader(val_dataset, batch_size=16)


3. Training Loop:


In [None]:
epochs = 3

for epoch in range(epochs):
    # ---- Training pass ----
    model.train()

    for batch in train_loader:
        optimizer.zero_grad()
        input_ids, attention_mask, labels = tuple(t.to(device) for t in batch)

        outputs = model(input_ids, attention_mask=attention_mask)

        # Targets become float (batch, 1); BCEWithLogitsLoss requires the logits
        # to have that same shape, i.e. one logit per example.
        loss = loss_function(outputs.logits, labels.float().unsqueeze(-1))
        loss.backward()
        optimizer.step()

    # ---- Validation pass at the end of each epoch ----
    model.eval()
    val_loss = 0
    correct, seen = 0, 0
    predictions, true_vals = [], []

    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = tuple(t.to(device) for t in batch)
            outputs = model(input_ids, attention_mask=attention_mask)

            val_loss += loss_function(outputs.logits, labels.float().unsqueeze(-1)).item()

            # sigmoid(logit) > 0.5 means the positive class. The trailing
            # dimension is squeezed so each prediction lines up with its scalar label.
            batch_predictions = (torch.sigmoid(outputs.logits) > 0.5).squeeze(-1)
            correct += (batch_predictions == labels.bool()).sum().item()
            seen += labels.size(0)

            # Kept as flat 0/1 lists so further metrics (F1, ...) can be computed later.
            predictions.extend(batch_predictions.long().cpu().tolist())
            true_vals.extend(labels.cpu().tolist())

    # Mean loss per batch, plus accuracy over all validation examples
    print(f"Epoch {epoch+1}/{epochs} - Validation Loss: {val_loss/len(val_loader)}")
    print(f"Epoch {epoch+1}/{epochs} - Validation Accuracy: {correct / max(seen, 1):.4f}")


4. Evaluation:
After training is complete, use the test dataset to evaluate the model's performance.

In [None]:
test_loader = DataLoader(test_dataset, batch_size=16)
model.eval()
test_predictions, test_true_vals = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = tuple(t.to(device) for t in batch)
        outputs = model(input_ids, attention_mask=attention_mask)

        # sigmoid(logit) > 0.5 means the positive class. The trailing dimension
        # is squeezed so each prediction lines up with its scalar label.
        batch_predictions = (torch.sigmoid(outputs.logits) > 0.5).squeeze(-1)
        test_predictions.extend(batch_predictions.long().cpu().tolist())
        test_true_vals.extend(labels.cpu().tolist())

# Compute evaluation metrics from the confusion counts of the positive class.
pred_tensor = torch.tensor(test_predictions, dtype=torch.long)
true_tensor = torch.tensor(test_true_vals, dtype=torch.long)

true_pos = int(((pred_tensor == 1) & (true_tensor == 1)).sum())
false_pos = int(((pred_tensor == 1) & (true_tensor == 0)).sum())
false_neg = int(((pred_tensor == 0) & (true_tensor == 1)).sum())
n_correct = int((pred_tensor == true_tensor).sum())

# Every ratio falls back to 0.0 when its denominator is empty, to avoid ZeroDivisionError.
test_accuracy = n_correct / max(len(test_true_vals), 1)
test_precision = true_pos / (true_pos + false_pos) if (true_pos + false_pos) else 0.0
test_recall = true_pos / (true_pos + false_neg) if (true_pos + false_neg) else 0.0
test_f1 = (
    2 * test_precision * test_recall / (test_precision + test_recall)
    if (test_precision + test_recall)
    else 0.0
)

{
    'accuracy': test_accuracy,
    'precision': test_precision,
    'recall': test_recall,
    'f1': test_f1,
}
