In [None]:
!pip install transformers

In [None]:
# Print the NVIDIA GPU status report to confirm an accelerator is attached to this runtime.
!nvidia-smi

In [5]:
import pandas as pd
import numpy as np

In [None]:
import torch
# Release cached, currently-unused GPU memory held by PyTorch's allocator before starting work.
torch.cuda.empty_cache()

In [6]:
# Location of the cleaned wine-review table, relative to this notebook.
# The file carries a saved index that loads as an `Unnamed: 0` column (see the preview below).
WINE_CSV_PATH = '../cleaned_wine_df.csv'
df = pd.read_csv(WINE_CSV_PATH)

In [7]:
# Preview the first three rows to check which columns were loaded.
df.head(3)

Unnamed: 0,country,description,points,price,province,region_1,title,variety,winery,year
0,Italy,"Aromas include tropical fruit, broom, brimston...",87,19.0,Sicily & Sardinia,Etna,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,2013
1,Portugal,"This is ripe and fruity, a wine that is smooth...",87,15.0,Douro,,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,2011
2,US,"Tart and snappy, the flavors of lime flesh and...",87,14.0,Oregon,Willamette Valley,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,2013


In [None]:
# 1. Preprocess the `description` and `points` columns

# Count missing values in the two columns the classifier depends on (displayed at the end of the cell)
missing_values = df[['description', 'points']].isnull().sum()

# Drop rows missing EITHER field. A missing `points` value would leave `rating_category`
# as NaN, which turns into category code -1 when the labels are encoded -- not a valid
# class index for the classification loss.
df = df.dropna(subset=['description', 'points'])

# 2. Transform the `points` column into categorical labels

# Bin edges and names: low = [0, 85], medium = (85, 90], high = (90, 100]
bins = [0, 85, 90, 100]
labels = ['low', 'medium', 'high']

# Create a new column 'rating_category' with the binned labels
# (`include_lowest=True` closes the first interval on the left so a score of 0 is still binned)
df['rating_category'] = pd.cut(df['points'], bins=bins, labels=labels, include_lowest=True)

# Scores outside [0, 100] match no bin and would silently become NaN; surface that here
# rather than as an invalid label during training.
assert df['rating_category'].notna().all(), "some `points` values fall outside the 0-100 bins"

missing_values, df[['description', 'rating_category']].head()


In [None]:
from transformers import BertTokenizer

# Load the pretrained tokenizer that matches the `bert-base-uncased` checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encoding options shared by every description:
#   - add the special tokens and return an attention mask alongside the ids
#   - truncate anything longer than `max_length` and pad everything shorter up to it,
#     so each row has the same length and the result can be returned as PyTorch tensors
encode_options = dict(
    add_special_tokens=True,
    return_attention_mask=True,
    truncation=True,
    padding='max_length',
    max_length=256,
    return_tensors='pt',
)

# Tokenize all descriptions in one batch call
encoded_data = tokenizer.batch_encode_plus(df['description'].values, **encode_options)

# Pull out the model inputs, and encode each rating category as its integer category code
input_ids, attention_masks = encoded_data['input_ids'], encoded_data['attention_mask']
labels = df['rating_category'].astype('category').cat.codes.values

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Work on detached copies of the encoded inputs so the tokenizer output itself is left untouched
input_ids, attention_masks = (
    encoded.clone().detach() for encoded in (input_ids, attention_masks)
)

# Class indices as a 64-bit integer tensor
labels = torch.tensor(labels, dtype=torch.long)

# Bundle everything so that each dataset item is (input_ids, attention_mask, label)
dataset = TensorDataset(input_ids, attention_masks, labels)

In [None]:
from sklearn.model_selection import KFold

# 10-fold cross-validation with shuffling. The seed is fixed so the fold assignment
# (and therefore every per-fold result) is reproducible across kernel restarts.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

In [None]:
# Print the NVIDIA GPU status report again, right before the training cell runs.
!nvidia-smi

In [None]:
from transformers import BertForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, f1_score

# This cell relies on names that are not defined anywhere in the visible notebook.
# Fail fast with a clear message instead of erroring after a BERT checkpoint has been loaded.
# NOTE(review): `TorchAdamW` is presumably `torch.optim.AdamW` under an alias, and
# `loss_function` presumably a callable taking (logits, labels) -- confirm and define them
# in a configuration cell near the top of the notebook.
_required_names = ('device', 'epochs', 'batch_size', 'loss_function',
                   'gradient_accumulation_steps', 'max_grad_norm', 'TorchAdamW')
_missing_names = [name for name in _required_names if name not in globals()]
if _missing_names:
    raise NameError(f"Define these in an earlier cell before training: {_missing_names}")

# One entry per fold: the validation metrics of that fold's best (checkpointed) epoch
validation_results = []

# Define early stopping parameters
patience = 3  # Number of epochs to wait for improvement before stopping

for fold, (train_index, val_index) in enumerate(kf.split(dataset), start=1):

    # Split the dataset into training and validation sets for this fold.
    # This must happen BEFORE the scheduler is created: the scheduler needs the number of
    # training batches of *this* fold, not of a previous fold (or of nothing at all).
    train_dataset = torch.utils.data.Subset(dataset, train_index)
    val_dataset = torch.utils.data.Subset(dataset, val_index)

    train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
    validation_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)
    num_train_batches = len(train_dataloader)

    # Re-initialize the model for each fold so no weights leak between folds
    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=len(df['rating_category'].cat.categories),
        output_attentions=False,
        output_hidden_states=False,
        problem_type='single_label_classification'
    )
    model.to(device)
    optimizer = TorchAdamW(model.parameters(), lr=2e-5)

    # The scheduler advances once per optimizer step, i.e. once per accumulation group
    # (ceil division: the trailing partial group also triggers a step, see below).
    optimizer_steps_per_epoch = -(-num_train_batches // gradient_accumulation_steps)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=optimizer_steps_per_epoch * epochs)

    best_fold_metrics = None  # Metrics of the best epoch so far in this fold (reset for each fold)
    epochs_without_improvement = 0  # To keep track of epochs without improvement

    # Training loop
    for epoch in range(epochs):

        model.train()
        model.zero_grad()  # Never carry gradients over from a previous epoch
        total_train_loss = 0.0
        for step, batch in enumerate(train_dataloader):
            batch_input_ids, batch_attention_mask, batch_labels = (t.to(device) for t in batch)

            # Labels are not passed to the model: the loss comes from `loss_function`,
            # so letting the model compute its own built-in loss would be wasted work.
            outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
            loss = loss_function(outputs.logits, batch_labels)

            # Log the unscaled loss so the reported average is comparable across
            # different gradient_accumulation_steps settings.
            total_train_loss += loss.item()

            # Gradient accumulation: scale so the summed gradients average over the group
            (loss / gradient_accumulation_steps).backward()

            # Update weights and learning rate at the end of every accumulation group,
            # and on the final batch so a trailing partial group is not silently dropped.
            is_last_batch = (step + 1) == num_train_batches
            if (step + 1) % gradient_accumulation_steps == 0 or is_last_batch:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                optimizer.step()
                scheduler.step()
                model.zero_grad()

        avg_train_loss = total_train_loss / num_train_batches
        print(f"Epoch {epoch + 1}/{epochs} - Training Loss: {avg_train_loss:.4f}")

        # Validation loop
        model.eval()
        predictions, true_vals = [], []
        for batch in validation_dataloader:
            batch_input_ids, batch_attention_mask, batch_labels = (t.to(device) for t in batch)

            with torch.no_grad():
                outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)

            logits = outputs.logits.detach().cpu().numpy()
            predictions.extend(np.argmax(logits, axis=1))
            true_vals.extend(batch_labels.cpu().numpy())

        # Calculate evaluation metrics
        accuracy = accuracy_score(true_vals, predictions)
        f1 = f1_score(true_vals, predictions, average='weighted')
        print(f"Validation Accuracy: {accuracy:.4f} - F1 Score: {f1:.4f}")

        # Check for F1 improvement; the first epoch always counts as an improvement so a
        # checkpoint exists for every fold even if F1 is 0.
        if best_fold_metrics is None or f1 > best_fold_metrics['F1']:
            best_fold_metrics = {'Accuracy': accuracy, 'F1': f1}
            epochs_without_improvement = 0  # Reset the counter
            torch.save(model.state_dict(), f"best_model_fold_{fold}.bin")
        else:
            epochs_without_improvement += 1

        # Check for early stopping
        if epochs_without_improvement >= patience:
            print(f"Early stopping after {epoch + 1} epochs!")
            break

        # Optionally clear GPU cache
        torch.cuda.empty_cache()

    # Record the metrics of the checkpointed (best-F1) epoch, not of whichever epoch ran
    # last -- after early stopping the last epoch is by construction not the best one.
    if best_fold_metrics is not None:
        validation_results.append(best_fold_metrics)

# Analyze validation_results after all folds are completed
print(validation_results)