In [None]:
!pip install transformers

In [None]:
# Shell out to `nvidia-smi` to show which GPU (if any) is attached to this runtime
# and how much of its memory is currently in use.
!nvidia-smi

In [None]:
import numpy as np
from gensim.models import Word2Vec
import pandas as pd

In [None]:
import torch
# Ask PyTorch's CUDA caching allocator to release cached blocks that are not in use,
# so memory left over from earlier experiments in this kernel is returned to the GPU.
torch.cuda.empty_cache()

In [None]:
# Location of the cleaned wine dataset in a Google Cloud Storage bucket.
# NOTE(review): reading a `gs://` URL with pandas relies on the optional `gcsfs`
# package and on GCS credentials being available in this runtime - confirm both.
csv_path = "gs://vino-verdict/data/cleaned_wine_df.csv"

# Load the whole CSV into a DataFrame; later cells read its `description` and `points` columns.
df = pd.read_csv(csv_path)

In [None]:
# Preview the first three rows to sanity-check that the file loaded as expected.
df.head(3)

In [None]:
# Count missing values in the two columns the classifier depends on, before dropping rows.
missing_values = df[['description', 'points']].isnull().sum()

# Drop rows that lack either the review text or the score. A missing `points` value would
# otherwise become a NaN rating category here and a NaN class label in the tokenization cell.
# `.copy()` makes the result an independent frame so the column assignment below is unambiguous.
df = df.dropna(subset=['description', 'points']).copy()

# Transform the `points` column into three ordered classes:
#   [0, 85] -> low, (85, 90] -> medium, (90, 100] -> high
bins = [0, 85, 90, 100]
labels = ['low', 'medium', 'high']
df['rating_category'] = pd.cut(df['points'], bins=bins, labels=labels, include_lowest=True)

# `pd.cut` yields NaN for any score outside [0, 100]; fail here with a clear message
# rather than letting unlabeled rows reach the training data.
assert df['rating_category'].notna().all(), "found `points` values outside the 0-100 range"

# Display the missing-value counts together with a preview of the text/label pairs.
missing_values, df[['description', 'rating_category']].head()

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split

In [None]:
# Load the WordPiece tokenizer that matches the `bert-base-uncased` checkpoint,
# the same checkpoint name the classifier is loaded from later in the notebook.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Tokenize every description in one batched call instead of a per-row `encode_plus` loop
# followed by `torch.cat`. Each text is truncated or padded to exactly 64 tokens, so both
# returned tensors have shape (number_of_rows, 64).
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True` flag.
encoded = tokenizer(
    df['description'].tolist(),
    add_special_tokens=True,
    max_length=64,
    truncation=True,
    padding='max_length',
    return_attention_mask=True,
    return_tensors='pt',
)
input_ids = encoded['input_ids']
attention_masks = encoded['attention_mask']

# Convert rating categories to integers (0: low, 1: medium, 2: high).
# `astype('int64')` raises if any row has a missing/unmapped category, so bad labels are
# caught here; the explicit `torch.long` dtype is what the classification loss expects.
labels = (
    df['rating_category']
    .map({'low': 0, 'medium': 1, 'high': 2})
    .astype('int64')
    .to_numpy()
)
labels = torch.tensor(labels, dtype=torch.long)

In [None]:
# Split data into train (70%), validation (20%) and test (remaining ~10%) sets.
dataset = TensorDataset(input_ids, attention_masks, labels)
train_size = int(0.7 * len(dataset))
val_size = int(0.2 * len(dataset))
# The test set takes whatever is left so the three sizes always sum to len(dataset).
test_size = len(dataset) - train_size - val_size

# Seed the split explicitly: without a fixed generator the membership of each subset
# changes on every run, so validation/test accuracy would not be reproducible.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset, test_dataset = random_split(
    dataset,
    [train_size, val_size, test_size],
    generator=split_generator,
)

print(f"Train size: {train_size}, Validation size: {val_size}, Test size: {test_size}")

In [None]:
# Load pretrained BERT with a fresh 3-way classification head (low / medium / high).
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Use the GPU when one is present and fall back to CPU otherwise; a hard-coded 'cuda'
# raises on machines without a CUDA device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

In [None]:
BATCH_SIZE = 32

# Training batches are drawn in random order each epoch; validation batches keep dataset
# order. `shuffle=True` / `shuffle=False` select the same RandomSampler / SequentialSampler
# the DataLoader would otherwise be given explicitly.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Use PyTorch's AdamW rather than the deprecated `transformers.AdamW`.
# `eps=1e-6` and `weight_decay=0.0` are the defaults of the deprecated class, passed
# explicitly because torch's own defaults differ (eps=1e-8, weight_decay=0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# Linearly decay the learning rate to zero over the full run, with no warmup.
# The factor 3 is the epoch count and must match the epoch loop in the training cell.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_dataloader) * 3,  # 3 epochs
)

In [None]:
# Fine-tune the classifier for three epochs on whichever device is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.train()  # training mode (the later evaluation cells switch to eval mode)

for epoch in range(3):
    # Per-batch losses for this epoch, collected so the epoch mean can be reported.
    batch_losses = []

    for batch in train_dataloader:
        # Each batch is (input_ids, attention_mask, labels); move all three to the device.
        b_input_ids, b_input_mask, b_labels = [t.to(device) for t in batch]

        # Clear gradients left over from the previous step before the new forward pass.
        model.zero_grad()

        # Labels are passed in, so the first element of the output is the loss.
        outputs = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
            labels=b_labels,
        )
        loss = outputs[0]
        batch_losses.append(loss.item())

        loss.backward()
        # Clip the global gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()  # the learning-rate schedule advances once per batch

    total_train_loss = sum(batch_losses)
    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}, Training Loss: {avg_train_loss}")

In [None]:
# Measure accuracy on the validation set.
model.eval()  # evaluation mode, e.g. dropout is disabled

# Count correct predictions and examples seen. Averaging per-batch accuracies instead
# would over-weight the final batch whenever it is smaller than BATCH_SIZE.
val_correct = 0
val_total = 0
for batch in val_dataloader:
    b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
    with torch.no_grad():  # no gradients are needed for evaluation
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    # No labels are passed, so the first element of the output is the logits.
    logits = outputs[0]
    predictions = logits.argmax(dim=1)
    val_correct += (predictions == b_labels).sum().item()
    val_total += b_labels.size(0)

val_accuracy = val_correct / val_total
print(f"Validation Accuracy: {val_accuracy}")

In [None]:
# Measure accuracy on the held-out test set.
model.eval()  # evaluation mode, e.g. dropout is disabled

# Build the test loader once, in dataset order, instead of constructing a second loader
# only to read its length.
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Count correct predictions and examples seen. Averaging per-batch accuracies instead
# would over-weight the final batch whenever it is smaller than BATCH_SIZE.
test_correct = 0
test_total = 0
for batch in test_dataloader:
    b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
    with torch.no_grad():  # no gradients are needed for evaluation
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    # No labels are passed, so the first element of the output is the logits.
    logits = outputs[0]
    predictions = logits.argmax(dim=1)
    test_correct += (predictions == b_labels).sum().item()
    test_total += b_labels.size(0)

test_accuracy = test_correct / test_total
print(f"Test Accuracy: {test_accuracy}")