In [1]:
from datasets import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split

TRAIN_DATA_PATH = '/Users/nikolaj/Desktop/datathon/review_chunks/score_training.csv'
use_cols = ["text", "yumminess_score"]

df = pd.read_csv(TRAIN_DATA_PATH, usecols=use_cols)
train_df, eval_df = train_test_split(df, test_size=0.2, random_state=42)


train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
eval_dataset = Dataset.from_pandas(eval_df.reset_index(drop=True))
print(df.head())

  from .autonotebook import tqdm as notebook_tqdm


                                                text  yumminess_score
0  While I was sad to see Bibo close at this loca...             0.50
1  Friendly baristas and great coffee. Since thei...             0.80
2  Love their coffee and vibe. The breakfast food...             0.90
3  It's like everything you try is your new favor...             1.00
4  This is probably one of my favorite boba place...             0.85


In [11]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

train_tokenized = train_dataset.map(
    lambda example: tokenizer(
        str(example["text"]),
        truncation=True,
        padding="max_length",
        max_length=128
    ),
    # remove_columns=["text", "yumminess_score"]
)
eval_tokenized = eval_dataset.map(
    lambda example: tokenizer(
        str(example["text"]),
        truncation=True,
        padding="max_length",
        max_length=128
    ),
    # remove_columns=["text", "yumminess_score"]
)

train_tokenized = train_tokenized.rename_column("yumminess_score", "labels")
eval_tokenized = eval_tokenized.rename_column("yumminess_score", "labels")
train_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
eval_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


Map: 100%|██████████| 240/240 [00:00<00:00, 3035.98 examples/s]
Map: 100%|██████████| 60/60 [00:00<00:00, 2591.82 examples/s]


In [12]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import accelerate
from torch import nn

class RegressionTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = nn.MSELoss()
        loss = loss_fct(logits.squeeze(), labels.squeeze())
        return (loss, outputs) if return_outputs else loss

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", 
    num_labels=1,  # Single output for regression
    problem_type="regression"  # Specify this is a regression task
)

# Configure training arguments with compatible options
training_args = TrainingArguments(
    output_dir="./yum_model",
    per_device_train_batch_size=16,
    num_train_epochs=3,
    logging_dir='./logs',
    save_steps=500,
)

# Add labels to the dataset
def add_labels(example):
    example['labels'] = example['yumminess_score']
    return example


# Use the custom RegressionTrainer instead of the standard Trainer
trainer = RegressionTrainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=eval_tokenized,
)

# Train the model
trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


TrainOutput(global_step=45, training_loss=0.06722126007080079, metrics={'train_runtime': 33.5446, 'train_samples_per_second': 21.464, 'train_steps_per_second': 1.341, 'total_flos': 23843706531840.0, 'train_loss': 0.06722126007080079, 'epoch': 3.0})

In [None]:
# THIS IS THE NEW UNTESTED CODE

predictions = trainer.predict(eval_tokenized)
pred_scores = predictions.predictions.squeeze()  # shape: (n_samples,)
true_scores = predictions.label_ids.squeeze()    # shape: (n_samples,)

import numpy as np

bins = [0.0, 0.33, 0.66, 1.0]
labels = ['low', 'medium', 'high']

pred_bins = pd.cut(pred_scores, bins=bins, labels=labels, include_lowest=True)
true_bins = pd.cut(true_scores, bins=bins, labels=labels, include_lowest=True)

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

cm = confusion_matrix(true_bins, pred_bins, labels=labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(cmap="Blues")


KeyboardInterrupt: 

In [18]:
from transformers import pipeline

classifier_yumminess = pipeline("text-classification", model=model, tokenizer=tokenizer)

review = "Wow, how have I not heard about this place before? I stumbled upon this place with some friends after trying to go to a (very disappointing) carnival. But this saved the day! The staff was super friendly, happily explaining all their options to us. The little walk up is cute and the options have a good variety from regular coffees, smoothies and fun flavors. The best is that you can pick iced, hot or blended for no additional charge! I love a good blended coffee, so I tried the snowy mocha - their version of a white chocolate mocha with ghiradelli chocolate. I hate super sweet drinks (Starbucks Frappuccinos are a little too much for me), but this drink was nicely balanced where you get some chocolate but still get a good coffee flavor. My friend said the same about her blended Mexican coffee. The pricing was fair for the size. Best part: you get a delicious little chocolate covered coffee bean on top! Extra coffee flavor! Definitely a good find!!"
classifier_yumminess(review)

Device set to use mps:0


[{'label': 'LABEL_0', 'score': 0.73695307970047}]

In [17]:
import joblib

# Assume classifier has been trained already
joblib.dump(classifier_yumminess, "classifier_yumminess.pkl")

['classifier_yumminess.pkl']