In [6]:
%pip install transformers datasets
from transformers import pipeline
from datasets import load_dataset
import pandas as pd
import torch

# Load the dataset
dataset = load_dataset('yelp_review_full')
df = dataset['train'].to_pandas()  

# Create a text summarization pipeline using the BART model
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Initialize an empty list to store summaries
summaries = []

# Dynamically adjust max_length and summarize the text
for text in df['text'].iloc[:5].tolist():
    input_length = len(text.split())  # Estimate the number of words in the input
    max_length = min(130, max(30, int(input_length * 0.5)))  # Dynamically adjust max_length
    summary = summarizer(text, max_length=max_length, min_length=30, truncation=True)[0]['summary_text']
    summaries.append(summary)

# Create a new dataframe to store the original texts and their summaries
# Make sure to also copy the star rating information, assuming in the original dataset the star rating is stored in the 'label' column
df_summary = df.iloc[:5].copy()  # Copy the first 5 rows
df_summary['summary'] = summaries  # Add the summaries to the new dataframe
df_summary['stars'] = df['label'].iloc[:5]  # Copy the corresponding star ratings

print(df_summary[['text', 'summary', 'stars']])


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.
                                                text  \
0  dr. goldberg offers everything i look for in a...   
1  Unfortunately, the frustration of being Dr. Go...   
2  Been going to Dr. Goldberg for over 10 years. ...   
3  Got a letter in the mail last week that said D...   
4  I don't know what Dr. Goldberg was like before...   

                                             summary  stars  
0   dr. goldberg offers everything i look for in ...      4  
1  The frustration of being Dr. Goldberg's patien...      1  
2  I've been going to Dr. Goldberg for over 10 ye...      3  
3  Dr. Goldberg is moving to Arizona to take a ne...      3  
4  Dr. Goldberg is only interested in the co-pay ...      0  


Data Preparation

In [7]:
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import numpy as np
import torch

class YelpReviewDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels - 1  # Convert labels from 1-5 to 0-4

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# Prepare the data
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
X = df_summary['summary'].tolist()  # Summary texts
y = df_summary['stars'].to_numpy()  # Star ratings

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=2021)

train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(X_val, truncation=True, padding=True, max_length=512)

train_dataset = YelpReviewDataset(train_encodings, y_train)
val_dataset = YelpReviewDataset(val_encodings, y_val)


Model Definition, Training and Evaluation.

In [8]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()
trainer.evaluate()



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/3 [00:00<?, ?it/s]

{'train_runtime': 9.2247, 'train_samples_per_second': 1.301, 'train_steps_per_second': 0.325, 'train_loss': 1.2871328194936116, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4001028537750244,
 'eval_runtime': 1.8604,
 'eval_samples_per_second': 0.538,
 'eval_steps_per_second': 0.538,
 'epoch': 3.0}