In [1]:
%pip install transformers datasets
from transformers import pipeline
from datasets import load_dataset
import pandas as pd
import torch

# Load the dataset
dataset = load_dataset('yelp_review_full')
df = dataset['train'].to_pandas()  

# Create a text summarization pipeline using the BART model
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Initialize an empty list to store summaries
summaries = []

# Dynamically adjust max_length and summarize the text
for text in df['text'].iloc[:150].tolist():
    input_length = len(text.split())  # Estimate the number of words in the input
    max_length = min(130, max(30, int(input_length * 0.5)))  # Dynamically adjust max_length
    summary = summarizer(text, max_length=max_length, min_length=30, truncation=True)[0]['summary_text']
    summaries.append(summary)

# Create a new dataframe to store the original texts and their summaries
# Make sure to also copy the star rating information, assuming in the original dataset the star rating is stored in the 'label' column
df_summary = df.iloc[:150].copy()  # Copy the first 150 rows
df_summary['summary'] = summaries  # Add the summaries to the new dataframe
df_summary['stars'] = df['label'].iloc[:150]  # Copy the corresponding star ratings

print(df_summary[['text', 'summary', 'stars']])


^C
[31mERROR: Operation cancelled by user[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


KeyboardInterrupt: 

Data Preparation

In [None]:
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import numpy as np
import torch

class YelpReviewDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels - 1  # Convert labels from 1-5 to 0-4

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# Prepare the data
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
X = df_summary['summary'].tolist()  # Summary texts
y = df_summary['stars'].to_numpy()  # Star ratings

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=2021)

train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(X_val, truncation=True, padding=True, max_length=512)

train_dataset = YelpReviewDataset(train_encodings, y_train)
val_dataset = YelpReviewDataset(val_encodings, y_val)


Model Definition, Training and Evaluation.

In [None]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()
trainer.evaluate()



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/45 [00:00<?, ?it/s]

{'loss': 1.2102, 'grad_norm': 5.436811923980713, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.67}
{'loss': 1.2112, 'grad_norm': 5.261388301849365, 'learning_rate': 2.0000000000000003e-06, 'epoch': 1.33}
{'loss': 1.1035, 'grad_norm': 5.03201150894165, 'learning_rate': 3e-06, 'epoch': 2.0}
{'loss': 1.143, 'grad_norm': 5.402960777282715, 'learning_rate': 4.000000000000001e-06, 'epoch': 2.67}
{'train_runtime': 23.2161, 'train_samples_per_second': 15.506, 'train_steps_per_second': 1.938, 'train_loss': 1.1458648787604437, 'epoch': 3.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 1.2915648221969604,
 'eval_runtime': 0.5079,
 'eval_samples_per_second': 59.072,
 'eval_steps_per_second': 7.876,
 'epoch': 3.0}


# Summary of Training Results

1.  **5 entries:**      Train Loss = 1.176, Eval Loss = 1.600
2.  **15 entries:**     Train Loss = 1.354, Eval Loss = 0.736
3.  **30 entries:**     Train Loss = 1.299, Eval Loss = 1.176
4.  **100 entries:**    Train Loss = 1.239, Eval Loss = 1.178
5.  **150 entries:**    Train Loss = 1.134, Eval Loss = 1.209

# Data Selection and Summarization Process

**Data Selection:**

**Code:** df['text'].iloc[:150].tolist()

**Description:** Selects the first 150 entries from the 'text' column of the dataframe df. The .iloc[:150] is used to slice the first 150 rows of the dataframe, facilitating targeted data analysis.

**Summarization Process:**

**Operation:** Iteration over the first 150 text entries.

**Details:** For each entry, the script assesses the input length, adjusts the maximum summary length accordingly, and generates a summary using the BART model.

# Ideal and Specific Ranges for Loss Values

**Ideal Range:**

**Close to Zero:** Indicates accurate model predictions. However, exactly zero might suggest overfitting.

**Consistency Between Train and Eval Loss:** A small gap suggests good model generalization across training and validation datasets.

**Specific Range:**

**Loss Value:** Ranges from 0.01 to 0.5 for simpler tasks, and 1 to 10 or higher for complex tasks.

**Difference:** Ideally, the difference between training and evaluation loss should not exceed 10% to 20% of the training loss.

# Analysis of Specific Cases

**Case Study: Result #2**

**Observation:** Evaluation loss (0.736) is lower than training loss (1.354).

**Possible Explanations:**

1. **Strong Generalization:**   Model may generalize exceptionally well to validation data.

2. **Data Distribution:**       Training set could be more challenging than evaluation set.

3. **Evaluation Set Size:**     Smaller or less diverse evaluation set might not fully represent dataset difficulty.

4. **Overfitting Avoidance:**   Effective regularization or early stopping.
    
5. **Random Variation:**        Random factors influencing loss values.

# Model's Loss Values and Their Implications

**Training Loss:** 1.1339557965596516

**Implication:** Indicates the model's average error in recognizing patterns during training, reasonable for a complex NLP task.

**Evaluation Loss:** 1.2087987661361694

**Implication:** Slightly higher than training loss, typical in scenarios where the model is tested on unseen data.

# Judging Loss Values

**Low vs. High:** Training and evaluation losses below 1.0 are typically good; values between 1.0 and 2.0 may still be acceptable depending on the task and dataset.

**Monitoring:** It's crucial to track the trend of loss values, aiming for a gradual decrease over time without a widening gap between training and evaluation losses to avoid overfitting.