In [None]:
#!pip install datasets


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
#pip install kaggle



In [None]:
from google.colab import files

# Bind the result instead of leaving the call as the cell's last expression:
# files.upload() returns a {filename: file bytes} dict, and echoing it would
# write the Kaggle API key into the notebook's saved output.
uploaded = files.upload()  # Choose your kaggle.json file to upload

# Confirm what was received by name only -- never print the file contents.
print(f"Uploaded: {list(uploaded)}")

Saving kaggle.json to kaggle.json


{'kaggle.json': b'<redacted - credential removed from saved output; revoke and regenerate this Kaggle API key>'}

In [None]:
# Download the 'arhamrumi/amazon-product-reviews' dataset archive with the Kaggle CLI.
# The zip is saved into the current working directory (/content in the recorded run).
# NOTE(review): presumably the CLI authenticates with the uploaded kaggle.json -- confirm it is in a location the CLI reads.
!kaggle datasets download -d arhamrumi/amazon-product-reviews


Dataset URL: https://www.kaggle.com/datasets/arhamrumi/amazon-product-reviews
License(s): CC0-1.0
Downloading amazon-product-reviews.zip to /content
 92% 105M/115M [00:01<00:00, 83.9MB/s] 
100% 115M/115M [00:01<00:00, 69.2MB/s]


In [None]:
!unzip amazon-product-reviews.zip

Archive:  amazon-product-reviews.zip
  inflating: Reviews.csv             


In [None]:
#!pip install peft transformers




# **Import Libraries**
# -------------------------------------------------
# Importing necessary libraries for data handling, model setup, training, and evaluation.
# - pandas: For data manipulation
# - sklearn: For train-test splitting and evaluation metrics
# - transformers: For BERT tokenizer, model, and trainer
# - peft: For applying Low-Rank Adaptation (LoRA) to the model
# - datasets: For creating dataset objects
# - torch: For PyTorch backend

In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from peft import get_peft_model, LoraConfig, TaskType
from datasets import Dataset
import torch

# **Read and Sample Data**
# -------------------------------------------------
# Reads review data from a CSV file and samples 500 rows for faster processing.
# - Adjust the file path as necessary to match your data location.
# - Only keeps 'Text' (review content) and 'Score' (rating) columns.
# - Drops any rows with missing values.

In [None]:
# Load the complete review table from disk (update the path as necessary).
df = pd.read_csv('/content/Reviews.csv')

# Build the working subset in one chain:
#   1. draw a reproducible random sample of 500 rows,
#   2. keep only the review text ('Text') and the rating ('Score'),
#   3. discard rows where either of those is missing.
df_500 = (
    df.sample(n=500, random_state=42)[['Text', 'Score']]
      .dropna()
)

# **Convert Ratings to Binary Labels**
# -------------------------------------------------
# Converts 'Score' ratings into binary labels:
# - Reviews with a score >= 3 are labeled as 1 (positive).
# - Reviews with a score < 3 are labeled as 0 (negative).

In [None]:
# Binarize the rating: scores of 3 and above become 1 (positive),
# anything lower becomes 0 (negative).
df_500['label'] = df_500['Score'].ge(3).astype('int64')

# **Split Data into Train and Test Sets**
# -------------------------------------------------
# Splits the data into training and test sets using an 80-20 split.
# - Random state ensures reproducibility of the split.
# - Converts text and labels to lists for easy tokenization.

In [None]:
# Pull the review texts and their binary labels out as plain Python lists.
review_texts = df_500['Text'].tolist()
review_labels = df_500['label'].tolist()

# Hold out 20% of the reviews for testing; the fixed seed keeps the split reproducible.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    review_texts,
    review_labels,
    test_size=0.2,
    random_state=42,
)

# **Tokenize the Data**
# -------------------------------------------------
# Tokenizes the training and test text using the BERT tokenizer.
# - Padding and truncation are applied to handle variable-length sequences.
# - Maximum length is set to 512 to fit within BERT's limits.

In [None]:
# Tokenizer matching the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Both splits are encoded with the same settings: truncate and pad, capped at 512 tokens.
tokenize_kwargs = dict(truncation=True, padding=True, max_length=512)

train_encodings = tokenizer(train_texts, **tokenize_kwargs)
test_encodings = tokenizer(test_texts, **tokenize_kwargs)

# **Convert to HuggingFace Dataset Format**
# -------------------------------------------------
# Converts the tokenized data to HuggingFace Dataset format, required by the Trainer.
# - Includes 'input_ids', 'attention_mask', and 'label' fields.

In [None]:
def _to_hf_dataset(encodings, labels):
    """Bundle tokenizer output and labels into a HuggingFace ``Dataset``.

    Args:
        encodings: Tokenizer output providing 'input_ids' and 'attention_mask'.
        labels: Labels stored under the 'label' column.

    Returns:
        A ``Dataset`` with 'input_ids', 'attention_mask' and 'label' columns.
    """
    columns = {
        'input_ids': encodings['input_ids'],
        'attention_mask': encodings['attention_mask'],
        'label': labels,
    }
    return Dataset.from_dict(columns)


# One dataset per split, in the format the Trainer consumes.
train_dataset = _to_hf_dataset(train_encodings, train_labels)
test_dataset = _to_hf_dataset(test_encodings, test_labels)


# **Load BERT Model for Sequence Classification**
# -------------------------------------------------
# Loads a pre-trained BERT model with a sequence classification head.
# - num_labels is set to 2 for binary classification.

In [None]:
# Load the pre-trained 'bert-base-uncased' checkpoint with a 2-class sequence classification head.
# The classifier weights are not part of the checkpoint and are newly initialized
# (see the warning in the run output), so the model must be fine-tuned before it is used for predictions.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# **LoRA Configuration for Model Adaptation**
# -------------------------------------------------
# Configures and applies Low-Rank Adaptation (LoRA) to the model.
# - LoRA improves parameter efficiency by freezing the pre-trained weights and training only small low-rank update matrices added alongside them.
# - Relevant parameters include `r` (rank), `lora_alpha` (scaling factor), and `lora_dropout` (dropout rate).

In [None]:
# Adapter settings for Low-Rank Adaptation (LoRA) on a sequence classification task:
#   r            -- rank of the low-rank matrices
#   lora_alpha   -- scaling factor for LoRA updates
#   lora_dropout -- dropout rate during training
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

# Wrap the classifier with the LoRA adapters. `model` is rebound to the wrapped
# model, so this cell should be run once per freshly loaded base model.
model = get_peft_model(model=model, peft_config=lora_config)

# **Define Training Arguments**
# -------------------------------------------------
# Sets up the training arguments for the HuggingFace Trainer.
# - Configures output directory, evaluation strategy, learning rate, batch sizes, number of epochs, and weight decay.


In [None]:
# Training configuration for the HuggingFace Trainer.
training_args = TrainingArguments(
    output_dir='./results',               # Output directory for results
    evaluation_strategy="epoch",          # Evaluate after each epoch
    logging_strategy="epoch",             # Also log training loss each epoch; otherwise the results table shows "No log"
    learning_rate=2e-5,                   # Learning rate
    per_device_train_batch_size=8,        # Batch size for training
    per_device_eval_batch_size=8,         # Batch size for evaluation
    num_train_epochs=3,                   # Number of training epochs
    weight_decay=0.01,                    # Weight decay
    logging_dir=None,                     # No explicit log directory is set here
    report_to="none",                     # Turn off logging integrations explicitly; the WANDB_DISABLED env var is deprecated
)

# **Initialize Trainer with Model, Arguments, and Datasets**
# -------------------------------------------------
# Sets up the HuggingFace Trainer with the model, training arguments, datasets, and tokenizer.
# Defines custom metrics function for accuracy and F1-score.

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        eval_pred: Object supplied by the Trainer exposing ``predictions``
            (per-class scores; argmax over the last axis gives the predicted
            class) and ``label_ids`` (the reference labels).

    Returns:
        dict with keys 'accuracy' and 'f1'.
    """
    # Take the argmax once and reuse it for both metrics.
    predicted = eval_pred.predictions.argmax(axis=-1)
    # scikit-learn metrics take (y_true, y_pred) in that order.
    return {
        'accuracy': accuracy_score(eval_pred.label_ids, predicted),
        'f1': f1_score(eval_pred.label_ids, predicted),
    }


# Trainer setup: model, training configuration, both splits, tokenizer and metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# **Train the Model**
# -------------------------------------------------
# Starts the training process for the model using the defined Trainer setup.

In [None]:
# Train the model
# Runs fine-tuning with the arguments and datasets given to the Trainer;
# with evaluation_strategy="epoch" the eval set is scored after every epoch.
trainer.train()

# **Evaluate the Model**
# -------------------------------------------------
# Evaluates the model on the test dataset and outputs the accuracy and F1-score.

In [None]:
# Evaluate the model
# Scores the Trainer's eval_dataset (the held-out test split) and returns a metrics
# dict: eval_loss, eval_accuracy, eval_f1 plus runtime statistics.
trainer.evaluate()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.462132,0.85,0.918919
2,No log,0.432726,0.85,0.918919
3,No log,0.427272,0.85,0.918919


{'eval_loss': 0.4272715449333191,
 'eval_accuracy': 0.85,
 'eval_f1': 0.918918918918919,
 'eval_runtime': 114.7804,
 'eval_samples_per_second': 0.871,
 'eval_steps_per_second': 0.113,
 'epoch': 3.0}