In [None]:
!pip install -qq datasets evaluate transformers
!pip install -qq wandb

In [20]:
import wandb
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()
WANDB_API_KEY = user_secrets.get_secret("wandb_key")

WANDB_ENTITY='amir7d0'
WANDB_PROJECT='sentiment-analysis'


wandb.login(key=WANDB_API_KEY)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

## 1. Business / Research Understanding Phase

The goal of this sentiment analysis project is to provide insights into the sentiment of customers towards products on Amazon. By analyzing customer reviews, we can identify trends and patterns in customer sentiment, which can be used to improve product development, marketing strategies, and customer service. Specifically, we will be analyzing the Amazon review dataset, which contains millions of reviews of various products. Our analysis will focus on identifying the sentiment of these reviews, categorizing them as positive, negative, or neutral, and examining the factors that contribute to customer sentiment. This information can be valuable to businesses looking to improve their products and customer experience, as well as researchers interested in understanding consumer behavior and sentiment.

Amazon product reviews dataset contains reviews in English, Japanese, German, French, Chinese, and Spanish, collected between November 1, 2015, and November 1, 2019. Each record in the dataset contains the review text, the review title, the star rating, an anonymized reviewer ID, an anonymized product ID, and the coarse-grained product category (e.g. ‘books’, ‘appliances’, etc.) The corpus is balanced across stars, so each star rating constitutes 20% of the reviews in each language.

For each language, there are 200,000, 5,000, and 5,000 reviews in the training, development, and test sets respectively. The maximum number of reviews per reviewer is 20 and the maximum number of reviews per product is 20. All reviews are truncated after 2,000 characters, and all reviews are at least 20 characters long.

* In this project we use English language reviews, so we have 200,000 records for the training, 5000 for the validation, and 5000 for the test set.
* There are two tables available for EDA: "eda_table" which contains the entire dataset of 210,000 records, and "eda_table_sample" which is a sample of 10,000 records from the dataset.
* For this project, we use the "review_body" column for input text and the "stars" column for labels which is five classes.

## 2. Data Preparation/Understanding Phase

In [None]:
from datasets import get_dataset_config_names, load_dataset_builder, load_dataset

DATASET_PATH = "amazon_reviews_multi"
DATASET_CONFIG = 'en'


In [None]:
configs = get_dataset_config_names(DATASET_PATH)
print(configs)

In [None]:
ds_builder = load_dataset_builder(DATASET_PATH, DATASET_CONFIG)

print(ds_builder.info.description)
print('dataset splits: ', ds_builder.info.splits)
print('dataset features:', ds_builder.info.features)

In [None]:
dataset = load_dataset(DATASET_PATH, DATASET_CONFIG)

In [None]:
dataset

## 3. Exploratory Data Analysis Phase

In [None]:
run = wandb.init(project=WANDB_PROJECT, job_type='upload-dataset')
artifact = wandb.Artifact(name='amazon_reviews_english', type='dataset')


In [None]:
files = [dataset.cache_files[key][0]['filename'] for key in dataset.cache_files]

for path in files:
    artifact.add_file(local_path=path)


In [None]:
features = ds_builder.info.features

for key, value in features.items():
    print(key, value)

In [None]:
import pandas as pd

df = pd.DataFrame()
for key in dataset:
    df_tmp = dataset[key].to_pandas()
    df_tmp['split'] = key
    df = pd.concat([df, df_tmp])

df.shape

In [None]:
df_sample = df[df['split'] == 'train'].sample(1000)

table = wandb.Table(columns=list(df_sample.columns), data=df_sample)
artifact.add(table, "eda_table_sample")

In [None]:
table = wandb.Table(columns=list(df.columns), data=df)
wandb.Table.MAX_ROWS = df.shape[0] + 1000
artifact.add(table, "eda_table")

In [None]:
run.log_artifact(artifact)
run.finish()

## 4. Setup Phase

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

import numpy as np
import pandas as pd

DATASET_PATH = "amazon_reviews_multi"
DATASET_CONFIG = 'en'
TEXT_COL = 'review_body'
LABEL_COL = 'stars'

MODEL_NAME = "distilbert-base-uncased"
MAX_LEN = 128
MODEL_DIR = "HF_Model"

BATCH_SIZE = 8
EPOCHS = 1
LEARNING_RATE = 5e-5

In [None]:

raw_datasets = load_dataset(DATASET_PATH, DATASET_CONFIG)
raw_datasets = raw_datasets.rename_columns({TEXT_COL: 'text', LABEL_COL: 'labels'})
drop_columns = list(set(raw_datasets["train"].column_names) - set(['text', 'labels']))
raw_datasets = raw_datasets.remove_columns(drop_columns)

# find number of classes and map 1-5 stars to a range of 0 to 4
number_of_classes = len(pd.unique(raw_datasets['train']['labels']))
class_map = dict(zip(pd.unique(raw_datasets['train']['labels']), 
                     pd.unique(raw_datasets['train']['labels'])-1))

raw_datasets = raw_datasets.map(lambda example: {'labels': class_map[example['labels']]})
raw_datasets

In [None]:

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenization(example):
    return tokenizer(example['text'], max_length=MAX_LEN, padding=True, truncation=True)

tokenized_datasets = raw_datasets.map(tokenization, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['text'])
tokenized_datasets["train"].column_names

### TensorFlow / Keras

In [None]:
# create tensorflow dataset
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids"],
    label_cols=["labels"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=BATCH_SIZE,
)

tf_validation_dataset = tokenized_datasets["validation"].to_tf_dataset(
    columns=["attention_mask", "input_ids"],
    label_cols=["labels"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=BATCH_SIZE,
)

tf_test_dataset = tokenized_datasets["test"].to_tf_dataset(
    columns=["attention_mask", "input_ids"],
    label_cols=["labels"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=BATCH_SIZE,
)

In [None]:
from transformers import TFAutoModelForSequenceClassification
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

# run on multiple GPUs
gpus = tf.config.list_logical_devices('GPU')
strategy = tf.distribute.MirroredStrategy(gpus)
print("Num GPUs Available: ", strategy.num_replicas_in_sync)
with strategy.scope():
    model = TFAutoModelForSequenceClassification.from_pretrained(MODEL_NAME, 
                                                             num_labels=number_of_classes)

    model.compile(
        optimizer=Adam(LEARNING_RATE),
        loss=SparseCategoricalCrossentropy(from_logits=True),
        metrics=["accuracy"],
    )

In [None]:
import wandb
from wandb.keras import WandbMetricsLogger

train_config = {
    "model": f"{MODEL_NAME}-finetuned", 
    "batch_size": BATCH_SIZE,
    "epochs": EPOCHS,
    "learning_rate": LEARNING_RATE,
    "pretrained": True,
}

run = wandb.init(project=WANDB_PROJECT, job_type="training", config=train_config)

In [None]:
# Train the model for one epoch
model.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    epochs=EPOCHS,
    callbacks=[WandbMetricsLogger(log_freq='batch')]
)

In [None]:
label_names = ['1 star', '2 stars', '3 stars', '4 stars', '5 stars']

model.config.id2label = {str(i): label for i, label in enumerate(label_names)}
model.config.label2id = {label: str(i) for i, label in enumerate(label_names)}

In [None]:
wandb.finish(quiet=True)

## 5. Modeling Phase

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

import numpy as np
import pandas as pd


DATASET_PATH = "amazon_reviews_multi"
DATASET_CONFIG = 'en'
TEXT_COL = 'review_body'
LABEL_COL = 'stars'

MODEL_NAME = "distilbert-base-uncased"
MAX_LEN = 128
MODEL_DIR = "HF_Model"

BATCH_SIZE = 8
EPOCHS = 1
LEARNING_RATE = 5e-5

In [None]:
raw_datasets = load_dataset(DATASET_PATH, DATASET_CONFIG)
raw_datasets = raw_datasets.rename_columns({TEXT_COL: 'text', LABEL_COL: 'labels'})
drop_columns = list(set(raw_datasets["train"].column_names) - set(['text', 'labels']))
raw_datasets = raw_datasets.remove_columns(drop_columns)

# find number of classes and map 1-5 stars to a range of 0 to 4
number_of_classes = len(pd.unique(raw_datasets['train']['labels']))
class_map = dict(zip(pd.unique(raw_datasets['train']['labels']), 
                     pd.unique(raw_datasets['train']['labels'])-1))

raw_datasets = raw_datasets.map(lambda example: {'labels': class_map[example['labels']]})

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
def tokenization(example):
    return tokenizer(example['text'], max_length=MAX_LEN, padding=True, truncation=True)

tokenized_datasets = raw_datasets.map(tokenization, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['text'])
print(tokenized_datasets["train"].column_names)

In [None]:
def get_dataset(batch_size=8, columns=["attention_mask", "input_ids"]):
    # create tensorflow dataset
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")
    
    tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
        columns=columns,
        label_cols=["labels"],
        shuffle=True,
        collate_fn=data_collator,
        batch_size=batch_size,
    )

    tf_validation_dataset = tokenized_datasets["validation"].to_tf_dataset(
        columns=columns,
        label_cols=["labels"],
        shuffle=False,
        collate_fn=data_collator,
        batch_size=batch_size,
    )

    tf_test_dataset = tokenized_datasets["test"].to_tf_dataset(
        columns=columns,
        label_cols=["labels"],
        shuffle=False,
        collate_fn=data_collator,
        batch_size=batch_size,
    )
    return tf_train_dataset, tf_validation_dataset, tf_test_dataset

In [None]:
from transformers import TFAutoModelForSequenceClassification, AutoConfig
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from wandb.keras import WandbMetricsLogger, WandbModelCheckpoint
import tensorflow as tf
import os


os.makedirs(MODEL_DIR, exist_ok=True)
model_config_path = f'{MODEL_DIR}/config.json'
model_config = AutoConfig.from_pretrained(MODEL_NAME, num_labels=5)
label_names = ['1 star', '2 stars', '3 stars', '4 stars', '5 stars']

model_config.id2label = {str(i): label for i, label in enumerate(label_names)}
model_config.label2id = {label: i for i, label in enumerate(label_names)}
with open(model_config_path, 'w') as f:
    model_config.to_json_file(model_config_path)


def train():
    default_config = {
        'dataset_name': DATASET_PATH,
        'model_name': MODEL_NAME,
        'max_length': MAX_LEN,
        'model_dir': MODEL_DIR,
        'batch_size': BATCH_SIZE,
        'learning_rate': LEARNING_RATE,
        'epochs': EPOCHS,
    }
    
    wandb.init(job_type='hyperparam-tuning', config=default_config)
    wandb.save(model_config_path)
    config = wandb.config
    
    train_dataset, validation_dataset, test_dataset = get_dataset(batch_size=config.batch_size)
    
    num_train_steps = len(train_dataset) * config.epochs
    lr_scheduler = PolynomialDecay(
        initial_learning_rate=config.learning_rate, end_learning_rate=0.0, 
        decay_steps=num_train_steps
    )

    # run on multiple GPUs
    gpus = tf.config.list_logical_devices('GPU')
    strategy = tf.distribute.MirroredStrategy(gpus)
    print("Num GPUs Available: ", strategy.num_replicas_in_sync)
    with strategy.scope():
        model = TFAutoModelForSequenceClassification.from_pretrained(config.model_name, 
                                                                 num_labels=number_of_classes)
        model.compile(optimizer=Adam(learning_rate=lr_scheduler), 
                      loss=SparseCategoricalCrossentropy(from_logits=True), 
                      metrics=["accuracy"])
    # Train the model for one epoch
    model.fit(
        train_dataset.take(500),
        validation_data=validation_dataset.take(100),
        epochs=config.epochs,
        callbacks=[
            WandbMetricsLogger(log_freq='batch'), 
            WandbModelCheckpoint(filepath=f'{wandb.run.dir}/{config.model_dir}/tf_model.h5', save_best_only=True, save_weights_only=True),
        ]
    )
    # model.save_pretrained(f'{wandb.run.dir}/HF_Model/')
    
    

In [None]:

sweep_configuration = {
    "name": "hyperparam-tuning-distilbert",
    "metric": {"name": "epoch/val_accuracy", "goal": "maximize"},
    "method": "grid",
    "parameters": {
        'batch_size': {'values': [8, 32]},
        'learning_rate': {'values': [5e-5, 1e-4]},
        'epochs': {'values': [1, 3]},
    },
}

sweep_id = wandb.sweep(sweep_configuration, project=WANDB_PROJECT, entity=WANDB_ENTITY)

# run the sweep
wandb.agent(sweep_id, function=train, count=8)
wandb.finish(quiet=True)

In [None]:
import wandb
api = wandb.Api()

sweep = api.sweep(f"{WANDB_ENTITY}/{WANDB_PROJECT}/{sweep_id}")
metric_name = sweep_configuration['metric']['name']

runs = sorted(sweep.runs, key=lambda run: run.summary.get(metric_name, 0), reverse=True)
metric_value = runs[0].summary.get(metric_name, 0)
print(f"Best run {runs[0].name} with {metric_name} = {metric_value}")

runs[0].file(f'{MODEL_DIR}/model.h5').download(replace=True)
runs[0].file(f'{MODEL_DIR}/config.json').download(replace=True)

print(f"Best Model files downloaded to ./{MODEL_DIR}/")

In [None]:
## convert tf_model.h5 to model.bin
# from transformers import AutoModelForSequenceClassification, AutoConfig, TFAutoModelForSequenceClassification

# tf_model = TFAutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
# config = AutoConfig.from_pretrained(MODEL_NAME)
# pt_model = AutoModelForSequenceClassification.from_config(config)
# pt_model = load_tf2_weights_in_pytorch_model(pt_model, tf_model.weights)
# pt_model.push_to_hub(MODEL_NAME)

## 6. Evaluation Phase

Fine-tune for 5 epochs with optimal hyperparameters.

In [5]:
from transformers import pipeline
from datasets import load_dataset


DATASET_PATH = "amazon_reviews_multi"
DATASET_CONFIG = "en"
TEXT_COL = "review_body"
LABEL_COL = "stars"

MODEL_NAME = "amir7d0/distilbert-base-uncased-finetuned-amazon-reviews"
MAX_LEN = 128

In [6]:
from transformers import AutoTokenizer
import pandas as pd

raw_datasets = load_dataset(DATASET_PATH, DATASET_CONFIG)
raw_datasets = raw_datasets.rename_columns({TEXT_COL: 'text', LABEL_COL: 'labels'})
drop_columns = list(set(raw_datasets["train"].column_names) - set(['text', 'labels']))
raw_datasets = raw_datasets.remove_columns(drop_columns)

# find number of classes and map 1-5 stars to a range of 0 to 4
number_of_classes = len(pd.unique(raw_datasets['train']['labels']))
class_map = dict(zip(pd.unique(raw_datasets['train']['labels']), 
                     pd.unique(raw_datasets['train']['labels'])-1))

raw_datasets = raw_datasets.map(lambda example: {'labels': class_map[example['labels']]})

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
def tokenization(example):
    return tokenizer(example['text'], max_length=MAX_LEN, padding=True, truncation=True)

tokenized_datasets = raw_datasets.map(tokenization, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['text'])
print(tokenized_datasets["train"].column_names)

Downloading builder script:   0%|          | 0.00/2.74k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.62k [00:00<?, ?B/s]

Downloading and preparing dataset amazon_reviews_multi/en (download: 82.11 MiB, generated: 58.69 MiB, post-processed: Unknown size, total: 140.79 MiB) to /root/.cache/huggingface/datasets/amazon_reviews_multi/en/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/82.0M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.06M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.05M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/200000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5000 [00:00<?, ? examples/s]

Dataset amazon_reviews_multi downloaded and prepared to /root/.cache/huggingface/datasets/amazon_reviews_multi/en/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/200000 [00:00<?, ?ex/s]

  0%|          | 0/5000 [00:00<?, ?ex/s]

  0%|          | 0/5000 [00:00<?, ?ex/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/360 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

  0%|          | 0/200 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

['labels', 'input_ids', 'attention_mask']


In [28]:
%env WANDB_PROJECT=sentiment-analysis
%env WANDB_LOG_MODEL=end

wandb.init(project=WANDB_PROJECT, job_type="evaluation", tags=['staging'])

env: WANDB_PROJECT=sentiment-analysis
env: WANDB_LOG_MODEL=end


[34m[1mwandb[0m: Currently logged in as: [33mamir7d0[0m. Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


### Trainer

In [10]:
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

Downloading (…)lve/main/config.json:   0%|          | 0.00/827 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
from transformers import Trainer
from sklearn.metrics import accuracy_score
from datasets import load_metric
import numpy as np

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)


training_args = TrainingArguments("fine-tune-trainer", 
                                  overwrite_output_dir=True,
                                  evaluation_strategy='epoch',
                                  save_strategy="epoch",
                                  lr_scheduler_type='linear',
                                  learning_rate=2e-5,
                                  num_train_epochs=5, report_to='wandb')

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
trainer.train()
wandb.finish()

In [12]:
preds_dev = trainer.predict(tokenized_datasets['validation'])[0]
preds_test = trainer.predict(tokenized_datasets['test'])[0]

***** Running Prediction *****
  Num examples = 5000
  Batch size = 16
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


***** Running Prediction *****
  Num examples = 5000
  Batch size = 16


In [14]:
import numpy as np
from sklearn.metrics import accuracy_score, top_k_accuracy_score

y_true = np.array(tokenized_datasets['validation']['labels'])
y_score = preds_dev
print('Accuracy (exact) on Dev Set: ', accuracy_score(y_true, y_score.argmax(1)))

y_true = np.array(tokenized_datasets['test']['labels'])
y_score = preds_test
print('Accuracy (exact) on Test Set: ', accuracy_score(y_true, y_score.argmax(1)))

Accuracy on Dev Set:  0.5696
Accuracy on Test Set:  0.5736


In [15]:
import numpy as np
from sklearn.metrics import accuracy_score, top_k_accuracy_score

y_true = np.array(tokenized_datasets['validation']['labels'])
y_score = preds_dev
print('Accuracy (off-by-1) on Dev Set: ', top_k_accuracy_score(y_true, y_score, k=2))

y_true = np.array(tokenized_datasets['test']['labels'])
y_score = preds_test
print('Accuracy (off-by-1) on Test Set: ', top_k_accuracy_score(y_true, y_score, k=2))

Accuracy (off-by-1) on Dev Set:  0.855
Accuracy (off-by-1) on Test Set:  0.8558


In [17]:
model.push_to_hub('amir7d0/distilbert-base-uncased-finetuned-amazon-reviews')

Configuration saved in /tmp/tmpyvqpfm6d/config.json
Model weights saved in /tmp/tmpyvqpfm6d/pytorch_model.bin
Uploading the following files to amir7d0/distilbert-base-uncased-finetuned-amazon-reviews: pytorch_model.bin,config.json


Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/amir7d0/distilbert-base-uncased-finetuned-amazon-reviews/commit/a0d548c51b9f0e74a112fa3c014087169d1c53d2', commit_message='Upload DistilBertForSequenceClassification', commit_description='', oid='a0d548c51b9f0e74a112fa3c014087169d1c53d2', pr_url=None, pr_revision=None, pr_num=None)

## 7. Deployment Phase

In [None]:
## You can deploy the model in HuggingFace spaces.
## copy src/app.py in app.py then commit and push files: 

# git clone https://huggingface.co/spaces/<username>/<space-name>

# git add app.py
# git commit -m "Add application file"
# git push