# AI4ALL - Summer Portfolio Project (Text Dataset)
Adapted from the [Hugging Face tutorial](https://huggingface.co/course/chapter3/3?fw=tf), which provides an excellent introduction to the field of [Natural Language Processing (NLP)](https://huggingface.co/course/chapter1/2?fw=tf) and an overview of the large language models based on the [Transformer Architecture](https://huggingface.co/course/chapter1/4?fw=tf).

To get started with this notebook, from the "Runtime" menu above, select "Change runtime type" to bring up "Notebook settings." Be sure that "Hardware accelerator" is set to "GPU." Run the following cells to install and run needed libraries for this notebook. 

In [None]:
!pip install datasets transformers[sentencepiece]

In [None]:
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding, TFAutoModelForSequenceClassification
import numpy as np
import pandas as pd
import tensorflow as tf

## Preprocess Dataset
**Required changes**
- Pass the name of your Hugging Face dataset as an argument to `load_dataset()`
- Update the `tokenize_function()` to indicate your input text field
- List your target `output_columns` as needed

Optional tuning
- Play with different values for `checkpoint` to see if accuracy improves
- Look into [other models](https://huggingface.co/transformers/v3.0.2/model_doc/auto.html)
- Adjust `batch_size` to see if processing time improves. Value should be a power of 2.

In [None]:
# Name of the Hugging Face dataset; the preprocessing cells below branch on this value.
dataset_name = "imdb" #@param ["imdb", "amazon_us_reviews"] {allow-input: true}
# Pretrained checkpoint name; used below to load both the tokenizer and the model.
checkpoint = "bert-base-uncased" #@param ["bert-base-uncased", "bert-base-multilingual-uncased", "distilbert-base-uncased", "roberta-base"] {allow-input: true}
# Other names to experiment with (NOTE(review): not all of these are verified to work with this notebook):
# "facebook/bart-large-mnli", "distilgpt2", "EleutherAI/gpt-neox-20b"
# "bert-base-multilingual-cased", "bert-base-multilingual-uncased"
# "bert", "roberta", "distilbert", "openai-gpt", "gpt2", "transfo-xl", "t5", "xlnet", "xlm", "ctrl"
# "bert-base-uncased", "bert-base-cased", "roberta-base", "roberta-large", "distilbert-base-uncased"
# Number of examples per batch when the tf.data datasets are built below.
batch_size =  8#@param {type:"integer"}

### Preprocess [IMDB dataset](https://huggingface.co/datasets/imdb)

In [None]:
# preprocess imdb dataset
def format_data(split_name, num_validation=2500, seed=42):
  """Convert one split of the global `raw_datasets` into the layout the model expects.

  The 'label' column is renamed to 'labels'. The 'test' split is additionally
  shuffled and divided: its first `num_validation` rows become a new
  'validation' split and the remainder stays as 'test'. Every other split is
  written back unchanged apart from the rename.

  Args:
    split_name: key of the split in `raw_datasets` to process.
    num_validation: number of rows taken from 'test' for 'validation'.
    seed: random seed for the shuffle, so the division is reproducible.
  """
  df = raw_datasets[split_name].to_pandas()
  # language models expect column named 'labels'
  df = df.rename(columns={'label': 'labels'})
  # decide on the function's own argument, not on a loop variable of the caller
  if split_name == 'test':
    # NOTE(review): the published IMDB splits appear to be ordered by label, so
    # slicing without shuffling would give a single-class validation set -- confirm.
    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)
    # divide test split between validation & test
    validation_df = df[:num_validation].reset_index(drop=True)
    test_df = df[num_validation:].reset_index(drop=True)
    raw_datasets['validation'] = Dataset.from_pandas(validation_df)
    raw_datasets[split_name] = Dataset.from_pandas(test_df)
  else:
    # keep train split as is
    raw_datasets[split_name] = Dataset.from_pandas(df)
  print(split_name,'\n', df.head(5))

if dataset_name == 'imdb':
  raw_datasets = load_dataset('imdb')
  splits = ['train','test']
  for split in splits:
    format_data(split)
  # remove 'unsupervised' split to avoid tokenizing
  raw_datasets.pop('unsupervised', None)

### Preprocess [Amazon dataset](https://huggingface.co/datasets/amazon_us_reviews)


In [None]:
# num_rows: 104975
# input features = ['review_headline', 'review_body']
# output target = ['star_rating']

if dataset_name == 'amazon_us_reviews':
  raw_datasets = load_dataset(dataset_name, "Mobile_Electronics_v1_00")
  split_name = 'train'
  df_raw = raw_datasets[split_name].to_pandas()

  target_name = 'star_rating'
  df_raw.rename(columns = {target_name:'labels'}, inplace = True)
  # fillna('') keeps whichever part of the review exists; without it a missing
  # headline or body would turn the whole concatenated text into NaN
  df_raw['text'] = df_raw['review_headline'].fillna('') + '\n' + df_raw['review_body'].fillna('')

  # explicit copy: the label shift below must edit df itself, not a slice of df_raw
  df = df_raw[['text', 'labels']].copy()
  # min = 1, max = 5, rescale to [0,4] for 5 labels
  df['labels'] = df['labels'] - 1

  # the last 4000 rows are held out: 2000 for validation, then 2000 for test
  num_rows = df.shape[0]
  num_valid = 2000
  num_test = 2000
  num_train = num_rows - num_valid - num_test
  if num_train <= 0:
    raise ValueError(f"need more than {num_valid + num_test} rows to split, got {num_rows}")

  raw_datasets['train'] = Dataset.from_pandas(df[:num_train])
  raw_datasets['validation'] = Dataset.from_pandas(df[num_train:num_train + num_valid])
  raw_datasets['test'] = Dataset.from_pandas(df[num_train + num_valid:])

  print(raw_datasets)
  print(df.head(10))

### Preprocess your dataset
Learn more about [processing data](https://huggingface.co/course/chapter3/2?fw=tf)

In [None]:
# add your code here
def format_data(split_name):
  """Rename the 'label' column of one split to 'labels' and print a preview.

  Reads the split from the global `raw_datasets`, writes the converted
  split back under the same key, and prints its first five rows.
  """
  frame = raw_datasets[split_name].to_pandas()
  # language models expect column named 'labels'
  frame = frame.rename(columns={'label': 'labels'})
  raw_datasets[split_name] = Dataset.from_pandas(frame)
  preview = frame.head(5)
  print(split_name,'\n', preview)

if dataset_name == 'your_dataset':
  raw_datasets = load_dataset(dataset_name)
  # every listed split is converted in turn
  splits = ['train','validation','test']
  for split in splits:
    format_data(split)

In [None]:
# fallback: if no earlier cell has created raw_datasets, load the dataset by name without any preprocessing
if 'raw_datasets' not in globals():
  raw_datasets = load_dataset(dataset_name)

### Tokenize Text
Learn more about [tokenizers](https://huggingface.co/course/chapter2/4?fw=tf)

In [None]:
### indicate text input field
def tokenize_function(batch):
    """Run the global tokenizer over the "text" field of a batch, with truncation enabled."""
    texts = batch["text"]
    return tokenizer(texts, truncation=True)

# tokenizer matching the chosen checkpoint; tokenize_function looks it up at call time
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# collator that pads each batch and returns TensorFlow tensors
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")
# apply the tokenizer to every split, in batches
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

In [None]:
print("tokenized_datasets", tokenized_datasets)
# show the first five tokenized validation examples
for i in range(5):
  example = tokenized_datasets['validation'][i]
  print(f"tokenized_datasets['validation'][{i}] {example}")

In [None]:
# column input and target names the language model expects, do not rename
expected_input_columns = ["attention_mask", "input_ids", "token_type_ids"]
# Keep only the inputs the tokenizer actually produced. Some checkpoints offered
# above (e.g. DistilBERT, RoBERTa) have tokenizers that emit no token_type_ids,
# and requesting a column that does not exist would fail.
available_columns = tokenized_datasets["train"].column_names
input_columns = [name for name in expected_input_columns if name in available_columns]
label_column = ["labels"]

def make_tf_dataset(split_name, shuffle):
  """Build a batched tf.data dataset from one tokenized split.

  Args:
    split_name: key of the split in `tokenized_datasets`.
    shuffle: whether the examples are shuffled.

  Returns:
    The result of `to_tf_dataset` for that split, using the shared input
    columns, label column, padding collator and batch size.
  """
  return tokenized_datasets[split_name].to_tf_dataset(
      columns=input_columns,
      label_cols=label_column,
      shuffle=shuffle,
      collate_fn=data_collator,
      batch_size=batch_size,
  )

# shuffle only the training data; validation order stays fixed
tf_train_dataset = make_tf_dataset("train", shuffle=True)
tf_validation_dataset = make_tf_dataset("validation", shuffle=False)

## Load & Train Model
**Required changes:**
- If there's more than 1 possible label (not binary), set `num_labels` to the number of different labels
- If each instance can include multiple labels, add a parameter `problem_type="multi_label_classification"`
- Select a `loss` function
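  - `BinaryCrossentropy` for binary classification with a single output (num_labels == 1), or for multi-label classification where each label is an independent yes/no
  - `SparseCategoricalCrossentropy` for multi-class classification with integer class labels (num_labels >= 2)
  - `CategoricalCrossentropy` for multi-class classification with one-hot encoded labels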

In [None]:
# Number of distinct label values; passed as num_labels= when the model is loaded below.
# The amazon_us_reviews cell above rescales its ratings to [0,4], i.e. 5 labels.
num_labels = 2 #@param {type:"integer"}


In [None]:
### if each target can contain multiple classes (floats label), add problem_type
#model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels, problem_type="multi_label_classification")
model = TFAutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=num_labels,
)

# from_logits=True: the loss is applied to the model's raw logits
# alternatives to swap in, see the "Select a loss function" notes above:
#loss_function=tf.keras.losses.BinaryCrossentropy(from_logits=True),
#loss_function=tf.keras.losses.CategoricalCrossentropy(from_logits=True)
loss_function = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
)

### Hyperparameter Tuning

In [None]:
# Number of epochs passed to model.fit() below.
num_epochs = 10#@param {type:"integer"}
# Number of batches per epoch passed to model.fit() below.
steps_per_epoch = 100#@param {type:"integer"}
# Starting learning rate of the PolynomialDecay schedule built below (it decays to 0.0).
initial_learning_rate = 5e-5 #@param {type:"number"}

In [None]:
# The learning rate should decay to zero over the steps that training actually runs.
# tf_train_dataset is a batched tf.data.Dataset, so its len() is the number of batches
# in one full pass over the data. model.fit() below is also given steps_per_epoch, so
# when that is set, an epoch runs at most that many batches.
batches_per_epoch = len(tf_train_dataset)
if steps_per_epoch:
  batches_per_epoch = min(batches_per_epoch, steps_per_epoch)
# at least one step, so the schedule never has a zero-length decay
num_train_steps = max(1, batches_per_epoch * num_epochs)
lr_scheduler = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=initial_learning_rate, end_learning_rate=0.0, decay_steps=num_train_steps
)

optimizer_function = tf.keras.optimizers.Adam(learning_rate=lr_scheduler)

model.compile(
    optimizer=optimizer_function,
    loss=loss_function,
    metrics=["accuracy"],
)
model.summary()
model.summary()

In [None]:
# log the run configuration as one comma-separated line before training starts
run_config = (dataset_name, checkpoint, batch_size, num_epochs, steps_per_epoch, initial_learning_rate, num_labels)
print(*run_config, sep=", ")

# train; the returned history holds the per-epoch metrics used by the cells below
fit_options = dict(
    validation_data=tf_validation_dataset,
    epochs=num_epochs,
    steps_per_epoch=steps_per_epoch,
)
history = model.fit(tf_train_dataset, **fit_options)

In [None]:
# one-line summary: run configuration followed by the last epoch's train and validation accuracy
final_accuracy = history.history['accuracy'][-1]
final_val_accuracy = history.history['val_accuracy'][-1]
print(f"{dataset_name}, {checkpoint}, {batch_size}, {num_epochs}, {steps_per_epoch}, {initial_learning_rate}, {final_accuracy}, {final_val_accuracy}")

## Evaluate Results

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs=range(len(acc))
plt.plot(epochs, acc, 'r', 'Training Accuracy')
plt.plot(epochs, val_acc, 'b', 'Validation Accuracy')
plt.title('Training and validation accuracy')
plt.figure()
plt.plot(epochs, loss, 'r', 'Training Loss')
plt.plot(epochs, val_loss, 'b', 'Validation Loss')
plt.title('Training and validation loss')
plt.figure()

In [None]:
from datasets import load_metric

# Build the test pipeline from the same column lists and batch size as the
# train/validation pipelines, keeping only input columns that exist in the
# tokenized test split (not every tokenizer emits token_type_ids).
test_column_names = tokenized_datasets["test"].column_names
test_input_columns = [name for name in input_columns if name in test_column_names]
tf_test_dataset = tokenized_datasets["test"].to_tf_dataset(
    columns=test_input_columns,
    label_cols=label_column,
    shuffle=False,  # keep row order so predictions line up with raw_datasets["test"]
    collate_fn=data_collator,
    batch_size=batch_size,
)
preds = model.predict(tf_test_dataset)["logits"]
# predicted class = index of the largest logit in each row
class_preds = np.argmax(preds, axis=1)
print(preds.shape, class_preds.shape)

metric = load_metric("accuracy")
metric.compute(predictions=class_preds, references=raw_datasets["test"]["labels"])

In [None]:
!pip install colorama

In [None]:
from colorama import Fore, Style
def print_results(pred, actual, text=None):
  """Print one test example followed by its colour-coded prediction.

  The prediction line is green when `pred` equals `actual` and red
  otherwise. If a global `class_names` exists, the class names are printed
  next to the numeric labels.

  Args:
    pred: predicted class index.
    actual: true class index.
    text: the example's text. When omitted, the text is looked up with the
      global loop index `i`, which is how this function originally worked.
  """
  if text is None:
    # backward-compatible fallback for callers that rely on the global index
    text = raw_datasets["test"][i]["text"]
  print(text)
  print(Fore.GREEN if pred == actual else Fore.RED , end ="")
  if ('class_names' in globals()):
    print(f'predicted: {pred} ({class_names[pred]}), actual: {actual} ({class_names[actual]})')
  else:
    print(f'predicted: {pred}, actual: {actual}\n')
  print(Style.RESET_ALL, end ="")

# examine errors
# for i in range(len(raw_datasets["test"])):
# show at most 50 examples, fewer if there are fewer predictions
num_examples = min(50, len(class_preds))
for i in range(num_examples):
  # fetch the row once and reuse it for both label and text
  example = raw_datasets["test"][i]
  pred = class_preds[i]
  actual = example["labels"]
  #if (pred != actual):
  print_results(pred, actual, example["text"])

## Save & Test Model
Save your fine-tuned model for later use.

In [None]:
# Directory the fine-tuned model is saved to, and reloaded from, in the cells below.
saved_path = "./my_tuned_model" #@param {type:"string"}

In [None]:
# Write the fine-tuned model to saved_path.
model.save_pretrained(saved_path)

In [None]:
# Reload through the auto class rather than through the in-memory `model`
# instance, so this cell also works when only the saved directory is available.
reloaded_model = TFAutoModelForSequenceClassification.from_pretrained(saved_path)

In [None]:
# Re-run the test-set evaluation with the reloaded model; the accuracy should
# match the one computed before saving.
reloaded_preds = reloaded_model.predict(tf_test_dataset)["logits"]
reloaded_class_preds = np.argmax(reloaded_preds, axis=1)
# report the shapes of the reloaded model's outputs (not the earlier class_preds)
print(reloaded_preds.shape, reloaded_class_preds.shape)

metric = load_metric("accuracy")
metric.compute(predictions=reloaded_class_preds, references=raw_datasets["test"]["labels"])

Notebook created by [ChengCheng Tan](mailto:ccstan99@gmail.com). Feedback welcomed!