In [None]:
'''
@author: akarra1
@author-email: akarra1@uci.edu
@project: Socrates AI Chatbot
@purpose: Train DistilBERT for Socrates
'''
!pip install transformers

In [None]:
import pandas as pd
from tqdm import tqdm
# from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments

In [None]:
# Read the train / test / validation splits, in that order, from the working directory.
posts_train_df, posts_test_df, posts_val_df = map(
    pd.read_csv,
    ("posts_train.csv", "posts_test.csv", "posts_val.csv"),
)

In [None]:
# Row count of each split, printed in train / test / validation order.
for _split_df in (posts_train_df, posts_test_df, posts_val_df):
    print(len(_split_df.index))

In [None]:
# Checkpoint name used to load the DistilBERT tokenizer on the next line.
transformer_name = 'distilbert-base-uncased' # to be moved to the configuration file
tokenizer = DistilBertTokenizer.from_pretrained(transformer_name)

In [None]:
# tokenizing one split (train, test or val) of the posts data
def tokenize_text(data_group, text_tokenizer=None, max_length=512):
  """Tokenize every post in ``data_group`` and collect its class label.

  Args:
    data_group: DataFrame with a 'post' column (text to encode) and a
      'class_id' column (target label). Rows are processed in order.
    text_tokenizer: object with an ``encode(text, max_length=..., padding=...,
      truncation=...)`` method. Defaults to the notebook-level ``tokenizer``.
    max_length: length every encoded post is padded / truncated to.

  Returns:
    (tokens, labels): two lists of equal length; ``tokens[i]`` is the list of
    token ids for post ``i`` and ``labels[i]`` is its ``class_id``.
  """
  encoder = tokenizer if text_tokenizer is None else text_tokenizer
  post_group_tokens = list()
  post_group_targetLabels = list()
  # Iterate the split that was passed in; the earlier version always looped
  # over posts_train_df, so test/val calls returned training data.
  for post, class_id in zip(data_group['post'], data_group['class_id']):
      # padding/truncation are requested explicitly so the tokenizer itself
      # produces exactly max_length ids (replaces pad_to_max_length + slicing).
      encoded_text = encoder.encode(post,
                                    max_length=max_length,
                                    padding='max_length',
                                    truncation=True)
      post_group_tokens.append(encoded_text)
      post_group_targetLabels.append(class_id)
  # Report a count instead of interpolating the whole DataFrame into the message.
  print(f"finish tokenizing {len(post_group_tokens)} posts")
  print("returning tokens and target labels")
  return post_group_tokens, post_group_targetLabels

In [None]:
# post train: tokenize the training split and preview the first few results
# (the slices below take the FIRST five entries, so the messages say "first").
post_train_tokens, post_train_targetLabels = tokenize_text(posts_train_df)
print("post_train_tokens first 5 of dataframe")
print(post_train_tokens[:5])
print('post_train_targetLabels first 5 of dataframe')
print(post_train_targetLabels[:5])

In [None]:
# Wrap the training-split token lists and labels in pandas Series, then show both.
post_train_token_series = pd.Series(data=post_train_tokens)
post_train_targetLabels_series = pd.Series(data=post_train_targetLabels)
for _shown in (post_train_token_series, post_train_targetLabels_series):
    print(_shown)

In [None]:
# post test: tokenize the test split and preview the first few results
# (the slices below take the FIRST five entries, so the messages say "first").
post_test_tokens, post_test_targetLabels = tokenize_text(posts_test_df)
print("post_test_tokens first 5 of dataframe")
print(post_test_tokens[:5])
print('post_test_targetLabels first 5 of dataframe')
print(post_test_targetLabels[:5])

In [None]:
# Wrap the test-split token lists and labels in pandas Series, then show both.
post_test_token_series = pd.Series(data=post_test_tokens)
post_test_targetLabels_series = pd.Series(data=post_test_targetLabels)
for _shown in (post_test_token_series, post_test_targetLabels_series):
    print(_shown)

In [None]:
# post val: tokenize the validation split and preview the first few results
# (the slices below take the FIRST five entries, so the messages say "first").
post_val_tokens, post_val_targetLabels = tokenize_text(posts_val_df)
print("post_val_tokens first 5 of dataframe")
print(post_val_tokens[:5])
print('post_val_targetLabels first 5 of dataframe')
print(post_val_targetLabels[:5])

In [None]:
# Wrap the validation-split token lists and labels in pandas Series, then show both.
post_val_token_series = pd.Series(data=post_val_tokens)
post_val_targetLabels_series = pd.Series(data=post_val_targetLabels)
for _shown in (post_val_token_series, post_val_targetLabels_series):
    print(_shown)

In [None]:
# Number of tokenized entries collected for the training split.
len(post_train_tokens)

In [None]:
# Number of tokenized entries collected for the test split.
len(post_test_tokens)

In [None]:
# Number of tokenized entries collected for the validation split.
len(post_val_tokens)

In [None]:
import torch
class RedditPostsDataset(torch.utils.data.Dataset):
  """Map-style dataset pairing tokenizer encodings with per-sample labels.

  ``encodings`` must expose ``.items()`` yielding ``(feature_name, values)``
  pairs in which ``values[i]`` belongs to sample ``i``; ``labels[i]`` is the
  label of that same sample.
  """

  def __init__(self, encodings, labels):
    self.labels = labels
    self.encodings = encodings

  def __len__(self):
    # The dataset size is taken from the labels container.
    return len(self.labels)

  def __getitem__(self, index):
    """Return sample ``index`` as a dict of tensors, with its label under 'labels'."""
    sample = {}
    for feature_name, values in self.encodings.items():
      sample[feature_name] = torch.tensor(values[index])
    sample['labels'] = torch.tensor(self.labels[index])
    return sample

In [None]:
def _build_encodings(token_rows):
    """Wrap per-post token ids in the mapping RedditPostsDataset iterates over.

    RedditPostsDataset.__getitem__ calls ``encodings.items()`` and indexes each
    value by sample position, so it needs ``{feature_name: per-sample rows}``.
    A pandas Series of token lists does not fit: its ``items()`` yields one
    entry per post instead of one per feature.
    """
    rows = list(token_rows)
    pad_id = tokenizer.pad_token_id
    return {
        'input_ids': rows,
        # 1 marks a real token, 0 marks a position holding the pad token id
        'attention_mask': [[int(token_id != pad_id) for token_id in row] for row in rows],
    }


train_dataset = RedditPostsDataset(_build_encodings(post_train_tokens), post_train_targetLabels)
val_dataset = RedditPostsDataset(_build_encodings(post_val_tokens), post_val_targetLabels)
test_dataset = RedditPostsDataset(_build_encodings(post_test_tokens), post_test_targetLabels)

Model Set Up and Development - Training and Testing

In [None]:
# Training configuration handed to the Trainer.
# These are the parameters we will be adjusting to find the optimal configuration.
training_args = TrainingArguments(
    # where results and logs go, and how often to log
    output_dir='outputs/results',
    logging_dir='./logs',
    logging_steps=10,
    # schedule length and batch sizes
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # learning-rate warmup and weight decay
    warmup_steps=500,
    weight_decay=0.01,
)

In [None]:
# Load DistilBERT with a sequence-classification head configured for 5 labels
# (presumably one per distinct class_id in the posts data — TODO confirm).
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=5)

In [None]:
# Bundle the model, its training arguments, and the train / validation datasets
# into a single Trainer instance.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

In [None]:
# Run fine-tuning with the Trainer instance built in the previous cell.
# train - might use tqdm module for this to track progress
trainer.train()

In [None]:
# evaluate the trained model on the Trainer's eval_dataset
# (method name fixed: `evalute` does not exist and raised AttributeError)
trainer.evaluate()

Saving and Loading DistilBERT for Socrates

In [None]:
# Save relative to the working directory, like the other output directories
# used in this notebook ('outputs/results', './logs'), instead of the
# filesystem root, which is usually not writable.
save_directory = './saved_models'
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

Loading the Fine-Tuned DistilBERT for Inference Testing

In [None]:
# Reload the tokenizer and the classifier from save_directory.
tokenizer_fine_tuned = DistilBertTokenizer.from_pretrained(save_directory)
model_fine_tuned = DistilBertForSequenceClassification.from_pretrained(save_directory)

In [None]:
# Take the raw text of the first test post. The inference cell passes
# test_text to the tokenizer, which expects text, not the already-encoded
# token ids stored in post_test_tokens.
test_text = posts_test_df['post'].iloc[0]
test_text

In [None]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
# Load the tokenizer and the classifier from save_directory for the PyTorch inference cell.
tokenizer_fine_tuned_pt = DistilBertTokenizer.from_pretrained(save_directory)
model_fine_tuned_pt = DistilBertForSequenceClassification.from_pretrained(save_directory)

In [None]:
# Encode the sample as PyTorch tensors. The keyword is `return_tensors`
# (plural); the misspelled `return_tensor` is not a tokenizer argument.
predict_input_pt = tokenizer_fine_tuned_pt(test_text,
                                           truncation=True,
                                           padding=True,
                                           return_tensors='pt')

# Unpack the encoding so its entries are passed as keyword arguments, and
# skip gradient tracking since this is inference only.
with torch.no_grad():
    output_pt = model_fine_tuned_pt(**predict_input_pt)
# output_pt[0] holds the per-class scores; take the index of the largest one.
prediction_value_pt = torch.argmax(output_pt[0], dim=1).item()

In [None]:
# Index of the highest-scoring class for test_text.
prediction_value_pt

Fine-tuning with native PyTorch

In [None]:
from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification, AdamW

In [None]:
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# num_labels=5 matches the classifier created for the Trainer earlier in this
# notebook; without it the library's default label count is used, which does
# not match the class ids in the data.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=5)
model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):
    for batch in train_loader:
        # clear gradients accumulated by the previous step
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        # when labels are supplied, the first output is the loss
        loss = outputs[0]
        loss.backward()
        optim.step()

# switch back to evaluation mode once training is done
model.eval()

source for distilbert classification: https://huggingface.co/transformers/v3.2.0/custom_datasets.html

source for fine-tuning distilbert -- youtube video:
https://www.youtube.com/watch?v=ZvsH09XGuZ0

huggingface distilbert model training and validation: https://huggingface.co/transformers/v3.0.2/model_doc/distilbert.html

fast.ai: https://www.fast.ai/


**AMAZON SYNE TUNE: HYPERPARAMETER TUNING JOB (AFTER TRAINING IS FINISHED)**

In [None]:
!pip install 'syne-tune[extra]'
# or get the latest version from git:
git clone https://github/com/awslabs/syne-tune.git
cd syne-tune
python3 -m venv st_venv
. st_venv/bin/activate
pip install --upgrade pip
pip install -e '.[extra]'
# remember to activate this environment before working with SyneTune
# we are building this venv from scratch now and then, in particular when you pull
# a new release, as dependenices may have changed

In [None]:
# train_height_simple.py
import logging, time
from syne_tune import Reporter
from argparse import ArgumentParser

# Script entry point: report a synthetic loss to Syne Tune once per step.
if __name__ == "__main__":
  root = logging.getLogger()
  root.setLevel(logging.INFO)
  parser = ArgumentParser()
  # '--epochs' is accepted as an alias for '--steps' because the tuning config
  # space in this notebook supplies the step budget under the key 'epochs'
  # (presumably forwarded to the script as a command-line flag — confirm).
  # The arguments are required so a missing value fails with a clear argparse
  # message instead of a TypeError further down.
  parser.add_argument('--steps', '--epochs', dest='steps', type=int, required=True)
  parser.add_argument('--width', type=float, required=True)
  parser.add_argument('--height', type=float, required=True)
  # parse_known_args ignores any extra flags the launcher may add
  args, _ = parser.parse_known_args()
  report = Reporter()
  for step in range(args.steps):
    time.sleep(0.1)
    dummy_score = 1.9 / (0.1 + args.width * step / 100) + args.height * 0.1
    # feed the score back to syne tune
    report(epoch=step + 1, mean_loss=dummy_score)

In [None]:
# we can launch a tuning job as follows:
# launch_height_simple.py
# (the stopping-criterion class exported by syne_tune is StoppingCriterion)
from syne_tune import Tuner, StoppingCriterion
from syne_tune.backend import LocalBackend
from syne_tune.config_space import randint
from syne_tune.optimizer.baselines import ASHA

In [None]:
# Search space for the tuning job: 'width' and 'height' are drawn with
# randint(1, 20); 'epochs' is a fixed value.
config_space = dict(
    width=randint(1, 20),
    height=randint(1, 20),
    epochs=100,
)

In [None]:
# Imported here so this cell is self-contained: the stopping-criterion class
# exported by syne_tune is StoppingCriterion (`Stopping` was an undefined name).
from syne_tune import StoppingCriterion

tuner = Tuner(
    # NOTE(review): the training script cell in this notebook is labelled
    # train_height_simple.py — confirm which file name actually exists on disk.
    trial_backend=LocalBackend(entry_point='train_height.py'),
    scheduler=ASHA(
        config_space,
        metric='mean_loss',            # name the training script reports its score under
        resource_attr='epoch',         # name the training script reports its step counter under
        max_resource_attr='epochs',    # config_space key holding the maximum step count
        search_options={'debug_log': False},
    ),
    stop_criterion=StoppingCriterion(max_wallclock_time=30),
    n_workers=4, # how many trials are evaluated in parallel
)

In [None]:
# Launch the tuning job defined by `tuner`.
tuner.run()