<h1 align="center">
  <a href="https://uptrain.ai">
    <img width="300" src="https://user-images.githubusercontent.com/108270398/214240695-4f958b76-c993-4ddd-8de6-8668f4d0da84.png" alt="uptrain">
  </a>
</h1>

<h1 style="text-align: center;">Fine-tuning a Large-Language Model</h1>

### Install Required packages
- [PyTorch](https://pytorch.org/get-started/locally/): Deep learning framework.
- [Hugging Face Transformers](https://huggingface.co/docs/transformers/installation): To use pretrained state-of-the-art models.
- [Hugging Face Datasets](https://pypi.org/project/datasets/): Use public Hugging Face datasets
- [IPywidgets](https://ipywidgets.readthedocs.io/en/stable/user_install.html): For interactive notebook widgets
- [UpTrain](https://github.com/uptrain-ai/uptrain): Use UpTrain to refine, monitor, check for distribution shifts and a whole lot more with your ML models

In [1]:
!pip3 install torch 'transformers[torch]' datasets ipywidgets uptrain nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers[torch]
  Downloading transformers-4.26.0-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m83.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.9.0-py3-none-any.whl (462 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m462.8/462.8 KB[0m [31m39.7 MB/s[0m eta [36m0:00:00[0m
Collecting uptrain
  Downloading uptrain-0.0.3-py3-none-any.whl (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.3/48.3 KB[0m [31m997.3 kB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m111.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=

In [169]:
import collections
import json
import math
import nltk
import random
import torch
import transformers
import uptrain

import numpy as np
import pandas as pd

from datasets import (
  load_dataset
)

from transformers import (
  AutoModelForMaskedLM, AutoTokenizer,
  DataCollatorForLanguageModeling, TrainingArguments, Trainer,
  default_data_collator, pipeline
)


# Restrict the transformers library's own logging to error-level messages
transformers.logging.set_verbosity_error()

# Removing imports to work without uploading files to Colab on every
# new instance and instead copying the files as cells

# from model_constants import *
# from model_train import retrain_model
# from helper_funcs import *

In [170]:
# Download the vader_lexicon package to use SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

# NOTE(review): SentimentIntensityAnalyzer is not referenced in the visible cells
# of this notebook - confirm it is still needed before keeping this import.
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [171]:
# model_constants.py

# Hugging Face checkpoint used for the tokenizer, the fill-mask pipeline and the model
model_checkpoint = "distilbert-base-uncased"
# Number of tokens in each chunk produced by group_texts
chunk_size = 128
# Per-word masking probability used by whole_word_masking_data_collator
wwm_probability = 0.2
# Passed to train_test_split as train_size / test_size.
# The trailing comments record alternative values (presumably absolute sample counts - TODO confirm).
train_size = 0.9 # 100
test_size = 0.1 # int(0.1 * train_size)
# Per-device batch size for both training and evaluation
batch_size = 64
# Masking probability handed to DataCollatorForLanguageModeling
mlm_probability = 0.15

### Test the non-fine-tuned model to get an idea of the basic mask-filling capability of `distilbert-base-uncased`

In [172]:
# Fill-mask pipeline built directly from the base checkpoint (no fine-tuning applied yet)
unmasker = pipeline('fill-mask', model = model_checkpoint)

In [173]:
# Ask the base model to fill in the masked word and pretty-print its candidates
test_text = "Nike shoes are very [MASK]."
print(json.dumps(unmasker(test_text), indent = 2))

[
  {
    "score": 0.3408825397491455,
    "token": 2759,
    "token_str": "popular",
    "sequence": "nike shoes are very popular."
  },
  {
    "score": 0.22603100538253784,
    "token": 6450,
    "token_str": "expensive",
    "sequence": "nike shoes are very expensive."
  },
  {
    "score": 0.10909274965524673,
    "token": 25634,
    "token_str": "durable",
    "sequence": "nike shoes are very durable."
  },
  {
    "score": 0.021332889795303345,
    "token": 2691,
    "token_str": "common",
    "sequence": "nike shoes are very common."
  },
  {
    "score": 0.011532995849847794,
    "token": 6625,
    "token_str": "comfortable",
    "sequence": "nike shoes are very comfortable."
  }
]


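There was a bug in the implementation of `test_model` in helper_funcs.py that caused a `RuntimeError` when running the notebook on a `cuda` backend: the model had been moved to the GPU, but the tokenized inputs were still on the `cpu`.

A simple fix was to move the inputs from the `cpu` to the `cuda` device, so that they are on the same device as the model.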

In [174]:
# helper_funcs.py

# Tokenizer for the base checkpoint; the helper functions in this cell read it as a global
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_function(examples):
    """Tokenize the "text" column of a batch with the global `tokenizer`.

    When the tokenizer is a fast tokenizer, a "word_ids" entry is added that
    holds, for every sequence in the batch, the result of `word_ids(i)`.

    Args:
        examples: Batch mapping that contains a "text" entry.

    Returns:
        The tokenizer output, optionally extended with "word_ids".
    """
    encoded = tokenizer(examples["text"])
    if not tokenizer.is_fast:
        # Slow tokenizers do not provide word ids, so there is nothing to add
        return encoded

    sequence_count = len(encoded["input_ids"])
    encoded["word_ids"] = list(map(encoded.word_ids, range(sequence_count)))
    return encoded

def group_texts(examples):
    """Concatenate every column of a tokenized batch and re-split it into chunks.

    Each column (a list of per-example lists) is flattened into one long list
    and cut into consecutive pieces of `chunk_size` items. A trailing remainder
    shorter than `chunk_size` is dropped. A "labels" column is added as a
    (shallow) copy of the chunked "input_ids".

    Args:
        examples: Batch mapping of column name -> list of lists. Must contain
            an "input_ids" column.

    Returns:
        Dict with the same columns, chunked, plus "labels".
    """
    # Flatten each column in a single pass. The previous `sum(lists, [])`
    # re-copied the accumulated list on every addition (quadratic in batch size).
    concatenated_examples = {
        key: [item for sequence in examples[key] for item in sequence]
        for key in examples.keys()
    }
    # Compute length of concatenated texts (taken from the first column)
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of chunk_size
    result = {
        key: [values[start : start + chunk_size] for start in range(0, total_length, chunk_size)]
        for key, values in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

def whole_word_masking_data_collator(features):
    """Mask whole words at random in every feature, then collate the batch.

    For each feature the "word_ids" entry is removed and used to group token
    positions by word. Each word is selected with probability
    `wwm_probability`; every token of a selected word is replaced in
    "input_ids" by the tokenizer's mask token id and keeps its original label,
    while all other label positions are set to -100. The features are modified
    in place.

    Args:
        features: List of dicts with "word_ids", "input_ids" and "labels".

    Returns:
        The result of `default_data_collator(features)`.
    """
    for example in features:
        token_word_ids = example.pop("word_ids")

        # Group token positions by a running word counter; positions whose
        # word id is None are not assigned to any word.
        word_to_positions = collections.defaultdict(list)
        word_counter = -1
        previous_word = None
        for position, word_id in enumerate(token_word_ids):
            if word_id is None:
                continue
            if word_id != previous_word:
                previous_word = word_id
                word_counter += 1
            word_to_positions[word_counter].append(position)

        # One Bernoulli draw per word decides whether that word is masked
        selection = np.random.binomial(1, wwm_probability, (len(word_to_positions),))
        token_ids = example["input_ids"]
        original_labels = example["labels"]
        masked_labels = [-100] * len(original_labels)
        for chosen_word in np.flatnonzero(selection).tolist():
            for position in word_to_positions[chosen_word]:
                masked_labels[position] = original_labels[position]
                token_ids[position] = tokenizer.mask_token_id
        example["labels"] = masked_labels

    return default_data_collator(features)

def test_model(model, text):
    # The original line below doesn't work when using cuda as runtime and
    # PyTorch throws a Runtime Error
    # Setting the inputs device to 'cuda' fixes the issue

    # inputs = tokenizer(text, return_tensors="pt")
    inputs = tokenizer(text, return_tensors="pt").to('cuda')
    
    token_logits = model(**inputs).logits
    # Find the location of [MASK] and extract its logits
    mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
    mask_token_logits = token_logits[0, mask_token_index, :]
    # Pick the [MASK] candidates with the highest logits
    top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
    return [tokenizer.decode([token]) for token in top_5_tokens]

def create_sample_dataset(save_file_name):
    """Write a synthetic 4000-sentence dataset to `save_file_name` as JSON.

    For each of 1000 indices, one sentence per brand (Nike, Adidas, Puma,
    Bata) is generated with two randomly chosen product words; every sample
    gets label 0.

    Args:
        save_file_name: Path of the JSON file to write.

    Returns:
        `save_file_name`, unchanged.
    """
    random_words = ["shoes", "jeans", "tshirts", "sweaters", "pants", "hoodies", "socks", "football"]
    brands = ("Nike", "Adidas", "Puma", "Bata")

    samples = []
    for idx in range(1000):
        for brand in brands:
            # The two words are drawn left to right, first then second
            first_word = random.choice(random_words)
            second_word = random.choice(random_words)
            samples.append({
                "text": f"Sample {100 * idx} training sample - {brand} {first_word} and {second_word}",
                "label": 0,
            })

    payload = {
        "version": "0.1.0",
        "source": "sample",
        "url": "self-generated",
        "data": samples,
    }
    with open(save_file_name, 'w') as out_file:
        json.dump(payload, out_file)
    return save_file_name

def create_dataset_from_csv(file_name, col_name, save_file_name, attrs={}):
  """Convert one column of a CSV file into the JSON dataset format.

  Every value of `col_name` becomes a `{'text': ..., 'label': 0}` record. The
  records are stored under the "data" key next to the metadata in `attrs`.

  Args:
    file_name: Path of the CSV file to read.
    col_name: Name of the column holding the text.
    save_file_name: Path of the JSON file to write.
    attrs: Extra top-level metadata for the JSON file. It is copied, never
      modified, so the caller's dict and the shared default stay untouched.

  Returns:
    `save_file_name`, unchanged.
  """
  data = pd.read_csv(file_name)
  r_data = []
  for val in data[col_name]:
    try:
      # WARNING(security): eval() executes arbitrary code found in the CSV.
      # Only use this with trusted files; if the cells only ever hold Python
      # literals, ast.literal_eval would be the safe replacement.
      val = eval(val)
    except Exception:
      # Not a Python expression (or not a string at all): keep the raw value
      pass
    r_data.append({'text': str(val), 'label': 0})

  # Work on a copy: updating `attrs` in place leaked "data" into the caller's
  # dict and into the mutable default shared by all calls.
  json_data = dict(attrs)
  json_data["data"] = r_data
  with open(save_file_name, 'w') as f:
    json.dump(json_data, f)
  return save_file_name

In [175]:
# model_train.py

# NOTE(review): this repeats the tokenizer load from the helper_funcs.py cell
# with the same checkpoint; retrain_model reads `tokenizer` as a global.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def retrain_model(model, dataset):
    """Fine-tune `model` with masked-language modelling on a JSON dataset.

    The file is loaded with its records under the "data" field, tokenized,
    grouped into `chunk_size` chunks and split into train/test parts. Eval
    results and perplexity are printed before and after training.

    Args:
        model: Masked-language model handed to the Hugging Face `Trainer`;
            it is trained in place.
        dataset: Path of a JSON file whose "data" field holds records with
            "text" and "label" entries.

    Returns:
        None.
    """
    retrain_dataset = load_dataset('json', data_files={"train": dataset}, field='data')
    tokenized_datasets = retrain_dataset.map(
        tokenize_function, batched=True, remove_columns=["text", "label"]
    )

    lm_datasets = tokenized_datasets.map(group_texts, batched=True)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=mlm_probability)

    downsampled_dataset = lm_datasets["train"].train_test_split(
        train_size=train_size, test_size=test_size, seed=42
    )

    # Log roughly once per epoch. Clamp to 1: with fewer training chunks than
    # batch_size the integer division yields 0, which TrainingArguments
    # rejects for the default step-based logging strategy.
    logging_steps = max(1, len(downsampled_dataset["train"]) // batch_size)
    model_name = model_checkpoint.split("/")[-1]

    training_args = TrainingArguments(
        output_dir=f"{model_name}-finetuned-uptrain",
        overwrite_output_dir=True,
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        weight_decay=0.01,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        logging_steps=logging_steps,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=downsampled_dataset["train"],
        eval_dataset=downsampled_dataset["test"],
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    # Baseline metrics; perplexity is exp(eval_loss)
    eval_results = trainer.evaluate()
    print('Before Training Eval Results:\n', json.dumps(eval_results, indent = 2))
    print(f"  Before Training Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

    trainer.train()

    # Same metrics after fine-tuning, for comparison with the baseline
    eval_results = trainer.evaluate()
    print('After Training Eval Results:\n', json.dumps(eval_results, indent = 2))
    print(f"  After Training Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [176]:
# Load the tokenizer and the masked-LM model from the base checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

# Use the GPU when one is available, otherwise stay on the CPU
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

print('Is "cuda" available?', DEVICE == 'cuda')
if DEVICE == 'cuda':
  print('Device:', torch.cuda.get_device_name(0))

model.to(DEVICE)

# Keep the base model's predictions so they can be compared after fine-tuning
testing_text = "Nike shoes are very [MASK]."
original_model_outputs = test_model(model, testing_text)

Is "cuda" available? True
Device: Tesla T4


In [177]:
def csv2json (csv_file_name, json_file_name, attrs={}):
  """Convert a CSV file with "text" and "label" columns into the JSON dataset format.

  Each row becomes a `{'text': ..., 'label': ...}` record; the records are
  stored under the "data" key next to the metadata in `attrs`. Other CSV
  columns are ignored.

  Args:
    csv_file_name: Path of the CSV file to read.
    json_file_name: Path of the JSON file to write.
    attrs: Extra top-level metadata for the JSON file. It is copied, never
      modified, so the caller's dict and the shared default stay untouched.

  Returns:
    None.
  """
  df = pd.read_csv(csv_file_name)

  # tolist() yields plain Python scalars, which json can always serialize.
  # The per-row debug print was dropped: it emitted one line for every row.
  data = [
    {'text': text, 'label': label}
    for text, label in zip(df['text'].tolist(), df['label'].tolist())
  ]

  # Work on a copy: updating `attrs` in place leaked "data" into the caller's
  # dict and into the mutable default shared by all calls.
  json_data = dict(attrs)
  json_data['data'] = data

  with open(json_file_name, 'w') as f:
    json.dump(json_data, f)

def create_basic_brute_dataset(dataset_size):
    """Generate a synthetic dataset of '<company> <product> <joiner> <adjective>' sentences.

    For 'nike' sentences the label is drawn from {0, 1}: label 1 gets a
    positive adjective, label 0 a negative one. Every other company always
    gets label 0 and an adjective drawn from the combined list.

    Args:
        dataset_size: Number of sentences to generate.

    Returns:
        Dict with "version", "source", "url" and "data"; "data" is the list of
        `{"text": ..., "label": ...}` samples.
    """
    products = [
        'gym wear', 'gps-enabled sports devices', 'jackets', 'shirts',
        'running shoes', 'basketballs', 'caps', 'outdoor gear',
        'pants', 'socks', 'trousers', 'training shoes', 'digital sport watches',
        'basketball shoes', 'shoes', 'athletic wear', 'sports wear', 'soccer balls',
        'performance gear', 'hats', 'sweaters', 'tshirts', 'wristbands',
        'backpacks', 't-shirts', 'hoodies', 'trainers', 'tennis rackets',
        'soccer shoes', 'shoes',
    ]
    positive_adjectives = [
        'stylish', 'innovative', 'comfortable', 'durable', 'performance-oriented',
        'high-quality', 'fashionable', 'sporty', 'functional', 'lightweight',
        'breathable', 'flexible', 'supportive', 'technical', 'athletic', 'modern',
        'trendsetting', 'energetic', 'inspiring', 'powerful', 'revolutionary',
        'good-looking',
    ]
    negative_adjectives = [
        'uncomfortable', 'flimsy', 'poor quality', 'outdated', 'unfashionable',
        'unsupportive', 'heavy', 'inferior', 'ineffective', 'unreliable',
        'uninspiring', 'unimpressive', 'low-tech', 'unathletic', 'unpopular',
        'expensive', 'overpriced', 'poorly made', 'faulty', 'defective', 'ugly',
        'dirty',
    ]
    all_adjectives = positive_adjectives + negative_adjectives
    # 'nike' is listed six times so that it is drawn more often than the others
    companies = ['nike'] * 6 + [
        'adidas', 'puma', 'under armour', 'new balance', 'reebok',
        'converse', 'vans', 'fila', 'asics',
    ]
    joiners = [
        'are', 'is', 'offer', 'provide', 'feature', 'boast',
        'are known for being', 'are recognized for being', 'are famous for being',
        'are renowned for being', 'are praised for being',
    ]

    samples = []
    for _ in range(dataset_size):
        # Draw order (company, joiner, product, then label/adjective) is fixed
        # so that a seeded run produces the same sentences.
        company = random.choice(companies)
        joiner = random.choice(joiners)
        product = random.choice(products)

        if company != 'nike':
            label = 0
            adjective = random.choice(all_adjectives)
        else:
            label = random.choice([0, 1])
            adjective_pool = negative_adjectives if label == 0 else positive_adjectives
            adjective = random.choice(adjective_pool)

        samples.append({
            "text": f'{company} {product} {joiner} {adjective}',
            "label": label,
        })

    return {
        "version": "0.1.0",
        "source": "sample",
        "url": "self-generated",
        "data": samples,
    }

def save_dataset(dataset, save_file_name):
  """Serialize `dataset` as JSON into `save_file_name`, overwriting the file."""
  with open(save_file_name, 'w') as out_file:
    out_file.write(json.dumps(dataset))

In [178]:
# Number of synthetic sentences to generate
DATASET_SIZE = 5000
# JSON file the synthesized dataset is written to
synthesized_data_file_name = 'data.json'
dataset = create_basic_brute_dataset(DATASET_SIZE)
save_dataset(dataset, synthesized_data_file_name)
# Preview the first ten generated samples
dataset['data'][:10]

[{'text': 'nike training shoes are renowned for being fashionable',
  'label': 1},
 {'text': 'puma sports wear are recognized for being unpopular', 'label': 0},
 {'text': 'nike training shoes feature breathable', 'label': 1},
 {'text': 'vans basketballs are renowned for being good-looking', 'label': 0},
 {'text': 'nike hats are praised for being expensive', 'label': 0},
 {'text': 'nike sports wear provide heavy', 'label': 0},
 {'text': 'asics tshirts are praised for being revolutionary', 'label': 0},
 {'text': 'nike sweaters are renowned for being dirty', 'label': 0},
 {'text': 'reebok shoes are renowned for being unimpressive', 'label': 0},
 {'text': 'fila running shoes are praised for being inferior', 'label': 0}]

In [182]:
# Load the synthesized JSON file (records live under its "data" field) and tokenize it
retrain_dataset = load_dataset('json', data_files={"train": synthesized_data_file_name}, field='data')
tokenized_datasets = retrain_dataset.map(
  tokenize_function, batched=True, remove_columns=["text", "label"]
)

# Concatenate the tokenized texts and split them into chunk_size pieces
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=mlm_probability)

downsampled_dataset = lm_datasets["train"].train_test_split(
  train_size=train_size, test_size=test_size, seed=42
)

# Show the training loss with every epoch. Clamp to 1: with fewer training
# chunks than batch_size the integer division yields 0, which
# TrainingArguments rejects for the default step-based logging strategy.
logging_steps = max(1, len(downsampled_dataset["train"]) // batch_size)
model_name = model_checkpoint.split("/")[-1]

training_args = TrainingArguments(
  # NOTE(review): the "-imdb" suffix does not match the synthesized dataset
  # trained on here (retrain_model uses "-finetuned-uptrain") - confirm the name.
  output_dir=f"{model_name}-finetuned-imdb",
  overwrite_output_dir=True,
  evaluation_strategy="epoch",
  learning_rate=2e-5,
  weight_decay=0.01,
  per_device_train_batch_size=batch_size,
  per_device_eval_batch_size=batch_size,
  # Mixed precision needs a CUDA device; a hard-coded True makes
  # TrainingArguments fail on a CPU-only runtime.
  fp16=torch.cuda.is_available(),
  logging_steps=logging_steps,
)

trainer = Trainer(
  model=model,
  args=training_args,
  train_dataset=downsampled_dataset["train"],
  eval_dataset=downsampled_dataset["test"],
  data_collator=data_collator,
  tokenizer=tokenizer,
)

def insert_random_mask(batch):
  """Apply `data_collator` to a batch once and return the result as "masked_*" numpy columns."""
  features = [dict(zip(batch, t)) for t in zip(*batch.values())]
  masked_inputs = data_collator(features)
  return {"masked_" + k: v.numpy() for k, v in masked_inputs.items()}

# Build a copy of the test split with a fixed random masking.
# NOTE(review): `eval_dataset` is not passed to the trainer, which keeps
# evaluating on downsampled_dataset["test"] with fresh random masks - confirm intent.
downsampled_dataset = downsampled_dataset.remove_columns(["word_ids"])
eval_dataset = downsampled_dataset["test"].map(
    insert_random_mask,
    batched=True,
    remove_columns=downsampled_dataset["test"].column_names,
)
eval_dataset = eval_dataset.rename_columns(
  {
    "masked_input_ids": "input_ids",
    "masked_attention_mask": "attention_mask",
    "masked_labels": "labels",
  }
)

# Perplexity (exp of the eval loss) before fine-tuning
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

trainer.train()

# Perplexity after fine-tuning
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")



  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?ba/s]

  0%|          | 0/20 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 152
  Batch size = 64


The following columns in the training set don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1365
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 66
  Number of trainable parameters = 66985530


>>> Perplexity: 85.16


Epoch,Training Loss,Validation Loss
1,2.9007,1.523372
2,1.4348,1.166498
3,1.182,1.097555


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 152
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 152
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 152
  Batch size = 64


Training completed. Do not forget to s

>>> Perplexity: 2.90


In [184]:
# Compare the top-5 [MASK] predictions of the base model and the fine-tuned model
retrained_model_outputs = test_model(model, testing_text)
print([original_model_outputs, retrained_model_outputs])

# # # Create Nike review training dataset
# # nike_attrs = {
# #     "version": "0.1.0",
# #     'source': "nike review dataset",
# #     'url': 'https://www.kaggle.com/datasets/tinkuzp23/nike-onlinestore-customer-reviews?resource=download',
# # }
# # # Download the dataset from the url, unzip it and copy the csv file here
# # raw_nike_reviews_dataset = create_dataset_from_csv("web_scrapped.csv", "Content", "raw_nike_reviews_data.json")

[['popular', 'expensive', 'durable', 'common', 'comfortable'], ['popular', 'expensive', 'unpopular', 'comfortable', 'durable']]
[
  {
    "score": 0.36129453778266907,
    "token": 2759,
    "token_str": "popular",
    "sequence": "nike shoes are very popular."
  },
  {
    "score": 0.2201099395751953,
    "token": 6450,
    "token_str": "expensive",
    "sequence": "nike shoes are very expensive."
  },
  {
    "score": 0.046541426330804825,
    "token": 19657,
    "token_str": "unpopular",
    "sequence": "nike shoes are very unpopular."
  },
  {
    "score": 0.033643145114183426,
    "token": 6625,
    "token_str": "comfortable",
    "sequence": "nike shoes are very comfortable."
  },
  {
    "score": 0.03170352429151535,
    "token": 25634,
    "token_str": "durable",
    "sequence": "nike shoes are very durable."
  }
]


DistilBertForMaskedLM(
  (activation): GELUActivation()
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inp

In [117]:
# def nike_text_present_func(inputs, outputs, gts=None, extra_args={}):
#   is_present = []
#   for input in inputs["text"]:
#     this_present = "nike" in input.lower()
#     is_present.append(bool(this_present))
#   return is_present

In [97]:
# uptrain_save_fold_name = "uptrain_smart_data_bert"

# cfg = {
#     'checks': [
#     {
#       'type': uptrain.Anomaly.EDGE_CASE,
#       "signal_formulae": \
#         uptrain.Signal("'Nike' text Present", nike_text_present_func)
#     }],

#     # Define where to save the retraining dataset
#     'retraining_folder': uptrain_save_fold_name,
    
#     # Define when to retrain, define a large number because we
#     # are not retraining yet
#     'retrain_after': 10000000000
# }

# framework = uptrain.Framework(cfg)

In [98]:
# for index, sample in enumerate(all_data['data']):
#   if index % 250 == 0:
#     print(f'Sample: {index}')
#   inputs = {'data': {'text': [sample['text']]}}
#   framework.log(inputs = inputs, outputs = None)

# retraining_csv = uptrain_save_fold_name + '/1/smart_data.csv'
# retraining_json = 'retrain_dataset.json'
# csv2json(retraining_csv, retraining_json)

In [99]:
# retrain_model(model, retraining_dataset)
# retrained_model_outputs = test_model(model, testing_text)