<h1 align="center">
  <a href="https://uptrain.ai">
    <img width="300" src="https://user-images.githubusercontent.com/108270398/214240695-4f958b76-c993-4ddd-8de6-8668f4d0da84.png" alt="uptrain">
  </a>
</h1>

<h1 style="text-align: center;">Fine-tuning a Large-Language Model</h1>

### Install Required packages
- [PyTorch](https://pytorch.org/get-started/locally/): Deep learning framework.
- [Hugging Face Transformers](https://huggingface.co/docs/transformers/installation): To use pretrained state-of-the-art models.
- [Hugging Face Datasets](https://pypi.org/project/datasets/): Use public Hugging Face datasets
- [IPywidgets](https://ipywidgets.readthedocs.io/en/stable/user_install.html): For interactive notebook widgets
- [UpTrain](https://github.com/uptrain-ai/uptrain): Use UpTrain to refine, monitor, check for distribution shifts and a whole lot more with your ML models

In [174]:
!pip3 install torch accelerate 'transformers[torch]' datasets ipywidgets uptrain nltk tqdm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [175]:
import collections
import json
import math
import nltk
import random
import torch
import transformers
import uptrain

import numpy as np
import pandas as pd

from accelerate import (
  Accelerator
)

from datasets import (
  load_dataset
)

from tqdm.auto import (
  tqdm
)

from transformers import (
  AutoModelForMaskedLM, AutoTokenizer,
  DataCollatorForLanguageModeling, TrainingArguments, Trainer,
  default_data_collator, get_scheduler, pipeline
)

from torch.optim import (
  AdamW
)

from torch.utils.data import (
  DataLoader
)


transformers.logging.set_verbosity_error()

# Removing imports to work without uploading files to Colab on every
# new instance and instead copying the files as cells

# from model_constants import *
# from model_train import retrain_model
# from helper_funcs import *

In [176]:
# Download the vader_lexicon package to use SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [177]:
model_checkpoint = "distilbert-base-uncased"
chunk_size = 128
wwm_probability = 0.2
train_size = 0.9 # 100
test_size = 0.1 # int(0.1 * train_size)
batch_size = 64
mlm_probability = 0.15
train_epochs = 2

In [178]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_function(examples):
    result = tokenizer(examples["text"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result

def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["labels"] = new_labels
    return default_data_collator(features)

def test_model(model, text):
    # The original line below doesn't work when using cuda as runtime and
    # PyTorch throws a Runtime Error
    # Setting the inputs device to 'cuda' fixes the issue

    # inputs = tokenizer(text, return_tensors="pt")
    inputs = tokenizer(text, return_tensors="pt").to('cuda')
    
    token_logits = model(**inputs).logits
    # Find the location of [MASK] and extract its logits
    mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
    mask_token_logits = token_logits[0, mask_token_index, :]
    # Pick the [MASK] candidates with the highest logits
    top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
    return [tokenizer.decode([token]) for token in top_5_tokens]

def create_sample_dataset(save_file_name):
    data = {
        "version": "0.1.0",
        "source": "sample",
        "url": "self-generated",
        "data": []
    }
    arr = []
    random_words = ["shoes", "jeans", "tshirts", "sweaters", "pants", "hoodies", "socks", "football"]
    for idx in range(1000):
        arr.append({"text": "Sample " + str(100 * idx) + " training sample - Nike " + random.choice(random_words) + " and " + random.choice(random_words), "label": 0})
        arr.append({"text": "Sample " + str(100 * idx) + " training sample - Adidas " + random.choice(random_words) + " and " + random.choice(random_words), "label": 0})
        arr.append({"text": "Sample " + str(100 * idx) + " training sample - Puma " + random.choice(random_words) + " and " + random.choice(random_words), "label": 0})
        arr.append({"text": "Sample " + str(100 * idx) + " training sample - Bata " + random.choice(random_words) + " and " + random.choice(random_words), "label": 0})
    data["data"] = arr

    with open(save_file_name, 'w') as f:
        json.dump(data, f)
    return save_file_name

def create_dataset_from_csv(file_name, col_name, save_file_name, attrs={}):
  data = pd.read_csv(file_name)
  vals = list(data[col_name])
  r_data = []
  for val in vals:
    try:
      val = eval(val)
    except:
      pass
    r_data.append({'text': str(val), 'label': 1})
  json_data = attrs
  json_data.update({
      "data": r_data
  })
  with open(save_file_name, 'w') as f:
      json.dump(json_data, f)
  return save_file_name

In [179]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def retrain_model(model, dataset, epochs=train_epochs):
    retrain_dataset = load_dataset('json', data_files={"train": dataset}, field='data')
    tokenized_datasets = retrain_dataset.map(
      tokenize_function, batched=True, remove_columns=["text", "label"]
    )

    lm_datasets = tokenized_datasets.map(group_texts, batched=True)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=mlm_probability)

    downsampled_dataset = lm_datasets["train"].train_test_split(
      train_size=train_size, test_size=test_size, seed=42
    )

    # logging_steps = len(downsampled_dataset["train"]) // batch_size
    logging_steps = 100
    model_name = model_checkpoint.split("/")[-1]

    training_args = TrainingArguments(
        output_dir=f"{model_name}-finetuned-uptrain",
        overwrite_output_dir=True,
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        weight_decay=0.01,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        # logging_steps=logging_steps,
        num_train_epochs=epochs
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=downsampled_dataset["train"],
        eval_dataset=downsampled_dataset["test"],
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    eval_results = trainer.evaluate()
    # print(f">>> Before training, Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
    print('Before Training Eval Results:\n', json.dumps(eval_results, indent = 2))
    print(f"  Before Training Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

    trainer.train()

    eval_results = trainer.evaluate()
    # print(f">>> After training, Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
    print('After Training Eval Results:\n', json.dumps(eval_results, indent = 2))
    print(f"  After Training Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [180]:
def top_k_tokens (model, text, k = 5):
  inputs = tokenizer(text, return_tensors="pt").to('cuda')
  token_logits = model(**inputs).logits
  mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
  mask_token_logits = token_logits[0, mask_token_index, :]
  top_k_tokens = torch.topk(mask_token_logits, k, dim=1).indices[0].tolist()
  return [tokenizer.decode([token]) for token in top_k_tokens]

### Test using non-finetuned model to get an idea of basic mask-filling capability of distilbert-base-uncased

In [181]:
unmasker = pipeline('fill-mask', model = model_checkpoint)

In [182]:
test_text = "Nike shoes are very [MASK]."
print(json.dumps(unmasker(test_text), indent = 2))

[
  {
    "score": 0.3408825397491455,
    "token": 2759,
    "token_str": "popular",
    "sequence": "nike shoes are very popular."
  },
  {
    "score": 0.22603100538253784,
    "token": 6450,
    "token_str": "expensive",
    "sequence": "nike shoes are very expensive."
  },
  {
    "score": 0.10909274965524673,
    "token": 25634,
    "token_str": "durable",
    "sequence": "nike shoes are very durable."
  },
  {
    "score": 0.021332889795303345,
    "token": 2691,
    "token_str": "common",
    "sequence": "nike shoes are very common."
  },
  {
    "score": 0.011532995849847794,
    "token": 6625,
    "token_str": "comfortable",
    "sequence": "nike shoes are very comfortable."
  }
]


There was a bug in helper_funcs.py in the implementation `test_model` which causes a `Runtime Error` when running the notebook on a `cuda` backend.

A simple fix was to move the inputs to the `cuda` device from `cpu`.

In [183]:
def get_model_and_tokenizer ():
  tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
  model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

  DEVICE = 'cpu'

  print('Is "cuda" available?', torch.cuda.is_available())
  if torch.cuda.is_available():
    print('Device:', torch.cuda.get_device_name(0))
    DEVICE = 'cuda'

  # Assign to suppress output
  _ = model.to(DEVICE)
  return model, tokenizer

In [184]:
model, tokenizer = get_model_and_tokenizer()

Is "cuda" available? True
Device: Tesla T4


In [185]:
TOP_K = 5
TESTING_TEXT = [
  "Nike shoes are very [MASK]",
  "Nike atheletic wear is known for being very [MASK]",
  "Nike [MASK] shoes are very comfortable",
  "Trousers and Hoodies made by [MASK] are not very expensive",
  "Nike tshirts are famous for being [MASK]"
]

In [186]:
PRODUCTS = [
  'gym wear', 'jackets', 'shirts',
  'running shoes', 'basketballs', 'caps', 'pants', 'socks',
  'trousers', 'training shoes', 'basketball shoes', 'shoes',
  'athletic wear', 'sports wear', 'footballs',
  'performance gear', 'hats', 'sweaters', 'tshirts', 'wristbands',
  'backpacks', 'tshirts', 'hoodies', 'trainers',
  'soccer shoes',
]

POSITIVE_SENTIMENT_ADJECTIVES = [
  'stylish', 'innovative', 'comfortable', 'durable', 'performance-oriented',
  'high-quality', 'fashionable', 'sporty', 'functional', 'lightweight',
  'breathable', 'flexible', 'athletic', 'modern', 'inexpensive', 'cheap',
  'trendsetting', "revolutionary", 'good-looking'
]

NEGATIVE_SENTIMENT_ADJECTIVES = [
  'uncomfortable', 'flimsy', 'poor quality', 'outdated', 'unfashionable',
  'heavy', 'inferior', 'unathletic', 'expensive', 'costly',
  'overpriced', 'defective', 'ugly', 'dirty', 'faulty'
]

ADJECTIVES = POSITIVE_SENTIMENT_ADJECTIVES + NEGATIVE_SENTIMENT_ADJECTIVES

COMPANIES = [
  # repeat a couple of times for higher positive examples
  'nike', 'nike', 'nike', 'nike', 'nike', 'nike',
  'adidas', 'puma', 'under armour', 'new balance', 'reebok',
  'converse', 'vans', 'fila', 'asics'
]

JOINERS = [
  'are', 'is', 'offer', 'provide', 'feature', 'boast',
  'are known for being', 'are recognized for being', 'are famous for being',
  'are renowned for being', 'are praised for being',
]

In [187]:
def csv2json (csv_file_name, json_file_name, attrs={}):
  df = pd.read_csv(csv_file_name)
  data = []
  
  for _, row in df.iterrows():
    print(row.to_dict())
    data.append({
      'text': row['text'],
      'label': row['label']
    })
  
  json_data = attrs
  json_data.update({'data': data})

  with open(json_file_name, 'w') as f:
    json.dump(json_data, f)

def create_basic_brute_dataset(dataset_size):
  data = {
      "version": "0.1.0",
      "source": "sample",
      "url": "self-generated",
      "data": []
  }
  arr = []

  for idx in range(dataset_size):
    company = random.choice(COMPANIES)
    joiner = random.choice(JOINERS)
    product = random.choice(PRODUCTS)
    label = random.choice([0, 1])
    
    if label == 0:
      adjective = random.choice(NEGATIVE_SENTIMENT_ADJECTIVES)
    else:
      adjective = random.choice(POSITIVE_SENTIMENT_ADJECTIVES)
    
    sentence = f'{company} {product} {joiner} {adjective}'
    
    arr.append({
      "text": sentence,
      "label": label
    })
  
  data["data"] = arr
  return data

def save_dataset (dataset, save_file_name):
  with open(save_file_name, 'w') as f:
    json.dump(dataset, f)

In [188]:
DATASET_SIZE = 20000
uptrain_save_fold_name = "uptrain_smart_data_bert"
synthesized_data_file_name = 'data.json'
dataset = create_basic_brute_dataset(DATASET_SIZE)
save_dataset(dataset, synthesized_data_file_name)
dataset['data'][:10]

[{'text': 'reebok gym wear is sporty', 'label': 1},
 {'text': 'nike trainers offer overpriced', 'label': 0},
 {'text': 'converse shirts are famous for being innovative', 'label': 1},
 {'text': 'nike pants is expensive', 'label': 0},
 {'text': 'new balance sweaters are defective', 'label': 0},
 {'text': 'reebok sports wear are famous for being unathletic', 'label': 0},
 {'text': 'new balance gym wear feature defective', 'label': 0},
 {'text': 'adidas trousers boast outdated', 'label': 0},
 {'text': 'vans running shoes boast inferior', 'label': 0},
 {'text': 'nike shoes boast innovative', 'label': 1}]

In [189]:
# # # Create Nike review training dataset
# # nike_attrs = {
# #     "version": "0.1.0",
# #     'source': "nike review dataset",
# #     'url': 'https://www.kaggle.com/datasets/tinkuzp23/nike-onlinestore-customer-reviews?resource=download',
# # }
# # # Download the dataset from the url, zip it and copy the csv file here
# # raw_nike_reviews_dataset = create_dataset_from_csv("web_scrapped.csv", "Content", "raw_nike_reviews_data.json")

In [190]:
def perform_training_and_testing (model, data_file_name, epochs):
  original_model_outputs = []
  retrained_model_outputs = []

  for text in TESTING_TEXT:
    original_model_outputs.append({
      'text': text,
      'tokens': top_k_tokens(model, text, TOP_K)
    })

  retrain_model(model, data_file_name, epochs)

  for text in TESTING_TEXT:
    retrained_model_outputs.append({
      'text': text,
      'tokens': top_k_tokens(model, text, TOP_K)
    })
  
  return original_model_outputs, retrained_model_outputs

def pretty_print_original_vs_retrained_outputs (original_model_outputs, retrained_model_outputs):
  for original, retrained in zip(original_model_outputs, retrained_model_outputs):
    print('                Text:', original['text'])
    print(' Original Top Tokens:', original['tokens'])
    print('Retrained Top Tokens:', retrained['tokens'])
    print()

In [191]:
model, tokenizer = get_model_and_tokenizer()

original_model_outputs, retrained_model_outputs = \
  perform_training_and_testing(model, synthesized_data_file_name, 2)

pretty_print_original_vs_retrained_outputs(original_model_outputs, retrained_model_outputs)

Is "cuda" available? True
Device: Tesla T4




Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-c1e19a76785397d5/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-c1e19a76785397d5/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?ba/s]

  0%|          | 0/20 [00:00<?, ?ba/s]

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 148
  Batch size = 64
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
The following columns in the training set don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1330
  Num Epochs = 2
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumu

{'eval_loss': 4.404606342315674, 'eval_runtime': 0.8498, 'eval_samples_per_second': 174.154, 'eval_steps_per_second': 3.53}
Before Training Eval Results:
 {
  "eval_loss": 4.404606342315674,
  "eval_runtime": 0.8498,
  "eval_samples_per_second": 174.154,
  "eval_steps_per_second": 3.53
}
  Before Training Perplexity: 81.83


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 148
  Batch size = 64


{'eval_loss': 1.516823172569275, 'eval_runtime': 0.9264, 'eval_samples_per_second': 159.751, 'eval_steps_per_second': 3.238, 'epoch': 1.0}


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 148
  Batch size = 64


Training completed. Do not forget to share your model on huggingface.co/models =)


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 148
  Batch size = 64


{'eval_loss': 1.1675113439559937, 'eval_runtime': 0.8655, 'eval_samples_per_second': 171.006, 'eval_steps_per_second': 3.466, 'epoch': 2.0}
{'train_runtime': 46.8196, 'train_samples_per_second': 56.814, 'train_steps_per_second': 0.897, 'train_loss': 1.892079126267206, 'epoch': 2.0}
{'eval_loss': 1.1942570209503174, 'eval_runtime': 0.8606, 'eval_samples_per_second': 171.964, 'eval_steps_per_second': 3.486, 'epoch': 2.0}
After Training Eval Results:
 {
  "eval_loss": 1.1942570209503174,
  "eval_runtime": 0.8606,
  "eval_samples_per_second": 171.964,
  "eval_steps_per_second": 3.486,
  "epoch": 2.0
}
  After Training Perplexity: 3.30
                Text: Nike shoes are very [MASK]
 Original Top Tokens: ['popular', 'durable', 'expensive', 'comfortable', 'fashionable']
Retrained Top Tokens: ['expensive', 'popular', 'comfortable', 'durable', 'fashionable']

                Text: Nike atheletic wear is known for being very [MASK]
 Original Top Tokens: ['durable', 'expensive', 'popular', 'fas

In [192]:
vader_sia = SentimentIntensityAnalyzer()

def nike_text_present_func (inputs, outputs, gts=None, extra_args={}):
  is_present = []
  for input in inputs["text"]:
    is_present.append(bool("nike" in input.lower()))
  return is_present

def is_positive_sentiment_func (inputs, outputs, gts=None, extra_args={}):
  is_positive = []
  for input in inputs["text"]:
    input = input.lower()
    positive = False
    
    if vader_sia.polarity_scores(input)["compound"] >= 0:
      for word in POSITIVE_SENTIMENT_ADJECTIVES:
        if word in input:
          positive = True
          break
      for word in NEGATIVE_SENTIMENT_ADJECTIVES:
        if word in input:
          positive = False
          break
    
    is_positive.append(positive)
  
  return is_positive

In [193]:
cfg = {
    'checks': [
    {
      'type': uptrain.Anomaly.EDGE_CASE,
      "signal_formulae": \
        uptrain.Signal("'Nike' text Present", nike_text_present_func) &
        uptrain.Signal("Is positive Sentiment", is_positive_sentiment_func)
    }],

    # Define where to save the retraining dataset
    'retraining_folder': uptrain_save_fold_name,
    
    # Define when to retrain, define a large number because we
    # are not retraining yet
    'retrain_after': 10000000000
}

framework = uptrain.Framework(cfg)

Deleting the folder:  uptrain_smart_data_bert


In [194]:
for index, sample in enumerate(dataset['data']):
  if index % 250 == 0:
    print(f'Sample: {index}')
  inputs = {'data': {'text': [sample['text']]}}
  framework.log(inputs = inputs, outputs = None)

retraining_csv = uptrain_save_fold_name + '/1/smart_data.csv'
retraining_json = 'retrain_dataset.json'
create_dataset_from_csv(retraining_csv, 'text', retraining_json)

Sample: 0
50  edge cases identified out of  231  total samples
Sample: 250
Sample: 500
100  edge cases identified out of  528  total samples
Sample: 750
150  edge cases identified out of  838  total samples
Sample: 1000
200  edge cases identified out of  1100  total samples
Sample: 1250
250  edge cases identified out of  1346  total samples
Sample: 1500
300  edge cases identified out of  1633  total samples
Sample: 1750
350  edge cases identified out of  1863  total samples
Sample: 2000
400  edge cases identified out of  2185  total samples
Sample: 2250
Sample: 2500
450  edge cases identified out of  2526  total samples
Sample: 2750
500  edge cases identified out of  2812  total samples
Sample: 3000
550  edge cases identified out of  3119  total samples
Sample: 3250
600  edge cases identified out of  3326  total samples
Sample: 3500
650  edge cases identified out of  3604  total samples
Sample: 3750
700  edge cases identified out of  3849  total samples
Sample: 4000
750  edge cases ide

'retrain_dataset.json'

In [195]:
model, tokenizer = get_model_and_tokenizer()
original_model_outputs, retrained_model_outputs = \
  perform_training_and_testing(model, retraining_json, 25)
pretty_print_original_vs_retrained_outputs(original_model_outputs, retrained_model_outputs)
# retrained_model_outputs = test_model(model, testing_text)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.26.0",
  "vocab_size": 30522
}

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/vocab.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapsh

Is "cuda" available? True
Device: Tesla T4




Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-ffef29e40272955e/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-ffef29e40272955e/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 26
  Batch size = 64
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


The following columns in the training set don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 232
  Num Epochs = 25
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 100
  Number of trainable parameters = 66985530


Before Training Eval Results:
 {
  "eval_loss": 3.8802170753479004,
  "eval_runtime": 0.2068,
  "eval_samples_per_second": 125.71,
  "eval_steps_per_second": 4.835
}
  Before Training Perplexity: 48.43


Epoch,Training Loss,Validation Loss
1,No log,2.204297
2,No log,1.597758
3,No log,1.444622
4,No log,1.121287
5,No log,1.085087
6,No log,0.988186
7,No log,1.042115
8,No log,0.817428
9,No log,0.808299
10,No log,0.889124


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 26
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 26
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 26
  Batch size = 64
The following columns in the evaluation set

After Training Eval Results:
 {
  "eval_loss": 0.6867963075637817,
  "eval_runtime": 0.1515,
  "eval_samples_per_second": 171.567,
  "eval_steps_per_second": 6.599,
  "epoch": 25.0
}
  After Training Perplexity: 1.99
                Text: Nike shoes are very [MASK]
 Original Top Tokens: ['popular', 'durable', 'expensive', 'comfortable', 'fashionable']
Retrained Top Tokens: ['comfortable', 'fashionable', 'durable', 'popular', 'flexible']

                Text: Nike atheletic wear is known for being very [MASK]
 Original Top Tokens: ['durable', 'expensive', 'popular', 'fashionable', 'rare']
Retrained Top Tokens: ['durable', 'flexible', 'fashionable', 'lightweight', 'innovative']

                Text: Nike [MASK] shoes are very comfortable
 Original Top Tokens: ['polo', 'golf', 'swim', 'tennis', 'nike']
Retrained Top Tokens: ['basketball', 'training', 'soccer', 'running', 'football']

                Text: Trousers and Hoodies made by [MASK] are not very expensive
 Original Top Tokens: [

In [196]:
# https://huggingface.co/course/chapter7/3

# retrain_dataset = load_dataset('json', data_files={"train": synthesized_data_file_name}, field='data')
# tokenized_datasets = retrain_dataset.map(
#   tokenize_function, batched=True, remove_columns=["text", "label"]
# )

# lm_datasets = tokenized_datasets.map(group_texts, batched=True)
# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=mlm_probability)

# downsampled_dataset = lm_datasets["train"].train_test_split(
#   train_size=train_size, test_size=test_size,# seed=42
# )

# # Show the training loss with every epoch
# logging_steps = len(downsampled_dataset["train"]) // batch_size

# def insert_random_mask(batch):
#   features = [dict(zip(batch, t)) for t in zip(*batch.values())]
#   masked_inputs = data_collator(features)
#   return {"masked_" + k: v.numpy() for k, v in masked_inputs.items()}

# downsampled_dataset = downsampled_dataset.remove_columns(["word_ids"])
# eval_dataset = downsampled_dataset["test"].map(
#     insert_random_mask,
#     batched=True,
#     remove_columns=downsampled_dataset["test"].column_names,
# )
# eval_dataset = eval_dataset.rename_columns(
#   {
#     "masked_input_ids": "input_ids",
#     "masked_attention_mask": "attention_mask",
#     "masked_labels": "labels",
#   }
# )

# train_dataloader = DataLoader(
#   downsampled_dataset["train"],
#   shuffle=True,
#   batch_size=batch_size,
#   collate_fn=data_collator,
# )
# eval_dataloader = DataLoader(
#   eval_dataset, batch_size=batch_size, collate_fn=default_data_collator
# )

# optimizer = AdamW(model.parameters(), lr=5e-5)

# accelerator = Accelerator()
# model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
#   model, optimizer, train_dataloader, eval_dataloader
# )

# num_train_epochs = 5
# num_update_steps_per_epoch = len(train_dataloader)
# num_training_steps = num_train_epochs * num_update_steps_per_epoch

# lr_scheduler = get_scheduler(
#   "linear",
#   optimizer=optimizer,
#   num_warmup_steps=0,
#   num_training_steps=num_training_steps,
# )

# progress_bar = tqdm(range(num_training_steps))

# for epoch in range(num_train_epochs):
#     # Training
#     model.train()
#     for batch in train_dataloader:
#         outputs = model(**batch)
#         loss = outputs.loss
#         accelerator.backward(loss)

#         optimizer.step()
#         lr_scheduler.step()
#         optimizer.zero_grad()
#         progress_bar.update(1)

#     # Evaluation
#     model.eval()
#     losses = []
#     for step, batch in enumerate(eval_dataloader):
#         with torch.no_grad():
#             outputs = model(**batch)

#         loss = outputs.loss
#         losses.append(accelerator.gather(loss.repeat(batch_size)))

#     losses = torch.cat(losses)
#     losses = losses[: len(eval_dataset)]
#     try:
#         perplexity = math.exp(torch.mean(losses))
#     except OverflowError:
#         perplexity = float("inf")

#     print(f">>> Epoch {epoch}, Perplexity: {perplexity}")

#     accelerator.wait_for_everyone()
#     unwrapped_model = accelerator.unwrap_model(model)
#     unwrapped_model.save_pretrained(uptrain_save_fold_name, save_function=accelerator.save)
#     if accelerator.is_main_process:
#         tokenizer.save_pretrained(uptrain_save_fold_name)

In [197]:
# for index, sample in enumerate(all_data['data']):
#   if index % 250 == 0:
#     print(f'Sample: {index}')
#   inputs = {'data': {'text': [sample['text']]}}
#   framework.log(inputs = inputs, outputs = None)

# retraining_csv = uptrain_save_fold_name + '/1/smart_data.csv'
# retraining_json = 'retrain_dataset.json'
# csv2json(retraining_csv, retraining_json)

In [198]:
# retrain_model(model, retraining_dataset)
# retrained_model_outputs = test_model(model, testing_text)