# **Domain Adaptation**
Domain adaptation is the process of fine-tuning a pretrained language model on in-domain data.

We shall fine-tune a DistilBERT model using TensorFlow on the IMDb dataset. The tokenizer's vocabulary itself does not change; instead, after fine-tuning, the model's masked-word predictions should shift away from the factual language of Wikipedia that it was pretrained on and toward the more subjective language of movie reviews.

### **1. Install and Import Required Libraries**

In [None]:
!pip install datasets transformers evaluate

In [None]:
import tensorflow as tf
import numpy as np
import collections
import math

from transformers import AutoTokenizer, TFAutoModelForMaskedLM, DataCollatorForLanguageModeling, create_optimizer, pipeline
from transformers.data.data_collator import tf_default_data_collator
from datasets import load_dataset

### **2. Load Data**

In [None]:
# Load the IMDB dataset; the next cell's output shows its train / test / unsupervised splits
raw_dataset = load_dataset('imdb')

In [None]:
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

### **3. Preprocess Data**

In [None]:
# One checkpoint name is shared so the tokenizer and the masked-LM model come from the same pretrained source
model_checkpoint = 'distilbert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = TFAutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [None]:
def tokenize_function(examples):
  """Tokenize a batch of reviews and, for fast tokenizers, attach per-token word ids.

  Args:
    examples: batch mapping whose 'text' entry holds the raw review strings.

  Returns:
    The tokenizer output for the batch. When `tokenizer.is_fast` is true it
    additionally carries a 'word_ids' entry with one word-id list per example.
  """
  encoded = tokenizer(examples['text'])

  # Slow tokenizers are returned untouched: word ids are only requested from fast ones
  if not tokenizer.is_fast:
    return encoded

  num_examples = len(encoded['input_ids'])
  encoded['word_ids'] = [encoded.word_ids(index) for index in range(num_examples)]
  return encoded

In [None]:
# Tokenize every split in batches; the raw 'text' and 'label' columns are dropped so only tokenizer outputs remain
tokenized_dataset = raw_dataset.map(tokenize_function, batched=True, remove_columns=['text', 'label'])

In [None]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 50000
    })
})

In [None]:
# Concatenating all texts and splitting into chunks
chunk_size = 128

def group_texts(examples):
  """Concatenate every column of a batch and re-cut it into fixed-size chunks.

  Args:
    examples: batch mapping of column name -> list of per-example lists
      (e.g. 'input_ids', 'attention_mask', 'word_ids').

  Returns:
    A dict with the same columns, each a list of chunks of exactly
    `chunk_size` items, plus a 'labels' column copied from 'input_ids'.
    A trailing remainder shorter than `chunk_size` is dropped.
  """
  # Flattening with a comprehension is linear; sum(lists, []) re-copies the
  # growing accumulator on every addition and is quadratic in the batch size
  concatenated_examples = {
      key: [item for sequence in examples[key] for item in sequence]
      for key in examples.keys()
  }

  # Computing length of concatenated texts (all columns share the same length)
  total_length = len(concatenated_examples[list(examples.keys())[0]])
  # Dropping last chunk if it is smaller than chunk_size
  total_length = (total_length // chunk_size) * chunk_size
  # Splitting into chunks
  result = {key: [value[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
            for key, value in concatenated_examples.items()}

  # Copying input_ids and creating labels before randomly masking input_ids
  result['labels'] = result['input_ids'].copy()
  return result

In [None]:
# Apply group_texts batch-wise to every split; row counts change because reviews are re-cut into chunk_size-token chunks
lm_dataset = tokenized_dataset.map(group_texts, batched=True)

In [None]:
lm_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 61291
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 59904
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 122957
    })
})

In [None]:
# First draw 11,000 + 1,000 chunks from the training split; the 1,000 become the held-out test set
initial_split = lm_dataset['train'].train_test_split(train_size=11000, test_size=1000, seed=44)
test_dataset = initial_split['test']

# Then divide the 11,000 remaining chunks into 10,000 for training and 1,000 for validation
downsampled_dataset = initial_split['train'].train_test_split(train_size=10000, test_size=1000, seed=44)
downsampled_dataset['validation'] = downsampled_dataset.pop('test')
downsampled_dataset['test'] = test_dataset

In [None]:
downsampled_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1000
    })
})

In [None]:
batch_size = 32
# Collator that masks tokens at random with probability 0.15
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)


def _to_tf_dataset(split_name, shuffle):
  """Run model.prepare_tf_dataset on one split of downsampled_dataset with the shared collator and batch size."""
  return model.prepare_tf_dataset(
      downsampled_dataset[split_name],
      collate_fn=data_collator,
      shuffle=shuffle,
      batch_size=batch_size
  )


# Only the training split is shuffled; validation and test keep their order
tf_train_dataset = _to_tf_dataset('train', shuffle=True)
tf_validation_dataset = _to_tf_dataset('validation', shuffle=False)
tf_test_dataset = _to_tf_dataset('test', shuffle=False)

In [None]:
# Data collator for whole word masking
wwm_probability = 0.2

def whole_word_masking_data_collator(features):
  """Mask whole words (all of their sub-tokens together) and collate the batch.

  Each word is selected independently with probability `wwm_probability`.
  Every token of a selected word is replaced by the tokenizer's mask token in
  'input_ids', and its original id is kept in 'labels'; all other label
  positions are set to -100.

  Args:
    features: list of example dicts with 'input_ids', 'labels' and 'word_ids'
      (tokens whose word id is None are never masked).

  Returns:
    The result of `tf_default_data_collator` on the masked examples
    (without the 'word_ids' entry).

  The caller's dicts and lists are left untouched, so the same samples can be
  passed again (e.g. when a cell is re-run) without a KeyError on 'word_ids'.
  """
  masked_features = []

  for feature in features:
    # Shallow copy so popping 'word_ids' does not alter the caller's dict
    example = dict(feature)
    word_ids = example.pop('word_ids')
    mapping_dict = collections.defaultdict(list)
    current_word_index = -1
    current_word = None

    # Group token positions by the running index of the word they belong to
    for i, word_id in enumerate(word_ids):
      if word_id is not None:
        if word_id != current_word:
          current_word = word_id
          current_word_index += 1
        mapping_dict[current_word_index].append(i)

    # One Bernoulli draw per word decides whether the whole word is masked
    mask = np.random.binomial(1, wwm_probability, len(mapping_dict))

    # Copy input_ids so the masking below does not write into the caller's list
    input_ids = list(example['input_ids'])
    labels = example['labels']
    new_labels = [-100] * len(labels)

    for word_index in np.where(mask)[0]:
      for i in mapping_dict[word_index]:
        new_labels[i] = labels[i] # Setting values to the ones corresponding to masked words and rest are set to -100
        input_ids[i] = tokenizer.mask_token_id

    example['input_ids'] = input_ids
    example['labels'] = new_labels
    masked_features.append(example)

  return tf_default_data_collator(masked_features)

In [None]:
# Take the first two training chunks and pass them through the whole-word-masking collator
train_chunks = lm_dataset['train']
samples = [train_chunks[0], train_chunks[1]]
batch = whole_word_masking_data_collator(samples)

# Decode each masked chunk back to text to see where the [MASK] tokens landed
for chunk in batch['input_ids']:
  print(tokenizer.decode(chunk))

[CLS] i rented i am curious - yellow from [MASK] video store because of all the controversy that surrounded [MASK] when [MASK] [MASK] first released in 1967 [MASK] [MASK] also heard that at first it was seized by u [MASK] s. customs if it ever tried to enter this [MASK], therefore being a fan of [MASK] considered " [MASK] " i really had [MASK] see this for [MASK]. < [MASK] / > < br / > [MASK] plot is [MASK] around a young [MASK] drama student named lena who wants to learn everything she can [MASK] life. in particular she wants to [MASK] her [MASK] [MASK] to making [MASK] sort of documentary on what the average swede thought about certain political issues such
as the [MASK] war [MASK] race issues in the [MASK] states. in between asking politicians and ordinary denizens of stockholm [MASK] their opinions on politics, she has sex with [MASK] drama teacher, classmates, and married men. < br / > < br [MASK] > [MASK] kills me about i am curious [MASK] yellow is that 40 years ago [MASK] this 

### **4. Predict using the Model before Fine-tuning**

In [None]:
# Probe sentence with a single [MASK] slot; reused for the predictions before and after fine-tuning
text = "This is a great [MASK]."

In [None]:
# Encode the probe sentence as NumPy arrays, then run a forward pass and keep the 'logits' output
tokenized_text = tokenizer(text, return_tensors='np')
model_output = model(**tokenized_text)
logits = model_output['logits']

In [None]:
# Locate the [MASK] token: argwhere yields (row, column) pairs, and the column of the first hit is its position
mask_positions = np.argwhere(tokenized_text['input_ids'] == tokenizer.mask_token_id)
mask_token_index = mask_positions[0, 1]
mask_token_logits = logits[0, mask_token_index, :]

# Rank token ids by descending logit (argsort is ascending, hence the negation) and keep the best five
ranked_token_ids = np.argsort(-mask_token_logits)
top_5_tokens = ranked_token_ids[:5].tolist()

In [None]:
# Substitute each candidate token for the [MASK] placeholder and show the completed sentence
for token in top_5_tokens:
  candidate_word = tokenizer.decode(token)
  print(text.replace(tokenizer.mask_token, candidate_word))

This is a great deal.
This is a great success.
This is a great adventure.
This is a great idea.
This is a great feat.


### **5. Fine-tune the Model**

In [None]:
num_epochs = 5
# Total optimizer steps = batches per epoch * number of epochs; handed to the schedule below
num_train_steps = len(tf_train_dataset) * num_epochs

# NOTE(review): num_warmup_steps=1000 may be a large share of num_train_steps
# (10,000 training chunks at batch_size 32 over 5 epochs) — confirm this warmup length is intended
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=num_train_steps,
    num_warmup_steps=1000,
    weight_decay_rate=0.01
)

# No loss is passed to compile() — presumably the model falls back to its built-in loss; TODO confirm
# NOTE(review): 'accuracy' is compared against the collator's labels, which presumably hold -100 at
# unmasked positions — verify the reported accuracy is meaningful
model.compile(optimizer=optimizer, metrics=['accuracy'])

In [None]:
# Perplexity before fine-tuning the model
# evaluate() yields the loss followed by the accuracy metric registered in compile()
loss, accuracy = model.evaluate(tf_test_dataset)
# Perplexity is the exponential of the evaluation loss
perplexity = math.exp(loss)
print(f"Perplexity: {perplexity:.2f}")

Perplexity: 22.61


In [None]:
# Training in mixed-precision float16
# NOTE(review): the policy is set here, after `model` was already instantiated and compiled in earlier cells.
# The Keras mixed-precision guide describes the global policy as applying to layers created afterwards —
# confirm float16 is actually in effect for this model.
tf.keras.mixed_precision.set_global_policy('mixed_float16')

# `history` keeps the object returned by fit() for the num_epochs training run
history = model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=num_epochs, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Perplexity after fine-tuning the model
# evaluate() yields the loss followed by the accuracy metric registered in compile()
loss, accuracy = model.evaluate(tf_test_dataset)
# Perplexity is the exponential of the evaluation loss
perplexity = math.exp(loss)
print(f"Perplexity: {perplexity:.2f}")

Perplexity: 10.77


### **6. Predict using the Fine-tuned Model**

In [None]:
# Wrap the fine-tuned model and its tokenizer in a fill-mask pipeline
mask_filler = pipeline('fill-mask', model=model, tokenizer=tokenizer)

In [None]:
# Ask the pipeline to fill the [MASK] slot of the probe sentence
predictions = mask_filler(text)

# Each prediction carries the completed sentence under the 'sequence' key
for pred in predictions:
  print(pred['sequence'])

this is a great film.
this is a great movie.
this is a great idea.
this is a great adventure.
this is a great show.
