#### Installing Libraries

In [1]:
pip install datasets transformers nltk conllu torch

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\99TECHNOLIGIES\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.




In [None]:
pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp39-cp39-win_amd64.whl (11.0 MB)
Collecting threadpoolctl>=3.1.0
  Downloading threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Collecting scipy>=1.6.0
  Downloading scipy-1.13.1-cp39-cp39-win_amd64.whl (46.2 MB)
Installing collected packages: threadpoolctl, scipy, scikit-learn
Successfully installed scikit-learn-1.5.2 scipy-1.13.1 threadpoolctl-3.5.0
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\99TECHNOLIGIES\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


In [None]:
pip install transformers[torch]

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\99TECHNOLIGIES\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


In [2]:
# Universal Dependencies = https://universaldependencies.org/#download
# HF = https://huggingface.co/datasets/universal-dependencies/universal_dependencies/blob/main/universal_dependencies.py


### Loading the UD dataset from HF: https://huggingface.co/datasets/universal-dependencies/universal_dependencies/blob/main/universal_dependencies.py

In [1]:
from datasets import load_dataset

# Load the Universal Dependencies dataset
dataset = load_dataset(r"universal-dependencies/universal_dependencies",'ur_udtb',trust_remote_code=True) # Urdu language

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset

DatasetDict({
    train: Dataset({
        features: ['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'],
        num_rows: 4043
    })
    validation: Dataset({
        features: ['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'],
        num_rows: 552
    })
    test: Dataset({
        features: ['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'],
        num_rows: 535
    })
})

### Reducing the size of dataset to 100 samples in each category

In [3]:
from datasets import DatasetDict
import random

# Function to select a random subset of 100 samples from each dataset split
def reduce_to_100_samples(dataset_dict):
    reduced_dict = {}
    for split, dataset in dataset_dict.items():
        # Shuffle the indices
        indices = list(range(len(dataset)))
        random.shuffle(indices)

        # Select the first 100 indices
        selected_indices = indices[:100]

        # Select the subset from the dataset
        reduced_dict[split] = dataset.select(selected_indices)

    return DatasetDict(reduced_dict)

In [4]:
original_dataset = dataset.copy()
dataset = reduce_to_100_samples(dataset)

In [5]:
dataset


DatasetDict({
    train: Dataset({
        features: ['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'],
        num_rows: 100
    })
    test: Dataset({
        features: ['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'],
        num_rows: 100
    })
})

In [8]:
dataset.shape # 200 samples.

{'train': (100, 11), 'validation': (100, 11), 'test': (100, 11)}

In [9]:
dataset['train'][0]

{'idx': 'train-s1146',
 'text': 'اقوام متحدہ اور حکومت کے عہدیداروں نے یہ بات بتائی۔',
 'tokens': ['اقوام',
  'متحدہ',
  'اور',
  'حکومت',
  'کے',
  'عہدیداروں',
  'نے',
  'یہ',
  'بات',
  'بتائی',
  '۔'],
 'lemmas': ['اقوام',
  'متحدہ',
  'اور',
  'حکومت',
  'کا',
  'عہدیدار',
  'نے',
  'یہ',
  'بات',
  'بتا',
  '۔'],
 'upos': [10, 10, 9, 0, 2, 0, 2, 8, 0, 16, 1],
 'xpos': ['NNPC',
  'NNP',
  'CC',
  'NN',
  'PSP',
  'NN',
  'PSP',
  'DEM',
  'NN',
  'VM',
  'SYM'],
 'feats': ["{'Case': 'Nom', 'Gender': 'Masc', 'Number': 'Sing', 'Person': '3'}",
  "{'Case': 'Nom', 'Gender': 'Masc', 'Number': 'Sing', 'Person': '3'}",
  'None',
  "{'Case': 'Acc', 'Gender': 'Fem', 'Number': 'Sing', 'Person': '3'}",
  "{'AdpType': 'Post', 'Case': 'Acc', 'Gender': 'Masc', 'Number': 'Plur'}",
  "{'Case': 'Acc', 'Gender': 'Masc', 'Number': 'Plur', 'Person': '3'}",
  "{'AdpType': 'Post'}",
  "{'Case': 'Nom', 'Number': 'Sing', 'Person': '3', 'PronType': 'Dem'}",
  "{'Case': 'Nom', 'Gender': 'Fem', 'Number': 'S

In [10]:
dataset['train']['text'][0]

'اقوام متحدہ اور حکومت کے عہدیداروں نے یہ بات بتائی۔'

#### Tokenizing the dataset and aligning the labels accordingly for model input 

In [6]:
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForTokenClassification, Trainer, TrainingArguments
import numpy as np

# Load tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Define a function to preprocess and tokenize the data
def tokenize_and_align_labels(examples):
    # Tokenize the inputs
    tokenized_inputs = tokenizer(examples['tokens'], truncation=True, padding='max_length', is_split_into_words=True)

    # Align labels with tokens
    labels = []
    for i, label in enumerate(examples['upos']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Special tokens
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)  # Subword tokens
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs['labels'] = labels
    return tokenized_inputs



In [7]:
# Apply preprocessing to the entire dataset
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

# Ensure the labels and tokens are correctly processed
print(tokenized_datasets['train'][0])

Map: 100%|██████████| 100/100 [00:00<00:00, 884.47 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 970.02 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 1094.84 examples/s]

{'idx': 'train-s310', 'text': 'حکومت مہاراشٹرا نے اس سے پہلے بھی انران برقی پراجکٹ مےں ناقص اقدامات کئے تھے جس کی وجہ سے پراجکٹ شروع نہیں ہوا۔', 'tokens': ['حکومت', 'مہاراشٹرا', 'نے', 'اس', 'سے', 'پہلے', 'بھی', 'انران', 'برقی', 'پراجکٹ', 'مےں', 'ناقص', 'اقدامات', 'کئے', 'تھے', 'جس', 'کی', 'وجہ', 'سے', 'پراجکٹ', 'شروع', 'نہیں', 'ہوا', '۔'], 'lemmas': ['حکومت', 'مہاراشٹرا', 'نے', 'یہ', 'سے', 'پہلے', 'بھی', 'انران', 'برقی', 'پراجکٹ', 'مےں', 'ناقص', 'اقدام', 'کر', 'تھا', 'جو', 'کا', 'وجہ', 'سے', 'پراجکٹ', 'شروع', 'نہیں', 'ہو', '۔'], 'upos': [0, 10, 2, 11, 2, 2, 7, 10, 10, 10, 2, 6, 0, 16, 17, 11, 2, 2, 2, 0, 0, 7, 16, 1], 'xpos': ['NNZ', 'NNP', 'PSP', 'PRP', 'PSP', 'NST', 'RP', 'NNPC', 'NNPC', 'NNP', 'PSP', 'JJ', 'NN', 'VM', 'VAUX', 'PRP', 'PSP', 'PSP', 'PSP', 'NN', 'NN', 'NEG', 'VM', 'SYM'], 'feats': ["{'Case': 'Acc', 'Gender': 'Fem', 'Number': 'Sing', 'Person': '3'}", "{'Case': 'Acc', 'Gender': 'Masc', 'Number': 'Sing', 'Person': '3'}", "{'AdpType': 'Post'}", "{'Case': 'Acc', 'Number': '




In [8]:
tokenized_datasets['train']['labels'][0]

[-100,
 0,
 -100,
 -100,
 -100,
 -100,
 10,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 2,
 -100,
 11,
 -100,
 2,
 -100,
 2,
 -100,
 -100,
 -100,
 7,
 -100,
 -100,
 10,
 -100,
 -100,
 -100,
 10,
 -100,
 -100,
 -100,
 10,
 -100,
 -100,
 -100,
 -100,
 -100,
 2,
 -100,
 -100,
 6,
 -100,
 -100,
 -100,
 0,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 16,
 -100,
 -100,
 17,
 -100,
 -100,
 11,
 -100,
 2,
 -100,
 2,
 -100,
 -100,
 2,
 -100,
 0,
 -100,
 -100,
 -100,
 -100,
 -100,
 0,
 -100,
 -100,
 -100,
 7,
 -100,
 -100,
 -100,
 16,
 -100,
 -100,
 1,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,

In [9]:
# Get all unique labels
all_labels = [label for sublist in tokenized_datasets['train']['labels'] for label in sublist if label != -100]
unique_labels = set(all_labels)
print("Unique labels:", unique_labels)
print("Number of unique labels:", len(unique_labels))


Unique labels: {0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 14, 15, 16, 17}
Number of unique labels: 15


In [10]:
# return the maximum label value found in the tokenized_datasets['train']['labels']
max_label_train = max(label for labels in tokenized_datasets['train']['labels'] for label in labels)
max_label_val = max(label for labels in tokenized_datasets['validation']['labels'] for label in labels)
max_label_test = max(label for labels in tokenized_datasets['test']['labels'] for label in labels)

maxi = max(max_label_train,max_label_val,max_label_test)

num_labels = maxi + 1  # Add 1 because labels start from 0
print(f'num_labels = {num_labels}')

num_labels = 18


#### Loading the distilbert model from HF

In [22]:
from transformers import DistilBertForTokenClassification, TrainingArguments, Trainer

# Load pre-trained DistilBERT model with token classification head
model = DistilBertForTokenClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)

In [12]:
model

DistilBertForTokenClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
    

#### Full model finetuning

- Time  = 32 min
- Eval Loss = 1.100
- Epoch = 10

In [31]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',              # output directory
    evaluation_strategy='epoch',         # evaluate every epoch
    learning_rate=2e-5,                  # learning rate
    per_device_train_batch_size=8,       # batch size for training
    per_device_eval_batch_size=8,        # batch size for evaluation
    num_train_epochs=10,                  # number of training epochs
    weight_decay=0.01,                   # strength of weight decay
)

# Initialize the Trainer
trainer = Trainer(
    model=model,                         # the instantiated model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=tokenized_datasets['train'],  # training dataset
    eval_dataset=tokenized_datasets['validation'] if 'validation' in tokenized_datasets else tokenized_datasets['train'],  # evaluation dataset
)

# Train the model
trainer.train()

 10%|█         | 13/130 [02:38<17:52,  9.17s/it]

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

                                                


[A[A[A                                      
[A                                                   

 10%|█         | 13/130 [03:18<17:52,  9.17s/it]     

[A[A
[A

[A[A

{'eval_loss': 2.110231399536133, 'eval_runtime': 39.5795, 'eval_samples_per_second': 2.527, 'eval_steps_per_second': 0.328, 'epoch': 1.0}


 20%|██        | 26/130 [05:30<15:41,  9.05s/it]

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

                                                


[A[A[A                                      
[A                                                   

 20%|██        | 26/130 [06:10<15:41,  9.05s/it]     

[A[A
[A

[A[A

{'eval_loss': 1.8057769536972046, 'eval_runtime': 40.0695, 'eval_samples_per_second': 2.496, 'eval_steps_per_second': 0.324, 'epoch': 2.0}


 30%|███       | 39/130 [08:20<13:40,  9.01s/it]

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

                                                
[A                                                   

[A[A                                               


 30%|███       | 39/130 [09:00<13:40,  9.01s/it]
[A

[A[A

[A[A

{'eval_loss': 1.569796085357666, 'eval_runtime': 39.7699, 'eval_samples_per_second': 2.514, 'eval_steps_per_second': 0.327, 'epoch': 3.0}


 40%|████      | 52/130 [11:17<12:12,  9.39s/it]

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

                                                
[A                                                   

[A[A                                               


 40%|████      | 52/130 [12:01<12:12,  9.39s/it]
[A

[A[A

[A[A

{'eval_loss': 1.4096968173980713, 'eval_runtime': 44.203, 'eval_samples_per_second': 2.262, 'eval_steps_per_second': 0.294, 'epoch': 4.0}


 50%|█████     | 65/130 [14:15<09:43,  8.98s/it]

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

                                                


[A[A[A                                      
[A                                                   

 50%|█████     | 65/130 [15:06<09:43,  8.98s/it]     

[A[A
[A

[A[A

{'eval_loss': 1.304577350616455, 'eval_runtime': 50.6972, 'eval_samples_per_second': 1.972, 'eval_steps_per_second': 0.256, 'epoch': 5.0}


 60%|██████    | 78/130 [17:44<10:22, 11.97s/it]

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

                                                


[A[A[A                                      
[A                                                   

 60%|██████    | 78/130 [18:34<10:22, 11.97s/it]     

[A[A
[A

[A[A

{'eval_loss': 1.2233675718307495, 'eval_runtime': 50.0192, 'eval_samples_per_second': 1.999, 'eval_steps_per_second': 0.26, 'epoch': 6.0}


 70%|███████   | 91/130 [21:11<06:22,  9.81s/it]

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

                                                
[A                                                   

[A[A                                               


 70%|███████   | 91/130 [21:53<06:22,  9.81s/it]
[A

[A[A

[A[A

{'eval_loss': 1.162559986114502, 'eval_runtime': 42.3766, 'eval_samples_per_second': 2.36, 'eval_steps_per_second': 0.307, 'epoch': 7.0}


 80%|████████  | 104/130 [24:32<04:51, 11.22s/it]

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

                                                 


[A[A[A                                      
[A                                                   

 80%|████████  | 104/130 [25:22<04:51, 11.22s/it]    

[A[A
[A

[A[A

{'eval_loss': 1.1342713832855225, 'eval_runtime': 49.9434, 'eval_samples_per_second': 2.002, 'eval_steps_per_second': 0.26, 'epoch': 8.0}


 90%|█████████ | 117/130 [28:15<02:24, 11.10s/it]

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

                                                 
[A                                                   

[A[A                                               


 90%|█████████ | 117/130 [29:02<02:24, 11.10s/it]
[A

[A[A

[A[A

{'eval_loss': 1.1057898998260498, 'eval_runtime': 46.5979, 'eval_samples_per_second': 2.146, 'eval_steps_per_second': 0.279, 'epoch': 9.0}


100%|██████████| 130/130 [31:43<00:00, 11.39s/it]

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

                                                 


[A[A[A                                      
[A                                                   

100%|██████████| 130/130 [32:29<00:00, 11.39s/it]    

[A[A
[A

                                                 
[A                                                   

100%|██████████| 130/130 [32:29<00:00, 11.39s/it]    
100%|██████████| 130/130 [32:29<00:00, 15.00s/it]

{'eval_loss': 1.100944995880127, 'eval_runtime': 41.9908, 'eval_samples_per_second': 2.381, 'eval_steps_per_second': 0.31, 'epoch': 10.0}
{'train_runtime': 1949.9673, 'train_samples_per_second': 0.513, 'train_steps_per_second': 0.067, 'train_loss': 1.5090125450721155, 'epoch': 10.0}





TrainOutput(global_step=130, training_loss=1.5090125450721155, metrics={'train_runtime': 1949.9673, 'train_samples_per_second': 0.513, 'train_steps_per_second': 0.067, 'total_flos': 130690897920000.0, 'train_loss': 1.5090125450721155, 'epoch': 10.0})

#### Model finetuning by freezing the embedding layers

- Time  = 27 min
- Eval Loss = 1.108413815498352
- Epoch = 10

In [13]:
model2 = DistilBertForTokenClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
print(model2)

DistilBertForTokenClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
    

In [34]:
# Freeze embedding layers
for param in model2.distilbert.embeddings.parameters():
    param.requires_grad = False

In [35]:
# Freeze some transformer layers
# DistilBERT has 6 transformer layers, let's say we want to freeze the first 3
for i, layer in enumerate(model2.distilbert.transformer.layer):
    if i < 3:  # Adjust this index based on how many layers you want to freeze
        for param in layer.parameters():
            param.requires_grad = True

# Optionally, you might want to print the parameters to verify
for name, param in model2.named_parameters():
    if not param.requires_grad:
        print(f"Layer '{name}' is frozen.")
    else:
        print(f"Layer '{name}' is trainable.")


Layer 'distilbert.embeddings.word_embeddings.weight' is frozen.
Layer 'distilbert.embeddings.position_embeddings.weight' is frozen.
Layer 'distilbert.embeddings.LayerNorm.weight' is frozen.
Layer 'distilbert.embeddings.LayerNorm.bias' is frozen.
Layer 'distilbert.transformer.layer.0.attention.q_lin.weight' is trainable.
Layer 'distilbert.transformer.layer.0.attention.q_lin.bias' is trainable.
Layer 'distilbert.transformer.layer.0.attention.k_lin.weight' is trainable.
Layer 'distilbert.transformer.layer.0.attention.k_lin.bias' is trainable.
Layer 'distilbert.transformer.layer.0.attention.v_lin.weight' is trainable.
Layer 'distilbert.transformer.layer.0.attention.v_lin.bias' is trainable.
Layer 'distilbert.transformer.layer.0.attention.out_lin.weight' is trainable.
Layer 'distilbert.transformer.layer.0.attention.out_lin.bias' is trainable.
Layer 'distilbert.transformer.layer.0.sa_layer_norm.weight' is trainable.
Layer 'distilbert.transformer.layer.0.sa_layer_norm.bias' is trainable.
Laye

In [36]:
# Define training arguments 1:55PM
training_args2 = TrainingArguments(
    output_dir='./results2',              # output directory
    evaluation_strategy='epoch',         # evaluate every epoch
    learning_rate=2e-5,                  # learning rate
    per_device_train_batch_size=8,       # batch size for training
    per_device_eval_batch_size=8,        # batch size for evaluation
    num_train_epochs=10,                  # number of training epochs
    weight_decay=0.01,                   # strength of weight decay
)

# Initialize the Trainer
trainer2 = Trainer(
    model=model2,                         # the instantiated model to be trained
    args=training_args2,                  # training arguments, defined above
    train_dataset=tokenized_datasets['train'],  # training dataset
    eval_dataset=tokenized_datasets['validation'] if 'validation' in tokenized_datasets else tokenized_datasets['train'],  # evaluation dataset
)

# Train the model
trainer2.train()

 10%|█         | 13/130 [02:19<16:49,  8.63s/it]

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

                                                
[A                                                   

[A[A                                               


 10%|█         | 13/130 [02:56<16:49,  8.63s/it]
[A

[A[A

[A[A

{'eval_loss': 2.1129345893859863, 'eval_runtime': 37.2486, 'eval_samples_per_second': 2.685, 'eval_steps_per_second': 0.349, 'epoch': 1.0}


 20%|██        | 26/130 [04:51<13:39,  7.88s/it]

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

                                                


[A[A[A                                      
[A                                                   

 20%|██        | 26/130 [05:27<13:39,  7.88s/it]      

[A[A
[A

[A[A

{'eval_loss': 1.813246488571167, 'eval_runtime': 36.6728, 'eval_samples_per_second': 2.727, 'eval_steps_per_second': 0.354, 'epoch': 2.0}


 30%|███       | 39/130 [07:29<12:35,  8.31s/it]

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

                                                


[A[A[A                                      
[A                                                   

 30%|███       | 39/130 [08:08<12:35,  8.31s/it]      

[A[A
[A

[A[A

{'eval_loss': 1.5781059265136719, 'eval_runtime': 39.307, 'eval_samples_per_second': 2.544, 'eval_steps_per_second': 0.331, 'epoch': 3.0}


 40%|████      | 52/130 [10:13<10:52,  8.36s/it]

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

                                                


[A[A[A                                      
[A                                                   

 40%|████      | 52/130 [10:53<10:52,  8.36s/it]      

[A[A
[A

[A[A

{'eval_loss': 1.4190348386764526, 'eval_runtime': 39.5584, 'eval_samples_per_second': 2.528, 'eval_steps_per_second': 0.329, 'epoch': 4.0}


 50%|█████     | 65/130 [12:54<09:03,  8.36s/it]

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

                                                
[A                                                   

[A[A                                                


 50%|█████     | 65/130 [13:31<09:03,  8.36s/it]
[A

[A[A

[A[A

{'eval_loss': 1.3141262531280518, 'eval_runtime': 36.6844, 'eval_samples_per_second': 2.726, 'eval_steps_per_second': 0.354, 'epoch': 5.0}


 60%|██████    | 78/130 [15:25<06:47,  7.83s/it]

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

                                                


[A[A[A                                      
[A                                                   

 60%|██████    | 78/130 [16:01<06:47,  7.83s/it]      

[A[A
[A

[A[A

{'eval_loss': 1.2323946952819824, 'eval_runtime': 36.5145, 'eval_samples_per_second': 2.739, 'eval_steps_per_second': 0.356, 'epoch': 6.0}


 70%|███████   | 91/130 [18:09<05:32,  8.53s/it]

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

                                                
[A                                                   

[A[A                                                


 70%|███████   | 91/130 [18:52<05:32,  8.53s/it]
[A

[A[A

[A[A

{'eval_loss': 1.170549988746643, 'eval_runtime': 43.5946, 'eval_samples_per_second': 2.294, 'eval_steps_per_second': 0.298, 'epoch': 7.0}


 80%|████████  | 104/130 [20:54<03:32,  8.17s/it]

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

                                                 


[A[A[A                                      
[A                                                   

 80%|████████  | 104/130 [21:32<03:32,  8.17s/it]     

[A[A
[A

[A[A

{'eval_loss': 1.1426864862442017, 'eval_runtime': 37.5272, 'eval_samples_per_second': 2.665, 'eval_steps_per_second': 0.346, 'epoch': 8.0}


 90%|█████████ | 117/130 [23:31<01:46,  8.18s/it]

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

                                                 


[A[A[A                                      
[A                                                   

 90%|█████████ | 117/130 [24:10<01:46,  8.18s/it]     

[A[A
[A

[A[A

{'eval_loss': 1.1133006811141968, 'eval_runtime': 39.3964, 'eval_samples_per_second': 2.538, 'eval_steps_per_second': 0.33, 'epoch': 9.0}


100%|██████████| 130/130 [26:19<00:00,  8.81s/it]

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

                                                 
[A                                                   

[A[A                                                


100%|██████████| 130/130 [27:08<00:00,  8.81s/it]
[A

[A[A

                                                 
[A                                                   

100%|██████████| 130/130 [27:08<00:00,  8.81s/it]     
100%|██████████| 130/130 [27:08<00:00, 12.53s/it]

{'eval_loss': 1.108413815498352, 'eval_runtime': 46.0795, 'eval_samples_per_second': 2.17, 'eval_steps_per_second': 0.282, 'epoch': 10.0}
{'train_runtime': 1628.4807, 'train_samples_per_second': 0.614, 'train_steps_per_second': 0.08, 'train_loss': 1.5211551372821515, 'epoch': 10.0}





TrainOutput(global_step=130, training_loss=1.5211551372821515, metrics={'train_runtime': 1628.4807, 'train_samples_per_second': 0.614, 'train_steps_per_second': 0.08, 'total_flos': 130690897920000.0, 'train_loss': 1.5211551372821515, 'epoch': 10.0})

#### Reloading the finetuning model (freezed embedding layers) to finetune it for 10 more epochs

- Total Time = 57 Min
- Loss = 0.84
- Total epoch = 20

In [16]:
model2_10e = DistilBertForTokenClassification.from_pretrained('./results2/checkpoint-130', num_labels=num_labels) # ./results2/checkpoint-130

In [54]:
# Freeze embedding layers
for param in model2_10e.distilbert.embeddings.parameters():
    param.requires_grad = False


for i, layer in enumerate(model2_10e.distilbert.transformer.layer):
    if i < 3:  # Adjust this index based on how many layers you want to freeze
        for param in layer.parameters():
            param.requires_grad = True


for name, param in model2_10e.named_parameters():
    if not param.requires_grad:
        print(f"Layer '{name}' is frozen.")
    else:
        print(f"Layer '{name}' is trainable.")

Layer 'distilbert.embeddings.word_embeddings.weight' is frozen.
Layer 'distilbert.embeddings.position_embeddings.weight' is frozen.
Layer 'distilbert.embeddings.LayerNorm.weight' is frozen.
Layer 'distilbert.embeddings.LayerNorm.bias' is frozen.
Layer 'distilbert.transformer.layer.0.attention.q_lin.weight' is trainable.
Layer 'distilbert.transformer.layer.0.attention.q_lin.bias' is trainable.
Layer 'distilbert.transformer.layer.0.attention.k_lin.weight' is trainable.
Layer 'distilbert.transformer.layer.0.attention.k_lin.bias' is trainable.
Layer 'distilbert.transformer.layer.0.attention.v_lin.weight' is trainable.
Layer 'distilbert.transformer.layer.0.attention.v_lin.bias' is trainable.
Layer 'distilbert.transformer.layer.0.attention.out_lin.weight' is trainable.
Layer 'distilbert.transformer.layer.0.attention.out_lin.bias' is trainable.
Layer 'distilbert.transformer.layer.0.sa_layer_norm.weight' is trainable.
Layer 'distilbert.transformer.layer.0.sa_layer_norm.bias' is trainable.
Laye

In [43]:
# Define training arguments 1:55PM
training_args3 = TrainingArguments(
    output_dir='./results3',              # output directory
    evaluation_strategy='epoch',         # evaluate every epoch
    learning_rate=2e-5,                  # learning rate
    per_device_train_batch_size=8,       # batch size for training
    per_device_eval_batch_size=8,        # batch size for evaluation
    num_train_epochs=10,                  # number of training epochs
    weight_decay=0.01,                   # strength of weight decay
)

trainer3 = Trainer(
    model=model2_10e,                      # loaded model from checkpoint
    args=training_args3,               
    train_dataset=tokenized_datasets['train'],  
    eval_dataset=tokenized_datasets['validation']
)

# Continue training for another 10 epochs
trainer3.train()  # This ensures that training continues from the checkpoint


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 
[A                                             

100%|██████████| 130/130 [37:36<00:00,  8.08s/it]
[A
[A

{'eval_loss': 1.0495502948760986, 'eval_runtime': 45.4275, 'eval_samples_per_second': 2.201, 'eval_steps_per_second': 0.286, 'epoch': 1.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 
[A                                             

100%|██████████| 130/130 [42:24<00:00,  8.08s/it]
[A
[A

{'eval_loss': 0.9684075713157654, 'eval_runtime': 37.2487, 'eval_samples_per_second': 2.685, 'eval_steps_per_second': 0.349, 'epoch': 2.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 
[A                                             

100%|██████████| 130/130 [45:17<00:00,  8.08s/it]
[A
[A

{'eval_loss': 0.9327095746994019, 'eval_runtime': 32.6302, 'eval_samples_per_second': 3.065, 'eval_steps_per_second': 0.398, 'epoch': 3.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 
[A                                             

100%|██████████| 130/130 [47:57<00:00,  8.08s/it]
[A
[A

{'eval_loss': 0.896435022354126, 'eval_runtime': 39.7543, 'eval_samples_per_second': 2.515, 'eval_steps_per_second': 0.327, 'epoch': 4.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 
[A                                             

100%|██████████| 130/130 [50:49<00:00,  8.08s/it]
[A
[A

{'eval_loss': 0.8783755302429199, 'eval_runtime': 51.9392, 'eval_samples_per_second': 1.925, 'eval_steps_per_second': 0.25, 'epoch': 5.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 
[A                                             

100%|██████████| 130/130 [54:08<00:00,  8.08s/it]
[A
[A

{'eval_loss': 0.862865686416626, 'eval_runtime': 62.7463, 'eval_samples_per_second': 1.594, 'eval_steps_per_second': 0.207, 'epoch': 6.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 
[A                                             

100%|██████████| 130/130 [57:22<00:00,  8.08s/it]
[A
[A

{'eval_loss': 0.8608485460281372, 'eval_runtime': 51.0131, 'eval_samples_per_second': 1.96, 'eval_steps_per_second': 0.255, 'epoch': 7.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 
[A                                              

100%|██████████| 130/130 [1:00:15<00:00,  8.08s/it]
[A
[A

{'eval_loss': 0.8591542840003967, 'eval_runtime': 41.7349, 'eval_samples_per_second': 2.396, 'eval_steps_per_second': 0.311, 'epoch': 8.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   
[A                                              

100%|██████████| 130/130 [1:03:24<00:00,  8.08s/it]
[A
[A

{'eval_loss': 0.8446149230003357, 'eval_runtime': 44.2916, 'eval_samples_per_second': 2.258, 'eval_steps_per_second': 0.294, 'epoch': 9.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   
[A                                              

100%|██████████| 130/130 [1:06:45<00:00,  8.08s/it]
[A
                                                   
100%|██████████| 130/130 [32:31<00:00, 15.01s/it]t]

{'eval_loss': 0.8459992408752441, 'eval_runtime': 42.855, 'eval_samples_per_second': 2.333, 'eval_steps_per_second': 0.303, 'epoch': 10.0}
{'train_runtime': 1951.3232, 'train_samples_per_second': 0.512, 'train_steps_per_second': 0.067, 'train_loss': 0.8402340815617488, 'epoch': 10.0}





TrainOutput(global_step=130, training_loss=0.8402340815617488, metrics={'train_runtime': 1951.3232, 'train_samples_per_second': 0.512, 'train_steps_per_second': 0.067, 'total_flos': 130690897920000.0, 'train_loss': 0.8402340815617488, 'epoch': 10.0})

#### Finetuning the model by freezing the first 3 layers Transformer layers and embedding layer

- Total Time = 23 Min
- Loss = 1.03
- Epoch = 10

In [44]:
model3 = DistilBertForTokenClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [45]:
print(model3)

DistilBertForTokenClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
    

In [46]:
# Freeze embedding layers
for param in model3.distilbert.embeddings.parameters():
    param.requires_grad = False

In [49]:
# Freeze some transformer layers
# DistilBERT has 6 transformer layers, let's say we want to freeze the first 3
for i, layer in enumerate(model3.distilbert.transformer.layer):
    if i < 3:  # Adjust this index based on how many layers you want to freeze
        for param in layer.parameters():
            param.requires_grad = False

# Optionally, you might want to print the parameters to verify
for name, param in model3.named_parameters():
    if not param.requires_grad:
        print(f"Layer '{name}' is frozen.")
    else:
        print(f"Layer '{name}' is trainable.")


Layer 'distilbert.embeddings.word_embeddings.weight' is frozen.
Layer 'distilbert.embeddings.position_embeddings.weight' is frozen.
Layer 'distilbert.embeddings.LayerNorm.weight' is frozen.
Layer 'distilbert.embeddings.LayerNorm.bias' is frozen.
Layer 'distilbert.transformer.layer.0.attention.q_lin.weight' is frozen.
Layer 'distilbert.transformer.layer.0.attention.q_lin.bias' is frozen.
Layer 'distilbert.transformer.layer.0.attention.k_lin.weight' is frozen.
Layer 'distilbert.transformer.layer.0.attention.k_lin.bias' is frozen.
Layer 'distilbert.transformer.layer.0.attention.v_lin.weight' is frozen.
Layer 'distilbert.transformer.layer.0.attention.v_lin.bias' is frozen.
Layer 'distilbert.transformer.layer.0.attention.out_lin.weight' is frozen.
Layer 'distilbert.transformer.layer.0.attention.out_lin.bias' is frozen.
Layer 'distilbert.transformer.layer.0.sa_layer_norm.weight' is frozen.
Layer 'distilbert.transformer.layer.0.sa_layer_norm.bias' is frozen.
Layer 'distilbert.transformer.laye

In [50]:
# Define training arguments
training_args4 = TrainingArguments(
    output_dir='./results4',              # output directory
    evaluation_strategy='epoch',         # evaluate every epoch
    learning_rate=2e-5,                  # learning rate
    per_device_train_batch_size=8,       # batch size for training
    per_device_eval_batch_size=8,        # batch size for evaluation
    num_train_epochs=10,                  # number of training epochs
    weight_decay=0.01,                   # strength of weight decay
)

# Initialize the Trainer
trainer4 = Trainer(
    model=model3,                         # the instantiated model to be trained
    args=training_args4,                  # training arguments, defined above
    train_dataset=tokenized_datasets['train'],  # training dataset
    eval_dataset=tokenized_datasets['validation'] if 'validation' in tokenized_datasets else tokenized_datasets['train'],  # evaluation dataset
)

# Train the model
trainer4.train()


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   
[A                                             

100%|██████████| 130/130 [1:19:31<00:00,  8.08s/it]
[A
[A

{'eval_loss': 2.2518470287323, 'eval_runtime': 40.7476, 'eval_samples_per_second': 2.454, 'eval_steps_per_second': 0.319, 'epoch': 1.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   
[A                                             

100%|██████████| 130/130 [1:21:40<00:00,  8.08s/it]
[A
[A

{'eval_loss': 2.04111909866333, 'eval_runtime': 39.6434, 'eval_samples_per_second': 2.522, 'eval_steps_per_second': 0.328, 'epoch': 2.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   
[A                                             

100%|██████████| 130/130 [1:23:54<00:00,  8.08s/it]
[A
[A

{'eval_loss': 1.869015097618103, 'eval_runtime': 43.221, 'eval_samples_per_second': 2.314, 'eval_steps_per_second': 0.301, 'epoch': 3.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   
[A                                             

100%|██████████| 130/130 [1:26:05<00:00,  8.08s/it]
[A
[A

{'eval_loss': 1.7263119220733643, 'eval_runtime': 40.5433, 'eval_samples_per_second': 2.466, 'eval_steps_per_second': 0.321, 'epoch': 4.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   
[A                                             

100%|██████████| 130/130 [1:27:57<00:00,  8.08s/it]
[A
[A

{'eval_loss': 1.619593620300293, 'eval_runtime': 33.4662, 'eval_samples_per_second': 2.988, 'eval_steps_per_second': 0.388, 'epoch': 5.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   
[A                                             

100%|██████████| 130/130 [1:29:46<00:00,  8.08s/it]
[A
[A

{'eval_loss': 1.5414397716522217, 'eval_runtime': 31.0389, 'eval_samples_per_second': 3.222, 'eval_steps_per_second': 0.419, 'epoch': 6.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   
[A                                             

100%|██████████| 130/130 [1:31:33<00:00,  8.08s/it]
[A
[A

{'eval_loss': 1.464524745941162, 'eval_runtime': 35.2468, 'eval_samples_per_second': 2.837, 'eval_steps_per_second': 0.369, 'epoch': 7.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   
[A                                              

100%|██████████| 130/130 [1:33:39<00:00,  8.08s/it]
[A
[A

{'eval_loss': 1.4358428716659546, 'eval_runtime': 44.1673, 'eval_samples_per_second': 2.264, 'eval_steps_per_second': 0.294, 'epoch': 8.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   
[A                                              

100%|██████████| 130/130 [1:35:46<00:00,  8.08s/it]
[A
[A

{'eval_loss': 1.400370478630066, 'eval_runtime': 38.909, 'eval_samples_per_second': 2.57, 'eval_steps_per_second': 0.334, 'epoch': 9.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   
[A                                              

100%|██████████| 130/130 [1:37:40<00:00,  8.08s/it]
[A
                                                   
100%|██████████| 130/130 [20:11<00:00,  9.32s/it]t]

{'eval_loss': 1.393751621246338, 'eval_runtime': 28.8781, 'eval_samples_per_second': 3.463, 'eval_steps_per_second': 0.45, 'epoch': 10.0}
{'train_runtime': 1211.5082, 'train_samples_per_second': 0.825, 'train_steps_per_second': 0.107, 'train_loss': 1.7968050443209134, 'epoch': 10.0}





TrainOutput(global_step=130, training_loss=1.7968050443209134, metrics={'train_runtime': 1211.5082, 'train_samples_per_second': 0.825, 'train_steps_per_second': 0.107, 'total_flos': 130690897920000.0, 'train_loss': 1.7968050443209134, 'epoch': 10.0})

#### Reloading the finetuning model (freezed embedding layers and first 3 Transformer layers) to finetune it for 10 more epochs

- Total Time = 46 Min
- Loss = 1.03
- Total epoch = 20

In [17]:
model3_10e = DistilBertForTokenClassification.from_pretrained('./results4/checkpoint-130', num_labels=num_labels) # ./results2/checkpoint-130

In [55]:
# Freeze embedding layers
for param in model3_10e.distilbert.embeddings.parameters():
    param.requires_grad = False


for i, layer in enumerate(model3_10e.distilbert.transformer.layer):
    if i < 3:  # Adjust this index based on how many layers you want to freeze
        for param in layer.parameters():
            param.requires_grad = False


for name, param in model3_10e.named_parameters():
    if not param.requires_grad:
        print(f"Layer '{name}' is frozen.")
    else:
        print(f"Layer '{name}' is trainable.")

Layer 'distilbert.embeddings.word_embeddings.weight' is frozen.
Layer 'distilbert.embeddings.position_embeddings.weight' is frozen.
Layer 'distilbert.embeddings.LayerNorm.weight' is frozen.
Layer 'distilbert.embeddings.LayerNorm.bias' is frozen.
Layer 'distilbert.transformer.layer.0.attention.q_lin.weight' is frozen.
Layer 'distilbert.transformer.layer.0.attention.q_lin.bias' is frozen.
Layer 'distilbert.transformer.layer.0.attention.k_lin.weight' is frozen.
Layer 'distilbert.transformer.layer.0.attention.k_lin.bias' is frozen.
Layer 'distilbert.transformer.layer.0.attention.v_lin.weight' is frozen.
Layer 'distilbert.transformer.layer.0.attention.v_lin.bias' is frozen.
Layer 'distilbert.transformer.layer.0.attention.out_lin.weight' is frozen.
Layer 'distilbert.transformer.layer.0.attention.out_lin.bias' is frozen.
Layer 'distilbert.transformer.layer.0.sa_layer_norm.weight' is frozen.
Layer 'distilbert.transformer.layer.0.sa_layer_norm.bias' is frozen.
Layer 'distilbert.transformer.laye

In [56]:
# Define training arguments 1:55PM
training_args5 = TrainingArguments(
    output_dir='./results5',              # output directory
    evaluation_strategy='epoch',         # evaluate every epoch
    learning_rate=2e-5,                  # learning rate
    per_device_train_batch_size=8,       # batch size for training
    per_device_eval_batch_size=8,        # batch size for evaluation
    num_train_epochs=10,                  # number of training epochs
    weight_decay=0.01,                   # strength of weight decay
)

trainer5 = Trainer(
    model=model3_10e,                      # loaded model from checkpoint
    args=training_args5,               
    train_dataset=tokenized_datasets['train'],  
    eval_dataset=tokenized_datasets['validation']
)

# Continue training for another 10 epochs
trainer5.train()  # This ensures that training continues from the checkpoint


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   
[A                                             

100%|██████████| 130/130 [1:57:22<00:00,  8.08s/it]
[A
[A

{'eval_loss': 1.3080369234085083, 'eval_runtime': 44.3953, 'eval_samples_per_second': 2.252, 'eval_steps_per_second': 0.293, 'epoch': 1.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   
[A                                             

100%|██████████| 130/130 [2:00:04<00:00,  8.08s/it]
[A
[A

{'eval_loss': 1.2254087924957275, 'eval_runtime': 53.0311, 'eval_samples_per_second': 1.886, 'eval_steps_per_second': 0.245, 'epoch': 2.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   
[A                                             

100%|██████████| 130/130 [2:02:18<00:00,  8.08s/it]
[A
[A

{'eval_loss': 1.178545594215393, 'eval_runtime': 35.5588, 'eval_samples_per_second': 2.812, 'eval_steps_per_second': 0.366, 'epoch': 3.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   
[A                                             

100%|██████████| 130/130 [2:04:24<00:00,  8.08s/it]
[A
[A

{'eval_loss': 1.1154464483261108, 'eval_runtime': 42.0436, 'eval_samples_per_second': 2.378, 'eval_steps_per_second': 0.309, 'epoch': 4.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   
[A                                             

100%|██████████| 130/130 [2:06:51<00:00,  8.08s/it]
[A
[A

{'eval_loss': 1.087414264678955, 'eval_runtime': 41.6297, 'eval_samples_per_second': 2.402, 'eval_steps_per_second': 0.312, 'epoch': 5.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   
[A                                             

100%|██████████| 130/130 [2:09:03<00:00,  8.08s/it]
[A
[A

{'eval_loss': 1.0722606182098389, 'eval_runtime': 41.3843, 'eval_samples_per_second': 2.416, 'eval_steps_per_second': 0.314, 'epoch': 6.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   
[A                                             

100%|██████████| 130/130 [2:11:17<00:00,  8.08s/it]
[A
[A

{'eval_loss': 1.0522233247756958, 'eval_runtime': 43.143, 'eval_samples_per_second': 2.318, 'eval_steps_per_second': 0.301, 'epoch': 7.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   
[A                                              

100%|██████████| 130/130 [2:13:30<00:00,  8.08s/it]
[A
[A

{'eval_loss': 1.0484347343444824, 'eval_runtime': 43.4365, 'eval_samples_per_second': 2.302, 'eval_steps_per_second': 0.299, 'epoch': 8.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   
[A                                              

100%|██████████| 130/130 [2:15:51<00:00,  8.08s/it]
[A
[A

{'eval_loss': 1.0348098278045654, 'eval_runtime': 47.1772, 'eval_samples_per_second': 2.12, 'eval_steps_per_second': 0.276, 'epoch': 9.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   
[A                                              

100%|██████████| 130/130 [2:17:54<00:00,  8.08s/it]
[A
                                                   
100%|██████████| 130/130 [23:22<00:00, 10.79s/it]t]

{'eval_loss': 1.0335370302200317, 'eval_runtime': 40.0826, 'eval_samples_per_second': 2.495, 'eval_steps_per_second': 0.324, 'epoch': 10.0}
{'train_runtime': 1402.7865, 'train_samples_per_second': 0.713, 'train_steps_per_second': 0.093, 'train_loss': 1.144165508563702, 'epoch': 10.0}





TrainOutput(global_step=130, training_loss=1.144165508563702, metrics={'train_runtime': 1402.7865, 'train_samples_per_second': 0.713, 'train_steps_per_second': 0.093, 'total_flos': 130690897920000.0, 'train_loss': 1.144165508563702, 'epoch': 10.0})

Explanation:
Tokenization (tokenize_and_align_labels): This function tokenizes the inputs and aligns the labels (upos) with the subword tokens produced by the tokenizer. Subwords and special tokens are ignored for the labels (-100).

Mapping to Test Set: The test dataset is tokenized and aligned using dataset['test'].map(tokenize_and_align_labels).

Evaluation Process:

We loop through the tokenized dataset and feed inputs (e.g., input_ids, attention_mask) to the model.
The model returns logits, which are then converted into predicted labels using argmax.
The predictions are compared to the true labels (upos) while ignoring subword tokens and padding tokens.
Accuracy Calculation: Finally, we compute accuracy using accuracy_score from sklearn.

Next Steps:
Ensure you have a correct num_labels (which should be equal to the number of unique upos tags).
Replace any dummy prediction logic (pred = true_pos) if you have a specific model trained on the task. This code assumes that the DistilBertForTokenClassification model is trained to predict POS tags on the dataset you are working with.

In [26]:
'''

for batch in dataset:
Explanation: This line iterates over each batch in the dataset (which in this case is the tokenized test set). The dataset here is expected to be tokenized and prepared with features like input_ids, attention_mask, and labels.
2. inputs = {key: torch.tensor(val).unsqueeze(0) for key, val in batch.items() if key in ['input_ids', 'attention_mask']}
Explanation:
batch.items(): This extracts the features from each batch, where each batch contains input_ids, attention_mask, and labels.
torch.tensor(val).unsqueeze(0): This converts the value (val) into a PyTorch tensor and adds a new dimension using .unsqueeze(0) to create a batch of size 1. This is because the model expects inputs in batch format (even if we have only one sequence at a time).
The keys we are interested in are 'input_ids' and 'attention_mask', which represent the tokenized input and the attention mask (to ignore padding).
This line constructs a dictionary called inputs, containing tensors for input_ids and attention_mask.
3. with torch.no_grad():
Explanation:
This line indicates that we're performing inference without updating model parameters (i.e., no gradients are computed). This reduces memory usage and speeds up the computation during evaluation.
4. outputs = model(**inputs).logits
Explanation:
Here, we pass the inputs to the model, which consists of input_ids and attention_mask. The model(**inputs) calls the model's forward pass, and the output is the logits, which are the raw, unnormalized predictions for each token in the sequence.
The logits are of shape [batch_size, sequence_length, num_labels], where num_labels is the number of possible labels (POS tags).
5. logits = torch.argmax(outputs, dim=-1).squeeze(0).tolist()
Explanation:
torch.argmax(outputs, dim=-1): This selects the index of the highest value (the predicted label) across the last dimension (num_labels). This means we're picking the predicted label for each token in the sequence.
squeeze(0): Removes the batch dimension (since we only passed one batch at a time, with batch size = 1).
tolist(): Converts the tensor of predicted labels to a Python list for easier manipulation.
6. predictions.extend([pred for pred, label in zip(logits, batch['labels']) if label != -100])
Explanation:
zip(logits, batch['labels']): This pairs each predicted token label (pred) with its corresponding true label from the dataset (batch['labels']).
if label != -100: This condition ensures we skip the tokens that were assigned the label -100. This happens for subword tokens or padding tokens, which are ignored for evaluation.
predictions.extend(...): The predicted labels (for valid tokens) are added to the predictions list.
7. true_labels.extend([label for label in batch['labels'] if label != -100])
Explanation:
Similar to the predictions line, this extracts the true labels (ignoring -100) and extends the true_labels list with them. This allows us to compare the predicted labels with the true labels for accuracy evaluation.
Summary:
The code performs the following operations:

Iterates over each batch in the dataset.
Prepares the inputs by converting input_ids and attention_mask into tensors.
Performs inference without computing gradients.
Extracts the predicted labels by taking the highest scoring label (argmax) for each token.
Filters out ignored tokens (like subword tokens and padding) and stores valid predictions and true labels in separate lists (predictions and true_labels).
These lists are later used to compute the accuracy or other evaluation metrics.


'''

"\n\nfor batch in dataset:\nExplanation: This line iterates over each batch in the dataset (which in this case is the tokenized test set). The dataset here is expected to be tokenized and prepared with features like input_ids, attention_mask, and labels.\n2. inputs = {key: torch.tensor(val).unsqueeze(0) for key, val in batch.items() if key in ['input_ids', 'attention_mask']}\nExplanation:\nbatch.items(): This extracts the features from each batch, where each batch contains input_ids, attention_mask, and labels.\ntorch.tensor(val).unsqueeze(0): This converts the value (val) into a PyTorch tensor and adds a new dimension using .unsqueeze(0) to create a batch of size 1. This is because the model expects inputs in batch format (even if we have only one sequence at a time).\nThe keys we are interested in are 'input_ids' and 'attention_mask', which represent the tokenized input and the attention mask (to ignore padding).\nThis line constructs a dictionary called inputs, containing tensors fo

#### Evaluating model using accruacy metric on the test set

In [21]:
from sklearn.metrics import accuracy_score
import torch

# Evaluation function
def evaluate_model(model, dataset):
    model.eval()
    predictions, true_labels = [], []
    
    for batch in dataset:
        inputs = {key: torch.tensor(val).unsqueeze(0) for key, val in batch.items() if key in ['input_ids', 'attention_mask']}
        with torch.no_grad():
            outputs = model(**inputs).logits
        logits = torch.argmax(outputs, dim=-1).squeeze(0).tolist()

        # Remove ignored index (-100) and compare predictions with true labels
        predictions.extend([pred for pred, label in zip(logits, batch['labels']) if label != -100])
        true_labels.extend([label for label in batch['labels'] if label != -100])

    # Calculate accuracy
    accuracy = accuracy_score(true_labels, predictions)
    print(f"Test set accuracy: {accuracy:.4f}")

#### Full finetuned model for 10 epochs

In [23]:
# Evaluate the model on the tokenized test set
evaluate_model(model, tokenized_datasets['test'])

Test set accuracy: 0.6477


#### Freezed Embedding Layer finetuned model for 20 epochs

In [24]:
# Evaluate the model on the tokenized test set
evaluate_model(model2_10e, tokenized_datasets['test'])

Test set accuracy: 0.6913


#### Freezed Embedding Layer & First 3 Transformer layers finetuned model for 20 epochs

In [25]:
# Evaluate the model on the tokenized test set
evaluate_model(model3_10e, tokenized_datasets['test'])

Test set accuracy: 0.6434


# Resources used:

- ChatGPT
- https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb
- https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification-tf.ipynb
- https://huggingface.co/docs/transformers/en/model_doc/bert