# Modular Training Notebook

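This notebook fine-tunes a transformer model for token classification (NER). It mounts the shared Google Drive, sets the data, label-mapping and model paths as environment variables, and then runs `./Scripts/NER_training_script.py`.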

## 1. Imports and Setup

In [1]:
!pip install datasets transformers torch seqeval &> /dev/null

In [2]:
import os
import datasets
import pandas as pd
from ast import literal_eval
from google.colab import drive

In [3]:
# Runtime selector. Only "COLAB" is configured in this notebook.
system = "COLAB"

if system == "COLAB":
  # Mount Google Drive so the shared-drive project folder is reachable from the runtime.
  drive.mount("/content/gdrive")

  # Single source of truth for the project root; every other path is derived from it.
  PROJECT_DIR = os.path.join("/content/gdrive/Shared drives/", "GOV.UK teams/2020-2021/Data labs/content-metadata-2021")
  DATA_DIR = os.path.join(PROJECT_DIR, "Data")
  MODEL_DIR = os.path.join(PROJECT_DIR, "Models")
  RESULTS_DIR = os.path.join(MODEL_DIR, "Metrics")

  # Work from the project root so relative paths such as ./Scripts/... resolve.
  os.chdir(PROJECT_DIR)
else:
  # Fail early with a clear message instead of attempting to chdir into a Colab-only path.
  raise ValueError(f"Unsupported system {system!r}: only 'COLAB' is configured in this notebook.")

Mounted at /content/gdrive


## 2. Variables

In [4]:
#@title BERT Fine-Tuning Config
#@markdown Fill in the below fields to define the environment variables used when training the model.

#@markdown Where your model will be saved:
ModelFolder = "/content/gdrive/Shared drives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Models" #@param {type:"string"}
#@markdown Path to your training pytorch dataset:
TrainDataFolder="/content/gdrive/Shareddrives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data/Feb2022-Data/Feb22-HF/unvalidated_train" #@param {type:"string"}
#@markdown Path to your test pytorch dataset:
TestDataFolder="/content/gdrive/Shareddrives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data/Feb2022-Data/Feb22-HF/validated_test" #@param {type:"string"}
#@markdown Path to your label mapping file (json):
MappingFile="/content/gdrive/Shareddrives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data/Feb2022-Data/Feb22-HF/full_labelmap.json" #@param {type:"string"}
#@markdown Number of GPUs to use:
GPUS=1 #@param ["1", "2", "3", "4"] {type:"raw"}

%env SM_MODEL_DIR=$ModelFolder
%env SM_TRAIN_DATA=$TrainDataFolder
%env SM_TEST_DATA=$TestDataFolder
%env SM_MAPPING_FILE=$MappingFile
%env SM_NUM_GPUS =$GPUS

env: SM_MODEL_DIR=/content/gdrive/Shared drives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Models
env: SM_TRAIN_DATA=/content/gdrive/Shareddrives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data/Feb2022-Data/Feb22-HF/unvalidated_train
env: SM_TEST_DATA=/content/gdrive/Shareddrives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data/Feb2022-Data/Feb22-HF/validated_test
env: SM_MAPPING_FILE=/content/gdrive/Shareddrives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data/Feb2022-Data/Feb22-HF/full_labelmap.json
env: SM_NUM_GPUS=1


## 3. Training

In [5]:
# Print the training script with syntax highlighting so it can be reviewed before running.
# The path is relative to the current working directory.
!pygmentize ./Scripts/NER_training_script.py

[34mimport[39;49;00m [04m[36margparse[39;49;00m
[34mimport[39;49;00m [04m[36mlogging[39;49;00m
[34mimport[39;49;00m [04m[36mos[39;49;00m
[34mimport[39;49;00m [04m[36msys[39;49;00m
[34mimport[39;49;00m [04m[36mjson[39;49;00m

[34mfrom[39;49;00m [04m[36msklearn.metrics[39;49;00m [34mimport[39;49;00m confusion_matrix
[34mfrom[39;49;00m [04m[36msklearn.metrics[39;49;00m [34mimport[39;49;00m ConfusionMatrixDisplay
[34mimport[39;49;00m [04m[36mmatplotlib.pyplot[39;49;00m [34mas[39;49;00m [04m[36mplt[39;49;00m

[34mimport[39;49;00m [04m[36mnumpy[39;49;00m [34mas[39;49;00m [04m[36mnp[39;49;00m
[34mimport[39;49;00m [04m[36mpandas[39;49;00m [34mas[39;49;00m [04m[36mpd[39;49;00m
[34mfrom[39;49;00m [04m[36mdatetime[39;49;00m [34mimport[39;49;00m date
[34mfrom[39;49;00m [04m[36mdatasets[39;49;00m [34mimport[39;49;00m load_from_disk, load_metric, ClassLabel, Sequence
[34mfrom[39;49;00m [04m[36mtransformers[39;49;

In [6]:
# Launch the fine-tuning script as a subprocess: 4 epochs of distilbert-base-uncased
# with a batch size of 16 for both training and evaluation.
# The path is relative to the current working directory.
!python ./Scripts/NER_training_script.py --epochs 4 --model_id distilbert-base-uncased --train_batch_size 16 --eval_batch_size 16

Dataset_name:  unvalidated_train
2022-01-16 20:05:14,711 - __main__ - INFO -  loaded train_dataset length is: 277345
2022-01-16 20:05:14,711 - __main__ - INFO -  loaded test_dataset length is: 1181
Downloading: 100% 28.0/28.0 [00:00<00:00, 27.5kB/s]
Downloading: 100% 483/483 [00:00<00:00, 427kB/s]
Downloading: 100% 226k/226k [00:00<00:00, 907kB/s]
Downloading: 100% 455k/455k [00:00<00:00, 1.49MB/s]
100% 2/2 [00:00<00:00, 10.62ba/s]
Downloading: 6.34kB [00:00, 4.50MB/s]       
Downloading: 100% 256M/256M [00:04<00:00, 58.7MB/s]
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a Ber