## Install dependencies

In [1]:
!pip install openpyxl
!pip install datasets
!pip install "transformers[torch]"



## Load Excel file containing labels

In [29]:
# Location of the metadata workbook that holds the labelled documents (relative to this notebook).
excel_path = "../data/2025-05-01 Capstone Metadata.xlsx"

In [3]:
# pandas is needed from this cell onwards; importing it here (instead of only in the
# later split cell) keeps the notebook runnable with Restart Kernel -> Run All.
import pandas as pd

# header=3: the column names are taken from the fourth row of the sheet (0-indexed row 3).
df = pd.read_excel(excel_path, header=3)

# Keep only the target label and the text used to predict it. The explicit .copy()
# makes this an independent frame, so assigning to its columns in later cells does
# not raise SettingWithCopyWarning.
df = df[['Document Type', 'Title/Subject']].copy()
df.head()

Unnamed: 0,Document Type,Title/Subject
0,CORR,"Standards, Analytical Results"
1,COV,Covenant
2,CORR,"Certificate of Compliance, Pursuant to Section..."
3,CoC,Conditional Certificate of Compliance
4,CoC,Conditional Certificate of Compliance


In [5]:
# Class distribution of the raw labels; some types appear under two spellings (e.g. 'CoC' and 'COC').
df['Document Type'].value_counts()

Document Type
CORR             103
RPT               19
TMEMO             17
PSI               12
TITLE             10
MAP                7
COC                6
Site Registry      6
FDET               5
DSI                5
RA                 5
CSSA               5
NOTE               4
CoC                4
PDET               3
REF                2
NIRI               2
SSI                2
AIP                2
COV                1
IMG                1
SP                 1
COA                1
AiP                1
SPC                1
Name: count, dtype: int64

## Normalize inconsistent document-type labels (e.g. `CoC` → `COC`) and merge `NOTE`, `REF` and `SPC` into `OTHERS`

In [6]:
# One lookup table covers both kinds of clean-up:
#   * spelling variants of the same type ('CoC', 'AiP') are mapped to the upper-case form;
#   * NOTE, REF and SPC are merged into a single 'OTHERS' class.
rename_map = dict([
    ('CoC', 'COC'),
    ('AiP', 'AIP'),
    ('NOTE', 'OTHERS'),
    ('REF', 'OTHERS'),
    ('SPC', 'OTHERS'),
])

# Labels that are not keys of rename_map are left unchanged by replace().
normalized_types = df['Document Type'].replace(rename_map)
df['Document Type'] = normalized_types

In [7]:
# Display the frame after label normalization (the notebook truncates the middle rows).
df

Unnamed: 0,Document Type,Title/Subject
0,CORR,"Standards, Analytical Results"
1,COV,Covenant
2,CORR,"Certificate of Compliance, Pursuant to Section..."
3,COC,Conditional Certificate of Compliance
4,COC,Conditional Certificate of Compliance
...,...,...
220,RPT,Site Remediation Report\nNorth Kamloops Fire H...
221,RPT,"Underground Storage Tank Decommissioning,\nSup..."
222,RPT,Geotechnical Investigation\nProposed Senior Ci...
223,PSI,STAGE 2 PSI\nCITY OF KAMLOOPS FIREHALL NO. 2


In [8]:
# Class distribution after merging spelling variants and the 'OTHERS' group.
df['Document Type'].value_counts()

Document Type
CORR             103
RPT               19
TMEMO             17
PSI               12
COC               10
TITLE             10
OTHERS             7
MAP                7
Site Registry      6
CSSA               5
FDET               5
RA                 5
DSI                5
PDET               3
AIP                3
NIRI               2
SSI                2
SP                 1
COV                1
IMG                1
COA                1
Name: count, dtype: int64

## Data augmentation: add hand-written examples so that every minority class has at least five rows

In [9]:
# Hand-written (label, title) examples for the document types that have only a few
# rows in the spreadsheet. Row order is kept as-is because the seeded shuffle in the
# split cell depends on it.
# Several titles contain stray spaces, line breaks or tabs; presumably this mimics
# text-extraction noise -- TODO confirm with the author before "cleaning" them.
# NOTE: re-running this cell appends the same rows to `df` again; restart from the
# Excel load cell instead of re-running it in isolation.
augmented_examples = [
    ('COV', 'Covenant'),
    ('COA', 'Certifi cate of Analysis for document of Site 3452'),
    ('COA', 'Certificate of Analysis Document no 3425'),
    ('COA', 'Certificate of\n\n Analysis done completely'),
    ('COA', 'Certificate of completed Analysis'),
    ('IMG', 'Pictures'),
    ('IMG', 'Images'),
    ('IMG', 'JPEGS'),
    ('IMG', 'PNGS'),
    # The original literal contained a bare backslash followed by "e", which is not a
    # valid escape sequence and triggers an invalid-escape warning. The backslash is
    # written explicitly here, so the runtime value is unchanged.
    ('COV', 'Cov\\enant'),
    ('COV', 'Covenant Report'),
    ('COV', 'Covenant Based on '),
    ('SP', 'Schedule 2 Site Profile'),
    ('SP', 'Schedule 3 Site Profile'),
    ('SP', 'Schedule 4 Site Profile'),
    ('SP', 'Schedule 5 Site Profile'),
    ('PDET', 'preliminary\ndetermination for Site 2345'),
    ('SSI', 'STAGE 1 AND 2 PSI, DSI AND CLOSURE REP\nORTING\n'),
    ('SSI', 'STAGE 1 AND 2 PSI, DSI AND CLOSURE REPORTING ADDITIONAL SUPPLEMENTAL INFORMATION (given on page 32)'),
    ('SSI', 'STAGE 1 AND 2 PSI, DSI AND CLOSURE REPORTING SUPPLEMENTAL INFORMATION OR ADDITIONAL\n'),
    ('AIP', 'APPROVAL In Principle, technical report for PA 16456'),
    ('AIP', 'Approval in Principle, AIP 23663'),
    ('PDET', 'Preliminary Determination for Section 26.4  for the property\nlocated at 736 Main Street'),
    # Originally a two-line triple-quoted string; the trailing spaces, the line break
    # and the indentation of the second line are spelled out to keep the same value.
    ('NIRI', 'Notice of Independent Remediation  \n    Vancouver, British Columbia'),
    ('NIRI', 'Notice of Independent Remediation'),
    ('NIRI', 'RE: NOTICE OF INDEP ENDENT REMEDIA\nTION\tMOBILE PHONE NO. 72 STREET, SHAMBALA, BC\n'),
]

new_row = pd.DataFrame(augmented_examples, columns=['Document Type', 'Title/Subject'])
df = pd.concat([df, new_row], ignore_index=True)

In [10]:
# Class distribution after augmentation: every class should now have at least five rows.
df['Document Type'].value_counts()

Document Type
CORR             103
RPT               19
TMEMO             17
PSI               12
COC               10
TITLE             10
OTHERS             7
MAP                7
Site Registry      6
SP                 5
CSSA               5
COV                5
PDET               5
IMG                5
FDET               5
RA                 5
DSI                5
COA                5
NIRI               5
AIP                5
SSI                5
Name: count, dtype: int64

## Train / validation / test splits (60 / 20 / 20 within each class)

In [11]:
import pandas as pd
from math import ceil

# Split configuration (previously magic numbers inside the loop).
SPLIT_SEED = 42
TRAIN_FRAC = 0.6
VAL_FRAC = 0.2


def split_per_class(frame, label_col='Document Type', train_frac=TRAIN_FRAC, val_frac=VAL_FRAC):
    """Split `frame` into train/val/test separately within each class.

    For a class with n rows, the first int(n * train_frac) rows go to train, the
    next int(n * val_frac) rows go to validation and all remaining rows go to test,
    so every row of a class is used exactly once. Rows are taken in the order they
    appear in `frame`, so shuffle beforehand.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data to split; must contain `label_col`.
    label_col : str
        Column holding the class label.
    train_frac, val_frac : float
        Fractions of each class assigned to train and validation.

    Returns
    -------
    tuple of pandas.DataFrame
        (train, val, test), each with a fresh 0..n-1 index.
    """
    train_parts, val_parts, test_parts = [], [], []
    for _, group in frame.groupby(label_col):
        n_train = int(len(group) * train_frac)
        n_val = int(len(group) * val_frac)
        train_parts.append(group.iloc[:n_train])
        val_parts.append(group.iloc[n_train:n_train + n_val])
        test_parts.append(group.iloc[n_train + n_val:])  # remainder, so nothing is dropped
    return tuple(
        pd.concat(parts).reset_index(drop=True)
        for parts in (train_parts, val_parts, test_parts)
    )


# Shuffle once with a fixed seed so the per-class slices are random but reproducible.
df = df.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)

train_df, val_df, test_df = split_per_class(df)

# groupby() skips rows whose label is missing, so the three splits must add up to
# the number of labelled rows.
n_labelled = int(df['Document Type'].notna().sum())
assert len(train_df) + len(val_df) + len(test_df) == n_labelled, \
    "train/val/test splits do not partition the labelled rows"

## Prepare data for training 

In [13]:
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder


def _prepare_split(split_df):
    """Return a copy of `split_df` with the title column renamed to 'text' and cleaned.

    Missing titles become empty strings and every value is cast to str.
    fillna() has to run before astype(str): casting first turns NaN into the
    literal string 'nan', after which fillna('') has nothing left to fill.
    """
    prepared = split_df.rename(columns={'Title/Subject': 'text'})
    prepared['text'] = prepared['text'].fillna('').astype(str)
    return prepared


# Apply the same cleaning to all three splits (previously only train was cleaned,
# so a missing or non-string title in val/test reached the tokenizer unchanged).
train_df = _prepare_split(train_df)
val_df = _prepare_split(val_df)
test_df = _prepare_split(test_df)

# Label encode target: the encoder is fitted on the training labels only and then
# reused for val/test, so the integer ids are consistent across splits.
le = LabelEncoder()
train_df['label'] = le.fit_transform(train_df['Document Type'])
val_df['label'] = le.transform(val_df['Document Type'])
test_df['label'] = le.transform(test_df['Document Type'])

# Create Hugging Face Datasets holding just the model input and the target.
train_ds = Dataset.from_pandas(train_df[['text', 'label']])
val_ds = Dataset.from_pandas(val_df[['text', 'label']])
test_ds = Dataset.from_pandas(test_df[['text', 'label']])

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')


def tokenize(batch):
    """Tokenize the 'text' entries of one batch.

    Every title is truncated or padded to exactly 64 tokens.
    """
    titles = [*batch['text']]  # materialize as a plain list, whatever container was passed in
    encoded = tokenizer(titles, truncation=True, padding='max_length', max_length=64)
    return encoded


# Tokenize each split batch-wise; the right-hand side is fully evaluated before
# the three names are rebound.
train_ds, val_ds, test_ds = [
    split.map(tokenize, batched=True) for split in (train_ds, val_ds, test_ds)
]

Map: 100%|█████████████████████████████| 47/47 [00:00<00:00, 2979.94 examples/s]
Map: 100%|████████████████████████████| 56/56 [00:00<00:00, 11336.50 examples/s]


## Load Model 

In [17]:
from transformers import AutoModelForSequenceClassification

from sklearn.utils.class_weight import compute_class_weight

import os

# Environment switches are set ahead of the torch import below, as in the original
# ordering: an empty CUDA_VISIBLE_DEVICES exposes no CUDA devices, and the MPS
# fallback flag is enabled.
os.environ.update({
    "CUDA_VISIBLE_DEVICES": "",
    "PYTORCH_ENABLE_MPS_FALLBACK": "1",
})

import torch
import numpy as np

device = torch.device("cpu")

# 'balanced' weights: one weight per encoded label, computed from the training labels.
train_labels = train_df['label']
balanced_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_labels),
    y=train_labels,
)
class_weights = torch.tensor(balanced_weights, dtype=torch.float).to(device)

# One output unit per class known to the label encoder.
num_labels = len(le.classes_)
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', num_labels=num_labels
)
# The class weights are applied through the loss function defined further down,
# not by rescaling the classifier weights.
model.to(device)

  Referenced from: <0B7EB158-53DC-3403-8A49-22178CAB4612> /Users/deepaksirwani/miniforge3/envs/colx563/lib/python3.10/site-packages/torchvision/image.so
  warn(
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [18]:
# Return only the columns the model consumes, as torch tensors, on every split.
model_columns = ['input_ids', 'attention_mask', 'label']
for split in (train_ds, val_ds, test_ds):
    split.set_format('torch', columns=list(model_columns))

## Loss function

In [19]:
from torch import nn

# Cross-entropy loss that scales each class's contribution by its class weight.
loss_fn = nn.CrossEntropyLoss(weight=class_weights)

## Train the model 

In [20]:
import inspect

from transformers import TrainingArguments

# The evaluation-schedule argument is named `eval_strategy` in newer transformers
# releases and `evaluation_strategy` in older ones. The install cell does not pin a
# version, so use whichever name this installation accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

# An earlier variant of this configuration additionally used weight_decay=0.01 and
# no_cuda=True; everything else was the same apart from save_total_limit/save_steps.
training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",           # checkpoint once per epoch ...
    load_best_model_at_end=True,     # ... so the best checkpoint can be restored after training
    save_total_limit=1,
    save_steps=0,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=50,
    logging_dir="./logs",
    logging_steps=10,
    learning_rate=2e-5,
    **{_eval_strategy_key: "epoch"},  # evaluate on the validation set once per epoch
)

# from transformers import Trainer

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_ds,
#     eval_dataset=val_ds,
#     tokenizer=tokenizer,
# )
# trainer.train()


from transformers import Trainer

# class FocalLossTrainer(Trainer):
#     def __init__(self, focal_loss_fn, *args, **kwargs):
#         super().__init__(*args, **kwargs)
#         self.focal_loss_fn = focal_loss_fn

#     def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
#         labels = inputs.pop("labels")
    
#         # Force CPU
#         device = torch.device("cpu")
#         labels = labels.to(device)
#         model.to(device)
#         for k in inputs:
#             if isinstance(inputs[k], torch.Tensor):
#                 inputs[k] = inputs[k].to(device)

#         outputs = model(**inputs)
#         logits = outputs.logits
#         loss = self.focal_loss_fn(logits, labels)
#         return (loss, outputs) if return_outputs else loss

class WeightedCETrainer(Trainer):
    """Trainer that computes the loss with a caller-supplied loss function.

    `loss_fn` is called as loss_fn(logits, labels); it must expose a `weight`
    tensor attribute, which is moved to the device of the logits on every call.
    All other positional and keyword arguments are forwarded to `Trainer`.
    """

    def __init__(self, loss_fn, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_fn = loss_fn

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model without labels and score its logits with `self.loss_fn`.

        The "labels" entry is removed from `inputs` before the forward pass.
        Returns the loss, or (loss, outputs) when `return_outputs` is true.
        Extra keyword arguments are accepted and ignored.
        """
        targets = inputs.pop("labels")
        outputs = model(**inputs)
        scores = outputs.logits

        # Keep labels and class weights on the same device as the logits.
        target_device = scores.device
        targets = targets.to(target_device)
        self.loss_fn.weight = self.loss_fn.weight.to(target_device)

        loss = self.loss_fn(scores, targets)
        if return_outputs:
            return loss, outputs
        return loss



# focal_loss = WeightedFocalLoss(alpha=class_weights, gamma=1.0)

# trainer = FocalLossTrainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_ds,
#     eval_dataset=val_ds,
#     tokenizer=tokenizer,
#     focal_loss_fn=focal_loss
# )

import inspect

# Newer transformers releases take the tokenizer through `processing_class` and warn
# about (or no longer accept) `tokenizer=`; older releases only know `tokenizer`.
# Pick the keyword this installation supports.
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"

trainer = WeightedCETrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    loss_fn=loss_fn,
    **{_tokenizer_key: tokenizer},
)


trainer.train()

  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss
1,3.041,3.01798
2,2.9769,2.9557
3,2.8913,2.876013
4,2.7496,2.789028
5,2.6686,2.688093
6,2.4806,2.58209
7,2.3484,2.468308
8,2.1646,2.366463
9,2.0578,2.268936
10,1.9449,2.165347


TrainOutput(global_step=500, training_loss=1.1081169352531433, metrics={'train_runtime': 184.1717, 'train_samples_per_second': 40.18, 'train_steps_per_second': 2.715, 'total_flos': 122573862374400.0, 'train_loss': 1.1081169352531433, 'epoch': 50.0})

## Get performance on test data

In [21]:
from sklearn.metrics import classification_report

# Predict on the held-out test split and take the highest-scoring class per row.
preds = trainer.predict(test_ds)
pred_labels = preds.predictions.argmax(axis=-1)

# `labels` pins the report to every encoded class, so `target_names` always lines up
# even when a class is absent from the test split or never predicted.
# zero_division=0 reports 0.0 for such classes instead of emitting a warning per metric.
all_label_ids = list(range(len(le.classes_)))
print(classification_report(
    test_ds['label'],
    pred_labels,
    labels=all_label_ids,
    target_names=le.classes_,
    zero_division=0,
))

               precision    recall  f1-score   support

          AIP       1.00      1.00      1.00         1
          COA       1.00      1.00      1.00         1
          COC       1.00      1.00      1.00         2
         CORR       0.74      0.91      0.82        22
          COV       1.00      1.00      1.00         1
         CSSA       0.00      0.00      0.00         1
          DSI       1.00      1.00      1.00         1
         FDET       1.00      1.00      1.00         1
          IMG       1.00      1.00      1.00         1
          MAP       0.50      0.50      0.50         2
         NIRI       1.00      1.00      1.00         1
       OTHERS       0.00      0.00      0.00         2
         PDET       1.00      1.00      1.00         1
          PSI       1.00      1.00      1.00         3
           RA       0.50      1.00      0.67         1
          RPT       1.00      0.80      0.89         5
           SP       1.00      1.00      1.00         1
         

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Save the best model 

In [23]:
# Store the class names in the model config before saving, so the saved model
# carries its own id <-> label mapping instead of relying on a hand-copied list.
label_names = [str(name) for name in le.classes_]
trainer.model.config.id2label = dict(enumerate(label_names))
trainer.model.config.label2id = {name: idx for idx, name in enumerate(label_names)}

trainer.save_model("./best_model")               # saves model weights + config
tokenizer.save_pretrained("./best_model")        # saves tokenizer (needed for inference)

('./best_model/tokenizer_config.json',
 './best_model/special_tokens_map.json',
 './best_model/vocab.txt',
 './best_model/added_tokens.json',
 './best_model/tokenizer.json')

## Inference

In [28]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('./best_model')
model = AutoModelForSequenceClassification.from_pretrained('./best_model')
model.eval()

# Fallback class names, in label-id order. Used only when the saved config holds no
# real names (i.e. just the auto-generated 'LABEL_<n>' placeholders).
class_names = [
    'AIP', 'COA', 'COC', 'CORR', 'COV', 'CSSA', 'DSI', 'FDET', 'IMG', 'MAP',
    'NIRI', 'OTHERS', 'PDET', 'PSI', 'RA', 'RPT', 'SP', 'SSI', 'Site Registry',
    'TITLE', 'TMEMO'
]

# Prefer the mapping stored with the model when it carries real names.
_saved_names = [
    str(model.config.id2label.get(idx, f'LABEL_{idx}'))
    for idx in range(model.config.num_labels)
]
if not all(name.startswith('LABEL_') for name in _saved_names):
    class_names = _saved_names

# A list shorter or longer than the classification head would mislabel predictions.
assert len(class_names) == model.config.num_labels, \
    "class_names does not match the number of labels of the loaded model"

# Prediction function
def predict_document_type(text):
    """Predict the document type for one title or for a list of titles.

    Parameters
    ----------
    text : str or iterable of str
        A single title, or several titles to classify in one forward pass.

    Returns
    -------
    str or list of str
        The class name for a single title; a list of class names (same order as
        the input) when an iterable was passed.
    """
    is_single = isinstance(text, str)
    titles = [text] if is_single else list(text)
    if not titles:
        return []

    # Same truncation / padding settings as used for training (64 tokens).
    inputs = tokenizer(titles, return_tensors="pt", truncation=True, padding="max_length", max_length=64)
    with torch.no_grad():
        outputs = model(**inputs)

    # Arg-max per row: a bare argmax() would flatten the whole batch and is only
    # correct when exactly one title is passed.
    predicted_ids = outputs.logits.argmax(dim=-1).tolist()
    predicted_names = [class_names[class_id] for class_id in predicted_ids]
    return predicted_names[0] if is_single else predicted_names

# Example usage
# A few made-up titles to sanity-check the saved model.
example_titles = [
    "Certificate of Compliance for Section 28",
    "Preliminary Site Investigation for North Shore",
    "Images of Soil Contamination",
    "Analytical Lab Results",
]

# Classify each title on its own, then print "title → predicted type".
predictions = list(map(predict_document_type, example_titles))
for title, label in zip(example_titles, predictions):
    print(f"{title} → {label}")

Certificate of Compliance for Section 28 → COC
Preliminary Site Investigation for North Shore → PSI
Images of Soil Contamination → IMG
Analytical Lab Results → CORR
