### Loading data sets and checking them

In [None]:
import json
def _read_json(path):
    """Open ``path`` for reading and return the parsed JSON content."""
    with open(path, 'r') as handle:
        return json.load(handle)


# Raw splits: training, test and development (validation).
data = _read_json('/Users/badershalata/Documents/TextMining/CW/tacred/data/json/train.json')
testdata = _read_json('/Users/badershalata/Documents/TextMining/CW/tacred/data/json/test.json')
valdata = _read_json('/Users/badershalata/Documents/TextMining/CW/tacred/data/json/dev.json')

##### Checking that the relations in the dataset match the ones listed in the original paper

In [None]:
# Distinct relation labels that occur in the test split.
unique_relation = {sample['relation'] for sample in testdata}
unique_relation

{'no_relation',
 'org:alternate_names',
 'org:city_of_headquarters',
 'org:country_of_headquarters',
 'org:dissolved',
 'org:founded',
 'org:founded_by',
 'org:member_of',
 'org:members',
 'org:number_of_employees/members',
 'org:parents',
 'org:political/religious_affiliation',
 'org:shareholders',
 'org:stateorprovince_of_headquarters',
 'org:subsidiaries',
 'org:top_members/employees',
 'org:website',
 'per:age',
 'per:alternate_names',
 'per:cause_of_death',
 'per:charges',
 'per:children',
 'per:cities_of_residence',
 'per:city_of_birth',
 'per:city_of_death',
 'per:countries_of_residence',
 'per:country_of_birth',
 'per:country_of_death',
 'per:date_of_birth',
 'per:date_of_death',
 'per:employee_of',
 'per:origin',
 'per:other_family',
 'per:parents',
 'per:religion',
 'per:schools_attended',
 'per:siblings',
 'per:spouse',
 'per:stateorprovince_of_birth',
 'per:stateorprovince_of_death',
 'per:stateorprovinces_of_residence',
 'per:title'}

### Preprocessing:
(1) We removed the POS tags, since BERT already learns syntax and grammar during pre-training.

(2) We removed the NER tags: they are not needed when fine-tuning BERT, since it inherently learns entity types.

(3) We removed the dependency-parse annotations, since BERT relies on contextual processing rather than on dependency trees.

In [None]:
# Keys copied from every raw entry; every other key of the entry is dropped.
KEPT_FIELDS = (
    'id', 'docid', 'relation', 'token',
    'subj_start', 'subj_end', 'obj_start', 'obj_end',
)


def keep_fields(entries, fields=KEPT_FIELDS):
    """Return a new list of dicts that contain only ``fields`` of each entry.

    Args:
        entries: iterable of dicts (one per example).
        fields: keys to copy, in the order they should appear in the result.

    Returns:
        list of dicts; the input entries are not modified.

    Raises:
        KeyError: if an entry lacks one of ``fields``.
    """
    return [{field: entry[field] for field in fields} for entry in entries]


# One call per split instead of three copies of the same comprehension.
filtered_data = keep_fields(data)
filtered_testdata = keep_fields(testdata)
filtered_devdata = keep_fields(valdata)



#### Moving the data to new filtered json files

In [None]:
# Write each filtered split to its own pretty-printed (indent=4) JSON file.
for _records, _target_path in (
    (filtered_data, '/Users/badershalata/Documents/TextMining/CW/tacred/data/json/filteredtrain.json'),
    (filtered_testdata, '/Users/badershalata/Documents/TextMining/CW/tacred/data/json/filteredtest.json'),
    (filtered_devdata, '/Users/badershalata/Documents/TextMining/CW/tacred/data/json/filteredvaldata.json'),
):
    with open(_target_path, 'w') as f:
        json.dump(_records, f, indent=4)

### BERT

### Train Data

#### Preprocessing so that BERT can understand the data

In [None]:
import json
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import AdamW
from torch.optim.lr_scheduler import StepLR
from transformers import BertModel, BertTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from collections import Counter

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# ================== Data Processing ==================
    # we are extracting the features that are only relevant to BERT
def process_data(data):
    """Reduce raw examples to the fields used by the BERT pipeline.

    Args:
        data: iterable of dicts with the keys 'token' (list of str),
            'relation', 'subj_start', 'subj_end', 'obj_start' and 'obj_end'.
            The ``*_end`` indices are treated as inclusive.

    Returns:
        list of dicts with the keys 'text' (tokens joined by single spaces),
        'relation', 'subject', 'object', 'subj_start' and 'obj_start'.
    """
    def span_text(tokens, start, end):
        # ``end`` is inclusive, hence the +1 on the slice bound.
        return " ".join(tokens[start:end + 1])

    return [
        {
            'text': " ".join(example['token']),
            'relation': example['relation'],
            'subject': span_text(example['token'], example['subj_start'], example['subj_end']),
            'object': span_text(example['token'], example['obj_start'], example['obj_end']),
            'subj_start': example['subj_start'],
            'obj_start': example['obj_start'],
        }
        for example in data
    ]

# Load train, test, and validation data
# Read the raw splits in the order train, test, dev.
_raw_splits = []
for _split_path in (
    '/Users/badershalata/Documents/TextMining/CW/tacred/data/json/train.json',
    '/Users/badershalata/Documents/TextMining/CW/tacred/data/json/test.json',
    '/Users/badershalata/Documents/TextMining/CW/tacred/data/json/dev.json',
):
    with open(_split_path, 'r') as file:
        _raw_splits.append(json.load(file))
data, testdata, valdata = _raw_splits

# Reduce every split to the fields the BERT pipeline needs.
processed_data, processed_testdata, processed_valdata = [
    process_data(split) for split in _raw_splits
]

# Only the processed validation split is written to disk here.
with open('/Users/badershalata/Documents/TextMining/CW/tacred/data/json/processedval.json', 'w') as file:
    json.dump(processed_valdata, file, indent=4)

# Create relation label mappings, since the classifier expects integer class ids.
# The labels are sorted so that the label -> id assignment is identical on every run:
# iterating a set of strings has no stable order across interpreter sessions, which
# would make a saved checkpoint's class ids disagree with a freshly built id_to_label.
relation_labels = sorted({item["relation"] for item in processed_data})
label_to_id = {label: index for index, label in enumerate(relation_labels)}
id_to_label = dict(enumerate(relation_labels))

In [None]:
# ================== Tokenization & Tensor Conversion ==================
    # Return the encoded data as tensors so that BERT can consume it.
    # Every sentence is padded or truncated to the same fixed length; the attention mask
    # tells the model which positions hold real tokens and which are padding.
def encode_data(data, label_map, max_length=64):
    """Tokenize processed examples and pack them into tensors for the model.

    Uses the module-level ``tokenizer``.

    The entity positions stored in ``data`` are word indices of the original
    whitespace-separated tokens. The model gathers hidden states by position in
    the word-piece sequence, so the indices are converted here: one position is
    added for the leading special token and every preceding word contributes as
    many positions as it has word pieces. Previously the raw word index was used,
    which pointed at the wrong token.

    Args:
        data: iterable of dicts with the keys 'text', 'relation', 'subj_start'
            and 'obj_start' (as produced by ``process_data``).
        label_map: dict mapping a relation name to its integer class id.
        max_length: fixed sequence length every example is padded/truncated to.

    Returns:
        Tuple ``(input_ids, attention_masks, labels, entity1_indices,
        entity2_indices)`` of long tensors with shapes ``(N, max_length)``,
        ``(N, max_length)``, ``(N,)``, ``(N,)`` and ``(N,)``.

    Raises:
        KeyError: if an example's relation is missing from ``label_map``.
    """
    def first_piece_position(piece_counts, word_index):
        # +1 skips the special token the tokenizer puts at position 0
        # (assumes a BERT-style [CLS] prefix -- TODO confirm for other tokenizers).
        # Entities that fall beyond the truncation point are clamped to the last slot.
        return min(1 + sum(piece_counts[:word_index]), max_length - 1)

    input_ids, attention_masks = [], []
    labels, entity1_indices, entity2_indices = [], [], []
    for item in data:
        # we use pre-trained bert tokenizer
        encoded = tokenizer(
            item['text'],
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors='pt'
        )
        input_ids.append(encoded['input_ids'].squeeze(0))
        attention_masks.append(encoded['attention_mask'].squeeze(0))
        labels.append(label_map[item['relation']])

        # 'text' is the original tokens joined by single spaces, so splitting on a
        # single space recovers the word list the stored indices refer to.
        words = item['text'].split(" ")
        furthest = max(item['subj_start'], item['obj_start'])
        # Tokenize each preceding word once and reuse the counts for both entities.
        piece_counts = [len(tokenizer.tokenize(word)) for word in words[:furthest]]
        entity1_indices.append(first_piece_position(piece_counts, item['subj_start']))
        entity2_indices.append(first_piece_position(piece_counts, item['obj_start']))

    if not input_ids:
        # torch.stack rejects an empty list; return correctly shaped empty tensors.
        return (
            torch.empty((0, max_length), dtype=torch.long),
            torch.empty((0, max_length), dtype=torch.long),
            torch.empty((0,), dtype=torch.long),
            torch.empty((0,), dtype=torch.long),
            torch.empty((0,), dtype=torch.long),
        )

    return (
        torch.stack(input_ids),
        torch.stack(attention_masks),
        torch.tensor(labels, dtype=torch.long),
        torch.tensor(entity1_indices, dtype=torch.long),
        torch.tensor(entity2_indices, dtype=torch.long)
    )

# Encode train, test, and validation data that is suitable for BERT
# Encode each split once; every result is a 5-tuple of tensors
# (input ids, attention masks, labels, entity-1 indices, entity-2 indices).
train_tensors = encode_data(processed_data, label_to_id)
test_tensors = encode_data(processed_testdata, label_to_id)
val_tensors = encode_data(processed_valdata, label_to_id)

(train_input_ids, train_attention_masks, train_labels,
 train_entity1_indices, train_entity2_indices) = train_tensors
(test_input_ids, test_attention_masks, test_labels,
 test_entity1_indices, test_entity2_indices) = test_tensors
(val_input_ids, val_attention_masks, val_labels,
 val_entity1_indices, val_entity2_indices) = val_tensors

# Wrap the tensors of each split in a TensorDataset
train_dataset = TensorDataset(*train_tensors)
test_dataset = TensorDataset(*test_tensors)
val_dataset = TensorDataset(*val_tensors)

# DataLoaders feed the data to the model in batches.
# A batch size of 50 follows the benchmark; with 42 relations it also raises the
# chance that a batch covers many of the classes.
# Only the training loader shuffles.
batch_size = 50
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader, val_dataloader = (
    DataLoader(split_dataset, batch_size=batch_size, shuffle=False)
    for split_dataset in (test_dataset, val_dataset)
)

print(f"Train size: {len(train_dataset)}, Test size: {len(test_dataset)}, Validation size: {len(val_dataset)}")

Train size: 68124, Test size: 15509, Validation size: 22631


#### Defining the BERT model: pre-trained BERT extended with one extra transformer encoder layer and a classification head (dropout, one hidden layer and one output layer)

In [None]:

# ================== Define BERT + FFNN Model ==================
class BertWithFFNN(nn.Module):
    """Pre-trained BERT, one extra transformer encoder layer and an FFNN head.

    The hidden states at the two entity positions are concatenated and passed
    through dropout -> Linear(1536, 256) -> ReLU -> Linear(256, num_labels).

    Args:
        num_labels: number of relation classes (size of the output layer).
    """

    def __init__(self, num_labels):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        # Adding another transformer encoder layer on top of pre trained bert
        # so now we have 13 layers.
        # batch_first=True is required: BERT returns (batch, seq, hidden), while the
        # layer's default layout is (seq, batch, hidden) -- without the flag the
        # self-attention would mix different examples of the batch.
        # The flag does not change the parameter names, so old checkpoints still load.
        self.extra_layer = nn.TransformerEncoderLayer(
            d_model=768, nhead=12, dim_feedforward=3072, batch_first=True
        )
        # this is for the classification head
        # BERT's output is of 768 dimensions; two entity vectors are concatenated,
        # hence 768 * 2 input features.
        self.dropout = nn.Dropout(0.2)
        self.fc1 = nn.Linear(768 * 2, 256)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(256, num_labels)

    def forward(self, input_ids, attention_mask, entity1_idx, entity2_idx):
        """Return the relation logits.

        Args:
            input_ids: token ids passed to BERT.
            attention_mask: mask passed to BERT; positions equal to 0 are treated
                as padding by the extra layer as well.
            entity1_idx: per-example sequence position of the first entity.
            entity2_idx: per-example sequence position of the second entity.

        Returns:
            Tensor of logits with one row per example and ``num_labels`` columns.
        """
        bert_outputs = self.bert(input_ids, attention_mask=attention_mask)
        hidden_states = bert_outputs.last_hidden_state
        # True marks positions the extra layer must not attend to (padding).
        padding_mask = attention_mask == 0
        extra_layer_output = self.extra_layer(hidden_states, src_key_padding_mask=padding_mask)
        # extracting the entities embedding and concatenating them
        batch_size = extra_layer_output.shape[0]
        # Build the row index on the same device as the activations.
        batch_index = torch.arange(batch_size, device=extra_layer_output.device)
        entity1_output = extra_layer_output[batch_index, entity1_idx]
        entity2_output = extra_layer_output[batch_index, entity2_idx]

        combined_representation = torch.cat([entity1_output, entity2_output], dim=-1)
        x = self.dropout(combined_representation)
        x = self.fc1(x)
        x = self.relu(x)
        # output
        logits = self.fc2(x)

        return logits

#### Here we initialize the model and train it with a class-weighted cross-entropy loss function

In [None]:
# ================== Model Training & Validation ==================
# Model initialization and making it work on mps instead of cpu (M1 Pro chip) so that it performs better
num_labels = len(label_to_id)
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model = BertWithFFNN(num_labels).to(device)
# optimizer & scheduler
# StepLR multiplies the learning rate by gamma=0.9 (a 10% decrease) after every 500 calls
# to scheduler.step(). The training loop calls scheduler.step() once per optimizer update,
# i.e. once every gradient_accumulation_steps batches, so the decay happens every
# 500 optimizer updates rather than every 500 batches.
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)
scheduler = StepLR(optimizer, step_size=500, gamma=0.9)

# Loss function
# Cross entropy loss with per-class weights of 1/sqrt(class frequency in the training set),
# so that rare classes contribute more per sample.
# This is done so that it handles the class imbalance
train_labels_np = train_labels.numpy()
class_counts = Counter(train_labels_np)
class_counts_tensors = torch.tensor([class_counts[i] for i in range(num_labels)], dtype=torch.float)

class_weights = 1/torch.sqrt(class_counts_tensors)
class_weights = class_weights.to(device)
loss_fn = nn.CrossEntropyLoss(weight=class_weights) # weight = 1/sqrt(number of samples in the class)

# NOTE(review): this call configures the precision of float32 matrix multiplications; the
# float16 mixed precision itself is enabled by torch.autocast inside the training loop.
torch.set_float32_matmul_precision("high")
# Run for 8 epochs since we have limited computational resources!
epochs = 8
# Gradients of this many consecutive batches are accumulated before each optimizer update.
gradient_accumulation_steps = 2
# I evaluate the model with this function that outputs the predictions and true labels
def evaluate_model(model, dataloader, device, loss_fn):
    """Run ``model`` over ``dataloader`` without gradients and score it.

    Args:
        model: module called as ``model(input_ids, attention_mask=...,
            entity1_idx=..., entity2_idx=...)``; it is switched to eval mode.
        dataloader: yields 5-tuples (input ids, attention masks, labels,
            entity-1 indices, entity-2 indices).
        device: device every batch tensor is moved to.
        loss_fn: callable ``loss_fn(logits, labels)``.

    Returns:
        ``(avg_loss, precision, recall, f1)`` -- the loss averaged over the
        batches and the weighted-average precision, recall and F1.
    """
    model.eval()
    batch_losses = []
    predicted, expected = [], []
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_masks, labels, entity1_indices, entity2_indices = (
                tensor.to(device) for tensor in batch
            )
            logits = model(
                input_ids,
                attention_mask=attention_masks,
                entity1_idx=entity1_indices,
                entity2_idx=entity2_indices,
            )
            batch_losses.append(loss_fn(logits, labels).item())
            # The predicted class is the index of the largest logit.
            predicted.extend(torch.argmax(logits, dim=1).cpu().numpy())
            expected.extend(labels.cpu().numpy())

    # Shared settings: weight every class by its support, score 0 when undefined.
    metric_options = {"average": "weighted", "zero_division": 0}
    precision = precision_score(expected, predicted, **metric_options)
    recall = recall_score(expected, predicted, **metric_options)
    f1 = f1_score(expected, predicted, **metric_options)

    avg_loss = sum(batch_losses) / len(dataloader)
    return avg_loss, precision, recall, f1
# 8 epochs to train and validate the model on
num_train_batches = len(train_dataloader)
# Autocast has to target the device that was actually selected; float16 autocast is
# skipped on the CPU fallback instead of hard-coding "mps".
use_autocast = device.type != "cpu"

for epoch in range(epochs):
    print(f"🔄 Epoch {epoch + 1}/{epochs}")
    model.train()
    total_loss = 0
    # Start every epoch from clean gradients.
    optimizer.zero_grad()

    for i, batch in enumerate(train_dataloader):
        input_ids, attention_masks, labels, entity1_indices, entity2_indices = [b.to(device) for b in batch]

        with torch.autocast(device_type=device.type, dtype=torch.float16, enabled=use_autocast):  # Mixed precision
            logits = model(input_ids, attention_mask=attention_masks, entity1_idx=entity1_indices, entity2_idx=entity2_indices)
            loss = loss_fn(logits, labels)

        # Track the unscaled batch loss; only the backward pass uses the scaled one.
        total_loss += loss.item()
        (loss / gradient_accumulation_steps).backward()

        # Update after every `gradient_accumulation_steps` batches, and also on the last
        # batch so that a trailing incomplete group is applied instead of leaking its
        # gradients into the next epoch.
        is_last_batch = (i + 1) == num_train_batches
        if (i + 1) % gradient_accumulation_steps == 0 or is_last_batch:
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()

    # Report the mean batch loss so it is comparable with the (mean) validation loss.
    avg_train_loss = total_loss / max(num_train_batches, 1)

    # Evaluate on validation data
    val_loss, val_precision, val_recall, val_f1 = evaluate_model(model, val_dataloader, device,loss_fn)

    print(f" Epoch {epoch + 1} - Train Loss: {avg_train_loss:.4f} | Val loss: {val_loss:.4f} | Val Precision: {val_precision:.4f} | Val Recall: {val_recall:.4f} | Val F1-score: {val_f1:.4f}")

print("Training complete!")


🔄 Epoch 1/8
 Epoch 1 - Train Loss: 2672.7938 | Val loss: 1.6489 | Val Precision: 0.7767 | Val Recall: 0.6435 | Val F1-score: 0.6821
🔄 Epoch 2/8
 Epoch 2 - Train Loss: 1488.7089 | Val loss: 1.4370 | Val Precision: 0.8029 | Val Recall: 0.6575 | Val F1-score: 0.7028
🔄 Epoch 3/8
 Epoch 3 - Train Loss: 1018.7308 | Val loss: 1.4707 | Val Precision: 0.8015 | Val Recall: 0.6788 | Val F1-score: 0.7173
🔄 Epoch 4/8
 Epoch 4 - Train Loss: 692.1763 | Val loss: 1.5878 | Val Precision: 0.7984 | Val Recall: 0.7393 | Val F1-score: 0.7613
🔄 Epoch 5/8
 Epoch 5 - Train Loss: 481.5805 | Val loss: 1.7836 | Val Precision: 0.7991 | Val Recall: 0.7575 | Val F1-score: 0.7727
🔄 Epoch 6/8
 Epoch 6 - Train Loss: 351.0442 | Val loss: 1.8294 | Val Precision: 0.8007 | Val Recall: 0.7586 | Val F1-score: 0.7739
🔄 Epoch 7/8
 Epoch 7 - Train Loss: 265.4077 | Val loss: 2.0356 | Val Precision: 0.7986 | Val Recall: 0.7694 | Val F1-score: 0.7809
🔄 Epoch 8/8
 Epoch 8 - Train Loss: 202.8032 | Val loss: 2.1532 | Val Precision: 

#### Training BERT without the no_relation class to see whether there is any significant difference in performance

In [None]:
# This code follows the same logic and approach as the previous blocks; it just examines the model's performance on the relation classes only.
# That is why the variables and the logic are the same, and why everything is implemented in one cell.
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import AdamW
from torch.optim.lr_scheduler import StepLR
from transformers import BertModel, BertTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from collections import Counter

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# ================== Data Processing ==================
def process_data(data, exclude_relations=('no_relation',)):
    """Reduce raw examples to the fields used by BERT, skipping some relations.

    Args:
        data: iterable of dicts with the keys 'token' (list of str),
            'relation', 'subj_start', 'subj_end', 'obj_start' and 'obj_end'.
            The ``*_end`` indices are treated as inclusive.
        exclude_relations: relation names whose examples are dropped. A single
            name may be given as a plain string. The default drops the
            'no_relation' class; pass ``()`` to keep every example.

    Returns:
        list of dicts with the keys 'text', 'relation', 'subject', 'object',
        'subj_start' and 'obj_start', one per kept example.
    """
    if isinstance(exclude_relations, str):
        # A bare string would otherwise be treated as a collection of characters.
        exclude_relations = (exclude_relations,)
    excluded = frozenset(exclude_relations)

    processed = []
    for item in data:
        # Filter out the excluded relations to check whether the model improves without them
        if item['relation'] in excluded:
            continue

        tokens = item['token']
        processed.append({
            'text': " ".join(tokens),
            'relation': item['relation'],
            'subject': " ".join(tokens[item['subj_start']:item['subj_end'] + 1]),
            'object': " ".join(tokens[item['obj_start']:item['obj_end'] + 1]),
            'subj_start': item['subj_start'],
            'obj_start': item['obj_start']
        })
    return processed

# Load train, test, and validation data
# Read the raw splits in the order train, test, dev.
_raw_splits = []
for _split_path in (
    '/Users/badershalata/Documents/TextMining/CW/tacred/data/json/train.json',
    '/Users/badershalata/Documents/TextMining/CW/tacred/data/json/test.json',
    '/Users/badershalata/Documents/TextMining/CW/tacred/data/json/dev.json',
):
    with open(_split_path, 'r') as file:
        _raw_splits.append(json.load(file))
data, testdata, valdata = _raw_splits

# we process data (removing "no relation" items)
processed_data, processed_testdata, processed_valdata = [
    process_data(split) for split in _raw_splits
]

# Save each processed split as pretty-printed JSON
for _records, _out_path in (
    (processed_data, '/Users/badershalata/Documents/TextMining/CW/tacred/data/json/processed_train.json'),
    (processed_testdata, '/Users/badershalata/Documents/TextMining/CW/tacred/data/json/processed_test.json'),
    (processed_valdata, '/Users/badershalata/Documents/TextMining/CW/tacred/data/json/processed_val.json'),
):
    with open(_out_path, 'w') as file:
        json.dump(_records, file, indent=4)

# Create new relation label mappings that do not include the no relation class.
# The labels are sorted so that the label -> id assignment is identical on every run:
# iterating a set of strings has no stable order across interpreter sessions, which
# would make a saved checkpoint's class ids disagree with a freshly built id_to_label.
# (The mapping used to be built twice in a row; once is enough.)
relation_labels = sorted({item["relation"] for item in processed_data})
label_to_id = {label: index for index, label in enumerate(relation_labels)}
id_to_label = dict(enumerate(relation_labels))

# ================== Tokenization & Tensor Conversion ==================
def encode_data(data, label_map, max_length=64):
    """Tokenize processed examples and pack them into tensors for the model.

    Uses the module-level ``tokenizer``.

    The entity positions stored in ``data`` are word indices of the original
    whitespace-separated tokens. The model gathers hidden states by position in
    the word-piece sequence, so the indices are converted here: one position is
    added for the leading special token and every preceding word contributes as
    many positions as it has word pieces. Previously the raw word index was used,
    which pointed at the wrong token.

    Args:
        data: iterable of dicts with the keys 'text', 'relation', 'subj_start'
            and 'obj_start' (as produced by ``process_data``).
        label_map: dict mapping a relation name to its integer class id.
        max_length: fixed sequence length every example is padded/truncated to.

    Returns:
        Tuple ``(input_ids, attention_masks, labels, entity1_indices,
        entity2_indices)`` of long tensors with shapes ``(N, max_length)``,
        ``(N, max_length)``, ``(N,)``, ``(N,)`` and ``(N,)``.

    Raises:
        KeyError: if an example's relation is missing from ``label_map``.
    """
    def first_piece_position(piece_counts, word_index):
        # +1 skips the special token the tokenizer puts at position 0
        # (assumes a BERT-style [CLS] prefix -- TODO confirm for other tokenizers).
        # Entities that fall beyond the truncation point are clamped to the last slot.
        return min(1 + sum(piece_counts[:word_index]), max_length - 1)

    input_ids, attention_masks = [], []
    labels, entity1_indices, entity2_indices = [], [], []
    for item in data:
        encoded = tokenizer(
            item['text'],
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors='pt'
        )
        input_ids.append(encoded['input_ids'].squeeze(0))
        attention_masks.append(encoded['attention_mask'].squeeze(0))
        labels.append(label_map[item['relation']])

        # 'text' is the original tokens joined by single spaces, so splitting on a
        # single space recovers the word list the stored indices refer to.
        words = item['text'].split(" ")
        furthest = max(item['subj_start'], item['obj_start'])
        # Tokenize each preceding word once and reuse the counts for both entities.
        piece_counts = [len(tokenizer.tokenize(word)) for word in words[:furthest]]
        entity1_indices.append(first_piece_position(piece_counts, item['subj_start']))
        entity2_indices.append(first_piece_position(piece_counts, item['obj_start']))

    if not input_ids:
        # torch.stack rejects an empty list; return correctly shaped empty tensors.
        return (
            torch.empty((0, max_length), dtype=torch.long),
            torch.empty((0, max_length), dtype=torch.long),
            torch.empty((0,), dtype=torch.long),
            torch.empty((0,), dtype=torch.long),
            torch.empty((0,), dtype=torch.long),
        )

    return (
        torch.stack(input_ids),
        torch.stack(attention_masks),
        torch.tensor(labels, dtype=torch.long),
        torch.tensor(entity1_indices, dtype=torch.long),
        torch.tensor(entity2_indices, dtype=torch.long)
    )

# Encode train, test, and validation data
# Encode each split once; every result is a 5-tuple of tensors
# (input ids, attention masks, labels, entity-1 indices, entity-2 indices).
train_tensors = encode_data(processed_data, label_to_id)
test_tensors = encode_data(processed_testdata, label_to_id)
val_tensors = encode_data(processed_valdata, label_to_id)

(train_input_ids, train_attention_masks, train_labels,
 train_entity1_indices, train_entity2_indices) = train_tensors
(test_input_ids, test_attention_masks, test_labels,
 test_entity1_indices, test_entity2_indices) = test_tensors
(val_input_ids, val_attention_masks, val_labels,
 val_entity1_indices, val_entity2_indices) = val_tensors

# Wrap the tensors of each split in a TensorDataset
train_dataset = TensorDataset(*train_tensors)
test_dataset = TensorDataset(*test_tensors)
val_dataset = TensorDataset(*val_tensors)

# Batched loaders; only the training loader shuffles.
batch_size = 32
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader, val_dataloader = (
    DataLoader(split_dataset, batch_size=batch_size, shuffle=False)
    for split_dataset in (test_dataset, val_dataset)
)

print(f"Train size: {len(train_dataset)}, Test size: {len(test_dataset)}, Validation size: {len(val_dataset)}")

# ================== Define BERT + FFNN Model ==================
    # it has the same architecture as previous BERT
class BertWithFFNN(nn.Module):
    """Pre-trained BERT, one extra transformer encoder layer and an FFNN head.

    The hidden states at the two entity positions are concatenated and passed
    through dropout -> Linear(1536, 256) -> ReLU -> Linear(256, num_labels).

    Args:
        num_labels: number of relation classes (size of the output layer).
    """

    def __init__(self, num_labels):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        # batch_first=True is required: BERT returns (batch, seq, hidden), while the
        # layer's default layout is (seq, batch, hidden) -- without the flag the
        # self-attention would mix different examples of the batch.
        # The flag does not change the parameter names, so old checkpoints still load.
        self.extra_layer = nn.TransformerEncoderLayer(
            d_model=768, nhead=12, dim_feedforward=3072, batch_first=True
        )
        self.dropout = nn.Dropout(0.2)
        # Two 768-dimensional entity vectors are concatenated, hence 768 * 2 inputs.
        self.fc1 = nn.Linear(768 * 2, 256)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(256, num_labels)

    def forward(self, input_ids, attention_mask, entity1_idx, entity2_idx):
        """Return the relation logits.

        Args:
            input_ids: token ids passed to BERT.
            attention_mask: mask passed to BERT; positions equal to 0 are treated
                as padding by the extra layer as well.
            entity1_idx: per-example sequence position of the first entity.
            entity2_idx: per-example sequence position of the second entity.

        Returns:
            Tensor of logits with one row per example and ``num_labels`` columns.
        """
        bert_outputs = self.bert(input_ids, attention_mask=attention_mask)
        hidden_states = bert_outputs.last_hidden_state
        # True marks positions the extra layer must not attend to (padding).
        padding_mask = attention_mask == 0
        extra_layer_output = self.extra_layer(hidden_states, src_key_padding_mask=padding_mask)

        batch_size = extra_layer_output.shape[0]
        # Build the row index on the same device as the activations.
        batch_index = torch.arange(batch_size, device=extra_layer_output.device)
        entity1_output = extra_layer_output[batch_index, entity1_idx]
        entity2_output = extra_layer_output[batch_index, entity2_idx]

        combined_representation = torch.cat([entity1_output, entity2_output], dim=-1)
        x = self.dropout(combined_representation)
        x = self.fc1(x)
        x = self.relu(x)
        logits = self.fc2(x)

        return logits
# ================== Model Training & Validation ==================
# Model initialization
num_labels = len(label_to_id)
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model = BertWithFFNN(num_labels).to(device)

# optimizer & scheduler
# StepLR multiplies the learning rate by gamma=0.9 (a 10% decrease) after every 500 calls
# to scheduler.step(). The training loop calls scheduler.step() once per optimizer update,
# i.e. once every gradient_accumulation_steps batches, so the decay happens every
# 500 optimizer updates rather than every 500 batches.
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)
scheduler = StepLR(optimizer, step_size=500, gamma=0.9)
# Loss function, it is still weighted as some classes appear less frequently than the others
# (even though we removed no relation which was the dominant class)
train_labels_np = train_labels.numpy()
class_counts = Counter(train_labels_np)
class_counts_tensors = torch.tensor([class_counts[i] for i in range(num_labels)], dtype=torch.float)

class_weights = 1/torch.sqrt(class_counts_tensors)
class_weights = class_weights.to(device)
loss_fn = nn.CrossEntropyLoss(weight=class_weights) # weight = 1/sqrt(number of samples in the class)

# NOTE(review): this call configures the precision of float32 matrix multiplications; the
# float16 mixed precision itself is enabled by torch.autocast inside the training loop.
torch.set_float32_matmul_precision("high")

epochs = 8
# Gradients of this many consecutive batches are accumulated before each optimizer update.
gradient_accumulation_steps = 2
# Same evaluate model method as explained before
def evaluate_model(model, dataloader, device):
    """Run ``model`` over ``dataloader`` without gradients and score it.

    Args:
        model: module called as ``model(input_ids, attention_mask=...,
            entity1_idx=..., entity2_idx=...)``; it is switched to eval mode.
        dataloader: yields 5-tuples (input ids, attention masks, labels,
            entity-1 indices, entity-2 indices).
        device: device every batch tensor is moved to.

    Returns:
        ``(precision, recall, f1)`` as weighted averages over the classes.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_masks, labels, entity1_indices, entity2_indices = (
                tensor.to(device) for tensor in batch
            )
            logits = model(
                input_ids,
                attention_mask=attention_masks,
                entity1_idx=entity1_indices,
                entity2_idx=entity2_indices,
            )
            # The predicted class is the index of the largest logit.
            predicted.extend(torch.argmax(logits, dim=1).cpu().numpy())
            expected.extend(labels.cpu().numpy())

    # Shared settings: weight every class by its support, score 0 when undefined.
    metric_options = {"average": "weighted", "zero_division": 0}
    return (
        precision_score(expected, predicted, **metric_options),
        recall_score(expected, predicted, **metric_options),
        f1_score(expected, predicted, **metric_options),
    )

# 8 epochs to train and validate the model on
num_train_batches = len(train_dataloader)
# Autocast has to target the device that was actually selected; float16 autocast is
# skipped on the CPU fallback instead of hard-coding "mps".
use_autocast = device.type != "cpu"

for epoch in range(epochs):
    print(f"🔄 Epoch {epoch + 1}/{epochs}")
    model.train()
    total_loss = 0
    # Start every epoch from clean gradients.
    optimizer.zero_grad()

    for i, batch in enumerate(train_dataloader):
        input_ids, attention_masks, labels, entity1_indices, entity2_indices = [b.to(device) for b in batch]

        with torch.autocast(device_type=device.type, dtype=torch.float16, enabled=use_autocast):  # Mixed precision
            logits = model(input_ids, attention_mask=attention_masks, entity1_idx=entity1_indices, entity2_idx=entity2_indices)
            loss = loss_fn(logits, labels)

        # Track the unscaled batch loss (it used to be accumulated after the division by
        # gradient_accumulation_steps, which halved the reported value).
        total_loss += loss.item()
        (loss / gradient_accumulation_steps).backward()

        # Update after every `gradient_accumulation_steps` batches, and also on the last
        # batch so that a trailing incomplete group is applied instead of leaking its
        # gradients into the next epoch.
        is_last_batch = (i + 1) == num_train_batches
        if (i + 1) % gradient_accumulation_steps == 0 or is_last_batch:
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()

    # Report the mean batch loss, which does not depend on the number of batches.
    avg_train_loss = total_loss / max(num_train_batches, 1)

    # Evaluate on validation data
    val_precision, val_recall, val_f1 = evaluate_model(model, val_dataloader, device)

    print(f" Epoch {epoch + 1} - Loss: {avg_train_loss:.4f} | Val Precision: {val_precision:.4f} | Val Recall: {val_recall:.4f} | Val F1-score: {val_f1:.4f}")

print("Training complete!")

Train size: 13012, Test size: 3325, Validation size: 5436
🔄 Epoch 1/10
📊 Epoch 1 - Loss: 502.1160 | Val Precision: 0.5600 | Val Recall: 0.5373 | Val F1-score: 0.5198
🔄 Epoch 2/10
📊 Epoch 2 - Loss: 247.2185 | Val Precision: 0.6696 | Val Recall: 0.6516 | Val F1-score: 0.6518
🔄 Epoch 3/10
📊 Epoch 3 - Loss: 155.0151 | Val Precision: 0.6714 | Val Recall: 0.6670 | Val F1-score: 0.6578
🔄 Epoch 4/10
📊 Epoch 4 - Loss: 103.1577 | Val Precision: 0.6922 | Val Recall: 0.6784 | Val F1-score: 0.6788
🔄 Epoch 5/10
📊 Epoch 5 - Loss: 69.8160 | Val Precision: 0.6842 | Val Recall: 0.6722 | Val F1-score: 0.6687
🔄 Epoch 6/10
📊 Epoch 6 - Loss: 45.7678 | Val Precision: 0.6858 | Val Recall: 0.6795 | Val F1-score: 0.6742
🔄 Epoch 7/10
📊 Epoch 7 - Loss: 33.0301 | Val Precision: 0.6969 | Val Recall: 0.6939 | Val F1-score: 0.6893
🔄 Epoch 8/10
📊 Epoch 8 - Loss: 24.7019 | Val Precision: 0.6890 | Val Recall: 0.6751 | Val F1-score: 0.6748
🔄 Epoch 9/10
📊 Epoch 9 - Loss: 18.4541 | Val Precision: 0.6885 | Val Recall: 0.686

#### Saving the model so that we can integrate it into the user interface

In [None]:
import torch
# Checkpoint file name (relative to the current working directory).
# NOTE(review): presumably meant for the model trained without the no_relation class -- this
# saves whatever `model` currently refers to, so confirm the cell order before running.
model_path = 'bertmodelnorelation.pt'
# Only the state_dict (the parameter tensors) is stored, not the model object itself;
# loading it later requires constructing BertWithFFNN with the same number of labels first.
torch.save(model.state_dict(), model_path)
print(f"model saved at {model_path}")

model saved at bertmodelnorelation.pt


# Test Dataset Evaluation

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
import numpy as np
import pandas as pd
def evaluate(model, dataloader,
             csv_path="/Users/badershalata/Documents/TextMining/CW/predictionvectorall.csv"):
    """Evaluate ``model`` on ``dataloader`` overall and per sentence-length bucket.

    Prints weighted precision/recall/F1 for short, mid and long sentences (split
    at the 33% and 66% quantiles of the sentence length), a per-class
    classification report and the overall metrics, then writes the true and
    predicted label names to ``csv_path``.

    Uses the module-level ``id_to_label`` mapping to turn class ids into names.

    Args:
        model: module called as ``model(input_ids, attention_mask=...,
            entity1_idx=..., entity2_idx=...)``; it is switched to eval mode.
        dataloader: yields 5-tuples (input ids, attention masks, labels,
            entity-1 indices, entity-2 indices).
        csv_path: destination of the prediction CSV; defaults to the location
            that used to be hard-coded.

    Returns:
        None.
    """
    model.eval()
    # Move the batches to wherever the model lives, so the function keeps working
    # after the model has been moved to another device.
    model_device = next(model.parameters()).device

    # Store all predictions and labels for overall metrics
    all_length, all_preds, all_labels = [], [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_masks, labels, entity1_indices, entity2_indices = [b.to(model_device) for b in batch]
            logits = model(input_ids, attention_mask=attention_masks, entity1_idx=entity1_indices, entity2_idx=entity2_indices)
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            labels = labels.cpu().numpy()

            # Sentence length = number of positions with a non-zero attention mask.
            sentences_length = attention_masks.sum(dim=1).cpu().numpy()
            all_length.extend(sentences_length)
            all_preds.extend(preds)
            all_labels.extend(labels)

    if not all_labels:
        # Quantiles and metrics are undefined without samples.
        print("⚠️ No samples in the dataloader; nothing to evaluate.")
        return

    # 33% and 66% quantiles of the sentence length.
    # They split the sentences into short, mid and long categories to evaluate the model on.
    q33 = np.quantile(all_length, 0.33)
    q66 = np.quantile(all_length, 0.66)
    print("Q33: ", q33)
    print("Q66: ", q66)

    # category name -> (true labels, predicted labels)
    buckets = {"Short": ([], []), "Mid": ([], []), "Long": ([], [])}
    for length, pred, label in zip(all_length, all_preds, all_labels):
        if length <= q33:
            name = "Short"
        elif length <= q66:
            name = "Mid"
        else:
            name = "Long"
        buckets[name][0].append(label)
        buckets[name][1].append(pred)

    def categoryscore(name, true_labels, pred_labels):
        # Print the weighted metrics of one sentence-length category.
        if len(true_labels) > 0:
            precision = precision_score(true_labels, pred_labels, average="weighted", zero_division=0)
            recall = recall_score(true_labels, pred_labels, average="weighted", zero_division=0)
            f1 = f1_score(true_labels, pred_labels, average="weighted", zero_division=0)
            print(f" {name} Sentences - Precision: {precision:.4f} | Recall: {recall:.4f} | F1-score: {f1:.4f}")
        else:
            print(f"⚠️ No {name} sentences in the test set.")

    # Print category-wise scores
    for name, (true_labels, pred_labels) in buckets.items():
        categoryscore(name, true_labels, pred_labels)

    # ===== Classification report =====
    # zero_division=0 matches the other metric calls and silences the
    # undefined-metric warnings for classes that are never predicted.
    report = classification_report(all_labels, all_preds, zero_division=0)
    print("REPORT: ", report)

    # Compute overall scores
    overall_precision = precision_score(all_labels, all_preds, average="weighted", zero_division=0)
    overall_recall = recall_score(all_labels, all_preds, average="weighted", zero_division=0)
    overall_f1 = f1_score(all_labels, all_preds, average="weighted", zero_division=0)

    print("\n **Overall Test Set Metrics**")
    print(f"Precision: {overall_precision:.4f}")
    print(f"recall: {overall_recall:.4f}")
    print(f"F1-score: {overall_f1:.4f}")
    print("ALL LABELS: ", all_labels)
    print("ALL PRED: ", all_preds)

    # FEED INTO CSV
    # This is done so we can compare the sentence length performance for different models
    df = pd.DataFrame({
        "True Labels": [id_to_label[label] for label in all_labels],
        "Predictions": [id_to_label[pred] for pred in all_preds],
    })
    df.to_csv(csv_path, index=False)

# TO start evaluating after training
evaluate(model, test_dataloader)


Q33:  35.0
Q66:  48.0
 Short Sentences - Precision: 0.8476 | Recall: 0.8340 | F1-score: 0.8373
 Mid Sentences - Precision: 0.7943 | Recall: 0.7851 | F1-score: 0.7863
 Long Sentences - Precision: 0.8064 | Recall: 0.7883 | F1-score: 0.7946
REPORT:                precision    recall  f1-score   support

           0       0.41      0.28      0.33        47
           1       0.30      0.30      0.30        81
           2       0.25      0.14      0.18        14
           3       0.37      0.47      0.41        60
           4       0.00      0.00      0.00         5
           5       0.39      0.46      0.42       189
           6       0.68      0.70      0.69        37
           7       0.29      0.14      0.19        28
           8       0.58      0.55      0.57       103
           9       0.36      0.62      0.45         8
          10       0.67      0.19      0.30        31
          11       0.13      0.27      0.18        11
          12       0.38      0.39      0.38       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Checking that saving and loading is working properly and as expected

In [None]:
import torch
# Checkpoint file name (relative to the current working directory).
model_path = 'bertmodel.pt'
# Only the state_dict (the parameter tensors) is stored, not the model object itself;
# loading it later requires constructing BertWithFFNN with the same number of labels first.
torch.save(model.state_dict(), model_path)
print(f"model saved at {model_path}")

model saved at bertmodel.pt


In [None]:
# 42 output classes: the checkpoint must have been trained with the same number of labels,
# otherwise load_state_dict raises a size-mismatch error for the output layer.
model = BertWithFFNN(42)
# map_location="cpu" makes the checkpoint loadable on machines that lack the device the
# tensors were saved from (the model was trained on "mps" when available); the model can be
# moved to another device afterwards with model.to(...).
model.load_state_dict(torch.load('bertmodel.pt', map_location="cpu"))
model.eval()
print("MODEL LOADED SUCCESS")

MODEL LOADED SUCCESS


In [None]:
import torch.nn.functional as F
sample_input = "Bader was born in the UK"
# 0-based positions of the two entities among the whitespace-separated words.
sample_subj_start = 0  # "Bader"
sample_obj_start = 5   # "UK"

# Encode the sentence with the same function that prepared the training data, so that the
# padding length and the entity indices follow exactly the convention the model was trained
# with. The model's forward() requires both entity indices; passing only the tokenizer
# output raises a TypeError.
sample_item = {
    'text': sample_input,
    # encode_data needs a known label; it is a placeholder and is not used for prediction.
    'relation': next(iter(label_to_id)),
    'subj_start': sample_subj_start,
    'obj_start': sample_obj_start,
}
sample_ids, sample_mask, _sample_label, sample_entity1, sample_entity2 = encode_data([sample_item], label_to_id)

# Move input to the same device as the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Run inference
with torch.no_grad():
    output = model(
        sample_ids.to(device),
        attention_mask=sample_mask.to(device),
        entity1_idx=sample_entity1.to(device),
        entity2_idx=sample_entity2.to(device),
    )
probs = F.softmax(output, dim=1)
predicted_id = torch.argmax(probs, dim=1).item()
predicted_label = id_to_label.get(predicted_id)
print("Model output:", predicted_label)


Model output: per:city_of_birth


### BERT model architecture

In [None]:
model

BertWithFFNN(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_