In [1]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


In [2]:
import pandas as pd
df = pd.read_excel("data/TRVInfra_all_only_requirements_202312.xlsx", sheet_name=1, header=1, usecols=[3,4])
df.sample(5)

Unnamed: 0,MatchExtended,Complete?
2709,K129270 Arbets- och metodbeskrivning ska minst...,1.0
19127,K111518 Dagvattenledning ska dimensioneras så ...,
4986,K162024 Knapp för lamptest ska finnas i anlägg...,
14371,K56326 Nätomkopplingsautomatik ska vara av typ...,
8566,K30863 Där BIS tillhandahåller uppgift om spår...,


In [5]:
df['MatchExtended'] = df['MatchExtended'].str.replace(r'^K\d{3,}', '', regex=True)
df.sample(5)

Unnamed: 0,MatchExtended,Complete?
13164,Container för stationärt reservkraftaggregat ...,
20102,Ändring av en färgkod för färg i HMI ICS ska ...,
2057,"VMS placerade över vägbana, där djupet på sky...",0.0
17016,Statustappningar från styrapparater i vägtraf...,
2159,Navigeringsmeny i HMI ICS ska vara strukturer...,1.0


In [122]:
labeled = df[df['Complete?'].isin([0.0, 1.0])]
unlabeled = df[~df['Complete?'].isin([0.0, 1.0])]
labeled.sample(5)
len(unlabeled)

18316

In [7]:
labeled = labeled.fillna('')

In [8]:
grouped = labeled.groupby('Complete?')
min_group_size = grouped.size().min()
print(min_group_size)

479


In [9]:
labeled = grouped.apply(lambda x: x.sample(min_group_size))
labeled

Unnamed: 0_level_0,Unnamed: 1_level_0,MatchExtended,Complete?
Complete?,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,349,Produktdokumentation ska vara med minst följa...,0.0
0.0,2249,"Absolut, geodetisk, spårlägesmätning ska utfö...",0.0
0.0,1898,Vid trafikering på arbetsplats då spåret är o...,0.0
0.0,654,6.,0.0
0.0,2124,Sidoslitaget (s) får inte överskrida: 1.,0.0
...,...,...,...
1.0,469,Känslighet för blockering i bromsprovare ska ...,1.0
1.0,1573,"Bomrörelse vägbom öppningsbar bro, vägoperati...",1.0
1.0,1153,"Dike ska utformas för att samla upp, infiltre...",1.0
1.0,2660,För ML-132 kV ställverk ska spänningsnivån va...,1.0


In [10]:
sentences = labeled['MatchExtended'].values
labels = (labeled['Complete?'].values).astype(int)

In [11]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('KB/bert-base-swedish-cased')

In [12]:
print(' Original: ', sentences[0])
print('Tokenized: ', tokenizer.tokenize(sentences[0]))
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences[0])))

 Original:   Produktdokumentation ska vara med minst följande omfattning: a.
Tokenized:  ['Produkt', '##dokument', '##ation', 'ska', 'vara', 'med', 'minst', 'följande', 'omfattning', ':', 'a', '.']
Token IDs:  [14871, 33388, 223, 326, 358, 66, 1706, 3181, 7155, 126, 39, 7]


# Now we have prepared the data. Next we massage it so it can be ingested by BERT

In [13]:
max_len = 0
for sent in sentences:
    input_ids = tokenizer.encode(sent, add_special_tokens=True)
    max_len = max(max_len, len(input_ids))
print('Max sentence length: ', max_len)

Max sentence length:  151


In [14]:
input_ids = []
attention_masks = []

for sent in sentences:
    encoded_dict = tokenizer.encode_plus(
        sent, 
        add_special_tokens = True, 
        max_length=max_len+1, 
        truncation=True,
        pad_to_max_length = True,
        return_attention_mask = True,
        return_tensors = 'pt')
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])



Original:   Produktdokumentation ska vara med minst följande omfattning: a.
Token IDs: tensor([    2, 14871, 33388,   223,   326,   358,    66,  1706,  3181,  7155,
          126,    39,     7,     3,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0

# Training, validation and test split

In [15]:
from torch.utils.data import TensorDataset, random_split

dataset = TensorDataset(input_ids, attention_masks, labels)

learn_size = int(0.9 * len(dataset))
train_size = int(0.9 * learn_size)
val_size = learn_size - train_size
test_size = len(dataset) - learn_size

train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

print('{:>5,} learning samples'.format(learn_size))
print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))
print('{:>5,} test samples'.format(test_size))

  862 learning samples
  775 training samples
   87 validation samples
   96 test samples


In [16]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 32

train_dataloader = DataLoader(
    train_dataset,
    sampler = RandomSampler(train_dataset),
    batch_size = batch_size
)

validation_dataloader = DataLoader(
    val_dataset,
    sampler = SequentialSampler(val_dataset),
    batch_size = batch_size
)

# Training

In [17]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

model = BertForSequenceClassification.from_pretrained(
    'KB/bert-base-swedish-cased',
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False
)

Some weights of the model checkpoint at KB/bert-base-swedish-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at KB/bert-base-swedi

In [18]:
optimizer = AdamW(
    model.parameters(), 
    lr=2e-5,
    eps=1e-8
)



In [19]:
from transformers import get_linear_schedule_with_warmup

epochs = 4

total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps = 0,
    num_training_steps = total_steps
)

In [20]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [21]:
import time
import datetime

def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [22]:
import random
import numpy as np

seed_val = 23

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)


training_stats = []

total_t0 = time.time()

for epoch_i in range(0, epochs):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()

        output = model(b_input_ids, 
                             token_type_ids=None, 
                             attention_mask=b_input_mask, 
                             labels=b_labels)

        loss = output.loss
        logits = output.logits
        total_train_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        with torch.no_grad():
            output = model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask,
                                   labels=b_labels)
            loss = output.loss
            logits = output.logits
        
        total_eval_loss += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        total_eval_accuracy += flat_accuracy(logits, label_ids)

    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    avg_val_loss = total_eval_loss / len(validation_dataloader)

    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

    print("")
    print("Training complete!")

    print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

            


Training...

  Average training loss: 0.29
  Training epoch took: 0:09:59

Running Validation...
  Accuracy: 0.95
  Validation Loss: 0.13
  Validation took: 0:00:26

Training complete!
Total training took 0:10:25 (h:mm:ss)

Training...

  Average training loss: 0.10
  Training epoch took: 0:09:11

Running Validation...
  Accuracy: 0.98
  Validation Loss: 0.08
  Validation took: 0:00:22

Training complete!
Total training took 0:19:58 (h:mm:ss)

Training...

  Average training loss: 0.06
  Training epoch took: 0:11:29

Running Validation...
  Accuracy: 0.98
  Validation Loss: 0.12
  Validation took: 0:00:27

Training complete!
Total training took 0:31:54 (h:mm:ss)

Training...

  Average training loss: 0.04
  Training epoch took: 0:10:21

Running Validation...
  Accuracy: 0.96
  Validation Loss: 0.15
  Validation took: 0:00:22

Training complete!
Total training took 0:42:37 (h:mm:ss)


In [25]:
import pandas as pd
pd.set_option('display.precision', 2)
df_stats = pd.DataFrame(data=training_stats)
df_stats = df_stats.set_index('epoch')
df_stats

Unnamed: 0_level_0,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.29,0.13,0.95,0:09:59,0:00:26
2,0.1,0.08,0.98,0:09:11,0:00:22
3,0.06,0.12,0.98,0:11:29,0:00:27
4,0.04,0.15,0.96,0:10:21,0:00:22


In [91]:
import os
output_dir = '.'

model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
#torch.save(args, os.path.join(output_dir, 'training_args.bin'))

('./tokenizer_config.json',
 './special_tokens_map.json',
 './vocab.txt',
 './added_tokens.json',
 './tokenizer.json')

In [93]:
model = model.from_pretrained(output_dir)
tokenizer = tokenizer.from_pretrained(output_dir)
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50325, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

# Performance on test set

In [94]:
test_dataloader = DataLoader(
    test_dataset,
    sampler = SequentialSampler(test_dataset),
    batch_size = batch_size
)

In [95]:
print('Predicting labels for {:,} test sentences...'.format(len(test_dataset)))

model.eval()
predictions, true_labels = [], []

for batch in test_dataloader:
  # Add batch to GPU
  batch = tuple(t.to(device) for t in batch)
  
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask, b_labels = batch
  
  # Telling the model not to compute or store gradients, saving memory and 
  # speeding up prediction
  with torch.no_grad():
      # Forward pass, calculate logit predictions
      outputs = model(b_input_ids, token_type_ids=None, 
                      attention_mask=b_input_mask)

  logits = outputs[0]

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()
  
  # Store predictions and true labels
  predictions.append(logits)
  true_labels.append(label_ids)

print('    DONE.')

Predicting labels for 96 test sentences...
    DONE.


In [96]:
num_test_labels = len(test_dataset)
num_positive_labels = 0

for batch in test_dataset:
    num_positive_labels += batch[2]

print('Positive samples: %d of %d (%.2f%%)' % (num_positive_labels, num_test_labels, (num_positive_labels / num_test_labels * 100.0)))


Positive samples: 51 of 96 (53.12%)


In [97]:
from sklearn.metrics import matthews_corrcoef

matthews_set = []
print('Calculating Matthews Corr. Coef. for each batch...')
for i in range(len(true_labels)):
    pred_labels_i = np.argmax(predictions[i], axis=1).flatten()
    matthews = matthews_corrcoef(true_labels[i], pred_labels_i)                
    matthews_set.append(matthews)

Calculating Matthews Corr. Coef. for each batch...


In [65]:
import matplotlib.pyplot as plt
import seaborn as sns

ax = sns.barplot(x=list(range(len(matthews_set))), y=matthews_set, ci=None)
plt.title('MCC Score per Batch')
plt.ylabel('MCC Score (-1 to +1)')
plt.xlabel('Batch #')

plt.show()

ModuleNotFoundError: No module named 'seaborn'

In [98]:
flat_predictions = np.concatenate(predictions, axis=0)
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
flat_true_labels = np.concatenate(true_labels, axis=0)
mcc = matthews_corrcoef(flat_true_labels, flat_predictions)
print('Total MCC: %.3f' % mcc)


Total MCC: 1.000


In [121]:
flat_pred = np.argmax(np.concatenate(predictions, axis=0), axis=1)
flat_true = np.concatenate(true_labels, axis=0)
accuracy = np.sum(flat_pred == flat_true) / len(flat_pred)
print('Accuracy: ', accuracy)

Accuracy:  1.0


# Prediction on unlabeled data

In [140]:
sentences = unlabeled['MatchExtended'].values

max_len = 0
for sent in sentences:
    if type(sent) == str:
        input_ids = tokenizer.encode(sent, add_special_tokens=True)
        max_len = max(max_len, len(input_ids))
print('Max sentence length: ', max_len)


Max sentence length:  321


In [150]:
input_idsn = []
attention_masksn = []

for sent in sentences[:100]:
    if type(sent) == str:
        encoded_dict = tokenizer.encode_plus(
            sent, 
            add_special_tokens = True, 
            max_length=max_len+1, 
            truncation=True,
            pad_to_max_length = True,
            return_attention_mask = True,
            return_tensors = 'pt')
        input_idsn.append(encoded_dict['input_ids'])
        attention_masksn.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_idsn, dim=0)
attention_masks = torch.cat(attention_masksn, dim=0)

In [151]:
dataset = TensorDataset(input_ids, attention_masks)

unlabeled_dataloader = DataLoader(
    dataset,
    sampler = SequentialSampler(dataset),
    batch_size = batch_size
)

In [152]:
from torch.utils.data import Subset

model.eval()
predictions = []

for batch in unlabeled_dataloader:
  # Add batch to GPU
  batch = tuple(t.to(device) for t in batch)
  
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask = batch
  
  # Telling the model not to compute or store gradients, saving memory and 
  # speeding up prediction
  with torch.no_grad():
      # Forward pass, calculate logit predictions
      outputs = model(b_input_ids, token_type_ids=None, 
                      attention_mask=b_input_mask)

  logits = outputs[0]

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  #label_ids = b_labels.to('cpu').numpy()
  
  # Store predictions and true labels
  predictions.append(logits)
  #true_labels.append(label_ids)

print('    DONE.')

    DONE.


In [159]:
flat_pred = np.argmax(np.concatenate(predictions, axis=0), axis=1)
r = zip(sentences[:100], flat_pred)
print(list(r))

[(' Vid montage i vinklar, utifrån rätlinje, ska spännkraft i tråd verka på stolpe, se figur K7.', 1), (' Projektorganisationen ska vara samordnande för arbetsprocessen enligt {Bilaga 1 Arbetsprocess integration styrapparat till ÖTS}.', 1), (' Lysande VMS får inte vara med frontglas/frontskärm.', 1), (' Genomföringar och anslutningar ska vara utformade så att två stycken 95 mm² 3-fas kablar kan anslutas.', 1), (' Kontakt i motorkrets ska kunna sluta och bryta ström hos fastbromsad motor.', 1), (' Toleranser ska uppfylla toleransklass 1 enligt SS-EN 13670:2009 - Betongkonstruktioner - Utförande 5.', 0), (' Bedömning av lämplighet som teknisk granskare ska göras av beställare baserande på underlag som lämnas av sökandes organisation.', 1), (' Fiberenheter i Trafikverkets optiska spridningsnät får endast skarvas mot pigtails eller fan-outs i ODF-enheter.', 1), (' För körfältsignaler MCS (TM), omställbara vägmärken (MV), i grafiska användargränssnitt, ska symboler för hastighet finnas med 