In [23]:
!pip install transformers
!pip install datasets



In [24]:
!pip install nltk



In [25]:
import torch
from transformers import MarianMTModel, MarianTokenizer
from datasets import load_dataset
from nltk.translate.bleu_score import sentence_bleu

In [26]:
model_name = "Helsinki-NLP/opus-mt-en-de"  # English to German
model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)

# load WMT dataset for de-en
de_en_dev = load_dataset("wmt18", "de-en", split='validation')
de_en_test = load_dataset("wmt18", "de-en", split='test')

Resolving data files:   0%|          | 0/17 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/17 [00:00<?, ?it/s]

In [5]:
# tokenize a sentence
def tokenize(sentence):
  return tokenizer(sentence, return_tensors="pt", add_special_tokens=True).input_ids

In [7]:
# decode a sequence of tokens
def decode(token_ids):
  return tokenizer.decode(token_ids, skip_special_tokens=True)

In [6]:
class Hypothesis:
  def __init__(self, score=0, is_open=True, sequence=torch.tensor([[model.config.decoder_start_token_id]]), constraints=[]):
    self.score = score # keeps track of current sequence score
    self.is_open = is_open # keeps track if the hypothesis is currently open
    self.sequence = sequence # current sequence of tokens
    self.constraints = constraints # list of constraints the sequence currently contains

In [8]:
# create a grid with dimensions of the max length of a sequence and the number of constraints + 1
# (to account for when there are no constraints in the sequence)
def init_grid(max_len, num_constraints):
  grid = []
  for i in range(max_len):
    row = []
    for j in range(num_constraints + 1):
        row.append([])
    grid.append(row)
  return grid

In [9]:
constraint_boost = 11

In [19]:
# generate new open hypotheses
def generate(model, hyp, input_ids, beam_size, encoder_outputs, constraint_boost=0, constraints=[]):

  # check if sequence has EOS, return since no tokens can be added
  if hyp.sequence[0, -1].item() == tokenizer.eos_token_id:
    return [hyp]

  decoder_input_ids = hyp.sequence

  with torch.no_grad():
    outputs = model(
        input_ids=None,
        encoder_outputs=encoder_outputs,
        decoder_input_ids=decoder_input_ids
    )
    logits = outputs.logits

  # get the logits for the last predicted token
  next_token_logits = logits[:, -1, :] # [batch_size, sequence_length, vocab_size]
  next_token_probs = torch.softmax(next_token_logits, dim=-1).squeeze(0)

  # get the top beam_size tokens
  top_token_probs, top_token_ids = torch.topk(next_token_probs, beam_size)

  new_hyps = []

  # create new hypotheses with the top beam_size tokens
  for i in range(beam_size):
    next_token_id = top_token_ids[i].item()
    next_token_prob = top_token_probs[i].item()

    decoded_token = tokenizer.decode(next_token_id, skip_special_tokens=False)

    # check to see if the new token is a constraint and
    # if that constraint is already in the sequence
    flag = False
    for constraint in hyp.constraints:
      if decoded_token in constraint or decoded_token == constraint:
        flag = True
        continue

    # if it is, skip the token
    if flag:
      continue

    new_constraints = hyp.constraints
    new_score = 0
    is_open = True

    # if the new token is a constraint and it is not in the sequence,
    # add it to the constraint list
    for constraint in constraints:
      curr_constraint = constraint.split() if " " in constraint else [constraint]
      if (decoded_token == curr_constraint[0]):
        new_constraints = hyp.constraints + [decoded_token]
        new_score = constraint_boost

        # if the constraint is more than one word, close the hypothesis
        # so the constraint can be continued
        if len(curr_constraint) > 1:
          is_open = False

    new_score = new_score + hyp.score + next_token_prob # add the score of the token to the total score
    new_sequence = torch.cat([decoder_input_ids, torch.tensor([[next_token_id]])], dim=-1) # add the token to the sequence

    new_hyp = Hypothesis(new_score, is_open, new_sequence, new_constraints) # create a new hypothesis
    new_hyps.append(new_hyp)

  return new_hyps


In [10]:
# start new constrained hypotheses
def start(model, hyp, input_ids, constraints, encoder_outputs, constraint_boost):

  # check if sequence has EOS, return since no tokens can be added
  if hyp.sequence[0, -1].item() == tokenizer.eos_token_id:
    return [hyp]

  new_hyps = []

  # get the first word of a multi word constraint or a single word constraint itself
  for constraint in constraints:
    first_word = constraint.split()[0] if ' ' in constraint else constraint
    # if the constraint is already in the sequence, skip it
    if first_word in hyp.constraints:
      continue

    first_word_token_ids = tokenizer.encode(first_word, add_special_tokens=False) # get the tokens of the word

    new_sequence = hyp.sequence
    log_prob = 0.0

    # takes account for multi token words
    for token_id in first_word_token_ids:
      new_sequence = torch.cat([new_sequence, torch.tensor([[token_id]])], dim=-1) # add the token to the sequence

      with torch.no_grad():
        outputs = model(
            input_ids=None,
            encoder_outputs=encoder_outputs,
            decoder_input_ids=new_sequence
        )
        logits = outputs.logits

      # get the logits for the last predicted token
      next_token_logits = logits[:, -1, :] # [batch_size, sequence_length, vocab_size]
      log_probs = torch.log_softmax(next_token_logits, dim=-1).squeeze(0)

      log_prob += log_probs[token_id].item() # add up the probs of each token

    constraint_factor = len(hyp.constraints) if len(hyp.constraints) > 0 else 1 # normalization factor
    # log_prob / len(first_word_token_ids) to take account of multi token constraints and
    # constraint_boost / constraint_factor to make each word in the constraint equal
    new_score = (hyp.score + (log_prob / len(first_word_token_ids))) + (constraint_boost / constraint_factor)

    new_constraints = hyp.constraints + [first_word]
    is_open = False if ' ' in constraint else True # single word constraints are finished, multi word constraints need to be continued

    new_hyp = Hypothesis(new_score, is_open, new_sequence, new_constraints)
    new_hyps.append(new_hyp)

  return new_hyps

In [11]:
# continue unfinished constraints
def cont(model, hyp, input_ids, constraints, encoder_outputs, constraint_boost):

  # check if sequence has EOS, return since no tokens can be added
  if hyp.sequence[0, -1].item() == tokenizer.eos_token_id:
    return hyp

  for constraint in constraints:
    # look at the last constraint added to the sequence
    if (hyp.constraints[-1] in constraint):
      constraint_words = constraint.split()
      completed_words = [word for word in hyp.constraints if word in constraint_words] # get the completed words of the constraint

      # check if the constraint is incomplete
      if (len(completed_words) < len(constraint_words)):
        next_word = constraint_words[len(completed_words)]
        next_word_token_ids = tokenizer.encode(next_word, add_special_tokens=False)

        new_sequence = hyp.sequence
        log_prob = 0.0

        # takes account for multi token words
        for token_id in next_word_token_ids:
          new_sequence = torch.cat([new_sequence, torch.tensor([[token_id]])], dim=-1) # add the token to the sequence

          with torch.no_grad():
            outputs = model(
                input_ids=None,
                encoder_outputs=encoder_outputs,
                decoder_input_ids=new_sequence
            )
            logits = outputs.logits

          # get the logits for the last predicted token
          next_token_logits = logits[:, -1, :] # [batch_size, sequence_length, vocab_size]
          log_probs = torch.log_softmax(next_token_logits, dim=-1).squeeze(0)

          log_prob += log_probs[token_id].item() # add up the probs of each token

        constraint_factor = len(hyp.constraints) if len(hyp.constraints) > 0 else 1 # normalization factor
        # log_prob / len(first_word_token_ids) to take account of multi token constraints and
        # constraint_boost / constraint_factor to make each word in the constraint equal
        new_score = hyp.score + (log_prob / len(next_word_token_ids)) + (constraint_boost / constraint_factor)

        new_constraints = hyp.constraints + [next_word]

        completed_words.append(next_word)
        is_open = len(completed_words) == len(constraint_words) # check if constraints are completed

        new_hyp = Hypothesis(new_score, is_open, new_sequence, new_constraints)

        return new_hyp

  return hyp

In [12]:
def constrained_beam_search(model, input_ids, constraints, max_len, num_constraints, constraint_boost=10, beam_size=5):

  """
  model: model
  input_ids: tokenized sentence
  constraints: list of constraints
  max_len: max length of translation
  num_constraints: total number of words in the list of constraints
  beam_size: beam size
  """
  start_hyp = Hypothesis() # initial hypothesis
  grid = init_grid(max_len, num_constraints) # initialize beams in grid
  grid[0][0] = [start_hyp]

  # forward pass
  with torch.no_grad():
    encoder_outputs = model.get_encoder()(input_ids=input_ids)

  for t in range(1, max_len): # time step
    for c in range(max(0, (num_constraints + t) - max_len), min(t, num_constraints) + 1): # num of constraints step
      n, s, g = [], [], []

      for hyp in grid[t - 1][c]:
        if hyp.is_open:
          # generate new hypotheses
          new_hyps = generate(model, hyp, input_ids, beam_size, encoder_outputs, constraint_boost, constraints)
          g.extend(new_hyps)

      if c > 0:
        for hyp in grid[t - 1][c - 1]:
          if hyp.is_open:
            # start new constraints
            new_hyps = start(model, hyp, input_ids, constraints, encoder_outputs, constraint_boost)
            s.extend(new_hyps)
          else:
            # continue unfinished constraints
            new_hyp = cont(model, hyp, input_ids, constraints, encoder_outputs, constraint_boost)
            n.append(new_hyp)

      # sort the hypotheses from highest score to lowest score
      all_hyps = sorted(n + s + g, key=lambda hyp: hyp.score, reverse=True)

      valid_hyps = []
      # valid hyps are hyps where the sequence has not ended or all the constraints are met
      for hyp in all_hyps:
        # where the sequence has not ended means it can still be expanded
        if (hyp.sequence[0, -1].item() != tokenizer.eos_token_id):
          valid_hyps.append(hyp)
        else:
          # where the sequence has ended and it has all the constraints means it's done
          if (len(hyp.constraints) == num_constraints):
            valid_hyps.append(hyp)

      grid[t][c] = valid_hyps[:beam_size] # top beam_size scoring hypotheses stay on the beam

  top_level_hyps = []
  # get hyps in top level beams
  for t in range(len(grid)):
    top_level_hyps.extend(grid[t][num_constraints])

  finished_hyps = []
  # get hyps with EOS token
  for hyp in top_level_hyps:
    if hyp.sequence[0, -1].item() == tokenizer.eos_token_id:
      finished_hyps.append(hyp)

  # if no finished hyps
  if not finished_hyps:
    print("Warning: No finished hypotheses. Returning the best incomplete hypothesis.")
    best_hyp = max(top_level_hyps, key=lambda hyp: hyp.score)
  else:
    best_hyp = max(finished_hyps, key=lambda hyp: hyp.score) # get best hyp

  return best_hyp


In [13]:
def generate_baseline_translation(model, input_ids):
  hyp = Hypothesis()

  # forward pass
  with torch.no_grad():
    encoder_outputs = model.get_encoder()(input_ids=input_ids)

  while True:
    hyp = generate(model, hyp, input_ids, 1, encoder_outputs)[0]

    if hyp.sequence[0, -1].item() == tokenizer.eos_token_id:
      break

  return hyp.sequence

In [14]:
en_sentences_dev = [row['translation']['en'] for row in de_en_dev]
de_sentences_dev = [row['translation']['de'] for row in de_en_dev]

In [15]:
en_sentences_test = [row['translation']['en'] for row in de_en_test]
de_sentences_test = [row['translation']['de'] for row in de_en_test]

In [16]:
def extract_constraints(reference, hypothesis, constraint_length=3):
  ref_words = reference.split()
  hyp_words = set(hypothesis.split())

  constraints = []
  used_indices = set()

  # extract constraints of length constraint_length
  for i in range(len(ref_words) - constraint_length + 1):
    if any(idx in used_indices for idx in range(i, i + constraint_length)):
      continue  # skip overlapping tokens

    phrase = " ".join(ref_words[i : i + constraint_length])
    first_word = ref_words[i]

    # only add the phrase if the first word is missing in the hypothesis (relaxed constraints)
    if first_word not in hyp_words:
      constraints.append(phrase)
      used_indices.update(range(i, i + constraint_length))

  # if no constraints of `constraint_length` are found, look for shorter phrases
  if not constraints:
    for length in range(constraint_length - 1, 0, -1):
      for i in range(len(ref_words) - length + 1):
        if any(idx in used_indices for idx in range(i, i + length)):
          continue  # skip overlapping tokens

        phrase = " ".join(ref_words[i : i + length])
        first_word = ref_words[i]

        if first_word not in hyp_words:
          constraints.append(phrase)
          used_indices.update(range(i, i + length))

      if constraints:
          break

  # handle remaining tokens at the end of the reference sentence
  last_unprocessed_idx = max(used_indices) + 1 if used_indices else 0
  if last_unprocessed_idx < len(ref_words):
    remaining_phrase = " ".join(ref_words[last_unprocessed_idx:])
    first_word = ref_words[last_unprocessed_idx]

    if first_word not in hyp_words:
      constraints.append(remaining_phrase)

  return constraints


In [17]:
# compute score using BLEU
def compute_BLEU_score(prediction, actual):
  prediction_tokens = prediction.split()
  actual_tokens = actual.split()

  bleu_score = sentence_bleu(
      [actual_tokens],  # reference
      prediction_tokens,  # hypothesis
  )

  return bleu_score

In [27]:
results = []
best_constrained_score = -float("inf")
for beam_size in [5, 10, 15]:
  for constraint_boost in [5, 10, 15]:
    baseline_score_avg = 0
    constrained_score_avg = 0
    sentence_count = 0

    for en_sentence, de_sentence in zip(en_sentences_dev[:100], de_sentences_dev[:100]):
      input_ids = tokenize(en_sentence)

      if len(en_sentence.split()) > 15:
        continue

      baseline_translation = decode(generate_baseline_translation(model, input_ids).flatten().tolist())

      if baseline_translation == de_sentence:
        constrained_translation = baseline_translation
      else:
        constraints = extract_constraints(de_sentence, baseline_translation)
        num_constraints = sum(len(constraint.split()) if " " in constraint else 1 for constraint in constraints) if len(constraints) > 0 else 0
        constrained_translation = decode(constrained_beam_search(model, input_ids, constraints, len(en_sentence.split()) + 10, num_constraints, constraint_boost, beam_size).sequence.flatten().tolist())

      baseline_score = compute_BLEU_score(baseline_translation, de_sentence)
      constrained_score = compute_BLEU_score(constrained_translation, de_sentence)

      sentence_count += 1

      print(sentence_count)
      print(baseline_translation, baseline_score)
      print(constrained_translation, constrained_score)
      print()

      baseline_score_avg += baseline_score
      constrained_score_avg += constrained_score

    baseline_score_avg /= sentence_count
    constrained_score_avg /= sentence_count

    results.append({
                "beam_size": beam_size,
                "constraint_boost": constraint_boost,
                "baseline_score_avg": baseline_score_avg,
                "constrained_score_avg": constrained_score_avg,
                "sentence_count": sentence_count
            })

    # Check if this is the best score so far
    if constrained_score_avg > best_constrained_score:
        best_constrained_score = constrained_score_avg
        best_params = (beam_size, constraint_boost)

    print(f"Beam size: {beam_size}, Boost: {constraint_boost}")
    print(f"Baseline Avg BLEU: {baseline_score_avg}, Constrained Avg BLEU: {constrained_score_avg}, Sentences: {sentence_count}")

  print("\nBest Parameters:")
  print(f"Beam size: {best_params[0]}, Constraint boost: {best_params[1]}")
  print(f"Best Constrained BLEU: {best_constrained_score}")


1
28-jähriger Chef fand tot in San Francisco Mall 0.38260294162784475
Der 28-jährige Koch in San Francisco Mall wurde tot gefunden. 0.4111336169005197

2
Ein Sprecher von Sons & Daughters sagte, sie seien "geschockt und verwüstet" durch seinen Tod. 0.1772712285241271
Ein Sprecher von dass sie über Son am Boden zerstört Tod "schockiert und verwisch des Sons & Daughter seien". 6.742419825730601e-78

3
Unsere Gedanken und unser Beileid sind mit Franks Familie und Freunden in dieser schwierigen Zeit. 0.4480304273880272
Unsere Gedanken und unser Beileid sind mit Franks Familie und Freunden in dieser schweren Zeit bei... 0.537284965911771

4
"Er hat eine Wohnung gefunden, er ist mit einem Mädchen zusammen", sagte Louis Galicia zu KGO. 1.2882297539194154e-231
"Er hat eine Wohnung gefunden hatte eine eine Wohnung, er war mit einem Mädchen fand 3.6170146665513074e-78

5
Er war ein freundlicher Geist mit einem großen Herzen. 0.5969491792019646
Er war ein freundlicher Geist mit einem großen Herze

KeyboardInterrupt: 

In [None]:
def calculate_BLEU_score_avgs(en_sentences, de_sentences):
  baseline_score_avg = 0
  constrained_score_avg = 0
  sentence_count = 0

  for en_sentence, de_sentence in zip(en_sentences[:100], de_sentences[:100]):
    input_ids = tokenize(en_sentence)

    if len(en_sentence.split()) > 15:
      continue

    baseline_translation = decode(generate_baseline_translation(model, input_ids).flatten().tolist())

    if baseline_translation == de_sentence:
      constrained_translation = baseline_translation
    else:
      constraints = extract_constraints(de_sentence, baseline_translation)
      num_constraints = sum(len(constraint.split()) if " " in constraint else 1 for constraint in constraints) if len(constraints) > 0 else 0
      constrained_translation = decode(constrained_beam_search(model, input_ids, constraints, len(en_sentence.split()) + 10, num_constraints).sequence.flatten().tolist())

    baseline_score = compute_BLEU_score(baseline_translation, de_sentence)
    constrained_score = compute_BLEU_score(constrained_translation, de_sentence)

    sentence_count += 1

    print(sentence_count)
    print(baseline_translation, baseline_score)
    print(constrained_translation, constrained_score)
    print()

    baseline_score_avg += baseline_score
    constrained_score_avg += constrained_score

  baseline_score_avg /= sentence_count
  constrained_score_avg /= sentence_count

  return baseline_score_avg, constrained_score_avg

In [20]:
dev_baseline_score_avg, dev_constrained_score_avg = calculate_BLEU_score_avgs(en_sentences_dev, de_sentences_dev)

1
28-jähriger Chef fand tot in San Francisco Mall 0.38260294162784475
Der 28-jährige Koch in San Francisco Mall wurde tot gefunden. 0.4111336169005197

2
Ein Sprecher von Sons & Daughters sagte, sie seien "geschockt und verwüstet" durch seinen Tod. 0.1772712285241271
Tod "schockiert und am Boden zerstört, dass sie über die Grenzen des Sons & Daughter- seien". 0.3397898021621042

3
Unsere Gedanken und unser Beileid sind mit Franks Familie und Freunden in dieser schwierigen Zeit. 0.4480304273880272
Unsere Gedanken und unser Beileid sind mit Franks Familie und Freunden in dieser schweren Zeit beibachtet. 0.537284965911771

4
"Er hat eine Wohnung gefunden, er ist mit einem Mädchen zusammen", sagte Louis Galicia zu KGO. 1.2882297539194154e-231
"Er hat eine Wohnung gefunden hatte eine eine Wohnung, fand 6.677571628641163e-155



The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


5
Er war ein freundlicher Geist mit einem großen Herzen. 0.5969491792019646
Er war ein gütiger Mensch mit einem großen Herzen. 0.5969491792019646

6
Er wollte nie in irgendeiner Art von Auseinandersetzung sein. 0.43167001068522526
Er wollte nie in irgendeiner Art von Auseinandersetzung sein.. an irgendeiner Art. 0.33180774028439425

7
Er war der Bruder, der mit dem Fluss ging. 0.7259795291154771
Er war der Bruder, der mit dem Strom schwamm. 1.0

8
Jeder, der Informationen hat, wird gebeten, die SFPD Tip Line unter 415-575-4444 anzurufen. 3.1640699269154553e-78
Wer Informationen hat, wird gebeten das Hinweistelefon zu dem Fall an des SFPD unter Nummer 415-575-4444 anzurufen. 0.3972595463783053

9
Jennifer Aniston: Ich bin immer geplatzt 4.446808895758207e-78
Jennifer Anis werde immer ins Schubladen gesteckt. 7.183445846156676e-155

10
Jennifer Aniston muss nicht immer perfekt oder erfolgreich sein. 1.0
Jennifer Aniston muss nicht immer perfekt oder erfolgreich sein. 1.0

11
Das hat der 

In [22]:
print(dev_baseline_score_avg, dev_constrained_score_avg)

0.26859017981201255 0.4009753253283911


In [None]:
test_baseline_score_avg, test_constrained_score_avg = calculate_BLEU_score_avgs(en_sentences_test, de_sentences_test)

In [None]:
print(test_baseline_score_avg, test_constrained_score_avg)