<a href="https://colab.research.google.com/github/ampham03/grad-assessment/blob/main/grad_assessment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers
!pip install datasets
!pip install nltk



In [2]:
import torch
from transformers import MarianMTModel, MarianTokenizer
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
model_name = "Helsinki-NLP/opus-mt-en-de"  # English to German
model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)

# load WMT dataset for de-en
de_en_train = load_dataset("wmt19", "de-en", split='train')
de_en_dev = load_dataset("wmt19", "de-en", split='validation')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
# get german sentences from the train dataset
wmt_corpus = []
for row in range(de_en_train.num_rows // 50):
  wmt_corpus.append(de_en_train[row]['translation']['de'])

In [5]:
german_stopwords = stopwords.words('german')
# extract 2-3 grams
vectorizer = CountVectorizer(ngram_range=(2, 5), stop_words=german_stopwords)
X = vectorizer.fit_transform(wmt_corpus)
ngram_counts = np.array(X.sum(axis=0)).flatten()
ngram_list = vectorizer.get_feature_names_out()
total_ngrams = ngram_counts.sum()

# extract unigrams
unigram_vectorizer = CountVectorizer(ngram_range=(1, 1), stop_words=german_stopwords)
X_uni = unigram_vectorizer.fit_transform(wmt_corpus)
unigram_counts = np.array(X_uni.sum(axis=0)).flatten()
unigram_list = unigram_vectorizer.get_feature_names_out()
total_unigrams = unigram_counts.sum()


In [6]:
ngram_probs = {
  ngram: count / total_ngrams
  for ngram, count in zip(ngram_list, ngram_counts)
  if count >= 3
}
unigram_probs = {
  unigram: count / total_unigrams
  for unigram, count in zip(unigram_list, unigram_counts)
  if count >= 3
}

In [23]:
def tokenize(sentence):
  return tokenizer(sentence, return_tensors="pt", padding=True).input_ids

In [24]:
class Hypothesis:
  def __init__(self, score=0, is_open=True, sequence=None, constraints=[]):
    self.score = score
    self.is_open = is_open
    self.sequence = sequence
    self.constraints = constraints


In [25]:
def init_grid(max_len, num_constraints):
  grid = []
  for i in range(max_len):
    row = []
    for j in range(num_constraints + 1):
        row.append([])
    grid.append(row)
  return grid

In [55]:
# generate new open hypotheses
def generate(model, hyp, input_ids, constraints=[]):

  if hyp.sequence is not None:
    if hyp.sequence[-1, -1].item() == tokenizer.eos_token_id:
      return None

  # forward pass
  with torch.no_grad():
      encoder_outputs = model.get_encoder()(input_ids=input_ids)

  # initialize decoder input
  if hyp.sequence is None:
      decoder_input_ids = torch.tensor([[model.config.decoder_start_token_id]])
  else:
      decoder_input_ids = hyp.sequence

  with torch.no_grad():
    outputs = model(
        input_ids=None,
        encoder_outputs=encoder_outputs,
        decoder_input_ids=decoder_input_ids
    )
    logits = outputs.logits

  # get the logits for the last predicted token
  next_token_logits = logits[:, -1, :] # [batch_size, sequence_length, vocab_size]
  next_token_probs = torch.softmax(next_token_logits, dim=-1).squeeze(0)

  # get the token with the highest probability
  next_token_id = torch.argmax(next_token_probs).item()
  next_token_prob = next_token_probs[next_token_id].item()

  new_score = hyp.score + next_token_prob
  new_sequence = torch.cat([decoder_input_ids, torch.tensor([[next_token_id]])], dim=-1)

  next_token = tokenizer.decode([next_token_id], skip_special_tokens=True)

  # check if the decoded token is in the list of constraints
  if next_token in constraints:
    new_constraints = hyp.constraints + [next_token]
  else:
    new_constraints = hyp.constraints

  new_hyp = Hypothesis(new_score, True, new_sequence, new_constraints)

  return new_hyp


In [27]:
# start new constrained hypotheses
def start(model, hyp, input_ids, constraints):

  if hyp.sequence is not None:
    if hyp.sequence[-1, -1].item() == tokenizer.eos_token_id:
      return None

  for constraint in constraints:
    if constraint not in hyp.constraints:

      # get constraint tokens
      constraint_token_ids = tokenizer.encode(constraint, add_special_tokens=False)

      # forward pass
      with torch.no_grad():
          encoder_outputs = model.get_encoder()(input_ids=input_ids)

      # initialize decoder input
      if hyp.sequence is None:
          decoder_input_ids = torch.tensor([[model.config.decoder_start_token_id]])
      else:
          decoder_input_ids = hyp.sequence

      new_sequence = decoder_input_ids
      new_score = hyp.score

      for token_id in constraint_token_ids:
        with torch.no_grad():
          outputs = model(
              input_ids=None,
              encoder_outputs=encoder_outputs,
              decoder_input_ids=decoder_input_ids
          )
          logits = outputs.logits

        # get the logits for the last predicted token
        next_token_logits = logits[:, -1, :] # [batch_size, sequence_length, vocab_size]
        next_token_probs = torch.softmax(next_token_logits, dim=-1).squeeze(0)

        # get probability of constraint token
        constraint_token_prob = next_token_probs[token_id].item()

        new_score = new_score + constraint_token_prob
        new_sequence = torch.cat([new_sequence, torch.tensor([[token_id]])], dim=-1)

      new_constraints = hyp.constraints + [constraint]

      is_open = len(hyp.constraints) < len(constraints)

      new_hyp = Hypothesis(new_score, is_open, new_sequence, new_constraints)
      return new_hyp

  return None

In [28]:
# continue unfinished hypotheses
def cont(model, hyp, input_ids, constraints):
  return generate(model, hyp, input_ids, constraints)

In [105]:
def constrained_beam_search(model, input_ids, constraints, max_len, num_constraints, beam_size=3):

  """
  model: model
  input: example tokenized sentence
  constraints: constraints
  max_len: max_len of translation
  num_constraints: # of constraints
  beam_size: beam_size
  """
  start_hyp = Hypothesis() # initial hypothesis
  grid = init_grid(max_len, num_constraints) # initialize beams in grid
  grid[0][0] = [start_hyp]

  for t in range(1, max_len):
    for c in range(max(0, (num_constraints + t) - max_len), min(t, num_constraints) + 1):
      n, s, g = [], [], []

      for hyp in grid[t - 1][c]:
        if hyp.is_open:
          new_hyp = generate(model, hyp, input_ids, constraints)
          if new_hyp is not None:
            g.append(new_hyp)

      if c > 0:
      for hyp in grid[t - 1][c - 1]:
        if hyp.is_open:
          new_hyp = start(model, hyp, input_ids, constraints)
          if new_hyp is not None:
            s.append(new_hyp)
        else:
          new_hyp = cont(model, hyp, input_ids, constraints)
          if new_hyp is not None:
            n.append(new_hyp)

      all_hyps = sorted(n + s + g, key=lambda hyp: hyp.score, reverse=True)

      grid[t][c] = all_hyps[:beam_size] # k-best scoring hypotheses stay on the beam

  top_level_hyps = [] # get hyps in top level beams
  for t in range(len(grid)):
    top_level_hyps.extend(grid[t][num_constraints])

  finished_hyps = []
  for hyp in top_level_hyps:
    if hyp.sequence[0, -1].item() == tokenizer.eos_token_id:
      finished_hyps.append(hyp)

  best_hyp = max(top_level_hyps, key=lambda hyp: hyp.score)

  return best_hyp


In [None]:
de_en_train.num_rows // 10

3478224

In [7]:
def pmi(p_x_y, p_x_vals):
  p_x = np.prod(p_x_vals)
  return np.log(p_x_y / p_x) if p_x != 0 else 0

In [8]:
def calculate_npmi(ngram, ngram_probs, unigram_probs):
  ngram_prob = ngram_probs.get(ngram, 0)
  if ngram_prob == 0:
    return 0

  words = ngram.split()

  p_x_vals = [unigram_probs.get(word, 1e-9) for word in words]
  pmi_value = pmi(ngram_prob, p_x_vals)

  npmi_value = pmi_value / -np.log(ngram_prob)
  return npmi_value

In [9]:
def extract_constraints(sentence, vectorizer, ngram_probs, unigram_probs, npmi_threshold=0.9):
  try:
    vectorizer.fit([sentence])
    sentence_ngrams = vectorizer.get_feature_names_out()
  except ValueError:
    return []

  constraints = []
  for ngram in sentence_ngrams:
    npmi = calculate_npmi(ngram, ngram_probs, unigram_probs)
    if npmi != 0: # idk how to get better npmi
      constraints.append(ngram)
  return constraints

In [16]:
print(extract_constraints(de_en_dev[5]['translation']['de'], vectorizer, ngram_probs, unigram_probs))

[]


In [None]:
print(extract_constraints(de_en_dev[0]['translation']['de'], vectorizer, ngram_probs, unigram_probs))

In [22]:
de_en_dev[6]['translation']['de']


'Das kennt jeder, der sich schon mal aufregen musste, weil das Auto-Navi statt einer Umgehungsstraße eine grüne Wiese anzeigte.'

In [17]:
for row in range(100):
  print(row, extract_constraints(de_en_dev[row]['translation']['de'], vectorizer, ngram_probs, unigram_probs))

0 []
1 []
2 []
3 []
4 []
5 []
6 ['grüne wiese', 'kennt schon', 'schon mal']
7 ['gerade deshalb']
8 []
9 ['19 jahrhundert', 'seit 19', 'städte gemeinden']
10 []
11 ['städte gemeinden']
12 []
13 []
14 ['seit frühjahr']
15 ['land gab', 'militärischen zwecken', 'natürlich militärischen']
16 ['französischen truppen']
17 []
18 ['stammt jahr']
19 ['früher heute', 'heute beispiel', 'vergleich früher']
20 []
21 ['beispiel dafür']
22 ['21 jahre', 'wurde 21']
23 ['heute liegt']
24 ['benannt wurde']
25 ['ersten blick']
26 ['heute landwirte', 'landwirte mehr']
27 []
28 ['19 jahrhundert']
29 ['gibt historische']
30 []
31 []
32 []
33 ['zusammenhang tod']
34 []
35 []
36 ['später wurde']
37 ['mehr grund']
38 ['befindet seit']
39 []
40 []
41 ['1980er jahren']
42 ['zerstört worden', 'zweiten weltkrieg']
43 []
44 ['zweite weltkrieg']
45 []
46 []
47 []
48 []
49 []
50 []
51 ['28 august', 'besondere aufmerksamkeit', 'kraft treten', 'lassen reihe', 'letzter zeit', 'vorsicht walten', 'vorsicht walten lassen', 

In [106]:
def generate_baseline_translation(model, input_ids):
  hyp = Hypothesis()
  while True:
    hyp = generate(model, hyp, input_ids)

    if hyp.sequence[0, -1].item() == tokenizer.eos_token_id:
      break

  return hyp.sequence

In [96]:
input_ids = tokenize("He was also an anti- smoking activist and took part in several campaigns.")
constraints = extract_constraints(de_en_dev[11]['translation']['de'], vectorizer, ngram_probs, unigram_probs)
constraints = ['Ebenso setzte er', 'gegen das Rauchen', 'nahm']

In [107]:
constrained_translation = constrained_beam_search(model, input_ids, constraints, 30, len(constraints))

In [99]:
baseline_translation = generate_baseline_translation(model, input_ids)

In [100]:
baseline_translation

tensor([[58100,   201,   133,    89,  2226,    13,   623,  9515,    13, 32533,
           651,    10,  3549,    39,  4356, 30902,  6143,     3,     0]])

In [108]:
constrained_translation.sequence

tensor([[58100,   201,   133,    89,  2226,    13,   623,  9515,    13, 32533,
           651,    10,   155,  1487,  1769,   452,   239,   227,   110,   372,
           749,    44, 20437,  1017,     3,  4638,  5616,   227,    39,  4356,
         30902,  6143,     3,     0]])

In [109]:
constrained_translation.score

11.76569245658041

In [102]:
#baseline
tokenizer.decode(torch.tensor([[58100,   201,   133,    89,  2226,    13,   623,  9515,    13, 32533,
           651,    10,  3549,    39,  4356, 30902,  6143,     3,     0]]).flatten(), skip_special_tokens= True)

'Er war auch Anti-Rauch-Aktivist und nahm an mehreren Kampagnen teil.'

In [110]:
#constrained
tokenizer.decode(torch.tensor([[558100,   201,   133,    89,  2226,    13,   623,  9515,    13, 32533,
           651,    10,   155,  1487,  1769,   452,   239,   227,   110,   372,
           749,    44, 20437,  1017,     3,  4638,  5616,   227,    39,  4356,
         30902,  6143,     3,     0]]).flatten(), skip_special_tokens= True)

'Er war auch Anti-Rauch-Aktivist und Ebenso setzte er gegen das Rauchen. nahmte an mehreren Kampagnen teil.'

In [104]:
constraints

['Ebenso setzte er', 'gegen das Rauchen', 'nahm']

In [81]:
# reference
de_en_dev[11]['translation']['de']

'In die große Übersichtskarte wurden für Städte und Gemeinden detailliertere Karten, sogenannte Urpositionsblätter, eingearbeitet.'

In [80]:
de_en_dev[11]['translation']['en']

'Within the large overview map, were worked in detailed maps for towns and municipalities, so-called original-lay-of-the-land sheets.'

In [None]:
# compute score using BLEU
def compute_score(prediction, actual):
  pass