In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 28.3 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 67.5 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 58.1 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [2]:
!pip install openai

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openai
  Downloading openai-0.25.0.tar.gz (44 kB)
[K     |████████████████████████████████| 44 kB 1.9 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting pandas-stubs>=1.1.0.11
  Downloading pandas_stubs-1.5.2.221213-py3-none-any.whl (147 kB)
[K     |████████████████████████████████| 147 kB 42.4 MB/s 
Collecting types-pytz>=2022.1.1
  Downloading types_pytz-2022.7.0.0-py3-none-any.whl (4.7 kB)
Building wheels for collected packages: openai
  Building wheel for openai (PEP 517) ... [?25l[?25hdone
  Created wheel for openai: filename=openai-0.25.0-py3-none-any.whl size=55880 sha256=775929f7c7450a7f4a71504eb26afca90c5b3c5e99a1e0b59ee091ed392dc59f
  Stored in directory: /root/.cache/pip/wheels/4b/92/33/6f57c7aae0b16875267999a50570e81f15eecec577ebe

## SURPRISAL MODULE MANUAL IMPORTING

In [3]:
"""Defines the API for this module"""

from abc import ABC, abstractmethod, abstractclassmethod, abstractproperty
import typing


class Model(ABC):
    """Abstract base for anything that can assign surprisal to text.

    Attributes:
        model_id: identifier of the underlying model, or None if not given.
    """

    def __init__(self, model_id=None) -> None:
        self.model_id = model_id
        super().__init__()

    @abstractmethod
    def surprise(self, textbatch: typing.Union[typing.List, str]) -> "Surprisal":
        """Compute surprisal for a single text or a batch of texts.

        Concrete subclasses must override this method; the base version
        always raises NotImplementedError.
        """
        raise NotImplementedError()


class SurprisalQuantity(float):
    """A float surprisal value that also carries the text it was computed over.

    Behaves like a regular float in arithmetic and comparisons; the extra
    `text` attribute is only used for display and lookup.

    Args:
        value: the numeric surprisal.
        text: the text (e.g. space-joined tokens) the value refers to.
            Defaults to the empty string.
    """

    def __new__(cls, value, text=""):
        # float is immutable, so the numeric value has to be fixed here.
        # `text` needs the same default as in __init__: without it,
        # `SurprisalQuantity(1.0)` raised TypeError before __init__ ever ran.
        return float.__new__(cls, value)

    def __init__(self, value, text="") -> None:
        super().__init__()
        self.text = text

    def __repr__(self) -> str:
        """Return the float repr followed by the associated text on a new line."""
        return super().__repr__() + "\n" + self.text

    def get_value(self):
        """Return the numeric value as a plain `float` (without the text)."""
        return float(self)

    def get_text(self):
        """Return the text associated with this value."""
        return self.text


class SurprisalArray(ABC):
    """Abstract container pairing a token sequence with per-token surprisals.

    Subclasses provide the `tokens` and `surprisals` properties. `lineplot`
    additionally iterates over `self`, so subclasses are expected to yield
    `(token, surprisal)` pairs from `__iter__`.
    """

    def __index__(self):
        # NOTE(review): placeholder that returns None; anything that actually
        # calls operator.index() on an instance will fail. Confirm whether
        # this stub is still needed.
        pass

    def __len__(self):
        """Number of surprisal entries."""
        return len(self.surprisals)

    @property
    @abstractmethod
    def tokens(self):
        """The tokens the surprisals refer to."""
        raise NotImplementedError

    @property
    @abstractmethod
    def surprisals(self) -> typing.Collection[SurprisalQuantity]:
        """The per-token surprisal values."""
        raise NotImplementedError

    def lineplot(self, f=None, a=None, cumulative=False):
        """Plot surprisal per token as a line plot and annotate each point.

        Args:
            f: matplotlib figure to draw into. A new figure/axes pair is
                created when either `f` or `a` is None.
            a: matplotlib axes to draw into (see `f`).
            cumulative: if True, plot the running sum of the surprisals
                instead of the individual values.

        Returns:
            The `(figure, axes)` pair that was drawn into.
        """
        from matplotlib import pyplot as plt
        import numpy as np

        if f is None or a is None:
            f, a = plt.subplots()

        arr = np.cumsum(self.surprisals) if cumulative else self.surprisals
        a.plot(
            # a small uniform jitter in [0, 0.1) is added to the plotted
            # values; the annotations below use the un-jittered values
            arr + np.random.rand(len(self)) / 10,
            ".--",
            lw=2,
            label=" ".join(self.tokens),
            alpha=0.9,
        )
        a.set(
            xticks=range(0, len(self.tokens)),
            xlabel=("tokens"),
            ylabel=(
                f"{'cumulative ' if cumulative else ''}surprisal (natural log scale)"
            ),
        )
        # Lay out the figure we actually drew into. `plt.tight_layout()` acts
        # on pyplot's *current* figure, which is not necessarily `f` when the
        # caller passed in their own figure/axes.
        f.tight_layout()
        a.grid(visible=True)

        for i, (t, _) in enumerate(self):
            a.annotate(t, (i, arr[i]))

        return f, a

In [4]:
from transformers import tokenization_utils_base


def pick_matching_token_ixs(
    encoding: "tokenizers.Encoding", span_of_interest: slice, span_type: str
) -> slice:
    """Picks token indices in a tokenized encoded sequence that best correspond to
        a substring of interest in the original sequence, given by a span (slice)

    Args:
        encoding: the encoding of a single text instance (not a batch). Must
            expose `ids`, `token_to_chars(i)` and `token_to_word(i)`.
        span_of_interest (slice): the span of the original text to find tokens
            for, in characters (`span_type="char"`) or in words
            (`span_type="word"`). A missing `start` means "from the beginning",
            a missing `stop` means "through the end". `step` is ignored.
        span_type (str): either `char` or `word`, denoting what type of span we are
            interested in obtaining. this argument has no default to ensure the user
            is aware of what kind of span they are getting from this function

    Returns:
        slice: the start and stop indices of **tokens** within the encoded sequence
            that best match the `span_of_interest`. For `char` spans a token that
            the span only partially covers is included as a whole; for `word`
            spans `stop` is exclusive, like a regular slice.
    """
    start = span_of_interest.start if span_of_interest.start is not None else 0
    # Keep `None` as "no upper bound". The previous `stop or len(encoding.ids)`
    # both mistook an explicit stop of 0 for "missing" and substituted a *token*
    # count where a character (or word) index was expected.
    stop = span_of_interest.stop

    n_tokens = len(encoding.ids)
    start_token = 0
    end_token = n_tokens
    for i in range(n_tokens):
        span = encoding.token_to_chars(i)
        word = encoding.token_to_word(i)

        if span is None or word is None:
            # tokens without a char span or word (e.g. special tokens such as
            # [CLS]) cannot be aligned to the text
            continue
        span_start, span_end = span

        if span_type == "char":
            # the last token starting at or before `start` opens the span
            if span_start <= start:
                start_token = i
            # the first token reaching `stop` closes it (inclusive of that token)
            if stop is not None and span_end >= stop:
                end_token = i + 1
                break
        elif span_type == "word":
            if word < start:
                start_token = i + 1
            # `stop` is exclusive: end at the FIRST token of the first word at or
            # past `stop`. Without the break, a stop word made of several tokens
            # kept advancing `end_token` and leaked its leading tokens into the
            # result.
            if stop is not None and word >= stop:
                end_token = i
                break

    assert end_token - start_token <= len(
        encoding.ids
    ), "Extracted span is larger than original span"

    return slice(start_token, end_token)

In [5]:
import typing
import logging
from abc import abstractmethod
from functools import partial

import numpy as np
from transformers import (
    AutoModelForCausalLM,
    AutoModelForMaskedLM,
    AutoTokenizer,
    PreTrainedModel,
)

logger = logging.getLogger(name="surprisal")


###############################################################################
### surprisal container classes
###############################################################################


class HuggingFaceSurprisal(SurprisalArray):
    """Per-token surprisals for one encoded text, indexable by char or word span."""

    def __init__(
        self,
        tokens: "Encoding",
        surprisals: np.ndarray,
    ) -> None:
        super().__init__()

        self._surprisals = surprisals.astype(SurprisalQuantity)
        self._tokens: "Encoding" = tokens

    @property
    def tokens(self):
        """Token strings of the underlying encoding."""
        return self._tokens.tokens

    @property
    def surprisals(self):
        """Array of per-token surprisal values."""
        return self._surprisals

    def __iter__(self) -> typing.Tuple[str, float]:
        """Iterate over `(token, surprisal)` pairs."""
        return zip(self.tokens, self.surprisals)

    def __getitem__(self, slctup: typing.Tuple[typing.Union[slice, int], str]):
        """Sum the surprisal of all tokens matching a span of the original text.

        Args:
            slctup: either `(slc, slctype)` or just `slc`. `slc` is a slice
                (or a single int position) into the original string, and
                `slctype` is "char" or "word" saying what the slice counts.
                When only `slc` is given it is treated as a "char" slice.
                A token that the span only partly covers is included whole.

        Returns:
            SurprisalQuantity: the summed surprisal, with the matched tokens
                (space-joined) as its text.

        Raises:
            ValueError: if `slctype` is neither "word" nor "char".
        """
        try:
            span, kind = slctup
            if kind not in ("word", "char"):
                raise ValueError(f"unrecognized slice type {kind}")
        except TypeError:
            # a bare slice / int cannot be unpacked: default to a char span
            span, kind = slctup, "char"

        if type(span) is int:
            span = slice(span, span + 1)

        token_span = pick_matching_token_ixs(self._tokens, span, span_type=kind)
        total = self.surprisals[token_span].sum()
        label = " ".join(self.tokens[token_span])
        return SurprisalQuantity(total, label)

    def __repr__(self) -> str:
        """Two aligned rows: tokens (cut to 10 chars) above their surprisals."""
        token_row = "".join("{: >10}".format(t[:10]) + " " for t in self.tokens)
        value_row = "".join("{: >10.3f}".format(s) + " " for s in self.surprisals)
        return token_row + "\n" + value_row


###############################################################################
### model classes to compute surprisal
###############################################################################
class HuggingFaceModel(Model):
    """
    Base class for language models loaded from the Huggingface Hub by model ID.
    Loads the tokenizer and the model, and keeps track of the device in use.
    """

    def __init__(
        self,
        model_id: str,
        model_class: typing.Callable,
        device: str = "cpu",
    ) -> None:
        super().__init__(model_id)

        self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)
        self.model: PreTrainedModel = model_class.from_pretrained(self.model_id)
        self.model.eval()
        # also sets `self.device`
        self.to(device)

    def to(self, device: str):
        """
        Move the model to `device` and remember it, so that inputs can be
        moved to the same device later.
        """
        self.device = device
        self.model.to(self.device)

    def tokenize(self, textbatch: typing.Union[typing.List, str], max_length=1024):
        """Tokenize one text or a list of texts into padded PyTorch tensors."""
        batch = [textbatch] if type(textbatch) is str else textbatch
        return self.tokenizer(
            batch,
            padding="longest",
            max_length=max_length,
            return_tensors="pt",
            add_special_tokens=True,
        )

    @abstractmethod
    def surprise(
        self, textbatch: typing.Union[typing.List, str]
    ) -> HuggingFaceSurprisal:
        """Compute surprisals for `textbatch`; implemented by subclasses."""
        raise NotImplementedError

    def extract_surprisal(
        self,
        phrases: typing.Union[str, typing.Collection[str]] = None,
        prefix="",
        suffix="",
    ) -> typing.List[float]:
        """
        Returns, for each phrase, the surprisal of that phrase when embedded as
        `prefix + phrase + suffix`, by char-slicing the result of `surprise`.
        Nothing is inserted between the pieces, so any whitespace or delimiter
        must already be part of `prefix` / `suffix`.
        """
        if phrases is None:
            raise ValueError("please provide a phrase to extract the surprisal of")
        if type(phrases) is str:
            phrases = [phrases]

        texts = [str(prefix) + str(phrase) + str(suffix) for phrase in phrases]
        # evaluated lazily while zipping, i.e. after the surprisals are computed
        char_spans = (
            slice(len(prefix), len(prefix + phrase)) for phrase in phrases
        )
        results = self.surprise(texts)
        return [surp[span, "char"] for surp, span in zip(results, char_spans)]


class CausalHuggingFaceModel(HuggingFaceModel):
    """Surprisal from an autoregressive (causal) Huggingface language model.

    The tokenizer's pad token is set to its EOS token so that batches can be
    padded.
    """

    def __init__(self, model_id=None) -> None:
        super().__init__(model_id, model_class=AutoModelForCausalLM)
        self.tokenizer.pad_token = self.tokenizer.eos_token

    def surprise(
        self,
        textbatch: typing.Union[typing.List, str],
        use_bos_token=True,
    ) -> typing.List[HuggingFaceSurprisal]:
        """Compute per-token surprisal (negative log probability) for each text.

        Args:
            textbatch: a single string or a list of strings.
            use_bos_token: if True, a BOS token is prepended to every sequence
                so that the first real token also gets a surprisal. If False,
                the first token's surprisal is NaN.

        Returns:
            One `HuggingFaceSurprisal` per input text, in input order. Because
            the pad token's logit is set to -inf (and pad == EOS here), padding
            positions, and any EOS token in the text, get infinite surprisal.

        Raises:
            ValueError: if `use_bos_token` is True but the tokenizer defines no
                BOS token.
        """
        import torch

        tokenized = self.tokenize(textbatch)
        input_ids = tokenized.input_ids

        if use_bos_token:
            bos_id = self.tokenizer.bos_token_id
            if bos_id is None:
                raise ValueError(
                    "tokenizer has no BOS token; call surprise(..., use_bos_token=False)"
                )
            bos_column = torch.full(
                (input_ids.shape[0], 1),
                bos_id,
                dtype=input_ids.dtype,
                device=input_ids.device,
            )
            ids = torch.concat((bos_column, input_ids), dim=1)
        else:
            ids = input_ids

        with torch.no_grad():
            output = self.model(
                ids.to(self.device),
                return_dict=True,
            )
        tokenized = tokenized.to(self.device)

        # shape: (batch, sequence, vocabulary)
        logits = output["logits"]
        batch_size = logits.shape[0]
        # we don't want the pad token to shift the probability distribution,
        # so we set its weight to -inf
        logits[:, :, self.tokenizer.pad_token_id] = -float("inf")
        logsoftmax = torch.log_softmax(logits, dim=2)

        # for CausalLMs, the distribution at position t predicts token t + 1, so
        # we read each token's log probability off the preceding position. With
        # a BOS token prepended, every original token has a preceding position;
        # without it, the first token has none and is dropped here.
        logprobs = (
            logsoftmax[:, :-1, :]
            .gather(
                2,
                tokenized.input_ids[:, not use_bos_token :].unsqueeze(2),
            )
            .squeeze(2)
        )

        if not use_bos_token:
            # pad on the left with NaN for the first token, which has no context.
            # The filler must live on the same device (and have the same dtype)
            # as `logprobs`; a CPU tensor here broke concatenation on GPU.
            nan_column = torch.full(
                (batch_size, 1),
                float("nan"),
                dtype=logprobs.dtype,
                device=logprobs.device,
            )
            logprobs = torch.concat((nan_column, logprobs), dim=1)

        # one result per item (sentence) in the batch
        return [
            HuggingFaceSurprisal(
                tokens=tokenized[item], surprisals=-logprobs[item, :].cpu().numpy()
            )
            for item in range(logprobs.shape[0])
        ]


class MaskedHuggingFaceModel(HuggingFaceModel):
    """Masked-LM counterpart of the causal model.

    NOTE: `surprise` is unfinished. It only prepares the masked inputs and
    then always raises NotImplementedError.
    """

    def __init__(self, model_id=None) -> None:
        super().__init__(model_id, model_class=AutoModelForMaskedLM)

    def surprise(
        self,
        textbatch: typing.Union[typing.List, str],
        bidirectional=False,
        fixed_length=False,
    ) -> HuggingFaceSurprisal:
        """Not implemented: builds the masked input batch, then raises.

        `bidirectional` and `fixed_length` are currently unused.

        Raises:
            NotImplementedError: always.
        """
        import torch

        tokenized = self.tokenize(textbatch)

        # BERT-like tokenizers already include a bos token in the tokenized sequence with
        # `add_special_tokens=True` (which is what `tokenize` passes)
        ids_with_bos_token = tokenized.input_ids
        b, n = ids_with_bos_token.shape
        # new shape: b * (n - 1), n -- every sequence is repeated n - 1 times in a row
        ids_with_bos_token = ids_with_bos_token.repeat(1, n - 1).view(b * (n - 1), n)
        # rows 1..n-1 of the identity: the j-th copy of a sequence gets position
        # j + 1 selected, so position 0 is never masked
        mask_mask = torch.eye(n, n)[1:, :].repeat(b, 1).bool()
        ids_with_bos_token[mask_mask] = self.tokenizer.mask_token_id

        raise NotImplementedError


class OpenAIModel(HuggingFaceModel):
    """
    A class to support using black-box language models for surprisal
    through the OpenAI API (GPT3 family of models). These models have
    a different method of obtaining surprisals, since the model is not
    locally hosted. GPT3 uses the same tokenizer as GPT2, however,
    so we can directly feed into HuggingFaceSurprisal and benefit from
    the same tools as the Huggingface models to extract surprisal for
    smaller parts of the text.

    Note: the parent `__init__` is deliberately not called (it would load a
    local model), so `self.model` and `self.device` do not exist on instances
    of this class.
    """

    def __init__(
        self, model_id="text-davinci-002", openai_api_key=None, openai_org=None
    ) -> None:
        """
        Args:
            model_id: name of the OpenAI engine to query.
            openai_api_key: API key; falls back to the OPENAI_API_KEY
                environment variable.
            openai_org: organization ID; falls back to the OPENAI_ORG
                environment variable.

        Raises:
            ValueError: if the API key or the organization ID cannot be found.
        """
        import os

        # the parent initializer is skipped, so record the model id here
        self.model_id = model_id

        self.OPENAI_API_KEY = openai_api_key or os.environ.get("OPENAI_API_KEY", None)
        if self.OPENAI_API_KEY is None:
            raise ValueError(
                "Error: no openAI API key provided. Please pass it in "
                "as a kwarg (`openai_api_key=...`) or specify the environment variable OPENAI_API_KEY"
            )
        self.OPENAI_ORG = openai_org or os.environ.get("OPENAI_ORG", None)
        if self.OPENAI_ORG is None:
            raise ValueError(
                "Error: no openAI organization ID provided. Please pass it in "
                "as a kwarg (`openai_org=...`) or specify the environment variable OPENAI_ORG"
            )

        self.tokenizer = AutoTokenizer.from_pretrained("gpt2")
        self.tokenizer.pad_token = self.tokenizer.eos_token
        # max_tokens=0 with echo=True and logprobs: nothing is generated, the
        # API only scores the prompt itself
        self.request_kws = dict(
            engine=model_id,
            prompt=None,
            temperature=0.5,
            max_tokens=0,
            top_p=1.0,
            frequency_penalty=0.0,
            presence_penalty=0.0,
            logprobs=1,
            echo=True,
        )

    def surprise(
        self,
        textbatch: typing.Union[typing.List, str],
        use_bos_token=True,
    ) -> typing.List[HuggingFaceSurprisal]:
        """Score each text with the OpenAI API and return per-token surprisals.

        Args:
            textbatch: a single string or a list of strings.
            use_bos_token: if True, the tokenizer's BOS token is prepended to
                every prompt so the first real token gets a log probability;
                the BOS position is then dropped from the result.

        Returns:
            One `HuggingFaceSurprisal` per input text, in input order, built on
            the local GPT2 tokenization of the text.

        Raises:
            AssertionError: if the API's tokenization has a different length
                than the local GPT2 tokenization.
        """
        import openai

        openai.organization = self.OPENAI_ORG
        openai.api_key = self.OPENAI_API_KEY

        if type(textbatch) is str:
            textbatch: typing.List[str] = [textbatch]

        # tokenize BEFORE prepending BOS, so `tokenized` never contains it
        tokenized = self.tokenizer(textbatch)
        if use_bos_token:
            # if using BOS token, prepend each line with the BOS token
            textbatch = list(map(lambda s: self.tokenizer.bos_token + s, textbatch))

        self.request_kws["prompt"] = textbatch

        response = openai.Completion.create(**self.request_kws)
        # Each choice carries the `index` of the prompt it belongs to; order by
        # it instead of trusting the list order, so results cannot get paired
        # with the wrong input text.
        batched = sorted(response["choices"], key=lambda choice: choice["index"])

        # one result per item (sentence) in the batch
        accumulator = []
        for item, choice in enumerate(batched):
            # missing log probabilities (None) become NaN
            logprobs = np.array(choice["logprobs"]["token_logprobs"], dtype=float)
            tokens = choice["logprobs"]["tokens"]

            assert (
                len(tokens) == len(tokenized[item]) + use_bos_token
            ), f"Length mismatch in tokenization by GPT2 tokenizer `{tokenized[item]}` and tokens returned by OpenAI GPT-3 API `{tokens}`"

            accumulator += [
                HuggingFaceSurprisal(
                    # we have already excluded it from the tokenized object earlier
                    tokens=tokenized[item],
                    # if using BOS token, exclude it
                    surprisals=-logprobs[use_bos_token:],
                )
            ]
        return accumulator


class AutoTransformerModel(Model):
    """
    Factory that picks and builds the right surprisal model (Huggingface causal,
    Huggingface masked, or OpenAI) from a model ID and an optional class hint.
    """

    def __init__(self) -> None:
        """
        Intentionally empty: instances are obtained through the
        `from_pretrained` classmethod, not by calling the constructor.
        """

    @classmethod
    def from_pretrained(
        cls, model_id, model_class: str = None, **kwargs
    ) -> typing.Union[HuggingFaceModel, OpenAIModel]:
        """
        Build a model by looking for "gpt3", then "gpt", then "bert"
        (case-insensitively) in `model_class` and `model_id`.

        `kwargs` are only forwarded to `OpenAIModel`, where they let the user
        pass the OpenAI API key and organization.

        Raises:
            ValueError: if none of the known markers is found.
        """
        model_class = model_class or ""
        descriptor = model_class.lower() + " " + model_id.lower()

        if "gpt3" in descriptor:
            return OpenAIModel(model_id, **kwargs)

        if "gpt" in descriptor:
            causal = CausalHuggingFaceModel(model_id)
            # GPT-like tokenizers ship without a pad token, since padding is
            # generally inconsequential for autoregressive models
            causal.tokenizer.pad_token = causal.tokenizer.eos_token
            return causal

        if "bert" in descriptor:
            return MaskedHuggingFaceModel(model_id)

        raise ValueError(
            f"unable to determine appropriate model class based for model_id="
            f'"{model_id}" and model_class="{model_class}". '
            f'Please explicitly pass either "gpt" or "bert" as model_class.'
        )


# alias under the older name
AutoHuggingFaceModel = AutoTransformerModel

## BRINGING IN DATASET + RUNNING EACH SENTENCE THROUGH GPT-2 / GPT-3

In [6]:
# Stimulus sentences; the trailing comments are the annotator's notes.
phrases = [
    "THE FOOD DRIES FAST.",           # [ambiguous, slightly less pop]
    "PAINT THE PICTURE.",             # [slightly pop, ambiguous]
    "YOUR FOAM HELPS YOU FLOAT.",     # [less pop]
    "HERDING EVERYONE INSIDE.",       # [less pop]
    "BUYING SHARES.",                 # [more pop]
    "INFLATION ISNT HELPING WAGES.",  # [slightly pop]
    "SHE WAS SPYING ON US.",          # [less pop]
    "I NEED DENTAL CARE.",            # [slightly pop]
    "I NEED SOME SPRAY.",
]
# Additional sentences; not used in the cells shown below.
extra_phrases = [
    "THE CORPSE IS DEAD",          # [more pop]
    "THE CHARTER PASSED",          # [less pop]
    "THE SHIRT HAD DRIP",          # way less pop
    "RELAX AND BREATHE",           # [way more pop] [pop]
    "I RUSHED INTO THE BUILDING",  # ambiguous
    "THE PAIN IS TOO MUCH",        # [more pop]
    "THE LIONS ARE ROARING",       # [less pop]
    "THE RACKET WAS LOUD",         # way more pop
    "SCOPE THE SITUATION",         # less pop
]

# real_words[i] is the target word in phrases[i]; alt_words[i] is the
# look-alike word that is swapped in for it.
real_words = ["DRIES", "PAINT", "FOAM", "HERDING", "SHARES", "WAGES", "SPYING", "DENTAL", "SPRAY"]
alt_words = ["DATES", "PRINT", "FORM", "HEADING", "SHAPES", "URGES", "SAYING", "DENIAL", "SAPPY"]


def _swap_word(sentence, old, new):
    """Return `sentence` with the first occurrence of `old` replaced by `new`."""
    at = sentence.index(old)
    return sentence[:at] + new + sentence[at + len(old):]


# each sentence with its target word replaced by the alternative, lowercased
alt_phrases = [
    _swap_word(sentence, real, alt).lower()
    for sentence, real, alt in zip(phrases, real_words, alt_words)
]

# swapping the real word back in recovers the lowercased originals
real_phrases = [
    _swap_word(sentence, alt.lower(), real).lower()
    for sentence, real, alt in zip(alt_phrases, real_words, alt_words)
]

phrases = [sentence.lower() for sentence in phrases]

print(alt_phrases)
print(real_phrases)

real_words = [word.lower() for word in real_words]
alt_words = [word.lower() for word in alt_words]

['the food dates fast.', 'print the picture.', 'your form helps you float.', 'heading everyone inside.', 'buying shapes.', 'inflation isnt helping urges.', 'she was saying on us.', 'i need denial care.', 'i need some sappy.']
['the food dries fast.', 'paint the picture.', 'your foam helps you float.', 'herding everyone inside.', 'buying shares.', 'inflation isnt helping wages.', 'she was spying on us.', 'i need dental care.', 'i need some spray.']


In [7]:
# Local GPT-2 model, loaded through the factory.
m = AutoHuggingFaceModel.from_pretrained(model_id="gpt2")
# m.to("cuda")  # optional: move the model to the GPU

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/548M [00:00<?, ?B/s]

In [8]:
# SECURITY: never paste credentials into a notebook. OpenAIModel falls back to
# the OPENAI_API_KEY and OPENAI_ORG environment variables when no key/org is
# passed, so set those in the environment before running this cell.
# The API key that used to be hardcoded here has been exposed and should be
# revoked.
k = OpenAIModel(model_id='text-davinci-003')

In [9]:
# Probability (exp of negative surprisal) that the OpenAI model `k` assigns to
# the real target word inside each real sentence.
log_scores = []
for target, sentence in zip(real_words, real_phrases):
    (surp,) = k.surprise(sentence)
    begin = sentence.index(target)
    target_surprisal = surp[begin : begin + len(target), "char"].get_value()
    log_scores.append(np.exp(-1 * target_surprisal))

In [10]:
# Probability (exp of negative surprisal) of the alternative word inside each
# altered sentence.
# The loop index must not be called `k`: that name holds the OpenAI model
# created above, and overwriting it with an int breaks any re-run of the
# real-word scoring cell.
# NOTE(review): this cell scores with the local GPT-2 model `m`, while the
# real words are scored with the OpenAI model `k`; the two sets of
# probabilities are later normalised against each other. Confirm that mixing
# models is intended.
alt_log_scores = []
for idx, phrase in enumerate(alt_phrases):
    [s] = m.surprise(phrase)
    ind = phrase.index(alt_words[idx])
    alt_log_scores.append(
        np.exp(-1 * s[ind : ind + len(alt_words[idx]), "char"].get_value())
    )

In [11]:
# Despite the name, these are probabilities: each entry was stored as
# exp(-surprisal) of the real target word.
print(log_scores)

[3.5617731544744403e-06, 4.610602077421384e-07, 3.4700727458616283e-06, 6.066542492563587e-09, 0.00027617384979039425, 0.001010017117755315, 5.361187815840152e-06, 1.2971213525245518e-05, 8.773252094180475e-06]


In [12]:
# For every word pair, print the two probabilities normalised to sum to 1.
for idx, p_real in enumerate(log_scores):
    p_alt = alt_log_scores[idx]
    denom = p_real + p_alt
    print(real_words[idx], alt_words[idx])
    print(p_real / denom, p_alt / denom)

dries dates
0.6299742122662033 0.3700257877337967
paint print
0.009257526275947512 0.9907424737240524
foam form
0.008582213442985903 0.9914177865570141
herding heading
0.0012793268681115081 0.9987206731318884
shares shapes
0.9819460165955294 0.018053983404470605
wages urges
0.99991013661668 8.986338332003595e-05
spying saying
0.0034818020264988036 0.9965181979735013
dental denial
0.9053156813791048 0.09468431862089516
spray sappy
0.9728655099558501 0.027134490044149934


In [13]:
# Collect, per word pair, the normalised share of the real word.
gpt_probs = []
for idx, p_real in enumerate(log_scores):
    denom = p_real + alt_log_scores[idx]
    print(real_words[idx], alt_words[idx])
    gpt_probs.append(p_real / denom)
print(gpt_probs)

dries dates
paint print
foam form
herding heading
shares shapes
wages urges
spying saying
dental denial
spray sappy
[0.6299742122662033, 0.009257526275947512, 0.008582213442985903, 0.0012793268681115081, 0.9819460165955294, 0.99991013661668, 0.0034818020264988036, 0.9053156813791048, 0.9728655099558501]


In [14]:
# Token-by-token GPT-2 surprisals for the real sentences, one table per sentence.
print(*m.surprise(phrases), sep="\n")

       the      Ġfood       Ġdri         es      Ġfast          . <|endoftex 
     6.521      8.250     10.131      0.316      4.770      2.157        inf 
         p       aint       Ġthe   Ġpicture          . <|endoftex <|endoftex 
     6.844      4.755      4.500      5.033      2.896        inf        inf 
      your      Ġfoam     Ġhelps       Ġyou     Ġfloat          . <|endoftex 
     9.759     12.549      8.548      2.445      6.993      2.759        inf 
       her       ding  Ġeveryone    Ġinside          . <|endoftex <|endoftex 
     9.413      6.322      9.881      6.863      2.824        inf        inf 
        bu       ying    Ġshares          . <|endoftex <|endoftex <|endoftex 
    10.710      1.588      7.432      3.846        inf        inf        inf 
        in    flation        Ġis         nt   Ġhelping     Ġwages          . 
     6.402      7.398      3.166      9.103      6.627      6.836      2.461 
       she       Ġwas    Ġspying        Ġon        Ġus          

In [15]:
# Token-by-token GPT-2 surprisals for the altered sentences, one table per sentence.
print(*m.surprise(alt_phrases), sep="\n")

       the      Ġfood     Ġdates      Ġfast          . <|endoftex <|endoftex 
     6.521      8.250     13.077     11.008      2.191        inf        inf 
     print       Ġthe   Ġpicture          . <|endoftex <|endoftex <|endoftex 
     9.917      5.854      8.542      3.104        inf        inf        inf 
      your      Ġform     Ġhelps       Ġyou     Ġfloat          . <|endoftex 
     9.759      7.822      8.032      2.160     11.223      2.418        inf 
   heading  Ġeveryone    Ġinside          . <|endoftex <|endoftex <|endoftex 
    12.260     10.020      7.335      2.695        inf        inf        inf 
        bu       ying    Ġshapes          . <|endoftex <|endoftex <|endoftex 
    10.710      1.588     12.191      3.443        inf        inf        inf 
        in    flation        Ġis         nt   Ġhelping     Ġurges          . 
     6.402      7.398      3.166      9.103      6.627     16.215      4.203 
       she       Ġwas    Ġsaying        Ġon        Ġus          