# Temporary tests

In [16]:
import random
from typing import List, Tuple

In [17]:
def generate_fake_text(num_phrases):
    """Build a placeholder text made of numbered phrases.

    Args:
        num_phrases (int): Number of phrases to generate; phrases are
            numbered from 1 to ``num_phrases`` inclusive.

    Returns:
        str: The phrases "Phrase 1.", "Phrase 2.", ... joined by single
            spaces (an empty string when ``num_phrases`` is 0 or less).
    """
    return " ".join(
        f"Phrase {number}." for number in range(1, num_phrases + 1)
    )

In [18]:
# Build a 20-phrase demo text and preview its first five sentences.
DEMO_TEXT = generate_fake_text(num_phrases=20)
first_sentences = DEMO_TEXT.split(". ")[:5]
print(first_sentences)

['Phrase 1', 'Phrase 2', 'Phrase 3', 'Phrase 4', 'Phrase 5']


In [19]:
def fragment(text: str, deletion_prob: float) -> str:
    """Fragment a text by removing a random subset of its sentences.

    Sentences are delimited by ".". ``int(deletion_prob * n)`` of the ``n``
    sentences are removed, chosen uniformly at random without replacement.
    The blank piece that follows a final "." is not counted as a sentence,
    so it is never picked for deletion and the final period is kept whenever
    at least one sentence survives.

    Args:
        text (str): The text to fragment.
        deletion_prob (float): The fraction of sentences to delete
            (at most 1).

    Returns:
        str: The text with sentences deleted (an empty string when every
            sentence is deleted).

    Raises:
        ValueError: If ``deletion_prob`` is greater than 1.
    """
    if deletion_prob > 1:
        raise ValueError(
            f"deletion_prob must not exceed 1, got {deletion_prob}"
        )

    sentences = text.split(".")

    # A text ending in "." produces a trailing blank piece; set it aside so
    # it is neither counted as a sentence nor eligible for deletion.
    trailing = []
    if len(sentences) > 1 and not sentences[-1].strip():
        trailing = [sentences.pop()]

    num_sentences_to_delete = int(deletion_prob * len(sentences))
    if num_sentences_to_delete > 0:
        deletion_indices = set(
            random.sample(range(len(sentences)), num_sentences_to_delete)
        )
        sentences = [
            sentence
            for index, sentence in enumerate(sentences)
            if index not in deletion_indices
        ]

    if not sentences:
        # Nothing survived: return an empty text rather than a lone blank.
        return ""
    # Re-attach the trailing piece so the final "." is restored.
    return ".".join(sentences + trailing)

# Show the effect of deleting nothing, a quarter, and all of the sentences.
for deletion_prob in (0, 0.25, 1):
    print(fragment(DEMO_TEXT, deletion_prob))
# print(fragment(DEMO_TEXT, 2)) # Error Expected

Phrase 1. Phrase 2. Phrase 3. Phrase 4. Phrase 5. Phrase 6. Phrase 7. Phrase 8. Phrase 9. Phrase 10. Phrase 11. Phrase 12. Phrase 13. Phrase 14. Phrase 15. Phrase 16. Phrase 17. Phrase 18. Phrase 19. Phrase 20.
Phrase 1. Phrase 2. Phrase 3. Phrase 5. Phrase 6. Phrase 7. Phrase 8. Phrase 10. Phrase 11. Phrase 15. Phrase 16. Phrase 17. Phrase 18. Phrase 19. Phrase 20.



In [20]:
# NOTE: this cell redefines `fragment`, which is already defined in an earlier
# cell of this notebook; the later definition is the one in effect afterwards.
def fragment(text: str, deletion_prob: float) -> str:
    """Fragment a text by removing a random subset of its sentences.

    Sentences are delimited by ".". ``int(deletion_prob * n)`` of the ``n``
    sentences are removed, chosen uniformly at random without replacement.
    The blank piece that follows a final "." is not counted as a sentence,
    so it is never picked for deletion and the final period is kept whenever
    at least one sentence survives.

    Args:
        text (str): The text to fragment.
        deletion_prob (float): The fraction of sentences to delete
            (at most 1).

    Returns:
        str: The text with sentences deleted (an empty string when every
            sentence is deleted).

    Raises:
        ValueError: If ``deletion_prob`` is greater than 1.
    """
    if deletion_prob > 1:
        raise ValueError(
            f"deletion_prob must not exceed 1, got {deletion_prob}"
        )

    sentences = text.split(".")

    # A text ending in "." produces a trailing blank piece; set it aside so
    # it is neither counted as a sentence nor eligible for deletion.
    trailing = []
    if len(sentences) > 1 and not sentences[-1].strip():
        trailing = [sentences.pop()]

    num_sentences_to_delete = int(deletion_prob * len(sentences))
    if num_sentences_to_delete > 0:
        deletion_indices = set(
            random.sample(range(len(sentences)), num_sentences_to_delete)
        )
        sentences = [
            sentence
            for index, sentence in enumerate(sentences)
            if index not in deletion_indices
        ]

    if not sentences:
        # Nothing survived: return an empty text rather than a lone blank.
        return ""
    # Re-attach the trailing piece so the final "." is restored.
    return ".".join(sentences + trailing)

# Delete a quarter of the demo sentences and show the result.
fragmented_text = fragment(DEMO_TEXT, 0.25)
print(fragmented_text)

Phrase 1. Phrase 2. Phrase 3. Phrase 4. Phrase 5. Phrase 6. Phrase 8. Phrase 11. Phrase 12. Phrase 13. Phrase 15. Phrase 16. Phrase 17. Phrase 18. Phrase 19.


In [21]:
import sys
from stemmabench.stemma_generator import Stemma
from stemmabench.config_parser import StemmaBenchConfig
from loguru import logger
# Set logging level to info: call `logger.remove()` with no arguments, then
# attach a stderr sink at INFO level.
# `logger.add(...)` is the cell's last expression, so its return value is
# what the cell displays.
logger.remove()
logger.add(sys.stderr, level="INFO")

2

In [22]:
# Sentence-level variants: each entry names its sampling law and rate.
sentence_variants = {
    "duplicate": {"args": {"nbr_words": 1}, "law": "Bernouilli", "rate": 0.05},
    "delete": {"law": "Bernouilli", "rate": 0.01},
}

# Word-level variants, configured the same way.
word_variants = {
    "synonym": {"law": "Bernouilli", "rate": 0.05, "args": {}},
    "mispell": {"law": "Bernouilli", "rate": 0.001, "args": {}},
    "omit": {"law": "Bernouilli", "rate": 0.001, "args": {}},
}

# Assemble the benchmark configuration from its three top-level sections.
config = StemmaBenchConfig(
    meta={"language": "eng"},
    stemma={"depth": 2, "width": {"law": "Uniform", "min": 2, "max": 3}},
    variants={"sentences": sentence_variants, "words": word_variants},
)

In [23]:
# Check whether the method has been added during the update
# (requires running `invoke install` again).
# `dir(phrase)` lists the attributes available on a Sentence instance.
from stemmabench.textual_units import sentence
phrase = sentence.Sentence("Phrase")
dir(phrase)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'clean',
 'delete',
 'duplicate',
 'nbr_words',
 'sentence',
 'words']

Create a stemma object:

In [24]:
# Create a stemma object from the demo text and the configuration built above.
stemma = Stemma(original_text=DEMO_TEXT, config=config)

# Generate a tradition; as the cell's last expression, the returned value is
# what gets displayed below.
stemma.generate()

Tree({
  "Phrase 1. Phrase 2. Phrase 3. Phrase 4. Phrase 5. Phrase 6. Phrase 7. Phrase 8. Phrase 9. Phrase 10. Phrase 11. Phrase 12. Phrase 13. Phrase 14. Phrase 15. Phrase 16. Phrase 17. Phrase 18. Phrase 19. Phrase 20.": {
    "Phrase 1 phrase 2 phrase 3 articulate 4 phrase 5 phrase 6 phrase 7 phrase 8 phrase 9 phrase 10 phrase 11 phrase 12 phrase 13 phrase 14 phrase 15 phrase 16 phrase 17 phrase 18 phrase 19 phrase 20.": {
      "Phrase 1 phrase 2 phrase 3 articulate 4 phrase 5 phrase 6 phrase 7 phrase 8 phrase 9 phrase 10 phrase 11 phrase 12 phrase 13 phrase 14 phrase 15 phrase 16 phrase 17 phrase 18 phrase 19 phrase 20.": [
        "Phrase 1 phrase 2 phrase 3 articulate 4 phrase 5 phrase 6 phrase 7 phrase 8 phrase 9 phrase 10 phrase 11 phrase 12 phrase 13 phrase 14 phrase 15 phrase 16 phrase 17 phrase 18 phrase 19 phrase 20.",
        "Phrase 1 phrase 2 phrase 3 articulate 4 phrase 5 phrase 6 phrase 7 phrase 8 phrase 9 phrase 10 phrase 11 phrase 12 phrase 13 phrase 14 phrase 15 ph

Each text can be accessed through its lookup table, which can be used to get the tree stemma.

In [25]:
# stemma.texts_lookup
# NOTE(review): left commented out, presumably because the full lookup table
# is large to display; confirm, or show a small slice instead.

It is also possible to access the edges, which describe only the manuscript names and their family relations.

In [26]:
# Display the stemma's edges; in the recorded output each edge is a pair of
# manuscript labels (apparently parent, child).
# NOTE(review): the recorded output lists the ('0:0:0', ...) and ('0:0:1', ...)
# edges twice and shows none below '0:1:0' / '0:1:1'; confirm this is expected.
stemma.edges

[('0', '0:0'),
 ('0', '0:1'),
 ('0:0', '0:0:0'),
 ('0:0', '0:0:1'),
 ('0:1', '0:1:0'),
 ('0:1', '0:1:1'),
 ('0:0:0', '0:0:0:0'),
 ('0:0:0', '0:0:0:1'),
 ('0:0:1', '0:0:1:0'),
 ('0:0:1', '0:0:1:1'),
 ('0:0:0', '0:0:0:0'),
 ('0:0:0', '0:0:0:1'),
 ('0:0:1', '0:0:1:0'),
 ('0:0:1', '0:0:1:1')]