# Temporary tests

In [2]:
import random
from typing import List, Tuple

In [3]:
def generate_fake_text(num_phrases):
    phrases = []
    for i in range(1, num_phrases + 1):
        phrase = f"Phrase {i}."
        phrases.append(phrase)

    fake_text = " ".join(phrases)
    return fake_text

In [4]:
DEMO_TEXT = generate_fake_text(20)
print(f"""{DEMO_TEXT[:50]} \n{DEMO_TEXT.split(". ")[:5]}""")

Phrase 1. Phrase 2. Phrase 3. Phrase 4. Phrase 5.  
['Phrase 1', 'Phrase 2', 'Phrase 3', 'Phrase 4', 'Phrase 5']


In [5]:
# CASE 1: REMOVE INDEPENDENTLY A RANDOM NUMBER OF SEQUENCES (see branch: stb_fragment_ays1).
def fragment(text: str, frag_rate: float) -> str:
    """Fragment a text by removing sentences randomly.
    # Sentences are removed independently from each other.
    # Equivalent to add a `delete` method to the Sentence class with a 
    # ProbabilisticConfig object as parameter (law: `Bernouilli`, rate: float).

    Args:
        frag_rate (float): The probability of sentence deletion.

    Returns:
        str: The text with sentences deleted.
    """
    if not 0 <= frag_rate <= 1:
        raise ValueError("Probability larger than one or is negative.")
    
    sentences = text.split(".")
    nbr_to_delete = int(frag_rate * len(sentences))
    if nbr_to_delete > 0:
        deletion_indices = random.sample(range(len(sentences)),
                                            nbr_to_delete)
        for index in sorted(deletion_indices, reverse=True):
            del sentences[index]
    return ".".join(sentences)

print(fragment(DEMO_TEXT, 0))
print(fragment(DEMO_TEXT, 0.25))
print(fragment(DEMO_TEXT, 1))
# print(fragment(DEMO_TEXT, 2)) # Error Expected

Phrase 1. Phrase 2. Phrase 3. Phrase 4. Phrase 5. Phrase 6. Phrase 7. Phrase 8. Phrase 9. Phrase 10. Phrase 11. Phrase 12. Phrase 13. Phrase 14. Phrase 15. Phrase 16. Phrase 17. Phrase 18. Phrase 19. Phrase 20.
Phrase 1. Phrase 2. Phrase 3. Phrase 4. Phrase 7. Phrase 9. Phrase 10. Phrase 12. Phrase 13. Phrase 14. Phrase 15. Phrase 16. Phrase 17. Phrase 18. Phrase 19.



In [34]:
# CASE2: REMOVING ONE SEQUENCE OF SENTENCES.   
def fragment_sequence(text: str, frag_rate: float, punc: str = ".") -> str:
    """
    Fragment a text by randomly removing ONE sequence of sentences whose
    length is expressed as a percentage of the text length.

    Args:
        text (str): The input text to be fragmented.
        frag_rate (float): The rate of sentence deletion (0 <= frag_rate <= 1).
            Represents the percentage of sentences to be deleted.
        punc (str, optional): The punctuation used to split the text into sentences.
            Default is period/full stop (".").
    
    Returns:
        str: The fragmented text with sentences deleted.
    """
    if not 0 <= frag_rate <= 1:
        raise ValueError("Probability or rate larger than one or is negative.")
    # Split the text into sentences using the specified punctuation.
    sentences = text.split(punc)
    # Calculate the total number of sentences and the number of sentences to delete.
    num_sentences = len(sentences)
    num_sentences_to_delete = round(num_sentences * frag_rate)
    # Choose a random starting point for the sequence of sentences to be deleted.
    start_frag_location = min(random.choice(range(num_sentences)),
                              num_sentences - num_sentences_to_delete)
    # Determine the ending point of the fragment to be deleted.
    end_frag_location = start_frag_location + num_sentences_to_delete
    # Delete the selected sequence of sentences from the list.
    del sentences[start_frag_location:end_frag_location]

    # Join the remaining sentences back into a single text.
    fragmented_text = punc.join(sentences)

    return fragmented_text

for rate in [-1, 0.0, 0.25, 0.5, 0.75, 1.0, 2.0]:
    try:
        res = fragment_sequence(DEMO_TEXT, rate)
        length = len(res.split(".")) - 1 # approx
        print(f"{rate} - {length} sentences - {res}")
    except:
        print(rate, "Error")
        pass
# print(fragment(DEMO_TEXT, 2)) # Error Expected

-1 Error
0.0 - 20 sentences - Phrase 1. Phrase 2. Phrase 3. Phrase 4. Phrase 5. Phrase 6. Phrase 7. Phrase 8. Phrase 9. Phrase 10. Phrase 11. Phrase 12. Phrase 13. Phrase 14. Phrase 15. Phrase 16. Phrase 17. Phrase 18. Phrase 19. Phrase 20.
0.25 - 15 sentences - Phrase 1. Phrase 2. Phrase 3. Phrase 9. Phrase 10. Phrase 11. Phrase 12. Phrase 13. Phrase 14. Phrase 15. Phrase 16. Phrase 17. Phrase 18. Phrase 19. Phrase 20.
0.5 - 10 sentences - Phrase 1. Phrase 2. Phrase 3. Phrase 4. Phrase 5. Phrase 6. Phrase 7. Phrase 8. Phrase 19. Phrase 20.
0.75 - 4 sentences - Phrase 1. Phrase 2. Phrase 3. Phrase 4. Phrase 5
1.0 - 0 sentences - 
2.0 Error


In [59]:
# CASE 3: REMOVING `MANY` (A RANDOM NUMBER OF) SEQUENCES OF SENTENCES.
## Case 3.1: simple case 
### * constant length of fragment locations
### * uniform repartition of fragment locations whithin the document
def fragment_sequences(text: str, frag_rate: float, punc: str = ".") -> str:
    """
    Fragment a text by randomly removing MANY sequences of sentences. The total
    number of sentences to be deleted in the text is the sum of the sequences length and 
    is expressed as a percentage of the text length.

    Args:
        text (str): The input text to be fragmented.
        frag_rate (float): The rate of sentence deletion (0 <= frag_rate <= 1).
            Represents the percentage of sentences to be deleted.
        punc (str, optional): The punctuation used to split the text into sentences.
            Default is period/full stop (".").
    
    Returns:
        str: The fragmented text with sentences deleted.
    """
    if not 0 <= frag_rate <= 1:
        raise ValueError("Probability or rate larger than one or is negative.")
    # Split the text into sentences using the specified punctuation.
    sentences = text.split(punc)
    # Calculate the total number of sentences and the number of sentences to delete.
    num_sentences = len(sentences)
    num_sentences_to_delete = round(num_sentences * frag_rate)
    # Choose a random starting point for the sequence of sentences to be deleted.
    start_frag_location = min(random.choice(range(num_sentences)),
                              num_sentences - num_sentences_to_delete)
    # Determine the ending point of the fragment to be deleted.
    end_frag_location = start_frag_location + num_sentences_to_delete
    # Delete the selected sequence of sentences from the list.
    del sentences[start_frag_location:end_frag_location]

    # Join the remaining sentences back into a single text.
    fragmented_text = punc.join(sentences)

    return fragmented_text


[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[6, 9]


In [21]:
import sys
from stemmabench.stemma_generator import Stemma
from stemmabench.config_parser import StemmaBenchConfig
from loguru import logger
# Set logging level to info
logger.remove()
logger.add(sys.stderr, level="INFO")

2

In [43]:
type(round(0.4) )

int

In [22]:
config = StemmaBenchConfig(**{
    "meta": {
      "language": "eng"  
    },
    "stemma": {
        "depth": 2,
        "width": {
            "law": "Uniform",
            "min": 2,
            "max": 3
        }
    },
    "variants": {
        "sentences": {
            "duplicate": {
                "args": {"nbr_words": 1},
                "law": "Bernouilli",
                "rate": 0.05
            },
            "delete": {
                "law": "Bernouilli",
                "rate": 0.01
            }
        },
        "words": {
            "synonym": {
                "law": "Bernouilli",
                "rate": 0.05,
                "args": {}
            },
            "mispell": {
                "law": "Bernouilli",
                "rate": 0.001,
                "args": {}
            },
            "omit": {
                "law": "Bernouilli",
                "rate": 0.001,
                "args": {}
            }
        }
    }
})

In [60]:
# # Check is the method has been added during the update (need to run `invoke install` again)
# from stemmabench.textual_units import sentence
# phrase = sentence.Sentence("Phrase")
# dir(phrase)

Create a stemma object:

In [24]:
# Create a stemma object.
stemma = Stemma(original_text=DEMO_TEXT, config=config)

# Generate a tradition.
stemma.generate()

Tree({
  "Phrase 1. Phrase 2. Phrase 3. Phrase 4. Phrase 5. Phrase 6. Phrase 7. Phrase 8. Phrase 9. Phrase 10. Phrase 11. Phrase 12. Phrase 13. Phrase 14. Phrase 15. Phrase 16. Phrase 17. Phrase 18. Phrase 19. Phrase 20.": {
    "Phrase 1 phrase 2 phrase 3 articulate 4 phrase 5 phrase 6 phrase 7 phrase 8 phrase 9 phrase 10 phrase 11 phrase 12 phrase 13 phrase 14 phrase 15 phrase 16 phrase 17 phrase 18 phrase 19 phrase 20.": {
      "Phrase 1 phrase 2 phrase 3 articulate 4 phrase 5 phrase 6 phrase 7 phrase 8 phrase 9 phrase 10 phrase 11 phrase 12 phrase 13 phrase 14 phrase 15 phrase 16 phrase 17 phrase 18 phrase 19 phrase 20.": [
        "Phrase 1 phrase 2 phrase 3 articulate 4 phrase 5 phrase 6 phrase 7 phrase 8 phrase 9 phrase 10 phrase 11 phrase 12 phrase 13 phrase 14 phrase 15 phrase 16 phrase 17 phrase 18 phrase 19 phrase 20.",
        "Phrase 1 phrase 2 phrase 3 articulate 4 phrase 5 phrase 6 phrase 7 phrase 8 phrase 9 phrase 10 phrase 11 phrase 12 phrase 13 phrase 14 phrase 15 ph

Each text can be accessed through its lookup table, which can be used to get the tree stemma.

In [25]:
# stemma.texts_lookup

It is also possible to access the edges describing only the manuscript names and their family relation.

In [26]:
stemma.edges

[('0', '0:0'),
 ('0', '0:1'),
 ('0:0', '0:0:0'),
 ('0:0', '0:0:1'),
 ('0:1', '0:1:0'),
 ('0:1', '0:1:1'),
 ('0:0:0', '0:0:0:0'),
 ('0:0:0', '0:0:0:1'),
 ('0:0:1', '0:0:1:0'),
 ('0:0:1', '0:0:1:1'),
 ('0:0:0', '0:0:0:0'),
 ('0:0:0', '0:0:0:1'),
 ('0:0:1', '0:0:1:0'),
 ('0:0:1', '0:0:1:1')]