In [None]:
# prompt: The standard collection of Ugaritic texts published up to 1995 is CAT = The Cuneifonn Alphabetic Texts from Ugarit, Ras Ibn Hani and Other Places, by M. Dietrich, 0. Loretz, and J. Sanmartin (1995). In CAT, each text is given a unique number, consisting of a single digit followed by a period followed by one to three more digits, as in CAT 1.14, 3.2, 4.143, in which the first digit indicates the genre of the text, as follows: 1. literary and religious texts 2. letters 3. legal texts 4. economic or administrative texts 5. scribal exercises 6. inscriptions on seals, labels, ivories, etc. 7. unclassified texts 8. illegible tablets and uninscribed fragments 9. unpublished texts CATis the second edition of a workthat was first published in German, by the same authors, as Die keilalphabetischen Texte aus Ugarit einschliesslich der keilalphabetischen Texte ausserhalb Ugarits, 1: Transkription (1976). This was abbreviated KTU, and some scholars continue to refer to the newer edition, here abbreviated CAT, as KTU or KTU2, or as CTU. In text citations, line numbers follow the text number and a colon, as in CAT 4.143:4. For multi-column texts, the column number must also be given, usually in Roman numerals, as in CAT 1.14.ii:30. In some scholarly works, Ugaritic texts are cited by their excavation numbers rather than by their CATdesignation. The excavation numbers of the Mission de Ras Shamra are preceded by the siglum "RS"; for example, CAT3.2 = RS 15.111, in which "15." in "RS 15.111" denotes the fifteenth season of excavations ("111" is the artifact number for that season); more recently, the excavation years themselves, or the last two digits of the years, have been used as a prefix in RS numbers, as in "RS 1994.24 01" or "RS 94.2401." Texts excavated at Ras Ibn Hani have the prefix "RIH" followed by the excavation year and artifact number, as in RIH 77/25 = CAT 2.79.
# Implement a parser function which would take the reference as a string and convert it to a dictionary.

import re

def parse_ugaritic_reference(reference):
    """
    Parses a Ugaritic text reference string and returns a dictionary with the parsed information.

    Args:
        reference: The Ugaritic text reference string (e.g., "CAT 1.14:4", "RS 15.111:30", "RIH 77/25").

    Returns:
        A dictionary containing the parsed information, or None if the reference is invalid.
    """

    cat_pattern = r"CAT (\d)\.(\d{1,3})(?:\.(\d{1,3}))?(?:\.(\d{1,3}))?(?:[:.](\d+)(?:\.(\d+))?(?:[iI][vVx]+)?(:\d+)?)?"
    rs_pattern = r"RS (\d{4}|\d{2})\.(\d+)(?::(\d+))?"
    rih_pattern = r"RIH (\d+)\/(\d+)"

    match = re.match(cat_pattern, reference)
    if match:
        result = {
            "source": "CAT",
            "genre": int(match.group(1)),
            "text_number": int(match.group(2))
        }
        if match.group(3):
            result["text_number_part2"] = int(match.group(3))
        if match.group(4):
            result["text_number_part3"] = int(match.group(4))
        if match.group(5):
            result["line_number"] = int(match.group(5))
        if match.group(6):
            result["column_number"] = int(match.group(6))
        if match.group(7):
            result["line_number_2"] = int(match.group(7)[1:]) # Remove the ':'
        return result

    match = re.match(rs_pattern, reference)
    if match:
        result = {
            "source": "RS",
            "excavation_year": match.group(1),
            "artifact_number": match.group(2)
        }
        if match.group(3):
            result["line_number"] = int(match.group(3))
        return result

    match = re.match(rih_pattern, reference)
    if match:
        return {
            "source": "RIH",
            "excavation_year": match.group(1),
            "artifact_number": match.group(2)
        }
    return None  # Invalid reference format


reference = "CAT 1.14.ii:30"
parsed_data = parse_ugaritic_reference(reference)
if parsed_data:
    print(parsed_data)


{'source': 'CAT', 'genre': 1, 'text_number': 14}


In [None]:
# prompt: Add dictionaries and functions that would decode the particular components of the resulting dictionary,  like the list of CAT genres, excavation year, etc.
# 1. literary and religious texts 2. letters 3. legal texts 4. economic or administrative texts 5. scribal exercises 6. inscriptions on seals, labels, ivories, etc. 7. unclassified texts 8. illegible tablets and uninscribed fragments 9. unpublished texts

import re


def decode_cat_genre(genre_code):
    genre_mapping = {
        1: "literary and religious texts",
        2: "letters",
        3: "legal texts",
        4: "economic or administrative texts",
        5: "scribal exercises",
        6: "inscriptions on seals, labels, ivories, etc.",
        7: "unclassified texts",
        8: "illegible tablets and uninscribed fragments",
        9: "unpublished texts"
    }
    return genre_mapping.get(genre_code, "Unknown Genre")


def convert_short_year(year_str):
  year_int = int(year_str)
  if year_int < 29:
    return 2000 + year_int
  else:
    return 1900 + year_int


def parse_ugaritic_reference(reference):
    """
    Parses a Ugaritic text reference string and returns a dictionary with the parsed information.

    Args:
        reference: The Ugaritic text reference string (e.g., "CAT 1.14:4", "RS 15.111:30", "RIH 77/25").

    Returns:
        A dictionary containing the parsed information, or None if the reference is invalid.
    """

    cat_pattern = r"CAT (\d)\.(\d{1,3})(?:\.(\d{1,3}))?(?:\.(\d{1,3}))?(?:[:.](\d+)(?:\.(\d+))?(?:[iI][vVx]+)?(:\d+)?)?"
    rs_pattern = r"RS (\d{4}|\d{2})\.(\d+)(?::(\d+))?"
    rih_pattern = r"RIH (\d+)\/(\d+)"

    match = re.match(cat_pattern, reference)
    if match:
        result = {
            "source": "CAT",
            "genre_id": int(match.group(1)),
            "genre_desc": decode_cat_genre(int(match.group(1))),
            "text_number": int(match.group(2))
        }
        if match.group(3):
            result["text_number_part2"] = int(match.group(3))
        if match.group(4):
            result["text_number_part3"] = int(match.group(4))
        if match.group(5):
            result["line_number"] = int(match.group(5))
        if match.group(6):
            result["column_number"] = int(match.group(6))
        if match.group(7):
            result["line_number_2"] = int(match.group(7)[1:]) # Remove the ':'
        return result

    match = re.match(rs_pattern, reference)
    if match:
        result = {
            "source": "RS",
            "excavation_year": convert_short_year(match.group(1)),
            "artifact_number": match.group(2)
        }
        if match.group(3):
            result["line_number"] = int(match.group(3))
        return result

    match = re.match(rih_pattern, reference)
    if match:
        return {
            "source": "RIH",
            "excavation_year": convert_short_year(match.group(1)),
            "artifact_number": match.group(2)
        }
    return None  # Invalid reference format


# Example usage
reference = "CAT 1.14:4"
parsed_data = parse_ugaritic_reference(reference)

if parsed_data:
    print(parsed_data)

reference = "RS 15.111:30"
parsed_data = parse_ugaritic_reference(reference)
if parsed_data:
    print(parsed_data)

reference = "RIH 77/25"
parsed_data = parse_ugaritic_reference(reference)
if parsed_data:
    print(parsed_data)

reference = "CAT 1.14.ii:30"
parsed_data = parse_ugaritic_reference(reference)
if parsed_data:
    print(parsed_data)

{'source': 'CAT', 'genre_id': 1, 'genre_desc': 'literary and religious texts', 'text_number': 14, 'line_number': 4}
{'source': 'RS', 'excavation_year': 2015, 'artifact_number': '111', 'line_number': 30}
{'source': 'RIH', 'excavation_year': 1977, 'artifact_number': '25'}
{'source': 'CAT', 'genre_id': 1, 'genre_desc': 'literary and religious texts', 'text_number': 14}


In [None]:
# prompt: For multi-column texts, the column number must also be given, usually in Roman numerals, as in CAT 1.14.ii:30.

import re


def parse_ugaritic_reference(reference):
    """
    Parses a Ugaritic text reference string and returns a dictionary with the parsed information.

    Args:
        reference: The Ugaritic text reference string (e.g., "CAT 1.14:4", "RS 15.111:30", "RIH 77/25").

    Returns:
        A dictionary containing the parsed information, or None if the reference is invalid.
    """

    cat_pattern = r"CAT (\d)\.(\d{1,3})(?:\.(\d{1,3}))?(?:\.(\d{1,3}))?(?:[:.](\d+)(?:\.([ivxlcdm]+))?(?:(:\d+)?)?)?"
    rs_pattern = r"RS (\d{4}|\d{2})\.(\d+)(?::(\d+))?"
    rih_pattern = r"RIH (\d+)\/(\d+)"

    match = re.match(cat_pattern, reference)
    if match:
        result = {
            "source": "CAT",
            "genre_id": int(match.group(1)),
            "genre_desc": decode_cat_genre(int(match.group(1))),
            "text_number": int(match.group(2))
        }
        if match.group(3):
            result["text_number_part2"] = int(match.group(3))
        if match.group(4):
            result["text_number_part3"] = int(match.group(4))
        if match.group(5):
            result["line_number"] = int(match.group(5))
        if match.group(6):
            result["column_number"] = match.group(6) # Store as Roman numeral string
        if match.group(7):
            result["line_number_2"] = int(match.group(7)[1:]) # Remove the ':'
        return result

    match = re.match(rs_pattern, reference)
    if match:
        result = {
            "source": "RS",
            "excavation_year": convert_short_year(match.group(1)),
            "artifact_number": match.group(2)
        }
        if match.group(3):
            result["line_number"] = int(match.group(3))
        return result

    match = re.match(rih_pattern, reference)
    if match:
        return {
            "source": "RIH",
            "excavation_year": convert_short_year(match.group(1)),
            "artifact_number": match.group(2)
        }
    return None  # Invalid reference format


reference = "CAT 1.14.ii:30"
parsed_data = parse_ugaritic_reference(reference)
if parsed_data:
    print(parsed_data)

{'source': 'CAT', 'genre_id': 1, 'genre_desc': 'literary and religious texts', 'text_number': 14}


In [None]:
# prompt: your regex for Roman numerals doesn't work.

import re

def parse_ugaritic_reference(reference):
    """
    Parses a Ugaritic text reference string and returns a dictionary with the parsed information.

    Args:
        reference: The Ugaritic text reference string (e.g., "CAT 1.14:4", "RS 15.111:30", "RIH 77/25").

    Returns:
        A dictionary containing the parsed information, or None if the reference is invalid.
    """

    cat_pattern = r"CAT (\d)\.(\d{1,3})(?:\.(\d{1,3}))?(?:\.(\d{1,3}))?(?:[:.](\d+)(?:\.([ivxlcdm]+))?(?:(:\d+)?)?)?"
    rs_pattern = r"RS (\d{4}|\d{2})\.(\d+)(?::(\d+))?"
    rih_pattern = r"RIH (\d+)\/(\d+)"

    match = re.match(cat_pattern, reference, re.IGNORECASE) # Added re.IGNORECASE
    if match:
        result = {
            "source": "CAT",
            "genre_id": int(match.group(1)),
            "genre_desc": decode_cat_genre(int(match.group(1))),
            "text_number": int(match.group(2))
        }
        if match.group(3):
            result["text_number_part2"] = int(match.group(3))
        if match.group(4):
            result["text_number_part3"] = int(match.group(4))
        if match.group(5):
            result["line_number"] = int(match.group(5))
        if match.group(6):
            result["column_number"] = match.group(6) # Store as Roman numeral string
        if match.group(7):
            result["line_number_2"] = int(match.group(7)[1:]) # Remove the ':'
        return result

    match = re.match(rs_pattern, reference)
    if match:
        result = {
            "source": "RS",
            "excavation_year": convert_short_year(match.group(1)),
            "artifact_number": match.group(2)
        }
        if match.group(3):
            result["line_number"] = int(match.group(3))
        return result

    match = re.match(rih_pattern, reference)
    if match:
        return {
            "source": "RIH",
            "excavation_year": convert_short_year(match.group(1)),
            "artifact_number": match.group(2)
        }
    return None  # Invalid reference format

reference = "CAT 1.14.ii:30"
parsed_data = parse_ugaritic_reference(reference)
if parsed_data:
    print(parsed_data)

{'source': 'CAT', 'genre_id': 1, 'genre_desc': 'literary and religious texts', 'text_number': 14}


In [None]:
import re
from typing import Optional, Dict

# Define CAT genres
CAT_GENRES = {
    1: "Literary and religious texts",
    2: "Letters",
    3: "Legal texts",
    4: "Economic or administrative texts",
    5: "Scribal exercises",
    6: "Inscriptions on seals, labels, ivories, etc.",
    7: "Unclassified texts",
    8: "Illegible tablets and uninscribed fragments",
    9: "Unpublished texts",
}

def roman_to_int(roman: str) -> int:
    roman_numerals = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
    prev_value = 0
    total = 0

    for char in reversed(roman):
        value = roman_numerals[char]
        if value < prev_value:
            total -= value
        else:
            total += value
        prev_value = value

    return total

def parse_ugaritic_reference(reference: str) -> Dict[str, Optional[str]]:
    pattern = r"CAT\s(?P<cat_genre>\d+)\.(?P<cat_text>\d+)(?:\.(?P<column>[ivxlc]+))?:?(?P<line>\d+)?"
    match = re.match(pattern, reference, re.IGNORECASE)

    if not match:
        raise ValueError(f"Invalid reference format: {reference}")

    cat_genre = int(match.group("cat_genre"))
    cat_text = int(match.group("cat_text"))
    column_roman = match.group("column")
    column_int = roman_to_int(column_roman.upper()) if column_roman else None
    line = int(match.group("line")) if match.group("line") else None

    return {
        "reference": reference,
        "cat_genre": cat_genre,
        "cat_genre_name": CAT_GENRES.get(cat_genre, "Unknown genre"),
        "cat_text": cat_text,
        "column_roman": column_roman,
        "column_int": column_int,
        "line": line,
    }

# Tests
def test_parse_ugaritic_reference():
    references = [
        "CAT 1.14.ii:30",
        "CAT 3.2",
        "CAT 4.143:4",
        "CAT 2.79",
    ]

    expected_results = [
        {
            "reference": "CAT 1.14.ii:30",
            "cat_genre": 1,
            "cat_genre_name": "Literary and religious texts",
            "cat_text": 14,
            "column_roman": "ii",
            "column_int": 2,
            "line": 30,
        },
        {
            "reference": "CAT 3.2",
            "cat_genre": 3,
            "cat_genre_name": "Legal texts",
            "cat_text": 2,
            "column_roman": None,
            "column_int": None,
            "line": None,
        },
        {
            "reference": "CAT 4.143:4",
            "cat_genre": 4,
            "cat_genre_name": "Economic or administrative texts",
            "cat_text": 143,
            "column_roman": None,
            "column_int": None,
            "line": 4,
        },
        {
            "reference": "CAT 2.79",
            "cat_genre": 2,
            "cat_genre_name": "Letters",
            "cat_text": 79,
            "column_roman": None,
            "column_int": None,
            "line": None,
        },
    ]

    for ref, expected in zip(references, expected_results):
        result = parse_ugaritic_reference(ref)
        print(result)
        assert result == expected, f"Failed for {ref}: {result}"

# Run tests
test_parse_ugaritic_reference()


{'reference': 'CAT 1.14.ii:30', 'cat_genre': 1, 'cat_genre_name': 'Literary and religious texts', 'cat_text': 14, 'column_roman': 'ii', 'column_int': 2, 'line': 30}
{'reference': 'CAT 3.2', 'cat_genre': 3, 'cat_genre_name': 'Legal texts', 'cat_text': 2, 'column_roman': None, 'column_int': None, 'line': None}
{'reference': 'CAT 4.143:4', 'cat_genre': 4, 'cat_genre_name': 'Economic or administrative texts', 'cat_text': 143, 'column_roman': None, 'column_int': None, 'line': 4}
{'reference': 'CAT 2.79', 'cat_genre': 2, 'cat_genre_name': 'Letters', 'cat_text': 79, 'column_roman': None, 'column_int': None, 'line': None}


In [None]:
import re
from typing import Dict, Optional

# Define CAT genres
CAT_GENRES = {
    1: "Literary and religious texts",
    2: "Letters",
    3: "Legal texts",
    4: "Economic or administrative texts",
    5: "Scribal exercises",
    6: "Inscriptions on seals, labels, ivories, etc.",
    7: "Unclassified texts",
    8: "Illegible tablets and uninscribed fragments",
    9: "Unpublished texts",
}

# Define full reference descriptions
REFERENCE_DESCRIPTIONS = {
    "CAT": "The Cuneifonn Alphabetic Texts from Ugarit, Ras Ibn Hani and Other Places, by M. Dietrich, 0. Loretz, and J. Sanmartin (1995)",
    "KTU": "Die keilalphabetischen Texte aus Ugarit einschliesslich der keilalphabetischen Texte ausserhalb Ugarits, 1: Transkription (1976)",
    "CTU": "The Cuneifonn Alphabetic Texts from Ugarit (alternate abbreviation for CAT)",
    "RS": "Mission de Ras Shamra excavation numbers",
    "RIH": "Ras Ibn Hani excavation numbers",
    "CTA": "Corpus des tablettes en cun.eifonnes alphabeti.ques decouvertes a Ras Shamra-Ugarit de 1929 a 1939",
    "PRU": "Le Palais royal d'Ugarit",
    "UG": "Ugaritica volumes",
}

def roman_to_int(roman: str) -> int:
    roman_numerals = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
    prev_value = 0
    total = 0

    for char in reversed(roman.upper()):  # Convert to uppercase to handle case-insensitivity
        value = roman_numerals[char]
        if value < prev_value:
            total -= value
        else:
            total += value
        prev_value = value

    return total

def parse_ugaritic_reference(reference: str) -> Dict[str, Optional[str]]:
    result = {"reference": reference}

    if reference.startswith("CAT") or reference.startswith("KTU"):
        pattern = r"(?P<source>[A-Z]+)\s(?P<cat_genre>\d+)\.(?P<cat_text_start>\d+)(?:-(?P<cat_text_end>\d+))?(?:\.(?P<column_start>[ivxlc]+)(?:-(?P<column_end>[ivxlc]+))?)?:?(?P<line_start>\d+)?(?:-(?P<line_end>\d+|[ivxlc]+:\d+))?"
        match = re.match(pattern, reference)
        if match:
            result.update({
                "source": match.group("source"),
                "source_description": REFERENCE_DESCRIPTIONS[match.group("source")],
                "cat_genre": int(match.group("cat_genre")),
                "cat_genre_name": CAT_GENRES[int(match.group("cat_genre"))],
                "cat_text_start": int(match.group("cat_text_start")),
                "cat_text_end": int(match.group("cat_text_end")) if match.group("cat_text_end") else None,
                "column_start_roman": match.group("column_start"),
                "column_start_int": roman_to_int(match.group("column_start")) if match.group("column_start") else None,
                "column_end_roman": match.group("column_end"),
                "column_end_int": roman_to_int(match.group("column_end")) if match.group("column_end") else None,
                "line_start": int(match.group("line_start")) if match.group("line_start") else None,
                "line_end": match.group("line_end"),
            })
    elif reference.startswith("RS") or reference.startswith("RIH"):
        pattern = r"(?P<source>[A-Z]+)\s(?P<excavation_year>\d{2,4})[./](?P<artifact_number>\d+)"
        match = re.match(pattern, reference)
        if match:
            result.update({
                "source": match.group("source"),
                "source_description": REFERENCE_DESCRIPTIONS[match.group("source")],
                "excavation_year": int(match.group("excavation_year")),
                "artifact_number": int(match.group("artifact_number")),
            })
    elif reference.startswith("PRU") or reference.startswith("UG"):
        pattern = r"(?P<source>[A-Z]+)\s(?P<volume_number>\d+)"
        match = re.match(pattern, reference)
        if match:
            result.update({
                "source": match.group("source"),
                "source_description": REFERENCE_DESCRIPTIONS[match.group("source")],
                "volume_number": int(match.group("volume_number")),
            })
    elif reference.startswith("CTA"):
        pattern = r"(?P<source>[A-Z]+)\s(?P<text_number>\d+)"
        match = re.match(pattern, reference)
        if match:
            result.update({
                "source": match.group("source"),
                "source_description": REFERENCE_DESCRIPTIONS[match.group("source")],
                "text_number": int(match.group("text_number")),
            })

    # Exclude None values
    return {key: value for key, value in result.items() if value is not None}

# Tests
def test_parse_ugaritic_reference():
    references = [
        "CAT 1.14.ii:30",
        "CAT 3.2",
        "KTU 4.143:4",
        "RS 1994.24",
        "RIH 77/25",
        "CTA 1",
        "PRU 2",
        "UG 5",
        "CAT 1.1-25",
        "CAT 1.14.ii:30 - 45",
        "CAT 1.14.ii:30 - iv:2",
    ]

    expected_results = [
        {
            "reference": "CAT 1.14.ii:30",
            "source": "CAT",
            "source_description": "The Cuneifonn Alphabetic Texts from Ugarit, Ras Ibn Hani and Other Places, by M. Dietrich, 0. Loretz, and J. Sanmartin (1995)",
            "cat_genre": 1,
            "cat_genre_name": "Literary and religious texts",
            "cat_text_start": 14,
            "column_start_roman": "ii",
            "column_start_int": 2,
            "line_start": 30,
        },
        {
            "reference": "CAT 3.2",
            "source": "CAT",
            "source_description": "The Cuneifonn Alphabetic Texts from Ugarit, Ras Ibn Hani and Other Places, by M. Dietrich, 0. Loretz, and J. Sanmartin (1995)",
            "cat_genre": 3,
            "cat_genre_name": "Legal texts",
            "cat_text_start": 2,
        },
        {
            "reference": "KTU 4.143:4",
            "source": "KTU",
            "source_description": "Die keilalphabetischen Texte aus Ugarit einschliesslich der keilalphabetischen Texte ausserhalb Ugarits, 1: Transkription (1976)",
            "cat_genre": 4,
            "cat_genre_name": "Economic or administrative texts",
            "cat_text_start": 143,
            "line_start": 4,
        },
        {
            "reference": "RS 1994.24",
            "source": "RS",
            "source_description": "Mission de Ras Shamra excavation numbers",
            "excavation_year": 1994,
            "artifact_number": 24,
        },
        {
            "reference": "RIH 77/25",
            "source": "RIH",
            "source_description": "Ras Ibn Hani excavation numbers",
            "excavation_year": 77,
            "artifact_number": 25,
        },
        {
            "reference": "CTA 1",
            "source": "CTA",
            "source_description": "Corpus des tablettes en cun.eifonnes alphabeti.ques decouvertes a Ras Shamra-Ugarit de 1929 a 1939",
            "text_number": 1,
        },
        {
            "reference": "PRU 2",
            "source": "PRU",
            "source_description": "Le Palais royal d'Ugarit",
            "volume_number": 2,
        },
        {
            "reference": "UG 5",
            "source": "UG",
            "source_description": "Ugaritica volumes",
            "volume_number": 5,
        },
        {
            "reference": "CAT 1.1-25",
            "source": "CAT",
            "source_description": "The Cuneifonn Alphabetic Texts from Ugarit, Ras Ibn Hani and Other Places, by M. Dietrich, 0. Loretz, and J. Sanmartin (1995)",
            "cat_genre": 1,
            "cat_genre_name": "Literary and religious texts",
            "cat_text_start": 1,
            "cat_text_end": 25,
        },
        {
            "reference": "CAT 1.14.ii:30 - 45",
            "source": "CAT",
            "source_description": "The Cuneifonn Alphabetic Texts from Ugarit, Ras Ibn Hani and Other Places, by M. Dietrich, 0. Loretz, and J. Sanmartin (1995)",
            "cat_genre": 1,
            "cat_genre_name": "Literary and religious texts",
            "cat_text_start": 14,
            "column_start_roman": "ii",
            "column_start_int": 2,
            "line_start": 30,
            "line_end": "45",
        },
        {
            "reference": "CAT 1.14.ii:30 - iv:2",
            "source": "CAT",
            "source_description": "The Cuneifonn Alphabetic Texts from Ugarit, Ras Ibn Hani and Other Places, by M. Dietrich, 0. Loretz, and J. Sanmartin (1995)",
            "cat_genre": 1,
            "cat_genre_name": "Literary and religious texts",
            "cat_text_start": 14,
            "column_start_roman": "ii",
            "column_start_int": 2,
            "line_start": 30,
            "column_end_roman": "iv",
            "column_end_int": 4,
            "line_end": "2",
        },
    ]

    for ref, expected in zip(references, expected_results):
        result = parse_ugaritic_reference(ref)
        print(result)  # Debugging print statement
        assert result == expected, f"Failed for {ref}: {result}"

# Run tests
test_parse_ugaritic_reference()

{'reference': 'CAT 1.14.ii:30', 'source': 'CAT', 'source_description': 'The Cuneifonn Alphabetic Texts from Ugarit, Ras Ibn Hani and Other Places, by M. Dietrich, 0. Loretz, and J. Sanmartin (1995)', 'cat_genre': 1, 'cat_genre_name': 'Literary and religious texts', 'cat_text_start': 14, 'column_start_roman': 'ii', 'column_start_int': 2, 'line_start': 30}
{'reference': 'CAT 3.2', 'source': 'CAT', 'source_description': 'The Cuneifonn Alphabetic Texts from Ugarit, Ras Ibn Hani and Other Places, by M. Dietrich, 0. Loretz, and J. Sanmartin (1995)', 'cat_genre': 3, 'cat_genre_name': 'Legal texts', 'cat_text_start': 2}
{'reference': 'KTU 4.143:4', 'source': 'KTU', 'source_description': 'Die keilalphabetischen Texte aus Ugarit einschliesslich der keilalphabetischen Texte ausserhalb Ugarits, 1: Transkription (1976)', 'cat_genre': 4, 'cat_genre_name': 'Economic or administrative texts', 'cat_text_start': 143, 'line_start': 4}
{'reference': 'RS 1994.24', 'source': 'RS', 'source_description': 'Mis

AssertionError: Failed for CAT 1.14.ii:30 - 45: {'reference': 'CAT 1.14.ii:30 - 45', 'source': 'CAT', 'source_description': 'The Cuneifonn Alphabetic Texts from Ugarit, Ras Ibn Hani and Other Places, by M. Dietrich, 0. Loretz, and J. Sanmartin (1995)', 'cat_genre': 1, 'cat_genre_name': 'Literary and religious texts', 'cat_text_start': 14, 'column_start_roman': 'ii', 'column_start_int': 2, 'line_start': 30}

In [None]:
import re
from typing import Dict, Optional, List

# Define CAT genres
CAT_GENRES = {
    1: "Literary and religious texts",
    2: "Letters",
    3: "Legal texts",
    4: "Economic or administrative texts",
    5: "Scribal exercises",
    6: "Inscriptions on seals, labels, ivories, etc.",
    7: "Unclassified texts",
    8: "Illegible tablets and uninscribed fragments",
    9: "Unpublished texts",
}

# Define full reference descriptions
REFERENCE_DESCRIPTIONS = {
    "CAT": "The Cuneifonn Alphabetic Texts from Ugarit, Ras Ibn Hani and Other Places, by M. Dietrich, 0. Loretz, and J. Sanmartin (1995)",
    "KTU": "Die keilalphabetischen Texte aus Ugarit einschliesslich der keilalphabetischen Texte ausserhalb Ugarits, 1: Transkription (1976)",
    "CTU": "The Cuneifonn Alphabetic Texts from Ugarit (alternate abbreviation for CAT)",
    "RS": "Mission de Ras Shamra excavation numbers",
    "RIH": "Ras Ibn Hani excavation numbers",
    "CTA": "Corpus des tablettes en cun.eifonnes alphabeti.ques decouvertes a Ras Shamra-Ugarit de 1929 a 1939",
    "PRU": "Le Palais royal d'Ugarit",
    "UG": "Ugaritica volumes",
}

def roman_to_int(roman: str) -> int:
    roman_numerals = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
    prev_value = 0
    total = 0

    for char in reversed(roman.upper()):  # Convert to uppercase to handle case-insensitivity
        value = roman_numerals[char]
        if value < prev_value:
            total -= value
        else:
            total += value
        prev_value = value

    return total

def convert_short_year(year_str: str) -> int:
    year_int = int(year_str)
    if year_int < 30:
        return 2000 + year_int
    else:
        return 1900 + year_int

def parse_line_ranges(line_range: str) -> List[Dict[str, Optional[str]]]:
    items = line_range.split(",")
    parsed_items = []

    for item in items:
        range_match = re.match(r"(?P<start_column>[ivxlc]+)?:?(?P<start_line>\d+)(?:-(?P<end_column>[ivxlc]+)?:?(?P<end_line>\d+))?", item.strip())
        if range_match:
            parsed_items.append({
                "start_column_roman": range_match.group("start_column"),
                "start_column_int": roman_to_int(range_match.group("start_column")) if range_match.group("start_column") else None,
                "end_column_roman": range_match.group("end_column"),
                "end_column_int": roman_to_int(range_match.group("end_column")) if range_match.group("end_column") else None,
                "start_line": int(range_match.group("start_line")),
                "end_line": int(range_match.group("end_line")) if range_match.group("end_line") else None,
            })

    return parsed_items

def parse_ugaritic_reference(reference: str) -> Dict[str, Optional[str]]:
    result = {"reference": reference}

    if reference.startswith("CAT") or reference.startswith("KTU"):
        pattern = r"(?P<source>[A-Z]+)\s(?P<cat_genre>\d+)\.(?P<cat_text_start>\d+)(?:-(?P<cat_text_end>\d+))?(?:\.(?P<column_start>[ivxlc]+)(?:-(?P<column_end>[ivxlc]+))?)?:?(?P<line_ranges>[\d, \-ivxlc:]+)?"
        match = re.match(pattern, reference)
        if match:
            result.update({
                "source": match.group("source"),
                "source_description": REFERENCE_DESCRIPTIONS[match.group("source")],
                "cat_genre": int(match.group("cat_genre")),
                "cat_genre_name": CAT_GENRES[int(match.group("cat_genre"))],
                "cat_text_start": int(match.group("cat_text_start")),
                "cat_text_end": int(match.group("cat_text_end")) if match.group("cat_text_end") else None,
            })
            if match.group("line_ranges"):
                result["line_ranges"] = parse_line_ranges(match.group("line_ranges"))

    elif reference.startswith("RS") or reference.startswith("RIH"):
        pattern = r"(?P<source>[A-Z]+)\s(?P<excavation_year>\d{2,4})[./](?P<artifact_number>\d+)"
        match = re.match(pattern, reference)
        if match:
            result.update({
                "source": match.group("source"),
                "source_description": REFERENCE_DESCRIPTIONS[match.group("source")],
                "excavation_year": convert_short_year(match.group("excavation_year")),
                "artifact_number": int(match.group("artifact_number")),
            })

    elif reference.startswith("PRU") or reference.startswith("UG"):
        pattern = r"(?P<source>[A-Z]+)\s(?P<volume_number>\d+)"
        match = re.match(pattern, reference)
        if match:
            result.update({
                "source": match.group("source"),
                "source_description": REFERENCE_DESCRIPTIONS[match.group("source")],
                "volume_number": int(match.group("volume_number")),
            })

    elif reference.startswith("CTA"):
        pattern = r"(?P<source>[A-Z]+)\s(?P<text_number>\d+)"
        match = re.match(pattern, reference)
        if match:
            result.update({
                "source": match.group("source"),
                "source_description": REFERENCE_DESCRIPTIONS[match.group("source")],
                "text_number": int(match.group("text_number")),
            })

    # Exclude None values
    return {key: value for key, value in result.items() if value is not None}

# Tests
def test_parse_ugaritic_reference():
    references = [
        "CAT 1.14.ii:30",
        "CAT 3.2",
        "KTU 4.143:4",
        "RS 1994.24",
        "RIH 77/25",
        "CTA 1",
        "PRU 2",
        "UG 5",
        "CAT 1.1-25",
        "CAT 1.14.ii:30 - 45",
        "CAT 1.14.ii:30 - iv:2",
        "CAT 1.14.ii:30 - ii:35, iv:1, v:4,7,8",
    ]

    expected_results = [
        {
            "reference": "CAT 1.14.ii:30",
            "source": "CAT",
            "source_description": "The Cuneifonn Alphabetic Texts from Ugarit, Ras Ibn Hani and Other Places, by M. Dietrich, 0. Loretz, and J. Sanmartin (1995)",
            "cat_genre": 1,
            "cat_genre_name": "Literary and religious texts",
            "cat_text_start": 14,
            "line_ranges": [
                {
                    "start_column_roman": "ii",
                    "start_column_int": 2,
                    "end_column_roman": None,
                    "end_column_int": None,
                    "start_line": 30,
                    "end_line": None,
                }
            ],
        },
        {
            "reference": "CAT 3.2",
            "source": "CAT",
            "source_description": "The Cuneifonn Alphabetic Texts from Ugarit, Ras Ibn Hani and Other Places, by M. Dietrich, 0. Loretz, and J. Sanmartin (1995)",
            "cat_genre": 3,
            "cat_genre_name": "Legal texts",
            "cat_text_start": 2,
        },
        {
            "reference": "KTU 4.143:4",
            "source": "KTU",
            "source_description": "Die keilalphabetischen Texte aus Ugarit einschliesslich der keilalphabetischen Texte ausserhalb Ugarits, 1: Transkription (1976)",
            "cat_genre": 4,
            "cat_genre_name": "Economic or administrative texts",
            "cat_text_start": 143,
            "line_ranges": [
                {
                    "start_column_roman": None,
                    "start_column_int": None,
                    "end_column_roman": None,
                    "end_column_int": None,
                    "start_line": 4,
                    "end_line": None,
                }
            ],
        },
        {
            "reference": "RS 1994.24",
            "source": "RS",
            "source_description": "Mission de Ras Shamra excavation numbers",
            "excavation_year": 1994,
            "artifact_number": 24,
        },
        {
            "reference": "RIH 77/25",
            "source": "RIH",
            "source_description": "Ras Ibn Hani excavation numbers",
            "excavation_year": 1977,
            "artifact_number": 25,
        },
        {
            "reference": "CTA 1",
            "source": "CTA",
            "source_description": "Corpus des tablettes en cun.eifonnes alphabeti.ques decouvertes a Ras Shamra-Ugarit de 1929 a 1939",
            "text_number": 1,
        },
        {
            "reference": "PRU 2",
            "source": "PRU",
            "source_description": "Le Palais royal d'Ugarit",
            "volume_number": 2,
        },
        {
            "reference": "UG 5",
            "source": "UG",
            "source_description": "Ugaritica volumes",
            "volume_number": 5,
        },
        {
            "reference": "CAT 1.1-25",
            "source": "CAT",
            "source_description": "The Cuneifonn Alphabetic Texts from Ugarit, Ras Ibn Hani and Other Places, by M. Dietrich, 0. Loretz, and J. Sanmartin (1995)",
            "cat_genre": 1,
            "cat_genre_name": "Literary and religious texts",
            "cat_text_start": 1,
            "cat_text_end": 25,
        },
        {
            "reference": "CAT 1.14.ii:30 - 45",
            "source": "CAT",
            "source_description": "The Cuneifonn Alphabetic Texts from Ugarit, Ras Ibn Hani and Other Places, by M. Dietrich, 0. Loretz, and J. Sanmartin (1995)",
            "cat_genre": 1,
            "cat_genre_name": "Literary and religious texts",
            "cat_text_start": 14,
            "line_ranges": [
                {
                    "start_column_roman": "ii",
                    "start_column_int": 2,}]}]



In [None]:
parse_ugaritic_reference('CAT 2.79')

{'reference': 'CAT 2.79',
 'source': 'CAT',
 'source_description': 'The Cuneifonn Alphabetic Texts from Ugarit, Ras Ibn Hani and Other Places, by M. Dietrich, 0. Loretz, and J. Sanmartin (1995)',
 'cat_genre': 2,
 'cat_genre_name': 'Letters',
 'cat_text_start': 79}

In [None]:
parse_ugaritic_reference('RIH 77/25')

{'reference': 'RIH 77/25',
 'source': 'RIH',
 'source_description': 'Ras Ibn Hani excavation numbers',
 'excavation_year': 1977,
 'artifact_number': 25}

In [None]:
parse_ugaritic_reference('CAT 1.1-25')

{'reference': 'CAT 1.1-25',
 'source': 'CAT',
 'source_description': 'The Cuneifonn Alphabetic Texts from Ugarit, Ras Ibn Hani and Other Places, by M. Dietrich, 0. Loretz, and J. Sanmartin (1995)',
 'cat_genre': 1,
 'cat_genre_name': 'Literary and religious texts',
 'cat_text_start': 1,
 'cat_text_end': 25}

In [None]:
parse_ugaritic_reference('RS 15.111')

{'reference': 'RS 15.111',
 'source': 'RS',
 'source_description': 'Mission de Ras Shamra excavation numbers',
 'excavation_year': 2015,
 'artifact_number': 111}

In [None]:
parse_ugaritic_reference('CAT 1.14.ii:30 - 45')

{'reference': 'CAT 1.14.ii:30 - 45',
 'source': 'CAT',
 'source_description': 'The Cuneifonn Alphabetic Texts from Ugarit, Ras Ibn Hani and Other Places, by M. Dietrich, 0. Loretz, and J. Sanmartin (1995)',
 'cat_genre': 1,
 'cat_genre_name': 'Literary and religious texts',
 'cat_text_start': 14,
 'line_ranges': [{'start_column_roman': None,
   'start_column_int': None,
   'end_column_roman': None,
   'end_column_int': None,
   'start_line': 30,
   'end_line': None}]}

In [None]:
parse_ugaritic_reference('CAT 1.14.ii:30 - iii:35, iv:1, v:4,7,8')

{'reference': 'CAT 1.14.ii:30 - iii:35, iv:1, v:4,7,8',
 'source': 'CAT',
 'source_description': 'The Cuneifonn Alphabetic Texts from Ugarit, Ras Ibn Hani and Other Places, by M. Dietrich, 0. Loretz, and J. Sanmartin (1995)',
 'cat_genre': 1,
 'cat_genre_name': 'Literary and religious texts',
 'cat_text_start': 14,
 'line_ranges': [{'start_column_roman': None,
   'start_column_int': None,
   'end_column_roman': None,
   'end_column_int': None,
   'start_line': 30,
   'end_line': None},
  {'start_column_roman': 'iv',
   'start_column_int': 4,
   'end_column_roman': None,
   'end_column_int': None,
   'start_line': 1,
   'end_line': None},
  {'start_column_roman': 'v',
   'start_column_int': 5,
   'end_column_roman': None,
   'end_column_int': None,
   'start_line': 4,
   'end_line': None},
  {'start_column_roman': None,
   'start_column_int': None,
   'end_column_roman': None,
   'end_column_int': None,
   'start_line': 7,
   'end_line': None},
  {'start_column_roman': None,
   'start_c