In [17]:
def get_upos(feat_str):
    # feat_str example: "ud=PROPN|pos=noun_prop|prc3=0|prc2=0|..."
    for feat in feat_str.split('|'):
        if feat.startswith('ud='):
            return feat[len('ud='):]  # Extract "PROPN"
    return None

def extract_features_from_parse(parse_lines):
    """
    Input:
      parse_lines: List[str], lines of the CoNLL-like parse (tokens only, no comment lines)
    Output:
      features: dict with syntactic features
    """
    tokens = []
    heads = []
    deprels = []
    upos_tags = []

    # Parse lines to extract relevant info
    for line in parse_lines:
        if line.startswith('#'):
            continue  # skip comment lines

        parts = line.strip().split('\t')
        if len(parts) < 8:
            continue  # skip malformed lines
        token_id = int(parts[0])
        word = parts[1]
        upos = get_upos(parts[5])
        head = int(parts[6])
        deprel = parts[7]

        tokens.append(word)
        upos_tags.append(upos)
        heads.append(head)
        deprels.append(deprel)

    n = len(tokens)

    # Dependency distances
    dep_distances = [abs((i+1) - head) if head != 0 else 0 for i, head in enumerate(heads)]
    avg_dep_distance = sum(dep_distances) / n if n > 0 else 0
    max_dep_distance = max(dep_distances) if dep_distances else 0
    long_deps_count = sum(d > 3 for d in dep_distances)

    # POS tag counts and ratios
    from collections import Counter
    pos_counts = Counter(upos_tags)

    noun_count = pos_counts.get('NOUN', 0) + pos_counts.get('PROPN', 0)
    verb_count = pos_counts.get('VERB', 0)
    adj_count = pos_counts.get('ADJ', 0)
    adv_count = pos_counts.get('ADV', 0)
    adp_count = pos_counts.get('ADP', 0)
    pron_count = pos_counts.get('PRON', 0)
    cconj_count = pos_counts.get('CCONJ', 0)
    num_count = pos_counts.get('NUM', 0)
    punct_count = pos_counts.get('PUNCT', 0)
    sconj_count = pos_counts.get('SCONJ', 0)
    part_count = pos_counts.get('PART', 0)
    det_count = pos_counts.get('DET', 0)
    aux_count = pos_counts.get('AUX', 0)
    intj_count = pos_counts.get('INTJ', 0)
    pipe_count = pos_counts.get('|', 0)  # assuming '|' shows up in the pos column

    total_content_words = noun_count + verb_count + adj_count + adv_count
    total_words = n

    verb_to_noun_ratio = (verb_count / noun_count) if noun_count > 0 else 0
    content_word_ratio = (total_content_words / total_words) if total_words > 0 else 0

    # Dependency relation counts
    dep_counts = Counter(deprels)
    sbj_count = dep_counts.get('SBJ', 0)
    obj_count = dep_counts.get('OBJ', 0)
    advcl_count = dep_counts.get('ADVCL', 0)
    acl_count = dep_counts.get('ACL', 0)
    conj_count = dep_counts.get('CONJ', 0)
    cc_count = dep_counts.get('CC', 0)

    # Tree structure
    children = {i+1: [] for i in range(n)}
    root = None
    for i, h in enumerate(heads, 1):
        if h == 0:
            root = i
        else:
            children[h].append(i)

    def tree_depth(node):
        if not children[node]:
            return 1
        else:
            return 1 + max(tree_depth(c) for c in children[node])

    max_depth = tree_depth(root) if root else 0

    dependents_counts = [len(children[h]) for h in children]
    avg_dependents = sum(dependents_counts) / n if n > 0 else 0
    max_dependents = max(dependents_counts) if dependents_counts else 0

    # Left and right dependents
    left_deps = sum(1 for i, h in enumerate(heads, 1) if h != 0 and h > i)
    right_deps = sum(1 for i, h in enumerate(heads, 1) if h != 0 and h < i)
    left_right_dep_ratio = (left_deps / right_deps) if right_deps != 0 else 0

    # Compose features
    features = {
        'avg_dep_distance': avg_dep_distance,
        'max_dep_distance': max_dep_distance,
        'long_deps_count': long_deps_count,
        'noun_count': noun_count,
        'verb_count': verb_count,
        'adj_count': adj_count,
        'adv_count': adv_count,
        'adp_count': adp_count,
        'pron_count': pron_count,
        'cconj_count': cconj_count,
        'num_count': num_count,
        'punct_count': punct_count,
        'sconj_count': sconj_count,
        'part_count': part_count,
        'det_count': det_count,
        'aux_count': aux_count,
        'intj_count': intj_count,
        'pipe_count': pipe_count,
        'verb_to_noun_ratio': verb_to_noun_ratio,
        'content_word_ratio': content_word_ratio,
        'sbj_count': sbj_count,
        'obj_count': obj_count,
        'advcl_count': advcl_count,
        'acl_count': acl_count,
        'conj_count': conj_count,
        'cc_count': cc_count,
        'max_depth': max_depth,
        'avg_dependents': avg_dependents,
        'max_dependents': max_dependents,
        'left_deps': left_deps,
        'right_deps': right_deps,
        'left_right_dep_ratio': left_right_dep_ratio,
        'total_words': total_words
    }

    return features


def extract_pos_tags_from_block(block):
    """
    Input:
      block: List[str], lines of the CoNLL-like parse (tokens only, no comment lines)
    Output:
      pos_tags: List[str] with POS tags
    """
    pos_tags = []
    for line in block:
        if line.startswith('#'):
            continue  # skip comment lines

        parts = line.strip().split('\t')
        parts = line.strip().split('\t')
        if len(parts) < 8:
            continue  # skip malformed lines
        upos = get_upos(parts[5])
        if upos:
            pos_tags.append(upos)
        else:
            pos_tags.append('UNKNOWN')

    return pos_tags

def extract_features_from_ud(ud_string):
    """Parse the 'ud=' string and return a dict of features."""
    if not ud_string.startswith('ud='):
        return {}
    feature_string = ud_string
    features = feature_string.split('|')
    feature_dict = {}
    for feat in features:
        if '=' in feat:
            k, v = feat.split('=', 1)
            feature_dict[k] = v
    return feature_dict

def extract_morph_features_from_block(block, all_features):
    """
    Extract morphological features from one CoNLL block.
    Returns: dict of {feature_name: list of values}
    """
    features_by_name = {feat: [] for feat in all_features}
    
    for line in block:
        if line.startswith('#'):
            continue
        parts = line.strip().split('\t')
        if len(parts) < 8:
            continue
        morph_dict = extract_features_from_ud(parts[5])
        for feat in all_features:
            features_by_name[feat].append(morph_dict.get(feat, "na"))  # "na" for missing
    return features_by_name

In [18]:
parse_lines = [
    "# text = أنا أحب أختي",
    "# treeTokens = أنا أحب أخت +ي",
    "1	أنا	أنا	NOM	_	ud=PRON|pos=pron|prc3=0|prc2=0|prc1=0|prc0=0|enc0=0|asp=na|vox=na|mod=na|gen=m|num=s|stt=i|cas=n|per=1|rat=y|token_type=baseword	2	SBJ	_	_",
    "2	أحب	أحب	VRB	_	ud=VERB|pos=verb|prc3=0|prc2=0|prc1=0|prc0=0|enc0=0|asp=i|vox=a|mod=u|gen=m|num=s|stt=na|cas=na|per=1|rat=n|token_type=baseword	0	---	_	_",
    "3	أخت	أخت	NOM	_	ud=NOUN|pos=noun|prc3=0|prc2=0|prc1=0|prc0=0|enc0=0|asp=na|vox=na|mod=na|gen=f|num=s|stt=c|cas=u|per=na|rat=r|token_type=baseword	2	OBJ	_	_",
    "4	+ي	+ي	NOM	_	ud=PRON|pos=pron|prc3=0|prc2=0|prc1=0|prc0=0|enc0=0|asp=na|vox=na|mod=na|gen=m|num=s|stt=d|cas=g|per=1|rat=na|token_type=enc0	3	IDF	_	_",
    ]

features = extract_features_from_parse(parse_lines)
print(features)

sentence = parse_lines[0].replace("# text = ", "").strip()
print(sentence)  # Output: أنا أحب أختي

{'avg_dep_distance': 0.75, 'max_dep_distance': 1, 'long_deps_count': 0, 'noun_count': 1, 'verb_count': 1, 'adj_count': 0, 'adv_count': 0, 'adp_count': 0, 'pron_count': 2, 'cconj_count': 0, 'num_count': 0, 'punct_count': 0, 'sconj_count': 0, 'part_count': 0, 'det_count': 0, 'aux_count': 0, 'intj_count': 0, 'pipe_count': 0, 'verb_to_noun_ratio': 1.0, 'content_word_ratio': 0.5, 'sbj_count': 1, 'obj_count': 1, 'advcl_count': 0, 'acl_count': 0, 'conj_count': 0, 'cc_count': 0, 'max_depth': 3, 'avg_dependents': 0.75, 'max_dependents': 2, 'left_deps': 1, 'right_deps': 2, 'left_right_dep_ratio': 0.5, 'total_words': 4}
أنا أحب أختي


# Try that on the whole dataset and simple model

In [19]:
import re
from camel_tools.tokenizers.word import simple_word_tokenize
from camel_tools.utils.dediac import dediac_ar
from camel_tools.utils.normalize import normalize_unicode
from camel_tools.utils.charmap import CharMapper

arclean = CharMapper.builtin_mapper("arclean")

def clean_broken_arabic_words(text):
    """ Clean broken Arabic words and remove tatweel characters."""
    
    def clean_line(line, arclean):
        return simple_word_tokenize(arclean(dediac_ar(normalize_unicode(line.strip()))))
    
    # Step 1: Remove broken words
    text = text.replace(' ـ', '')
    text = text.replace('ـ ', '')

    # use camel tools to clean the text, it removes tatweel characters and diacritics
    text = " ".join(clean_line(text, arclean))
    return text

def clean_sentence(sentence):

    # # Arabic characters range: main + extended + some punctuation
    # arabic_pattern = re.compile(r'[\u0600-\u06FF]+')

    # # Extract all Arabic tokens
    # arabic_tokens = arabic_pattern.findall(sentence)

    # sentence = " ".join(arabic_tokens)
    sentence = clean_broken_arabic_words(sentence)

    # If sentence ends with arabic comma '،', remove it because it mess up with dependency parsing
    if sentence.endswith('،'):
        sentence = sentence[:-1].strip()

    # Remove extra spaces
    sentence = re.sub(r'\s+', ' ', sentence).strip()
        
    # Join with space to reconstruct a clean sentence
    return sentence

def clean_example(example):
    example["Sentence"] = clean_sentence(example["Sentence"])
    return example

In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier  # or XGBoost if you want
from sklearn.metrics import classification_report
from datasets import load_dataset
import pyarabic.araby as araby


token = "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"

# Load sentence-level data from the "Dev" split
sent_blind_test = load_dataset("CAMeL-Lab/BAREC-Shared-Task-2025-BlindTest-sent", token=token, split="test")

# Fix labels to be 0-indexed
sent_blind_test = sent_blind_test.map(lambda x: {"Sentence": araby.strip_diacritics(x["Sentence"])})

# we should always clean each sentence in the dataset to be able to join with features
sent_blind_test = sent_blind_test.map(clean_example)

df = pd.DataFrame(sent_blind_test)

In [21]:
dataset = sent_blind_test
sent_blind_test

Dataset({
    features: ['ID', 'Sentence', 'Word_Count', 'Annotator', 'Document', 'Source', 'Book', 'Author', 'Domain', 'Text_Class'],
    num_rows: 3420
})

In [22]:
from datasets import Dataset

# Convert to pandas first
df = dataset.to_pandas()

# Drop duplicates by sentence
df_unique = df.drop_duplicates(subset=["Sentence"])

# Convert back to Hugging Face Dataset if needed
dataset = Dataset.from_pandas(df_unique)

In [23]:
dataset.shape

(3290, 11)

In [10]:
Split = "blind_testset"

In [30]:
import os
import math
import numpy as np

N_SPLITS = 10  # Number of chunks
OUTPUT_DIR = "./splits_{}".format(Split)  # Directory to save splits
os.makedirs(OUTPUT_DIR, exist_ok=True)

chunk_size = len(dataset) // N_SPLITS
subsets = []

for i in range(N_SPLITS):
    start = i * chunk_size
    # Make last chunk include all remaining elements
    end = (i + 1) * chunk_size if i < N_SPLITS - 1 else len(dataset)
    subsets.append(dataset.select(range(start, end)))

# Step 2: Process each chunk
N_SPLITS_PER_CHUNK = 1  # Number of sub-chunks per problematic chunk

for i, cleaned_subset in enumerate(subsets):
    subset_len = len(cleaned_subset)
    chunk_size = math.ceil(subset_len / N_SPLITS_PER_CHUNK)

    for j in range(N_SPLITS_PER_CHUNK):
        start_idx = j * chunk_size
        end_idx = min((j + 1) * chunk_size, subset_len)
        sub_chunk = cleaned_subset.select(range(start_idx, end_idx))

        # Save sentences
        input_path = os.path.join(OUTPUT_DIR, f"sentences_{i}_{j+1}.txt")
        with open(input_path, "w", encoding="utf-8") as f:
            for sent in sub_chunk["Sentence"]:
                f.write(sent.strip() + "\n")

        # Run CLI tool
        output_path = os.path.join(OUTPUT_DIR, f"output_{i}_{j+1}.txt")
        os.system(f"python text_to_conll_cli.py -f text -i {input_path} > {output_path}")


#NOAN انتكاسة
#NOAN وأعراقها
#NOT_FIRST فإننا
#NOAN الشكلانية
#NOT_FIRST وبالتالي
#NOT_FIRST وبالتالي
#NOT_FIRST انظر
#NOAN شمة
#NOT_FIRST إنجلترا
#NOT_FIRST إننا
#NOT_FIRST إن
#NOT_FIRST إن
#NOT_FIRST إنني
#NOT_FIRST إن
#NOT_FIRST إنني
#NOT_FIRST إنه
#NOT_FIRST إنه
#NOT_FIRST فإن
#NOAN إيج
#NOAN إيج
#NOT_FIRST الوسطى
#NOAN إيج
#NOAN إيج
#NOAN إيج
#NOT_FIRST الوسطى
#NOAN إيج
#NOAN إيج
#NOT_FIRST بدا
#NOT_FIRST الوسطى
#NOT_FIRST فإنهم
#NOT_FIRST فإن
#NOT_FIRST فإن
#NOT_FIRST فإن
#NOAN الأوزان
#NOAN الجرمانية
#NOAN الطورانية
#NOT_FIRST فإن
#NOT_FIRST الأعلام
#NOT_FIRST افتنانا
#NOAN المعطشة
#NOT_FIRST الافتنان
#NOT_FIRST كأحلى
#NOT_FIRST وإننا
#NOAN إليجر
#NOT_FIRST الإنس
#NOT_FIRST تنتحي
#NOAN إليجر
#NOT_FIRST الإنس
#NOAN إليجر
#NOAN إليجر
#NOT_FIRST إنها
#NOT_FIRST حكي
#NOT_FIRST بأي
#NOAN الإدراكية
#NOAN السردية
#NOAN إيبنجهاوس
#NOT_FIRST إن
#NOT_FIRST إخصائي
#NOT_FIRST إن
#NOT_FIRST فإن
#NOT_FIRST طيلة
#NOT_FIRST فإننا
#NOT_FIRST إننا
#NOT_FIRST فإننا
#NOT_FIRST آليات
#NOT_FIRST فإن


In [31]:
def get_each_output_from_file(filepath):
    blocks = []
    current_block = []
    in_block = False

    with open(filepath, encoding="utf-8") as f:
        for line in f:
            line = line.strip()

            # Start of a new block
            if line.startswith("# text = "):
                if current_block:  # Save previous block if one is in progress
                    blocks.append(current_block)
                current_block = [line]
                in_block = True

            # Skip treeTokens lines
            elif line.startswith("# treeTokens"):
                continue

            # Inside a block, add lines until empty line
            elif in_block:
                if line == "":
                    blocks.append(current_block)
                    current_block = []
                    in_block = False
                else:
                    current_block.append(line)

        # Add last block if file doesn’t end with an empty line
        if current_block:
            blocks.append(current_block)

    print(f"Found {len(blocks)} blocks in {filepath}")
    return blocks

all_parsed_blocks = []

# get all files that start with "output_" and end with ".txt"
import os
output_files = [f for f in os.listdir(OUTPUT_DIR) if f.startswith("output_") and f.endswith(".txt")]
for output_file in output_files:
    print(f"Processing {output_file}...")
    filepath = os.path.join(OUTPUT_DIR, output_file)
    blocks = get_each_output_from_file(filepath)
    all_parsed_blocks.extend(blocks)
    
# Now `all_parsed_blocks` contains everything from all output files
print(f"Total parsed blocks: {len(all_parsed_blocks)}")

Processing output_0_1.txt...
Found 329 blocks in ./splits_blind_testset/output_0_1.txt
Processing output_2_1.txt...
Found 329 blocks in ./splits_blind_testset/output_2_1.txt
Processing output_6_1.txt...
Found 329 blocks in ./splits_blind_testset/output_6_1.txt
Processing output_4_1.txt...
Found 329 blocks in ./splits_blind_testset/output_4_1.txt
Processing output_3_1.txt...
Found 329 blocks in ./splits_blind_testset/output_3_1.txt
Processing output_1_1.txt...
Found 329 blocks in ./splits_blind_testset/output_1_1.txt
Processing output_5_1.txt...
Found 329 blocks in ./splits_blind_testset/output_5_1.txt
Processing output_7_1.txt...
Found 329 blocks in ./splits_blind_testset/output_7_1.txt
Processing output_9_1.txt...
Found 329 blocks in ./splits_blind_testset/output_9_1.txt
Processing output_8_1.txt...
Found 329 blocks in ./splits_blind_testset/output_8_1.txt
Total parsed blocks: 3290


In [32]:
# write all_parsed_blocks into a file all_parsed_blocks.txt
with open(f"{OUTPUT_DIR}/all_parsed_blocks_{Split}.txt", "w", encoding="utf-8") as f:
    for block in all_parsed_blocks:
        for line in block:
            f.write(line + "\n")
        f.write("\n")  # Separate blocks with an empty line

In [33]:
def extract_sentence_from_block(block):
    if block and block[0].startswith("# text ="):
        return block[0].replace("# text = ", "").strip()
    return None

features_list = []
for block in all_parsed_blocks:
    sentence = extract_sentence_from_block(block)
    if sentence is not None:
        feats = extract_features_from_parse(block)
        feats["Sentence"] = sentence  # Clean the sentence to match while joining
        features_list.append(feats)

# write features_list to a file features_list.json, its a list of dicts
import json
with open(f"{OUTPUT_DIR}/features_list_{Split}.json", "w", encoding="utf-8") as f:
    json.dump(features_list, f, ensure_ascii=False, indent=4)
    

features_list

[{'avg_dep_distance': 1.0,
  'max_dep_distance': 2,
  'long_deps_count': 0,
  'noun_count': 5,
  'verb_count': 0,
  'adj_count': 0,
  'adv_count': 0,
  'adp_count': 0,
  'pron_count': 0,
  'cconj_count': 1,
  'num_count': 0,
  'punct_count': 0,
  'sconj_count': 0,
  'part_count': 0,
  'det_count': 0,
  'aux_count': 0,
  'intj_count': 0,
  'pipe_count': 0,
  'verb_to_noun_ratio': 0.0,
  'content_word_ratio': 0.8333333333333334,
  'sbj_count': 0,
  'obj_count': 1,
  'advcl_count': 0,
  'acl_count': 0,
  'conj_count': 0,
  'cc_count': 0,
  'max_depth': 5,
  'avg_dependents': 0.8333333333333334,
  'max_dependents': 2,
  'left_deps': 0,
  'right_deps': 5,
  'left_right_dep_ratio': 0.0,
  'total_words': 6,
  'Sentence': 'مجلة كل الأولاد وكل البنات'},
 {'avg_dep_distance': 0.5,
  'max_dep_distance': 1,
  'long_deps_count': 0,
  'noun_count': 1,
  'verb_count': 0,
  'adj_count': 0,
  'adv_count': 0,
  'adp_count': 0,
  'pron_count': 0,
  'cconj_count': 0,
  'num_count': 1,
  'punct_count': 0,


In [34]:
# extract all pos tags with sentence from all_parsed_blocks and save sentence as key, pos tags as value
pos_tags_dict = {}
for block in all_parsed_blocks:
    sentence = extract_sentence_from_block(block)
    sentence = sentence
    if sentence is not None:
        pos_tags = extract_pos_tags_from_block(block)
        pos_tags_dict[sentence] = pos_tags

# save file to pos_tags_dict.json
with open(f"{OUTPUT_DIR}/pos_tags_dict_{Split}.json", "w", encoding="utf-8") as f:
    import json
    json.dump(pos_tags_dict, f, ensure_ascii=False, indent=4)

In [35]:
# Features you want to extract
all_features = [
    "ud", "prc3", "prc2", "prc1", "prc0", "enc0", "gen", "num", "cas", "per", "asp", "vox", "mod", "stt", "rat", "token_type"
]

# Main loop over blocks
morph_features_dict = {}
for block in all_parsed_blocks:
    sentence = extract_sentence_from_block(block)
    if sentence is not None:
        morph_features = extract_morph_features_from_block(block, all_features)
        morph_features_dict[sentence] = morph_features

# Save to JSON
output_path = os.path.join(OUTPUT_DIR, f"morph_features_dict_{Split}.json")
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(morph_features_dict, f, ensure_ascii=False, indent=4)

print(f"Saved morphological features to {output_path}")

Saved morphological features to ./splits_blind_testset/morph_features_dict_blind_testset.json


In [36]:
# TODO:: very important when joining -> 
# replace the arabic punctuation marks with english ? 
# add space after the arabic punctuation marks if not already there

def build_dep_graph(block):
    """
    Build a dependency graph from a CoNLL block.
    Returns: dict with nodes and edges
    """
    def get_token_type(feat_str):
        if feat_str == "_" or feat_str.strip() == "":
            return {}
        all_feats = dict(item.split("=") for item in feat_str.split("|") if "=" in item)
        return all_feats.get('token_type', 'baseword')  # Default to 'baseword' if not found
    
    sentence = extract_sentence_from_block(block)
    
    if sentence is None:
        return {'nodes': [], 'edges': []}
    
    lines = block[1:]  # Skip the first line which is the sentence line

    tokens_data = []
    for line in lines:
        parts = line.split('\t')
        token_id = int(parts[0])
        form = parts[1]
        if "NOAN" in form:
            # replace "NOAN" with the next part, also replace only "NOAN" not the whole, since for example we have "الNOAN" زنا -> should be "الزنا"
            form = form.replace("NOAN", parts[2])
        head = parts[6]
        dep = parts[7]
        pos_tag = get_upos(parts[5])
        token_type = get_token_type(parts[5])  # Extract token type from the features
        
        tokens_data.append({
            'id': token_id,
            'form': form,
            'head': head,
            'dep': dep,
            'token_type': token_type,
            'pos_tag': pos_tag,  # Add POS tag to the token data
        })

    merged_results = []
    current_word_tokens = []
    current_word_form = ""
    current_word_tokens_contain_baseword = False  # Track if the current word contains a baseword
    map_english_punctuation = {
        ";": ["؛"],  # Arabic semicolon
        ",": ["،", "٫"],  # Arabic comma
        "?": ["؟"],  # Arabic question mark
        "%": ["٪"],  # Arabic percentage sign
        "*": ["۝"],  # Arabic symbol for verse end
    }
    # map_english_punctuation_values = [punct for puncts in map_english_punctuation.values() for punct in puncts]
    # count all the punctuations found in sentence
    punctuations_occurences = []
    for i, letter in enumerate(sentence):
        if letter in map_english_punctuation.keys():
            punctuations_occurences.append(i)

    # Add space after the arabic punctuation marks if its directly attached to the next word
    offset = 0
    for index in punctuations_occurences:
        adjusted_index = index + offset
        if adjusted_index < len(sentence) - 1 and sentence[adjusted_index + 1] != ' ':
            sentence = sentence[:adjusted_index + 1] + ' ' + sentence[adjusted_index + 1:]
            offset += 1  # Adjust offset due to inserted space


    # print(f"Processed sentence: {sentence}")

    # count all the punctuations found in sentence again after adding spaces
    punctuations_occurences = []
    for i, letter in enumerate(sentence):
        if letter in map_english_punctuation.keys():
            punctuations_occurences.append(i)

    print(f"Total punctuations found in sentence: {[sentence[pun] for pun in punctuations_occurences]}")

    current_punctuations_index = 0

    for token in tokens_data:
        form = token['form']
        token_type = token['token_type']
        
        if token_type.startswith('prc'):
            # means a new word is getting started, we need to save the previous word if exists
            if current_word_tokens and current_word_tokens_contain_baseword:
                # Only save if the current word is a baseword
                merged_results.append({
                    'word': current_word_form,
                    'token_ids': [t['id'] for t in current_word_tokens],
                    'heads': [t['head'] for t in current_word_tokens],
                    'deps': [t['dep'] for t in current_word_tokens],
                    'pos_tags': [t['pos_tag'] for t in current_word_tokens],
                    'merged_form': "".join([t['form'].lstrip('+').rstrip('+') for t in current_word_tokens]),
                })
                # Reset current word group
                current_word_tokens = []
                current_word_form = ""

            # Start new word group
            current_word_tokens_contain_baseword = False
            current_word_tokens.append(token)
            current_word_form += form.lstrip('+').rstrip('+')
            
            continue

        if token_type == 'baseword':
            # special case for ";" baseword, sometimes its combined with previous word
            
            if form in map_english_punctuation.keys():
                # we need first to check if its really combined with previous word, need to add all of its properties too
                
                # get the occurence of the punctuation in the sentence
                if current_punctuations_index < len(punctuations_occurences):
                    punctuation_index = punctuations_occurences[current_punctuations_index]
                else:
                    raise Exception(f"Current punctuations index {current_punctuations_index} is out of bounds for sentence: {sentence}")

                # Check if the punctuation is concatenated directly to the previous/next word, or there is a space before it
                # case where the punctuation is directly attached to the next word
                if (punctuation_index < len(sentence) - 1 and sentence[punctuation_index + 1] != ' '):
                    raise Exception(f"Unexpected punctuation '{form}' at index {punctuation_index} in sentence: {sentence}")
                
                if (punctuation_index > 0 and sentence[punctuation_index - 1] != ' '):
                    # print(f"Found '{current_word_form + map_english_punctuation[form]}' in sentence, merging...")
                    # If the current word is part of the sentence, we merge it
                    current_word_tokens.append(token)
                    current_word_form += form.lstrip('+').rstrip('+')

                    # Save previous word if exists
                    if current_word_tokens:
                        merged_results.append({
                            'word': current_word_form,
                            'token_ids': [t['id'] for t in current_word_tokens],
                            'heads': [t['head'] for t in current_word_tokens],
                            'deps': [t['dep'] for t in current_word_tokens],
                            'pos_tags': [t['pos_tag'] for t in current_word_tokens],
                            'merged_form': "".join([t['form'].lstrip('+').rstrip('+') for t in current_word_tokens]),
                        })
                    # Start new word group
                    current_word_tokens = []
                    current_word_form = ""
                    current_word_tokens_contain_baseword = False
                    # increment current_punctuations_index
                    current_punctuations_index += 1
                    continue
                else:
                    # increment current_punctuations_index
                    current_punctuations_index += 1

            # Save previous word if exists
            if current_word_tokens and current_word_tokens_contain_baseword:
                # Only save if the current word is a baseword
                merged_results.append({
                    'word': current_word_form,
                    'token_ids': [t['id'] for t in current_word_tokens],
                    'heads': [t['head'] for t in current_word_tokens],
                    'deps': [t['dep'] for t in current_word_tokens],
                    'pos_tags': [t['pos_tag'] for t in current_word_tokens],
                    'merged_form': "".join([t['form'].lstrip('+').rstrip('+') for t in current_word_tokens]),
                })
                # Reset current word group
                current_word_tokens = []
                current_word_form = ""

            # Start new word group
            current_word_tokens_contain_baseword = True
            current_word_tokens.append(token)
            current_word_form += form.lstrip('+').rstrip('+')
        else:
            # Add clitic or enc0 tokens to current word group
            current_word_tokens.append(token)
            current_word_form += form.lstrip('+').rstrip('+')

    # Append last word group after loop
    if current_word_tokens:
        merged_results.append({
            'word': current_word_form,
            'token_ids': [t['id'] for t in current_word_tokens],
            'heads': [t['head'] for t in current_word_tokens],
            'deps': [t['dep'] for t in current_word_tokens],
            'pos_tags': [t['pos_tag'] for t in current_word_tokens],
            'merged_form': "".join([t['form'].lstrip('+') for t in current_word_tokens]),
        })

    # Map from token_id to merged word index (1-based)
    tokenid_to_merged_idx = {}
    for merged_idx, merged_token in enumerate(merged_results, 1):
        for tid in merged_token['token_ids']:
            tokenid_to_merged_idx[tid] = merged_idx

    # Build heads mapping to merged node indices
    for entry in merged_results:
        heads_graph = []
        for head in entry['heads']:
            head_id = int(head)
            if head_id == 0:
                heads_graph.append(0)  # Root
            else:
                merged_head_idx = tokenid_to_merged_idx.get(head_id, -1)
                heads_graph.append(merged_head_idx)
        entry['Heads_graph'] = heads_graph

    # Construct nodes and edges for graph
    nodes = []
    edges = []

    for idx, entry in enumerate(merged_results, 1):
        nodes.append({
            'id': idx,
            'word': entry['word'],
            'token_ids': entry['token_ids'],
            'pos_tags': entry['pos_tags'],
        })

        for head_idx, dep in zip(entry['Heads_graph'], entry['deps']):
            if head_idx != 0: # and head_idx != idx: why should we not include self-loops?
                edges.append({
                    'source': head_idx,
                    'target': idx,
                    'dep': dep,
                })
            elif head_idx == 0:
                # Optionally add edges from root node (id=0)
                edges.append({
                    'source': 0,
                    'target': idx,
                    'dep': dep,
                })

    # Check nodes are of same length as space separated words in sentence
    if len(nodes) != len(sentence.split()):
        raise Exception(f"Warning: Number of nodes ({len(nodes)}) does not match number of words in sentence ({len(sentence.split())}), sentence: {sentence}")

    return {
        'nodes': nodes,
        'edges': edges,
    }

# Build dependency graphs for all blocks
dep_graphs_dict = {}
for i, block in enumerate(all_parsed_blocks):
    print(f"Processing block {i}...")
    dep_graph = build_dep_graph(block)
    print(f"Block {i} processed with {len(dep_graph['nodes'])} nodes and {len(dep_graph['edges'])} edges.")
    sentence = extract_sentence_from_block(block)
    dep_graphs_dict[sentence] = dep_graph

# Save to JSON
output_path = os.path.join(OUTPUT_DIR, f"dep_graph_{Split}.json")
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(dep_graphs_dict, f, ensure_ascii=False, indent=4)

print(f"Saved dep_graphs_dict to {output_path}")

# build_dep_graph(all_parsed_blocks[30109])

Processing block 0...
Total punctuations found in sentence: []
Block 0 processed with 5 nodes and 6 edges.
Processing block 1...
Total punctuations found in sentence: []
Block 1 processed with 2 nodes and 2 edges.
Processing block 2...
Total punctuations found in sentence: []
Block 2 processed with 5 nodes and 5 edges.
Processing block 3...
Total punctuations found in sentence: []
Block 3 processed with 5 nodes and 5 edges.
Processing block 4...
Total punctuations found in sentence: []
Block 4 processed with 2 nodes and 2 edges.
Processing block 5...
Total punctuations found in sentence: []
Block 5 processed with 3 nodes and 3 edges.
Processing block 6...
Total punctuations found in sentence: []
Block 6 processed with 4 nodes and 4 edges.
Processing block 7...
Total punctuations found in sentence: []
Block 7 processed with 1 nodes and 1 edges.
Processing block 8...
Total punctuations found in sentence: []
Block 8 processed with 1 nodes and 1 edges.
Processing block 9...
Total punctuati

# Sanity check that merging is correct

In [38]:
import pandas as pd

# If dataset is not a DataFrame already, convert it
if not isinstance(dataset, pd.DataFrame):
    df_dataset = pd.DataFrame(dataset)
else:
    df_dataset = dataset.copy()

# Build DataFrame from successfully parsed blocks
df_feats = pd.DataFrame(features_list)

# Ensure df_dataset has both Sentence and labels columns
assert "Sentence" in df_dataset.columns

# Merge features with original data 
merged_df = df_feats.merge(df_dataset[["Sentence"]], on="Sentence", how="left")

# i want to know how many sentences are in the dataset that are not in the features_list
missing_sentences = set(df_dataset["Sentence"]) - set(df_feats["Sentence"])
print(f"Number of sentences in dataset not found in features: {len(missing_sentences)}")
print("Some missing sentences:", missing_sentences)  # Show a few missing sentences

Number of sentences in dataset not found in features: 0
Some missing sentences: set()
