In [58]:
### Indicators from dependency grammar and valency grammar, such as dependency distance, dependency direction,
### probabilistic valency pattern, valency, and so on. ###
from conllu import parse_incr

In [69]:
def get_mdd(data, deprel=None, upos=None, depdirection=None):
    """
    Calculate the Mean Dependency Distance (MDD) of CoNLL-U data.

    The dependency distance of a word is ``abs(head - id)``. The MDD is the
    mean of that distance over every dependency that passes ALL of the
    filters that were supplied (filters are combined with logical AND).

    Args:
        data: Either an object with a ``read`` attribute (e.g. an open
            CoNLL-U file), which is handed to ``conllu.parse_incr``, or an
            already parsed iterable of sentences, each an iterable of
            token mappings with 'id', 'head', 'deprel' and 'upos' keys.
        deprel (str, optional): Keep only dependents carrying exactly this
            relation. If None, every relation except 'punct' is kept.
        upos (str, optional): Keep only dependencies whose HEAD word has
            this Universal POS tag. If None, heads of any tag are kept.
        depdirection (str, optional): 'head_initial' keeps dependencies
            whose head precedes the dependent (head < id); 'head_final'
            keeps those whose head follows it (head > id). If None, both
            directions are kept.

    Returns:
        float: The Mean Dependency Distance, or 0.0 when no dependency
        passes the filters.

    Raises:
        ValueError: If ``depdirection`` is not None, 'head_initial' or
            'head_final'.

    Example:
        data = [
            [{'id': 1, 'head': 2, 'deprel': 'nsubj', 'upos': 'NOUN'},
             {'id': 2, 'head': 0, 'deprel': 'root', 'upos': 'VERB'}],
            [{'id': 1, 'head': 3, 'deprel': 'nsubj', 'upos': 'NOUN'},
             {'id': 2, 'head': 3, 'deprel': 'aux', 'upos': 'AUX'},
             {'id': 3, 'head': 0, 'deprel': 'root', 'upos': 'VERB'}]
        ]
        mdd = get_mdd(data, deprel='nsubj', depdirection='head_final')
        print(f"Mean Dependency Distance: {mdd}")   # 1.5

    Note:
        - Root tokens (head 0 or deprel 'root') are always skipped: they
          have no governor, so no distance can be measured.
        - Tokens whose 'id' or 'head' is not an int are skipped. This is
          meant for multiword-token ranges and empty nodes, which are not
          part of the basic dependency tree.
    """
    if depdirection not in (None, 'head_initial', 'head_final'):
        raise ValueError(
            "depdirection must be None, 'head_initial' or 'head_final', "
            f"got {depdirection!r}"
        )

    # File-like input is parsed lazily; anything else is assumed to be parsed
    # sentences already (the form shown in the example above).
    sentences = parse_incr(data) if hasattr(data, 'read') else data

    total_distance = 0
    n_dependencies = 0
    for sentence in sentences:
        # Resolve heads by id, not by list position: positions and ids stop
        # lining up as soon as a sentence holds non-integer-id tokens.
        tokens_by_id = None
        if upos is not None:
            tokens_by_id = {token['id']: token for token in sentence}

        for word in sentence:
            word_id = word['id']
            head = word['head']
            relation = word['deprel']

            if not isinstance(word_id, int) or not isinstance(head, int):
                continue
            if head == 0 or relation == 'root':
                continue

            if deprel is None:
                # Punctuation is only measured when explicitly requested.
                if relation == 'punct':
                    continue
            elif relation != deprel:
                continue

            if upos is not None:
                head_token = tokens_by_id.get(head)
                if head_token is None or head_token['upos'] != upos:
                    continue

            if depdirection == 'head_final' and head < word_id:
                continue
            if depdirection == 'head_initial' and head > word_id:
                continue

            total_distance += abs(head - word_id)
            n_dependencies += 1

    if n_dependencies == 0:
        return 0.0
    return total_distance / n_dependencies

In [66]:
def get_pdd(data, deprel=None):
    """
    Calculate the Proportion of Dependency Directions (PDD) of CoNLL-U data.

    A dependency is head-initial when the head precedes the dependent
    (head < id) and head-final otherwise. Root and punctuation tokens are
    never counted.

    Args:
        data: Either an object with a ``read`` attribute (e.g. an open
            CoNLL-U file), which is handed to ``conllu.parse_incr``, or an
            already parsed iterable of sentences, each an iterable of
            token mappings with 'id', 'head' and 'deprel' keys.
        deprel (str, optional): Count only dependents carrying exactly this
            relation. If None, all relations (except 'root' and 'punct')
            are counted.

    Returns:
        tuple: ``(proportion_head_initial, proportion_head_final)`` as
        floats. ``(0.0, 0.0)`` is returned when no dependency qualifies.

    Example:
        data = [
            [{'id': 1, 'head': 2, 'deprel': 'nsubj'}, {'id': 2, 'head': 0, 'deprel': 'root'}],
            [{'id': 1, 'head': 3, 'deprel': 'nsubj'}, {'id': 2, 'head': 3, 'deprel': 'aux'},
             {'id': 3, 'head': 0, 'deprel': 'root'}]
        ]
        prop_initial, prop_final = get_pdd(data, deprel='nsubj')
        print(f"Proportion of head-initial dependencies: {prop_initial}")  # 0.0
        print(f"Proportion of head-final dependencies: {prop_final}")      # 1.0

    Note:
        - Tokens whose 'id' or 'head' is not an int are skipped. This is
          meant for multiword-token ranges and empty nodes, which are not
          part of the basic dependency tree.
        - Tokens with head 0 are skipped even if their deprel is not
          literally 'root': there is no governor to be ordered against.
    """
    # File-like input is parsed lazily; anything else is assumed to be parsed
    # sentences already (the form shown in the example above).
    sentences = parse_incr(data) if hasattr(data, 'read') else data

    head_initial = 0
    head_final = 0
    for sentence in sentences:
        for word in sentence:
            word_id = word['id']
            head = word['head']
            relation = word['deprel']

            if not isinstance(word_id, int) or not isinstance(head, int):
                continue
            if head == 0 or relation in ('root', 'punct'):
                continue
            if deprel is not None and relation != deprel:
                continue

            if head < word_id:
                head_initial += 1
            else:
                head_final += 1

    total = head_initial + head_final
    if total == 0:
        # Nothing matched: report zero proportions rather than dividing by 0.
        return 0.0, 0.0
    return head_initial / total, head_final / total

In [80]:
def get_mv(data, upos=None):
    """
    Calculate the mean valency (average number of dependents per word).

    For every counted word, its valency is the number of tokens in the same
    sentence whose 'head' equals the word's 'id' (punctuation dependents
    included). The result is the total valency divided by the number of
    counted words.

    Args:
        data: Either an object with a ``read`` attribute (e.g. an open
            CoNLL-U file), which is handed to ``conllu.parse_incr``, or an
            already parsed iterable of sentences, each an iterable of
            token mappings with 'id', 'head', 'deprel' and 'upos' keys.
        upos (str, optional): Count only words with this Universal POS tag.
            If None, every non-punctuation word is counted.

    Returns:
        float: The mean valency, or 0.0 when no word qualifies.

    Note:
        - Words whose deprel is 'punct' are never counted as governors.
        - Tokens whose 'id' is not an int are skipped. This is meant for
          multiword-token ranges and empty nodes, which are not part of the
          basic dependency tree.
    """
    # Imported locally so the function does not depend on which notebook
    # cells have been run before it.
    from collections import Counter

    # File-like input is parsed lazily; anything else is assumed to be parsed
    # sentences already.
    sentences = parse_incr(data) if hasattr(data, 'read') else data

    n_dependents = 0
    n_words = 0
    for sentence in sentences:
        # Tally dependents per head once, instead of rescanning the sentence
        # for every word.
        dependents_per_head = Counter(token['head'] for token in sentence)

        for word in sentence:
            word_id = word['id']
            if not isinstance(word_id, int):
                continue
            if word['deprel'] == 'punct':
                continue
            if upos is not None and word['upos'] != upos:
                continue
            n_dependents += dependents_per_head[word_id]
            n_words += 1

    if n_words == 0:
        return 0.0
    return n_dependents / n_words

In [81]:
%%time
data = open(r'D:\数据集\sud-treebanks-v2.11\SUD_Chinese-GSD\zh_gsd-sud-train.conllu',"r",encoding="utf-8")
print(get_mv(data,upos='VERB'))

2.0631018923833255
Wall time: 1.63 s
