In [4]:
###indicators form dependency grammar and valency grammar,such as dependency distance, dependency direction, 
###probabilistic valency pattern, valency and so on.###
from conllu import parse_incr
from collections import Counter

In [76]:
class DependencyAnalyzer():
    """
    An analyzer for dependency.
    
    :Items: mean dependency distance, dependency direction, dependency distribution
    """
    def __init__(self,data):
        """
        :data: must be conllu format or other byte-like formats, which means annotated
        """
        self.data= data
    
    def mdd(self,upos=None,dorg=None, deprel=None, dependency_direction=None):
        """
        :upos: None(default) - count the mdd of all words without distinguishing POS.
               POS(str) - like 'NOUN','VERB'. Count the mdd of words with the POS 
               tags you defined.
        :dorg:dependent or governor(str). Must use it along with upos being activated. 
              'dep'- count the mdd of words when acting as a depdent
              'gov'- count the mdd of words when acting as a governor
        :deprel: dependency relations(str), such as 'subj'. When setting it to a specific
                 deprel, the result will be the mdd of that deprel.
        :dependency_direction: str
                               'head_initial' - 
                               'head_final' - 
        :return: mdd(float)
        :About mdd: 
               Liu H. Dependency distance as a metric of language comprehension difficulty[J]. 
               Journal of Cognitive Science, 2008, 9(2): 159-191.        
        """
        dds = []
        for sentence in parse_incr(self.data):    
            for word in sentence:
                if word['upos'] == upos and word['head'] !=0 and dorg == 'dep':
                    dd = abs(word['head'] - word['id'])
                    dds.append(dd)
                elif word['upos'] == upos and dorg == 'gov':
                    dd = [abs(word['id']-i['id']) for i in sentence if i['head'] == word['id'] and i['deprel'] != 'punct']
                    dds = dds + dd
                elif deprel is not None and word['deprel'] == deprel:
                    dd = abs(word['head'] - word['id'])
                    dds.append(dd)
                elif dependency_direction == 'head_final' and word['deprel']!='punct'and word['head']!=0 and word['head'] > word['id']:
                    dd = abs(word['head'] - word['id'])
                    dds.append(dd)
                elif dependency_direction == 'head_initial' and word['deprel'] !='punct'and word['head']!=0 and word['head'] < word['id']:
                    dd = abs(word['head'] - word['id'])
                    dds.append(dd)
                elif deprel is None and upos is None and dependency_direction is None and word['deprel']!='punct'and word['head']!=0:
                    dd = abs(word['head'] - word['id'])
                    dds.append(dd)
        MDD = sum(dds) / len(dds)
        return MDD
    
    def HF_HI(self,upos=None,deprel=None):
        """
        :return: list 
        :About dependency direction:
              Liu H. Dependency direction as a means of word-order typology: A method based on dependency treebanks[J].
              Lingua, 2010, 120(6): 1567-1578.
        
        """
        head_initial = 0
        head_final = 0
        for sentence in parse_incr(self.data):    
            for word in sentence:
                if word['deprel'] == deprel:
                    if word['head'] < word['id']:
                        head_initial += 1
                    else:
                        head_final += 1
                elif word['upos'] == upos and word['head']!=0:
                    if word['head'] < word['id']:
                        head_initial += 1
                    else:
                        head_final += 1
                elif deprel is None and upos is None and word['deprel']!='punct'and word['head']!=0:
                    if word['head'] < word['id']:
                        head_initial += 1
                    else:
                        head_final += 1
                        
        total = head_initial + head_final
        proportion_head_initial = head_initial / total
        proportion_head_final = head_final / total

        return  proportion_head_final,proportion_head_initial
    
    def dd_distribution(self, upos=None,deprel=None,dependency_direction=None,dorg=None):
        """
        :return: list
        """
        dds = []
        for sentence in parse_incr(self.data):    
            for word in sentence:
                if word['upos'] == upos and word['head'] !=0 and dorg == 'dep':
                    dd = abs(word['head'] - word['id'])
                    dds.append(dd)
                elif word['upos'] == upos and dorg == 'gov':
                    dd = [abs(word['id']-i['id']) for i in sentence if i['head'] == word['id'] and i['deprel'] != 'punct']
                    dds = dds + dd
                elif deprel is not None and word['deprel'] == deprel:
                    dd = abs(word['head'] - word['id'])
                    dds.append(dd)
                elif dependency_direction == 'head_final' and word['deprel']!='punct'and word['head']!=0 and word['head'] > word['id']:
                    dd = abs(word['head'] - word['id'])
                    dds.append(dd)
                elif dependency_direction == 'head_initial' and word['deprel'] !='punct'and word['head']!=0 and word['head'] < word['id']:
                    dd = abs(word['head'] - word['id'])
                    dds.append(dd)
                elif deprel is None and upos is None and dependency_direction is None and word['deprel']!='punct'and word['head']!=0:
                    dd = abs(word['head'] - word['id'])
                    dds.append(dd)
        return dds
    
    def mhd(self, upos=None,deprel=None):
        """
        About mean hierarchical distance:
             Jing Y, Liu H. Mean hierarchical distance augmenting mean dependency distance[C]//
             Proceedings of the third international conference on dependency linguistics 
             (Depling 2015). 2015: 161-170.
        """
        total_hd = 0
        total_count = 0
        for sentence in parse_incr(self.data):
            word_index_by_id = {word['id']: word for word in sentence}
            if upos is not None:
                selected_words = [word for word in sentence if upos is None or word['upos'] == upos]
            elif deprel is not None:
                selected_words = [word for word in sentence if upos is None or word['deprel'] == deprel]
            elif upos is None and deprel is None:
                selected_words = [word for word in sentence]
            for word in selected_words:
                head_id = word['head']
                distance = 0
                while head_id != 0:
                    distance += 1
                    head_id = word_index_by_id[head_id]['head']
                total_hd += distance
                total_count += 1
        MHD = total_hd / total_count
        return MHD
    
    def hd_distribution(self,upos=None,deprel=None):
        distance_distribution = []

        for sentence in parse_incr(self.data):
            word_index_by_id = {word['id']: word for word in sentence}
            if upos is not None:
                selected_words = [word for word in sentence if upos is None or word['upos'] == upos]
            elif deprel is not None:
                selected_words = [word for word in sentence if upos is None or word['deprel'] == deprel]
            elif upos is None and deprel is None:
                selected_words = [word for word in sentence]
            for word in selected_words:
                head_id = word['head']
                distance = 0
                while head_id != 0:
                    distance += 1
                    head_id = word_index_by_id[head_id]['head']
                distance_distribution.append(distance)
        return distance_distribution
    
    def describe(self):
        """
        :return: [{},{}]
        :About tree height and three width:
                Zhang H, Liu H. Interrelations among dependency tree widths, heights and sentence lengths[J]. 
                Quantitative Analysis of Dependency Structures, 2018, 72: 31-52.
        :About vk:
                Lu Q, Lin Y, Liu H. Dynamic Valency and Dependency Distance[J]. Quantitative Analysis of 
                Dependency Structures, 2018, 72: 145.
        
        """
        table = []
        for sentence in parse_incr(self.data):
            sent_data = {}
            dds = []
            word_index_by_id = {word['id']: word for word in sentence}
            levels = []
            valencies = []
            for word in sentence:
                if word['deprel'] != 'punct':
                    if word['head'] !=0:
                        dd = abs(word['head'] - word['id'])
                        dds.append(dd)
                        valency = len([i for i in sentence if i['head'] == word['id']])+1
                        valencies.append(valency)
                    if word['head'] ==0:
                        valency = len([i for i in sentence if i['head'] == word['id']])
                        valencies.append(valency)
                    depth = 0        
                    head_id = word['head']
                    while head_id != 0:
                        depth += 1
                        head_id = word_index_by_id[head_id]['head']
                    levels.append(depth)
                    
        
            MDD = sum(dds) / len(dds)
            sent_length = len(levels)
            vk = (sum(i*2 for i in valencies)/sent_length) - (2 - 2/sent_length)**2
            sent_data['mdd'] = MDD
            sent_data['mhd'] = sum(levels)/len(levels)
            sent_data['sent_length'] = sent_length
            sent_data['tree_height'] = max(levels)
            sent_data['tree_width'] = levels.count(max(levels, key=levels.count))
            sent_data['vk'] = vk
        table.append(sent_data)
        return table

In [77]:
%%time
data = open(r'D:\database\sud-treebanks-v2.9\SUD_Chinese-GSDSimp\演示.conllu',"r",encoding="utf-8")
d = DependencyAnalyzer(data)
d.describe()

Wall time: 998 µs


[{'mdd': 3.6666666666666665,
  'mhd': 2.2,
  'sent_length': 10,
  'tree_height': 5,
  'tree_width': 3,
  'vk': 0.5599999999999996}]

In [72]:
class ValencyAnalyzer():
    """
    A class for analyzing valency.
    
    :data: must be conllu format or other byte-like formats, which means annotated corpus(treebanks). 
    """
    def __init__(self, data):
        self.data = data
            
    def mean_valency(self,upos=None):
        """
        :upos: None(default) - count the mean valency of all words without distinguishing POS.
           POS(str) - like 'NOUN','VERB'. Count the mean valency of words with the POS 
           tags you defined.
        :return: mean valency(float)
        :About mean valency: 
           Yan J, Liu H. Quantitative analysis of Chinese and English verb valencies 
           based on probabilistic valency pattern theory[C]//Workshop on Chinese Lexical
           Semantics. Cham: Springer International Publishing, 2021: 152-162. 
        """   
        if upos != None:
            num_depdents = 0
            num_word = 0
            for sentence in parse_incr(self.data):    
                for word in sentence:
                    if word['upos'] == upos: 
                        depdents = [w for w in sentence if w['head'] == word['id']]
                        num_depdents += len(depdents)
                        num_word += 1
            mean_valency = num_depdents/num_word
            return mean_valency
        
        elif upos == None:
            num_depdents = 0
            num_word = 0
            for sentence in parse_incr(self.data):    
                for word in sentence:
                    if word['deprel'] != 'punct': 
                        depdents = [w for w in sentence if w['head'] == word['id']]
                        num_depdents += len(depdents)
                        num_word += 1
            mean_valency = num_depdents/num_word
            return mean_valency
    
    def distribution(self, upos=None):
        '''
        :upos: None(default) - count the dynamic valency distribution of all words without distinguishing POS.
               POS(str) - like 'NOUN','VERB'. Count the dynamic valency distribution of words with the POS 
               tags you defined.
        :return: valency distribution(list)
        :About valency distribution:
               Yan J, Liu H. Quantitative analysis of Chinese and English verb valencies 
               based on probabilistic valency pattern theory[C]//Workshop on Chinese Lexical
               Semantics. Cham: Springer International Publishing, 2021: 152-162. 
        '''
        if upos is not None:
            v = []
            for sentence in parse_incr(self.data):    
                for word in sentence:
                    if word['upos'] == upos: 
                        dependents = [w for w in sentence if w['head'] == word['id']]
                        v.append(len(dependents))
            return v
        
        elif upos is None:
            v = []
            for sentence in parse_incr(self.data):    
                for word in sentence:
                    if word['deprel'] != 'punct': 
                        dependents = [w for w in sentence if w['head'] == word['id']]
                        v.append(len(dependents))
            return v    
    
    def PVP(self,upos=None,target='deprel'):
        '''
        PVP - probabilistic valency pattern theory, a kind of generalize valency theory. PVP can be used
          for describing word's/wordclass's capacity to combine with others.
    
        :upos: None(default) - count the PVP of all words without distinguishingdistincting POS.
           POS(str) - like 'NOUN','VERB'. Count the PVP of words with the POS tags you defined.
        :target: the feature you want to count
           'deprel'(default) - dependency relations
           'upos' - POS
        :return: PVP_dependents(dict),PVP_governors(dict)
        :About PVP:
           刘海涛,冯志伟.自然语言处理的概率配价模式理论[J].语言科学,2007(03):32-41.
           Liu, H. & Feng, Z. 2007. Probabilistic Valency Pattern Theory for Natural Language Processing.
           Language Science(03):32-41.

        '''
        if target == 'deprel':
            if upos != None:
                dependents = []
                governors = []
                for sentence in parse_incr(self.data):    
                    for word in sentence:
                        if word['upos'] == upos: 
                            dependent = [w['deprel'] for w in sentence if w['head'] == word['id']]
                            dependents += dependent
                            governor = word['deprel']
                            governors.append(governor)
                deps = Counter(dependents)
                govs = Counter(governors)
                return deps, govs
        
            if upos == None:
                dependents = []
                governors = []
                for sentence in parse_incr(self.data):    
                    for word in sentence:
                        if word['deprel'] != 'punct': 
                            dependent = [w['deprel'] for w in sentence if w['head'] == word['id']]
                            dependents += dependent
                            governor = word['deprel']
                            governors.append(governor)
                deps = Counter(dependents)
                govs = Counter(governors)
                return deps, govs
            
        if target == 'upos':
            if upos != None:
                dependents = []
                governors = []
                for sentence in parse_incr(self.data):    
                    for word in sentence:
                        if word['upos'] == upos: 
                            dependent = [w['upos'] for w in sentence if w['head'] == word['id']]
                            dependents += dependent
                            governor = word['upos']
                            governors.append(governor)
                deps = Counter(dependents)
                govs = Counter(governors)
                return deps, govs
        
            if upos == None:
                dependents = []
                governors = []
                for sentence in parse_incr(self.data):    
                    for word in sentence:
                        if word['deprel'] != 'punct': 
                            dependent = [w['upos'] for w in sentence if w['head'] == word['id']]
                            dependents += dependent
                            governor = word['upos']
                            governors.append(governor)
                deps = Counter(dependents)
                govs = Counter(governors)
                return deps, govs
            
    #prepare for non-generalized valency         
    #def complement_selecter(data,name='ud'): 
    #    if name == 'ud':
    #        #Universal Dependenecies
    #        complements = [] 