In [1]:
from ufal.udpipe import Model, Pipeline, ProcessingError

from string import punctuation
full_punctuation = punctuation + "–" + "," + "»" + "«" + "…" +'’'

from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

from collections import OrderedDict

import copy

In [93]:
model = Model.load('./UDPIPE/english-lines-ud-2.0-170801.udpipe')

AttributeError: type object 'Model' has no attribute 'load'

In [3]:
import ufal.udpipe
# ufal.udpipe.Model etc. are SWIG-magic and cannot be detected by pylint
# pylint: disable=no-member

class Model:
    def __init__(self, path):
        """Load given model."""
        self.model = ufal.udpipe.Model.load(path)
        if not self.model:
            raise Exception("Cannot load UDPipe model from file '%s'" % path)

    def tokenize(self, text):
        """Tokenize the text and return list of ufal.udpipe.Sentence-s."""
        tokenizer = self.model.newTokenizer(self.model.DEFAULT)
        if not tokenizer:
            raise Exception("The model does not have a tokenizer")
        return self._read(text, tokenizer)

    def read(self, text, in_format):
        """Load text in the given format (conllu|horizontal|vertical) and return list of ufal.udpipe.Sentence-s."""
        input_format = ufal.udpipe.InputFormat.newInputFormat(in_format)
        if not input_format:
            raise Exception("Cannot create input format '%s'" % in_format)
        return self._read(text, input_format)

    def _read(self, text, input_format):
        input_format.setText(text)
        error = ufal.udpipe.ProcessingError()
        sentences = []

        sentence = ufal.udpipe.Sentence()
        while input_format.nextSentence(sentence, error):
            sentences.append(sentence)
            sentence = ufal.udpipe.Sentence()
        if error.occurred():
            raise Exception(error.message)

        return sentences

    def tag(self, sentence):
        """Tag the given ufal.udpipe.Sentence (inplace)."""
        self.model.tag(sentence, self.model.DEFAULT)

    def parse(self, sentence):
        """Parse the given ufal.udpipe.Sentence (inplace)."""
        self.model.parse(sentence, self.model.DEFAULT)

    def write(self, sentences, out_format):
        """Write given ufal.udpipe.Sentence-s in the required format (conllu|horizontal|vertical)."""

        output_format = ufal.udpipe.OutputFormat.newOutputFormat(out_format)
        output = ''
        for sentence in sentences:
            output += output_format.writeSentence(sentence)
        output += output_format.finishDocument()

        return output

# Can be used as
#  model = Model('english-ud-1.2-160523.udpipe')
#  sentences = model.tokenize("Hi there. How are you?")
#  for s in sentences:
#      model.tag(s)
#      model.parse(s)
#  conllu = model.write(sentences, "conllu")

In [5]:
model_test = Model('./UDPIPE/english-ud-2.0-170801.udpipe')

sentences_test = model_test.tokenize("I like going there")
for s in sentences_test:
    model_test.tag(s)
    model_test.parse(s)
conllu_test = model_test.write(sentences_test, "conllu")
for line in conllu_test.split('\n'):
    if line:
        if line[0].isdigit():
            print(line.split('\t'))
        else:
            print(line)

# newdoc
# newpar
# sent_id = 1
# text = I like going there
['1', 'I', 'I', 'PRON', 'PRP', 'Case=Nom|Number=Sing|Person=1|PronType=Prs', '2', 'nsubj', '_', '_']
['2', 'like', 'like', 'VERB', 'VBP', 'Mood=Ind|Tense=Pres|VerbForm=Fin', '0', 'root', '_', '_']
['3', 'going', 'go', 'VERB', 'VBG', 'VerbForm=Ger', '2', 'xcomp', '_', '_']
['4', 'there', 'there', 'ADV', 'RB', 'PronType=Dem', '3', 'advmod', '_', 'SpaceAfter=No']


In [7]:
with open ("./UDPIPE/text_0.txt", "r") as f:
    text = ''
    for line in f.readlines():
        text += line + ' '
        text.strip()
    #print(text)
    
model = Model('./UDPIPE/english-partut-ud-2.0-170801.udpipe')
#sentences = model.tokenize("Hi there. How are you?")
sentences = model.tokenize(text)
for s in sentences:
    model.tag(s)
    model.parse(s)
conllu = model.write(sentences, "conllu")

# sentenceses conlluu map

In [32]:
def get_conllu(text_line, model, print_output = False):
    sentences = model.tokenize(text_line)
    for s in sentences:
        model.tag(s)
        model.parse(s)
    conllu = model.write(sentences, "conllu")
    if (print_output):
        for line in conllu.split('\n'):
            if line:
                if line[0].isdigit():
                    print(line.split('\t'))
                else:
                    print(line)
    return conllu

In [8]:
def get_conllu_text_map(conllu_parsed_object):
    conllu_text_map = []
    conllu_sentence_map = []
    for line in conllu_parsed_object.split('\n'):
        if line:
            if line[0].isdigit():
                #print(line.split('\t'))
                conllu_sentence_map.append(line.split('\t'))
            else:
                if(len(conllu_sentence_map) > 0):
                    conllu_text_map.append(conllu_sentence_map)
                    conllu_sentence_map = []   
                    #print("appended")
    if(len(conllu_sentence_map) > 0):
        conllu_text_map.append(conllu_sentence_map)
    return conllu_text_map
conllu_text_map_ex = get_conllu_text_map(conllu)

In [10]:
for sentence in conllu_text_map_ex:
    for word in sentence:
        print(word)
    print()

['1', '"', '"', 'PUNCT', 'FB', '_', '5', 'punct', '_', 'SpaceAfter=No']
['2', 'This', 'this', 'PRON', 'PD', 'Number=Sing|PronType=Dem', '5', 'nsubj', '_', '_']
['3', 'is', 'be', 'AUX', 'VA', 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin', '5', 'cop', '_', '_']
['4', 'my', 'my', 'DET', 'AP', 'Number=Sing|Poss=Yes|PronType=Prs', '5', 'nmod:poss', '_', '_']
['5', 'friend', 'friend', 'NOUN', 'S', 'Number=Sing', '0', 'root', '_', '_']
['6', 'Jimmy', 'Jimmy', 'PROPN', 'SP', '_', '5', 'nmod', '_', 'SpaceAfter=No']
['7', '.', '.', 'PUNCT', 'FS', '_', '5', 'punct', '_', '_']

['1', 'He', 'he', 'PRON', 'PE', 'Gender=Masc|Number=Sing|Person=3|PronType=Prs', '4', 'nsubj', '_', '_']
['2', 'is', 'be', 'AUX', 'VA', 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin', '4', 'cop', '_', '_']
['3', 'from', 'from', 'ADP', 'E', '_', '4', 'case', '_', '_']
['4', 'India', 'India', 'PROPN', 'SP', '_', '0', 'root', '_', 'SpaceAfter=No']
['5', '.', '.', 'PUNCT', 'FS', '_', '4', 'punct', '_', '_']



# LEMM TEXT FROM UDPIPE MAP

In [11]:
def lemmatize_from_udmap(conllu_map):
    sentences_list = []
    for sentence in conllu_map:
        line = ''
        for word in sentence: 
            if (word[3] != 'PUNCT'):
                #print(word[2])
                line += word[2] + ' '
        
        sentences_list.append(line.strip())
        #print()
    return sentences_list
lemm_sentences = lemmatize_from_udmap(conllu_text_map_ex)

# TF_IDF

In [None]:
СТОП СЛОВА ВРЕМЕННО УБИРАЕМ - ОСТАВИТЬ НА ОБСУЖДЕНИЕ СПИСОК

In [12]:
from nltk.corpus import stopwords
stopWords = set(stopwords.words('english'))

In [15]:
def get_tf_idf_dict(lemm_text_list, save_to_csv = False):
    vect = TfidfVectorizer()
    tfidf_matrix = vect.fit_transform(lemm_text_list)
    df = pd.DataFrame(tfidf_matrix.toarray(), columns = vect.get_feature_names())
    #print(df.head())
    if (save_to_csv): df.to_csv("./text_0_tfidf.xlsx", sep = '\t')
    tf_idf_dict = df.to_dict()
    return tf_idf_dict
tf_idf_dict_example = get_tf_idf_dict (lemm_sentences, save_to_csv = False)

In [16]:
tf_idf_dict_example

{'about': {0: 0.0,
  1: 0.0,
  2: 0.0,
  3: 0.0,
  4: 0.0,
  5: 0.0,
  6: 0.0,
  7: 0.0,
  8: 0.0,
  9: 0.0,
  10: 0.0,
  11: 0.0,
  12: 0.0,
  13: 0.0,
  14: 0.0,
  15: 0.0,
  16: 0.0,
  17: 0.0,
  18: 0.0,
  19: 0.0,
  20: 0.0,
  21: 0.0,
  22: 0.0,
  23: 0.0,
  24: 0.0,
  25: 0.0,
  26: 0.0,
  27: 0.0,
  28: 0.0,
  29: 0.0,
  30: 0.3137533768590456,
  31: 0.0,
  32: 0.0},
 'accompany': {0: 0.0,
  1: 0.0,
  2: 0.0,
  3: 0.0,
  4: 0.0,
  5: 0.0,
  6: 0.0,
  7: 0.0,
  8: 0.0,
  9: 0.0,
  10: 0.0,
  11: 0.0,
  12: 0.0,
  13: 0.0,
  14: 0.0,
  15: 0.0,
  16: 0.0,
  17: 0.28485310012781917,
  18: 0.0,
  19: 0.0,
  20: 0.0,
  21: 0.0,
  22: 0.0,
  23: 0.0,
  24: 0.0,
  25: 0.0,
  26: 0.0,
  27: 0.0,
  28: 0.0,
  29: 0.0,
  30: 0.0,
  31: 0.0,
  32: 0.0},
 'add': {0: 0.0,
  1: 0.0,
  2: 0.0,
  3: 0.0,
  4: 0.0,
  5: 0.0,
  6: 0.0,
  7: 0.0,
  8: 0.0,
  9: 0.0,
  10: 0.0,
  11: 0.0,
  12: 0.0,
  13: 0.0,
  14: 0.0,
  15: 0.0,
  16: 0.0,
  17: 0.0,
  18: 0.0,
  19: 0.0,
  20: 0.0,
  21: 0.0,


# Text map

# ГРАММАТИКУ УЧИТЫВАЕМ ОТДЕЛЬНО НЕ ССЫЛАЯСЬ НА ТФ ИДФ 
ЕСЛИ ГРАММАТИКА ИДЕТ В КУПЕ С ТФИДФ ТО НАКЛАДЫВАЕМ ДОПОЛНИТЕЛЬНЫЙ ШТРАФ

In [17]:
def create_map(conllu_map, tf_idf_dict, apply_tf_idf = True):
    text_map = []
    sentence_ind = 0
    for sentence in conllu_map:
        sentence_map = []
        for word in sentence: 
            if (word[3] != 'PUNCT'):
                weight = OrderedDict([("word", word[1]),("lemma",word[2]), ("vocabulary_prop",(OrderedDict([("tf_idf", 0),("nominal_index",word[0])])))])
                #print(word[2])
                lemma_lower = word[2].lower()
                if (apply_tf_idf):
                    if (lemma_lower in tf_idf_dict):
                        weight["vocabulary_prop"]["tf_idf"] = tf_idf_dict[lemma_lower][sentence_ind]
                sentence_map.append(weight)
        text_map.append(sentence_map)
        sentence_ind += 1
    return text_map
text_map_ex = create_map(conllu_text_map_ex, tf_idf_dict_example)

In [19]:
def get_dependencies (conllu_map, text_map_input):
    assert len(conllu_map) == len(text_map_input) #sentences count is equal
    text_map = copy.deepcopy(text_map_input)
    for sentence, text_map_sentence in zip(conllu_map,text_map):
        dep_dict = {}
        for word in sentence: 
            if (word[3] != 'PUNCT'):
                #print(word[1], "head_word_nominal_index =", word[6])
                if(word[6] in dep_dict):
                    dep_dict[word[6]] += 1
                elif(word[6] != 0):
                    dep_dict[word[6]] = 1
        #print(dep_dict)
        for map_word in text_map_sentence:
            if(map_word["vocabulary_prop"]["nominal_index"] in dep_dict):
                map_word["vocabulary_prop"]["dep_words_count"] = dep_dict[map_word["vocabulary_prop"]["nominal_index"]] + 1
                map_word["vocabulary_prop"]["vocab_importane"] = map_word["vocabulary_prop"]["dep_words_count"] * map_word["vocabulary_prop"]["tf_idf"]
            else:
                map_word["vocabulary_prop"]["dep_words_count"] = 1
                map_word["vocabulary_prop"]["vocab_importane"] = map_word["vocabulary_prop"]["dep_words_count"] * map_word["vocabulary_prop"]["tf_idf"]
    return text_map
text_map_dep = get_dependencies(conllu_text_map_ex, text_map_ex)

# Contractions (судя по всему не нужно тк udpipe успешно парсит их)

# A1 vocabulary

In [20]:
basic_vocabulary = []
with open("./materials/A1_vocab_processed.txt", "r",encoding = "ISO-8859-1") as voc:
    for word in voc.readlines():
        basic_vocabulary.append(word[:-1].lower())
#basic_vocabulary = set(basic_vocabulary)
#basic_vocabulary

adjectives = []
with open("./materials/common_adj.txt", "r") as common_adj:
    for word in common_adj.readlines():
        adjectives.append(word[:-1].lower())
        
common_uncountable = []
with open("./materials/common_unountable_manually_filtered.txt", "r") as common_unctbl:
    for word in common_unctbl.readlines():
        common_uncountable.append(word[:-1].lower())

countries = []
with open("./materials/countries.txt", "r") as cntr:
    for word in cntr.readlines():
        countries.append(word[:-1].lower())

names = []
with open("./materials/names.txt", "r") as names_file:
    for word in names_file.readlines():
        names.append(word[:-1].lower())
        
print(len(basic_vocabulary), len(adjectives), len(common_uncountable), len(countries), len(countries), len(names))
final_basic_vocabulary = basic_vocabulary
final_basic_vocabulary.extend(adjectives)
final_basic_vocabulary.extend(common_uncountable)
final_basic_vocabulary.extend(countries)
final_basic_vocabulary.extend(names)
len(final_basic_vocabulary)

660 50 81 196 196 3309


4296

# АНАЛИЗИРУЕМ ЛЕКСИКУ (А1)

In [23]:
def vocabulary_analysis(text_map_input, dictionary):
    text_map = copy.deepcopy(text_map_input)
    a1_vocab = []
    other_vocab = []
    a1_weight = 0
    other_weight = 0
    for sentence in text_map:
        for word in sentence:
            low_lemma = word['lemma'].lower()
            low_lemma_clean = ''
            for char in low_lemma:
                if char not in full_punctuation:
                    low_lemma_clean += char
            #print(low_lemma_clean)
            
            if(low_lemma_clean not in dictionary):
                other_vocab.append((low_lemma_clean,word['vocabulary_prop']['vocab_importane']))
                other_weight += word['vocabulary_prop']['vocab_importane']
            else:
                a1_vocab.append((low_lemma_clean,word['vocabulary_prop']['vocab_importane']))
                a1_weight += word['vocabulary_prop']['vocab_importane']
    print(a1_weight, a1_vocab)
    print("OTHER VOCAB")
    print(other_weight, other_vocab)
            
vocabulary_analysis(text_map_dep, final_basic_vocabulary)            

120.90031694118673 [('this', 0.508581756852006), ('be', 0.22464279715435767), ('my', 0.508581756852006), ('friend', 2.54290878426003), ('jimmy', 0.41661661450144716), ('he', 0.5167333802070573), ('be', 0.2552421022709116), ('from', 0.5778572847200745), ('india', 2.311429138880298), ('jimmy', 0.5996568480082168), ('be', 0.32333946122271945), ('a', 0), ('the', 0.18900818785997395), ('best', 0.2960901905184123), ('breakfast', 1.1843607620736492), ('for', 0.24254918918919313), ('he', 0.529541424979515), ('be', 0.1307843384311131), ('a', 0), ('glass', 2.072631333628886), ('of', 0.21122971116053832), ('orange', 0.2960901905184123), ('juice', 0.8882705715552368), ('two', 0.2647707124897575), ('apple', 0.5921803810368246), ('and', 0.14045235727504188), ('three', 0.2647707124897575), ('banana', 0.8882705715552368), ('jimmy', 0.40251788077416106), ('like', 0.9827416566832959), ('very', 0.40251788077416106), ('much', 0.9827416566832959), ('a', 0), ('be', 0.20121124639936316), ('outside', 0.455533

# Grammar

In [71]:
model_test = Model('./UDPIPE/english-ud-2.0-170801.udpipe')

sentences_test = model_test.tokenize("He doesn't go there")
for s in sentences_test:
    model.tag(s)
    model.parse(s)
conllu_test = model.write(sentences_test, "conllu")
for line in conllu_test.split('\n'):
    if line:
        if line[0].isdigit():
            print(line.split('\t'))
        else:
            print(line)

# newdoc
# newpar
# sent_id = 1
# text = He doesn't go there
['1', 'He', 'he', 'PRON', 'PE', 'Gender=Masc|Number=Sing|Person=3|PronType=Prs', '4', 'nsubj', '_', '_']
['2', 'does', 'do', 'AUX', 'VM', 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin', '4', 'aux', '_', 'SpaceAfter=No']
['3', "n't", "n't", 'ADV', 'B', '_', '4', 'advmod', '_', '_']
['4', 'go', 'go', 'VERB', 'V', 'Mood=Ind|Number=Plur|Tense=Pres|VerbForm=Fin', '0', 'root', '_', '_']
['5', 'there', 'there', 'ADV', 'B', '_', '4', 'advmod', '_', 'SpaceAfter=No']


In [None]:
model_partut = Model('./UDPIPE/english-partut-ud-2.0-170801.udpipe')
model_ud = Model('./UDPIPE/english-ud-2.0-170801.udpipe')

# УЧИТЫВАТЬ ГРАММАТИКУ И ИЗВЕСТНОСТЬ СЛОВА В СОВОКУПНОСТИ, СНАЧАЛА НАЛОЖИТЬ ГРАММАТИКУ А ПОТОМ УЖЕ СМОТРЕТЬ ВОКАБУЛЯР

In [None]:
        grammar_properties_log = {}
        for word in sentence: 
            if (word[3] != 'PUNCT'):
                #print(word[1], "head_word_nominal_index =", word[6])
                #PRESENT SIPMLE POSITIVE
                if(word[3] == 'VERB'):
                    grammar = word[5].split("|")
                    for gr_unit in grammar:
                        if ("Tense" in gr_unit):
                            tense = gr_unit.split("=")[1]
                            if(tense == "Pres"):
                                grammar_properties_log [str(word[0])] = "PrSmpl+"
                            print(tense)
                    print(word[2], grammar)

# ИМПЛЕМЕНТИТЬ ПРИСВОЕНИЕ ВРЕМЕНИ ВСЕМ СЛОВАМ ВНУТРИ ПОДДЕРЕВА ДЛЯ КОТОРЫХ ВРЕМЯ НЕ ПОДТЯНУЛОСЬ НА ОСНОВАНИИ СУЩЕСТВУЮЩИХ МАРКЕРОВ ВНУТРИ ПОДДЕРЕВА

In [27]:
conllu_text_map_ex

[[['1', '"', '"', 'PUNCT', 'FB', '_', '5', 'punct', '_', 'SpaceAfter=No'],
  ['2',
   'This',
   'this',
   'PRON',
   'PD',
   'Number=Sing|PronType=Dem',
   '5',
   'nsubj',
   '_',
   '_'],
  ['3',
   'is',
   'be',
   'AUX',
   'VA',
   'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin',
   '5',
   'cop',
   '_',
   '_'],
  ['4',
   'my',
   'my',
   'DET',
   'AP',
   'Number=Sing|Poss=Yes|PronType=Prs',
   '5',
   'nmod:poss',
   '_',
   '_'],
  ['5', 'friend', 'friend', 'NOUN', 'S', 'Number=Sing', '0', 'root', '_', '_'],
  ['6',
   'Jimmy',
   'Jimmy',
   'PROPN',
   'SP',
   '_',
   '5',
   'nmod',
   '_',
   'SpaceAfter=No'],
  ['7', '.', '.', 'PUNCT', 'FS', '_', '5', 'punct', '_', '_']],
 [['1',
   'He',
   'he',
   'PRON',
   'PE',
   'Gender=Masc|Number=Sing|Person=3|PronType=Prs',
   '4',
   'nsubj',
   '_',
   '_'],
  ['2',
   'is',
   'be',
   'AUX',
   'VA',
   'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin',
   '4',
   'cop',
   '_',
   '_'],
  ['3', 'from

In [25]:
text_map_dep

[[OrderedDict([('word', 'This'),
               ('lemma', 'this'),
               ('vocabulary_prop',
                OrderedDict([('tf_idf', 0.508581756852006),
                             ('nominal_index', '2'),
                             ('dep_words_count', 1),
                             ('vocab_importane', 0.508581756852006)]))]),
  OrderedDict([('word', 'is'),
               ('lemma', 'be'),
               ('vocabulary_prop',
                OrderedDict([('tf_idf', 0.22464279715435767),
                             ('nominal_index', '3'),
                             ('dep_words_count', 1),
                             ('vocab_importane', 0.22464279715435767)]))]),
  OrderedDict([('word', 'my'),
               ('lemma', 'my'),
               ('vocabulary_prop',
                OrderedDict([('tf_idf', 0.508581756852006),
                             ('nominal_index', '4'),
                             ('dep_words_count', 1),
                             ('vocab_importane', 0.5

In [62]:
def grammar_analysis(conllu_map,text_map_input):
    assert len(conllu_map) == len(text_map_input) #sentences count is equal
    text_map = copy.deepcopy(text_map_input)
    
    for sentence_conllu, text_map_sentence in zip(conllu_map,text_map):
        #СОБИРАЕМ СПИСОК СВАОЙСТВ ВСЕХ СЛОВ ПРЕДЛОЖЕНИЯ ДЛЯ ПОСЛЕДУЮЩЕГО ОБРАЩЕНИЯ
        pos_word_dict = {}
        for pos_word in sentence_conllu:
            if (pos_word[3] != 'PUNCT'):
                pos_word_dict[pos_word[0]] = (pos_word[0]+'_'+pos_word[1], pos_word[2:])
        
        print("POS", pos_word_dict)# словарь "номинальный индекс слова" (номинальный-индекс_слово,  остальные conllu based свойства)
        noun_phrase_sentence = False
        
        #СТРОИМ ПОДДЕРЕВЬЯ ГЛАГОЛЬНЫХ ГРУПП
        verb_phrases_dict = {}#словарь "индекс_слово(глагол в верштне)" [список conllu based свойств зависимых эл-тов]
        for word_leave in sentence_conllu: 
            if (word_leave[3] != 'PUNCT'):
                #print(word[1], "head_word_nominal_index =", word[6])
                if (word_leave[7] == "root" and  word_leave[3] != 'VERB'):
                    print("NOUN PHRASE BASED SENTENCE")
                    noun_phrase_sentence = True
                    break
                    
                head_word_nominal_index = word_leave[6]#смотрим на иноминальный индекс главного элемента 
                if (int(head_word_nominal_index)!= 0):
                    current_head_word = pos_word_dict[head_word_nominal_index][0]
                    current_head_pos = pos_word_dict[head_word_nominal_index][1][1]
                    
                    if(current_head_word in verb_phrases_dict):
                        verb_phrases_dict[current_head_word].append(word_leave)
                    else:
                        #print(head_word_nominal_index)
                        if(current_head_pos == "VERB"):
                            verb_phrases_dict[current_head_word] = []
                            verb_phrases_dict[current_head_word].append(word_leave)
                    
        if(noun_phrase_sentence):
            print("SKIPPING THIS SENTENCE")
            continue
        else:
            print("VERB SUBTREES")
            for key, value in verb_phrases_dict.items():
                print(key)
                for el in value:
                    print(el)
                    
        # АНАЛИЗ ПОДДЕРЕВЬЕВ, ПРИСВОЕНИЕ ВРЕМЕНИ, ЗАПИСЬ В ЛОГ
        grammar_properties_log = {}
        undefined_tense_stack = []
        #head_of_subtree_index is from pos_word_dict
        for head_of_subtree_plus_index, dependent_elements in verb_phrases_dict.items():
            #исследуем корень поддерева
            head_of_subtree_index = head_of_subtree_plus_index.split("_")[0]
            print(head_of_subtree_index)
            head_properties = pos_word_dict[head_of_subtree_index]
            print(head_properties)
            if ("Tense" in head_properties[1][3]):#случай когда глагол уже маркирован временем
                # в такой ситуации можно переходить на следующий ключ и сохранять всю херню в логи
                grammar = head_properties[1][3].split("|")
                for gr_unit in grammar:
                    if ("Tense" in gr_unit):
                        tense = gr_unit.split("=")[1]
                        if(tense == "Pres"):
                            grammar_properties_log [head_of_subtree_index] = "PrSmpl"
                        print(tense)
                    for dep_el in dependent_elements:
                        if (dep_el[3] == "AUX" or dep_el[3] == "PART"):
                            dep_el_nominal_index = dep_el[0]
                            grammar_properties_log [dep_el_nominal_index] = "PrSmpl"
            else:
                undefined_tense_stack.append(head_of_subtree_index)
                
                
                
    
            #исследуем элементы поддерева
            #for dep_el in dependent_elements:
                
        
        print("grammar_properties_log", grammar_properties_log)        
        for map_word in text_map_sentence:
            word_index = map_word['vocabulary_prop']['nominal_index']
            if (str(word_index) in grammar_properties_log):
                map_word['grammar_prop'] = grammar_properties_log [word_index]
                
    return text_map
            
#grammar_analysis(conllu_text_map_ex,text_map_dep)

In [63]:
def get_map(text_line,model):
    conllu = get_conllu(text_line, model, print_output = True)
    conllu_text_map = get_conllu_text_map(conllu)
    #print(conllu_text_map)
    
    tfidf = False
    if tfidf:
        lemm_sentences = lemmatize_from_udmap(conllu_text_map)
        tf_idf_dict = get_tf_idf_dict (lemm_sentences)
    else:
        tf_idf_dict = None
    
    text_map = create_map(conllu_text_map, tf_idf_dict, apply_tf_idf = False)
    
    text_map_dep = get_dependencies(conllu_text_map, text_map)
    #vocabulary_analysis(text_map_dep, final_basic_vocabulary)
    text_map_gramm = grammar_analysis(conllu_text_map, text_map_dep)
    
    return text_map_gramm
get_map("He will there", model)

# newdoc
# newpar
# sent_id = 1
# text = He does not go there
['1', 'He', 'he', 'PRON', 'PE', 'Gender=Masc|Number=Sing|Person=3|PronType=Prs', '4', 'nsubj', '_', '_']
['2', 'does', 'do', 'AUX', 'VM', 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin', '4', 'aux', '_', '_']
['3', 'not', 'not', 'PART', 'PART', 'Polarity=Neg', '4', 'advmod', '_', '_']
['4', 'go', 'go', 'VERB', 'V', 'Mood=Ind|Number=Plur|Tense=Pres|VerbForm=Fin', '0', 'root', '_', '_']
['5', 'there', 'there', 'ADV', 'B', '_', '4', 'advmod', '_', 'SpaceAfter=No']
POS {'1': ('1_He', ['he', 'PRON', 'PE', 'Gender=Masc|Number=Sing|Person=3|PronType=Prs', '4', 'nsubj', '_', '_']), '2': ('2_does', ['do', 'AUX', 'VM', 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin', '4', 'aux', '_', '_']), '3': ('3_not', ['not', 'PART', 'PART', 'Polarity=Neg', '4', 'advmod', '_', '_']), '4': ('4_go', ['go', 'VERB', 'V', 'Mood=Ind|Number=Plur|Tense=Pres|VerbForm=Fin', '0', 'root', '_', '_']), '5': ('5_there', ['there', 'ADV', 'B', '_',

[[OrderedDict([('word', 'He'),
               ('lemma', 'he'),
               ('vocabulary_prop',
                OrderedDict([('tf_idf', 0),
                             ('nominal_index', '1'),
                             ('dep_words_count', 1),
                             ('vocab_importane', 0)]))]),
  OrderedDict([('word', 'does'),
               ('lemma', 'do'),
               ('vocabulary_prop',
                OrderedDict([('tf_idf', 0),
                             ('nominal_index', '2'),
                             ('dep_words_count', 1),
                             ('vocab_importane', 0)])),
               ('grammar_prop', 'PrSmpl')]),
  OrderedDict([('word', 'not'),
               ('lemma', 'not'),
               ('vocabulary_prop',
                OrderedDict([('tf_idf', 0),
                             ('nominal_index', '3'),
                             ('dep_words_count', 1),
                             ('vocab_importane', 0)])),
               ('grammar_prop', 'PrSm

In [102]:
for sentence in conllu_text_map_ex:
    
    for word in sentence:
        print()
    print()

['1', '"', '"', 'PUNCT', 'FB', '_', '5', 'punct', '_', 'SpaceAfter=No']
['2', 'This', 'this', 'PRON', 'PD', 'Number=Sing|PronType=Dem', '5', 'nsubj', '_', '_']
['3', 'is', 'be', 'AUX', 'VA', 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin', '5', 'cop', '_', '_']
['4', 'my', 'my', 'DET', 'AP', 'Number=Sing|Poss=Yes|PronType=Prs', '5', 'nmod:poss', '_', '_']
['5', 'friend', 'friend', 'NOUN', 'S', 'Number=Sing', '0', 'root', '_', '_']
['6', 'Jimmy', 'Jimmy', 'PROPN', 'SP', '_', '5', 'nmod', '_', 'SpaceAfter=No']
['7', '.', '.', 'PUNCT', 'FS', '_', '5', 'punct', '_', '_']

['1', 'He', 'he', 'PRON', 'PE', 'Gender=Masc|Number=Sing|Person=3|PronType=Prs', '4', 'nsubj', '_', '_']
['2', 'is', 'be', 'AUX', 'VA', 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin', '4', 'cop', '_', '_']
['3', 'from', 'from', 'ADP', 'E', '_', '4', 'case', '_', '_']
['4', 'India', 'India', 'PROPN', 'SP', '_', '0', 'root', '_', 'SpaceAfter=No']
['5', '.', '.', 'PUNCT', 'FS', '_', '4', 'punct', '_', '_']

