In [1]:
import numpy as np

In [2]:
#from ufal.udpipe import Model, Pipeline, ProcessingError

from string import punctuation
full_punctuation = punctuation + "–" + "," + "»" + "«" + "…" +'’'

from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

from collections import OrderedDict

import copy

In [21]:
#model = Model('./UDPIPE/english-lines-ud-2.0-170801.udpipe')

In [2]:
import ufal.udpipe

In [1]:
import ufal.udpipe
# ufal.udpipe.Model etc. are SWIG-magic and cannot be detected by pylint
# pylint: disable=no-member

class Model:
    def __init__(self, path):
        """Load given model."""
        self.model = ufal.udpipe.Model.load(path)
        if not self.model:
            raise Exception("Cannot load UDPipe model from file '%s'" % path)

    def tokenize(self, text):
        """Tokenize the text and return list of ufal.udpipe.Sentence-s."""
        tokenizer = self.model.newTokenizer(self.model.DEFAULT)
        if not tokenizer:
            raise Exception("The model does not have a tokenizer")
        return self._read(text, tokenizer)

    def read(self, text, in_format):
        """Load text in the given format (conllu|horizontal|vertical) and return list of ufal.udpipe.Sentence-s."""
        input_format = ufal.udpipe.InputFormat.newInputFormat(in_format)
        if not input_format:
            raise Exception("Cannot create input format '%s'" % in_format)
        return self._read(text, input_format)

    def _read(self, text, input_format):
        input_format.setText(text)
        error = ufal.udpipe.ProcessingError()
        sentences = []

        sentence = ufal.udpipe.Sentence()
        while input_format.nextSentence(sentence, error):
            sentences.append(sentence)
            sentence = ufal.udpipe.Sentence()
        if error.occurred():
            raise Exception(error.message)

        return sentences

    def tag(self, sentence):
        """Tag the given ufal.udpipe.Sentence (inplace)."""
        self.model.tag(sentence, self.model.DEFAULT)

    def parse(self, sentence):
        """Parse the given ufal.udpipe.Sentence (inplace)."""
        self.model.parse(sentence, self.model.DEFAULT)

    def write(self, sentences, out_format):
        """Write given ufal.udpipe.Sentence-s in the required format (conllu|horizontal|vertical)."""

        output_format = ufal.udpipe.OutputFormat.newOutputFormat(out_format)
        output = ''
        for sentence in sentences:
            output += output_format.writeSentence(sentence)
        output += output_format.finishDocument()

        return output

# Can be used as
#  model = Model('english-ud-1.2-160523.udpipe')
#  sentences = model.tokenize("Hi there. How are you?")
#  for s in sentences:
#      model.tag(s)
#      model.parse(s)
#  conllu = model.write(sentences, "conllu")

In [3]:
model_test = Model('./UDPIPE/english-ud-2.0-170801.udpipe')

sentences_test = model_test.tokenize("I like going there")
for s in sentences_test:
    model_test.tag(s)
    model_test.parse(s)
conllu_test = model_test.write(sentences_test, "conllu")
for line in conllu_test.split('\n'):
    if line:
        if line[0].isdigit():
            print(line.split('\t'))
        else:
            print(line)

# newdoc
# newpar
# sent_id = 1
# text = I like going there
['1', 'I', 'I', 'PRON', 'PRP', 'Case=Nom|Number=Sing|Person=1|PronType=Prs', '2', 'nsubj', '_', '_']
['2', 'like', 'like', 'VERB', 'VBP', 'Mood=Ind|Tense=Pres|VerbForm=Fin', '0', 'root', '_', '_']
['3', 'going', 'go', 'VERB', 'VBG', 'VerbForm=Ger', '2', 'xcomp', '_', '_']
['4', 'there', 'there', 'ADV', 'RB', 'PronType=Dem', '3', 'advmod', '_', 'SpaceAfter=No']


In [24]:
with open ("./UDPIPE/text_0.txt", "r") as f:
    text = ''
    for line in f.readlines():
        text += line + ' '
        text.strip()
    #print(text)
    
model = Model('./UDPIPE/english-ud-2.0-170801.udpipe')# ==== ./UDPIPE/english-partut-ud-2.0-170801.udpipe
#sentences = model.tokenize("Hi there. How are you?")
sentences = model.tokenize(text)
for s in sentences:
    model.tag(s)
    model.parse(s)
conllu = model.write(sentences, "conllu")

# sentenceses conlluu map

In [4]:
def get_conllu(text_line, model, print_output = False):
    sentences = model.tokenize(text_line)
    for s in sentences:
        model.tag(s)
        model.parse(s)
    conllu = model.write(sentences, "conllu")
    if (print_output):
        for line in conllu.split('\n'):
            if line:
                if line[0].isdigit():
                    print(line.split('\t'))
                else:
                    print(line)
    return conllu

In [5]:
def get_conllu_text_map(conllu_parsed_object):
    conllu_text_map = []
    conllu_sentence_map = []
    for line in conllu_parsed_object.split('\n'):
        if line :
            if line[0].isdigit():
                #print(line.split('\t'))
                split_items = line.split('\t')
                if split_items[3] != "PUNCT":
                    conllu_sentence_map.append(split_items)
            else:
                if(len(conllu_sentence_map) > 0):
                    conllu_text_map.append(conllu_sentence_map)
                    conllu_sentence_map = []   
                    #print("appended")
    if(len(conllu_sentence_map) > 0):
        conllu_text_map.append(conllu_sentence_map)
    return conllu_text_map
#conllu_text_map_ex = get_conllu_text_map(conllu)

In [65]:
conllu_text_map_ex

[[['2',
   'This',
   'this',
   'PRON',
   'DT',
   'Number=Sing|PronType=Dem',
   '6',
   'nsubj',
   '_',
   '_'],
  ['3',
   'is',
   'be',
   'AUX',
   'VBZ',
   'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin',
   '6',
   'cop',
   '_',
   '_'],
  ['4',
   'my',
   'my',
   'PRON',
   'PRP$',
   'Number=Sing|Person=1|Poss=Yes|PronType=Prs',
   '6',
   'nmod:poss',
   '_',
   '_'],
  ['5',
   'friend',
   'friend',
   'NOUN',
   'NN',
   'Number=Sing',
   '6',
   'compound',
   '_',
   '_'],
  ['6',
   'Jimmy',
   'Jimmy',
   'PROPN',
   'NNP',
   'Number=Sing',
   '0',
   'root',
   '_',
   'SpaceAfter=No']],
 [['1',
   'He',
   'he',
   'PRON',
   'PRP',
   'Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs',
   '4',
   'nsubj',
   '_',
   '_'],
  ['2',
   'is',
   'be',
   'AUX',
   'VBZ',
   'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin',
   '4',
   'cop',
   '_',
   '_'],
  ['3', 'from', 'from', 'ADP', 'IN', '_', '4', 'case', '_', '_'],
  ['4',
   'India'

In [66]:
for sentence in conllu_text_map_ex:
    for word in sentence:
        print(word)
    print()

['2', 'This', 'this', 'PRON', 'DT', 'Number=Sing|PronType=Dem', '6', 'nsubj', '_', '_']
['3', 'is', 'be', 'AUX', 'VBZ', 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin', '6', 'cop', '_', '_']
['4', 'my', 'my', 'PRON', 'PRP$', 'Number=Sing|Person=1|Poss=Yes|PronType=Prs', '6', 'nmod:poss', '_', '_']
['5', 'friend', 'friend', 'NOUN', 'NN', 'Number=Sing', '6', 'compound', '_', '_']
['6', 'Jimmy', 'Jimmy', 'PROPN', 'NNP', 'Number=Sing', '0', 'root', '_', 'SpaceAfter=No']

['1', 'He', 'he', 'PRON', 'PRP', 'Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs', '4', 'nsubj', '_', '_']
['2', 'is', 'be', 'AUX', 'VBZ', 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin', '4', 'cop', '_', '_']
['3', 'from', 'from', 'ADP', 'IN', '_', '4', 'case', '_', '_']
['4', 'India', 'India', 'PROPN', 'NNP', 'Number=Sing', '0', 'root', '_', 'SpaceAfter=No']

['1', 'Jimmy', 'Jimmy', 'PROPN', 'NNP', 'Number=Sing', '4', 'nsubj', '_', '_']
['2', 'is', 'be', 'AUX', 'VBZ', 'Mood=Ind|Number=Sing|Person=

# LEMM TEXT FROM UDPIPE MAP

In [6]:
def lemmatize_from_udmap(conllu_map):
    sentences_list = []
    for sentence in conllu_map:
        line = ''
        for word in sentence: 
            line += word[2] + ' '
        sentences_list.append(line.strip())
        #print()
    return sentences_list
#lemm_sentences = lemmatize_from_udmap(conllu_text_map_ex)

# TF_IDF

СТОП СЛОВА ВРЕМЕННО УБИРАЕМ - ОСТАВИТЬ НА ОБСУЖДЕНИЕ СПИСОК


ВОЗВРАЩАЕМ СТОП СЛОВА В ВИДУ ВВЕДЕНИЯ ШТРАФОВ ЗА КОНТЕКСТ

In [127]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nigula/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [11]:
from nltk.corpus import stopwords
stopWords = set(stopwords.words('english'))

In [10]:
def get_tf_idf_dict(lemm_text_list, save_to_csv = False):
    vect = TfidfVectorizer(stop_words = stopWords)
    tfidf_matrix = vect.fit_transform(lemm_text_list)
    df = pd.DataFrame(tfidf_matrix.toarray(), columns = vect.get_feature_names())
    #print(df.head())
    if (save_to_csv): df.to_csv("./text_0_tfidf.xlsx", sep = '\t')
    tf_idf_dict = df.to_dict()
    return tf_idf_dict
#tf_idf_dict_example = get_tf_idf_dict (lemm_sentences, save_to_csv = False)

In [207]:
tf_idf_dict_example

{'about': {0: 0.0,
  1: 0.0,
  2: 0.0,
  3: 0.0,
  4: 0.0,
  5: 0.0,
  6: 0.0,
  7: 0.0,
  8: 0.0,
  9: 0.0,
  10: 0.0,
  11: 0.0,
  12: 0.0,
  13: 0.0,
  14: 0.0,
  15: 0.0,
  16: 0.0,
  17: 0.0,
  18: 0.0,
  19: 0.0,
  20: 0.0,
  21: 0.0,
  22: 0.0,
  23: 0.0,
  24: 0.0,
  25: 0.0,
  26: 0.0,
  27: 0.0,
  28: 0.0,
  29: 0.0,
  30: 0.32237919275526306,
  31: 0.0},
 'accompany': {0: 0.0,
  1: 0.0,
  2: 0.0,
  3: 0.0,
  4: 0.0,
  5: 0.0,
  6: 0.0,
  7: 0.0,
  8: 0.0,
  9: 0.0,
  10: 0.0,
  11: 0.0,
  12: 0.0,
  13: 0.0,
  14: 0.0,
  15: 0.0,
  16: 0.0,
  17: 0.28834441866167737,
  18: 0.0,
  19: 0.0,
  20: 0.0,
  21: 0.0,
  22: 0.0,
  23: 0.0,
  24: 0.0,
  25: 0.0,
  26: 0.0,
  27: 0.0,
  28: 0.0,
  29: 0.0,
  30: 0.0,
  31: 0.0},
 'add': {0: 0.0,
  1: 0.0,
  2: 0.0,
  3: 0.0,
  4: 0.0,
  5: 0.0,
  6: 0.0,
  7: 0.0,
  8: 0.0,
  9: 0.0,
  10: 0.0,
  11: 0.0,
  12: 0.0,
  13: 0.0,
  14: 0.0,
  15: 0.0,
  16: 0.0,
  17: 0.0,
  18: 0.0,
  19: 0.0,
  20: 0.0,
  21: 0.0,
  22: 0.0,
  23: 0.0,

# Text map

# ГРАММАТИКУ УЧИТЫВАЕМ ОТДЕЛЬНО НЕ ССЫЛАЯСЬ НА ТФ ИДФ 
ЕСЛИ ГРАММАТИКА ИДЕТ В КУПЕ С ТФИДФ ТО НАКЛАДЫВАЕМ ДОПОЛНИТЕЛЬНЫЙ ШТРАФ

In [8]:
pos_exclude_list = ["AUX","SYM","CCONJ","X","DET","NUM","PART","SCON","INTJ","PROPN"]

In [7]:
def create_map(conllu_map, tf_idf_dict, apply_tf_idf = True):
    text_map = []
    sentence_ind = 0
    for sentence in conllu_map:
        sentence_map = []
        real_index = 1
        for word in sentence: 
            weight = OrderedDict([("word", word[1]),("lemma",word[2]), ("vocabulary_prop",(OrderedDict([("vocab_importane", 0),("nominal_index",word[0])])))])
            
            lemma_lower = word[2].lower()
            if (apply_tf_idf):
                if (lemma_lower in tf_idf_dict):
                    tf_idf_i = tf_idf_dict[lemma_lower][sentence_ind]
                    if(word[3] not in pos_exclude_list):
                        weight['vocabulary_prop']["vocab_importane"] = tf_idf_i
                    elif(tf_idf_i > 0 ):
                        #print(word)
                        weight['vocabulary_prop']["vocab_importane"] = tf_idf_i * 0.5
            sentence_map.append(weight)
        text_map.append(sentence_map)
        sentence_ind += 1
    return text_map
#text_map_ex = create_map(conllu_text_map_ex, tf_idf_dict_example)


In [238]:
text_map_ex

[[OrderedDict([('word', 'This'),
               ('lemma', 'this'),
               ('vocabulary_prop',
                OrderedDict([('vocab_importane', 0),
                             ('nominal_index', '2')]))]),
  OrderedDict([('word', 'is'),
               ('lemma', 'be'),
               ('vocabulary_prop',
                OrderedDict([('vocab_importane', 0),
                             ('nominal_index', '3')]))]),
  OrderedDict([('word', 'my'),
               ('lemma', 'my'),
               ('vocabulary_prop',
                OrderedDict([('vocab_importane', 0),
                             ('nominal_index', '4')]))]),
  OrderedDict([('word', 'friend'),
               ('lemma', 'friend'),
               ('vocabulary_prop',
                OrderedDict([('vocab_importane', 0.774119582428778),
                             ('nominal_index', '5')]))]),
  OrderedDict([('word', 'Jimmy'),
               ('lemma', 'Jimmy'),
               ('vocabulary_prop',
                OrderedDict([('v

In [39]:
"AUX" in pos_exclude_list

True

In [192]:
for sentence in conllu_text_map_ex:
    for word in sentence:
        print(word)
    print()

['2', 'This', 'this', 'PRON', 'DT', 'Number=Sing|PronType=Dem', '6', 'nsubj', '_', '_']
['3', 'is', 'be', 'AUX', 'VBZ', 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin', '6', 'cop', '_', '_']
['4', 'my', 'my', 'PRON', 'PRP$', 'Number=Sing|Person=1|Poss=Yes|PronType=Prs', '6', 'nmod:poss', '_', '_']
['5', 'friend', 'friend', 'NOUN', 'NN', 'Number=Sing', '6', 'compound', '_', '_']
['6', 'Jimmy', 'Jimmy', 'PROPN', 'NNP', 'Number=Sing', '0', 'root', '_', 'SpaceAfter=No']

['1', 'He', 'he', 'PRON', 'PRP', 'Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs', '4', 'nsubj', '_', '_']
['2', 'is', 'be', 'AUX', 'VBZ', 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin', '4', 'cop', '_', '_']
['3', 'from', 'from', 'ADP', 'IN', '_', '4', 'case', '_', '_']
['4', 'India', 'India', 'PROPN', 'NNP', 'Number=Sing', '0', 'root', '_', 'SpaceAfter=No']

['1', 'Jimmy', 'Jimmy', 'PROPN', 'NNP', 'Number=Sing', '4', 'nsubj', '_', '_']
['2', 'is', 'be', 'AUX', 'VBZ', 'Mood=Ind|Number=Sing|Person=

# Contractions (судя по всему не нужно тк udpipe успешно парсит их)

# A1 vocabulary

In [15]:
basic_vocabulary = []
with open("./materials/A1_vocab_processed.txt", "r",encoding = "ISO-8859-1") as voc:
    for word in voc.readlines():
        basic_vocabulary.append(word[:-1].lower())
#basic_vocabulary = set(basic_vocabulary)
#basic_vocabulary

adjectives = []
with open("./materials/common_adj.txt", "r") as common_adj:
    for word in common_adj.readlines():
        adjectives.append(word[:-1].lower())
        
common_uncountable = []
with open("./materials/common_unountable_manually_filtered.txt", "r") as common_unctbl:
    for word in common_unctbl.readlines():
        common_uncountable.append(word[:-1].lower())

countries = []
with open("./materials/countries.txt", "r") as cntr:
    for word in cntr.readlines():
        countries.append(word[:-1].lower())

names = []
with open("./materials/names.txt", "r") as names_file:
    for word in names_file.readlines():
        names.append(word[:-1].lower())
        
print(len(basic_vocabulary), len(adjectives), len(common_uncountable), len(countries), len(countries), len(names))
final_basic_vocabulary = basic_vocabulary
final_basic_vocabulary.extend(adjectives)
final_basic_vocabulary.extend(common_uncountable)
final_basic_vocabulary.extend(countries)
final_basic_vocabulary.extend(names)
print(len(final_basic_vocabulary))
final_basic_vocabulary = set(final_basic_vocabulary)

660 50 81 196 196 3309
4296


# АНАЛИЗИРУЕМ ЛЕКСИКУ (А1)

In [240]:
def vocabulary_analysis(text_map_input, dictionary):
    text_map = copy.deepcopy(text_map_input)
    a1_vocab = []
    other_vocab = []
    a1_weight = 0
    other_weight = 0
    for sentence in text_map:
        for word in sentence:
            low_lemma = word['lemma'].lower()
            low_lemma_clean = ''
            for char in low_lemma:
                if char not in full_punctuation:
                    low_lemma_clean += char
            #print(low_lemma_clean)
            
            if(low_lemma_clean not in dictionary):
                other_vocab.append((low_lemma_clean,word['vocabulary_prop']['vocab_importane']))
                other_weight += word['vocabulary_prop']['vocab_importane']
            else:
                a1_vocab.append((low_lemma_clean,word['vocabulary_prop']['vocab_importane']))
                a1_weight += word['vocabulary_prop']['vocab_importane']
    print(a1_weight, a1_vocab)
    print("OTHER VOCAB")
    print(other_weight, other_vocab)
            
vocabulary_analysis(text_map_ex, final_basic_vocabulary)            

38.39833086455389 [('this', 0), ('be', 0), ('my', 0), ('friend', 0.774119582428778), ('jimmy', 0.31651969610922104), ('he', 0), ('be', 0), ('from', 0), ('india', 0.5), ('jimmy', 0.31651969610922104), ('be', 0), ('a', 0), ('the', 0), ('best', 0.3410705106931752), ('breakfast', 0.3410705106931752), ('for', 0), ('he', 0), ('be', 0), ('a', 0), ('glass', 0.3410705106931752), ('of', 0), ('orange', 0.3410705106931752), ('juice', 0.3410705106931752), ('two', 0.1523549913641152), ('apple', 0.3410705106931752), ('and', 0), ('three', 0.1523549913641152), ('banana', 0.3410705106931752), ('jimmy', 0.21959565789953497), ('like', 0.5370702079080142), ('very', 0), ('much', 0.5370702079080142), ('a', 0), ('be', 0), ('green', 0.21308855008252736), ('outside', 0.4770320942477113), ('and', 0), ('red', 0.4770320942477113), ('inside', 0.4261771001650547), ('it', 0), ('be', 0), ('hard', 0.5978117297300675), ('and', 0), ('inside', 0.5340807725375345), ('it', 0), ('be', 0), ('and', 0), ('sweet', 0.633039392218

# Grammar

In [71]:
model_test = Model('./UDPIPE/english-ud-2.0-170801.udpipe')

sentences_test = model_test.tokenize("He doesn't go there")
for s in sentences_test:
    model.tag(s)
    model.parse(s)
conllu_test = model.write(sentences_test, "conllu")
for line in conllu_test.split('\n'):
    if line:
        if line[0].isdigit():
            print(line.split('\t'))
        else:
            print(line)

# newdoc
# newpar
# sent_id = 1
# text = He doesn't go there
['1', 'He', 'he', 'PRON', 'PE', 'Gender=Masc|Number=Sing|Person=3|PronType=Prs', '4', 'nsubj', '_', '_']
['2', 'does', 'do', 'AUX', 'VM', 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin', '4', 'aux', '_', 'SpaceAfter=No']
['3', "n't", "n't", 'ADV', 'B', '_', '4', 'advmod', '_', '_']
['4', 'go', 'go', 'VERB', 'V', 'Mood=Ind|Number=Plur|Tense=Pres|VerbForm=Fin', '0', 'root', '_', '_']
['5', 'there', 'there', 'ADV', 'B', '_', '4', 'advmod', '_', 'SpaceAfter=No']


In [None]:
model_partut = Model('./UDPIPE/english-partut-ud-2.0-170801.udpipe')
model_ud = Model('./UDPIPE/english-ud-2.0-170801.udpipe')

# УЧИТЫВАТЬ ГРАММАТИКУ И ИЗВЕСТНОСТЬ СЛОВА В СОВОКУПНОСТИ, СНАЧАЛА НАЛОЖИТЬ ГРАММАТИКУ А ПОТОМ УЖЕ СМОТРЕТЬ ВОКАБУЛЯР

In [None]:
        grammar_properties_log = {}
        for word in sentence: 
            #if (word[3] != 'PUNCT'):
            #print(word[1], "head_word_nominal_index =", word[6])
            #PRESENT SIPMLE POSITIVE
            if(word[3] == 'VERB'):
                grammar = word[5].split("|")
                for gr_unit in grammar:
                    if ("Tense" in gr_unit):
                        tense = gr_unit.split("=")[1]
                        if(tense == "Pres"):
                            grammar_properties_log [str(word[0])] = "PrSmpl+"
                        print(tense)
                print(word[2], grammar)

# ИМПЛЕМЕНТИТЬ ПРИСВОЕНИЕ ВРЕМЕНИ ВСЕМ СЛОВАМ ВНУТРИ ПОДДЕРЕВА ДЛЯ КОТОРЫХ ВРЕМЯ НЕ ПОДТЯНУЛОСЬ НА ОСНОВАНИИ СУЩЕСТВУЮЩИХ МАРКЕРОВ ВНУТРИ ПОДДЕРЕВА

In [27]:
conllu_text_map_ex

[[['1', '"', '"', 'PUNCT', 'FB', '_', '5', 'punct', '_', 'SpaceAfter=No'],
  ['2',
   'This',
   'this',
   'PRON',
   'PD',
   'Number=Sing|PronType=Dem',
   '5',
   'nsubj',
   '_',
   '_'],
  ['3',
   'is',
   'be',
   'AUX',
   'VA',
   'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin',
   '5',
   'cop',
   '_',
   '_'],
  ['4',
   'my',
   'my',
   'DET',
   'AP',
   'Number=Sing|Poss=Yes|PronType=Prs',
   '5',
   'nmod:poss',
   '_',
   '_'],
  ['5', 'friend', 'friend', 'NOUN', 'S', 'Number=Sing', '0', 'root', '_', '_'],
  ['6',
   'Jimmy',
   'Jimmy',
   'PROPN',
   'SP',
   '_',
   '5',
   'nmod',
   '_',
   'SpaceAfter=No'],
  ['7', '.', '.', 'PUNCT', 'FS', '_', '5', 'punct', '_', '_']],
 [['1',
   'He',
   'he',
   'PRON',
   'PE',
   'Gender=Masc|Number=Sing|Person=3|PronType=Prs',
   '4',
   'nsubj',
   '_',
   '_'],
  ['2',
   'is',
   'be',
   'AUX',
   'VA',
   'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin',
   '4',
   'cop',
   '_',
   '_'],
  ['3', 'from

In [25]:
text_map_dep

[[OrderedDict([('word', 'This'),
               ('lemma', 'this'),
               ('vocabulary_prop',
                OrderedDict([('tf_idf', 0.508581756852006),
                             ('nominal_index', '2'),
                             ('dep_words_count', 1),
                             ('vocab_importane', 0.508581756852006)]))]),
  OrderedDict([('word', 'is'),
               ('lemma', 'be'),
               ('vocabulary_prop',
                OrderedDict([('tf_idf', 0.22464279715435767),
                             ('nominal_index', '3'),
                             ('dep_words_count', 1),
                             ('vocab_importane', 0.22464279715435767)]))]),
  OrderedDict([('word', 'my'),
               ('lemma', 'my'),
               ('vocabulary_prop',
                OrderedDict([('tf_idf', 0.508581756852006),
                             ('nominal_index', '4'),
                             ('dep_words_count', 1),
                             ('vocab_importane', 0.5

грамматику обозгачаем только через глагол и потом по ним будем ситать количество ее появлений. второстепенные глаголы не обозначаем и используем только для выяснения времени основного глагола

# РАЗОБРАТЬСЯ С ПРЕДЛОЖЕНИЯМИ ГДЕ КОРЕНЬ - СУЩЕСТВИТЕЛЬНОЕ НО ЕСТЬ ГЛАГОЛЫ  have been there (for example)

def grammar_analysis(conllu_map,text_map_input):
    assert len(conllu_map) == len(text_map_input) #sentences count is equal
    text_map = copy.deepcopy(text_map_input)
    
    for sentence_conllu, text_map_sentence in zip(conllu_map,text_map):
        #СОБИРАЕМ СПИСОК СВАОЙСТВ ВСЕХ СЛОВ ПРЕДЛОЖЕНИЯ ДЛЯ ПОСЛЕДУЮЩЕГО ОБРАЩЕНИЯ
        pos_word_dict = {}
        for pos_word in sentence_conllu:
            pos_word_dict[pos_word[0]] = (pos_word[0]+'_'+pos_word[1], pos_word[2:])
        
        print("POS", pos_word_dict)# словарь "номинальный индекс слова" (номинальный-индекс_слово,  остальные conllu based свойства)
        noun_phrase_sentence = False
        
        #СТРОИМ ПОДДЕРЕВЬЯ ГЛАГОЛЬНЫХ ГРУПП
        verb_phrases_dict = {}#словарь "индекс_слово(глагол в верштне)" [список conllu based свойств зависимых эл-тов]
        for word_leave in sentence_conllu: 
            print(word_leave[1], "head_word_nominal_index =", word_leave[6])
            
            if (word_leave[7] == "root" and  word_leave[3] != 'VERB'):
                print("NOUN PHRASE BASED SENTENCE")
                noun_phrase_sentence = True
                break

            head_word_nominal_index = word_leave[6]#смотрим на номинальный индекс главного элемента 
            if (int(head_word_nominal_index)!= 0):
                current_head_word = pos_word_dict[head_word_nominal_index][0]
                current_head_pos = pos_word_dict[head_word_nominal_index][1][1]
                #print("current_head_word",pos_word_dict[head_word_nominal_index])
                if(current_head_word in verb_phrases_dict):
                    verb_phrases_dict[current_head_word].append(word_leave)
                elif(current_head_pos == "VERB"):
                    verb_phrases_dict[current_head_word] = []
                    verb_phrases_dict[current_head_word].append(word_leave)
        
        if(noun_phrase_sentence):
            print("SKIPPING THIS SENTENCE")
            continue
        else:
            print("VERB SUBTREES")
            for key, value in verb_phrases_dict.items():
                print(key)
                for el in value:
                    print(el)
                    
        # АНАЛИЗ ПОДДЕРЕВЬЕВ, ПРИСВОЕНИЕ ВРЕМЕНИ, ЗАПИСЬ В ЛОГ
        grammar_properties_log = {}
        undefined_tense_stack = []
        #head_of_subtree_index is from pos_word_dict
        for head_of_subtree_plus_index, dependent_elements in verb_phrases_dict.items():
            #исследуем корень поддерева
            head_of_subtree_index = head_of_subtree_plus_index.split("_")[0]
            print("NOW HANDLING HEADWORD",head_of_subtree_plus_index)
            head_properties = pos_word_dict[head_of_subtree_index]
            print(head_properties)
            if (head_properties[0].endswith("ing")): continious = True
            if ("Tense" in head_properties[1][3]):#случай когда глагол уже маркирован временем
                # в такой ситуации можно переходить на следующий ключ и сохранять всю херню в логи
                grammar = head_properties[1][3].split("|")
                for gr_unit in grammar:
                    if ("Tense" in gr_unit):
                        print("TENSE DETECTED", head_properties)
                        tense = gr_unit.split("=")[1]
                        if(tense == "Pres" and not head_properties[0].endswith("ing")):
                            print("not ing",head_properties[0])
                            grammar_properties_log [head_of_subtree_index] = "PrsSmpl"
                            
                        elif(tense == "Pres" and head_properties[0].endswith("ing")):
                            print("INGGGGG")
                            for dep_el in dependent_elements:
                                if (dep_el[1].lower() == "am" or dep_el[1].lower() == "is" or dep_el[1].lower() == "are"):
                                    grammar_properties_log[head_of_subtree_index] = "PrsCont"
                                elif(dep_el[1].lower() == "was" or dep_el[1].lower() == "were"):
                                    grammar_properties_log[head_of_subtree_index] = "PastCont"
                                elif(dep_el[2] == "will" or dep_el[2] == "shall"): 
                                    grammar_properties_log [head_of_subtree_index] = "FutCont"
                            
                                
                        elif(tense == "Past"):
                            perfect = False
                            some_future = False
                            for dep_el in dependent_elements:
                                if (dep_el[1].lower() == "have" or dep_el[1].lower() == "has"):
                                    grammar_properties_log [head_of_subtree_index] = "PrPerf"
                                    perfect = True
                                elif(dep_el[1].lower() == "had" ):
                                    grammar_properties_log [head_of_subtree_index] = "PastPerf"
                                    perfect = True
                                elif(dep_el[2] == "will" or dep_el[2] == "shall"): 
                                    some_future = True
                            if some_future and perfect:
                                grammar_properties_log [head_of_subtree_index] = "FutPerf"
                                    
                            if not perfect:
                                grammar_properties_log [head_of_subtree_index] = "PastSmpl"
            
            
            else:#ищем маркер времени в зависимых
                #undefined_tense_stack.append(head_of_subtree_index)
                tense = None
                for dep_el in dependent_elements:
                    if (dep_el[3] == "AUX"):
                        for gr_unit in dep_el[5].split("|"):
                            if "Tense" in gr_unit:
                                tense = gr_unit.split("=")[1]
                                if(tense == "Pres"):
                                    grammar_properties_log [head_of_subtree_index] = "PrsSmpl"
                                elif(tense == "Past"):
                                    grammar_properties_log [head_of_subtree_index] = "PastSmpl"
                        if(dep_el[2] == "will" or dep_el[2] == "shall"):
                            grammar_properties_log [head_of_subtree_index] = "SOME_FUTURE"
                                    
                                    
        print("grammar_properties_log", grammar_properties_log)        
        for map_word in text_map_sentence:
            word_index = map_word['vocabulary_prop']['nominal_index']
            if (str(word_index) in grammar_properties_log):
                map_word['grammar_prop'] = grammar_properties_log [word_index]

    return text_map

In [16]:
phrasal_list_EASY = []
with open("./materials/phrasal_verbs_easy", "r") as pv_doc:
    for pv in pv_doc:
        phrasal_list_EASY.append(pv[:-1])

In [18]:
phrasal_list_big = []
with open("./materials/phrasal_verbs.txt", "r") as pv_doc:
    for pv in pv_doc:
        phrasal_list_big.append(pv[:-1])

In [9]:
def build_subtree_branch(head_word_nominal_index, pos_word_dict,verb_phrases_dict, word_leave):
    if (int(head_word_nominal_index)!= 0):
        current_head_word = pos_word_dict[head_word_nominal_index][0]
        current_head_pos = pos_word_dict[head_word_nominal_index][1][1]
        #print("current_head_word",pos_word_dict[head_word_nominal_index])
        if(current_head_word in verb_phrases_dict):
            verb_phrases_dict[current_head_word].append(word_leave)
        else:
            verb_phrases_dict[current_head_word] = []
            verb_phrases_dict[current_head_word].append(word_leave)

In [33]:
def get_verb_phrase_properties(verb_phrases_dict,grammar_properties_log,pos_word_dict,vocab_properties_log):
    conditional_list = []
    for head_of_subtree_plus_index, dependent_elements in verb_phrases_dict.items():
            head_of_subtree_index = head_of_subtree_plus_index.split("_")[0]
            continious = False
            head_word_lower = head_of_subtree_plus_index.split("_")[1].lower()
            head_lemma_lower = head_of_subtree_plus_index.split("_")[2].lower()
            if(head_word_lower.endswith("ing")):
                continious = True  
            be_words = ['am','is','are',"was","were","'s","'re","'m"]
            present_be = ['am','is','are',"'s","'re","'m"]
            past_be = ["was","were"]
            aux_be_words = ['been','be']
            be_head = None
            be_non_head = None
            aux_be_head = None
            if(head_word_lower in be_words):
                be_head = head_of_subtree_index
            elif(head_word_lower in aux_be_words):
                aux_be_head = head_of_subtree_index
                print("AUX detected")
            have_words = ["have","has","had","'ve"]
            have_head = False
            if(head_word_lower in have_words):
                have_head = True
                
            head_inf = False
            if head_word_lower == head_lemma_lower: head_inf = True
            would_Vinf_index = -1
            
            head_type = pos_word_dict[head_of_subtree_index][1][2]
            head_V3 = False
            head_V2 = False
            if(head_type == "VBN"):
                head_V3 = True
            elif(head_type == "VBD"):
                head_V2 = True
            would_V3_index = -1
            
            being = False
        
            perfect = False
            future = False
            
            past_perf = False
            
            conditional_if_index = -1
            conditional_when_index = -1
            present_index =  -1
            past_index = -1
            future_index = -1
            for dep_el in dependent_elements:
                print("dep_el",dep_el)
                #print(be_head,int(head_of_subtree_index), int(dep_el[0]))
                #if(be_head and dep_el[1].lower() == "there" and int(head_of_subtree_index)  > int(dep_el[0]) ):
                #подразумевается что в случае smth is there - будет именная фраза
                
                #for passive voice
                #print("be_words",be_words, dep_el[1].lower,dep_el[1].lower() in be_words)
                if(dep_el[1].lower() in be_words or dep_el[1].lower() in aux_be_words):
                    be_non_head = dep_el[1].lower()
                elif(dep_el[1].lower() == "being"):
                    being = True
                
                #GERUND
                print(dep_el[5])
                if "VerbForm=Ger" in dep_el[5] or dep_el[1].endswith("ing"):#грубое округление с расчетом на то что не герундиев оканчивающихся на инг мало
                    if (len(dep_el[1])>4 and not head_V3 and not head_V2):#доп филтрр от коротких существительных
                        print("GERUND FOUND")
                        grammar_properties_log[dep_el[0]] = "Gerund"
                
                potential_phrasal_verb = head_lemma_lower + ' ' + dep_el[2]
                print("potential_phrasal_verb",potential_phrasal_verb)
                if( potential_phrasal_verb in  phrasal_list_EASY):
                    print("EASY PV DETECTED")
                    if abs(int(dep_el[0]) - int(head_of_subtree_index)) > 2:
                        vocab_properties_log[dep_el[0]] = "dist_easy_phrasal_verb"
                        vocab_properties_log[head_of_subtree_index] = "dist_easy_phrasal_verb"
                    else:
                        vocab_properties_log[dep_el[0]] = "easy_phrasal_verb"
                        vocab_properties_log[head_of_subtree_index] = "easy_phrasal_verb"
                elif(potential_phrasal_verb in phrasal_list_big):
                    if abs(int(dep_el[0]) - int(head_of_subtree_index)) > 2:
                        vocab_properties_log[dep_el[0]] = "dist_phrasal_verb"
                        vocab_properties_log[head_of_subtree_index] = "dist_phrasal_verb"
                    else:
                        vocab_properties_log[dep_el[0]] = "phrasal_verb"
                        vocab_properties_log[head_of_subtree_index] = "phrasal_verb"
                
                if(be_head and dep_el[1].lower() == "there"):
                    grammar_properties_log[dep_el[0]] = "there_is_are"
                elif(aux_be_head and dep_el[1].lower() == "there" and int(aux_be_head) > int(dep_el[0])):
                    grammar_properties_log[dep_el[0]] = "there_is_are"
                if(dep_el[1].lower() == "when"):
                    conditional_when_index = dep_el[0] 
                if(dep_el[1].lower() == "if"):
                    conditional_if_index = dep_el[0]
                if(dep_el[1].lower() == "would" and head_inf):
                    would_Vinf_index = dep_el[0]
                elif(dep_el[1].lower() == "would" and head_V3):
                    would_V3_index = dep_el[0]
                if (dep_el[3] == "VERB" and "VerbForm=Inf" in dep_el[5] and head_lemma_lower == "have"):
                    grammar_properties_log[dep_el[0]] = "modal_have_to"
                                    

                if (dep_el[1].lower() == "am" or dep_el[1].lower() == "is" or dep_el[1].lower() == "are"):
                    if (continious):
                        grammar_properties_log[head_of_subtree_index] = "PresCont"
                elif(dep_el[1].lower() == "was" or dep_el[1].lower() == "were"):
                    if (continious):
                        grammar_properties_log[head_of_subtree_index] = "PastCont"
                elif(dep_el[1].lower() == "will" or dep_el[1].lower() == "shall"):
                    future = True
                    future_index = dep_el[0]
                elif(dep_el[1].lower() == "had"):
                    if(continious):
                        grammar_properties_log[head_of_subtree_index] = "PastPerfCont"#had (been) doing
                    else:
                        grammar_properties_log[head_of_subtree_index] = "PastPerf"
                        past_perf = True
                elif(dep_el[1].lower() == "have" or dep_el[1].lower() == "has" or dep_el[1].lower() == "'ve"): 
                    perfect = True
                    if(continious):
                        grammar_properties_log[head_of_subtree_index] = "PrPerfCont"
                    elif(would_V3_index == -1):
                        grammar_properties_log[head_of_subtree_index] = "PrPerf"
                    elif(int(would_V3_index) > 0):
                        grammar_properties_log[head_of_subtree_index] = "would_have_V3"
                        grammar_properties_log["would_have_V3"] = head_of_subtree_index
                elif(dep_el[1].lower() == "do" or dep_el[1].lower() == "does"):
                    grammar_properties_log[head_of_subtree_index] = "PresSimp"
                    present_index = dep_el[0]
                elif(dep_el[1].lower() == "did"):
                    grammar_properties_log[head_of_subtree_index] = "PastSimp"
                    past_index = dep_el[0]
                elif(dep_el[1].lower() == "would" and head_word_lower == "like"):
                    grammar_properties_log[head_of_subtree_index] = "would_like"
            if(be_non_head == "been" and past_perf and (head_V3 or head_V2)):
                grammar_properties_log[head_of_subtree_index] = "PastPerf_Passive"
            elif(be_non_head == "been" and perfect and (head_V3 or head_V2) and not future):
                grammar_properties_log[head_of_subtree_index] = "PresPerf_Passive"
            elif(be_non_head == "been" and perfect and (head_V3 or head_V2) and future):
                grammar_properties_log[head_of_subtree_index] = "FuturePerf_Passive"
            elif (continious and future and perfect):
                grammar_properties_log[head_of_subtree_index] = "FutPerfCont"
            elif (continious and future):
                grammar_properties_log[head_of_subtree_index] = "FutCont"#определеит даже без will BE
            elif (perfect and future):
                grammar_properties_log[head_of_subtree_index] = "FutPerf"
            elif(future and not head_V2 and not head_V3):
                grammar_properties_log[head_of_subtree_index] = "FutSimp"
            print("head_prop", pos_word_dict[head_of_subtree_index])    
            if(head_of_subtree_index not in grammar_properties_log):#если нет вспомогательных маркеров
                head_properties = pos_word_dict[head_of_subtree_index]
                if ("Tense=Pres" in head_properties[1][3] ):
                    grammar_properties_log[head_of_subtree_index] = "PresSimp"
                    present_index = dep_el[0]
                elif("Tense=Past" in head_properties[1][3]):
                    if be_non_head:
                        if be_non_head in present_be and not being:
                            grammar_properties_log[head_of_subtree_index] = "PresSimp_Passive" 
                        elif be_non_head in present_be and being:
                            grammar_properties_log[head_of_subtree_index] = "PresCont_Passive" 
                        elif be_non_head in past_be and not being:
                            grammar_properties_log[head_of_subtree_index] = "PastSimp_Passive" 
                        elif be_non_head in past_be and being:
                            grammar_properties_log[head_of_subtree_index] = "PastCont_Passive"
                        elif be_non_head == "be" and future:
                            grammar_properties_log[head_of_subtree_index] = "FutSimp_Passive"
                    else:
                        grammar_properties_log[head_of_subtree_index] = "PastSimp" 
                        past_index = dep_el[0]

                                 
                elif("Tense" not in head_properties[1][3] or head_of_subtree_index not in grammar_properties_log):
                    print("FAILED TO DETECT VERB SUBTREE TENSE")
                   
            if int(present_index) > 0 and (int(conditional_if_index) > 0 or (int(conditional_when_index) > 0)):
                conditional_list.append((max(int(conditional_if_index),int(conditional_when_index)), "if_pres"))
            elif(int(conditional_if_index) > 0 and past_perf):
                conditional_list.append((conditional_if_index, "if_past_perfect"))
            elif(int(past_index) > 0 and int(conditional_if_index) > 0):
                conditional_list.append((conditional_if_index, "if_past"))
            elif(int(present_index) > 0 ):
                conditional_list.append((present_index, "pres"))
            elif(int(future_index) > 0 ):
                conditional_list.append((future_index, "future"))
            elif(int(would_Vinf_index) > 0 ):
                conditional_list.append((would_Vinf_index, "would_Vinf"))
            elif(int(would_V3_index) > 0 ):
                conditional_list.append((would_V3_index, "would_V3_index"))    
    #print("conditional_list",conditional_list)            
    if len(conditional_list) >1:
        if_past_perfect_index = None
        would_V3 = False
        
        if_past_index = None
        would_Vinf = False
        
        cond_word_index = None
        pres = False
        fut = False
        for condition_element in conditional_list:
            #standard sequence if/when then
            if(condition_element[1] == "if_pres"):
                cond_word_index = condition_element[0]
            elif(condition_element[1] == "if_past_perfect"):
                if_past_perfect_index = condition_element[0]
            elif(condition_element[1] == "pres"):
                pres = True
            elif(condition_element[1] == "future"):
                fut = True
            elif(condition_element[1] == "would_Vinf"):
                would_Vinf = True
            elif(condition_element[1] == "if_past"):
                if_past_index = condition_element[0]
            elif(condition_element[1] == "would_V3_index"):
                would_V3 = True    
                
        if(cond_word_index and pres):
            grammar_properties_log[str(cond_word_index)] = "ZeroCond"
        elif(cond_word_index and fut):
            grammar_properties_log[str(cond_word_index)] = "FirstCond"
        elif(if_past_index and would_Vinf):
            grammar_properties_log[str(if_past_index)] = "SecondCond"
        elif(if_past_perfect_index and would_V3):
            grammar_properties_log[str(if_past_perfect_index)] = "ThirdCond"
    elif(len(conditional_list) == 1):
        cond_word_index = None
        print("SHORTTT")
        print(conditional_list[0][1])
        if conditional_list[0][1] == "if_pres":
            print("SHORTTT--")
            grammar_properties_log["if_pres"] = str(conditional_list[0][0])
        elif conditional_list[0][1] == "would_Vinf":
            grammar_properties_log["would_Vinf"] = str(conditional_list[0][0])

In [11]:
def get_non_verb_phrase_properties(non_verb_phrases_dict,grammar_properties_log,pos_word_dict,vocab_properties_log):
    """оцениваем линейно без отсылки к поддереву"""
    pr_simple_be = ["am", "is", "are"]
    past_simp_be = ["was","were"]
    perfect_list = ["have","has"]
    future_list = ["will","shall"]
    be_list = ["be", "been"]
    be_phrasal_list = ["after", "along","away","upset","down", "up"]
    for head_of_subtree_plus_index, dependent_elements in non_verb_phrases_dict.items():
        perfect = False
        future = False
        be_index = -1
        would = False
        past_perf = False
        when_condition_index = -1
        if_condition_index = -1
        present = False
        past = False
                
        for dep_el in dependent_elements:
            if "VerbForm=Ger" in dep_el[5] or dep_el[1].endswith("ing"):#грубое округление с расчетом на то что не герундиев оканчивающихся на инг мало
                if (len(dep_el[1])>4):#доп филтрр от коротких существительных
                    print("GERUND FOUND")
                    grammar_properties_log[dep_el[0]] = "Gerund"
            if (dep_el[1].lower() == "would"):
                would = True
            elif (dep_el[1].lower() == "if"):
                if_condition_index = dep_el[0]
            elif(dep_el[1].lower() == "when"):
                when_condition_index = dep_el[0]
            elif (dep_el[1].lower() in be_list):
                be_index = int(dep_el[0])
                if dep_el[1].lower() == "be" and would:
                    grammar_properties_log[dep_el[0]] = "would_Vinf"
                    grammar_properties_log["would_Vinf"] = dep_el[0]
                elif(dep_el[1].lower() == "been" and would):
                    grammar_properties_log[dep_el[0]] = "would_have_V3"
                    grammar_properties_log["would_have_V3"] = dep_el[0]
            if(dep_el[1].lower() in be_phrasal_list and int(be_index) > 0 and int(be_index) < int(dep_el[0])):
                if abs(int(dep_el[0]) - int(be_index)) > 2:
                        vocab_properties_log[str(dep_el[0])] = "dist_phrasal_verb"
                        vocab_properties_log[str(be_index)] = "dist_phrasal_verb"
                else:
                    vocab_properties_log[str(dep_el[0])]= "phrasal_verb"
                    vocab_properties_log[str(be_index)] = "phrasal_verb"
            if (dep_el[1].lower() in pr_simple_be):
                grammar_properties_log[dep_el[0]] = "PresSimp"
                present = True
            elif (dep_el[1].lower() in past_simp_be):
                grammar_properties_log[dep_el[0]] = "PastSimp"
                past = True
            elif (dep_el[1].lower() in perfect_list):
                perfect = True
            elif (dep_el[1].lower() in future_list):
                future = True
            elif (dep_el[1].lower() == "had"):
                past_perf = True
                
                
        if (be_index > 0):            
            if (perfect and not future and not would):
                grammar_properties_log[str(be_index)] = "PrPerf"
            elif (perfect and future):
                grammar_properties_log[str(be_index)] = "FutPerf"
            elif(not perfect and future):
                grammar_properties_log[str(be_index)] = "FutSimp"
            elif(past_perf):
                grammar_properties_log[str(be_index)] = "PastPerf"
        print("if_condition_index",if_condition_index)         
        if((int(when_condition_index) > 0 or int(if_condition_index) > 0) and present):
            grammar_properties_log["if_pres"] = str(max(int(when_condition_index),int(if_condition_index)))            
        elif(int(if_condition_index) > 0 and past):
            grammar_properties_log["if_past"] = str(if_condition_index)
        elif(int(if_condition_index) > 0 and past_perf):
            grammar_properties_log["if_past_perf"] = str(if_condition_index)
        

In [39]:
def grammar_analysis(conllu_map,text_map_input):
    assert len(conllu_map) == len(text_map_input) #sentences count is equal
    text_map = copy.deepcopy(text_map_input)
    
    for sentence_conllu, text_map_sentence in zip(conllu_map,text_map):
        #СОБИРАЕМ СПИСОК СВАОЙСТВ ВСЕХ СЛОВ ПРЕДЛОЖЕНИЯ ДЛЯ ПОСЛЕДУЮЩЕГО ОБРАЩЕНИЯ
        pos_word_dict = {}
        for pos_word in sentence_conllu:
            pos_word_dict[pos_word[0]] = (pos_word[0]+'_'+pos_word[1]+'_'+pos_word[2], pos_word[2:])
        
        print("POS", pos_word_dict)# словарь "номинальный индекс слова" (номинальный-индекс_слово,  остальные conllu based свойства)
        grammar_properties_log = {}
        vocab_properties_log = {}
        #СТРОИМ ПОДДЕРЕВЬЯ ГЛАГОЛЬНЫХ ГРУПП И СРАЗУ СМОТРИМ ЛИНЕЙНЫЕ СВОЙСТВА
        verb_phrases_dict = {}#словарь "индекс_слово(глагол в верштне)" [список conllu based свойств зависимых эл-тов]
        non_verb_phrases_dict = {}
        #be_list = ["is","are", "be"]
        #there_is_are = False
        for word_leave in sentence_conllu: 
            #if word_leave[2] == "there": there_is_are = True
            #if word_leave[1].lower() in be_list and there_is_are"
            print(word_leave[1], "head_word_nominal_index =", word_leave[6])
            if (int(word_leave[6]) != 0):
                print(pos_word_dict[word_leave[6]],pos_word_dict[word_leave[6]][1][1])
                if(pos_word_dict[word_leave[6]][1][1] != "VERB"):
                    build_subtree_branch(word_leave[6], pos_word_dict, non_verb_phrases_dict, word_leave)
                else:
                    build_subtree_branch(word_leave[6], pos_word_dict, verb_phrases_dict, word_leave)


        print("VERB SUBTREES")
        #print(verb_phrases_dict)
        for key, value in verb_phrases_dict.items():
            print(key)
            for el in value:
                print(el)
            print("==========")
        if (len(list(verb_phrases_dict.keys()))>0):
            get_verb_phrase_properties(verb_phrases_dict, grammar_properties_log,pos_word_dict,vocab_properties_log)
    
        print("NON-VERB SUBTREES")
        #print(verb_phrases_dict)
        for key, value in non_verb_phrases_dict.items():
            print(key)
            for el in value:
                print(el)
            print("==========")
        if (len(list(non_verb_phrases_dict.keys()))>0):
            get_non_verb_phrase_properties(non_verb_phrases_dict, grammar_properties_log,pos_word_dict,vocab_properties_log)
        
                
        print("grammar_properties_log", grammar_properties_log) 
        print("vocab_properties_log", vocab_properties_log)  
        
        if ("if_pres" in grammar_properties_log):
            future = False
            present = False  
            for key, val in grammar_properties_log.items():
                if (key != "if_pres"):
                    if(val == "PresSimp"):
                        present = True
                    elif("Fut" in val):
                        future = True
            if future and present:
                grammar_properties_log[grammar_properties_log["if_pres"]] = "FirstCond"
            elif not future and present:
                grammar_properties_log[grammar_properties_log["if_pres"]] = "ZeroCond"
            print("grammar_properties_log CHANGED", grammar_properties_log)  
        
        if ("if_past" in grammar_properties_log and "would_Vinf" in grammar_properties_log):
            grammar_properties_log[grammar_properties_log["if_past"]] = "SecondCond"
            print("grammar_properties_log CHANGED", grammar_properties_log) 
        if ("if_past_perf" in grammar_properties_log and "would_have_V3" in grammar_properties_log):
            grammar_properties_log[grammar_properties_log["if_past_perf"]] = "ThirdCond"
            print("grammar_properties_log CHANGED", grammar_properties_log)
            
        for map_word in text_map_sentence:
            word_index = map_word['vocabulary_prop']['nominal_index']
            if (str(word_index) in grammar_properties_log):
                map_word['grammar_prop'] = grammar_properties_log[word_index]
            if (str(word_index) in vocab_properties_log):
                map_word['vocabulary_prop']['supp_properties'] = vocab_properties_log[word_index]
            
                
    return text_map
        
#grammar_analysis(conllu_text_map_ex,text_map_dep)

def get_map(text_line,model):
    conllu = get_conllu(text_line, model, print_output = True)
    conllu_text_map = get_conllu_text_map(conllu)
    #print(conllu_text_map)
    
    tfidf = False
    if tfidf:
        lemm_sentences = lemmatize_from_udmap(conllu_text_map)
        tf_idf_dict = get_tf_idf_dict (lemm_sentences)
    else:
        tf_idf_dict = None
    
    text_map = create_map(conllu_text_map, tf_idf_dict, apply_tf_idf = False)
    
    #vocabulary_analysis(text_map_dep, final_basic_vocabulary)
    text_map_gramm = grammar_analysis(conllu_text_map, text_map)
    
    return text_map_gramm
get_map("They were looking for you", model)
#get_map("I like going there", model) Gerund example
#get_map("The cat will be there", model)
#get_map("Mrs Scolefield  has really-really liked going there", model)# has to root
#get_map("Mr. Scolefield is going there", model)
#Is he going there""He is a cat" --- на этом примере Tense не отрабатывает --- решение - посылать на еще одну обработку без корня
#Had I gone there --- Had уходит в корень - время определяется неправильно


# newdoc
# newpar
# sent_id = 1
# text = They were looking for you
['1', 'They', 'they', 'PRON', 'PRP', 'Case=Nom|Number=Plur|Person=3|PronType=Prs', '3', 'nsubj', '_', '_']
['2', 'were', 'be', 'AUX', 'VBD', 'Mood=Ind|Tense=Past|VerbForm=Fin', '3', 'aux', '_', '_']
['3', 'looking', 'look', 'VERB', 'VBG', 'Tense=Pres|VerbForm=Part', '0', 'root', '_', '_']
['4', 'for', 'for', 'ADP', 'IN', '_', '5', 'case', '_', '_']
['5', 'you', 'you', 'PRON', 'PRP', 'Case=Acc|Person=2|PronType=Prs', '3', 'obl', '_', 'SpaceAfter=No']
POS {'1': ('1_They_they', ['they', 'PRON', 'PRP', 'Case=Nom|Number=Plur|Person=3|PronType=Prs', '3', 'nsubj', '_', '_']), '2': ('2_were_be', ['be', 'AUX', 'VBD', 'Mood=Ind|Tense=Past|VerbForm=Fin', '3', 'aux', '_', '_']), '3': ('3_looking_look', ['look', 'VERB', 'VBG', 'Tense=Pres|VerbForm=Part', '0', 'root', '_', '_']), '4': ('4_for_for', ['for', 'ADP', 'IN', '_', '5', 'case', '_', '_']), '5': ('5_you_you', ['you', 'PRON', 'PRP', 'Case=Acc|Person=2|PronType=Prs', '3', 'obl'

[[OrderedDict([('word', 'They'),
               ('lemma', 'they'),
               ('vocabulary_prop',
                OrderedDict([('vocab_importane', 0),
                             ('nominal_index', '1')]))]),
  OrderedDict([('word', 'were'),
               ('lemma', 'be'),
               ('vocabulary_prop',
                OrderedDict([('vocab_importane', 0),
                             ('nominal_index', '2')]))]),
  OrderedDict([('word', 'looking'),
               ('lemma', 'look'),
               ('vocabulary_prop',
                OrderedDict([('vocab_importane', 0), ('nominal_index', '3')])),
               ('grammar_prop', 'PastCont')]),
  OrderedDict([('word', 'for'),
               ('lemma', 'for'),
               ('vocabulary_prop',
                OrderedDict([('vocab_importane', 0),
                             ('nominal_index', '4')]))]),
  OrderedDict([('word', 'you'),
               ('lemma', 'you'),
               ('vocabulary_prop',
                OrderedDict([('vo

In [38]:
# Dictionary of strings and int
wordFreqDic = {
    "Hello": 56,
    "at" : 23 ,
    "test" : 43,
    "this" : 43
    }

del wordFreqDic["at"]

wordFreqDic

{'Hello': 56, 'test': 43, 'this': 43}

In [13]:
#model = Model('./UDPIPE/english-partut-ud-2.0-170801.udpipe')# ==== 
model = Model('./UDPIPE/english-ud-2.0-170801.udpipe')

In [None]:
I go
I dont go
do I go
He goes
He doesnt go
Does he go

I went
I didnt go
did I go