In [112]:
from ufal.udpipe import Model, Pipeline, ProcessingError

from string import punctuation
full_punctuation = punctuation + "–" + "," + "»" + "«" + "…"

from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

from collections import OrderedDict

import copy

In [4]:
model = Model.load('./UDPIPE/english-lines-ud-2.0-170801.udpipe')

In [136]:
import ufal.udpipe
# ufal.udpipe.Model etc. are SWIG-magic and cannot be detected by pylint
# pylint: disable=no-member

class Model:
    def __init__(self, path):
        """Load given model."""
        self.model = ufal.udpipe.Model.load(path)
        if not self.model:
            raise Exception("Cannot load UDPipe model from file '%s'" % path)

    def tokenize(self, text):
        """Tokenize the text and return list of ufal.udpipe.Sentence-s."""
        tokenizer = self.model.newTokenizer(self.model.DEFAULT)
        if not tokenizer:
            raise Exception("The model does not have a tokenizer")
        return self._read(text, tokenizer)

    def read(self, text, in_format):
        """Load text in the given format (conllu|horizontal|vertical) and return list of ufal.udpipe.Sentence-s."""
        input_format = ufal.udpipe.InputFormat.newInputFormat(in_format)
        if not input_format:
            raise Exception("Cannot create input format '%s'" % in_format)
        return self._read(text, input_format)

    def _read(self, text, input_format):
        input_format.setText(text)
        error = ufal.udpipe.ProcessingError()
        sentences = []

        sentence = ufal.udpipe.Sentence()
        while input_format.nextSentence(sentence, error):
            sentences.append(sentence)
            sentence = ufal.udpipe.Sentence()
        if error.occurred():
            raise Exception(error.message)

        return sentences

    def tag(self, sentence):
        """Tag the given ufal.udpipe.Sentence (inplace)."""
        self.model.tag(sentence, self.model.DEFAULT)

    def parse(self, sentence):
        """Parse the given ufal.udpipe.Sentence (inplace)."""
        self.model.parse(sentence, self.model.DEFAULT)

    def write(self, sentences, out_format):
        """Write given ufal.udpipe.Sentence-s in the required format (conllu|horizontal|vertical)."""

        output_format = ufal.udpipe.OutputFormat.newOutputFormat(out_format)
        output = ''
        for sentence in sentences:
            output += output_format.writeSentence(sentence)
        output += output_format.finishDocument()

        return output

# Can be used as
#  model = Model('english-ud-1.2-160523.udpipe')
#  sentences = model.tokenize("Hi there. How are you?")
#  for s in sentences:
#      model.tag(s)
#      model.parse(s)
#  conllu = model.write(sentences, "conllu")

In [34]:
with open ("./UDPIPE/text_0.txt", "r") as f:
    text = ''
    for line in f.readlines():
        text += line + ' '
        text.strip()
    #print(text)

In [38]:
model = Model('./UDPIPE/english-partut-ud-2.0-170801.udpipe')
#sentences = model.tokenize("Hi there. How are you?")

In [None]:
sentences = model.tokenize(text)

In [35]:
sentences = model.tokenize(text)
for s in sentences:
    model.tag(s)
    model.parse(s)
conllu = model.write(sentences, "conllu")

In [140]:
model = Model('./UDPIPE/english-ud-2.0-170801.udpipe')

sentences = model.tokenize("He has made this tricky story up")
for s in sentences:
    model.tag(s)
    model.parse(s)
conllu = model.write(sentences, "conllu")
for line in conllu.split('\n'):
    if line:
        if line[0].isdigit():
            print(line.split('\t'))
        else:
            print(line)

# newdoc
# newpar
# sent_id = 1
# text = He has made this tricky story up
['1', 'He', 'he', 'PRON', 'PRP', 'Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs', '3', 'nsubj', '_', '_']
['2', 'has', 'have', 'AUX', 'VBZ', 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin', '3', 'aux', '_', '_']
['3', 'made', 'make', 'VERB', 'VBN', 'Tense=Past|VerbForm=Part', '0', 'root', '_', '_']
['4', 'this', 'this', 'DET', 'DT', 'Number=Sing|PronType=Dem', '6', 'det', '_', '_']
['5', 'tricky', 'tricky', 'ADJ', 'JJ', 'Degree=Pos', '6', 'amod', '_', '_']
['6', 'story', 'story', 'NOUN', 'NN', 'Number=Sing', '3', 'obj', '_', '_']
['7', 'up', 'up', 'ADP', 'RP', '_', '3', 'compound:prt', '_', 'SpaceAfter=No']


# sentenceses conlluu map

In [46]:
def get_conllu_text_map(conllu_parsed_object):
    conllu_text_map = []
    conllu_sentence_map = []
    for line in conllu_parsed_object.split('\n'):
        if line:
            if line[0].isdigit():
                #print(line.split('\t'))
                conllu_sentence_map.append(line.split('\t'))
            else:
                if(len(conllu_sentence_map) > 0):
                    conllu_text_map.append(conllu_sentence_map)
                    conllu_sentence_map = []
                    
    return conllu_text_map
conllu_text_map_ex = get_conllu_text_map(conllu)

In [47]:
for sentence in conllu_text_map_ex:
    for word in sentence:
        print(word)
    print()

['1', '"', '"', 'PUNCT', 'FB', '_', '5', 'punct', '_', 'SpaceAfter=No']
['2', 'This', 'this', 'PRON', 'PD', 'Number=Sing|PronType=Dem', '5', 'nsubj', '_', '_']
['3', 'is', 'be', 'AUX', 'VA', 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin', '5', 'cop', '_', '_']
['4', 'my', 'my', 'DET', 'AP', 'Number=Sing|Poss=Yes|PronType=Prs', '5', 'nmod:poss', '_', '_']
['5', 'friend', 'friend', 'NOUN', 'S', 'Number=Sing', '0', 'root', '_', '_']
['6', 'Jimmy', 'Jimmy', 'PROPN', 'SP', '_', '5', 'nmod', '_', 'SpaceAfter=No']
['7', '.', '.', 'PUNCT', 'FS', '_', '5', 'punct', '_', '_']

['1', 'He', 'he', 'PRON', 'PE', 'Gender=Masc|Number=Sing|Person=3|PronType=Prs', '4', 'nsubj', '_', '_']
['2', 'is', 'be', 'AUX', 'VA', 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin', '4', 'cop', '_', '_']
['3', 'from', 'from', 'ADP', 'E', '_', '4', 'case', '_', '_']
['4', 'India', 'India', 'PROPN', 'SP', '_', '0', 'root', '_', 'SpaceAfter=No']
['5', '.', '.', 'PUNCT', 'FS', '_', '4', 'punct', '_', '_']



# LEMM TEXT FROM UDPIPE MAP

In [54]:
def lemmatize_from_udmap(conllu_map):
    sentences_list = []
    for sentence in conllu_map:
        line = ''
        for word in sentence: 
            if (word[2] not in full_punctuation):
                #print(word[2])
                line += word[2] + ' '
        
        sentences_list.append(line.strip())
        #print()
    return sentences_list
lemm_sentences = lemmatize_from_udmap(conllu_text_map_ex)

# TF_IDF

In [93]:
from nltk.corpus import stopwords
stopWords = set(stopwords.words('english'))

In [94]:
def get_tf_idf_dict(lemm_text_list, save_to_csv = False):
    vect = TfidfVectorizer(stop_words = stopWords)
    tfidf_matrix = vect.fit_transform(lemm_text_list)
    df = pd.DataFrame(tfidf_matrix.toarray(), columns = vect.get_feature_names())
    #print(df.head())
    if (save_to_csv): df.to_csv("./text_0_tfidf.xlsx", sep = '\t')
    tf_idf_dict = df.to_dict()
    return tf_idf_dict
tf_idf_dict_example = get_tf_idf_dict (lemm_sentences, save_to_csv = True)

In [96]:
tf_idf_dict_example

{'accompany': {0: 0.0,
  1: 0.0,
  2: 0.0,
  3: 0.0,
  4: 0.0,
  5: 0.0,
  6: 0.0,
  7: 0.0,
  8: 0.0,
  9: 0.0,
  10: 0.0,
  11: 0.0,
  12: 0.0,
  13: 0.0,
  14: 0.0,
  15: 0.0,
  16: 0.0,
  17: 0.32331279032138033,
  18: 0.0,
  19: 0.0,
  20: 0.0,
  21: 0.0,
  22: 0.0,
  23: 0.0,
  24: 0.0,
  25: 0.0,
  26: 0.0,
  27: 0.0,
  28: 0.0,
  29: 0.0,
  30: 0.0},
 'add': {0: 0.0,
  1: 0.0,
  2: 0.0,
  3: 0.0,
  4: 0.0,
  5: 0.0,
  6: 0.0,
  7: 0.0,
  8: 0.0,
  9: 0.0,
  10: 0.0,
  11: 0.0,
  12: 0.0,
  13: 0.0,
  14: 0.0,
  15: 0.0,
  16: 0.0,
  17: 0.0,
  18: 0.0,
  19: 0.0,
  20: 0.0,
  21: 0.0,
  22: 0.0,
  23: 0.0,
  24: 0.0,
  25: 0.0,
  26: 0.38730920010712067,
  27: 0.0,
  28: 0.0,
  29: 0.0,
  30: 0.0},
 'ago': {0: 0.0,
  1: 0.0,
  2: 0.0,
  3: 0.0,
  4: 0.0,
  5: 0.0,
  6: 0.0,
  7: 0.0,
  8: 0.0,
  9: 0.4881419323715902,
  10: 0.0,
  11: 0.0,
  12: 0.0,
  13: 0.0,
  14: 0.0,
  15: 0.0,
  16: 0.0,
  17: 0.0,
  18: 0.0,
  19: 0.0,
  20: 0.0,
  21: 0.0,
  22: 0.0,
  23: 0.0,
  24: 0.

# Text map

# ГРАММАТИКУ УЧИТЫВАЕМ ОТДЕЛЬНО НЕ ССЫЛАЯСЬ НА ТФ ИДФ 
ЕСЛИ ГРАММАТИКА ИДЕТ В КУПЕ С ТФИДФ ТО НАКЛАДЫВАЕМ ДОПОЛНИТЕЛЬНЫЙ ШТРАФ

In [129]:
def create_map(conllu_map, tf_idf_dict):
    text_map = []
    sentence_ind = 0
    for sentence in conllu_map:
        sentence_map = []
        for word in sentence: 
            if (word[2] not in full_punctuation):
                weight = OrderedDict([("word", word[1]),("lemma",word[2]), ("vocabulary_prop",(OrderedDict([("tf_idf", 0),("nominal_index",word[0])])))])
                #print(word[2])
                lemma_lower = word[2].lower()
                if (lemma_lower in tf_idf_dict):
                    weight["vocabulary_prop"]["tf_idf"] = tf_idf_dict[lemma_lower][sentence_ind]
                sentence_map.append(weight)
        text_map.append(sentence_map)
        sentence_ind += 1
    return text_map
text_map_ex = create_map(conllu_text_map_ex, tf_idf_dict_example)

In [None]:
tf_idf_dict[lemma][sentence_ind]

In [120]:
text_map_ex[0]

[OrderedDict([('word', 'This'),
              ('lemma', 'this'),
              ('vocabulary_prop',
               OrderedDict([('tf_idf', 0), ('nominal_index', '2')]))]),
 OrderedDict([('word', 'is'),
              ('lemma', 'be'),
              ('vocabulary_prop',
               OrderedDict([('tf_idf', 0), ('nominal_index', '3')]))]),
 OrderedDict([('word', 'my'),
              ('lemma', 'my'),
              ('vocabulary_prop',
               OrderedDict([('tf_idf', 0), ('nominal_index', '4')]))]),
 OrderedDict([('word', 'friend'),
              ('lemma', 'friend'),
              ('vocabulary_prop',
               OrderedDict([('tf_idf', 0), ('nominal_index', '5')])),
              ('tf_idf', 0.7746836041079114)]),
 OrderedDict([('word', 'Jimmy'),
              ('lemma', 'Jimmy'),
              ('vocabulary_prop',
               OrderedDict([('tf_idf', 0), ('nominal_index', '6')])),
              ('tf_idf', 0.6323490440621989)])]

In [130]:
def get_dependencies (conllu_map, text_map_input):
    assert len(conllu_map) == len(text_map_input) #sentences count is equal
    text_map = copy.deepcopy(text_map_input)
    for sentence, text_map_sentence in zip(conllu_map,text_map):
        dep_dict = {}
        for word in sentence: 
            if (word[2] not in full_punctuation):
                #print(word[1], "head_word_nominal_index =", word[6])
                if(word[6] in dep_dict):
                    dep_dict[word[6]] += 1
                elif(word[6] != 0):
                    dep_dict[word[6]] = 1
        #print(dep_dict)
        for map_word in text_map_sentence:
            if(map_word["vocabulary_prop"]["nominal_index"] in dep_dict):
                map_word["vocabulary_prop"]["dep_words_count"] = dep_dict[map_word["vocabulary_prop"]["nominal_index"]] + 1
                map_word["vocabulary_prop"]["vocab_importane"] = map_word["vocabulary_prop"]["dep_words_count"] * map_word["vocabulary_prop"]["tf_idf"]
            else:
                map_word["vocabulary_prop"]["dep_words_count"] = 1
                map_word["vocabulary_prop"]["vocab_importane"] = map_word["vocabulary_prop"]["dep_words_count"] * map_word["vocabulary_prop"]["tf_idf"]
    return text_map
text_map_dep = get_dependencies(conllu_text_map_ex, text_map_ex)

In [132]:
text_map_dep[2]

[OrderedDict([('word', 'Jimmy'),
              ('lemma', 'Jimmy'),
              ('vocabulary_prop',
               OrderedDict([('tf_idf', 0.6323490440621989),
                            ('nominal_index', '1'),
                            ('dep_words_count', 1),
                            ('vocab_importane', 0.6323490440621989)]))]),
 OrderedDict([('word', 'is'),
              ('lemma', 'be'),
              ('vocabulary_prop',
               OrderedDict([('tf_idf', 0),
                            ('nominal_index', '2'),
                            ('dep_words_count', 1),
                            ('vocab_importane', 0)]))]),
 OrderedDict([('word', 'a'),
              ('lemma', 'a'),
              ('vocabulary_prop',
               OrderedDict([('tf_idf', 0),
                            ('nominal_index', '3'),
                            ('dep_words_count', 1),
                            ('vocab_importane', 0)]))]),
 OrderedDict([('word', 'vegetarian'),
              ('lemma', 've

In [102]:
for sentence in conllu_text_map_ex:
    
    for word in sentence:
        print()
    print()

['1', '"', '"', 'PUNCT', 'FB', '_', '5', 'punct', '_', 'SpaceAfter=No']
['2', 'This', 'this', 'PRON', 'PD', 'Number=Sing|PronType=Dem', '5', 'nsubj', '_', '_']
['3', 'is', 'be', 'AUX', 'VA', 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin', '5', 'cop', '_', '_']
['4', 'my', 'my', 'DET', 'AP', 'Number=Sing|Poss=Yes|PronType=Prs', '5', 'nmod:poss', '_', '_']
['5', 'friend', 'friend', 'NOUN', 'S', 'Number=Sing', '0', 'root', '_', '_']
['6', 'Jimmy', 'Jimmy', 'PROPN', 'SP', '_', '5', 'nmod', '_', 'SpaceAfter=No']
['7', '.', '.', 'PUNCT', 'FS', '_', '5', 'punct', '_', '_']

['1', 'He', 'he', 'PRON', 'PE', 'Gender=Masc|Number=Sing|Person=3|PronType=Prs', '4', 'nsubj', '_', '_']
['2', 'is', 'be', 'AUX', 'VA', 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin', '4', 'cop', '_', '_']
['3', 'from', 'from', 'ADP', 'E', '_', '4', 'case', '_', '_']
['4', 'India', 'India', 'PROPN', 'SP', '_', '0', 'root', '_', 'SpaceAfter=No']
['5', '.', '.', 'PUNCT', 'FS', '_', '4', 'punct', '_', '_']

