In [203]:
import opencorpora
import string
import sklearn
import pycrfsuite

# Record the scikit-learn version this notebook was run with.
print(sklearn.__version__)
# Load the OpenCorpora XML dump; the file name suggests it is the
# disambiguated ("no_ambig") subset -- TODO confirm against the data source.
corpus = opencorpora.load("data/annot.opcorpora.no_ambig.xml")
# Bare expression so the notebook displays the corpus summary.
corpus

0.18.2


<Corpus revision=4427619 docs:4022 tokens:84046>

In [204]:
def get_training_sentences():
    """Collect the annotated corpus as lists of ``[text, lemma, tag]`` triples.

    Reads the notebook-level ``corpus``. A token is kept only when
    ``normalize_token`` returns a non-empty string for its source text, and a
    sentence is kept only when at least one of its tokens survives.

    Returns:
        list: one list per sentence; every entry is
        ``[normalized_text, token.lemma, token.parses[0].grammemes[0]]``.
    """
    samples = []
    for sent in corpus.sentences:
        rows = []
        for tok in sent.tokens:
            text = normalize_token(tok.source)
            if not text:
                # Punctuation-only or empty token: nothing to learn from.
                continue
            # The first grammeme of the first parse serves as the label.
            rows.append([text, tok.lemma, tok.parses[0].grammemes[0]])
        if rows:
            samples.append(rows)

    return samples


def get_sentences_from_text(text):
    """Split raw text into sentences made of normalized single-token samples.

    Args:
        text: plain text to process.

    Returns:
        list: one list per non-empty sentence; each sentence is a list of
        one-element lists ``[token]`` (the shape ``word2features`` indexes
        with ``sent[i][0]``). Tokens that ``normalize_token`` reduces to
        nothing are dropped, as are sentences left without tokens.
    """
    sentences = []
    # poor man's sentence breaking: a period is the only recognised boundary
    for raw_sentence in text.split('.'):
        sample = []
        # poor man's tokenizer: split() without an argument breaks on any run
        # of whitespace, so tabs, newlines and non-breaking spaces separate
        # tokens too instead of being glued into them
        for raw_token in raw_sentence.split():
            token = normalize_token(raw_token)
            if token:
                sample.append([token])
        if sample:
            sentences.append(sample)

    return sentences


def normalize_token(token: str, fix_diacritics=False):
    """Strip punctuation from a token and optionally fold 'ё' into 'е'.

    Args:
        token: raw token text; may be empty or None.
        fix_diacritics: when true, replace Cyrillic 'ё'/'Ё' with Cyrillic
            'е'/'Е' so that both spellings of a word compare equal.

    Returns:
        None when ``token`` is empty/None or is exactly one punctuation
        character; otherwise the token with every punctuation character
        removed. The result can be an empty string when the token consists
        of several punctuation characters (e.g. ``'?!'``).
    """
    punctuation = set(['»', '«', '–', ',']).union(string.punctuation)
    if not token or token in punctuation:
        return None

    cleaned = ''.join(c for c in token if c not in punctuation)
    if fix_diacritics:
        # Target the *Cyrillic* 'е' (U+0435) / 'Е' (U+0415). A Latin 'e'
        # looks identical but produces mixed-script strings that never equal
        # the ordinary spelling of the word.
        cleaned = cleaned.replace('\u0451', '\u0435').replace('\u0401', '\u0415')
    return cleaned

In [160]:
# Smoke-test the sentence splitter on a fragment containing ';' and ','.
text = "стала стабильнее экономическая; и политическая обстановка, "
get_sentences_from_text(text)

[[['стала'],
  ['стабильнее'],
  ['экономическая'],
  ['и'],
  ['политическая'],
  ['обстановка']]]

In [161]:
# Materialise the corpus as lists of [token, lemma, tag] triples for training.
training_sentences = get_training_sentences()

In [163]:
import pandas as pd
import codecs
# Build the word-form -> lemma lookup from the comma-separated dictionary.
lemmas = {}
# The context manager closes the file even if parsing fails part-way.
with codecs.open("data/odict.csv", 'r', 'cp1251') as f:
    for line in f:
        # Drop the line terminator first: otherwise it stays attached to the
        # last field of the row and that key can never match a real token.
        columns = line.rstrip("\r\n").split(",")
        # the first two columns are
        # lemma and tense
        lemma = normalize_token(columns[0].lower(), fix_diacritics=True)
        if not lemma:
            # Blank or punctuation-only row: nothing to index.
            continue
        # Remaining columns are the inflected forms of this lemma.
        for word in columns[2:]:
            word = normalize_token(word, fix_diacritics=True)
            if word:
                lemmas[word] = lemma
        lemmas[lemma] = lemma

In [164]:
# Hand-picked form -> lemma overrides applied on top of the loaded dictionary.
lemmas.update({
    'всем': 'все',
    'всех': 'все',
    'все': 'все',
    'гришины': 'гриша',
    'был': 'быть',
    'кого': 'кто',
})

In [165]:
# Spot-check that one word form resolves to its lemma.
lemmas["всем"]

'все'

In [186]:
def word2features(sent, i):
    """Build the CRF feature strings for the token at position ``i``.

    Args:
        sent: sentence as a list of samples; the word text is element 0 of
            each sample.
        i: index of the token to describe.

    Returns:
        list of str: features of the token itself, then either the previous
        token's features or ``'BOS'``, then either the next token's features
        or ``'EOS'``.
    """
    def neighbour(offset, text):
        # Context tokens contribute a reduced feature set (no isdigit).
        return [
            '{}:word.lower={}'.format(offset, text.lower()),
            '{}:word[-3:]={}'.format(offset, text[-3:]),
            '{}:word[-2:]={}'.format(offset, text[-2:]),
            '{}:word.istitle={}'.format(offset, text.istitle()),
            '{}:word.isupper={}'.format(offset, text.isupper()),
        ]

    current = sent[i][0]
    feats = [
        'bias',
        'word.lower={}'.format(current.lower()),
        'word[-3:]={}'.format(current[-3:]),
        'word[-2:]={}'.format(current[-2:]),
        'word.isupper={}'.format(current.isupper()),
        'word.istitle={}'.format(current.istitle()),
        'word.isdigit={}'.format(current.isdigit()),
    ]

    # Left context, or the beginning-of-sentence marker.
    if i <= 0:
        feats.append('BOS')
    else:
        feats.extend(neighbour('-1', sent[i - 1][0]))

    # Right context, or the end-of-sentence marker.
    if i >= len(sent) - 1:
        feats.append('EOS')
    else:
        feats.extend(neighbour('+1', sent[i + 1][0]))

    return feats


def sent2features(sent):
    """Return the feature list of every token in ``sent``, in sentence order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token


def sent2labels(sent):
    """Return the tag (third field) of each ``[token, lemma, tag]`` triple."""
    labels = []
    for _token, _lemma, tag in sent:
        labels.append(tag)
    return labels


def sent2tokens(sent):
    """Return the token text (first field) of each ``[token, lemma, tag]`` triple."""
    words = []
    for word, _lemma, _tag in sent:
        words.append(word)
    return words

In [187]:
# Preview a few training sentences to check the [token, lemma, tag] layout.
training_sentences[1:5]

[[['Сохранится', 'сохранился', 'VERB'],
  ['ли', 'ли', 'PRCL'],
  ['градус', 'градус', 'NOUN'],
  ['дискуссии', 'дискуссия', 'NOUN'],
  ['в', 'в', 'PREP'],
  ['новом', 'новый', 'ADJF'],
  ['сезоне', 'сезон', 'NOUN']],
 [['Великолепная', 'великолепный', 'ADJF'],
  ['Школа', 'школа', 'NOUN'],
  ['злословия', 'злословие', 'NOUN'],
  ['вернулась', 'вернулся', 'VERB'],
  ['в', 'в', 'PREP'],
  ['эфир', 'эфир', 'NOUN'],
  ['после', 'после', 'PREP'],
  ['летних', 'летний', 'ADJF'],
  ['каникул', 'каникулы', 'NOUN'],
  ['в', 'в', 'PREP'],
  ['новом', 'новый', 'ADJF'],
  ['формате', 'формат', 'NOUN']],
 [['В', 'в', 'PREP'],
  ['истории', 'история', 'NOUN'],
  ['программы', 'программа', 'NOUN'],
  ['это', 'это', 'NPRO'],
  ['уже', 'уже', 'ADVB'],
  ['не', 'не', 'PRCL'],
  ['первый', 'первый', 'ADJF'],
  ['ребрендинг', 'ребрендинг', 'UNKN']],
 [['Писательница', 'писательница', 'NOUN'],
  ['Татьяна', 'татьяна', 'NOUN'],
  ['Толстая', 'толстая', 'NOUN'],
  ['и', 'и', 'CONJ'],
  ['сценаристка', 'сцен

In [188]:
%%time
X_train = [sent2features(s) for s in training_sentences]
y_train = [sent2labels(s) for s in training_sentences]

Wall time: 506 ms


In [189]:
%%time
# Create the CRF trainer with verbose output switched off.
trainer = pycrfsuite.Trainer(verbose=False)

# Hand the trainer one (feature sequence, label sequence) pair per sentence.
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

Wall time: 1.11 s


In [190]:
# Training hyper-parameters: regularisation strengths and an iteration cap.
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

In [191]:
%%time
# Run training; the path given here is the one the tagger opens afterwards,
# so the model is presumably written to this file.
trainer.train('data/opencorpora-pos.crfsuite')

Wall time: 10 s


## Make predictions

In [192]:
# Create a tagger and point it at the model file used in the training step.
tagger = pycrfsuite.Tagger()
tagger.open('data/opencorpora-pos.crfsuite')

<contextlib.closing at 0x203bd3cdda0>

In [193]:
# Sanity check on a sentence from the training set itself: predicted tags
# are printed above the annotated ones for a visual comparison.
example_sent = training_sentences[0]
print(' '.join(sent2tokens(example_sent)), end='\n\n')

print("Predicted:", ' '.join(tagger.tag(sent2features(example_sent))))
print("Correct:  ", ' '.join(sent2labels(example_sent)))

Школа злословия учит прикусить язык

Predicted: NOUN NOUN VERB INFN NOUN
Correct:   NOUN NOUN VERB INFN NOUN


In [198]:
def map_tag_to_task(tag: str):
    """Translate a corpus POS tag into the target task's tag set.

    The tag is matched by prefix and the first matching entry wins; a tag
    that matches no prefix maps to ``"ADV"``.

    Args:
        tag: POS tag as produced by the tagger.

    Returns:
        str: the task-level tag.
    """
    # Order matters only in that it mirrors the lookup sequence; the
    # prefixes themselves do not overlap.
    prefix_to_label = (
        ("NOUN", "S"),
        ("NPR", "NI"),
        ("V", "V"),
        ("INFN", "V"),
        ("PRT", "V"),
        ("ADJ", "A"),
        ("COMP", "ADV"),
        ("PRCL", "ADV"),
        ("PREP", "PR"),
        ("CONJ", "CONJ"),
    )
    for prefix, label in prefix_to_label:
        if tag.startswith(prefix):
            return label
    return "ADV"
        

def word2pos(sent, i, tags):
    """Render the token at position ``i`` as ``word{lemma=TAG}``.

    Args:
        sent: sentence as a list of samples; the word text is element 0 of
            each sample.
        i: index of the token to render.
        tags: predicted tags, parallel to ``sent``.

    Returns:
        The formatted string, or None when the word is empty.
    """
    word = sent[i][0]
    tag = tags[i]
    if not word:
        return None

    # Fall back to the lower-cased word when normalization leaves nothing
    # (e.g. a punctuation-only token); a None lemma cannot be concatenated.
    key = normalize_token(word.lower(), fix_diacritics=True) or word.lower()
    # Unknown forms keep their normalized spelling as the lemma.
    lemma = lemmas.get(key, key)
    # No blanket try/except: unexpected errors should surface with their
    # original traceback rather than be replaced by a failing diagnostic.
    return word + "{" + lemma + "=" + map_tag_to_task(tag.strip()) + "}"


def text2pos(sent, tagger):
    """Tag a sentence and return its tokens as space-separated ``word{lemma=TAG}`` items.

    Args:
        sent: sentence as a list of samples (word text at element 0).
        tagger: object whose ``tag`` method maps a feature sequence to tags.
    """
    predicted = tagger.tag(sent2features(sent))
    rendered = []
    for index in range(len(sent)):
        rendered.append(word2pos(sent, index, predicted))
    return ' '.join(rendered)

In [199]:
# Hand-picked form -> lemma overrides applied on top of the loaded dictionary.
lemmas.update({
    'всем': 'все',
    'всех': 'все',
    'все': 'все',
    'гришины': 'гриша',
    'был': 'быть',
    'кого': 'кто',
    'пахры': 'пахра',
    'этому': 'этот',
    'людей': 'человек',
})

In [200]:
sentences = \
    get_sentences_from_text("Стала стабильнее экономическая и политическая обстановка, "
                            "предприятия вывели из тени зарплаты сотрудников."
                            "Все Гришины одноклассники уже побывали за границей, "
                            "он был чуть ли не единственным, кого не вывозили никуда"
                            " дальше Красной Пахры."
                            "А все дёдовы авантюры только ещё более подтачивали его здоровье.")
text2pos(sentences[2], tagger)

'А{а=CONJ} все{все=S} дёдовы{дeдовы=S} авантюры{авантюра=S} только{только=ADV} ещё{ещe=S} более{более=ADV} подтачивали{подтачивать=V} его{его=A} здоровье{здоровье=S}'

In [202]:
# Tag every line of the input dataset and write one tagged sentence per
# output line.
# The input path uses a forward slash: '\d' is not a valid escape sequence,
# and a backslash separator only resolves on Windows. The handle is named
# `source` so the builtin `input` is not shadowed for the rest of the notebook.
with open('result.txt', mode='w', encoding='utf8') as output, \
        open('data/dataset_37845_1.txt', encoding='utf8') as source:
    for line in source:
        line = line.strip()
        sentences = get_sentences_from_text(line)
        for sentence in sentences:
            output.write(text2pos(sentence, tagger) + "\n")