In [None]:
# Mystem instance; its analyze() output is consumed in this notebook as a list
# of token dicts that have a 'text' key and, for some tokens, an 'analysis' key.
from pymystem3 import Mystem
mystemmer = Mystem()

# Snowball stemmer configured for Russian; stem_phrase() calls its stem() method.
from nltk.stem.snowball import SnowballStemmer 
stemmer = SnowballStemmer("russian") 

In [None]:
# Stemmed phrase -> "ORG"; filled from the rows of collection.txt tagged "ORG".
stem_dic = {}
# Stemmed phrase -> "PERSON"; filled from the rows of collection.txt tagged "PER".
stem_persona_dic = {}

def stem_phrase(tokens):
    """Stem every token of a phrase and glue the stems back into one string.

    Args:
        tokens: iterable of token dicts; each has a 'text' key and word
            tokens additionally carry an 'analysis' key.

    Returns:
        The concatenated stems with trailing newlines stripped. For tokens
        that have an 'analysis' key, each stem character is upper-cased when
        the character at the same index of the source text is upper-case.
    """
    pieces = []
    for token in tokens:
        source = token['text']
        stem = stemmer.stem(source)
        if 'analysis' in token:
            # Re-apply the source capitalisation position by position.
            stem = ''.join(
                char.upper() if source[index].isupper() else char
                for index, char in enumerate(stem)
            )
        pieces.append(stem)
    return ''.join(pieces).rstrip('\n')

# Fill the lookup dictionaries from collection.txt. Each usable row has the
# form "<TAG>\t<phrase>"; only the "ORG" and "PER" tags are indexed, keyed by
# the stemmed form of the phrase.
with open('collection.txt') as file:
    for raw_line in file:
        fields = raw_line.rstrip('\n').split('\t')
        if len(fields) < 2:
            # Blank or tab-less row: there is no phrase column to index, and
            # reading fields[1] would raise IndexError and stop the load.
            continue
        tag, phrase = fields[0], fields[1]
        if tag == "ORG":
            stem_dic[stem_phrase(mystemmer.analyze(phrase))] = "ORG"
        elif tag == "PER":
            stem_persona_dic[stem_phrase(mystemmer.analyze(phrase))] = "PERSON"
            

In [None]:
def is_capitalised(text):
    """Tell whether ``text`` begins with an upper-case alphabetic character.

    An empty string is never considered capitalised.
    """
    if not text:
        return False
    head = text[0]
    return head.isalpha() and head.isupper()

def is_word_token(token):
    """Return True when ``token`` contains 'analysis'.

    For a token dict this is a key lookup: tokens carrying an 'analysis' key
    are the ones treated as words throughout this notebook.
    """
    has_analysis = 'analysis' in token
    return has_analysis

def is_russian_word_token(token):
    """Check that ``token`` is a word token whose analysis is non-empty.

    Returns ``False`` when the token is not a word token; otherwise returns
    the token's 'analysis' value itself, so callers should rely on the
    truthiness of the result rather than on a strict boolean.

    NOTE(review): presumably an empty analysis marks words the analyser could
    not handle (e.g. non-Russian ones) - confirm against Mystem's output.
    """
    if not is_word_token(token):
        return False
    return token['analysis']

def detect_name_abbr(analyzed):
    """Find single-letter initials, such as the "А" in "А. Пушкин".

    A token is reported when it is a word token, its text is exactly one
    upper-case character, and the text of the next token starts with '.'.

    Args:
        analyzed: list of token dicts with a 'text' key; word tokens also
            carry an 'analysis' key.

    Returns:
        List of ``(position, length)`` tuples, where ``position`` is the
        character offset of the token within the concatenated token texts.
    """
    result = []
    position = 0
    for i, token in enumerate(analyzed):
        text = token['text']
        # The token dict (not its text) must be tested here: applied to a
        # string, `'analysis' in ...` is a substring search and never matches
        # a one-letter initial.
        if is_word_token(token) and len(text) == 1 and text.isupper():
            # An initial that is the very last token has no follower to inspect.
            following_text = analyzed[i + 1]['text'] if i + 1 < len(analyzed) else ''
            if following_text.startswith('.'):
                result.append((position, len(text)))
        position += len(text)
    return result

def _is_capitalised_noun(token):
    """Return True for a capitalised, non-abbreviation word whose first
    analysis has a 'gr' string starting with 'S'."""
    if not is_russian_word_token(token):
        return False
    text = token['text']
    if is_abbr(text) or not is_capitalised(text):
        return False
    # NOTE(review): 'S' is presumably Mystem's noun tag - confirm. The check
    # looks only at the first character, so any tag starting with 'S' passes.
    # startswith() also tolerates an empty or missing 'gr'.
    return token['analysis'][0].get('gr', '').startswith('S')


def detect_double_capitalised(analyzed):
    """Find pairs of capitalised words separated by exactly one space.

    For every word token followed by a ``" "`` token and then another token,
    both words are reported when each is a word with a non-empty analysis,
    is not an abbreviation, is capitalised, and has a first-analysis 'gr'
    string that starts with 'S'.

    Args:
        analyzed: list of token dicts with a 'text' key; word tokens also
            carry an 'analysis' key.

    Returns:
        List of ``(position, length)`` tuples, two per detected pair, where
        ``position`` is the character offset within the concatenated token
        texts. Overlapping pairs may produce duplicate tuples.
    """
    result = []
    position = 0
    token_count = len(analyzed)

    for i, token in enumerate(analyzed):
        text = token['text']
        # Both the separator (i + 1) and the second word (i + 2) must exist;
        # without this guard a word near the end of the list raises IndexError.
        if (is_word_token(token)
                and i + 2 < token_count
                and analyzed[i + 1]['text'] == " "):
            sur_token = analyzed[i + 2]
            if _is_capitalised_noun(token) and _is_capitalised_noun(sur_token):
                result.append((position, len(text)))
                # +1 skips the single-space separator.
                result.append((position + len(text) + 1, len(sur_token['text'])))
        position += len(text)
    return result

def detect_personas(text):
    """Run every person detector over ``text`` and merge their findings.

    The text is analysed once with ``mystemmer``; the dictionary detector
    (using ``stem_persona_dic``), the initials detector and the
    double-capitalised detector are then applied in that order.

    Returns:
        List of unique ``(position, length)`` tuples, in set iteration order.
    """
    tokens = mystemmer.analyze(text)
    spans = []
    spans.extend(detect_from_dict_stem(tokens, stem_persona_dic))
    spans.extend(detect_name_abbr(tokens))
    spans.extend(detect_double_capitalised(tokens))
    return list(set(spans))

def detect_eng(analysed):
    """Find tokens that start with an upper-case ASCII Latin letter.

    Only tokens longer than one character are reported.

    Args:
        analysed: list of token dicts with a 'text' key.

    Returns:
        List of ``(position, length)`` tuples, where ``position`` is the
        character offset within the concatenated token texts.
    """
    latin_upper = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    result = []
    position = 0
    for token in analysed:
        text = token['text']
        # The length test comes first so that an empty token text never
        # reaches text[0] (which would raise IndexError).
        if len(text) > 1 and text[0] in latin_upper:
            result.append((position, len(text)))
        position += len(text)
    return result

def is_abbr(text):
    """Tell whether ``text`` looks like an abbreviation.

    That is: at least two characters long, purely alphabetic, and entirely
    upper-case.
    """
    if len(text) <= 1:
        return False
    return text.isalpha() and text.isupper()

def detect_abbr(analysed):
    """Find every token whose text satisfies ``is_abbr``.

    Args:
        analysed: list of token dicts with a 'text' key.

    Returns:
        List of ``(position, length)`` tuples, where ``position`` is the
        character offset within the concatenated token texts.
    """
    spans = []
    offset = 0
    for token in analysed:
        chunk = token['text']
        size = len(chunk)
        if is_abbr(chunk):
            spans.append((offset, size))
        offset += size
    return spans

def detect_in_quotes(analysed):
    """Find word tokens inside «…» quotes whose quoted text starts upper-case.

    A stack tracks the indices of tokens containing a not-yet-closed '«'.
    A '«' inside the very first token is deliberately not pushed, and a '»'
    with nothing open is ignored. While a quote is open, every token that has
    an 'analysis' key is reported if the token right after the innermost
    opening-quote token begins with an upper-case character.

    Args:
        analysed: list of token dicts with a 'text' key; word tokens also
            carry an 'analysis' key.

    Returns:
        List of ``(position, length)`` tuples, where ``position`` is the
        character offset within the concatenated token texts.
    """
    spans = []
    offset = 0
    open_quotes = []

    for index, token in enumerate(analysed):
        chunk = token['text']
        for char in chunk:
            if char == '«':
                if index != 0:
                    open_quotes.append(index)
            elif char == '»' and open_quotes:
                open_quotes.pop()
        if open_quotes and 'analysis' in token:
            first_inside = analysed[open_quotes[-1] + 1]['text']
            if first_inside[0].isupper():
                spans.append((offset, len(chunk)))
        offset += len(chunk)
    return spans

def detect_from_dict_stem(analysed, dic):
    """Find runs of tokens whose stemmed form is a key of ``dic``.

    Starting at every word token, windows of 1 to 9 consecutive tokens are
    stemmed with ``stem_phrase`` and looked up in ``dic``; the first (i.e.
    shortest) window whose stem is longer than one character and present in
    ``dic`` wins. Every token inside that window that has an 'analysis' key
    is reported. When anything matched, the list of matched stems is printed.

    Args:
        analysed: list of token dicts with a 'text' key; word tokens also
            carry an 'analysis' key.
        dic: container of stemmed phrases to match against.

    Returns:
        List of ``(position, length)`` tuples, where ``position`` is the
        character offset within the concatenated token texts.
    """
    spans = []
    matched = []
    offset = 0
    total = len(analysed)

    for start, token in enumerate(analysed):
        if is_word_token(token):
            for width in range(1, 10):
                stop = start + width
                if stop > total:
                    break
                key = stem_phrase(analysed[start:stop])
                if len(key) > 1 and key in dic:
                    matched.append(key)
                    inner_offset = offset
                    for part in analysed[start:stop]:
                        size = len(part['text'])
                        if 'analysis' in part:
                            spans.append((inner_offset, size))
                        inner_offset += size
                    break
        offset += len(token['text'])

    if matched:
        print(matched)
    return spans

def detect_orgs(text):
    """Run every organisation detector over ``text`` and merge their findings.

    The text is analysed once with ``mystemmer``; the Latin-capital,
    abbreviation, in-quotes and dictionary (``stem_dic``) detectors are then
    applied in that order.

    Returns:
        List of unique ``(position, length)`` tuples, in set iteration order.
    """
    tokens = mystemmer.analyze(text)
    spans = []
    for detector in (detect_eng, detect_abbr, detect_in_quotes):
        spans.extend(detector(tokens))
    spans.extend(detect_from_dict_stem(tokens, stem_dic))
    # NOTE: an additional detect_from_dict(text, bad_dic) pass is left disabled.
    return list(set(spans))
    

In [None]:
def to_str(pref, arr):
    """Serialise spans as "<start> <length> <pref> " chunks joined together.

    Args:
        pref: label written after every span.
        arr: iterable of indexable items; elements 0 and 1 of each are emitted.

    Returns:
        The concatenated chunks (each ends with a space); '' when ``arr`` is
        empty.
    """
    chunks = ["{} {} {} ".format(item[0], item[1], pref) for item in arr]
    return ''.join(chunks)

# Annotate dataset.txt line by line and write one answer row per input line to
# answer.txt: all PERSON spans, then all ORG spans, terminated by "EOL".
total = 0          # running count of detected spans (persons + orgs)
last_line = None   # most recent input line, reported if processing fails
try:
    with open('dataset.txt', 'r') as data, open('answer.txt', 'w') as out:
        for line in data:
            last_line = line
            personas = detect_personas(line)
            orgs = detect_orgs(line)
            total += len(personas) + len(orgs)
            result = to_str("PERSON", personas) + to_str("ORG", orgs) + "EOL\n"
            out.write(result)
except Exception as e:
    # Any failure stops the run; show the error and the line being processed.
    print(e)
    print(last_line)
print(total)