# In[68]:
import pandas as pd
import re
import string
from collections import defaultdict
from pullenti_wrapper.langs import set_langs, UA
from pullenti_wrapper.processor import Processor, PERSON
set_langs([UA])

# In[33]:
class NameSearch():
    NAME_ENDINGS = ["ев", "єв", "ив", "ів", "їв", 
                   "еві", "ева", "евим", "еву", "евої", "евій", "евою", "евому",
                   "єві", "єву", "єва", "євим", "євої", "євій", "євою", "євому",
                   "ива", "иву", "иві", "ивим", "ивої", "ивій", "ивою", "ивому",
                   "іва", "іву", "іві", "івим", "івої", "івій", "івою",
                   "їва", "їву", "їві", "ївим", "ївої", "ївій", "ївою",
                   "кий", "кому", "кого", "ким",
                   "ому", "ого", "ок", "к", "ка", "кої", "кою", "кій", 'ки', 'ці',
                   "ко", "ку", "кові", "ком",
                   "ець", "єць", "йця", "еця", "єця", "ця", "ь", "ецю", "єцю", "йцю", "цю", "цеві", "йцеві", "цем", "йцем",
                   "ої", "ою","ею","ию","ой", "им", "ім", "їм", "их", "іх", "їх", "ий", "ій", "ія", "ію", "ієм", "їй", 
                   "о", "а", "е", "є", "и", "і", "ї", "й", "у", "ю", "я",
                   "ивих", "івих", "ївих", "ам", "ям", "ом", "ем",
                   "ір", "ора", "ору", "ором"]
    
    def __init__(self, text):
        self.text = text
    
    def get_ner(self, text):
        """Extract person mentions from *text* with the pullenti PERSON analyzer.

        Every match whose referent label is ``"PERSON"`` becomes a dict with
        the keys ``raw``, ``last_name``, ``first_name`` and ``middle_name``
        (taken from the referent) and, when one of the match's children is a
        ``PERSONPROPERTY`` referent, ``label`` (that child's ``name``).

        The extraction is best-effort: if anything raises, the error is
        printed and the persons collected so far are returned.

        :param text: document text to analyse.
        :returns: list of person dicts (possibly empty).
        """
        persons = []
        try:
            processor = Processor([PERSON])
            result = processor(text)
            for match in result.walk():
                referent = match.referent
                if referent.label != "PERSON":
                    continue
                d = {"raw": str(referent.raw),
                     "last_name": referent.lastname,
                     "first_name": referent.firstname,
                     "middle_name": referent.middlename
                     }
                # Scan the children until the first PERSONPROPERTY is found.
                # Stopping after the first child regardless of its type would
                # miss a property that is not in the first position.
                for child in match.children:
                    if child.referent.label == "PERSONPROPERTY":
                        d["label"] = child.referent.name
                        break
                persons.append(d)
        except Exception as e:
            print("ERROR in NER.get_ner: ", e)
        return persons

    @staticmethod
    def find_unique_persons(persons):
        d = defaultdict(list)
        for person in persons:
            try:
                if '-' in person["last_name"]:
                    person["last_name"] = person["last_name"].split('-')[0]
                if person["first_name"].lower() == "екатерина":
                    person["first_name"] = "КАТЕРИНА"
                stem_name = ""
                for ch in NameSearch.NAME_ENDINGS:
                    if person["last_name"].lower().endswith(ch):
                        stem_name = person["last_name"].lower()[:-len(ch)]
                        stem_name = stem_name[0].upper() + stem_name[1:]
                        initials = person["first_name"][0] + "." + person["middle_name"][0] + "."
                        break
                if not stem_name:
                    stem_name = person["last_name"][0].upper() + person["last_name"][1:].lower() 
                    initials = person["first_name"][0] + "." + person["middle_name"][0] + "."
                d[(stem_name, initials)].append(person)
            except Exception as e:
                print("ERROR in NER.find_unique_persons: ", e)
        return d

    def search_person_algorithm(self, text):
        names = []
        trash = ('Судд', 'Головуюч', 'України')
        exclude = ['\\', '|', ':', ';', '(', ')', '*', '’', '\'', '`']
        lst = ''.join(ch for ch in text if ch not in exclude)
        lst = re.sub('\n', ' $ ', lst).split(' ')
        lst = [x for x in lst if x != '']
        for i in range(2):
            lst.insert(i, ' ')
            lst.append(' ')
        for i in range(len(lst)):
            try:
                if lst[i].isalpha() & lst[i][0].isupper() & lst[i+1][0].isupper() & (lst[i+1][1] == '.') & (not lst[i].startswith(trash)):
                    if lst[i+2][0].isupper() & lst[i+2].endswith(('.', ',')):
                        if (not (lst[i-1][0].isupper() & lst[i-1].endswith('.'))) | lst[i-1].endswith(','):
                            names.append((lst[i], lst[i+1]+lst[i+2]))
                    else:
                        if (not (lst[i-1][0].isupper() & lst[i-1].endswith('.'))) | lst[i-1].endswith(','):
                            names.append((lst[i], lst[i+1]))
            except IndexError:
                pass
        return names
    
    @staticmethod
    def persons_stemming(persons):
        exclude = ['.', ',', ':', ';']
        d = defaultdict(list)
        for person in persons:
            stem_name = ""
            initials = ""
            initial_stripped = ''.join(ch for ch in person[1] if ch not in exclude)
            for ch in NameSearch.NAME_ENDINGS:
                if person[0].lower().endswith(ch):
                    stem_name = person[0].lower()[:-len(ch)]
                    if len(stem_name) == 1:
                        stem_name = stem_name[0].upper()
                    else:
                        try:
                            stem_name = stem_name[0].upper() + stem_name[1:]
                        except IndexError:
                            break
                    if len(initial_stripped) == 2:
                        initials = initial_stripped[0] + '.' + initial_stripped[1] + '.'
                        d[(stem_name, initials)].append(person)
                    break
            if not stem_name:
                stem_name = person[0][0].upper() + person[0][1:].lower() 
                if len(initial_stripped) == 2:
                    initials = initial_stripped[0] + '.' + initial_stripped[1] + '.'
                    d[(stem_name, initials)].append(person)
        return d

    def name_searh(self): 
        persons_pullenti = NameSearch.find_unique_persons(self.get_ner(self.text))
        persons_alg = NameSearch.persons_stemming(self.search_person_algorithm(self.text))
        dct_alg = {}
        for person in (set(persons_alg.keys()) - set(persons_pullenti.keys())):
            dct_alg[person] = persons_alg[person]
        dct = {**persons_pullenti, **dct_alg}
        return dct
        

# In[ ]:
class UkrainianStemmer():
    """Rule-based suffix stripper for a single Ukrainian word.

    The word is lower-cased and lightly normalised, split after its first
    vowel, and a fixed sequence of regular-expression suffix removals is
    applied to the part after that vowel (kept in ``self.RV``).

    Usage: ``UkrainianStemmer(word).stem_word()``.
    """

    def __init__(self, word):
        """Store *word* and the suffix patterns used by ``stem_word``."""
        self.word = word
        # Remainder of the word after its first vowel; rewritten by ``s``.
        self.RV = ''
        self.vowel = r'аеиоуюяіїє'  # http://uk.wikipedia.org/wiki/Голосний_звук
        self.rvre = r'[аеиоуюяіїє]'
        self.perfectiveground = r'(ив|ивши|ившись|ыв|ывши|ывшись((?<=[ая])(в|вши|вшись)))$'
        # http://uk.wikipedia.org/wiki/Рефлексивне_дієслово
        self.reflexive = r'(с[яьи])$'
        # http://uk.wikipedia.org/wiki/Прикметник + http://wapedia.mobi/uk/Прикметник
        self.adjective = r'(ими|ій|ий|а|е|ова|ове|ів|є|їй|єє|еє|я|ім|ем|им|ім|их|іх|ою|йми|іми|у|ю|ого|ому|ої)$'
        # http://uk.wikipedia.org/wiki/Дієприкметник
        self.participle = r'(ий|ого|ому|им|ім|а|ій|у|ою|ій|і|их|йми|их)$'
        # http://uk.wikipedia.org/wiki/Дієслово
        self.verb = r'(сь|ся|ив|ать|ять|у|ю|ав|али|учи|ячи|вши|ши|е|ме|ати|яти|є)$'
        # http://uk.wikipedia.org/wiki/Іменник
        self.noun = r'(а|ев|ов|е|ями|ами|еи|и|ей|ой|ий|й|иям|ям|ием|ем|ам|ом|о|у|ах|иях|ях|ы|ь|ию|ью|ю|ия|ья|я|і|ові|ї|ею|єю|ою|є|еві|ем|єм|ів|їв|ю)$'
        self.derivational = r'[^аеиоуюяіїє][аеиоуюяіїє]+[^аеиоуюяіїє]+[аеиоуюяіїє].*(?<=о)сть?$'

    def ukstemmer_search_preprocess(self, word):
        """Lower-case *word*, drop ASCII apostrophes and map "ё"/"ъ" to "е"/"ї"."""
        return word.lower().replace("'", "").replace("ё", "е").replace("ъ", "ї")

    def s(self, st, reg, to):
        """Store ``re.sub(reg, to, st)`` in ``self.RV``.

        :returns: True when the substitution changed the string.
        """
        self.RV = re.sub(reg, to, st)
        return self.RV != st

    def stem_word(self):
        """Return the stem of ``self.word``.

        A word without any vowel is returned unchanged apart from the
        preprocessing; in that case ``self.RV`` is not touched.
        """
        normalized = self.ukstemmer_search_preprocess(self.word)
        if re.search('[аеиоуюяіїє]', normalized) is None:
            return normalized

        # Everything up to and including the first vowel is never modified.
        cut = re.search(self.rvre, normalized).end()
        head = normalized[:cut]
        self.RV = normalized[cut:]

        # Step 1
        if not self.s(self.RV, self.perfectiveground, ''):
            self.s(self.RV, self.reflexive, '')
            if self.s(self.RV, self.adjective, ''):
                self.s(self.RV, self.participle, '')
            elif not self.s(self.RV, self.verb, ''):
                self.s(self.RV, self.noun, '')

        # Step 2
        self.s(self.RV, 'и$', '')

        # Step 3
        if re.search(self.derivational, self.RV):
            self.s(self.RV, 'ость$', '')

        # Step 4
        if self.s(self.RV, 'ь$', ''):
            self.s(self.RV, 'ейше?$', '')
            self.s(self.RV, 'нн$', 'н')

        return head + self.RV

# In[78]:
class FeaturesForNames():
    
    def __init__(self, file_name, text, persons):
        self.text = text
        self.persons = persons
        self.file = file_name
        
    @staticmethod
    def person_indexes(name, initials, text):
        """
        params: name - last name of person
        initials - initials of person
        text - document to depersonalize
        returns: lists of indexes
        """
        indexes = []
        exclude = ['-','\\', '|', ';', ',', '(', ')', '*', '’', '\'','`']
        lst = ''.join(ch for ch in text if ch not in exclude)
        pat1 = r':'
        pat2 = r'\n'
        combined_pat = r'|'.join((pat1, pat2))
        lst = re.sub(combined_pat, ' ', lst).split(' ')
        lst = [x for x in lst if x != '']
        for i in range(2):
            lst.insert(i, ' ')
            lst.append(' ')
        for i in range(len(lst)):
            try:
                if (lst[i].startswith((name,name.upper())) & ((lst[i-1].endswith(initials) | lst[i+1].startswith(initials)) | \
                                              ((initials[0] == lst[i+1][0]) & (initials[2] == lst[i+2][0])) |\
                                              (initials[0] == lst[i+1][0]) |\
                                              ((initials[0] == lst[i-2][0]) & (initials[2] == lst[i-1][0])) |\
                                              (initials[0] == lst[i-1][0]))) |\
                   lst[i].startswith(initials+name) | (lst[i].startswith(name) & lst[i].endswith(initials)):
                    indexes.append(i)
            except IndexError:
                pass        
        return indexes, len(lst)

    
    def features(self):
        """
        params: name - last name of person
        initials - initials of person
        text - document to depersonalize
        returns: lists of indexes
        """
        judge_indexes = []
        prosecutor_indexes = []
        defence_indexes = []
        secretary_indexes = []
        notary_indexes = []
        convict_indexes = []
        arbitration_indexes = []
        plaintiff_indexes = []
        defendant_indexes = []
        detective_indexes = []
        liquidator_indexes = []
        third_party_indexes = []
        registrar_indexes = []
        witness_indexes = []
        accused_indexes = []
        suspect_indexes = []
        victim_indexes = []
        representative_indexes = []
        overhaul_indexes = []
        head_indexes = []
        statement_indexes = []
        debtor_indexes = []
        agent_indexes = []
        famous_indexes = []
        president_indexes = []
        quotes_indexes = []
        judge = ('судд')
        prosecutor = ('прокурор')
        defence = ('захисник', 'адвокат')
        secretary = ('секретар')
        notary = ('нотаріус', 'перекладач')
        convict = ('засуджен')
        arbitration = (('арбітражн'), ('керуюч'))
        plaintiff = ('позивач', 'позов')
        defendant = ('відповідач')
        detective = ('слідч', 'детектив')
        liquidator = ('ліквідатор')
        third_party = (('трет'), ('особ'))
        registrar = ('реєстратор')
        witness = ('свідок', 'свідк', 'показ')
        accused = ('звинувачуван', 'звинувачен', 'обвинувачуван','обвинувачен')
        suspect = ('підозрюван')
        victim = ('потерпілий')
        representative = ('представни')
        overhaul = (('керуюч'), ('санацією'))
        head = ('голова', 'голові', 'головою', 'голови', 'начальн', 'керівник', 'директор')
        statement = ('заяв')
        debtor = ('боржник')
        agent = ('уповноважен')
        famous = ('ім.', 'імені')
        president = (('президент', 'україн'))
        quotes = ('\"', '«', '»', '”', '“')
        exclude = ['-', '\\', '|', ':', ';', ',', '(', ')', '*', '’', '\'', '`']
        lst = ''.join(ch for ch in self.text if ch not in exclude)
        lst = re.sub('\n', ' ', lst).split(' ')
        lst = [x for x in lst if x != '']
        for i in range(2):
            lst.insert(i, ' ')
            lst.append(' ')
        for i in range(len(lst)):
            try:
                if lst[i].lower().startswith(judge):
                    judge_indexes.append(i)
                if lst[i].lower().startswith(prosecutor):
                    prosecutor_indexes.append(i)
                if lst[i].lower().startswith(defence):
                    defence_indexes.append(i)
                if lst[i].lower().startswith(secretary):
                    secretary_indexes.append(i)
                if lst[i].lower().startswith(notary):
                    notary_indexes.append(i)
                if lst[i].lower().startswith(convict):
                    convict_indexes.append(i)
                if lst[i].lower().startswith(arbitration[0]) & lst[i+1].lower().startswith(arbitration[1]):
                    arbitration_indexes.append(i+1)
                if lst[i].lower().startswith(plaintiff):
                    plaintiff_indexes.append(i)
                if lst[i].lower().startswith(defendant):
                    defendant_indexes.append(i)
                if lst[i].lower().startswith(detective):
                    detective_indexes.append(i)
                if lst[i].lower().startswith(liquidator):
                    liquidator_indexes.append(i)
                if lst[i].lower().startswith(third_party[0]) & lst[i+1].lower().startswith(third_party[1]):
                    third_party_indexes.append(i+1)
                if lst[i].lower().startswith(registrar):
                    registrar_indexes.append(i)
                if lst[i].lower().startswith(witness):
                    witness_indexes.append(i)
                if lst[i].lower().startswith(accused):
                    accused_indexes.append(i)
                if lst[i].lower().startswith(suspect):
                    suspect_indexes.append(i)
                if lst[i].lower().startswith(victim):
                    victim_indexes.append(i)
                if lst[i].lower().startswith(representative):
                    representative_indexes.append(i)
                if lst[i].lower().startswith(overhaul[0]) & lst[i+1].lower().startswith(overhaul[1]):
                    overhaul_indexes.append(i+1)
                if lst[i].lower().startswith(head):
                    head_indexes.append(i)
                if lst[i].lower().startswith(statement):
                    statement_indexes.append(i)
                if lst[i].lower().startswith(debtor):
                    debtor_indexes.append(i)
                if lst[i].lower().startswith(agent):
                    agent_indexes.append(i)
                if lst[i].lower().startswith(famous):
                    famous_indexes.append(i)
                if lst[i].lower().startswith(president[0]) & lst[i+1].lower().startswith(president[1]):
                    president_indexes.append(i+1)
                if any([x in lst[i] for x in quotes]):
                    quotes_indexes.append(i)
            except IndexError:
                pass        
        return judge_indexes, prosecutor_indexes, defence_indexes, secretary_indexes, \
                notary_indexes, convict_indexes, arbitration_indexes, plaintiff_indexes, defendant_indexes, \
                detective_indexes, liquidator_indexes, third_party_indexes, registrar_indexes, witness_indexes, \
                accused_indexes, suspect_indexes, victim_indexes, representative_indexes, overhaul_indexes, head_indexes, \
                statement_indexes, debtor_indexes, agent_indexes, famous_indexes, president_indexes, quotes_indexes
    
    
    @staticmethod
    def pseudo_label(name, initials, text):
        dct = defaultdict(list)
        exclude = ['\\', '|', ':', ';', ',', '(', ')', '*', '’', '\'', '`']
        lst = ''.join(ch for ch in text if ch not in exclude)
        lst = re.sub('\n', ' ', lst).split(' ')
        lst = [x for x in lst if x != '']
        for i in range(2):
            lst.insert(i, ' ')
            lst.append(' ')
        for i in range(len(lst)):
            try:
                if ((lst[i].startswith(name) & ((initials == lst[i-1]) | (initials == lst[i+1]) | \
                                              ((initials[0] == lst[i+1][0]) & (initials[2] == lst[i+2][0])) |\
                                              (initials[0] == lst[i+1][0]) |\
                                              ((initials[0] == lst[i-2][0]) & (initials[2] == lst[i-1][0])) |\
                                              (initials[0] == lst[i-1][0]))) |\
                   lst[i].startswith(initials+name) | (lst[i].startswith(name) & lst[i].endswith(initials))) & (lst[i-1] == '-'):
                    dct[name+' '+initials] = ''.join(ch for ch in lst[i-2] if ch not in exclude)
            except IndexError:
                pass        
        return dct
    
    
    def court_type(self):
        court_type = []
        exclude = ['\\', '|', ':', ';', '(', ')', '*', '’', '\'', '`']
        lst = ''.join(ch for ch in self.text if ch not in exclude)
        lst = re.sub('\n', ' ', lst).split(' ')
        lst = [x for x in lst if x != '']
        stop = []
        for i in lst:
            if i.lower().startswith('судд'):
                stop.append(i.lower())
            if (i.lower() == 'суд') | (i.lower() == 'суду'):
                break
            else:
                court_type.append(i.lower())
        try:
            answer = ' '.join(court_type[court_type.index(stop[-1])+1:])
        except (ValueError, IndexError):
            answer = ' '.join(court_type).replace('![]herb.gif','')
        return answer
    
    def make_dataframe(self):
        persons = []
        labels = []
        pseudo_labels = []
        person_index = []
        file = [self.file]
        court_type = [self.file, self.court_type()]
        feature = [self.file, self.features()]
        for person in self.persons:
            person_index.append(FeaturesForNames.person_indexes(person[0], person[1], self.text))
            persons.append(person)
            label = []
            try:
                for i in self.persons[person]:
                    if 'label' in i.keys():
                        label.append(i['label'])
            except AttributeError:
                pass
            labels.append(label)
            if FeaturesForNames.pseudo_label(person[0], person[1], self.text):
                pseudo_labels.append(list(FeaturesForNames.pseudo_label(person[0], person[1], self.text).values()))
            else:
                pseudo_labels.append([])
        data = pd.DataFrame({'person': persons, 'person_indexes': [person_index[i][0] for i in range(len(person_index))],
                             'length_doc' : [person_index[i][1] for i in range(len(person_index))], 
                             'labels': labels, 'pseudo_labels': pseudo_labels, 'file_name': file*len(persons)
                            })
        court_types = pd.DataFrame({'file_name': [court_type[0]], 'court_type': [court_type[1]]})
        features = pd.DataFrame({'file_name': [feature[0]], 'judge': [feature[1][0]],
                                 'prosecutor': [feature[1][1]], 'defence': [feature[1][2]], 
                                 'secretary': [feature[1][3]] , 'notary': [feature[1][4]], 
                                 'convict': [feature[1][5]], 'arbitration': [feature[1][6]],  
                                 'plaintiff': [feature[1][7]], 'defendant': [feature[1][8]],
                                 'detective': [feature[1][9]], 'liquidator': [feature[1][10]],  
                                 'third_party': [feature[1][11]], 'registrar': [feature[1][12]],  
                                 'witness': [feature[1][13]], 'accused': [feature[1][14]],  
                                 'suspect': [feature[1][15]], 'victim': [feature[1][16]],
                                 'representative': [feature[1][17]], 'overhaul': [feature[1][18]], 
                                 'head': [feature[1][19]], 'statement': [feature[1][20]], 
                                 'debtor': [feature[1][21]], 'agent': [feature[1][22]],
                                 'famous': [feature[1][23]], 'president': [feature[1][24]],
                                 'quotes': [feature[1][25]]
                                })
        data = data.merge(features, on='file_name', how='left').merge(court_types, on='file_name', how='left')
        return data