### NLP of legal texts
Analysis of agreements between governments 
* Key words & key phrases extraction with TF-IDF and N-gramms
* NER for DATES with ([Natasha](https://natasha.github.io/demo/) (rule-based lib for Russian language). Sequence model, implemented in [AnaGo](https://anago.herokuapp.com/) and NER by [DeepMIPT](https://demo.ipavlov.ai/) have lower accuracy for this type of text.
* Dictionary method and morphological analysis for finding ORGANIZATIONS and COUNTRIES (accuracy is more important than the opportunity to expand the lists)

In [1]:
import re
from datetime import date
import pymorphy2
import gensim
import nltk
#nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from collections import Counter
m = pymorphy2.MorphAnalyzer()
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from natasha import (
    NamesExtractor,
    DatesExtractor,
    MoneyExtractor,
    LocationExtractor,
)

### Data preprocessing

In [2]:
def lemm(word):
    word = re.sub("(</?.*?>)|(<>)|(\\d|\\W)+", '', word).lower()
    return m.parse(word)[0].normal_form.strip()
def preprocess(file_readed_by_lines):
    return [[lemm(word) for word in word_tokenize(text) if ((lemm(word) not in stopWords) and len(word)>3)] for text in file_readed_by_lines]

In [3]:
# corpus is uploaded from https://xn--80abucjiibhv9a.xn--p1ai/%D0%BC%D0%B8%D0%BD%D0%B8%D1%81%D1%82%D0%B5%D1%80%D1%81%D1%82%D0%B2%D0%BE/68/%D1%84%D0%B0%D0%B9%D0%BB/916/%D0%9C%D0%A1_%D0%9D%D0%A2%D0%A1.pdf
# doesn't contain the test doc
with open ("corpus.txt", "r") as f:
    corpus_lines = f.readlines() 

In [4]:
with open ("corpus.txt", "r") as f:
    corpus = f.read() 

In [5]:
with open ("testdoc.txt", "r") as f:
    doc_lines = f.readlines()

In [6]:
with open ("testdoc.txt", "r") as f:
    doc = f.read()

In [7]:
stopWords = set(line.strip() for line in open('RUstopwords.txt', 'r'))
# can be expanded
print(len(stopWords))

151


In [8]:
countries = set(line.strip().lower() for line in open('countries.txt', 'r'))
# can be expanded
print(len(countries))

204


In [9]:
organizations = set(line.strip().lower() for line in open('organizations.txt', 'r'))
# can be expanded
print(len(organizations))

156


In [10]:
df_idf = preprocess(corpus_lines)

In [11]:
df_idf_doc = preprocess(doc_lines)

In [12]:
corpus = ""
for line in df_idf:
    corpus += " ".join(line)

In [13]:
doc = ""
for line in df_idf_doc:
    doc += " ".join(line)

### Computing TF-IDF and extracting key words

In [14]:
#  sorts the values in the vector while preserving the column index
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key = lambda x: (x[1], x[0]), reverse=True)

def extract_topn_from_vector(feature_names, sorted_items, topn=20):
    sorted_items = sorted_items[:topn]
    score_vals = []
    feature_vals = []
    for idx, score in sorted_items:
        fname = feature_names[idx]
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])
    results = {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    return results

def tfidf_keywords(corpus, doc):
    
    cv = CountVectorizer(max_df=0.85)
    word_count_vector = cv.fit_transform(word_tokenize(corpus))
    tfidf_transformer = TfidfTransformer(smooth_idf=True,use_idf=True)
    tfidf_transformer.fit(word_count_vector)
    feature_names = cv.get_feature_names()
    
    tf_idf_vector = tfidf_transformer.transform(cv.transform([doc])) # enumerates a vector of tf-idf scores
    sorted_items = sort_coo(tf_idf_vector.tocoo())
    return extract_topn_from_vector(feature_names,sorted_items,20) 

In [15]:
keywords = []
kw_euristics = [[word for word in line if (word[-3:]=="ция")] for line in df_idf_doc]
for line in kw_euristics:
    for word in line:
        keywords.append(word)

In [16]:
keywords += tfidf_keywords(corpus, doc)

### Key phrases with tf-idf using N-gramms

In [17]:
def words_to_bigramms(text, str_bigrams = ""):
    for line in text:
        bigrams = ngrams(line,2)
        for k1, k2 in Counter(bigrams):
            str_bigrams += k1+ "_" + k2+ "_" + " "
    return str_bigrams

In [18]:
def words_to_trigramms(text, str_trigrams = ""):
    for line in text:
        trigrams = ngrams(line,3)
        for k1, k2,k3 in Counter(trigrams):
            str_trigrams += k1+ "_" + k2+ "_" +k3 + " "
    return str_trigrams

In [19]:
keyphrases = list(tfidf_keywords(words_to_bigramms(df_idf), words_to_bigramms(df_idf_doc)))
keyphrases += list(tfidf_keywords(words_to_trigramms(df_idf), words_to_bigramms(df_idf_doc)))

### Output

In [23]:
# euristics and morphological analysis
title = str(doc_lines[0]).strip()
topic = title.split(' ')[-2:]   
noun = m.parse(topic[1])[0]
adj = m.parse(topic[0])[0].inflect({noun.tag.gender, 'sing', 'nomn'})
print("\nНазвание документа: %s%s" % (title[0].upper(), title[1:]))

# dictionary method - искать также на три граммах и двуграммах (список строк)
df_idf_doc = preprocess(doc_lines)
orgs = []
orgs_ = [[word for word in line if (word in organizations)] for line in df_idf_doc]
for line in orgs_:
    for word in line:
        orgs.append(word)
for word in set(orgs):
    print("\nОрганизации:", word[0].upper()+word[1:])
    
# dictionary method - искать также на три граммах и двуграммах
coun_euristics = []
coun = [[word for word in line if (word in countries)] for line in df_idf_doc]
for line in coun:
    for word in line:
        coun_euristics.append(word)
for word in set(coun_euristics):
    print("\nСтраны:", word[0].upper()+word[1:])

# euristics and morphological analysis
act_type = ["в рамках", "содружества", "государств-участников", "межгосударственном"]
title_lemm = ' '.join([lemm(word) for word in word_tokenize(title)])

for word in act_type:
    if lemm(word) in title_lemm:
        type = "Многостороннее соглашение"
    else:
        type = "Двустороннее соглашение"
print("\nВид документа: %s" % type) 
          
print("\nНаправление:", adj[0][0].upper()+adj[0][1:].lower(), lemm(noun[0]))
print("Область:", adj[0][0].upper()+adj[0][1:].lower(), lemm(noun[0]))

# NER
dates = []
attention = set()
extractor = DatesExtractor()
for line in doc_lines:
    matches = extractor(line)
    for index, match in enumerate(matches):
        try:
            dates.append(date(match.fact.year, match.fact.month, match.fact.day))
        except TypeError as e:
            attention = match.fact.year, match.fact.month, match.fact.day
# usually acts with earlier dates are denied or are the ones which the current document is based on
data2 = dates.pop(dates.index(max(dates)))
data1 = max(dates);
print("\nДата заключения:", data1)
print("\nДата вступления в силу:", data2)
if len(attention) > 0:
    print("\n*Документ также содержит даты в неполном формате", attention)
    
print("\nКлючевые слова:")
for word in set(keywords):
    print(word)

print("\nНаиболее часто встречающиеся выражения (ngramms):")
for phrase in set(keyphrases):
    print(phrase)


Название документа: СОГЛАШЕНИЕ между Правительством Российской Федерации и Правительством Федеративной Республики Бразилии о научно-техническом сотрудничестве

Страны: Бразилия

Вид документа: Двустороннее соглашение

Направление: Научно-техническое сотрудничество
Область: Научно-техническое сотрудничество

Дата заключения: 1997-11-21

Дата вступления в силу: 1999-09-30

*Документ также содержит даты в неполном формате (2002, 12, None)

Ключевые слова:
страна
реализация
проект
научнотехнический
сила
информация
организация
бразилия
ассоциация
федеративный
условие
координация
бразилиа
федерация
сторона
соглашение
комиссия
сотрудничество
правительство
отношение
настоящий
действие
каждый
настоящее
республика
рекомендация

Наиболее часто встречающиеся выражения (ngramms):
декабрь__
документ_упомянуть_
сотрудничать_организация_
каждый_конкретный_
правительство_федеративный_
благоприятный_условие_
научнотехнический_сотрудничество_
настоящий_соглашение_
конкретный_случай_
_год_
бразилиа_ноябр