In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np
from nltk.tokenize import word_tokenize, sent_tokenize
from itertools import chain
from sklearn.metrics import pairwise_distances
#from sqlalchemy import create_engine
import pandas as pd
from nltk.tokenize import sent_tokenize
import fasttext as ft
from collections import defaultdict
#from greed import greed_sum_query

from tqdm.notebook import tqdm
import os

import pickle
def save_obj(obj, name):
    """Pickle ``obj`` into the file ``name + '.pkl'`` using pickle protocol 4.

    The file is opened in a ``with`` block so the handle is flushed and closed
    even when pickling raises, instead of being left to the garbage collector.
    """
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, protocol=4)
    
def load_obj(name):
    """Return the object stored in the pickle file ``name + '.pkl'``."""
    pickle_path = name + '.pkl'
    with open(pickle_path, 'rb') as handle:
        restored = pickle.load(handle)
    return restored

from nltk.corpus import stopwords
sw = stopwords.words('russian')

from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from nltk.tokenize import word_tokenize

# Load Data

## Get file paths

In [None]:
# get file paths in all data folders and subfolders
# https://pynative.com/python-list-files-in-a-directory/#:~:text=You%20can%20use%20os.,current_path%2C%20files%20in%20current_path).
import glob

# recursive pattern: every file with an extension under ./data and its subfolders
dir_path = './data/**/*.*'
file_names = []
for file in glob.iglob(dir_path, recursive=True):
    # echo each path while collecting it
    print(file)
    file_names += [file]

In [None]:
len(file_names)

In [None]:
# keep only Word documents, decided by the file extension (case-insensitive)
# rather than by a '.doc' substring that could appear anywhere in the path
# NOTE(review): the later extraction step uses docx2txt; legacy .doc files
# presumably fail there and end up as 'Error' placeholders - confirm.
doc_files = [f for f in file_names
             if os.path.splitext(f)[1].lower() in ('.doc', '.docx')]
len(doc_files)

## Get texts

In [None]:
import docx2txt

# extract text
text = docx2txt.process(doc_files[0])

In [None]:
text = ' '.join(text.split('\n\n'))
text

In [None]:
texts = []

for f in tqdm(doc_files):
    try:
        # join the blank-line separated chunks returned by docx2txt with spaces
        texts.append(' '.join(docx2txt.process(f).split('\n\n')))
    except Exception as err:
        # Catch Exception (not a bare except) so Ctrl-C still interrupts the loop,
        # report which file failed, and keep a placeholder so that `texts`
        # stays aligned index-by-index with `doc_files`.
        print('Could not extract text from %s: %s' % (f, err))
        texts.append('Error')

In [None]:
len(texts)

# Generate vectors

In [None]:
import fasttext as ft
modelPath = "../com_lang_models/"
model = ft.load_model(modelPath+"cc.ru.300.bin")

In [None]:
# One fastText sentence vector per extracted document text; newline characters
# are removed from the text before it is passed to the model.
X_fasttext = [model.get_sentence_vector(t.replace('\n','')) for t in tqdm(texts)]

In [None]:
len(X_fasttext[50])

In [None]:
# Gather, per document: its file name (without directories), its extracted
# text and its fastText vector. The three lists share the order of `doc_files`.
df = pd.DataFrame()
df['title'] = [os.path.basename(p) for p in doc_files]
df['text'] = texts
df['vecs'] = X_fasttext

# preview the first rows
df.head()

# GreedSum

In [None]:
def greed_sum_query(text, query, num_sent=10, min_df=1, max_df=1.0, stop_words=sw, max_share=0.2):
    """Greedy, query-focused extractive summary of a list of sentences.

    Each sentence is represented by its TF-IDF weights over the vocabulary of
    ``text`` plus extra TF-IDF columns restricted to the words of ``query``.
    Every row is then scaled by the cosine similarity between the fastText
    vectors (global ``model``) of the sentence and of the query. Sentences are
    picked greedily: take the row with the largest total weight, delete the
    columns in which that row is positive, and repeat.

    Parameters
    ----------
    text : list of str
        Sentences to summarise.
    query : str
        User query that steers the selection.
    num_sent : int
        Maximum number of sentences to return; ``0`` returns every selected
        sentence.
    min_df, max_df, stop_words
        Passed to the ``TfidfVectorizer`` fitted on ``text``.
    max_share : float
        Selection stops as soon as the number of picked sentences exceeds
        ``int(len(text) * max_share)``.

    Returns
    -------
    list of str
        The selected sentences, in their original order. Empty for empty input.
    """
    # nothing to summarise; fitting a vectorizer on no sentences would raise
    if len(text) == 0:
        return []

    # TF-IDF over the text's own vocabulary
    vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df, stop_words=stop_words)
    X = vectorizer.fit_transform(text).toarray()

    # query specific TF-IDF; the fixed vocabulary must be free of duplicates
    # and non-empty, otherwise TfidfVectorizer raises ValueError
    voc = list(dict.fromkeys(word_tokenize(query.lower())))
    if voc:
        X_query = TfidfVectorizer(vocabulary=voc).fit_transform(text).toarray()
        X = np.concatenate((X, X_query), axis=1)

    # uprank the sentences which are semantically closer to the query and downrank others
    # (newlines are stripped from the query exactly as they are from the sentences)
    query_vec = model.get_sentence_vector(query.replace('\n', ''))
    text_vecs = np.array([model.get_sentence_vector(s.replace('\n', '')) for s in text])
    similarity = 1 - pairwise_distances(query_vec.reshape(1, -1), text_vecs, metric='cosine')
    X = X * similarity[0][:, None]

    # greedy selection of sentence indices
    limit = int(len(text) * max_share)
    idx = []
    while X.sum() != 0:
        ind = int(np.argmax(X.sum(axis=1)))
        # A row that was already picked has no positive column left, so the
        # matrix can no longer change: stop instead of collecting duplicates.
        if ind in idx:
            break
        idx.append(ind)
        # stop once more than `max_share` of the sentences has been picked
        if len(idx) > limit:
            break
        # drop the columns (terms) covered by the sentence just picked
        X = np.delete(X, np.flatnonzero(X[ind] > 0), axis=1)

    # optionally cap the number of sentences, then restore document order
    if num_sent != 0:
        idx = idx[:num_sent]
    idx.sort()
    return [text[i] for i in idx]

In [None]:
# Hyperparameters setup
top_n = 5
num_sent = 0

uq = 'Организация процесса хвостового хозяйства'

# 1. Select top N documents closest to the query
def get_docs(query, vecs, top_n=top_n):
    """Return the indices of the ``top_n`` rows of ``vecs`` closest to the query.

    The query (with newlines removed) is embedded with the global fastText
    ``model`` and compared to every row of ``vecs`` by cosine distance.
    The default of ``top_n`` is taken from the notebook-level variable at the
    moment this definition is executed; reassigning that variable afterwards
    does not change the default.
    """
    cleaned_query = query.replace('\n', '')
    query_row = model.get_sentence_vector(cleaned_query).reshape(1, -1)
    cosine_dists = pairwise_distances(query_row, vecs, metric='cosine')
    # ascending distance order for the single query row
    ranking = np.argsort(cosine_dists)[0]
    return ranking[:top_n]

idx = get_docs(uq, np.stack(df.vecs.values))

    
# 2. Summarize each selected document with respect to the query
from nltk.tokenize import sent_tokenize
from itertools import chain
# sentences of the closest document only; not used by the statements below
text_sents = sent_tokenize(df.text.values[idx[0]])
#text_sents
# one list of summary sentences per selected document
sums = [greed_sum_query(sent_tokenize(t), uq, num_sent=num_sent) for t in df.text.values[idx]]


# 3. Merge the per-document summaries and summarize them again
# flatten the list of lists into one list of sentences
sums = list(chain(*sums))
#len(sums)
final_sum = greed_sum_query(sums, uq, num_sent=num_sent)
len(final_sum)

# 4. Get references for each sentence in the resulting summary
# For every summary sentence store the position (within the selected documents)
# of the first document that contains it, or None when no document does.
# Keeping a placeholder guarantees len(refs) == len(final_sum), so zip() below
# can neither drop sentences nor attach a reference to the wrong sentence
# (this also keeps the summary intact if the cell is executed a second time).
ref_docs = df.text.values[idx]
refs = [next((i for i, d in enumerate(ref_docs) if s in d), None) for s in final_sum]
#refs
# append the 1-based reference marker; sentences without a source stay unchanged
final_sum = [s + '[%s]' % (r + 1) if r is not None else s for s, r in zip(final_sum, refs)]
#print(final_sum)

# numbered reference list, in the same order as the selected documents
ref_list = ['%s. ' % (r + 1) + t for r, t in enumerate(df.title.values[idx])]
#print(ref_list)


# 5. Separate the text into paragraphs
## Find paragraph boundaries

### consecutive sentence pair similarity
# fastText vector of every summary sentence
sum_sent_vecs = [model.get_sentence_vector(s.replace('\n','')) for s in final_sum]
# sims[k] is the cosine similarity between sentence k and sentence k + 1
sims = []
for i in range(1, len(sum_sent_vecs)):
    sims.append(1 - pairwise_distances(sum_sent_vecs[i].reshape(1, -1), sum_sent_vecs[i-1].reshape(1, -1), metric='cosine')[0][0])

# a pair at or below this similarity is treated as a paragraph break
treshld = 0.5
par_begin = np.array(sims) <= treshld
# The break lies between sentences k and k + 1, so the new paragraph starts at
# sentence k + 1. Using k itself shifted every boundary one sentence early and
# produced an empty first paragraph whenever sims[0] was below the threshold.
par_begin_idx = np.flatnonzero(par_begin) + 1
#par_begin_idx


## Make titles for paragraphs

def get_title(par):
    """Return the highest-weighted word trigram of a paragraph, upper-cased.

    ``par`` is a list of sentences. TF-IDF weights of word 3-grams (with the
    stop words in ``sw`` removed) are summed over the sentences and the
    trigram with the largest total is returned. An empty string is returned
    when the paragraph is empty or too short to contain any trigram.
    """
    if len(par) == 0:
        return ''
    vectorizer = TfidfVectorizer(ngram_range=(3, 3), stop_words=sw)
    try:
        X = vectorizer.fit_transform(par).toarray().sum(axis=0)
    except ValueError:
        # scikit-learn reports an empty vocabulary when no trigram can be built
        return ''
    return vectorizer.get_feature_names_out()[np.argmax(X)].upper()

# par = final_sum[0:par_begin_idx[0]]
# get_title(par)

# divide a list into parts
def partition(alist, indices):
    """Cut ``alist`` into consecutive slices that start at 0 and at each index in ``indices``."""
    starts = [0] + indices
    stops = indices + [None]
    pieces = []
    for lo, hi in zip(starts, stops):
        pieces.append(alist[lo:hi])
    return pieces

# split the summary sentences into paragraphs at the detected boundary indices
pars = partition(final_sum, list(par_begin_idx))

# one title per paragraph
par_tits = [get_title(p) for p in pars]
#par_tits

In [None]:
# 6. Put together text with inline references and a reference list

# report header: the query in upper case, followed by an empty line
print(('Саммари по запросу: ' + uq).upper())
print()

# every paragraph: its title, its sentences joined by '. ', then an empty line
for heading, sentences in zip(par_tits, pars):
    print(heading, '. '.join(sentences), '', sep='\n')

# numbered list of the source documents
print('СПИСОК ИСПОЛЬЗОВАННОЙ ЛИТЕРАТУРЫ:')
for entry in ref_list:
    print(entry)
    

In [None]:
par_begin_idx

In [None]:
from matplotlib import pyplot as plt
plt.plot(sims)

In [None]:
np.mean(sims)

In [None]:
np.std(sims)

# Wikipedia mode

In [None]:
import wikipedia

In [None]:
# QUERY 
uq = 'Организация процесса добычи твердых полезных ископаемых'

# Hyperparameters
top_n = 10
num_sent = 0

## Get the most relevant docs

In [None]:
wikipedia.set_lang("ru")
wiki_titles = wikipedia.search(uq, results=10000, suggestion=False)
wiki_titles

In [None]:
len(wiki_titles)

In [None]:
def get_page(title):
    """Fetch the Wikipedia page for ``title`` (no auto-suggest, following redirects).

    When ``title`` is a disambiguation page, its options are tried in order and
    the first one that resolves to a page is returned, so a single missing or
    ambiguous option does not abort a whole batch download.

    Raises
    ------
    wikipedia.DisambiguationError
        Re-raised for ``title`` when none of its options can be loaded.
    """
    try:
        return wikipedia.page(title, auto_suggest=False, redirect=True, preload=False)
    except wikipedia.DisambiguationError:
        for option in e_options_of(title) if False else ():
            pass
        raise

In [None]:
summaries = [get_page(title).summary for title in tqdm(wiki_titles)]
summaries

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
# select model from https://www.sbert.net/docs/pretrained_models.html
model_emb = SentenceTransformer('distiluse-base-multilingual-cased-v1')

def similarity_measure(text1, text2):
    """Cosine similarity between the ``model_emb`` embeddings of two texts."""
    # based on https://towardsdatascience.com/bert-for-measuring-text-similarity-eec91c6bf9e1
    # both texts are encoded in a single call
    emb_first, emb_second = model_emb.encode([text1, text2])
    score_matrix = cosine_similarity(emb_first.reshape(1, -1), emb_second.reshape(1, -1))
    return score_matrix[0][0]

sum_vecs = [model_emb.encode(t) for t in tqdm(summaries)]

In [None]:
uq_vec = model_emb.encode(uq)

In [None]:
uq

In [None]:
from sklearn.metrics import pairwise_distances, pairwise_distances_chunked 
dists = pairwise_distances(uq_vec.reshape(1, -1), sum_vecs, metric="cosine")
dists.shape

In [None]:
idx = np.argsort(dists[0])[:top_n]
dists[0][idx]

In [None]:
np.array(wiki_titles)[idx]

## Load collected wiki articles

In [None]:
ds = load_obj('ds_mining_wiki')
ds.head()

## Vectorize

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
# select model from https://www.sbert.net/docs/pretrained_models.html
model_emb = SentenceTransformer('distiluse-base-multilingual-cased-v1', device='cuda')

def similarity_measure(text1, text2):
    """Cosine similarity between the ``model_emb`` embeddings of two texts."""
    # based on https://towardsdatascience.com/bert-for-measuring-text-similarity-eec91c6bf9e1
    # each text is encoded on its own and turned into a single-row matrix
    row_a = model_emb.encode(text1).reshape(1, -1)
    row_b = model_emb.encode(text2).reshape(1, -1)
    return cosine_similarity(row_a, row_b)[0][0]

In [None]:
similarity_measure('Организация хвостового хозяйства', 'Хвостохранилище на ГМК')

In [None]:
# fastText sentence vector of every lower-cased article title (newlines removed)
# NOTE(review): despite the name `cont_vecs`, the vectors are built from
# ds.title, not from the article content, and with the fastText `model`
# rather than `model_emb` - confirm this is intended.
cont_vecs = [model.get_sentence_vector(c.replace('\n','').lower()) for c in tqdm(ds.title.values)]

In [None]:
ds['cont_vecs'] = cont_vecs

In [None]:
save_obj(ds, 'ds_mining_wiki_vecs')

## GreedSum

In [None]:
# def greed_sum_query(text, query, num_sent=10, min_df=1, max_df=1.0, stop_words=sw):
#     # Let's take 10% of the most meaningful sentences
#     #num_sent = 10 # int(len(text)*0.05) 
#     #print('Number of 5% sentences', num_sent)
        
#     #fit a TFIDF vectorizer
#     #print(min_df, max_df)
#     vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df, stop_words=stop_words)
#     X1 = vectorizer.fit_transform(text).toarray()
    
#     # query specific TFIDF
#     voc = word_tokenize(query.lower())
#     vectorizer2 = TfidfVectorizer(vocabulary=voc)
#     X2 = vectorizer2.fit_transform(text).toarray()
    
#     X = np.concatenate((X1, X2), axis=1)
    
#     # uprank the sentences which are semantically closer to the query and downrank others
#     query_vec = model_emb.encode(query)
#     text_vecs = np.array([model_emb.encode(s.replace('\n','')) for s in text])
#     similarity = 1 - pairwise_distances(query_vec.reshape(1, -1), text_vecs, metric='cosine')
#     X = X * similarity[0][:, None]
    
#     #print(X.shape)
#     #get the sentence indices
#     idx = []
#     while sum(sum(X)) != 0:
#         ind = np.argmax(X.sum(axis=1))
#         #print(ind)
#         idx.append(ind)
#         #print(num_sent, idx)
#         #stop if we have more than 20% of the sentences from the text
#         if len(idx) > int(len(text)*0.2):#num_sent:
#             break
            
#         #update the matrix deleting the columns corresponding to the words found in previous step
#         cols = X[ind]
#         col_idx = [i for i in range(len(cols)) if cols[i] > 0]
#         X = np.delete(X, col_idx, 1)
#         #print(X.shape)
        
#     #print (len(text), len(idx))
#     #make a condition to extract a number of sentences or all salient sentences
#     if num_sent != 0:
#         idx = idx[:num_sent]
#     idx.sort()
#     #print(idx)
#     summary = [text[i] for i in idx]
#     return summary

In [None]:
# Hyperparameters setup
top_n = 5
num_sent = 0

uq = 'Организация процесса хвостового хозяйства'

In [None]:
# 1. Select top N documents closest to the query
idx = get_docs(uq, np.stack(ds.cont_vecs.values))

    

In [None]:
idx

In [None]:
ds.title.values[idx]

In [None]:
# 2. Summarize each document regarding query
from nltk.tokenize import sent_tokenize
from itertools import chain
#text_sents = sent_tokenize(ds.content.values[idx[0]])
#text_sents
sums = [greed_sum_query(sent_tokenize(t), uq, num_sent=num_sent) for t in tqdm(ds.content.values[idx])]

In [None]:
len(sums)

In [None]:
sums

In [None]:
# 3. Merge summaries and summarize them again
sums = list(chain(*sums))
#len(sums)
final_sum = greed_sum_query(sums, uq, num_sent=num_sent)
len(final_sum)

In [None]:
final_sum

In [None]:
# 4. Get references for each sentence in the resulting summary
# For every summary sentence store the position (within the selected articles)
# of the first article that contains it, or None when no article does.
# Keeping a placeholder guarantees len(refs) == len(final_sum), so zip() below
# can neither drop sentences nor attach a reference to the wrong sentence
# (this also keeps the summary intact if the cell is executed a second time).
ref_docs = ds.content.values[idx]
refs = [next((i for i, d in enumerate(ref_docs) if s in d), None) for s in final_sum]
#refs
# append the 1-based reference marker; sentences without a source stay unchanged
final_sum = [s + '[%s]' % (r + 1) if r is not None else s for s, r in zip(final_sum, refs)]
#print(final_sum)

# numbered reference list, in the same order as the selected articles
ref_list = ['%s. ' % (r + 1) + t for r, t in enumerate(ds.title.values[idx])]
#print(ref_list)

In [None]:
# 5. Separate the text into paragraphs
## Find paragraph boundaries

### consecutive sentence pair similarity
# fastText vector of every summary sentence
sum_sent_vecs = [model.get_sentence_vector(s.replace('\n','')) for s in final_sum]
# sims[k] is the cosine similarity between sentence k and sentence k + 1
sims = []
for i in range(1, len(sum_sent_vecs)):
    sims.append(1 - pairwise_distances(sum_sent_vecs[i].reshape(1, -1), sum_sent_vecs[i-1].reshape(1, -1), metric='cosine')[0][0])

# a pair at or below this similarity is treated as a paragraph break
treshld = 0.5
par_begin = np.array(sims) <= treshld
# The break lies between sentences k and k + 1, so the new paragraph starts at
# sentence k + 1. Using k itself shifted every boundary one sentence early and
# produced an empty first paragraph whenever sims[0] was below the threshold.
par_begin_idx = np.flatnonzero(par_begin) + 1
#par_begin_idx


## Make titles for paragraphs

def get_title(par):
    """Return the highest-weighted word trigram of a paragraph, upper-cased.

    ``par`` is a list of sentences. TF-IDF weights of word 3-grams (with the
    stop words in ``sw`` removed) are summed over the sentences and the
    trigram with the largest total is returned. An empty string is returned
    when the paragraph is empty or too short to contain any trigram.
    """
    if len(par) == 0:
        return ''
    vectorizer = TfidfVectorizer(ngram_range=(3, 3), stop_words=sw)
    try:
        X = vectorizer.fit_transform(par).toarray().sum(axis=0)
    except ValueError:
        # scikit-learn reports an empty vocabulary when no trigram can be built
        return ''
    return vectorizer.get_feature_names_out()[np.argmax(X)].upper()

# par = final_sum[0:par_begin_idx[0]]
# get_title(par)

# divide a list into parts
def partition(alist, indices):
    """Split ``alist`` at the given start indices; the first part always begins at 0."""
    # consecutive pairs of bounds delimit the parts; None leaves the last part open-ended
    bounds = [0] + indices + [None]
    return [alist[bounds[k]:bounds[k + 1]] for k in range(len(bounds) - 1)]

# split the summary sentences into paragraphs at the detected boundary indices
pars = partition(final_sum, list(par_begin_idx))

# one title per paragraph
par_tits = [get_title(p) for p in pars]
#par_tits

In [None]:
pars

In [None]:
# 6. Put together text with inline references and a reference list

# report header: the query in upper case, followed by an empty line
print(('Саммари по запросу: ' + uq).upper())
print()

# every paragraph: title line, sentences joined by '. ', then an empty line
for heading, sentences in zip(par_tits, pars):
    print(heading)
    print('. '.join(sentences), end='\n\n')

In [None]:
print('СПИСОК ИСПОЛЬЗОВАННОЙ ЛИТЕРАТУРЫ:')
for r in ref_list:
    print(r)
    