<a href="https://colab.research.google.com/github/ahill132009/Machine-Learning-Course-SPBU/blob/main/Summarizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
import re
import numpy as np

from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import nltk
nltk.download('punkt')
nltk.download('stopwords')  # one time execution


import spacy
from spacy.lang.ru import Russian

!pip install pymorphy2
!pip install compress-fasttext

import compress_fasttext
# Load compressed fastText word vectors from the compress-fasttext release
# URL below. The resulting `small_model` is indexed by word
# (`small_model[word]`) in Summarizer.average_sentence_vector.
# NOTE(review): presumably a Russian-language model, since the rest of the
# notebook filters for Cyrillic tokens - confirm against the release notes.
small_model = compress_fasttext.models.CompressedFastTextKeyedVectors.load(
    'https://github.com/avidale/compress-fasttext/releases/download/v0.0.1/ft_freqprune_100K_20K_pq_100.bin'
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
class Summarizer:
  """Extractive summarizer based on clustering sentence embeddings.

  Sentences are turned into averaged word vectors, clustered with KMeans,
  and the sentence closest to each cluster centroid is kept.

  Intended call order (each step consumes the previous step's result):
  ``preprocess`` -> ``average_sentence_vector`` -> ``fit_clustering`` ->
  ``get_summary``.
  """

  def __init__(self, model=None):
    """Create a summarizer.

    Args:
      model: optional word-to-vector mapping, indexed as ``model[word]`` and
        expected to raise ``KeyError`` for unknown words. When omitted, the
        module-level ``small_model`` is looked up at call time.
    """
    self.model = model

  def preprocess(self, text):
    """Split ``text`` into sentences and lemmatize each one.

    The sentence list is stored in ``self.text`` (used later by
    ``get_summary``). A token is kept only if its lower-cased lemma is not a
    Russian stop word, is purely alphabetic and contains at least one
    character matched by ``[А-Яа-я]``.

    Args:
      text: raw document as a single string.

    Returns:
      A list with one list of lower-cased lemmas per sentence; a sentence
      with no surviving tokens yields an empty list.
    """
    self.text = sent_tokenize(text)

    # Build the stop-word set once: evaluating stopwords.words() per token
    # re-creates the list every time and membership in a list is O(n).
    russian_stopwords = set(stopwords.words('russian'))
    has_cyrillic = re.compile(r'[А-Яа-я]').search
    nlp = Russian()

    self.lemmatized_sentences = []
    for line in self.text:
      # Lower-case first so that capitalised stop words are filtered too.
      lemmas = (t.lemma_.lower() for t in nlp(line))
      self.lemmatized_sentences.append([
          lemma for lemma in lemmas
          if lemma not in russian_stopwords
          and lemma.isalpha() and has_cyrillic(lemma)
      ])
    return self.lemmatized_sentences

  def average_sentence_vector(self, sentences):
    """Compute the mean word vector of every sentence that has one.

    Words missing from the model (``KeyError``) are ignored and do not count
    towards the mean. Sentences without any known word produce no vector;
    the positions of the sentences that *did* produce one are recorded in
    ``self.vector_sentence_ids`` so that clustering results can be mapped
    back to the original sentences.

    Args:
      sentences: list of lists of words, as returned by ``preprocess``.

    Returns:
      List of vectors (one per sentence that had at least one known word),
      also stored in ``self.sent_as_vec``.
    """
    self.sentences = sentences
    model = getattr(self, 'model', None)
    if model is None:
      model = small_model

    self.sent_as_vec = []
    self.vector_sentence_ids = []
    for sentence_id, line in enumerate(self.sentences):
      word_vectors = []
      for w in line:
        try:
          word_vectors.append(model[w])
        except KeyError:
          # Out-of-vocabulary word: leave it out of the average.
          continue
      if not word_vectors:
        # Empty sentence or no known words: there is nothing to average, and
        # a placeholder would break the homogeneous matrix KMeans needs.
        continue
      self.sent_as_vec.append(np.mean(word_vectors, axis=0))
      self.vector_sentence_ids.append(sentence_id)
    return self.sent_as_vec

  def fit_clustering(self, list_of_vector_matrices):
    """Cluster sentence vectors and pick one representative per cluster.

    KMeans is run with ``int(sqrt(n))`` clusters; for every centroid the
    closest input vector is selected.

    Args:
      list_of_vector_matrices: sequence of sentence vectors, normally the
        result of ``average_sentence_vector``.

    Returns:
      Sorted list of unique sentence indices. When the input has the same
      length as the vectors produced by the last
      ``average_sentence_vector`` call, indices refer to the original
      sentence positions (sentences skipped there are accounted for);
      otherwise they are positions in the input. Empty input gives ``[]``.
    """
    self.vectors = list_of_vector_matrices
    X = self.vectors
    if len(X) == 0:
      # KMeans cannot be fitted with zero samples / zero clusters.
      self.closest = np.array([], dtype=int)
      return []

    cluster = KMeans(n_clusters=int(np.sqrt(len(X))), init='k-means++')
    cluster.fit(X)
    self.closest, _ = pairwise_distances_argmin_min(cluster.cluster_centers_, X)
    return self._to_sentence_indices(self.closest)

  def _to_sentence_indices(self, vector_indices):
    """Map positions in the vector list to positions in ``self.text``."""
    unique = sorted({int(i) for i in vector_indices})
    kept = getattr(self, 'vector_sentence_ids', None)
    n_vectors = len(getattr(self, 'vectors', ()))
    if kept is None or len(kept) != n_vectors:
      # Vectors did not come from average_sentence_vector: nothing to remap.
      return unique
    return [kept[i] for i in unique]

  def get_summary(self, indices_of_sentences):
    """Join the selected sentences into a summary.

    Args:
      indices_of_sentences: indices into the sentence list stored by
        ``preprocess`` (``self.text``), e.g. the result of
        ``fit_clustering``.

    Returns:
      The chosen sentences joined by single spaces, in the given order.
    """
    self.indices_of_sentences = indices_of_sentences
    return ' '.join(str(self.text[i]) for i in self.indices_of_sentences)
  
  