In [13]:
#%tensorflow_version 2.x
#%load_ext tensorboard
#!pip3 -q install -U tensorflow==2.1.0 tensorflow-gpu==2.1.0 tensorflow-datasets==2.1.0 tensorflow-text==2.1.1 tensorflow-hub==0.7.0 nltk sklearn transformers tensorflow-addons 
!pip3 -q install pandas

In [1]:
from typing import List, Tuple
import os
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text
import tensorflow_datasets as tfds
from nltk.tokenize import sent_tokenize
import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import BertTokenizer

nltk.download('punkt')
os.environ["TFHUB_CACHE_DIR"] = 'datasets'


class BaseSummarizer(object):
    ROUND_DIGITS = 5

    def __text2sentences__(self, text: str) -> List[str]:
        raise NotImplementedError

    def __embeddings__(self, sentences: List[str]) -> tf.Tensor:
        raise NotImplementedError

    def __sim_mat__(self, vec: tf.Tensor) -> tf.Tensor:
        normalize = tf.math.l2_normalize(vec, 1)
        cosine = tf.linalg.matmul(normalize, normalize, transpose_b=True)
        rounded = tf.math.round(cosine * 10 ** BaseSummarizer.ROUND_DIGITS) / 10 ** BaseSummarizer.ROUND_DIGITS
        return rounded

    @staticmethod
    def __ranks__(sent_sim_mat: tf.Tensor) -> tf.Tensor:
        eig_val, eig_vec = tf.linalg.eigh(sent_sim_mat)
        best_vector_idx = tf.math.argmax(eig_val)
        return eig_vec[best_vector_idx]

    @staticmethod
    def __z_score__(vec: tf.Tensor) -> tf.Tensor:
        return (vec - tf.math.reduce_min(vec)) / (tf.math.reduce_max(vec) - tf.math.reduce_min(vec))

    def bleu(self, references: List[List[str]], texts: List[str]):
        score = 0.
        smoothie = SmoothingFunction().method1

        for refs, txt in zip(references, texts):
            hyp = self.the_most_important(txt, k=1)[0]
            score += sentence_bleu([ nltk.word_tokenize(s) for s in refs ], nltk.word_tokenize(hyp), smoothing_function=smoothie)

        score /= len(references)
        return score

    def scored_sentences(self, text: str) -> List[Tuple[str, float]]:
        sents = self.__text2sentences__(text)
        if not sents:
            return []
        sim_mat = self.__sim_mat__(self.__embeddings__(sents))
        ranks = BaseSummarizer.__z_score__(BaseSummarizer.__ranks__(sim_mat))
        return list(zip(sents, ranks.numpy()))

    def the_most_important(self, text, k=1):
        return [ p[0] for p in sorted(self.scored_sentences(text), key=lambda p: p[1], reverse=True)[:k] ]


class USETextRank(BaseSummarizer):
    __embed__ = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")

    def __embeddings__(self, sentences: List[str]) -> tf.Tensor:
        return self.__embed__(sentences)

    def __text2sentences__(self, text: str) -> List[str]:
        return sent_tokenize(text)


class TFIDFTextRank(BaseSummarizer):
    __vectorizer__ = TfidfVectorizer()

    def __embeddings__(self, sentences: List[str]) -> tf.Tensor:
        return tf.constant(self.__vectorizer__.fit_transform(sentences).todense())

    def __text2sentences__(self, text: str) -> List[str]:
        return sent_tokenize(text)


summarizerUSE = USETextRank()
summarizerTFIDF = TFIDFTextRank()

[nltk_data] Downloading package punkt to /home/vad/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
INFO:absl:Using datasets to cache modules.


In [2]:
from typing import List, Tuple
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text
from transformers import BertTokenizer

MAX_SEQ_LENGTH = 256
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
bert_subnet = hub.KerasLayer("https://tfhub.dev/google/bert_multi_cased_L-12_H-768_A-12/1", 
                      signature="tokens", output_key="pooled_output", trainable=True)

def embedding4pair(s1: str, s2: str) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
    r = tokenizer.encode_plus(
          pad_to_max_length='right',
          text=s1,
          text_pair=s2,
          max_length=MAX_SEQ_LENGTH)
    
    return tf.constant(r['input_ids']), tf.constant(r['attention_mask']), tf.constant(r['token_type_ids'])


def create_ruler() -> tf.keras.Model:
    i_id = tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH,), name="input_ids", dtype=tf.int32)
    i_mask = tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH,), name="input_masks", dtype=tf.int32)
    i_segment = tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH,), name="segment_ids", dtype=tf.int32)

    bert_inputs = {"input_ids": i_id, "input_mask": i_mask, "segment_ids": i_segment}
  
    embedding = bert_subnet(bert_inputs)
    dense = tf.keras.layers.Dense(256, input_shape=(768,), activation='relu')(embedding)
    d = tf.keras.layers.Dense(1, input_shape=(256,))(dense)

    return tf.keras.models.Model(inputs=bert_inputs, outputs=d)


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [3]:
import numpy as np


nli_validation = tfds.load(name="multi_nli", split='validation_matched', data_dir='datasets')
nli_train = tfds.load(name="multi_nli", split='train', data_dir='datasets')

def process_dataset(ds):
    premises = []
    hypothesis = []
    input_ids = []
    input_mask = []
    segment_ids = []
    labels = []
    for x in ds:
        p = x['premise'].numpy().decode('utf8')
        l = x['label'].numpy()
        h = x['hypothesis'].numpy().decode('utf8')
        r = tokenizer.encode_plus(
          pad_to_max_length='right',
          text=p,
          text_pair=h,
          max_length=MAX_SEQ_LENGTH)
  
        input_ids.append(r['input_ids'])
        input_mask.append(r['attention_mask'])
        segment_ids.append(r['token_type_ids'])
        if l == 1:
            labels.append(1)
        else:
            labels.append(0)

        premises.append(p)
        hypothesis.append(h)

    input_ids = np.array(input_ids, dtype=np.int32)
    input_mask = np.array(input_mask, dtype=np.int32)
    segment_ids = np.array(segment_ids, dtype=np.int32)

    labels = np.array(labels, dtype=np.float16)

    dataset = tf.data.Dataset.from_tensor_slices(((input_ids, input_mask, segment_ids), labels))

    return dataset

ds_nli_train = process_dataset(nli_train)
ds_nli_valid = process_dataset(nli_validation)

INFO:absl:No config specified, defaulting to first: multi_nli/plain_text
INFO:absl:Overwrite dataset info from restored data version.
INFO:absl:Reusing dataset multi_nli (datasets/multi_nli/plain_text/1.0.0)
INFO:absl:Constructing tf.data.Dataset for split validation_matched, from datasets/multi_nli/plain_text/1.0.0
INFO:absl:No config specified, defaulting to first: multi_nli/plain_text
INFO:absl:Overwrite dataset info from restored data version.
INFO:absl:Reusing dataset multi_nli (datasets/multi_nli/plain_text/1.0.0)
INFO:absl:Constructing tf.data.Dataset for split train, from datasets/multi_nli/plain_text/1.0.0


In [None]:
import tensorflow_addons as tfa
import os

model = create_ruler()

es_cb = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

checkpoint_path = "training_1/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

model.compile(
    optimizer=tfa.optimizers.LAMB(),
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=[tf.keras.losses.MeanSquaredError()]
)
cp_cb = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                            save_weights_only=True,
                                            verbose=1)

model.fit(
    ds_nli_train.shuffle(32868).batch(512), 
    validation_data=ds_nli_valid.batch(512), 
    callbacks=[es_cb, cp_cb], 
    epochs=15)

Train for 767 steps, validate for 20 steps
Epoch 1/15
Epoch 00001: saving model to training_1/cp.ckpt
Epoch 2/15
Epoch 00002: saving model to training_1/cp.ckpt
Epoch 3/15
Epoch 00003: saving model to training_1/cp.ckpt
Epoch 4/15
Epoch 00004: saving model to training_1/cp.ckpt
Epoch 5/15
Epoch 00005: saving model to training_1/cp.ckpt
Epoch 6/15
Epoch 00006: saving model to training_1/cp.ckpt
Epoch 7/15
Epoch 00007: saving model to training_1/cp.ckpt
Epoch 8/15
Epoch 00008: saving model to training_1/cp.ckpt
Epoch 9/15
Epoch 00010: saving model to training_1/cp.ckpt
Epoch 11/15

In [9]:
model.save('simple')

INFO:tensorflow:Assets written to: simple/assets


INFO:tensorflow:Assets written to: simple/assets


In [4]:
import pandas as pd

df_test = pd.read_csv('cnn_dailymail.csv')
references = df_test['highlights'].map(lambda x: np.array(x.split('\n'), dtype=str)).values
txts = df_test['article'].values

In [6]:
import tensorflow_addons as tfa
from tqdm import tqdm

model = tf.keras.models.load_model('simple', compile=False )
model.compile(
    optimizer=tfa.optimizers.LAMB(),
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=[tf.keras.losses.MeanSquaredError()]
)

r = embedding4pair('олооло ава ', 'sdsdasds')
        
    
class BERTTextRank(BaseSummarizer):
    
    def __embeddings__(self, sentences: List[str]) -> tf.Tensor:
        return tf.constant(sentences, dtype=tf.string)
    
    def __sim_mat__(self, vec: tf.Tensor) -> tf.Tensor:
        vecs = vec.numpy()
        
        res = []
        i = []
        a = []
        m = []
         
        for v1 in vecs:
            for v2 in vecs:
                r = embedding4pair(v1.decode('utf8'), v2.decode('utf8'))
                i.append(r[0])
                a.append(r[1])
                m.append(r[2])
        p = model.predict_on_batch((i, a, m)).numpy()
        res = p.reshape((len(vecs), len(vecs)))
                
        return res

    def __text2sentences__(self, text: str) -> List[str]:
        return sent_tokenize(text)

    
summarizerBERT = BERTTextRank()
print('bert', summarizerBERT.bleu(references, txts))

ResourceExhaustedError:  OOM when allocating tensor with shape[729,12,256,256] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node model_1/keras_layer/StatefulPartitionedCall/StatefulPartitionedCall/StatefulPartitionedCall/bert/encoder/layer_0/attention/self/Softmax}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
 [Op:__inference_function_4889786]

Function call stack:
function


In [None]:
r = embedding4pair('Чувак с автоматом гоняет обезьяну', 'обезьяна бегает')
r[2]

In [29]:
print('use', summarizerUSE.bleu(references, txts))

use 0.061298139955609794


In [30]:
print('tfidf', summarizerTFIDF.bleu(references, txts))

tfidf 0.06028948961872044


In [None]:
class BERTFTextRank(BaseSummarizer):
      #__tokenizer__ = BertTokenizer.from_pretrained('bert-base-uncased')
      #__embed__ = hub.Module("https://tfhub.dev/google/bert_multi_cased_L-12_H-768_A-12/1", trainable=False)

    def scored_sentences(self, text: str) -> List[Tuple[str, float]]:
        sents = self.__text2sentences__(text)
        if not sents:
            return []

        

    def __text2sentences__(self, text: str) -> List[str]:
        return sent_tokenize(text)