In [13]:
#%tensorflow_version 2.x
#%load_ext tensorboard
#!pip3 -q install -U tensorflow==2.1.0 tensorflow-gpu==2.1.0 tensorflow-datasets==2.1.0 tensorflow-text==2.1.1 tensorflow-hub==0.7.0 nltk sklearn transformers tensorflow-addons 
!pip3 -q install pandas

In [1]:
from typing import List, Tuple
import os
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text
import tensorflow_datasets as tfds
from nltk.tokenize import sent_tokenize
import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import BertTokenizer
from typing import List, Tuple
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text
import numpy as np
import pandas as pd
import json


nltk.download('punkt')
os.environ["TFHUB_CACHE_DIR"] = 'datasets'


class BaseSummarizer(object):
    ROUND_DIGITS = 5

    def __text2sentences__(self, text: str) -> List[str]:
        raise NotImplementedError

    def __embeddings__(self, sentences: List[str]) -> tf.Tensor:
        raise NotImplementedError

    def __sim_mat__(self, vec: tf.Tensor) -> tf.Tensor:
        normalize = tf.math.l2_normalize(vec, 1)
        cosine = tf.linalg.matmul(normalize, normalize, transpose_b=True)
        rounded = tf.math.round(cosine * 10 ** BaseSummarizer.ROUND_DIGITS) / 10 ** BaseSummarizer.ROUND_DIGITS
        return rounded

    @staticmethod
    def __ranks__(sent_sim_mat: tf.Tensor) -> tf.Tensor:
        eig_val, eig_vec = tf.linalg.eigh(sent_sim_mat)
        best_vector_idx = tf.math.argmax(eig_val)
        return eig_vec[best_vector_idx]

    @staticmethod
    def __z_score__(vec: tf.Tensor) -> tf.Tensor:
        return (vec - tf.math.reduce_min(vec)) / (tf.math.reduce_max(vec) - tf.math.reduce_min(vec))

    def bleu(self, references: List[List[str]], texts: List[str]):
        score = 0.
        smoothie = SmoothingFunction().method1

        for refs, txt in tqdm(zip(references, texts), total=len(references)):
            hyp = self.the_most_important(txt, k=1)[0]
            score += sentence_bleu([ nltk.word_tokenize(s) for s in refs ], nltk.word_tokenize(hyp), smoothing_function=smoothie)

        score /= len(references)
        return score

    def scored_sentences(self, text: str) -> List[Tuple[str, float]]:
        sents = self.__text2sentences__(text)
        if not sents:
            return []
        sim_mat = self.__sim_mat__(self.__embeddings__(sents))
        ranks = BaseSummarizer.__z_score__(BaseSummarizer.__ranks__(sim_mat))
        return list(zip(sents, ranks.numpy()))

    def the_most_important(self, text, k=1):
        return [ p[0] for p in sorted(self.scored_sentences(text), key=lambda p: p[1], reverse=True)[:k] ]


class USETextRank(BaseSummarizer):
    __embed__ = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")

    def __embeddings__(self, sentences: List[str]) -> tf.Tensor:
        return self.__embed__(sentences)

    def __text2sentences__(self, text: str) -> List[str]:
        return sent_tokenize(text)


class TFIDFTextRank(BaseSummarizer):
    __vectorizer__ = TfidfVectorizer()

    def __embeddings__(self, sentences: List[str]) -> tf.Tensor:
        return tf.constant(self.__vectorizer__.fit_transform(sentences).todense())

    def __text2sentences__(self, text: str) -> List[str]:
        return sent_tokenize(text)


summarizerUSE = USETextRank()
summarizerTFIDF = TFIDFTextRank()

[nltk_data] Downloading package punkt to /home/vad/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
INFO:absl:Using datasets to cache modules.


In [2]:
MAX_SEQ_LENGTH = 256
BATCH_SIZE = 16

def create_ruler() -> tf.keras.Model:
    
    i_id = tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH,), name="input_ids", dtype=tf.int32)
    i_mask = tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH,), name="input_masks", dtype=tf.int32)
    i_segment = tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH,), name="segment_ids", dtype=tf.int32)
                                
    bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/1", trainable=True)
        
    pooled_output, _ = bert_layer([i_id, i_mask, i_segment])
  
    d = tf.keras.layers.Dense(3)(pooled_output)

    return tf.keras.models.Model(inputs={"input_ids": i_id, "input_mask": i_mask, "segment_ids": i_segment}, outputs=d)

def convert_datasets():
    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
    
    nli_validation = tfds.load(name="multi_nli", split='validation_matched', data_dir='datasets')
    nli_train = tfds.load(name="multi_nli", split='train', data_dir='datasets')
    
    def write(ds, outf):
        def embedding4pair(s1: str, s2: str) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
            r = tokenizer.encode_plus(
                  pad_to_max_length='right',
                  text=s1,
                  text_pair=s2,
                  max_length=MAX_SEQ_LENGTH)

            return { 
                'input_ids': r['input_ids'], 
                'input_mask': r['attention_mask'], 
                'segment_ids': r['token_type_ids']
            }

        
        with open(outf, "w") as write_file:
            for x in ds:
                p = x['premise'].numpy().decode('utf8')
                l = x['label'].numpy()
                h = x['hypothesis'].numpy().decode('utf8')
                
                r = embedding4pair(p, h)
                r['label'] = int(l)
                r['s1'] = h
                r['s2'] = p
                
                json.dump(r, write_file)
                write_file.write("\n")
        
    write(nli_train, 'nli.train.jsonl')
    write(nli_validation, 'nli.valid.jsonl')

#convert_datasets()

In [3]:
train = pd.read_json('nli.train.jsonl', lines=True)
valid = pd.read_json('nli.valid.jsonl', lines=True)

tr_dataset = tf.data.Dataset.from_tensor_slices(((np.stack(train['input_ids'].map(lambda x: np.array(x, dtype=np.int32)).values), 
                                                  np.stack(train['input_mask'].map(lambda x: np.array(x, dtype=np.int32)).values), 
                                                  np.stack(train['segment_ids'].map(lambda x: np.array(x, dtype=np.int32)).values)), 
                                                 train['label'].values))
vl_dataset = tf.data.Dataset.from_tensor_slices(((np.stack(valid['input_ids'].map(lambda x: np.array(x, dtype=np.int32)).values), 
                                                  np.stack(valid['input_mask'].map(lambda x: np.array(x, dtype=np.int32)).values), 
                                                  np.stack(valid['segment_ids'].map(lambda x: np.array(x, dtype=np.int32)).values)), 
                                                 valid['label'].values))

In [4]:
train

Unnamed: 0,input_ids,input_mask,segment_ids,label,s1,s2
0,"[101, 14864, 90086, 26900, 10686, 10124, 11847...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2,The narthex and the main entrance are located ...,Its narthex is set at an angle to the main ent...
1,"[101, 11696, 34420, 112, 188, 12888, 10105, 15...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,They were put in a museum after they were exca...,They didn't see the light of day until their e...
2,"[101, 37025, 136, 11723, 10134, 10192, 15453, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"He just looked at me and said, Well, what is it?",Well? There was no change of expression in the...
3,"[101, 10117, 20826, 10124, 10105, 22013, 10108...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2,The winner has the fastest actions.,The winner is the master of the cleverest ploy...
4,"[101, 10117, 160, 14383, 64424, 10165, 30598, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2,"The area is entirely wild, with no signs of ci...",The Whinlatter Pass Visitor Centre is a hub fo...
...,...,...,...,...,...,...
392697,"[101, 10167, 18638, 117, 12277, 13028, 16938, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2,The only way to travel to the tree-shaded aven...,"In fact, if you don't feel like taking the bus..."
392698,"[101, 12865, 72894, 74755, 45788, 20442, 52253...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2,"We need your help, we can't take care of ourse...",We appreciate your desire to help but we can t...
392699,"[101, 19687, 46739, 117, 169, 27914, 10111, 17...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,The concord coalition has open positions.,"Neil Howe, a historian and economist, is a sen..."
392700,"[101, 10380, 10189, 112, 187, 11023, 12257, 10...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"That's one of the ways we do it, although ther...",so that's yeah that's one of the that's one of...


In [5]:
import tensorflow_addons as tfa
import os

model = create_ruler()

es_cb = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

checkpoint_path = "training_1/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[ 'sparse_categorical_accuracy' ]
)
cp_cb = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                            save_weights_only=True,
                                            verbose=1)



In [6]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 256)]        0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        [(None, 256)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 256)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 177853441   input_ids[0][0]                  
                                                                 input_masks[0][0]            

In [None]:
model.fit(
    tr_dataset.shuffle(BATCH_SIZE*512*32).batch(BATCH_SIZE), 
    validation_data=vl_dataset.shuffle(BATCH_SIZE*512*32).batch(BATCH_SIZE), 
    callbacks=[es_cb, cp_cb], 
    epochs=3)

Train for 24544 steps, validate for 614 steps
Epoch 1/3
  317/24544 [..............................] - ETA: 3:37:18 - loss: 1.2864 - sparse_categorical_accuracy: 0.3314

In [24]:
model.save('simple')

INFO:tensorflow:Assets written to: simple/assets


INFO:tensorflow:Assets written to: simple/assets


In [135]:
import csv
import itertools

MAX_SEQ_LENGTH = 256
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

def read_csv():
    
    labels = []
    def reader():
        with open('cnn_dailymail.csv', newline='') as csvfile:
            reader = csv.DictReader(csvfile)
            input_ids_batch = []
            input_mask_batch = []
            segment_ids_batch = []
            
            for i, doc in enumerate(reader):
                sents = sent_tokenize(doc['article'])
                references = doc['highlights'].split('\n')
                for s1 in sents:
                    for s2 in sents:
                        r = tokenizer.encode_plus(pad_to_max_length='right', text=s1, text_pair=s2, max_length=MAX_SEQ_LENGTH)
                        input_ids_batch.append(r['input_ids'])
                        input_mask_batch.append(r['input_mask'])
                        segment_ids_batch.append(r['segment_ids'])

                        labels.append(i)

                        if len(input_ids_batch) < BATCH_SIZE:
                            continue
                        else:
                            yield ({ 
                                'input_ids': np.array(input_ids_batch, dtype=np.int32), 
                                'input_masks': np.array(input_mask_batch, dtype=np.int32), 
                                'segment_ids': np.array(segment_ids_batch, dtype=np.int32)})
                            input_ids_batch = []
                            input_mask_batch = []
                            segment_ids_batch = []

    return reader(), labels                   

In [138]:
ds_p = ds.batch(512)


model = tf.keras.models.load_model('simple', compile=False )
model.compile(
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=[tf.keras.losses.MeanSquaredError()]
)


X, l = read_csv()
y = model.predict(X)

ValueError: The `batch_size` argument must not be specified for the given input type. Received input: <generator object read_csv at 0x7f8dd155c450>, batch_size: 512

In [2]:
import os

model = tf.keras.models.load_model('simple', compile=False )
model.compile(
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=[tf.keras.losses.MeanSquaredError()]
)

def sentance_pairs(docs: List[str]) -> Tuple[List[str], List[str], List[int]]:
    res1 = []
    res2 = []
    ids = []
    for i, doc in enumerate(docs):
        sents = sent_tokenize(doc)
        for s1 in sents:
            for s2 in sents:
                res1.append(s1)
                res2.append(s2)
                ids.append(i)
    return (res1, res2, ids)

def encode_pairs(ss1: List[str], ss2: List[str]) -> tf.Tensor:
    input_ids = []
    input_mask = []
    segment_ids = []
    
    for s1, s2 in zip(ss1, ss2):
        r = tokenizer.encode_plus(pad_to_max_length='right', text=s1, text_pair=s2, max_length=MAX_SEQ_LENGTH)
        input_ids.append(r['input_ids'])
        input_mask.append(r['attention_mask'])
        segment_ids.append(r['token_type_ids'])
        
    input_ids = np.array(input_ids, dtype=np.int32)
    input_mask = np.array(input_mask, dtype=np.int32)
    segment_ids = np.array(segment_ids, dtype=np.int32)
    
    dataset = tf.data.Dataset.from_tensor_slices((input_ids, input_mask, segment_ids))
    
    return model.predict(dataset.batch(512))


df_test = pd.read_csv('cnn_dailymail.csv')
references = df_test['highlights'].map(lambda x: np.array(x.split('\n'), dtype=str)).values
txts = df_test['article'].values

ss1, ss2, ids = sentance_pairs(txts)
flat_matrix = encode_pairs(ss1, ss2).numpy()

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/mnt/d/work/automatic-summarization/env/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3331, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-2-7361b8f276b7>", line 47, in <module>
    flat_matrix = encode_pairs(ss1, ss2).numpy()
  File "<ipython-input-2-7361b8f276b7>", line 28, in encode_pairs
    r = tokenizer.encode_plus(pad_to_max_length='right', text=s1, text_pair=s2, max_length=MAX_SEQ_LENGTH)
NameError: name 'tokenizer' is not defined

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/mnt/d/work/automatic-summarization/env/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2044, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'NameError' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent cal

NameError: name 'tokenizer' is not defined

In [13]:
from typing import List, Tuple
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text
from transformers import BertTokenizer

MAX_SEQ_LENGTH = 256
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
bert_subnet = hub.KerasLayer("https://tfhub.dev/google/bert_multi_cased_L-12_H-768_A-12/1", 
                      signature="tokens", output_key="pooled_output", trainable=True)

def embedding4pair(s1: str, s2: str) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
    r = tokenizer.encode_plus(
          pad_to_max_length='right',
          text=s1,
          text_pair=s2,
          max_length=MAX_SEQ_LENGTH)
    
    return tf.constant(r['input_ids']), tf.constant(r['attention_mask']), tf.constant(r['token_type_ids'])


def create_ruler() -> tf.keras.Model:
    i_id = tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH,), name="input_ids", dtype=tf.int32)
    i_mask = tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH,), name="input_masks", dtype=tf.int32)
    i_segment = tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH,), name="segment_ids", dtype=tf.int32)

    bert_inputs = {"input_ids": i_id, "input_mask": i_mask, "segment_ids": i_segment}
  
    embedding = bert_subnet(bert_inputs)
    dense = tf.keras.layers.Dense(256, input_shape=(768,), activation='relu')(embedding)
    d = tf.keras.layers.Dense(1, input_shape=(256,))(dense)

    return tf.keras.models.Model(inputs=bert_inputs, outputs=d)



INFO:absl:No config specified, defaulting to first: multi_nli/plain_text
INFO:absl:Overwrite dataset info from restored data version.
INFO:absl:Reusing dataset multi_nli (datasets/multi_nli/plain_text/1.0.0)
INFO:absl:Constructing tf.data.Dataset for split validation_matched, from datasets/multi_nli/plain_text/1.0.0
INFO:absl:No config specified, defaulting to first: multi_nli/plain_text
INFO:absl:Overwrite dataset info from restored data version.
INFO:absl:Reusing dataset multi_nli (datasets/multi_nli/plain_text/1.0.0)
INFO:absl:Constructing tf.data.Dataset for split train, from datasets/multi_nli/plain_text/1.0.0


KeyboardInterrupt: 

In [3]:
import numpy as np


nli_validation = tfds.load(name="multi_nli", split='validation_matched', data_dir='datasets')
nli_train = tfds.load(name="multi_nli", split='train', data_dir='datasets')

def process_dataset(ds):
    premises = []
    hypothesis = []
    input_ids = []
    input_mask = []
    segment_ids = []
    labels = []
    for x in ds:
        p = x['premise'].numpy().decode('utf8')
        l = x['label'].numpy()
        h = x['hypothesis'].numpy().decode('utf8')
        r = tokenizer.encode_plus(
          pad_to_max_length='right',
          text=p,
          text_pair=h,
          max_length=MAX_SEQ_LENGTH)
  
        input_ids.append(r['input_ids'])
        input_mask.append(r['attention_mask'])
        segment_ids.append(r['token_type_ids'])
        if l == 1:
            labels.append(1)
        else:
            labels.append(0)

        premises.append(p)
        hypothesis.append(h)

    input_ids = np.array(input_ids, dtype=np.int32)
    input_mask = np.array(input_mask, dtype=np.int32)
    segment_ids = np.array(segment_ids, dtype=np.int32)

    labels = np.array(labels, dtype=np.float16)

    dataset = tf.data.Dataset.from_tensor_slices(((input_ids, input_mask, segment_ids), labels))

    return dataset

ds_nli_train = process_dataset(nli_train)
ds_nli_valid = process_dataset(nli_validation)

INFO:absl:No config specified, defaulting to first: multi_nli/plain_text
INFO:absl:Overwrite dataset info from restored data version.
INFO:absl:Reusing dataset multi_nli (datasets/multi_nli/plain_text/1.0.0)
INFO:absl:Constructing tf.data.Dataset for split validation_matched, from datasets/multi_nli/plain_text/1.0.0
INFO:absl:No config specified, defaulting to first: multi_nli/plain_text
INFO:absl:Overwrite dataset info from restored data version.
INFO:absl:Reusing dataset multi_nli (datasets/multi_nli/plain_text/1.0.0)
INFO:absl:Constructing tf.data.Dataset for split train, from datasets/multi_nli/plain_text/1.0.0


In [None]:
import tensorflow_addons as tfa
import os

model = create_ruler()

es_cb = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

checkpoint_path = "training_1/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

model.compile(
    optimizer=tfa.optimizers.LAMB(),
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=[tf.keras.losses.MeanSquaredError()]
)
cp_cb = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                            save_weights_only=True,
                                            verbose=1)

model.fit(
    ds_nli_train.shuffle(32868).batch(512), 
    validation_data=ds_nli_valid.batch(512), 
    callbacks=[es_cb, cp_cb], 
    epochs=15)

Train for 767 steps, validate for 20 steps
Epoch 1/15
Epoch 00001: saving model to training_1/cp.ckpt
Epoch 2/15
Epoch 00002: saving model to training_1/cp.ckpt
Epoch 3/15
Epoch 00003: saving model to training_1/cp.ckpt
Epoch 4/15
Epoch 00004: saving model to training_1/cp.ckpt
Epoch 5/15
Epoch 00005: saving model to training_1/cp.ckpt
Epoch 6/15
Epoch 00006: saving model to training_1/cp.ckpt
Epoch 7/15
Epoch 00007: saving model to training_1/cp.ckpt
Epoch 8/15
Epoch 00008: saving model to training_1/cp.ckpt
Epoch 9/15
Epoch 00010: saving model to training_1/cp.ckpt
Epoch 11/15

In [9]:
model.save('simple')

INFO:tensorflow:Assets written to: simple/assets


INFO:tensorflow:Assets written to: simple/assets


In [6]:
from typing import List, Tuple
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text
import tensorflow_addons as tfa
from tqdm import tqdm
import pandas as pd

df_test = pd.read_csv('cnn_dailymail.csv')
references = df_test['highlights'].map(lambda x: np.array(x.split('\n'), dtype=str)).values
txts = df_test['article'].values

model = tf.keras.models.load_model('simple', compile=False )
model.compile(
    optimizer=tfa.optimizers.LAMB(),
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=[tf.keras.losses.MeanSquaredError()]
)

y = []
ss = []

for rfs, txt in tqdm(zip(references, txts), total=len(txts)):
    sntns = sent_tokenize(txt)
    y.append(rfs)
    ss.append(sntns)
    
        

In [9]:
def proc():
    i = []
    a = []
    m = []
    for sntns in tqdm(ss, total=len(ss)):
        for v1 in sntns:
            for v2 in sntns:
                r = embedding4pair(v1, v2)

                i.append(r[0])
                a.append(r[1])
                m.append(r[2])
                
                yield (r[0], r[1], r[2])

ds = tf.data.Dataset.from_generator(proc, output_types=tf.float16).batch(512)
p = model.predict(ds).numpy()

offset = 0
sim = []
for s in tqdm(ss, total=len(ss)):
    if offset >= len(i):
        break

    sentences_count = len(s) * len(s)
    sim_mat = p[offset:(offset + sentences_count)].reshape((len(s), len(s)))
    offset =+ sentences_count

    sim.append(sim_mat)

score = 0.
smoothie = SmoothingFunction().method1
for sents, sim_mat, refs in tqdm(zip(ss, sim, y), total=len(y)):
    ranks = BaseSummarizer.__z_score__(BaseSummarizer.__ranks__(sim_mat))
    scored_sentences = list(zip(sents, ranks.numpy()))
    the_most_important = [ p[0] for p in sorted(scored_sentences, key=lambda p: p[1], reverse=True)[:1] ]
    hyp = the_most_important[0]
    score += sentence_bleu([ nltk.word_tokenize(s) for s in refs ], nltk.word_tokenize(hyp), smoothing_function=smoothie)

score /= len(y)
print('bert', score)

ValueError: in converted code:

    /mnt/d/work/automatic-summarization/env/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_v2.py:677 map_fn
        batch_size=None)
    /mnt/d/work/automatic-summarization/env/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training.py:2410 _standardize_tensors
        exception_prefix='input')
    /mnt/d/work/automatic-summarization/env/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_utils.py:526 standardize_input_data
        standardize_single_array(x, shape) for (x, shape) in zip(data, shapes)
    /mnt/d/work/automatic-summarization/env/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_utils.py:526 <listcomp>
        standardize_single_array(x, shape) for (x, shape) in zip(data, shapes)
    /mnt/d/work/automatic-summarization/env/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_utils.py:451 standardize_single_array
        if (x.shape is not None and len(x.shape) == 1 and
    /mnt/d/work/automatic-summarization/env/lib/python3.7/site-packages/tensorflow_core/python/framework/tensor_shape.py:822 __len__
        raise ValueError("Cannot take the length of shape with unknown rank.")

    ValueError: Cannot take the length of shape with unknown rank.


In [4]:
import pandas as pd

df_test = pd.read_csv('cnn_dailymail.csv')
references = df_test['highlights'].map(lambda x: np.array(x.split('\n'), dtype=str)).values
txts = df_test['article'].values

In [5]:
import tensorflow_addons as tfa
from tqdm import tqdm

model = tf.keras.models.load_model('simple', compile=False )
model.compile(
    optimizer=tfa.optimizers.LAMB(),
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=[tf.keras.losses.MeanSquaredError()]
)
        
    
class BERTTextRank(BaseSummarizer):
    
    def __embeddings__(self, sentences: List[str]) -> tf.Tensor:
        return tf.constant(sentences, dtype=tf.string)
    
    def __sim_mat__(self, vec: tf.Tensor) -> tf.Tensor:
        vecs = vec.numpy()[0:15]
        
        res = []
        i = []
        a = []
        m = []
         
        for v1 in vecs:
            for v2 in vecs:
                r = embedding4pair(v1.decode('utf8'), v2.decode('utf8'))
                i.append(r[0])
                a.append(r[1])
                m.append(r[2])
        p = model.predict_on_batch((i, a, m)).numpy()
        res = p.reshape((len(vecs), len(vecs)))
                
        return res

    def __text2sentences__(self, text: str) -> List[str]:
        return sent_tokenize(text)


X = []
y = []
ss = []
for rfs, txt in tqdm(zip(references, txts), total=len(txts)):
    sntns = sent_tokenize(txt)[0:15]
    i = []
    a = []
    m = []
    for v1 in sntns:
        for v2 in sntns:
            r = embedding4pair(v1, v2)
            i.append(r[0])
            a.append(r[1])
            m.append(r[2])
            
    X.append((i, a, m))
    y.append(rfs)
    ss.append(sntns)

100%|██████████| 13368/13368 [21:58<00:00, 10.14it/s]


In [None]:
sim = []
for x, s in tqdm(zip(X, ss), total=len(X)):
    p = model.predict_on_batch((x[0], x[1], x[2])).numpy()
    sim.append(p.reshape((len(s), len(s))))

 27%|██▋       | 3582/13368 [8:47:52<24:57:53,  9.18s/it]

In [None]:
score = 0.
smoothie = SmoothingFunction().method1
for sents, sim_mat, refs in zip(ss, sim, y):
    ranks = BaseSummarizer.__z_score__(BaseSummarizer.__ranks__(sim_mat))
    scored_sentences = list(zip(sents, ranks.numpy()))
    the_most_important = [ p[0] for p in sorted(scored_sentences, key=lambda p: p[1], reverse=True)[:1] ]
    hyp = the_most_important[0]
    score += sentence_bleu([ nltk.word_tokenize(s) for s in refs ], nltk.word_tokenize(hyp), smoothing_function=smoothie)

score /= len(y)
print('bert', score)

In [29]:
print('use', summarizerUSE.bleu(references, txts))

use 0.061298139955609794


In [30]:
print('tfidf', summarizerTFIDF.bleu(references, txts))

tfidf 0.06028948961872044


In [None]:
class BERTFTextRank(BaseSummarizer):
      #__tokenizer__ = BertTokenizer.from_pretrained('bert-base-uncased')
      #__embed__ = hub.Module("https://tfhub.dev/google/bert_multi_cased_L-12_H-768_A-12/1", trainable=False)

    def scored_sentences(self, text: str) -> List[Tuple[str, float]]:
        sents = self.__text2sentences__(text)
        if not sents:
            return []

        

    def __text2sentences__(self, text: str) -> List[str]:
        return sent_tokenize(text)