# Suppose we are NLP engineers building an automatic text summarizer for a news channel. The channel uses flashcards of broad news articles to design the front page of its blog, which is read by more than a million readers across the globe. To develop the content for these flashcards, the channel's editors manually summarize the prospective news articles. This process is very time-consuming; therefore, it is important to build a text summarizer that can generate the summaries automatically.

# The text summarizer developed by us is going to play a crucial role in reducing the turnaround time for the news editors in developing the content for the flashcards. 



# The dataset which we will be using to train our text summarizer, is DeepMind Q and A dataset. This dataset is discussed in the following research publication: 

https://arxiv.org/abs/1506.03340

# This dataset primarily contains the documents and accompanying questions from the news articles of the CNN news channel. The url of the dataset is here: 

https://cs.nyu.edu/~kcho/DMQA/

# Within the CNN column, we will be using the section named "Stories" to fetch our dataset which will be having long paragraphs as well as their summaries. 

# We have already downloaded the dataset from the above url, now let's unzip it: 

In [None]:
cd /content/drive/MyDrive

In [None]:
! tar -xvf /content/drive/MyDrive/cnn_stories.tgz

In [None]:
import pandas as pd
import numpy as np 
import os

# Let's create functions to load the dataset and split stories into news paragraphs as well as summaries or highlights. 

In [None]:
def load_story(single_story_path):
  """Read one story file from disk and return its full text.

  Args:
    single_story_path: Path of the story file to read.

  Returns:
    The complete content of the file, decoded as UTF-8.
  """
  # The context manager closes the handle even when read() raises.
  with open(single_story_path, encoding="utf-8") as file_handle:
    return file_handle.read()

In [None]:
def split_story_into_para_highlights(single_complete_story):
  """Split a raw story into its body text and its list of highlights.

  Everything before the first "@highlight" marker is the body; every piece
  of text that follows a marker is one highlight.

  Args:
    single_complete_story: Full text of one story file.

  Returns:
    A tuple ``(para, highlights)`` where ``para`` is the body text and
    ``highlights`` is a list of stripped, non-empty highlight strings. When
    the story has no marker, ``para`` is the whole story and ``highlights``
    is empty.
  """
  marker = "@highlight"

  # partition() keeps the whole story as the body when the marker is absent,
  # instead of slicing with the -1 that str.find() would return.
  para, _, highlights_part = single_complete_story.partition(marker)

  # Filter on each stripped piece, not on the length of the list itself.
  stripped_pieces = (piece.strip() for piece in highlights_part.split(marker))
  highlights = [piece for piece in stripped_pieces if piece]

  return para, highlights

In [None]:
stories_dir = "./cnn/stories"

paragraphs = list()
summaries = list()

# os.listdir() returns names in arbitrary order; sorting keeps the story
# indices (and the positional train/CV/test slices taken later) reproducible.
for story_filename in sorted(os.listdir(stories_dir)):

  single_story_path = os.path.join(stories_dir, story_filename)
  single_complete_story = load_story(single_story_path)

  para, highlights = split_story_into_para_highlights(single_complete_story)

  paragraphs.append(para)
  summaries.append(highlights)

# Two parallel lists: entry i of each list belongs to the same story.
stories = {"Story_paragraphs": paragraphs, "Abstractive_summaries": summaries}

# We have now converted our data into abstractive text summarization dataset. 

In [None]:
import pickle

In [None]:
# Persist the raw stories. The context manager flushes and closes the file,
# so the pickle is complete on disk before it is read back.
with open("cnn_news_stories.pkl", "wb") as pkl_file_handle:
  pickle.dump(stories, pkl_file_handle)

In [None]:
# Reload the stories written above. NOTE: pickle must only be used on files
# this notebook produced itself; never unpickle untrusted data.
with open("./cnn_news_stories.pkl", "rb") as pkl_file_handle:
  stories = pickle.load(pkl_file_handle)

In [None]:
stories["Story_paragraphs"][0]

# Lets create a small function to preprocess each line of paragraphs as well as abstractive summaries. 

In [None]:
import string

In [None]:
def preprocess_single_sent_per_story(sents_per_story):
  """Normalise the sentences of one story.

  Each sentence is cut after a leading "(CNN)" marker (when "(CNN) -- " is
  present), lower-cased, stripped of ASCII punctuation, and reduced to its
  purely alphabetic tokens. Sentences that end up empty are dropped.

  Args:
    sents_per_story: Iterable of raw sentence strings.

  Returns:
    List of cleaned, space-joined sentences.
  """
  # ASCII punctuation code points: '!'..'/', ':'..'@', '['..'`', '{'..'~'.
  punctuation_codes = [*range(33, 48), *range(58, 65), *range(91, 97), *range(123, 127)]
  drop_punctuation = dict.fromkeys(punctuation_codes)

  cleaned_sents = []
  for raw_sent in sents_per_story:

    marker_pos = raw_sent.find('(CNN) -- ')
    if marker_pos != -1:
      # Keep only what follows "(CNN)"; the remaining " -- " is removed by
      # the punctuation filter below.
      raw_sent = raw_sent[marker_pos + len('(CNN)'):]

    kept_tokens = []
    for token in raw_sent.split():
      token = token.lower().translate(drop_punctuation)
      if token.isalpha():
        kept_tokens.append(token)

    cleaned = ' '.join(kept_tokens)
    if cleaned:
      cleaned_sents.append(cleaned)

  return cleaned_sents

In [None]:
from tqdm.notebook import tqdm

In [None]:
# Clean every story in place: the body is split into lines first, the
# highlights are already a list of strings.
paragraphs_col = stories["Story_paragraphs"]
summaries_col = stories["Abstractive_summaries"]

for i in tqdm(range(len(paragraphs_col))):

  body_lines = paragraphs_col[i].split("\n")
  paragraphs_col[i] = preprocess_single_sent_per_story(body_lines)
  summaries_col[i] = preprocess_single_sent_per_story(summaries_col[i])

In [None]:
stories["Story_paragraphs"][0]

In [None]:
stories["Abstractive_summaries"][0]

# Now, we will be making each story paragraphs as short and as concise as possible. The way to do this is by using ROUGE (Recall-Oriented Understudy for Gisting Evaluation) score. We will be here using ROUGE score to extract the most relevant sentences from each story paragraph based on the given summary corresponding to that story paragraph. 

# Concretely, for each story we compute the ROUGE score between every sentence of the story paragraph and each of the summaries corresponding to that paragraph, and keep the best score per sentence. We then select the top 5 sentences of the story paragraph according to that score.

# In this manner, we will be making the story paragraphs concise. 

# To know more about ROUGE score and what extractive summarization is, please navigate to this url: 

https://arxiv.org/abs/1807.02305

In [None]:
! pip install Rouge

In [None]:
from rouge import Rouge

In [None]:
rouge_obj = Rouge()

In [None]:
def compute_rouge_score(story_para_sent, abstractive_summaries):
  """Return the best ROUGE-1 F-score of a sentence against a story's summaries.

  Args:
    story_para_sent: One cleaned sentence of the story paragraph.
    abstractive_summaries: Iterable of cleaned highlight strings.

  Returns:
    The maximum ``['rouge-1']['f']`` value over all summaries, or 0.0 when
    there are no summaries to compare against.
  """
  scores = (
      rouge_obj.get_scores(summary, story_para_sent)[0]['rouge-1']['f']
      for summary in abstractive_summaries
  )
  # default= keeps a story without highlights from raising ValueError in max().
  return max(scores, default=0.0)

In [None]:
def fetch_each_story_top5_para_sents(story_para, abstractive_summaries, top_k=5):
  """Pick the story sentences that score highest against the summaries.

  Args:
    story_para: List of cleaned sentences of one story.
    abstractive_summaries: List of cleaned highlights of the same story.
    top_k: Number of sentences to keep (defaults to 5).

  Returns:
    A tuple ``(sentences, scores)``: the list of selected sentences in order
    of decreasing score, and a NumPy array with their scores.

  Raises:
    ValueError: If ``top_k`` is negative.
  """
  if top_k < 0:
    raise ValueError("top_k must be non-negative")

  sentences = np.array(list(story_para))
  scores = np.array(
      [compute_rouge_score(sent, abstractive_summaries) for sent in story_para],
      dtype=float,
  )

  # One argsort yields both the selected sentences and their scores, so the
  # two outputs cannot get out of step.
  top_idx = np.argsort(scores)[::-1][:top_k]

  return list(sentences[top_idx]), scores[top_idx]

In [None]:
fetch_each_story_top5_para_sents(stories["Story_paragraphs"][0],stories["Abstractive_summaries"][0])

In [None]:
len(stories["Story_paragraphs"])

In [None]:
# story index -> selected sentences / story index -> their scores
all_stories_top5_sents_dict = {}
all_stories_top5_sents_scores = {}

num_stories = len(stories["Story_paragraphs"])

for story_idx in tqdm(range(num_stories)):

  selected = fetch_each_story_top5_para_sents(
      stories["Story_paragraphs"][story_idx],
      stories["Abstractive_summaries"][story_idx],
  )
  all_stories_top5_sents_dict[story_idx] = selected[0]
  all_stories_top5_sents_scores[story_idx] = selected[1]

In [None]:
# Persist both lookup tables. Each file is closed (and therefore flushed) as
# soon as its dump finishes, so neither handle is leaked.
with open("./all_stories_top5_sents_dict.pkl", "wb") as pkl_file_handle:
  pickle.dump(all_stories_top5_sents_dict, pkl_file_handle)

with open("./all_stories_top5_sents_scores.pkl", "wb") as pkl_file_handle:
  pickle.dump(all_stories_top5_sents_scores, pkl_file_handle)

In [None]:
# Reload the lookup tables written above. NOTE: only unpickle files produced
# by this notebook; pickle is unsafe on untrusted data.
with open("./all_stories_top5_sents_dict.pkl", "rb") as pkl_file_handle:
  all_stories_top5_sents_dict = pickle.load(pkl_file_handle)

with open("./all_stories_top5_sents_scores.pkl", "rb") as pkl_file_handle:
  all_stories_top5_sents_scores = pickle.load(pkl_file_handle)

# Let's now create a Pandas DataFrame with one row per sentence of every story paragraph. Each row holds the story index (Story_idx), the position of the sentence inside its story (Sent_idx), the sentence text (Para_sents), and a label telling whether the sentence is one of the top 5 sentences selected for the extractive summary (Extractive_label).

In [None]:
len(stories["Story_paragraphs"])

In [None]:
# Four parallel lists, one entry per sentence of every story.
story_idx = []
sent_idx = []
sents_list = []
extractive_label = []

for i in tqdm(range(len(stories["Story_paragraphs"]))):

  top5_para_sents = all_stories_top5_sents_dict[i]
  story_sents = stories["Story_paragraphs"][i]

  story_idx.extend([i] * len(story_sents))
  sent_idx.extend(range(len(story_sents)))
  sents_list.extend(story_sents)
  # Membership is tested on the sentence text, so every repetition of a
  # selected sentence inside the story is labelled 1.
  extractive_label.extend(int(sent in top5_para_sents) for sent in story_sents)

In [None]:
# One row per sentence; columns in the same order as before.
extractive_summaries_df = pd.DataFrame(
    {
        "Story_idx": story_idx,
        "Sent_idx": sent_idx,
        "Para_sents": sents_list,
        "Extractive_label": extractive_label,
    }
)

In [None]:
extractive_summaries_df.head()

In [None]:
len(extractive_summaries_df["Story_idx"].unique())

In [None]:
extractive_summaries_df.to_pickle("extractive_summaries_df.pkl")

In [None]:
data = pd.read_pickle("extractive_summaries_df.pkl")

In [None]:
data.head()

In [None]:
len(data)

# Let's divide our data into Training, Cross Validation and Testing Data. 

In [None]:
data_story_sents_count = data.groupby("Story_idx").size().reset_index(name="Sentences_count")

In [None]:
data_story_sents_count.head()

In [None]:
selected_stories_idx = list(data_story_sents_count[data_story_sents_count["Sentences_count"] <= 20]["Story_idx"])

In [None]:
len(selected_stories_idx)

In [None]:
# Split at story level so all sentences of a story land in the same set:
# first 30000 selected stories train, next 10000 validate, the rest test.
train_story_ids, cv_story_ids, test_story_ids = (
    selected_stories_idx[:30000],
    selected_stories_idx[30000:40000],
    selected_stories_idx[40000:],
)

story_column = data["Story_idx"]

training_data = data[story_column.isin(train_story_ids)]
cv_data = data[story_column.isin(cv_story_ids)]
testing_data = data[story_column.isin(test_story_ids)]

In [None]:
selected_stories_idx

In [None]:
len(training_data["Story_idx"].unique())

In [None]:
training_data.head()

In [None]:
len(cv_data["Story_idx"].unique())

In [None]:
cv_data.head()

# Now, let's compute the maximum number of sentences that a story paragraph contains in the training data.

In [None]:
training_data = training_data.sort_values(["Story_idx","Sent_idx"])
sents_count = training_data.groupby("Story_idx").size().reset_index(name="Sentences_count")

In [None]:
sents_count["Sentences_count"].describe()

In [None]:
story_max_length = sents_count["Sentences_count"].max()

In [None]:
story_max_length

In [None]:
unique_sents = set(training_data["Para_sents"].tolist())

In [None]:
len(unique_sents)

In [None]:
num_labels = len(training_data["Extractive_label"].unique())

In [None]:
num_labels

In [None]:
np.sort(training_data["Extractive_label"].unique())

In [None]:
# Label ids start at 1 (in sorted label order); id 0 is reserved for padding.
sorted_labels = np.sort(training_data["Extractive_label"].unique())

labels2idx = dict(zip(sorted_labels, range(1, len(sorted_labels) + 1)))
labels2idx["PAD"] = 0

# Reverse lookup: id -> label.
idx2labels = dict(zip(labels2idx.values(), labels2idx.keys()))
print(labels2idx)

# Let's now add two more columns into the Training, Cross Validation as well as Testing Data. 

In [None]:
training_data.head()

In [None]:
def create_token_count_list(df):
  """Return a copy of ``df`` with token-level columns added.

  Args:
    df: DataFrame with a "Para_sents" column of sentence strings.

  Returns:
    A new DataFrame with two extra columns: "Number_tokens" (number of
    whitespace-separated tokens) and "Tokens_list" (the tokens themselves).
    The input frame is left untouched.
  """
  # Work on a copy: the callers pass slices of a larger frame, and writing
  # columns into such a slice triggers pandas' SettingWithCopy behaviour.
  df = df.copy()

  # Split once and derive the count from the result.
  tokens = df["Para_sents"].apply(str.split)
  df['Number_tokens'] = tokens.apply(len)
  df['Tokens_list'] = tokens
  return df

In [None]:
training_data = create_token_count_list(training_data)
cv_data = create_token_count_list(cv_data)
testing_data = create_token_count_list(testing_data)

In [None]:
training_data.head()

# Now, let's compute the total number of unique tokens inside the training data paragraphs. 

In [None]:
from itertools import chain

In [None]:
# Vocabulary of the training sentences.
total_unique_tokens = set(chain.from_iterable(training_data['Tokens_list']))
num_unique_tokens = len(total_unique_tokens)

# Ids 0 and 1 are reserved for padding and unknown tokens. Sorting the tokens
# makes the ids identical on every run; iterating the bare set would follow
# string hashing, which changes between interpreter sessions.
token2idx = {token: i for i, token in enumerate(sorted(total_unique_tokens), start=2)}
token2idx["UNK"] = 1
token2idx["PAD"] = 0

# Reverse lookup: id -> token.
idx2token = {i: token for token, i in token2idx.items()}

In [None]:
len(idx2token)

In [None]:
def create_sent_label_example(df):
  """Return a copy of ``df`` with a "Sent_example" column added.

  Args:
    df: DataFrame with "Para_sents" and "Extractive_label" columns.

  Returns:
    A new DataFrame whose "Sent_example" column holds one
    ``(sentence, label)`` tuple per row. The input frame is left untouched.
  """
  # Copy first so that a slice passed by the caller is never written to.
  df = df.copy()

  # Zipping the two columns builds the same tuples as a row-wise apply,
  # without the per-row overhead, and also works on an empty frame.
  df["Sent_example"] = list(zip(df["Para_sents"], df["Extractive_label"]))
  return df

In [None]:
training_data = create_sent_label_example(training_data)
cv_data = create_sent_label_example(cv_data)
testing_data = create_sent_label_example(testing_data)

In [None]:
training_data.iloc[0]["Sent_example"]

In [None]:
max_sent_length = 40

def stories_representation(df):
  """Encode every story of ``df`` as a fixed-size grid of token ids.

  Args:
    df: DataFrame with "Story_idx" and "Sent_example" columns, where each
      "Sent_example" is a ``(sentence, label)`` tuple.

  Returns:
    A tuple ``(X_token, stories_examples)``. ``X_token`` has shape
    ``(number of stories, story_max_length, max_sent_length)`` and holds
    token ids, padded with the "PAD" id. ``stories_examples`` is the list of
    per-story lists of ``(sentence, label)`` tuples, in the same story order.
  """
  pad_idx = token2idx["PAD"]
  unk_idx = token2idx["UNK"]

  # sort=False keeps the stories in order of first appearance, exactly like
  # df['Story_idx'].unique(), while grouping in a single pass.
  stories_examples = [
      list(story_rows["Sent_example"])
      for _, story_rows in tqdm(df.groupby("Story_idx", sort=False))
  ]

  # Allocate once, after all stories are known. Doing this inside the
  # per-story loop re-encoded every earlier story on each iteration.
  X_token = np.full((len(stories_examples), story_max_length, max_sent_length), pad_idx, dtype=float)

  for idx, story_example in enumerate(stories_examples):

    # Sentences beyond story_max_length and tokens beyond max_sent_length
    # are dropped; shorter stories/sentences keep the padding id.
    for i, sent_example in enumerate(story_example[:story_max_length]):

      tokens = sent_example[0].split()[:max_sent_length]
      # Tokens missing from the training vocabulary map to "UNK" instead of
      # None, which cannot be stored as a valid id in the numeric array.
      token_ids = [token2idx.get(token, unk_idx) for token in tokens]
      X_token[idx, i, :len(token_ids)] = token_ids

  return (X_token, stories_examples)

In [None]:
X_train,Y_train = stories_representation(training_data)

In [None]:
X_train.shape

In [None]:
X_cv,Y_cv = stories_representation(cv_data)

In [None]:
X_cv.shape

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
def prepare_labels(story_examples):
    """Turn per-story (sentence, label) examples into a padded label array.

    Args:
        story_examples: List of stories, each a list of (sentence, label)
            tuples.

    Returns:
        Array of label ids reshaped to (-1, story_max_length, 1); stories are
        padded or truncated at the end to story_max_length using the "PAD" id.
    """
    label_ids_per_story = []
    for sent_examples in story_examples:
        label_ids_per_story.append([labels2idx[example[1]] for example in sent_examples])

    padded = pad_sequences(
        sequences=label_ids_per_story,
        maxlen=story_max_length,
        value=labels2idx["PAD"],
        padding='post',
        truncating='post',
    )

    return padded.reshape(-1, story_max_length, 1)

In [None]:
train_labels = prepare_labels(Y_train)
cv_labels = prepare_labels(Y_cv)

In [None]:
import tensorflow as tf

In [None]:
# Pair the encoded stories with the padded label arrays built by
# prepare_labels. Y_train / Y_cv are the raw, ragged lists of
# (sentence, label) tuples and cannot be used as training targets.
training_data_batch_gen = tf.data.Dataset.from_tensor_slices((X_train, train_labels))
training_data_batch_gen = (training_data_batch_gen.batch(64).cache().prefetch(tf.data.experimental.AUTOTUNE))

cv_data_batch_gen = tf.data.Dataset.from_tensor_slices((X_cv, cv_labels))
cv_data_batch_gen = (cv_data_batch_gen.batch(64).cache().prefetch(tf.data.experimental.AUTOTUNE))

In [None]:
! wget https://nlp.stanford.edu/data/glove.6B.zip

In [None]:
! unzip /content/drive/MyDrive/glove.6B.zip

In [None]:
def create_embedding_matrix(token_idxes, embedding_path, topic_vector_dim):
  """Build an embedding matrix for the vocabulary from a text vector file.

  Each line of the file is expected to hold a token followed by its vector
  coefficients, separated by whitespace.

  Args:
    token_idxes: Mapping from token to its row index in the matrix.
    embedding_path: Path of the text file with the pre-trained vectors.
    topic_vector_dim: Number of coefficients per vector.

  Returns:
    Array of shape ``(len(token_idxes), topic_vector_dim)``. Rows of tokens
    that do not occur in the file stay all-zero.
  """
  num_words = len(token_idxes)
  embedding_matrix = np.zeros((num_words, topic_vector_dim))

  # Explicit UTF-8: the vector file contains non-ASCII tokens, and the
  # platform default encoding is not guaranteed to decode them.
  with open(embedding_path, encoding="utf-8") as file_handle:

    for line in file_handle:

      values = line.split()
      if not values:
        continue  # tolerate blank lines

      # Only vectors of vocabulary tokens are kept, so the whole file never
      # has to be held in memory.
      idx = token_idxes.get(values[0])
      if idx is not None:
        embedding_matrix[idx] = np.asarray(values[1:], dtype='float32')

  return embedding_matrix

In [None]:
from tensorflow.keras.layers import Input, TimeDistributed, Embedding, Convolution1D, Dense, Flatten, Activation, RepeatVector, Permute, multiply
from tensorflow.keras.layers import Lambda, Bidirectional, LSTM
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K

In [None]:
embedding_matrix_txt_path = "/content/drive/MyDrive/glove.6B.100d.txt"
topic_vectors_dim = 100

def text_summarization_model():
  """Build the (uncompiled) sentence-labelling network.

  The input is one story as a grid of token ids with shape
  (story_max_length, max_sent_length). Every sentence is embedded, passed
  through a 1-D convolution, pooled into a single vector with softmax
  weights over its tokens, and the sequence of sentence vectors is fed to a
  bidirectional LSTM that emits num_labels + 1 class probabilities per
  sentence.

  Returns:
    The Keras Model mapping the token grid to per-sentence probabilities.
  """
  pretrained_vectors = create_embedding_matrix(token2idx, embedding_matrix_txt_path, topic_vectors_dim)

  story_tokens = Input(shape=(story_max_length, max_sent_length,))

  # Embedding initialised from the pre-trained vectors and fine-tuned.
  token_embedder = Embedding(
      input_dim=(num_unique_tokens + 2),
      output_dim=topic_vectors_dim,
      input_length=max_sent_length,
      weights=[pretrained_vectors],
      trainable=True,
  )
  token_vectors = TimeDistributed(token_embedder)(story_tokens)

  conv_features = TimeDistributed(Convolution1D(32, 2, activation='relu', padding='same'))(token_vectors)

  # One weight per token (softmax over the sentence), repeated 32 times and
  # permuted so it can be multiplied element-wise with the 32 conv channels.
  token_weights = TimeDistributed(Dense(1, activation='tanh'))(conv_features)
  token_weights = TimeDistributed(Flatten())(token_weights)
  token_weights = TimeDistributed(Activation('softmax'))(token_weights)
  token_weights = TimeDistributed(RepeatVector(32))(token_weights)
  token_weights = TimeDistributed(Permute([2, 1]))(token_weights)
  weighted_features = multiply([conv_features, token_weights])

  # Sum the weighted features over axis -2 to get one vector per sentence.
  sent_embedding = TimeDistributed(Lambda(lambda x: K.sum(x, axis=-2)))(weighted_features)

  sentence_context = Bidirectional(LSTM(units=16, return_sequences=True))(sent_embedding)
  label_probs = TimeDistributed(Dense(num_labels + 1, activation='softmax'))(sentence_context)

  return Model([story_tokens], label_probs)

In [None]:
model = text_summarization_model()

In [None]:
# Settings read by lr_scheduler.
lr_start = 1e-5  # learning rate at epoch 0
lr_max = 1e-3  # learning rate once the ramp-up is over
lr_rampup_epochs = 5  # number of epochs of linear ramp-up
lr_to_sustain_epochs = 0  # epochs held at lr_max after the ramp-up
lr_step_decay = 0.75  # factor applied once per 10 epochs afterwards

In [None]:
def lr_scheduler(epoch):
  """Return the learning rate to use for the given epoch.

  The rate rises linearly from lr_start towards lr_max during the first
  lr_rampup_epochs epochs, stays at lr_max for lr_to_sustain_epochs epochs,
  and is then multiplied by lr_step_decay once for every 10 further epochs.

  Args:
    epoch: Zero-based epoch index.

  Returns:
    The learning rate for that epoch.
  """
  if epoch < lr_rampup_epochs:
    slope = (lr_max - lr_start) / lr_rampup_epochs
    return slope * epoch + lr_start

  if epoch < lr_rampup_epochs + lr_to_sustain_epochs:
    return lr_max

  completed_decay_steps = (epoch - lr_rampup_epochs - lr_to_sustain_epochs)//10
  return lr_max * lr_step_decay**completed_decay_steps

In [None]:
# Apply the custom schedule at the start of every epoch.
lr_scheduler_cb = tf.keras.callbacks.LearningRateScheduler(
    lr_scheduler,
    verbose=True,
)
# Stop after 2 epochs without val_loss improvement and keep the best weights.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=2,
    restore_best_weights=True,
)

In [None]:
# The module is `tf.keras.optimizers` (plural), and the argument is named
# `learning_rate`; `lr` is only a deprecated alias.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)

In [None]:
# Sparse categorical cross-entropy: the targets are integer label ids, not
# one-hot vectors.
model.compile(optimizer=optimizer, loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [None]:
# Train for at most 50 epochs; the callbacks set the learning rate per epoch
# and stop early when val_loss stops improving.
model.fit(training_data_batch_gen, validation_data=cv_data_batch_gen,epochs=50,callbacks=[lr_scheduler_cb, early_stopping_cb], verbose=1)

# Write the code to perform the inference on this network and provide the output as extractive summary to the input paragraph. 