# Summarizing Pubmed Articles

In [None]:
%%bash
#install necessary libraries
pip install -r requirements.txt

In [18]:
# import necessary libraries
import os
import re
import numpy as np
import torch
import time
import json 
import warnings 
import tabulate
import pandas as pd
from evaluate import load
from nltk import word_tokenize
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from transformers import pipeline, set_seed

In [5]:
set_seed(102)

#load evaluation metric
METRIC = load("rouge")

# Define device type
DEVICE = 0 if torch.cuda.is_available() else -1
DEVICE_NAME="cuda"

# Define constants
NUMBER_OF_RECORDS = 100
MAX_SUMMARY_LENGTH = 200
MAX_TOKEN_SIZE = 512
NUMBER_OF_CHARACTERS = 100
INPUT_DATA_PATH = './pub_data.txt'
SENTENCE_TRANSFORMER_MODEL = 'pritamdeka/S-PubMedBert-MS-MARCO'

#define empty dictionaries for appending metrics
ROUGE_METRICS = {}
TIME_METRICS = {}

# Define text splitters by token size
TEXT_SPLITTER=RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=MAX_TOKEN_SIZE, chunk_overlap=NUMBER_OF_CHARACTERS)

#Different versions of transformer based models
GPT_SUMMARIZATION_MODELS = ['gpt2','gpt2-medium']
T5_SUMMARIZATION_MODELS = ['t5-base','t5-small','gayanin/t5-small-finetuned-pubmed']
BART_SUMMARIZATION_MODELS = ['facebook/bart-base','facebook/bart-large','mse30/bart-base-finetuned-pubmed']
PEGASUS_SUMMARIZATION_MODELS = ['google/pegasus-x-base','google/pegasus-x-large','google/pegasus-pubmed','google/bigbird-pegasus-large-pubmed']
ALL_MODELS = GPT_SUMMARIZATION_MODELS+T5_SUMMARIZATION_MODELS+BART_SUMMARIZATION_MODELS+PEGASUS_SUMMARIZATION_MODELS

warnings.filterwarnings('ignore')

## Load Pubmed Dataset
Load data into a dataframe with article_id, article_text and article_text_raw as attributes

In [9]:
def load_data(path:str=INPUT_DATA_PATH)->pd.DataFrame:
    """
    Load Pubmed dataset to dataframe
    """
    pubmed_data={}
    article_id=""
    article_text=""
    print("############# Started Data Loading ##############")
    with open(path, 'r') as file:
        for line in file:
            article_id = json.loads(line)['article_id']
            article_text_raw = json.loads(line)['article_text']
            article_text = " ".join(article_text_raw)
            abstract_text="".join(x.strip("<S> </S>") for x in json.loads(line)['abstract_text'])
            pubmed_data[article_id]=[article_text,abstract_text,article_text_raw]
    df=pd.DataFrame.from_dict(pubmed_data,orient='index',columns=['article_text','abstract_text','article_text_raw'])
    print("############# Finished Data Loading ##############")
    return df

## Chunker Class to generate normal and semantic chunks

In [18]:
class chunker():

    def get_pubmed_embeddings(self,sentence_list:list)->list:
        """
        Get the list of embeddings from pretrained model on pubmed data for the sentences
        """
        sentence_model = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL)
        embeddings = sentence_model.encode(sentence_list)
        return embeddings

    def calculate_cosine_distances(self,sentence_list:list) -> tuple[list,list]:
        """
        Find the cosine distances between adjacent sentences
        """
        distances = []
        for i in range(len(sentence_list) - 1):
            embedding_current = sentence_list[i]['combined_sentence_embedding']
            embedding_next = sentence_list[i + 1]['combined_sentence_embedding']
            
            # Calculate cosine similarity
            similarity = cosine_similarity([embedding_current], [embedding_next])[0][0]
            
            # Convert to cosine distance
            distance = 1 - similarity
    
            # Append cosine distance to the list
            distances.append(distance)
    
            # Store distance in the dictionary
            sentence_list[i]['distance_to_next'] = distance
    
        return distances, sentence_list

    def combine_sentences(self,sentence_list:list, buffer_size:int=1)->list:
        """
        Combine buffer size number of adjacent sentences into a single sentence 
        """
        # Go through each sentence dict
        for i in range(len(sentence_list)):
    
            # Create a string that will hold the sentences which are joined
            combined_sentence = ''
    
            # Add sentences before the current one, based on the buffer size.
            for j in range(i - buffer_size, i):
                # Check if the index j is not negative (to avoid index out of range like on the first one)
                if j >= 0:
                    # Add the sentence at index j to the combined_sentence string
                    combined_sentence += sentence_list[j]['sentence'] + ' '
    
            # Add the current sentence
            combined_sentence += sentence_list[i]['sentence']
    
            # Add sentences after the current one, based on the buffer size
            for j in range(i + 1, i + 1 + buffer_size):
                # Check if the index j is within the range of the sentences list
                if j < len(sentence_list):
                    # Add the sentence at index j to the combined_sentence string
                    combined_sentence += ' ' + sentence_list[j]['sentence']
    
            # Then add the whole thing to your dict
            # Store the combined sentence in the current sentence dict
            sentence_list[i]['combined_sentence'] = combined_sentence
    
        return sentence_list

    def create_semantic_chunks(self,article_text_list:list)->list:
        """
        Takes in list of sentences of the article and creates semantic chunks of them
        """
        breakpoint_percentile_threshold = 95
        #single_sentences_list = re.split(r'(?<=\.)\s+', article_text)
        sentences = [{'sentence': x, 'index' : i} for i, x in enumerate(article_text_list)]
        sentences = self.combine_sentences(sentences)
        embeddings = self.get_pubmed_embeddings([x['combined_sentence'] for x in sentences])
        for i, sentence in enumerate(sentences):
            sentence['combined_sentence_embedding'] = embeddings[i]
        distances, sentences = self.calculate_cosine_distances(sentences)
        breakpoint_distance_threshold = np.percentile(distances, breakpoint_percentile_threshold)
        print(breakpoint_distance_threshold)
        indices_above_thresh = [i for i, x in enumerate(distances) if x > breakpoint_distance_threshold]
        # Initialize the start index
        start_index = 0
        
        # Create a list to hold the grouped sentences
        chunks = []
        
        # Iterate through the breakpoints to slice the sentences
        for index in indices_above_thresh:
            # The end index is the current breakpoint
            end_index = index
        
            # Slice the sentence_dicts from the current start index to the end index
            group = sentences[start_index:end_index + 1]
            group_sentence_list = [d['sentence'] for d in group]
            combined_text = ' '.join(group_sentence_list)
            # call the function create_semantic_chunks recursively if the chunk size is greater than MAX_TOKEN_SIZE
            if len(word_tokenize(combined_text)) > MAX_TOKEN_SIZE:
                #chunks = chunks + TEXT_SPLITTER.split_text(combined_text)
                chunks = chunks + self.create_semantic_chunks(group_sentence_list)
            else:
                chunks.append(combined_text)
            
            # Update the start index for the next group
            start_index = index + 1
        
        # The last group, if any sentences remain
        if start_index < len(sentences):
            group_sentence_list = [d['sentence'] for d in sentences[start_index:]]
            combined_text = ' '.join(group_sentence_list)
            # call the function create_semantic_chunks recursively if the chunk size is greater than MAX_TOKEN_SIZE
            if len(word_tokenize(combined_text)) > MAX_TOKEN_SIZE:
                #chunks = chunks + TEXT_SPLITTER.split_text(combined_text)
                chunks = chunks + self.create_semantic_chunks(group_sentence_list)
            else:
                chunks.append(combined_text)
        return chunks

    def get_chunks(self,df:pd.DataFrame)->pd.DataFrame:
        """
        creates a new column with the normal chunks of the article text for every record
        """
        print("############# Generating Chunks ##############")
        df['chunks'] = df['article_text'].apply(lambda x:TEXT_SPLITTER.split_text(x))
        print("############# Completed Generating Chunks ##############")
        return df
        
    def get_semantic_chunks(self,df:pd.DataFrame)->pd.DataFrame:
        """
        creates a new column with the semantic chunks of the article text for every record
        """
        print("############# Generating Semantic Chunks ##############")
        df['semantic_chunks'] = df['article_text_raw'].apply(lambda x:self.create_semantic_chunks(x))
        print("############# Completed Generating Semantic Chunks ##############")
        return df

## Summarization Class

In [11]:
class text_summarizer():
    # define variable to use fast tokenization
    USE_FAST=True

    def get_summary_gpt(self,model:str,chunks:list)->str:
        """
        Takes in the gpt model and the list of chunks to generate the output summary
        """
        final_summary_list=[]
        # define pipeline for generating summary
        summarizer = pipeline('text-generation', model = model, device=DEVICE)
        # iterate over chunks to generate summary
        for chunk in chunks:
            query = chunk + "\nTL;DR:\n"
            pipe_out = summarizer(query,max_new_tokens=MAX_SUMMARY_LENGTH,clean_up_tokenization_spaces=True)
            # skip the input chunk and also the 8 characters \nTL;DR:\n at the end to get the summary
            final_summary_list.append(pipe_out[0]['generated_text'][len(chunk)+8:])
        # combine all the summaries into one
        final_summary = ''.join(final_summary_list)
        # if final summary is greater than 512 tokens, recursively summarize 
        if len(word_tokenize(final_summary))>MAX_TOKEN_SIZE:
            return self.get_summary_gpt(model,TEXT_SPLITTER.split_text(final_summary))
        # finally summarize the combined summary
        output_summary = summarizer(final_summary,max_new_tokens=MAX_SUMMARY_LENGTH,clean_up_tokenization_spaces=True)
        # skip the input chunk and also the 8 characters \nTL;DR:\n at the end to get the summary
        return output_summary[0]['generated_text'][len(chunk)+8:]
    
    def get_summary_others(self,model:str,chunks:list)->str:
        """
        Takes in the other model and the list of chunks to generate the output summary
        """
        final_summary_list=[]
        # disable fast tokenization as it is not compatible with google/pegasus-pubmed
        if model == PEGASUS_SUMMARIZATION_MODELS[2]:
            self.USE_FAST=False
        # define pipeline for generating summary
        summarizer = pipeline('summarization', model = model, device=DEVICE,use_fast=self.USE_FAST)
        # iterate over chunks to generate summary
        for chunk in chunks:
            query = chunk
            pipe_out = summarizer(query,max_new_tokens=MAX_SUMMARY_LENGTH,clean_up_tokenization_spaces=True)
            final_summary_list.append(pipe_out[0]['summary_text'])
        # combine all the summaries into one
        final_summary = ''.join(final_summary_list)
        # if final summary is greater than 512 tokens, recursively summarize 
        if len(word_tokenize(final_summary))>MAX_TOKEN_SIZE:
            return self.get_summary(model,TEXT_SPLITTER.split_text(final_summary))
        # finally summarize the combined summary
        output_summary = summarizer(final_summary,max_new_tokens=MAX_SUMMARY_LENGTH,clean_up_tokenization_spaces=True)
        return output_summary[0]['summary_text']

## Predictions Class

In [21]:
class run_predictions():

    def run_summarization_models(self,data:pd.DataFrame,model_list:list=GPT_SUMMARIZATION_MODELS,chunking_type:str="chunks")->tuple[pd.DataFrame,pd.DataFrame]:
        """
        Takes the dataframe, list of models and chunking type as the inputs and stores the predictions in a new column in the dataframe
        """
        # iterate through all the models to run the predictions
        for model in model_list:
            print("############# Summarizing using {} ##############".format(model))
            start = time.time()
            if model.startswith('gpt'):
                data[model+'_'+chunking_type+'_predictions']=data[chunking_type].apply(lambda chunks:text_summarizer().get_summary_gpt(model,chunks))
            else:
                data[model+'_'+chunking_type+'_predictions']=data[chunking_type].apply(lambda chunks:text_summarizer().get_summary_others(model,chunks))
            end = time.time()
            print("############# Predictions completed for {} ############## \n".format(model))
            print("############# Logging Metrics ##############")
            ROUGE_METRICS[model+'_'+chunking_type+'_predictions']=METRIC.compute(predictions=data[model+'_'+chunking_type+'_predictions'], references=data['abstract_text'])
            TIME_METRICS[model+'_'+chunking_type+'_predictions']=round(end-start, 2)
            print("############# Logging Completed ############## \n")
        ROUGE_METRICS = pd.DataFrame.from_dict(ROUGE_METRICS,orient='index').sort_values(by=['rouge1'],ascending=False)
        TIME_METRICS = pd.DataFrame.from_dict(TIME_METRICS,columns=['time(seconds)'],orient='index').sort_values(by=['time(seconds)'],ascending=False)
        return ROUGE_METRICS, TIME_METRICS


## Running all together

In [15]:
%%time
pubmed_df=load_data()

############# Started Data Loading ##############
############# Finished Data Loading ##############
CPU times: total: 281 ms
Wall time: 452 ms


In [14]:
%%time
#Randomly select 100 records
pubmed_df_N=pubmed_df.sample(n=100)

CPU times: total: 0 ns
Wall time: 30.7 ms


In [19]:
%%time
pubmed_df_N = chunker().get_chunks(pubmed_df_N)

############# Generating Chunks ##############
############# Completed Generating Chunks ##############
CPU times: total: 3.45 s
Wall time: 4.42 s


In [None]:
%%time
pubmed_df_N = chunker().get_semantic_chunks(pubmed_df_N)

In [None]:
%%time
ROUGE_METRICS,TIME_METRICS=run_predictions().run_summarization_models(data=pubmed_df_N,model_list=ALL_MODELS)

In [22]:
print(ROUGE_METRICS)

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
google/bigbird-pegasus-large-pubmed_predictions,0.388646,0.154228,0.23243,0.237661
google/pegasus-pubmed_predictions,0.383846,0.152244,0.232697,0.237762
google/pegasus-pubmed_semantic_predictions,0.379763,0.155475,0.236825,0.240366
google/bigbird-pegasus-large-pubmed_semantic_predictions,0.37557,0.139944,0.223682,0.227538
mse30/bart-base-finetuned-pubmed_predictions,0.371113,0.146808,0.228494,0.252258
mse30/bart-base-finetuned-pubmed_semantic_predictions,0.355427,0.140262,0.220893,0.246486
facebook/bart-base_semantic_predictions,0.337382,0.113309,0.192816,0.196912
facebook/bart-base_predictions,0.334315,0.114154,0.192467,0.196672
facebook/bart-large_semantic_predictions,0.314529,0.091228,0.172488,0.176876
facebook/bart-large_predictions,0.309903,0.086964,0.172292,0.175652


In [27]:
print(TIME_METRICS)

Unnamed: 0,time(seconds)
google/pegasus-pubmed_semantic_predictions,7807.32
google/pegasus-pubmed_predictions,7218.26
google/bigbird-pegasus-large-pubmed_semantic_predictions,7007.86
gpt2-medium_predictions,5967.51
google/bigbird-pegasus-large-pubmed_predictions,5845.54
facebook/bart-large_semantic_predictions,4702.63
gpt2_semantic_predictions,4397.33
facebook/bart-large_predictions,4316.24
google/pegasus-x-large_semantic_predictions,4139.57
google/pegasus-x-base_semantic_predictions,3949.54
