In [9]:
###########
# IMPORTS #
###########
import tensorflow as tf

#Abstractive Summarizer Imports
import datasets
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import BertTokenizer, EncoderDecoderModel

#Extractive Summarizer Imports
from tqdm import tqdm_pandas
from summarizer import Summarizer

#Cosine Similarity Imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

#Utility Imports
from functools import reduce
from operator import add
import pandas as pd
import numpy as np

#Dataset Library Imports
from sklearn.datasets import fetch_20newsgroups

In [10]:
%%capture
###############
# GLOBAL VARS #
###############

vectorizer = TfidfVectorizer()
tokenizer = AutoTokenizer.from_pretrained("patrickvonplaten/bert2bert_cnn_daily_mail")  
abstractive_summarizer_model = AutoModelForSeq2SeqLM.from_pretrained("patrickvonplaten/bert2bert_cnn_daily_mail")
extractive_summarizer_model = Summarizer()

In [11]:
########
# DATA #
########

#CNN Dailymail dataset, initially used for testing summarizers
test_data = datasets.load_dataset("cnn_dailymail", "3.0.0", split="test")
twenty_news_dataset = fetch_20newsgroups()

#Toy data for testing the pipeline
covid_test_data = pd.read_csv("../nyt_data_collection/toy_data/fp_covid_articles.csv")
golf_test_data = pd.read_csv("../nyt_data_collection/toy_data/golf_articles.csv")

Reusing dataset cnn_dailymail (/home/jupyter/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/0128610a44e10f25b4af6689441c72af86205282d26399642f7db38fa7535602)


In [12]:
####################
# HELPER FUNCTIONS #
####################

# MISC Helpers
divString = lambda size, char = "#": reduce(add, [char for i in range(size)])
flatten = lambda lst: [i for sublst in lst for i in sublst]


#Batch Summary Generation and Batch Metrics
def generate_summary(batch):
    """This function computes a summary for a given article from the Dataset object
    batch
    Params:
    batch: an article from the given Dataset object."""
    # Tokenizer will automatically set [BOS] <text> [EOS]
    # cut off at BERT max length 512
    inputs = tokenizer(batch["article"], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    input_ids = inputs.input_ids
    attention_mask = inputs.attention_mask
    outputs = model.generate(input_ids, attention_mask=attention_mask)
    # all special tokens including will be removed
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    batch["pred"] = output_str
    return batch


def compute_metrics(batch, batch_size=16, metric_name="rouge"):
    """This function computes the rouge or bleu scores for predicted summaries
    Params:
    batch: A Dataset object which contains the articles at the specified indices
    Use the select method for this function call. 
    Example format: Dataset.select([list of indices to select from the original dataset])
    metric_name: The prefered evaluation metric to use"""
    
    metric = datasets.load_metric(metric_name)
    results = batch.map(generate_summary, batched=True, batch_size=batch_size, remove_columns=["article"])
    summary_pred = results["pred"]
    label_ref = results["highlights"]
    if metric_name == "rouge":
        output = metric.compute(predictions=summary_pred, references=label_ref, rouge_types=["rouge2"])["rouge2"].mid
        print("\n" + "ROUGE SCORE:")
        return output
    else:
        # Else compute bleu score with metric name "sacrebleu"
        all_bleu_scores = []
        for i in range(len(batch)):
            output = metric.compute(predictions= [summary_pred[i]], references= [[label_ref[i]]])
            all_bleu_scores.append(output)
            print("\n\n")
            print(divString(100))
            print("\n\n" + "Summary prediction: " + "\n\n", summary_pred[i])
            print("\n\n" + "Reference Label: " + "\n\n", label_ref[i])
            print("\n\n" + "BLEU SCORE:" + "\n\n", output)
            print("\n")
        return all_bleu_scores
    

#Raw Text Summarization
def generate_abstractive_summary(raw_string, model = abstractive_summarizer_model):
    """This function produces an abstractive summary for a given article"
    Params:
    raw_string: an article string.
    model: An abstractive summarizer model"""
    # Tokenizer will automatically set [BOS] <text> [EOS]
    # cut off at BERT max length 512
    inputs = tokenizer(raw_string, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    input_ids = inputs.input_ids
    attention_mask = inputs.attention_mask
    outputs = model.generate(input_ids, attention_mask=attention_mask)
    # all special tokens including will be removed
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return output_str[0]


def generate_extractive_summary(raw_string, model = extractive_summarizer_model, min_summary_length = 50):
    """This function produces an extractive summary for a given article"
    Params:
    raw_string: an article string.
    model: An extractive summarizer model"""
    output_str = model(raw_string, min_length = min_summary_length)
    return output_str
    
    
#Search and Subset Dataset
def search_and_subset_data(df, keyword, column = "first_paragraph"):
    """
    This function takes in a dataframe, keyword to search, and an optional column to search through and returns a
    subset of the data as a pandas dataframe, with entries that contain the searched keyword.
    Params:
    df: Dataframe
    keyword: A keyword to search
    column: Optional that takes either 'first_paragraph' or 'keywords'
    """
    df = df.sort_values(by='date', ascending = False).reset_index().drop("index", axis = 1) # Ordering all documents chronologically so that indices don't need reodered when combining similar documents
    df = df.dropna(subset=[column])
    subset = df[df[column].str.lower().str.contains(keyword.strip().lower())]
    return subset


def select_random_document(df):
    """
    This function selects a single random row from a dataframe. This is used as a default for initial document selection at the beginning of the pipeline.
    df: A pandas dataframe
    """
    return df.sample()



# Similarity Clustering and Aggregate Document Synthesis
def compute_cosine_similarities(document, corpus, vectorizer = vectorizer):
    """
    This function computes the cosine similarity between a document and a specified corpus of documents.
    document: A string of text.
    corpus: An array of documents.
    vectorizer: A TfidfVectorizer() object. (Default: initialized in the constants cell)
    """
    vectorized_corpus = vectorizer.fit_transform(corpus)
    vectorized_document = vectorizer.transform([document])
    return linear_kernel(vectorized_document, vectorized_corpus).flatten()


def get_related_docs_indices(cos_similarities_array, n_docs=5):
    """
    This function returns the document indices with the highest cosine similarity.
    cos_similarities_array: An array of the computed cosine similarities for the whole corpus.
    n_docs: The number of highest scoring documents to return. (e.g. n_docs=5 returns the top 5 highest scoring documents)
    """
    cos_similarities_array = np.array([i for i in cos_similarities_array if i < 0.999999999]) # This eliminates the case where the most similar document is the document itself, which has a similarity of 1.0
    return sorted(cos_similarities_array.argsort()[:-(n_docs + 1):-1])
    

def get_top_similarities(cos_similarities_array, n_docs=5):
    """
    This function returns the highest document cosine similarity scores.
    cos_similarities_array: An array of the computed cosine similarities for the whole corpus.
    n_docs: The number of highest cosine scores to return. (e.g. n_docs=5 returns the top 5 highest cosine similarity scores)
    """
    related_indices = get_related_docs_indices(cos_similarities_array, n_docs)
    return cos_similarities_array[related_indices]


def concatenate_related_docs(corpus, related_docs_indices):
    docs = [corpus[i] for i in related_docs_indices]
    return " ".join(docs)
    

# Display Functions
def show_related_docs(document, corpus, related_docs_indices):
    """
    This function displays the seed document, selected similar documents, and the concatenated aggregate of the similar documents.
    """
    aggregate_doc = concatenate_related_docs(corpus, related_docs_indices)
    print("\n" + "SELECTED DOCUMENT: " + "\n")
    print(document)
    print("\n")
    print(divString(100))
    print("\n")
    print("SIMILAR DOCUMENTS: " + "\n")
    for i in related_docs_indices:
        print(corpus[i], "\n")
        print(divString(100, char = "~") + "\n")
    print(divString(100))
    print("\n")
    print("AGGREGATE DOCUMENT: " + "\n")
    print(aggregate_doc + "\n")
    print(divString(100))


#Pipeline Iteration
def iterate_pipeline(initial_document, corpus, num_iter = 5, aggregate_doc_variant = "abstractive_secondary"):
    """
    This function begins with an initial document. From there the initial document is clustered with similar documents, 
    """
    document = initial_document
    for i in range(num_iter):
        cosine_similarities = compute_cosine_similarities(document, corpus)
        related_docs_indices = get_related_docs_indices(cosine_similarities)
        aggregate_document = concatenate_related_docs(corpus, related_docs_indices)
        if aggregate_doc_variant == "abstractive_primary":
            document = generate_abstractive_summary(aggregate_document, model = abstractive_summarizer_model)
        if aggregate_doc_variant == "abstractive_secondary":
            extractive_summary_primary = generate_extractive_summary(aggregate_document, min_summary_length=100)
            document = generate_abstractive_summary(extractive_summary_primary, model = abstractive_summarizer_model)
        if aggregate_doc_variant == "extractive_primary":
            document = generate_extractive_summary(aggregate_document, min_summary_length=100)
    print("Seed Document:" + "\n")
    print(initial_document, "\n")
    print(divString(100) + "\n")
    print("Resulting Document:" + "\n")
    print(document, "\n")
    return document

In [13]:
# Helper Function Unit Tests

# generate_summary(test_data[0])
# compute_metrics(test_data.select([1,2]), metric_name = "rouge")
# compute_metrics(test_data.select([1, 2]), metric_name = "sacrebleu")
# generate_extractive_summary(test_data[0]["article"])
# generate_extractive_summary(test_data[0]["article"], min_summary_length=10)

### Baseline Algorithm Walkthrough
1. Select a group/subset of articles (Order the subset chronologically so that the dataframe indices are chronological)
2. Select a single article from the subset produced in step 1. (Baseline selection will be random)
3. Perform cosine similarity between the single/selected article and the entire subset of articles produced in step 1.
4. Select the top most similar indices and their associated articles.
5. Summarize them.

In [14]:
############
# PIPELINE #
############

# Document Selection
search_results_df = search_and_subset_data(covid_test_data, "death") # a df containing entries that have death in the first paragraph
corpus = search_results_df.first_paragraph.to_list()  # Converting to a list to vectorize the entries
document = select_random_document(search_results_df).first_paragraph.values[0] # Selected a random row from the search results dataframe and extracted the first paragraph text to serve as our document

# Similarity Clustering and Aggregate Document Synthesis
cosine_similarities = compute_cosine_similarities(document, corpus) # The cosine similarities between the target document and all documents contained in the corpus
related_docs_indices = get_related_docs_indices(cosine_similarities) # The indices of the most related docs
aggregate_document = concatenate_related_docs(corpus, related_docs_indices) # The resulting document that is produced by concatenating all of the most similar documents

# Summarization
extractive_summary_primary = generate_extractive_summary(aggregate_document, min_summary_length=100)
abstractive_summary_primary = generate_abstractive_summary(aggregate_document, model = abstractive_summarizer_model)
abstractive_summary_secondary = generate_abstractive_summary(extractive_summary_primary, model = abstractive_summarizer_model) #Secondary summary in the hierarchy (i.e. a summary of a summary)

In [15]:
print("Top Cosine Similarity Scores: ", get_top_similarities(cosine_similarities))
show_related_docs(document, corpus, related_docs_indices)

print("\n" + "SUMMARIZATION RESULTS" + "\n")
print(divString(100) + "\n")
print("EXTRACTIVE SUMMARY PRIMARY:" + "\n", extractive_summary_primary)
print("\n")
print("ABSTRACTIVE SUMMARY PRIMARY:" + "\n", abstractive_summary_primary)
print("\n")
print("ABSTRACTIVE SUMMARY SECONDARY:" + "\n", abstractive_summary_secondary)
print(divString(100) + "\n")

Top Cosine Similarity Scores:  [0.14345526 0.14457817 0.04430281 0.13621813 0.08780106]

SELECTED DOCUMENT: 

The death of Herman Cain, attributed to the coronavirus, has made Republicans and President Trump face the reality of the pandemic as it hit closer to home than ever before, claiming a prominent conservative ally whose frequently dismissive attitude about taking the threat seriously reflected the hands-off inconsistency of party leaders.


####################################################################################################


SIMILAR DOCUMENTS: 

WASHINGTON — President Trump on Wednesday rejected the professional scientific conclusions of his own government about the prospects for a widely available coronavirus vaccine and the effectiveness of masks in curbing the spread of the virus as the death toll in the United States from the disease neared 200,000. 

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Presid

In [16]:
######################
# ITERATIVE PIPELINE #
######################

# Initial Document Selection
search_results_df = search_and_subset_data(covid_test_data, "death") # a df containing entries that have death in the first paragraph
corpus = search_results_df.first_paragraph.to_list()  # Converting to a list to vectorize the entries
initial_document = select_random_document(search_results_df).first_paragraph.values[0] # Selected a random row from the search results dataframe and extracted the first paragraph text to serve as our document

# Pipeline Iteration
iterate_pipeline(initial_document, corpus, num_iter = 5, aggregate_doc_variant = "abstractive_secondary")

Seed Document:

[Read more on Brazil’s Coronavirus cases and deaths.] 

####################################################################################################

Resulting Document:

the death toll in the u. s. has reached 200, 000. president trump rejected the professional findings of his own government. the public's interest in the case may make this one of the highest - profile trials in recent memory. the trial of derek chauvin will be a one - of - the - kind. 



"the death toll in the u. s. has reached 200, 000. president trump rejected the professional findings of his own government. the public's interest in the case may make this one of the highest - profile trials in recent memory. the trial of derek chauvin will be a one - of - the - kind."

# Scratch Code

## Daphne Test

- Try new iterative approach for the summarizations
    - Top 5 to target, summarize, get top 5 to summary, summarize (iterate 5x) and 

In [17]:
full_data = pd.read_csv('../nyt_data_collection/dataset/full_nyt_dataset.csv')

# Below is code written just to try out the difference between using the full data and the subsetted dataset (Not really important to what we are doing)

## Using Full Data

### Notes

- Using Full Data Works much better than using presubset data

In [18]:
full_data.head()

Unnamed: 0,headline,date,doc_type,material_type,section,abstract,first_paragraph,keywords,web_url
0,Trump’s Book Club: A President Who Doesn’t Rea...,2018-12-01,article,News,,"President Trump, who is not a reader, has used...","WASHINGTON — President Trump, a leader who is ...","['Books and Literature', 'United States Politi...",https://www.nytimes.com/2018/11/30/us/politics...
1,Kareem Hunt Is Cut by the Chiefs After a Video...,2018-12-01,article,News,,"Hunt, a star running back, was also suspended ...",The Kansas City Chiefs cut their star running ...,"['Football', 'Domestic Violence']",https://www.nytimes.com/2018/11/30/sports/kare...
2,Agency Pulls Back on Its Warning Against Talk ...,2018-12-01,article,News,,Casual conversations about impeachment and inv...,"WASHINGTON — An independent government agency,...",['Hatch Act (1939)'],https://www.nytimes.com/2018/11/30/us/politics...
3,A China Hawk Gains Prominence as Trump Confron...,2018-12-01,article,News,,"Michael Pillsbury, the president’s top outside...",WASHINGTON — Michael Pillsbury had just finish...,"['United States International Relations', 'Uni...",https://www.nytimes.com/2018/11/30/us/politics...
4,Wilmer Flores Is a Met No More,2018-12-01,article,News,,"A fan favorite, Flores did not receive a contr...","The Mets parted ways with a fan favorite, Wilm...",['Baseball'],https://www.nytimes.com/2018/11/30/sports/wilm...


In [20]:
# Document Selection
search_results_df = search_and_subset_data(full_data, "mcilroy") # a df containing entries that have death in the first paragraph
search_results_df.head()

Unnamed: 0,headline,date,doc_type,material_type,section,abstract,first_paragraph,keywords,web_url
1073,"Sergio García, Leading the Players Championshi...",2021-03-12,article,News,,García tested positive for the coronavirus jus...,"PONTE VEDRA BEACH, Fla. — When Sergio García w...","['Players Championship (Golf)', 'Coronavirus (...",https://www.nytimes.com/2021/03/11/sports/golf...
1810,Arnold Palmer’s Legacy Hints at What Tiger Woo...,2021-03-05,article,News,,The link between the two golf legends feels st...,"ORLANDO, Fla. — In a fashion befitting someone...","['Golf', 'Arnold Palmer Invitational (Golf)']",https://www.nytimes.com/2021/03/04/sports/golf...
2682,Golf Without Tiger Woods? His Fellow Players C...,2021-02-25,article,News,,"Woods, 45, has been sidelined before as he rec...","BRADENTON, Fla. — As a golf prodigy growing up...","['Golf', 'PGA Championship', 'Masters Golf Tou...",https://www.nytimes.com/2021/02/24/sports/tige...
13321,"Rory McIlroy Aims for Balance, and a Green Jacket",2020-11-14,article,News,,"McIlroy, still seeking a Masters win for a car...",Rory McIlroy watched the flight of his ball in...,"['Golf', 'Masters Golf Tournament']",https://www.nytimes.com/2020/11/14/sports/golf...
20164,Matthew Wolff Leads Bryson DeChambeau at U.S. ...,2020-09-20,article,News,,A fast start and some steady nerves gave Wolff...,"MAMARONECK, N.Y. — In May, with most global sp...","['Golf', 'United States Open (Golf)']",https://www.nytimes.com/2020/09/19/sports/golf...


In [21]:
search_doc = search_results_df.first_paragraph.to_list()  # Converting to a list to vectorize the entries
selected_doc = search_doc[4]

In [22]:
full_corpus = full_data.first_paragraph.dropna().to_list()

In [23]:
cosine_similarities = compute_cosine_similarities(selected_doc, full_corpus) # The cosine similarities between the target document and all documents contained in the corpus
cosine_similarities[:5]

array([0.01048547, 0.04569611, 0.02160506, 0.00879925, 0.02199285])

In [24]:
related_docs_indices = get_related_docs_indices(cosine_similarities) # The indices of the most related docs
related_docs_indices

[7862, 22485, 79140, 137945, 172489]

In [25]:
aggregate_document = concatenate_related_docs(full_corpus, related_docs_indices) # The resulting document that is produced by concatenating all of the most similar documents
aggregate_document

'Rory McIlroy stroked the clinching shot in Sunday’s uber-quirky charity golf match — the first televised competitive men’s golf in more than two months — but it was Dustin Johnson, McIlroy’s teammate in the event, who won the day. The PGA Tour’s Memorial Tournament, scheduled next week in central Ohio, had hoped to pave the way for spectators to return to golf events during the coronavirus pandemic, and in turn perhaps provide a blueprint for how fans could safely attend other major American sports competitions. HOUSTON — The PGA Tour does not have a 72-hole stroke play event this week, and several weekend college football games, including the marquee matchup between Michigan and Ohio State, have been canceled or postponed because of the coronavirus, leaving the best female golfers in the world well positioned to fill the TV viewing void. SPRINGFIELD, N.J. — It was on the eve of this month’s British Open that the world’s top four ranked golfers — Jason Day, Dustin Johnson, Jordan Spie

In [26]:
extractive_summary_primary = generate_extractive_summary(aggregate_document, min_summary_length=100)
extractive_summary_primary

'Rory McIlroy stroked the clinching shot in Sunday’s uber-quirky charity golf match — the first televised competitive men’s golf in more than two months — but it was Dustin Johnson, McIlroy’s teammate in the event, who won the day. The PGA Tour’s Memorial Tournament, scheduled next week in central Ohio, had hoped to pave the way for spectators to return to golf events during the coronavirus pandemic, and in turn perhaps provide a blueprint for how fans could safely attend other major American sports competitions.'

In [27]:
abstractive_summary_primary = generate_abstractive_summary(aggregate_document, model = abstractive_summarizer_model)
abstractive_summary_primary

"rory mcilroy beat dustin johnson, dustin johnson and jordan spieth in sunday's charity match. the pga tour's memorial tournament had hoped to pave the way for spectators to return to golf events during the coronavirus pandemic. it was the first televised men's golf in more than two months."

In [28]:
abstractive_summary_secondary = generate_abstractive_summary(extractive_summary_primary, model = abstractive_summarizer_model) #Secondary summary in the hierarchy (i.e. a summary of a summary)
abstractive_summary_secondary

"rory mcilroy beat dustin johnson in sunday's charity golf match. the pga tour's memorial tournament had hoped to pave the way for spectators to return to golf events during the coronavirus pandemic. the event was the first televised men's golf in more than two months."

## Using Subset Data

In [29]:
# Summarization
extractive_summary_primary = generate_extractive_summary(aggregate_document, min_summary_length=100)
extractive_summary_primary

'Rory McIlroy stroked the clinching shot in Sunday’s uber-quirky charity golf match — the first televised competitive men’s golf in more than two months — but it was Dustin Johnson, McIlroy’s teammate in the event, who won the day. The PGA Tour’s Memorial Tournament, scheduled next week in central Ohio, had hoped to pave the way for spectators to return to golf events during the coronavirus pandemic, and in turn perhaps provide a blueprint for how fans could safely attend other major American sports competitions.'

In [30]:
abstractive_summary_primary = generate_abstractive_summary(aggregate_document, model = abstractive_summarizer_model)
abstractive_summary_primary

"rory mcilroy beat dustin johnson, dustin johnson and jordan spieth in sunday's charity match. the pga tour's memorial tournament had hoped to pave the way for spectators to return to golf events during the coronavirus pandemic. it was the first televised men's golf in more than two months."

This is the primary abstracted summary -- works prety well but still prone to hallucination (In the reference articles, it's actually the russian vaccine that they are trying to convince works better than the american competitors)

In [31]:
abstractive_summary_secondary = generate_abstractive_summary(extractive_summary_primary, model = abstractive_summarizer_model) #Secondary summary in the hierarchy (i.e. a summary of a summary)
abstractive_summary_secondary

"rory mcilroy beat dustin johnson in sunday's charity golf match. the pga tour's memorial tournament had hoped to pave the way for spectators to return to golf events during the coronavirus pandemic. the event was the first televised men's golf in more than two months."

Lacks the coherency (In the last year, the subway conductor's coronavirus swas diagnosed with coronavirus)

## Iterative Approach 2 

Taking an article -> finding its 5 most similar articles -> summarizing that -> using summary to find next 5 most similar -> take 3 summaries and run abstractive

In [35]:
from tqdm import tqdm
from IPython.display import display

In [50]:
def iterative_pipeline_new(search_document, df, model = "abstractive_secondary", n_iterations = 3):
    print(f'Original Document: {search_document}')
    document_summaries = list()
    df = df.dropna(subset = ['first_paragraph'])
    corpus = df.first_paragraph.to_list()
    for i in tqdm(range(n_iterations)):
        cosine_similarities = compute_cosine_similarities(search_document, corpus)
        related_docs_indices = get_related_docs_indices(cosine_similarities) # The indices of the most related docs
        show_related_docs(search_document, corpus, related_docs_indices)
        aggregate_document = concatenate_related_docs(corpus, related_docs_indices) # The resulting document that is produced by concatenating all of the most similar documents
        print(aggregate_document)
        if model == "abstractive_primary":
            document = generate_abstractive_summary(aggregate_document, model = abstractive_summarizer_model)
        if model == "abstractive_secondary":
            extractive_summary_primary = generate_extractive_summary(aggregate_document, min_summary_length=100)
            document = generate_abstractive_summary(extractive_summary_primary, model = abstractive_summarizer_model)
        if model == "extractive_primary":
            document = generate_extractive_summary(aggregate_document, min_summary_length=100)
        document_summaries.append(document)
        search_document = document
        print(f"Iteration {i+1} model = {model}: \n {document}")
    aggregate_document = " ".join(document_summaries)
    print("All document summaries aggregated: \n", aggregate_document)
    if model == "abstractive_primary":
        document = generate_abstractive_summary(aggregate_document, model = abstractive_summarizer_model)
    if model == "abstractive_secondary":
        extractive_summary_primary = generate_extractive_summary(aggregate_document, min_summary_length=100)
        document = generate_abstractive_summary(extractive_summary_primary, model = abstractive_summarizer_model)
    if model == "extractive_primary":
        document = generate_extractive_summary(aggregate_document, min_summary_length=100)
    print(f"Final Iterative {model} Summary: {document}")
    return document

In [48]:
selected_doc

'MAMARONECK, N.Y. — In May, with most global sports competitions suspended because of the coronavirus pandemic, the veteran golfers Dustin Johnson, Rory McIlroy and Rickie Fowler competed in a made-for-TV charity match. Almost as a lark, or perhaps to lure a younger audience, Matthew Wolff, a 21-year-old PGA Tour newcomer, was invited to play, too.'

In [51]:
a = iterative_pipeline_new(selected_doc, full_data)
a

  0%|          | 0/3 [00:00<?, ?it/s]

Original Document: MAMARONECK, N.Y. — In May, with most global sports competitions suspended because of the coronavirus pandemic, the veteran golfers Dustin Johnson, Rory McIlroy and Rickie Fowler competed in a made-for-TV charity match. Almost as a lark, or perhaps to lure a younger audience, Matthew Wolff, a 21-year-old PGA Tour newcomer, was invited to play, too.

SELECTED DOCUMENT: 

MAMARONECK, N.Y. — In May, with most global sports competitions suspended because of the coronavirus pandemic, the veteran golfers Dustin Johnson, Rory McIlroy and Rickie Fowler competed in a made-for-TV charity match. Almost as a lark, or perhaps to lure a younger audience, Matthew Wolff, a 21-year-old PGA Tour newcomer, was invited to play, too.


####################################################################################################


SIMILAR DOCUMENTS: 

Rory McIlroy stroked the clinching shot in Sunday’s uber-quirky charity golf match — the first televised competitive men’s golf in mo

 33%|███▎      | 1/3 [00:14<00:28, 14.18s/it]

Iteration 1 model = abstractive_secondary: 
 rory mcilroy beat dustin johnson in sunday's charity golf match. the pga tour's memorial tournament had hoped to pave the way for spectators to return to golf events during the coronavirus pandemic. the event was the first televised men's golf in more than two months.

SELECTED DOCUMENT: 

rory mcilroy beat dustin johnson in sunday's charity golf match. the pga tour's memorial tournament had hoped to pave the way for spectators to return to golf events during the coronavirus pandemic. the event was the first televised men's golf in more than two months.


####################################################################################################


SIMILAR DOCUMENTS: 

In a telephone appearance during a televised charity golf exhibition on Sunday, President Trump enthusiastically supported the return of live sports events during the coronavirus pandemic. 

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 67%|██████▋   | 2/3 [00:29<00:14, 14.49s/it]

Iteration 2 model = abstractive_secondary: 
 president trump supported the return of live sports events during the coronavirus pandemic. the pga tour's memorial tournament had hoped to pave the way for spectators to return to golf events. president trump was in a telephone interview during a televised charity golf exhibition on sunday. the event was scheduled to take place next week in central ohio.

SELECTED DOCUMENT: 

president trump supported the return of live sports events during the coronavirus pandemic. the pga tour's memorial tournament had hoped to pave the way for spectators to return to golf events. president trump was in a telephone interview during a televised charity golf exhibition on sunday. the event was scheduled to take place next week in central ohio.


####################################################################################################


SIMILAR DOCUMENTS: 

With the goal of resuming tournament play next month in Texas, PGA Tour officials on Wednes

100%|██████████| 3/3 [00:43<00:00, 14.57s/it]

Iteration 3 model = abstractive_secondary: 
 pga tour officials outlined safety procedures they intend to implement next month. the pga tour's memorial tournament had hoped to pave the way for spectators to return to golf events during the coronavirus pandemic. players, caddies and support personnel were among those to be included.
All document summaries aggregated: 
 rory mcilroy beat dustin johnson in sunday's charity golf match. the pga tour's memorial tournament had hoped to pave the way for spectators to return to golf events during the coronavirus pandemic. the event was the first televised men's golf in more than two months. president trump supported the return of live sports events during the coronavirus pandemic. the pga tour's memorial tournament had hoped to pave the way for spectators to return to golf events. president trump was in a telephone interview during a televised charity golf exhibition on sunday. the event was scheduled to take place next week in central ohio. pg




Final Iterative abstractive_secondary Summary: pga tour's memorial tournament had hoped to pave the way for spectators to return to golf events during the coronavirus pandemic. pga tour had hoped the event would pave for spectators returning from the corona virus. the memorial tournament hoped to help spectators return from the virus - stricken region during the world's coronavirus.


"pga tour's memorial tournament had hoped to pave the way for spectators to return to golf events during the coronavirus pandemic. pga tour had hoped the event would pave for spectators returning from the corona virus. the memorial tournament hoped to help spectators return from the virus - stricken region during the world's coronavirus."

### Trying with a different reference article

In [52]:
search_doc = full_data.first_paragraph.to_list()  # Converting to a list to vectorize the entries
selected_doc = search_doc[3]
iterative_pipeline_new(selected_doc, full_data, n_iterations = 3)

  0%|          | 0/3 [00:00<?, ?it/s]

Original Document: WASHINGTON — Michael Pillsbury had just finished a rib-eye salad at the Cosmos Club on Tuesday when he received a text message from the White House: “The president is trying to reach you. Call back.”

SELECTED DOCUMENT: 

WASHINGTON — Michael Pillsbury had just finished a rib-eye salad at the Cosmos Club on Tuesday when he received a text message from the White House: “The president is trying to reach you. Call back.”


####################################################################################################


SIMILAR DOCUMENTS: 

True to its mischievous title, Dean Rader’s book “Self Portrait as Wikipedia Entry” is an eclectic assemblage of tributes and allegories, letters and instructional headers, interspersed with subversive self-portraits. In this poem, even unlikely end words like “a” and “the” turn out to be faithful to the rhyme scheme — a clear homage to Robert Frost, who was known for setting colloquial expressions to classic meter. Frost’s spiri

 33%|███▎      | 1/3 [00:15<00:31, 15.98s/it]

Iteration 1 model = abstractive_secondary: 
 dean rader's book'self portrait as wikipedia entry'is an eclectic assemblage of tributes, letters and instructional headers. the new president delivered the loudest message : bring back domestic manufacturing jobs, or face punishing tariffs and other penalties. the book was billed as a listening session for a'listening session '

SELECTED DOCUMENT: 

dean rader's book'self portrait as wikipedia entry'is an eclectic assemblage of tributes, letters and instructional headers. the new president delivered the loudest message : bring back domestic manufacturing jobs, or face punishing tariffs and other penalties. the book was billed as a listening session for a'listening session '


####################################################################################################


SIMILAR DOCUMENTS: 

From the book: 

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

True to its mischievous ti

 67%|██████▋   | 2/3 [00:30<00:15, 15.45s/it]

Iteration 2 model = abstractive_secondary: 
 dean rader's book'self portrait as wikipedia entry'is an eclectic selection of tributes and subversive self - portraits. the book is a tribute to the book's author, dean. the president has veered into some of his preferred topics, including crime and media bias.

SELECTED DOCUMENT: 

dean rader's book'self portrait as wikipedia entry'is an eclectic selection of tributes and subversive self - portraits. the book is a tribute to the book's author, dean. the president has veered into some of his preferred topics, including crime and media bias.


####################################################################################################


SIMILAR DOCUMENTS: 

True to its mischievous title, Dean Rader’s book “Self Portrait as Wikipedia Entry” is an eclectic assemblage of tributes and allegories, letters and instructional headers, interspersed with subversive self-portraits. In this poem, even unlikely end words like “a” and “the” turn o

100%|██████████| 3/3 [00:45<00:00, 15.18s/it]

Iteration 3 model = abstractive_secondary: 
 dean rader's book'self portrait as wikipedia entry'is an eclectic assemblage of tributes and allegories, letters and instructional headers. the book is a mixture of tribute and subversive self - portraits and tributes to the wikipedia entry.
All document summaries aggregated: 
 dean rader's book'self portrait as wikipedia entry'is an eclectic assemblage of tributes, letters and instructional headers. the new president delivered the loudest message : bring back domestic manufacturing jobs, or face punishing tariffs and other penalties. the book was billed as a listening session for a'listening session ' dean rader's book'self portrait as wikipedia entry'is an eclectic selection of tributes and subversive self - portraits. the book is a tribute to the book's author, dean. the president has veered into some of his preferred topics, including crime and media bias. dean rader's book'self portrait as wikipedia entry'is an eclectic assemblage of tr




Final Iterative abstractive_secondary Summary: dean rader's book'self portrait as wikipedia entry'is an eclectic assemblage of tributes, letters and instructional headers. it's an eclectic mix of tribute headers, tributes and instruction headers from all over the world. the book is based on a book's self portrait titled'self portraits as wikipedia entries '


"dean rader's book'self portrait as wikipedia entry'is an eclectic assemblage of tributes, letters and instructional headers. it's an eclectic mix of tribute headers, tributes and instruction headers from all over the world. the book is based on a book's self portrait titled'self portraits as wikipedia entries '"

There is still potentially the problem for cascading errors when it comes to articles that are relatively new and unique and there is not much written about it before. OR if the document's first paragraph is not as descriptive but it is interesting that the summmary documents come out to be relatively informative on background information.