In [8]:
# %%capture
############
# INSTALLS #
############

#Abstractive Summarizer Installs
!pip install datasets
!pip install transformers
!pip install rouge_score
!pip install sacrebleu
!pip install sentencepiece==0.1.95

#Extractive Summarizer Installs
!pip install bert-extractive-summarizer
!pip install neuralcoref
!pip install spacy==2.1.3
!python -m spacy download en_core_web_md


Collecting en_core_web_md==2.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.1.0/en_core_web_md-2.1.0.tar.gz (95.4 MB)
[K     |████████████████████████████████| 95.4 MB 99.7 MB/s eta 0:00:01
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [9]:
###########
# IMPORTS #
###########

#Abstractive Summarizer Imports
import datasets
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM  
from transformers import BertTokenizer, EncoderDecoderModel

#Extractive Summarizer Imports
from tqdm import tqdm_pandas
from summarizer import Summarizer

#Cosine Similarity Imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

#Utility Imports
from functools import reduce
from operator import add
import pandas as pd
import numpy as np

#Dataset Library Imports
from sklearn.datasets import fetch_20newsgroups

In [10]:
%%capture
###############
# GLOBAL VARS #
###############

vectorizer = TfidfVectorizer()
tokenizer = AutoTokenizer.from_pretrained("patrickvonplaten/bert2bert_cnn_daily_mail")  
abstractive_summarizer_model = AutoModelForSeq2SeqLM.from_pretrained("patrickvonplaten/bert2bert_cnn_daily_mail")
extractive_summarizer_model = Summarizer()

In [11]:
########
# DATA #
########

#CNN Dailymail dataset, initially used for testing summarizers
test_data = datasets.load_dataset("cnn_dailymail", "3.0.0", split="test")
twenty_news_dataset = fetch_20newsgroups()

In [13]:
#Toy data for testing the pipeline
covid_test_data = pd.read_csv("~/266_final/nyt_data_collection/toy_data/fp_covid_articles.csv")
golf_test_data = pd.read_csv("~/266_final/nyt_data_collection/toy_data/golf_articles.csv")

In [18]:
covid_test_data

Unnamed: 0,version https://git-lfs.github.com/spec/v1
0,oid sha256:fa580c2fd0764656aa3d23b17108d3c7908...
1,size 4664199


In [30]:
nyt = pd.read_csv("~/266_final/nyt_data_collection/dataset/full_nyt_dataset.csv")
nyt = nyt[nyt['first_paragraph'].str.len()>200]

In [34]:
covid_test_data = nyt[nyt['first_paragraph'].str.contains('coronavirus')]

In [15]:
####################
# HELPER FUNCTIONS #
####################

# MISC Helpers
divString = lambda size, char = "#": reduce(add, [char for i in range(size)])
flatten = lambda lst: [i for sublst in lst for i in sublst]


#Batch Summary Generation and Batch Metrics
def generate_summary(batch):
    """This function computes a summary for a given article from the Dataset object
    batch
    Params:
    batch: an article from the given Dataset object."""
    # Tokenizer will automatically set [BOS] <text> [EOS]
    # cut off at BERT max length 512
    inputs = tokenizer(batch["article"], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    input_ids = inputs.input_ids
    attention_mask = inputs.attention_mask
    outputs = model.generate(input_ids, attention_mask=attention_mask)
    # all special tokens including will be removed
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    batch["pred"] = output_str
    return batch


def compute_metrics(batch, batch_size=16, metric_name="rouge"):
    """This function computes the rouge or bleu scores for predicted summaries
    Params:
    batch: A Dataset object which contains the articles at the specified indices
    Use the select method for this function call. 
    Example format: Dataset.select([list of indices to select from the original dataset])
    metric_name: The prefered evaluation metric to use"""
    
    metric = datasets.load_metric(metric_name)
    results = batch.map(generate_summary, batched=True, batch_size=batch_size, remove_columns=["article"])
    summary_pred = results["pred"]
    label_ref = results["highlights"]
    if metric_name == "rouge":
        output = metric.compute(predictions=summary_pred, references=label_ref, rouge_types=["rouge2"])["rouge2"].mid
        print("\n" + "ROUGE SCORE:")
        return output
    else:
        # Else compute bleu score with metric name "sacrebleu"
        all_bleu_scores = []
        for i in range(len(batch)):
            output = metric.compute(predictions= [summary_pred[i]], references= [[label_ref[i]]])
            all_bleu_scores.append(output)
            print("\n\n")
            print(divString(100))
            print("\n\n" + "Summary prediction: " + "\n\n", summary_pred[i])
            print("\n\n" + "Reference Label: " + "\n\n", label_ref[i])
            print("\n\n" + "BLEU SCORE:" + "\n\n", output)
            print("\n")
        return all_bleu_scores
    

#Raw Text Summarization
def generate_abstractive_summary(raw_string, model = abstractive_summarizer_model):
    """This function produces an abstractive summary for a given article"
    Params:
    raw_string: an article string.
    model: An abstractive summarizer model"""
    # Tokenizer will automatically set [BOS] <text> [EOS]
    # cut off at BERT max length 512
    inputs = tokenizer(raw_string, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    input_ids = inputs.input_ids
    attention_mask = inputs.attention_mask
    outputs = model.generate(input_ids, attention_mask=attention_mask)
    # all special tokens including will be removed
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return output_str[0]


def generate_extractive_summary(raw_string, model = extractive_summarizer_model, min_summary_length = 50):
    """This function produces an extractive summary for a given article"
    Params:
    raw_string: an article string.
    model: An extractive summarizer model"""
    output_str = model(raw_string, min_length = min_summary_length)
    return output_str
    
    
#Search and Subset Dataset
def search_and_subset_data(df, keyword, column = "first_paragraph"):
    """
    This function takes in a dataframe, keyword to search, and an optional column to search through and returns a
    subset of the data as a pandas dataframe, with entries that contain the searched keyword.
    Params:
    df: Dataframe
    keyword: A keyword to search
    column: Optional that takes either 'first_paragraph' or 'keywords'
    """
    df = df.sort_values(by='date', ascending = False).reset_index().drop("index", axis = 1) # Ordering all documents chronologically so that indices don't need reodered when combining similar documents
    df = df.dropna(subset=[column])
    subset = df[df[column].str.lower().str.contains(keyword.strip().lower())]
    return subset


def select_random_document(df):
    """
    This function selects a single random row from a dataframe. This is used as a default for initial document selection at the beginning of the pipeline.
    df: A pandas dataframe
    """
    return df.sample()



# Similarity Clustering and Aggregate Document Synthesis
def compute_cosine_similarities(document, corpus, vectorizer = vectorizer):
    """
    This function computes the cosine similarity between a document and a specified corpus of documents.
    document: A string of text.
    corpus: An array of documents.
    vectorizer: A TfidfVectorizer() object. (Default: initialized in the constants cell)
    """
    vectorized_corpus = vectorizer.fit_transform(corpus)
    vectorized_document = vectorizer.transform([document])
    return linear_kernel(vectorized_document, vectorized_corpus).flatten()


def get_related_docs_indices(cos_similarities_array, n_docs=5):
    """
    This function returns the document indices with the highest cosine similarity.
    cos_similarities_array: An array of the computed cosine similarities for the whole corpus.
    n_docs: The number of highest scoring documents to return. (e.g. n_docs=5 returns the top 5 highest scoring documents)
    """
    cos_similarities_array = np.array([i for i in cos_similarities_array if i < 0.999999999]) # This eliminates the case where the most similar document is the document itself, which has a similarity of 1.0
    return sorted(cos_similarities_array.argsort()[:-(n_docs + 1):-1])
    

def get_top_similarities(cos_similarities_array, n_docs=5):
    """
    This function returns the highest document cosine similarity scores.
    cos_similarities_array: An array of the computed cosine similarities for the whole corpus.
    n_docs: The number of highest cosine scores to return. (e.g. n_docs=5 returns the top 5 highest cosine similarity scores)
    """
    related_indices = get_related_docs_indices(cos_similarities_array, n_docs)
    return cos_similarities_array[related_indices]


def concatenate_related_docs(corpus, related_docs_indices):
    docs = [corpus[i] for i in related_docs_indices]
    return " ".join(docs)
    

# Display Functions
def show_related_docs(document, corpus, related_docs_indices):
    """
    This function displays the seed document, selected similar documents, and the concatenated aggregate of the similar documents.
    """
    aggregate_doc = concatenate_related_docs(corpus, related_docs_indices)
    print("\n" + "SELECTED DOCUMENT: " + "\n")
    print(document)
    print("\n")
    print(divString(100))
    print("\n")
    print("SIMILAR DOCUMENTS: " + "\n")
    for i in related_docs_indices:
        print(corpus[i], "\n")
        print(divString(100, char = "~") + "\n")
    print(divString(100))
    print("\n")
    print("AGGREGATE DOCUMENT: " + "\n")
    print(aggregate_doc + "\n")
    print(divString(100))


#Pipeline Iteration
def iterate_pipeline(initial_document, corpus, num_iter = 5, aggregate_doc_variant = "abstractive_secondary"):
    """
    This function begins with an initial document. From there the initial document is clustered with similar documents, 
    """
    document = initial_document
    for i in range(num_iter):
        cosine_similarities = compute_cosine_similarities(document, corpus)
        related_docs_indices = get_related_docs_indices(cosine_similarities)
        aggregate_document = concatenate_related_docs(corpus, related_docs_indices)
        if aggregate_doc_variant == "abstractive_primary":
            document = generate_abstractive_summary(aggregate_document, model = abstractive_summarizer_model)
        if aggregate_doc_variant == "abstractive_secondary":
            extractive_summary_primary = generate_extractive_summary(aggregate_document, min_summary_length=100)
            document = generate_abstractive_summary(extractive_summary_primary, model = abstractive_summarizer_model)
        if aggregate_doc_variant == "extractive_primary":
            document = generate_extractive_summary(aggregate_document, min_summary_length=100)
    print("Seed Document:" + "\n")
    print(initial_document, "\n")
    print(divString(100) + "\n")
    print("Resulting Document:" + "\n")
    print(document, "\n")
    return document

In [16]:
# Helper Function Unit Tests

# generate_summary(test_data[0])
# compute_metrics(test_data.select([1,2]), metric_name = "rouge")
# compute_metrics(test_data.select([1, 2]), metric_name = "sacrebleu")
# generate_extractive_summary(test_data[0]["article"])
# generate_extractive_summary(test_data[0]["article"], min_summary_length=10)

### Baseline Algorithm Walkthrough
1. Select a group/subset of articles (Order the subset chronologically so that the dataframe indices are chronological)
2. Select a single article from the subset produced in step 1. (Baseline selection will be random)
3. Perform cosine similarity between the single/selected article and the entire subset of articles produced in step 1.
4. Select the top most similar indices and their associated articles.
5. Summarize them.

In [67]:
############
# PIPELINE #
############

# Document Selection
search_results_df = search_and_subset_data(covid_test_data, "death") # a df containing entries that have death in the first paragraph
corpus = search_results_df.first_paragraph.to_list()  # Converting to a list to vectorize the entries
document = select_random_document(search_results_df).first_paragraph.values[0] # Selected a random row from the search results dataframe and extracted the first paragraph text to serve as our document

# Similarity Clustering and Aggregate Document Synthesis
cosine_similarities = compute_cosine_similarities(document, corpus) # The cosine similarities between the target document and all documents contained in the corpus
related_docs_indices = get_related_docs_indices(cosine_similarities) # The indices of the most related docs
aggregate_document = concatenate_related_docs(corpus, related_docs_indices) # The resulting document that is produced by concatenating all of the most similar documents

# Summarization
extractive_summary_primary = generate_extractive_summary(aggregate_document, min_summary_length=100)
abstractive_summary_primary = generate_abstractive_summary(aggregate_document, model = abstractive_summarizer_model)
abstractive_summary_secondary = generate_abstractive_summary(extractive_summary_primary, model = abstractive_summarizer_model) #Secondary summary in the hierarchy (i.e. a summary of a summary)

In [68]:
print("Top Cosine Similarity Scores: ", get_top_similarities(cosine_similarities))
show_related_docs(document, corpus, related_docs_indices)

print("\n" + "SUMMARIZATION RESULTS" + "\n")
print(divString(100) + "\n")
print("EXTRACTIVE SUMMARY PRIMARY:" + "\n", extractive_summary_primary)
print("\n")
print("ABSTRACTIVE SUMMARY PRIMARY:" + "\n", abstractive_summary_primary)
print("\n")
print("ABSTRACTIVE SUMMARY SECONDARY:" + "\n", abstractive_summary_secondary)
print(divString(100) + "\n")

Top Cosine Similarity Scores:  [0.13927901 0.14297279 0.13397415 0.11997941 0.11820805]

SELECTED DOCUMENT: 

On a day that saw the signing of a historic agreement with the Taliban and the first American death of the coronavirus outbreak, President Trump found himself veering off message on Saturday during both a news conference and a speech at the Conservative Political Action Conference.


####################################################################################################


SIMILAR DOCUMENTS: 

Despite the horrifying surge of Covid-19 cases and deaths in the United States right now, one bit of good news is emerging this winter: It looks unlikely that the country will endure a “twindemic” of both flu and the coronavirus at the same time. 

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

On April 20, 49 days after Georgia reported its first two coronavirus cases and 39 days after the announcement of the state’s firs

In [274]:
######################
# ITERATIVE PIPELINE #
######################

# Initial Document Selection
search_results_df = search_and_subset_data(covid_test_data, "death") # a df containing entries that have death in the first paragraph
corpus = search_results_df.first_paragraph.to_list()  # Converting to a list to vectorize the entries
initial_document = select_random_document(search_results_df).first_paragraph.values[0] # Selected a random row from the search results dataframe and extracted the first paragraph text to serve as our document

# Pipeline Iteration
iterate_pipeline(initial_document, corpus, num_iter = 5, aggregate_doc_variant = "abstractive_secondary")

Seed Document:

A surge in coronavirus deaths in the United States has prompted the vast majority of governors to order their residents to stay home, but a small number of states are resisting increasingly urgent calls to shut down. 

####################################################################################################

Resulting Document:






# Rougue and Blue


In [37]:
#!pip install rouge/requirements.txt
!pip install rouge-score



In [100]:
cosine_similarities.argsort()

array([126,  42, 122, 140, 131, 100,  45, 178, 154, 119,  77, 135, 111,
       174,  24,  22, 136,  74, 183, 144, 125,  15, 121, 112, 167,  17,
         0, 170, 114,  53, 108, 124, 177,  30, 139,   2,  97,  58,  78,
       186,  39, 151,   4, 104, 129, 120,  25, 145, 155,  65, 128, 115,
       134,  76,  26,   7, 117,  29, 130, 196, 168, 181,  37, 184,   3,
       132,  95, 146,  73,  87, 113, 192, 102, 152,  90, 123, 160,  19,
        92,  11,  12,  69,   5,  55, 118, 175, 172, 166,  60,  14,  81,
       150, 149,  36,  91,  23, 185, 138, 173,  46, 162, 109,  51, 156,
        47,  89, 159,  10,   6, 169,  57, 103,  16,  40, 157,  44, 133,
        67,  66, 143,  80,  98,  79, 163,  68, 189,  20,  34,  33, 116,
        88,  48, 142,  99, 182,  63,   1,  96,  41, 187, 105,  86,  85,
        49,  21, 164,  52,  70,  72,  71, 193,  32,  54, 110, 194, 153,
       137,  27, 188,  43,  35, 101,  64,  31, 161, 106,  61,   9,   8,
        93,  38,  50, 179, 190,  84,  62,  94,  59, 202, 195,  7

In [103]:
sorted(random.sample(cosine_similarities.argsort().tolist(), 5))

[15, 38, 68, 79, 106]

In [104]:
############
# PIPELINE #
############
from rouge_score import rouge_scorer
import random

document = select_random_document(search_results_df).first_paragraph.values[0]
def pipeline(top_bottom_rand = 'top'):
    # Document Selection
    search_results_df = search_and_subset_data(covid_test_data, "death") # a df containing entries that have death in the first paragraph
    corpus = search_results_df.first_paragraph.to_list()  # Converting to a list to vectorize the entries
     # Selected a random row from the search results dataframe and extracted the first paragraph text to serve as our document

    # Similarity Clustering and Aggregate Document Synthesis
    cosine_similarities = compute_cosine_similarities(document, corpus) # The cosine similarities between the target document and all documents contained in the corpus
    cosine_similarities = np.array([i for i in cosine_similarities if i < 0.999999999])
    def get_related_docs_indices_choice(cos_similarities_array, n_docs=5,type_='top'):
        """
        This function returns the document indices with the highest cosine similarity.
        cos_similarities_array: An array of the computed cosine similarities for the whole corpus.
        n_docs: The number of highest scoring documents to return. (e.g. n_docs=5 returns the top 5 highest scoring documents)
        """
        cos_similarities_array = np.array([i for i in cos_similarities_array if i < 0.999999999]) # This eliminates the case where the most similar document is the document itself, which has a similarity of 1.0
        if type_ =='bottom':
            return sorted(cos_similarities_array.argsort()[:(n_docs + 1)])
        if type_ =='rand':
            return sorted(random.sample(cos_similarities_array.argsort().tolist(), n_docs))
        return sorted(cos_similarities_array.argsort()[:-(n_docs + 1):-1])
    def get_top_similarities2(cos_similarities_array, n_docs=5):
        """
        This function returns the highest document cosine similarity scores.
        cos_similarities_array: An array of the computed cosine similarities for the whole corpus.
        n_docs: The number of highest cosine scores to return. (e.g. n_docs=5 returns the top 5 highest cosine similarity scores)
        """
        related_indices = get_related_docs_indices_choice(cos_similarities_array, n_docs,type_=top_bottom_rand)
        #print(sorted(cos_similarities_array))
        return cos_similarities_array[related_indices]
    related_doc_indices = get_related_docs_indices_choice(cosine_similarities, n_docs=5,type_=top_bottom_rand)

     # The indices of the most related docs
    aggregate_document = concatenate_related_docs(corpus, related_doc_indices) # The resulting document that is produced by concatenating all of the most similar documents

    # Summarization
    extractive_summary_primary = generate_extractive_summary(aggregate_document, min_summary_length=100)
    abstractive_summary_primary = generate_abstractive_summary(aggregate_document, model = abstractive_summarizer_model)
    abstractive_summary_secondary = generate_abstractive_summary(extractive_summary_primary, model = abstractive_summarizer_model) #Secondary summary in the hierarchy (i.e. a summary of a summary)
    
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL','rougeLsum'], use_stemmer=True)
    scores = scorer.score(abstractive_summary_secondary,
                          aggregate_document)
    print("Top Cosine Similarity Scores: ", get_top_similarities2(cosine_similarities))
    show_related_docs(document, corpus, related_doc_indices)

    print("\n" + "SUMMARIZATION RESULTS" + "\n")
    print(divString(100) + "\n")
    print("EXTRACTIVE SUMMARY PRIMARY:" + "\n", extractive_summary_primary)
    print("\n")
    print("ABSTRACTIVE SUMMARY PRIMARY:" + "\n", abstractive_summary_primary)
    print("\n")
    print("ABSTRACTIVE SUMMARY SECONDARY:" + "\n", abstractive_summary_secondary)
    print(divString(100) + "\n")
    
    print(scores)

In [105]:
pipeline('top')

Top Cosine Similarity Scores:  [0.16744721 0.16588634 0.171213   0.17106115 0.18105539]

SELECTED DOCUMENT: 

President Biden delivered his Inaugural Address at a time of deep national anxiety. The coronavirus pandemic continues to devastate the United States, with more than 400,000 deaths reported so far. Americans are still reeling from the riot at the Capitol, and many are facing economic uncertainty.


####################################################################################################


SIMILAR DOCUMENTS: 

President Biden delivered his Inaugural Address at a time of deep national anxiety. The coronavirus pandemic continues to devastate the United States, with more than 400,000 deaths reported so far. Americans are still reeling from the riot at the Capitol, and many are facing economic uncertainty. 

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

With the United States reaching a once-unthinkable coronavirus p

In [106]:
pipeline('bottom')

Top Cosine Similarity Scores:  [0.0196589  0.0222812  0.01022922 0.01837328 0.02457433 0.0177952 ]

SELECTED DOCUMENT: 

President Biden delivered his Inaugural Address at a time of deep national anxiety. The coronavirus pandemic continues to devastate the United States, with more than 400,000 deaths reported so far. Americans are still reeling from the riot at the Capitol, and many are facing economic uncertainty.


####################################################################################################


SIMILAR DOCUMENTS: 

Dr. Anthony S. Fauci, the top infectious disease expert in the United States, predicted on Thursday that the daily death toll from the coronavirus would continue to rise for weeks to come, and counseled patience with the vaccination program gearing up across the nation. 

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The New Hampshire State Legislature was already fiercely divided over the corona

In [107]:
pipeline('rand')

Top Cosine Similarity Scores:  [0.07190863 0.02971859 0.09739063 0.04841665 0.04767389]

SELECTED DOCUMENT: 

President Biden delivered his Inaugural Address at a time of deep national anxiety. The coronavirus pandemic continues to devastate the United States, with more than 400,000 deaths reported so far. Americans are still reeling from the riot at the Capitol, and many are facing economic uncertainty.


####################################################################################################


SIMILAR DOCUMENTS: 

Few groups have witnessed more of the virus’s horrors than caregivers — frontline workers who have grappled with the public health crisis while trying to help older people at risk of isolation, distress and, in some cases, death. The deaths of almost 40 percent of all Americans killed by the coronavirus have been linked to nursing homes and similar facilities — indoor spaces crowded with vulnerable adults. The share is even higher in Pennsylvania, where deaths i

In [73]:
scores

{'rouge1': Score(precision=0.19318181818181818, recall=0.9622641509433962, fmeasure=0.32176656151419564),
 'rougeL': Score(precision=0.15151515151515152, recall=0.7547169811320755, fmeasure=0.25236593059936907),
 'rougeLsum': Score(precision=0.15151515151515152, recall=0.7547169811320755, fmeasure=0.25236593059936907)}

Top Cosine Similarity Scores:  [0.18180213 0.17909269 0.15919848 0.24679445 0.14357941]

SELECTED DOCUMENT: 

BEIJING — The coronavirus epidemic in China surpassed a grim milestone on Sunday with a death toll that exceeds that of the SARS outbreak 17 years ago, a development that coincided with news that World Health Organization experts might soon be in the country to help stanch the crisis.


####################################################################################################


SIMILAR DOCUMENTS: 

Battered by a wave of coronavirus infections and deaths, local jails and state prison systems around the United States have resorted to a drastic strategy to keep the virus at bay: Shutting down completely and transferring their inmates elsewhere. 

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

In a world plagued by pandemic, Vietnam seemed like a miracle. As months went by without a single recorded coronavirus death, 

# Scratch Code

In [30]:
search_results_df = search_and_subset_data(covid_test_data, "death") # a df containing entries that have death in the first paragraph
search_results = search_results_df.first_paragraph.to_list()  # Converting to a list to vectorize the entries
corpus = vectorizer.fit_transform(search_results) # Vectorize and fit transform
document = select_random_document(x).first_paragraph.values[0] # Selected a random row from the search results dataframe and extracted the first paragraph text to serve as our document
# compute_cosine_similarities(document, corpus)

In [19]:
select_random_document(x).first_paragraph.values[0]

'WASHINGTON — The Trump administration, racing a surging Covid-19 death toll, instructed states on Tuesday to immediately begin vaccinating every American 65 and older, as well as tens of millions of adults with medical conditions that put them at higher risk of dying from coronavirus infection.'

In [70]:
x = search_and_subset_data(covid_test_data, "death") # a df containing entries that have death in the first paragraph
y = x.first_paragraph.to_list() # Converting to a list to vectorize the entries
z = vectorizer.fit_transform(y) # Vectorize and fit transform

cosine_similarities = linear_kernel(z[0], z).flatten() #Compute the similarities between a single vectorized article and the rest of the vectorized corpus
cosine_similarities

array([1.        , 0.07616775, 0.02808496, 0.11774414, 0.03636491,
       0.11259535, 0.10315487, 0.12655489, 0.05390195, 0.08478859,
       0.08520882, 0.07082236, 0.07575236, 0.04181228, 0.03807006,
       0.10346441, 0.13086099, 0.05104183, 0.08180416, 0.09875876,
       0.06415114, 0.0425284 , 0.0628859 , 0.13876087, 0.0863506 ,
       0.06123893, 0.05645384, 0.11517398, 0.02818955, 0.03787033,
       0.08467625, 0.0716679 , 0.07715621, 0.17073456, 0.06300363,
       0.10544956, 0.03742148, 0.05863446, 0.11771831, 0.07676028,
       0.1086421 , 0.09694976, 0.04841343, 0.05976972, 0.09376444,
       0.03410992, 0.10781565, 0.10350352, 0.08012644, 0.0365176 ,
       0.15337929, 0.08447607, 0.06627055, 0.03930884, 0.04738421,
       0.06617171, 0.04870063, 0.06727244, 0.0839498 , 0.02872655,
       0.09031904, 0.03866   , 0.12150809, 0.15830899, 0.02474477,
       0.11012431, 0.09457802, 0.05708611, 0.0213857 , 0.11309312,
       0.02541409, 0.06927503, 0.06761577, 0.03243748, 0.08644