# Imports

In [1]:
# Third-Party Imports
import nltk
import torch

# Standard Library Imports
import os
from math import inf
from string import punctuation
import sys

from queries import get_text_cli
from get_documents import search

# Utility Functions

In [94]:
def get_text():
    term = get_text_cli('Enter a search term')
    return search(term)

def load_docs(dirname='corpus'):
    corpus = dict()
    main_path = os.path.join(os.path.dirname('__file__'), dirname)

    for file in os.listdir(main_path):
        with open(os.path.join(main_path, file), 'r') as f:
            corpus[file] = f.read()
    
    return corpus

def chunk_text(text, chunk_len):
    chunks = []
    current_chunk = ""
    sents = nltk.sent_tokenize(text)
    
    for sent in sents:
        if len(nltk.word_tokenize(current_chunk + f" {sent}")) >= chunk_len:
            chunks.append(current_chunk)
            current_chunk = ""
        else:
            current_chunk += f" {sent}"
    
    chunks.append(current_chunk)
    
    return chunks

# Simple Extractive Text Summarisation

In [3]:
def word_tokenize(text):
    banned = list(punctuation) + nltk.corpus.stopwords.words("english")
    return [w for w in nltk.word_tokenize(text) if w not in banned]

def SE_summary(text):
    # Create word and sentence tokens
    words = word_tokenize(text)
    word_set = set(words) # set of all unique words in word tokens
    sents = nltk.sent_tokenize(text)
    
    # Initialise frequency table for word tokens
    w_freq_table = {w: words.count(w) for w in word_set}
    
    # Score sentences based on frequency of their words
    sent_scores = {
        sent: sum(w_freq_table.get(w, 0) for w in word_tokenize(sent))
        for sent in sents
    }
    
    # Build summary
    avg = sum(sent_scores.values()) / len(sent_scores)
    summary = " ".join(sent for sent in sents if sent_scores[sent] >= avg)
    return summary

In [4]:
text = get_text()
text

Enter a search term: Kirk


('Q36380778',
 'Kirk',
 'Kirk is a Scottish and former Northern English word meaning "church". It is often used specifically of the Church of Scotland. Many place names and personal names are also derived from it.\nAs a common noun, kirk (meaning \'church\') is found in Scots, Scottish English, Ulster-Scots and some English dialects, attested as a noun from the 14th century onwards, but as an element in placenames much earlier. Both words, kirk and church, derive from the Koine Greek κυριακόν (δωμα) (kyriakon (dōma)) meaning Lord\'s (house), which was borrowed into the Germanic languages in late antiquity, possibly in the course of the Gothic missions. (Only a connection with the idiosyncrasies of Gothic explains how a Greek neuter noun became a Germanic feminine).\nWhereas church displays Old English palatalisation, kirk is a loanword from Old Norse[citation needed] and thus retains the original mainland Germanic consonants. Compare cognates: Icelandic & Faroese kirkja; Swedish kyrka 

In [5]:
summary = SE_summary(text[2])
summary

'Kirk is a Scottish and former Northern English word meaning "church". As a common noun, kirk (meaning \'church\') is found in Scots, Scottish English, Ulster-Scots and some English dialects, attested as a noun from the 14th century onwards, but as an element in placenames much earlier. Both words, kirk and church, derive from the Koine Greek κυριακόν (δωμα) (kyriakon (dōma)) meaning Lord\'s (house), which was borrowed into the Germanic languages in late antiquity, possibly in the course of the Gothic missions. As a proper noun, The Kirk is an informal name for the Church of Scotland, the country\'s national church. The Kirk of Scotland was in official use as the name of the Church of Scotland until the 17th century, and still today the term is frequently used in the press and everyday speech, though seldom in the Church\'s own literature. However, Kirk Session is still the standard term in church law for the court of elders in the local congregation, both in the Church of Scotland and

In [6]:
print(len(text[2]), len(summary))

5882 2959


# Clever Algorithms

In [7]:
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer

## LexRank, TextRank, Latent Semantic Analysis (LSA), Luhn's Algorithm

In [8]:
def algo_summary(text, method):
    # Initialise summariser
    if method == "lex":
        summariser = LexRankSummarizer()
    elif method == "text":
        summariser = TextRankSummarizer()
    elif method == "lsa":
        summariser = LsaSummarizer()
    elif method == "luhn":
        summariser = LuhnSummarizer()
    
    # Initialise parser
    parser = PlaintextParser(text, Tokenizer("english"))
    
    # Create summary
    summary_sents = summariser(parser.document, 20)
    
    return " ".join(str(s) for s in summary_sents)

In [9]:
text = get_text()
text

Enter a search term: Lionel Messi


('Q615',
 'Lionel Messi',

In [10]:
for meth in ("lex", "text", "lsa", "luhn"):
    summary = algo_summary(text[2], meth)
    print(f"\nMethod: {meth}", summary, f"Orginal: {len(text[2])}, Summary: {len(summary)}", sep="\n\n")
    print(f"\n{100*'='}\n")


Method: lex

Messi has scored over 795 senior career goals for club and country, and has the most goals by a player for a single club (672). The following two seasons, Messi finished second for the Ballon d'Or behind Cristiano Ronaldo (his perceived career rival), before regaining his best form during the 2014–15 campaign, becoming the all-time top scorer in La Liga and leading Barcelona to a historic second treble, after which he was awarded a fifth Ballon d'Or in 2015. He scored all of his side's four goals in the Champions League quarter-final against Arsenal on 6 April while becoming Barcelona's all-time top scorer in the competition. After Barcelona lost the Copa del Rey final four days later, Messi scored both goals in his side's 2–0 win in the first leg of the Champions League semi-finals in Madrid, the second of which – a slaloming dribble past three Real players – was acclaimed as one of the best ever in the competition. At the close of the year, Messi had scored a record 91 


Method: lsa

As his father's health insurance covered only two years of growth hormone treatment, which cost at least $1,000 per month, Newell's agreed to contribute, but later reneged on their promise. French winger Ludovic Giuly explained how Messi caught the eye in a training session with Frank Rijkaard's first team: "He destroyed us all... They were kicking him all over the place to avoid being ridiculed by this kid, he just got up and kept on playing. However, he continued to be plagued by major injuries; a metatarsal fracture sustained on 12 November 2006 kept him out of action for three months. Messi helped the team achieve 16 consecutive league victories, a record in Spanish football, concluding with another hat-trick against Atlético Madrid on 5 February 2011. In Barça's last home league match on 5 May, against Espanyol, Messi scored all four goals before approaching the bench to embrace Guardiola, who had announced his resignation as manager. On 17 April, Messi ended a five-

# Neural Techniques

## Facebook BART and Google T5

In [11]:
from transformers import pipeline

In [16]:
models = ["facebook/bart-large-cnn", "t5-base", "philschmid/bart-large-cnn-samsum"]

In [20]:
def BART_summary(text, model_name, summary_size=0.5):
    # Get lengths of original text
    word_len = len(nltk.word_tokenize(text))
    
    # Initialise summariser
#     summariser = pipeline("summarization", model="facebook/bart-large-cnn")
    summariser = pipeline("summarization", model=model_name)
    
    # Chunking
    if word_len > 1024:
        chunks = []
        current_chunk = ""
        sents = nltk.sent_tokenize(text)
        for sent in sents:
            if len(nltk.word_tokenize(current_chunk + f" {sent}")) >= 900:
                chunks.append(current_chunk)
                current_chunk = ""
            else:
                current_chunk += f" {sent}"
        
        summary_chunks = []
        for chunk in chunks:
            chunk_len = len(nltk.word_tokenize(chunk))
            summary_len = int(summary_size * chunk_len)
            print(summary_len)
            summary = summariser(chunk, max_length=chunk_len, min_length=summary_len, do_sample=True)
            print(chunk_len, summary_len, summary, end="\n=====================================\n\n")
    
#     return summariser(text, max_length=int(0.5 * len(text[2])), min_length=100, do_sample=False)

In [21]:
text = get_text()
text

Enter a search term: Tom Cruise


('Q37079',
 'Tom Cruise',
 'Thomas Cruise Mapother IV (born July 3, 1962) is an American actor and producer. One of the world\'s highest-paid actors, he has received various accolades, including an Honorary Palme d\'Or and three Golden Globe Awards, in addition to nominations for four Academy Awards. His films have grossed over $4 billion in North America and over $11.5 billion worldwide, making him one of the highest-grossing box-office stars of all time.\nCruise began acting in the early 1980s and made his breakthrough with leading roles in the comedy film Risky Business (1983) and action film Top Gun (1986). Critical acclaim came with his roles in the dramas The Color of Money (1986), Rain Man (1988), and Born on the Fourth of July (1989). For his portrayal of Ron Kovic in the latter, he won a Golden Globe Award and received a nomination for the Academy Award for Best Actor. As a leading Hollywood star in the 1990s, he starred in several commercially successful films, including the 

In [22]:
summary = BART_summary(text[2], models[2])
summary

448
897 448 [{'summary_text': "Thomas Cruise Mapother IV was born on July 3, 1962, in Syracuse, New York. He has been married to Mimi Rogers, Nicole Kidman, and Katie Holmes. His films have grossed over $4 billion in North America and over $11.5 billion worldwide. He is one of the highest-grossing box-office stars of all time. His parents were from Louisville, Kentucky, and he has three sisters named Lee Anne, Marian, and Cass. He attended 15 schools in 14 years. His father died of cancer in 1984. He grew up in near poverty and had a Catholic upbringing. He credits Scientology with helping him to overcome dyslexia. His cousin William Mapother is also an actor and has appeared in five films with him. He was adopted by Jack South in 1978 and his mother moved to Ottawa in 1971. He had a biological daughter with Mary Lee Pfeiffer in 1984, and a daughter with Nicole Rogers in 1996. He also had an adopted daughter with Katie Holmes in 2000. He claims his father was a bully and a bully who be

897 448 [{'summary_text': 'Cruise was converted to Scientology by his first wife Mimi Rogers. He married Nicole Kidman on the set of Days of Thunder (1990). They adopted two children: Isabella Jane (born 1992) and Connor Antony (born 1995). In 2001, Cruise filed for divorce from Kidman. He has been dating actress Katie Holmes since 2005. On November 18, 2006, Cruise and Holmes were married at the 15th-century Odescalchi Castle in Bracciano. Katie Holmes and Cruise divorced on July 9, 2012. The terms of the divorce are not publicly available. According to Janet Reitman\'s book, Cruise removed himself from the Church of Scientology and worked on the film Wide Wide Shut.    He publicly admitted to following Scientology in 1992 and he was an outspoken advocate for the organization in the 2000s. He also said that ex-wife Katie Holmes divorced him to protect the couple\'s daughter Suri from Scientology and that Suri is no longer a practicing member of the organization. In October 2012, Vanit

## Longformer

In [26]:
from transformers import LongformerTokenizer, EncoderDecoderModel
from transformers import LEDTokenizer, LEDForConditionalGeneration

In [36]:
text = get_text()
text

Enter a search term: Lionel Messi


('Q615',
 'Lionel Messi',

In [28]:
def LF_summary(text, summary_size=0.5):
    # Get word length of original text
    word_len = len(nltk.word_tokenize(text))
    
    # Initialise tokeniser and model
    tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
    model = EncoderDecoderModel.from_pretrained("patrickvonplaten/longformer2roberta-cnn_dailymail-fp16")
    
    # Chunking
    if word_len > 4096:
        chunks = []
        current_chunk = ""
        sents = nltk.sent_tokenize(text)
        for sent in sents:
            if len(nltk.word_tokenize(current_chunk + f" {sent}")) > 3000:
                chunks.append(current_chunk)
                current_chunk = ""
            else:
                current_chunk += f" {sent}"
        
        summary_chunks = []
        for chunk in chunks:
            chunk_len = len(nltk.word_tokenize(chunk))
            summary_len = int(summary_size * chunk_len)
            
            input_ids = tokenizer(chunk, return_tensors="pt").input_ids
            
            summary_ids = model.generate(input_ids, max_length=chunk_len, min_length=summary_len, num_beams=4, early_stopping=True)
            summary = tokenizer.decode(summary_ids.squeeze(), skip_special_tokens=True)
            print(len(nltk.word_tokenize(chunk)), summary_len, len(summary), summary, end="\n=====================================\n\n")

In [48]:
def LF_summary_2(text):
    # Initialise model and tokenizer
    tokenizer = LEDTokenizer.from_pretrained("patrickvonplaten/led-large-16384-pubmed")
    model = LEDForConditionalGeneration.from_pretrained("patrickvonplaten/led-large-16384-pubmed")
    
#     inputs_dict = tokenizer(text, padding="max_length", max_length=8192, return_tensors="pt", truncation=True)
    inputs_dict = tokenizer(text, padding="max_length", max_length=16384, return_tensors="pt", truncation=True)
    input_ids = inputs_dict.input_ids #.to("cuda")
    attention_mask = inputs_dict.attention_mask #.to("cuda")
    global_attention_mask = torch.zeros_like(attention_mask)
    # put global attention on <s> token
    global_attention_mask[:, 0] = 1

    predicted_abstract_ids = model.generate(input_ids, max_length=1024, min_length=1024, attention_mask=attention_mask, global_attention_mask=global_attention_mask)
    return tokenizer.batch_decode(predicted_abstract_ids, skip_special_tokens=True)
#     return batch

In [49]:
LF_summary_2(text[2])

[" Messi is an Argentine professional footballer who plays as a forward for  \n ligue 1 club Paris Saint-germain and captains the Argentina national team. \n he has scored over 795 senior career goals for club and country, and has the most goals by a player for a single club (672 ). a prolific goalscorer and creative playmaker, \n Messi holds the records for most goals in la Liga (474 ), most hat-tricks ( 36 ) and the UEFA Champions league ( 8 ), and most assists ( 192 ). at 17 years, three months, and 22 days old \n, he was at the time the youngest player to represent Barcelona in an official competition, and the youngest to win the Ballon d'or and the FIFA world player of the year award, both times by the biggest voting margin in each trophy's history. after five years of playing in the centre of the pitch, Messi had returned to his old position on the right wing late the previous year, and from there he regained his best form, while his team's attacking team ended the attacking depe

In [24]:
# LF_summary(text[2])

In [25]:
# SE_summary(text[2])

In [56]:
text_list = chunk_text(text[2], 512)
text_list

[" Argentine professional footballer\nEponyms and public art\nFilms\nFamily\nLionel Andrés Messi[note 1] (Spanish pronunciation:\xa0[ljoˈnel anˈdɾes ˈmesi] (listen); born 24 June 1987), also known as Leo Messi, is an Argentine professional footballer who plays as a forward for  Ligue 1 club Paris Saint-Germain and captains the Argentina national team. Widely regarded as one of the greatest players of all time, Messi has won a record seven Ballon d'Or awards,[note 2] a record six European Golden Shoes, and in 2020 was named to the Ballon d'Or Dream Team. Until leaving the club in 2021, he had spent his entire professional career with Barcelona, where he won a club-record 35 trophies, including 10 La Liga titles, seven Copa del Rey titles and four UEFA Champions Leagues. With his country, he won the 2021 Copa América and the 2022 FIFA World Cup. A prolific goalscorer and creative playmaker, Messi holds the records for most goals in La Liga (474), most hat-tricks in La Liga (36) and the U

In [75]:
LF_summary_2(text_list[0])

[" lionel andrés Messi[note 1] (Spanish pronunciation: \n ljoˈnel anˈdɾes \n \n (listen ), also known as Leo Messi, is an Argentine professional footballer who plays as a forward for  ligue 1 club Paris Saint-germain and captains the Argentina national team. \n a prolific goalscorer and creative playmaker, Messi holds the records for most goals in la Liga (474 ), most hat-tricks in the league ( 36 ) and the UEFA Champions league ( 8 ), and most assists in both the league and the Copa américa ( 17 ). until leaving the club in 2021, he had spent his entire professional career with Barcelona, where he won a club-record 35 trophies, including 10 league titles, seven cup titles and four UEFA Champions leagues. in 2020, \n he was named to the Ballon d'or Dream Team, and in 2020 he will be the first player to win the award four times. out of contract, he signed for  \n p.s. in this article \n, we review the career of the great Argentine professional golfer, with special attention to his achie

## OpenAI

In [88]:
import openai
from dotenv import load_dotenv
load_dotenv()

True

In [104]:
def openai_summary(text, summary_size=0.5):
    openai.api_key = os.getenv("OPENAI_API_KEY")
    prompt=f"Context: {text}\n\nCreate summary of the given context that is {summary_size*100}% of the original."

    res = openai.Completion.create(
        model="text-davinci-003", 
        prompt=prompt, 
        temperature=0,
        max_tokens=500,
    )

    return res.choices[0].text

In [80]:
text = get_text()
text

Enter a search term: Tom Cruise


('Q37079',
 'Tom Cruise',
 'Thomas Cruise Mapother IV (born July 3, 1962) is an American actor and producer. One of the world\'s highest-paid actors, he has received various accolades, including an Honorary Palme d\'Or and three Golden Globe Awards, in addition to nominations for four Academy Awards. His films have grossed over $4 billion in North America and over $11.5 billion worldwide, making him one of the highest-grossing box-office stars of all time.\nCruise began acting in the early 1980s and made his breakthrough with leading roles in the comedy film Risky Business (1983) and action film Top Gun (1986). Critical acclaim came with his roles in the dramas The Color of Money (1986), Rain Man (1988), and Born on the Fourth of July (1989). For his portrayal of Ron Kovic in the latter, he won a Golden Globe Award and received a nomination for the Academy Award for Best Actor. As a leading Hollywood star in the 1990s, he starred in several commercially successful films, including the 

In [106]:
text_chunks = chunk_text(text[2], 3000)
text_chunks

[' Thomas Cruise Mapother IV (born July 3, 1962) is an American actor and producer. One of the world\'s highest-paid actors, he has received various accolades, including an Honorary Palme d\'Or and three Golden Globe Awards, in addition to nominations for four Academy Awards. His films have grossed over $4 billion in North America and over $11.5 billion worldwide, making him one of the highest-grossing box-office stars of all time. Cruise began acting in the early 1980s and made his breakthrough with leading roles in the comedy film Risky Business (1983) and action film Top Gun (1986). Critical acclaim came with his roles in the dramas The Color of Money (1986), Rain Man (1988), and Born on the Fourth of July (1989). For his portrayal of Ron Kovic in the latter, he won a Golden Globe Award and received a nomination for the Academy Award for Best Actor. As a leading Hollywood star in the 1990s, he starred in several commercially successful films, including the drama A Few Good Men (1992

In [107]:
openai_summary(text_chunks[0])

"\n\nThomas Cruise Mapother IV is an American actor and producer who has achieved worldwide success. He has won various accolades, including an Honorary Palme d'Or and three Golden Globe Awards, and has been nominated for four Academy Awards. His films have grossed over $4 billion in North America and over $11.5 billion worldwide, making him one of the highest-grossing box-office stars of all time. Cruise began acting in the early 1980s and made his breakthrough with leading roles in the comedy film Risky Business (1983) and action film Top Gun (1986). He won a Golden Globe Award for his portrayal of Ron Kovic in Born on the Fourth of July (1989) and received a nomination for the Academy Award for Best Actor. He starred in several commercially successful films in the 1990s, including A Few Good Men (1992), The Firm (1993), Interview with the Vampire (1994), and Jerry Maguire (1996). Cruise was born in Syracuse, New York, to electrical engineer Thomas Cruise Mapother III and special edu

In [108]:
for t in text_chunks:
    print()
    summary = openai_summary(t)
    print(summary)
    print()




Thomas Cruise Mapother IV is an American actor and producer who has achieved worldwide success. He has won various accolades, including an Honorary Palme d'Or and three Golden Globe Awards, and has been nominated for four Academy Awards. His films have grossed over $4 billion in North America and over $11.5 billion worldwide, making him one of the highest-grossing box-office stars of all time. Cruise began acting in the early 1980s and made his breakthrough with leading roles in the comedy film Risky Business (1983) and action film Top Gun (1986). He has since starred in several commercially successful films, including A Few Good Men (1992), The Firm (1993), Interview with the Vampire (1994), Jerry Maguire (1996), Magnolia (1999), Mission: Impossible (1996-2018), Vanilla Sky (2001), Minority Report (2002), The Last Samurai (2003), Collateral (2004), War of the Worlds (2005), Knight and Day (2010), Jack Reacher (2012), Oblivion (2013), and Edge of Tomorrow (2014). Cruise has been mar