# Imports

In [1]:
# Third-Party Imports
import nltk
import torch

# Standard Library Imports
import os
from math import inf
from string import punctuation
import sys

from queries import get_text_cli
from get_documents import search

# Utility Functions

In [2]:
def get_text():
    term = get_text_cli('Enter a search term')
    return search(term)

def load_docs(dirname='corpus'):
    corpus = dict()
    main_path = os.path.join(os.path.dirname('__file__'), dirname)

    for file in os.listdir(main_path):
        with open(os.path.join(main_path, file), 'r') as f:
            corpus[file] = f.read()
    
    return corpus

# Simple Extractive Text Summarisation

In [3]:
def word_tokenize(text):
    banned = list(punctuation) + nltk.corpus.stopwords.words("english")
    return [w for w in nltk.word_tokenize(text) if w not in banned]

def SE_summary(text):
    # Create word and sentence tokens
    words = word_tokenize(text)
    word_set = set(words) # set of all unique words in word tokens
    sents = nltk.sent_tokenize(text)
    
    # Initialise frequency table for word tokens
    w_freq_table = {w: words.count(w) for w in word_set}
    
    # Score sentences based on frequency of their words
    sent_scores = {
        sent: sum(w_freq_table.get(w, 0) for w in word_tokenize(sent))
        for sent in sents
    }
    
    # Build summary
    avg = sum(sent_scores.values()) / len(sent_scores)
    summary = " ".join(sent for sent in sents if sent_scores[sent] >= avg)
    return summary

In [4]:
text = get_text()
text

Enter a search term: Spock


('Q16341',
 'Spock',
 'Spock is a fictional character in the Star Trek media franchise. He first appeared in the original Star Trek series serving aboard the starship USS Enterprise as science officer and first officer (and Kirk\'s Second-in-command) and later as commanding officer of two iterations of the vessel. Spock\'s mixed human-Vulcan heritage serves as an important plot element in many of the character\'s appearances. Along with Captain James T. Kirk (William Shatner) and Dr. Leonard "Bones" McCoy (DeForest Kelley), he is one of the three central characters in the original Star Trek series and its films. After retiring from active duty in Starfleet, Spock served as a Federation ambassador, and later became involved in the ill-fated attempt to save Romulus from a supernova, leading him to live out the rest of his life in a parallel universe.\nSpock was played by Leonard Nimoy in the original Star Trek series, Star Trek: The Animated Series, eight of the Star Trek feature films, 

In [5]:
summary = SE_summary(text[2])
summary

'He first appeared in the original Star Trek series serving aboard the starship USS Enterprise as science officer and first officer (and Kirk\'s Second-in-command) and later as commanding officer of two iterations of the vessel. Spock\'s mixed human-Vulcan heritage serves as an important plot element in many of the character\'s appearances. Along with Captain James T. Kirk (William Shatner) and Dr. Leonard "Bones" McCoy (DeForest Kelley), he is one of the three central characters in the original Star Trek series and its films. Spock was played by Leonard Nimoy in the original Star Trek series, Star Trek: The Animated Series, eight of the Star Trek feature films, and a two-part episode of Star Trek: The Next Generation. Multiple actors have played the character since Nimoy within Star Trek\'s main continuity; the most recent portrayal is Ethan Peck, who played Spock as a recurring character in the second season of Star Trek: Discovery and in Star Trek: Short Treks. Peck has reprised the

In [6]:
print(len(text[2]), len(summary))

37284 21063


# Clever Algorithms

In [7]:
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer

## LexRank, TextRank, Latent Semantic Analysis (LSA), Luhn's Algorithm

In [8]:
def algo_summary(text, method):
    # Initialise summariser
    if method == "lex":
        summariser = LexRankSummarizer()
    elif method == "text":
        summariser = TextRankSummarizer()
    elif method == "lsa":
        summariser = LsaSummarizer()
    elif method == "luhn":
        summariser = LuhnSummarizer()
    
    # Initialise parser
    parser = PlaintextParser(text, Tokenizer("english"))
    
    # Create summary
    summary_sents = summariser(parser.document, 20)
    
    return " ".join(str(s) for s in summary_sents)

In [9]:
text = get_text()
text

Enter a search term: Lionel Messi


('Q615',
 'Lionel Messi',

In [10]:
for meth in ("lex", "text", "lsa", "luhn"):
    summary = algo_summary(text[2], meth)
    print(f"\nMethod: {meth}", summary, f"Orginal: {len(text[2])}, Summary: {len(summary)}", sep="\n\n")
    print(f"\n{100*'='}\n")


Method: lex

Messi has scored over 795 senior career goals for club and country, and has the most goals by a player for a single club (672). The following two seasons, Messi finished second for the Ballon d'Or behind Cristiano Ronaldo (his perceived career rival), before regaining his best form during the 2014–15 campaign, becoming the all-time top scorer in La Liga and leading Barcelona to a historic second treble, after which he was awarded a fifth Ballon d'Or in 2015. He scored all of his side's four goals in the Champions League quarter-final against Arsenal on 6 April while becoming Barcelona's all-time top scorer in the competition. After Barcelona lost the Copa del Rey final four days later, Messi scored both goals in his side's 2–0 win in the first leg of the Champions League semi-finals in Madrid, the second of which – a slaloming dribble past three Real players – was acclaimed as one of the best ever in the competition. At the close of the year, Messi had scored a record 91 


Method: lsa

As his father's health insurance covered only two years of growth hormone treatment, which cost at least $1,000 per month, Newell's agreed to contribute, but later reneged on their promise. French winger Ludovic Giuly explained how Messi caught the eye in a training session with Frank Rijkaard's first team: "He destroyed us all... They were kicking him all over the place to avoid being ridiculed by this kid, he just got up and kept on playing. However, he continued to be plagued by major injuries; a metatarsal fracture sustained on 12 November 2006 kept him out of action for three months. Messi helped the team achieve 16 consecutive league victories, a record in Spanish football, concluding with another hat-trick against Atlético Madrid on 5 February 2011. In Barça's last home league match on 5 May, against Espanyol, Messi scored all four goals before approaching the bench to embrace Guardiola, who had announced his resignation as manager. On 17 April, Messi ended a five-

# Neural Techniques

## Facebook BART and Google T5

In [11]:
from transformers import pipeline

In [12]:
models = ["facebook/bart-large-cnn", "t5-base"]

In [13]:
def BART_summary(text, model_name, summary_size=0.5):
    # Get lengths of original text
    word_len = len(nltk.word_tokenize(text))
    
    # Initialise summariser
#     summariser = pipeline("summarization", model="facebook/bart-large-cnn")
    summariser = pipeline("summarization", model=model_name)
    
    # Chunking
    if word_len > 1024:
        chunks = []
        current_chunk = ""
        sents = nltk.sent_tokenize(text)
        for sent in sents:
            if len(nltk.word_tokenize(current_chunk + f" {sent}")) >= 900:
                chunks.append(current_chunk)
                current_chunk = ""
            else:
                current_chunk += f" {sent}"
        
        summary_chunks = []
        for chunk in chunks:
            chunk_len = len(nltk.word_tokenize(chunk))
            summary_len = int(summary_size * chunk_len)
            print(summary_len)
            summary = summariser(chunk, max_length=chunk_len, min_length=summary_len, do_sample=False)
            print(len(nltk.word_tokenize(chunk)), summary_len, summary, end="\n=====================================\n\n")
    
#     return summariser(text, max_length=int(0.5 * len(text[2])), min_length=100, do_sample=False)

In [14]:
text = get_text()
text

Enter a search term: Tom Cruise


('Q37079',
 'Tom Cruise',
 'Thomas Cruise Mapother IV (born July 3, 1962) is an American actor and producer. One of the world\'s highest-paid actors, he has received various accolades, including an Honorary Palme d\'Or and three Golden Globe Awards, in addition to nominations for four Academy Awards. His films have grossed over $4 billion in North America and over $11.5 billion worldwide, making him one of the highest-grossing box-office stars of all time.\nCruise began acting in the early 1980s and made his breakthrough with leading roles in the comedy film Risky Business (1983) and action film Top Gun (1986). Critical acclaim came with his roles in the dramas The Color of Money (1986), Rain Man (1988), and Born on the Fourth of July (1989). For his portrayal of Ron Kovic in the latter, he won a Golden Globe Award and received a nomination for the Academy Award for Best Actor. As a leading Hollywood star in the 1990s, he starred in several commercially successful films, including the 

In [15]:
summary = BART_summary(text[2], models[0])
summary

448
897 448 [{'summary_text': 'Thomas Cruise Mapother IV (born July 3, 1962) is an American actor and producer. His films have grossed over $4 billion in North America and over $11.5 billion worldwide, making him one of the highest-grossing box-office stars of all time. He has been married to actresses Mimi Rogers, Nicole Kidman, and Katie Holmes. Cruise has three children, two of whom were adopted during his marriage to Kidman and the other of whom is a biological daughter he had with Holmes. In the 2000s, he sparked controversy with his criticisms of psychiatry and anti-depressant drugs, his efforts to promote Scientology in Europe, and a leaked video interview of him promoting Scientology. He is an outspoken advocate for the Church of Scientology, which he credits with helping him overcome dyslexia. Cruise grew up in near poverty and had a Catholic upbringing. His parents were both from Louisville, Kentucky, and had English, German, and Irish ancestry. One of his cousins, William Ma

897 448 [{'summary_text': 'Cruise married actress Mimi Rogers on May 9, 1987. They divorced on February 4, 1990. Cruise met his second wife, actress Nicole Kidman, on the set of their film Days of Thunder (1990). The couple married on December 24, 1990 and adopted two children: Isabella Jane (born 1992) and Connor Antony (born 1995). In February 2001, Cruise filed for divorce from Kidman while she was unknowingly pregnant. In April 2005, Cruise began dating actress Katie Holmes. On November 18, Holmes and Cruise were married at the 15th-century Odescalchi Castle in Bracciano, in a Scientologist ceremony attended by many Hollywood stars. There has been widespread speculation that their marriage was arranged by the Church of Scientology. On June 29, 2012, Holmes filed for divorces from Cruise. On July 9, the couple signed a divorce settlement worked out by their lawyers. New York law requires all divorce documents remain sealed, so the exact terms of the settlement are not publicly avail

## Longformer

In [16]:
from transformers import LongformerTokenizer, EncoderDecoderModel

In [17]:
text = get_text()
text

Enter a search term: Tom Cruise


('Q37079',
 'Tom Cruise',
 'Thomas Cruise Mapother IV (born July 3, 1962) is an American actor and producer. One of the world\'s highest-paid actors, he has received various accolades, including an Honorary Palme d\'Or and three Golden Globe Awards, in addition to nominations for four Academy Awards. His films have grossed over $4 billion in North America and over $11.5 billion worldwide, making him one of the highest-grossing box-office stars of all time.\nCruise began acting in the early 1980s and made his breakthrough with leading roles in the comedy film Risky Business (1983) and action film Top Gun (1986). Critical acclaim came with his roles in the dramas The Color of Money (1986), Rain Man (1988), and Born on the Fourth of July (1989). For his portrayal of Ron Kovic in the latter, he won a Golden Globe Award and received a nomination for the Academy Award for Best Actor. As a leading Hollywood star in the 1990s, he starred in several commercially successful films, including the 

In [18]:
def LF_summary(text, summary_size=0.5):
    # Get word length of original text
    word_len = len(nltk.word_tokenize(text))
    
    # Initialise tokeniser and model
    tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
    model = EncoderDecoderModel.from_pretrained("patrickvonplaten/longformer2roberta-cnn_dailymail-fp16")
    
    # Chunking
    if word_len > 4096:
        chunks = []
        current_chunk = ""
        sents = nltk.sent_tokenize(text)
        for sent in sents:
            if len(nltk.word_tokenize(current_chunk + f" {sent}")) > 3000:
                chunks.append(current_chunk)
                current_chunk = ""
            else:
                current_chunk += f" {sent}"
        
        summary_chunks = []
        for chunk in chunks:
            chunk_len = len(nltk.word_tokenize(chunk))
            summary_len = int(summary_size * chunk_len)
            
            input_ids = tokenizer(chunk, return_tensors="pt").input_ids
            
            summary_ids = model.generate(input_ids, max_length=chunk_len, min_length=summary_len, num_beams=4, early_stopping=True)
            summary = tokenizer.decode(summary_ids.squeeze(), skip_special_tokens=True)
            print(len(nltk.word_tokenize(chunk)), summary_len, len(summary), summary, end="\n=====================================\n\n")

In [19]:
LF_summary(text[2])

You are using a model of type encoder_decoder to instantiate a model of type encoder-decoder. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at patrickvonplaten/longformer2roberta-cnn_dailymail-fp16 were not used when initializing EncoderDecoderModel: ['decoder.roberta.pooler.dense.bias', 'encoder.embeddings.position_ids', 'decoder.roberta.pooler.dense.weight']
- This IS expected if you are initializing EncoderDecoderModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing EncoderDecoderModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


KeyboardInterrupt: 

In [35]:
SE_summary(text[2])

'Cruise began acting in the early 1980s and made his breakthrough with leading roles in the comedy film Risky Business (1983) and action film Top Gun (1986). Cruise\'s performance as a motivational speaker in the drama Magnolia (1999) earned him another Golden Globe Award and a nomination for the Academy Award for Best Supporting Actor. Since then, Cruise has largely starred in science fiction and action films, establishing himself as an action star, often performing his own risky stunts. His other notable roles in the genre include Vanilla Sky (2001), Minority Report (2002), The Last Samurai (2003), Collateral (2004), War of the Worlds (2005), Knight and Day (2010), Jack Reacher (2012), Oblivion (2013), Edge of Tomorrow (2014), and Top Gun: Maverick (2022), with Maverick being his highest-grossing film. Cruise was born on July 3, 1962, in Syracuse, New York, to electrical engineer Thomas Cruise Mapother III (1934–1984) and special education teacher Mary Lee (née Pfeiffer; 1936–2017). 