# Imports

In [1]:
# Third-Party Imports
import nltk
import torch

# Standard Library Imports
import os
from math import inf
from string import punctuation
import sys

# Local Imports
from queries import get_text_cli
from get_documents import search

# Utility Functions

In [2]:
def get_text():
    term = get_text_cli('Enter a search term')
    return search(term)

def load_docs(dirname='corpus'):
    corpus = dict()
    main_path = os.path.join(os.path.dirname('__file__'), dirname)

    for file in os.listdir(main_path):
        with open(os.path.join(main_path, file), 'r') as f:
            corpus[file] = f.read()
    
    return corpus

def chunk_text(text, chunk_len):
    chunks = []
    current_chunk = ""
    sents = nltk.sent_tokenize(text)
    
    for sent in sents:
        if len(nltk.word_tokenize(current_chunk + f" {sent}")) >= chunk_len:
            chunks.append(current_chunk)
            current_chunk = ""
        else:
            current_chunk += f" {sent}"
    
    chunks.append(current_chunk)
    
    return chunks

# Simple Extractive Text Summarisation

In [3]:
def word_tokenize(text):
    banned = list(punctuation) + nltk.corpus.stopwords.words("english")
    return [w for w in nltk.word_tokenize(text) if w not in banned]

def SE_summary(text):
    # Create word and sentence tokens
    words = word_tokenize(text)
    word_set = set(words) # set of all unique words in word tokens
    sents = nltk.sent_tokenize(text)
    
    # Initialise frequency table for word tokens
    w_freq_table = {w: words.count(w) for w in word_set}
    
    # Score sentences based on frequency of their words
    sent_scores = {
        sent: sum(w_freq_table.get(w, 0) for w in word_tokenize(sent))
        for sent in sents
    }
    
    # Build summary
    avg = sum(sent_scores.values()) / len(sent_scores)
    summary = " ".join(sent for sent in sents if sent_scores[sent] >= avg)
    return summary

In [4]:
text = get_text()
text

Enter a search term: Tom Cruise


('Q37079',
 'Tom Cruise',
 'Thomas Cruise Mapother IV (born July 3, 1962) is an American actor and producer. One of the world\'s highest-paid actors, he has received various accolades, including an Honorary Palme d\'Or and three Golden Globe Awards, in addition to nominations for four Academy Awards. His films have grossed over $4 billion in North America and over $11.5 billion worldwide, making him one of the highest-grossing box-office stars of all time.\nCruise began acting in the early 1980s and made his breakthrough with leading roles in the comedy film Risky Business (1983) and action film Top Gun (1986). Critical acclaim came with his roles in the dramas The Color of Money (1986), Rain Man (1988), and Born on the Fourth of July (1989). For his portrayal of Ron Kovic in the latter, he won a Golden Globe Award and received a nomination for the Academy Award for Best Actor. As a leading Hollywood star in the 1990s, he starred in several commercially successful films, including the 

In [5]:
summary = SE_summary(text[2])
summary

'Cruise began acting in the early 1980s and made his breakthrough with leading roles in the comedy film Risky Business (1983) and action film Top Gun (1986). Cruise\'s performance as a motivational speaker in the drama Magnolia (1999) earned him another Golden Globe Award and a nomination for the Academy Award for Best Supporting Actor. Since then, Cruise has largely starred in science fiction and action films, establishing himself as an action star, often performing his own risky stunts. His other notable roles in the genre include Vanilla Sky (2001), Minority Report (2002), The Last Samurai (2003), Collateral (2004), War of the Worlds (2005), Knight and Day (2010), Jack Reacher (2012), Oblivion (2013), Edge of Tomorrow (2014), and Top Gun: Maverick (2022), with Maverick being his highest-grossing film. Cruise was born on July 3, 1962, in Syracuse, New York, to electrical engineer Thomas Cruise Mapother III (1934–1984) and special education teacher Mary Lee (née Pfeiffer; 1936–2017). 

In [6]:
print(len(text[2]), len(summary), len(nltk.word_tokenize(text[2])), len(nltk.word_tokenize(summary)))

33497 18376 6314 3463


# Clever Algorithms

In [7]:
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer

## LexRank, TextRank, Latent Semantic Analysis (LSA), Luhn's Algorithm

In [8]:
def algo_summary(text, method):
    # Initialise summariser
    if method == "lex":
        summariser = LexRankSummarizer()
    elif method == "text":
        summariser = TextRankSummarizer()
    elif method == "lsa":
        summariser = LsaSummarizer()
    elif method == "luhn":
        summariser = LuhnSummarizer()
    
    # Initialise parser
    parser = PlaintextParser(text, Tokenizer("english"))
    
    # Create summary
    summary_sents = summariser(parser.document, 20)
    
    return " ".join(str(s) for s in summary_sents)

In [9]:
text = get_text()
text

Enter a search term: Tom Cruise


('Q37079',
 'Tom Cruise',
 'Thomas Cruise Mapother IV (born July 3, 1962) is an American actor and producer. One of the world\'s highest-paid actors, he has received various accolades, including an Honorary Palme d\'Or and three Golden Globe Awards, in addition to nominations for four Academy Awards. His films have grossed over $4 billion in North America and over $11.5 billion worldwide, making him one of the highest-grossing box-office stars of all time.\nCruise began acting in the early 1980s and made his breakthrough with leading roles in the comedy film Risky Business (1983) and action film Top Gun (1986). Critical acclaim came with his roles in the dramas The Color of Money (1986), Rain Man (1988), and Born on the Fourth of July (1989). For his portrayal of Ron Kovic in the latter, he won a Golden Globe Award and received a nomination for the Academy Award for Best Actor. As a leading Hollywood star in the 1990s, he starred in several commercially successful films, including the 

In [10]:
for meth in ("lex", "text", "lsa", "luhn"):
    summary = algo_summary(text[2], meth)
    print(f"\nMethod: {meth}", summary, f"Orginal: {len(text[2])}, Summary: {len(summary)}", sep="\n\n")
    print(f"\n{100*'='}\n")


Method: lex

Cruise's performance as a motivational speaker in the drama Magnolia (1999) earned him another Golden Globe Award and a nomination for the Academy Award for Best Supporting Actor. He has played Ethan Hunt in all six of the Mission: Impossible films from 1996 to 2018. He has three children, two of whom were adopted during his marriage to Kidman and the other of whom is a biological daughter he had with Holmes. That year, his mother left his father, taking Cruise and his sisters back to the United States. In 1978, she married Jack South. Cruise's next films were Days of Thunder (1990) and Far and Away (1992), both of which co-starred then-wife Nicole Kidman as his love interest, followed by the legal thriller The Firm, which was a critical and commercial success. In 2005, Cruise worked again with Steven Spielberg in War of the Worlds, a loose adaptation of the H. G. Wells novel of the same name, which became the fourth highest-grossing film of the year with US$591.4 million

# Neural Techniques

## Facebook BART and Google T5

In [11]:
from transformers import pipeline

In [12]:
models = ["facebook/bart-large-cnn", "t5-base", "philschmid/bart-large-cnn-samsum"]

In [13]:
def ann_summary(text, model_name, summary_size=0.5):
    # Get lengths of original text and summary
    word_len = len(nltk.word_tokenize(text))
    summary_len = int(summary_size * word_len)
    
    # Initialise summariser
    summariser = pipeline("summarization", model=model_name)
    
    summary = summariser(text, max_length=summary_len, min_length=summary_len, do_sample=False)
    return summary[0]['summary_text']

In [14]:
text = get_text()
text

Enter a search term: Tom Cruise


('Q37079',
 'Tom Cruise',
 'Thomas Cruise Mapother IV (born July 3, 1962) is an American actor and producer. One of the world\'s highest-paid actors, he has received various accolades, including an Honorary Palme d\'Or and three Golden Globe Awards, in addition to nominations for four Academy Awards. His films have grossed over $4 billion in North America and over $11.5 billion worldwide, making him one of the highest-grossing box-office stars of all time.\nCruise began acting in the early 1980s and made his breakthrough with leading roles in the comedy film Risky Business (1983) and action film Top Gun (1986). Critical acclaim came with his roles in the dramas The Color of Money (1986), Rain Man (1988), and Born on the Fourth of July (1989). For his portrayal of Ron Kovic in the latter, he won a Golden Globe Award and received a nomination for the Academy Award for Best Actor. As a leading Hollywood star in the 1990s, he starred in several commercially successful films, including the 

In [15]:
text_chunks = chunk_text(text[2], 400)
text_chunks

[" Thomas Cruise Mapother IV (born July 3, 1962) is an American actor and producer. One of the world's highest-paid actors, he has received various accolades, including an Honorary Palme d'Or and three Golden Globe Awards, in addition to nominations for four Academy Awards. His films have grossed over $4 billion in North America and over $11.5 billion worldwide, making him one of the highest-grossing box-office stars of all time. Cruise began acting in the early 1980s and made his breakthrough with leading roles in the comedy film Risky Business (1983) and action film Top Gun (1986). Critical acclaim came with his roles in the dramas The Color of Money (1986), Rain Man (1988), and Born on the Fourth of July (1989). For his portrayal of Ron Kovic in the latter, he won a Golden Globe Award and received a nomination for the Academy Award for Best Actor. As a leading Hollywood star in the 1990s, he starred in several commercially successful films, including the drama A Few Good Men (1992),

In [16]:
for chunk in text_chunks:
    summary = ann_summary(chunk, models[1])
    print(len(nltk.word_tokenize(chunk)), len(nltk.word_tokenize(summary)), summary, end=f"\n\n{100*'='}\n\n")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


332 106 he is one of the highest-grossing box-office stars of all time . his films have grossed over $4 billion in North America and over $11.5 billion worldwide . he has played Ethan Hunt in all six of the Mission: Impossible films from 1996 to 2018 . "i'm not going to let you down," he says, "but i'll let you know if it's a good movie or a bad one" he won an honorary palme versiuneversiuneversiune n   n     -  "  " " " "   "  " "- " n " , " - "


384 108 he has been married to actresses Mimi Rogers, Nicole Kidman, and Katie Holmes . he is an outspoken advocate for the Church of Scientology, which helped him overcome dyslexia . in the 2000s, he sparked controversy with his criticisms of psychiatry and anti-depressant drugs . his parents were both from Louisville, Kentucky, and had English, German, and Irish ancestry . "it was a great lesson in my life,"    . n.  .  "    -   n   - " " "    "  "  " "  s "  " , " n " s  "s " "- " -


381 115 in sixth grade, cruise went to the St. Francis

Token indices sequence length is longer than the specified maximum sequence length for this model (517 > 512). Running this sequence through the model will result in indexing errors


395 106 cruise/wagner productions is said to be developing a screenplay about a real-life serial killer . Leonardo DiCaprio's production company, Appian Way, is also developing the film . cruise has produced several films in which he appeared . on august 22, 2006, Paramount Pictures announced it was ending its 14-year relationship with Cruise . in November 2006, Cruise and Paula Wagner announced they had taken over the film studio United Artists - a joint venture between cruise and versiuneversiuneversiune versiune .    .     "    . "     an - n  s s   " " "  n. - " " "  " s. .. 


393 98 in the early-to-mid-1980s, Cruise had relationships with Melissa Gilbert, Rebecca De Mornay, Patti Scialfa, and Cher . in 2001, Cruise filed for divorce from actress Mimi Rogers while she was unknowingly pregnant . after the breakup with Cruz, Scientologist leaders launched a secret project to find Cruise a new girlfriend . a series of "auditions" of Scientologists actresses resulted in a short-lived 

## Longformer

In [17]:
from transformers import LongformerTokenizer, EncoderDecoderModel
from transformers import LEDTokenizer, LEDForConditionalGeneration

In [18]:
text = get_text()
text

Enter a search term: Tom Cruise


('Q37079',
 'Tom Cruise',
 'Thomas Cruise Mapother IV (born July 3, 1962) is an American actor and producer. One of the world\'s highest-paid actors, he has received various accolades, including an Honorary Palme d\'Or and three Golden Globe Awards, in addition to nominations for four Academy Awards. His films have grossed over $4 billion in North America and over $11.5 billion worldwide, making him one of the highest-grossing box-office stars of all time.\nCruise began acting in the early 1980s and made his breakthrough with leading roles in the comedy film Risky Business (1983) and action film Top Gun (1986). Critical acclaim came with his roles in the dramas The Color of Money (1986), Rain Man (1988), and Born on the Fourth of July (1989). For his portrayal of Ron Kovic in the latter, he won a Golden Globe Award and received a nomination for the Academy Award for Best Actor. As a leading Hollywood star in the 1990s, he starred in several commercially successful films, including the 

In [19]:
def LF_summary(text, summary_size=0.5):
    # Get word length of original text
    word_len = len(nltk.word_tokenize(text))
    
    # Initialise tokeniser and model
    tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
    model = EncoderDecoderModel.from_pretrained("patrickvonplaten/longformer2roberta-cnn_dailymail-fp16")
    
    # Chunking
    if word_len > 4096:
        chunks = []
        current_chunk = ""
        sents = nltk.sent_tokenize(text)
        for sent in sents:
            if len(nltk.word_tokenize(current_chunk + f" {sent}")) > 3000:
                chunks.append(current_chunk)
                current_chunk = ""
            else:
                current_chunk += f" {sent}"
        
        summary_chunks = []
        for chunk in chunks:
            chunk_len = len(nltk.word_tokenize(chunk))
            summary_len = int(summary_size * chunk_len)
            
            input_ids = tokenizer(chunk, return_tensors="pt").input_ids
            
            summary_ids = model.generate(input_ids, max_length=chunk_len, min_length=summary_len, num_beams=4, early_stopping=True)
            summary = tokenizer.decode(summary_ids.squeeze(), skip_special_tokens=True)
            print(len(nltk.word_tokenize(chunk)), summary_len, len(summary), summary, end="\n=====================================\n\n")

In [20]:
def LF_summary_2(text, summary_size=0.5):
    # Get word lengths of original text and summary
    word_len = len(nltk.word_tokenize(text))
    summary_len = min(int(summary_size * word_len), 1024)
    
    # Initialise model and tokenizer
    tokenizer = LEDTokenizer.from_pretrained("patrickvonplaten/led-large-16384-pubmed")
    model = LEDForConditionalGeneration.from_pretrained("patrickvonplaten/led-large-16384-pubmed")
    
    # Initialise input dictionary
    inputs_dict = tokenizer(text, padding="max_length", max_length=16384, return_tensors="pt", truncation=True)
    input_ids = inputs_dict.input_ids #.to("cuda")
    attention_mask = inputs_dict.attention_mask #.to("cuda")
    global_attention_mask = torch.zeros_like(attention_mask)
    global_attention_mask[:, 0] = 1

    # Create summary
    predicted_abstract_ids = model.generate(input_ids, max_length=summary_len, min_length=summary_len, attention_mask=attention_mask, global_attention_mask=global_attention_mask)
    return tokenizer.batch_decode(predicted_abstract_ids, skip_special_tokens=True)

In [22]:
LF_summary_2(text[2])

[" tom Cruise Mapother IV ( born July 3, 1962 ) is an actor and producer who is one of the highest-grossing box - office stars of all time. \n he is an outspoken advocate for the Church of Scientology, which he credits with helping him overcome dyslexia. in the 2000s, he sparked controversy with his criticisms of psychiatry and anti-depressant drugs, his efforts to promote Scientology in the united states, and a leaked video interview of him promoting Scientology. in this article \n, we review the history of his career, including his career as an actor, producer, and activist for the church of Scientology and his current career as a film producer and activist. in addition, we discuss his personal and professional controversies, and discuss his current status as a public figure and his recent legal and legal actions against the media and the media organizations. in conclusion, \n we review his career and discuss some of the controversies that have occurred in the last few years. in part

## OpenAI

In [23]:
import openai
from dotenv import load_dotenv
load_dotenv()

True

In [65]:
def openai_summary(text, summary_size=0.5):
    word_len = len(nltk.word_tokenize(text))
    summary_len = int(summary_size * word_len)
    print(word_len, summary_len)
    
    openai.api_key = os.getenv("OPENAI_API_KEY")
#     prompt=f"Context: {text}\n\nCreate summary of the given context whose word count is {summary_size*100}% of that of the context."
    prompt=f"Context: {text}\n\nCreate summary of the given context."

#     res = openai.Completion.create(
#         model="text-davinci-003", 
#         prompt=prompt, 
#         temperature=0,
#         max_tokens=summary_len,
#     )
    
    res = openai.Completion.create(
        model="text-davinci-003", 
        prompt=f"Please summarize the following text in no fewer than {summary_len} words (make sure to use full senetences):\n\n{text}\n\nSummary:", 
        temperature=0,
        max_tokens=summary_len,
    )


    return res.choices[0].text

In [25]:
text = get_text()
text

Enter a search term: Tom Cruise


('Q37079',
 'Tom Cruise',
 'Thomas Cruise Mapother IV (born July 3, 1962) is an American actor and producer. One of the world\'s highest-paid actors, he has received various accolades, including an Honorary Palme d\'Or and three Golden Globe Awards, in addition to nominations for four Academy Awards. His films have grossed over $4 billion in North America and over $11.5 billion worldwide, making him one of the highest-grossing box-office stars of all time.\nCruise began acting in the early 1980s and made his breakthrough with leading roles in the comedy film Risky Business (1983) and action film Top Gun (1986). Critical acclaim came with his roles in the dramas The Color of Money (1986), Rain Man (1988), and Born on the Fourth of July (1989). For his portrayal of Ron Kovic in the latter, he won a Golden Globe Award and received a nomination for the Academy Award for Best Actor. As a leading Hollywood star in the 1990s, he starred in several commercially successful films, including the 

In [71]:
text_chunks = chunk_text(text[2], 500)
text_chunks

[" Thomas Cruise Mapother IV (born July 3, 1962) is an American actor and producer. One of the world's highest-paid actors, he has received various accolades, including an Honorary Palme d'Or and three Golden Globe Awards, in addition to nominations for four Academy Awards. His films have grossed over $4 billion in North America and over $11.5 billion worldwide, making him one of the highest-grossing box-office stars of all time. Cruise began acting in the early 1980s and made his breakthrough with leading roles in the comedy film Risky Business (1983) and action film Top Gun (1986). Critical acclaim came with his roles in the dramas The Color of Money (1986), Rain Man (1988), and Born on the Fourth of July (1989). For his portrayal of Ron Kovic in the latter, he won a Golden Globe Award and received a nomination for the Academy Award for Best Actor. As a leading Hollywood star in the 1990s, he starred in several commercially successful films, including the drama A Few Good Men (1992),

In [74]:
openai_summary(text_chunks[0])

478 239


" Thomas Cruise Mapother IV (born July 3, 1962) is an American actor and producer who has achieved worldwide success. He has won various awards, including an Honorary Palme d'Or and three Golden Globe Awards, and has been nominated for four Academy Awards. His films have grossed over $4 billion in North America and over $11.5 billion worldwide, making him one of the highest-grossing box-office stars of all time. Cruise began acting in the early 1980s and made his breakthrough with leading roles in the comedy film Risky Business (1983) and action film Top Gun (1986). He has since starred in several commercially successful films, including A Few Good Men (1992), The Firm (1993), Interview with the Vampire (1994), and Jerry Maguire (1996). Cruise is also known for his roles in science fiction and action films, such as Mission: Impossible (1996-2018), Vanilla Sky (2001), Minority Report (2002), The Last Samurai (2003), Collateral (2004), War of the Worlds (2005), Knight and Day (2010), Jac

In [None]:
for t in text_chunks:
    summary = openai_summary(t)
    print(len(nltk.word_tokenize(t)), len(nltk.word_tokenize(summary)), summary, end=f"\n\n{100*'='}\n\n")

478 239
478 233  Thomas Cruise Mapother IV (born July 3, 1962) is an American actor and producer who has achieved worldwide success. He has won various awards, including an Honorary Palme d'Or and three Golden Globe Awards, and has been nominated for four Academy Awards. His films have grossed over $4 billion in North America and over $11.5 billion worldwide, making him one of the highest-grossing box-office stars of all time. Cruise began acting in the early 1980s and made his breakthrough with leading roles in the comedy film Risky Business (1983) and action film Top Gun (1986). He has since starred in several commercially successful films, including A Few Good Men (1992), The Firm (1993), Interview with the Vampire (1994), and Jerry Maguire (1996). Cruise is also known for his roles in science fiction and action films, such as Mission: Impossible (1996-2018), Vanilla Sky (2001), Minority Report (2002), The Last Samurai (2003), Collateral (2004), War of the Worlds (2005), Knight and 