In [18]:
#!pip install transformers
#!pip install torch
#!pip3 install pyPDF2
#!pip3 install wget
#!pip3 install sentencepiece
#!pip3 install protobuf==3.20.1

In [7]:
from GPUtil import showUtilization as gpu_usage
gpu_usage()  

| ID | GPU | MEM |
------------------
|  0 | 24% |  7% |
|  1 | 24% |  6% |
|  2 | 23% |  7% |
|  3 | 19% |  6% |


In [4]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

In [5]:
%env CUDA_VISIBLE_DEVICES= 2

env: CUDA_VISIBLE_DEVICES=2


In [14]:
#import needed libraries
from transformers import BartTokenizer, BartForConditionalGeneration
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import pipeline, AutoModelForSequenceClassification
from datasets import load_dataset
import torch
import requests
import io
from PyPDF2 import PdfFileReader
import warnings
warnings.filterwarnings('ignore')

# Summarization Model

The goal of summarization models is to create a summary using new phrases built by interpreting and understanding a text. This leads to a more "human-like" sounding summary as opposed to an extractive summary that only extracts important information verbatim from a text. This goal is met by using transformers that use encoder and decoder layers, similar to a Seq2Seq model (sequence to sequence). 

Each encoder layer consists of a self-attention and feed forward layer. An input, in this case a sentence, is fed into the self-attention layer where the encoder looks at other words while encoding a specific word. This is repeated for every word in a sentence before being passed on to the decoder portion of the transformer. The decoder consists of the same layers as an encoder, but has an extra layer, the attention layer, that allows the decoder to focus on relevant parts of the sentence.

The attention layers of the encoder accesses words in the text while the attention layers of the decoder accesses the words positioned before the given word input to create meaning.


In this case, BART (Bidirectional and Auto-Regressive Transformer) is used. BART uses Seq2Seq with a bidirectional encoder and a left-to-right decoder similar to GPT.

In [7]:
class summarization:
    def __init__(self, demo=False):
        if not demo:
            #allow for inout of other text
            self.sequence= input('What would you like to summarize?')
        else:
            self.sequence= "The tower is 324 metres (1,063 ft) tall, about the same height as an 81-story building, and the tallest structure in Paris. Its base is square, measuring 125 metres (410 ft) on each side. During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world, a title it held for 41 years until the Chrysler Building in New York City was finished in 1930. It was the first structure to reach a height of 300 metres. Due to the addition of a broadcasting aerial at the top of the tower in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft). Excluding transmitters, the Eiffel Tower is the second tallest free-standing structure in France after the Millau Viaduct."

    def predict(self):
        #choose model that performs summarization
        checkpoint= 'facebook/bart-large-cnn'
    
        #the model has already been pretrained and finetuned to data from cnn()
        model= BartForConditionalGeneration.from_pretrained(checkpoint)
        tokenizer= BartTokenizer.from_pretrained(checkpoint)

        #apply the tokenizer to the text
        #padding is used to ensure that the shorter sentences have the same length as the longsest sequence or the max length accepted by the model
        inputs= tokenizer(self.sequence, padding= True, return_tensors="pt")

        #pass encoded tokens along with specific parameters
        outputs= model.generate(inputs['input_ids'], max_length=150, min_length= 50, length_penalty= 2.0, num_beams= 10, early_stopping= True)
        #decode the generated summary into text
        summary= tokenizer.decode(outputs[0], skip_special_tokens= True)

        print('Original text: ')
        print(self.sequence)
        print('')
        print('Summary: ')
        print(summary)


In [8]:
summarization(demo=True).predict()

Original text: 
The tower is 324 metres (1,063 ft) tall, about the same height as an 81-story building, and the tallest structure in Paris. Its base is square, measuring 125 metres (410 ft) on each side. During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world, a title it held for 41 years until the Chrysler Building in New York City was finished in 1930. It was the first structure to reach a height of 300 metres. Due to the addition of a broadcasting aerial at the top of the tower in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft). Excluding transmitters, the Eiffel Tower is the second tallest free-standing structure in France after the Millau Viaduct.

Summary: 
The tower is 324 metres (1,063 ft) tall, about the same height as an 81-story building, and the tallest structure in Paris. Excluding transmitters, the Eiffel Tower is the second tallest free-standing structure in France after the

# Summarization model finetuned on conversations

Due to the formatting of text messages/transcripts, the pretrained model does nto summarize the conversation in the best way. This model was fine-tuned and trained on the samsum dataset found on HuggingFace.

In [21]:
class conversation_summarization:
    def __init__(self, demo=False):
        #allow for input
        if not demo:
            self.sequence= input('What would you like to summarize?')
        else:
            self.sequence= """Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye"""
    
    def predict(self):
        #choose model that performs coversation summarization, trained on samsum data
        checkpoint= 'amaibrah/bart-large-samsum'
    
        model= AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
        tokenizer= AutoTokenizer.from_pretrained(checkpoint)

        #apply the tokenizer to the text
        #padding is used to ensure that the shorter sentences have the same length as the longsest sequence or the max length accepted by the model
        #PyTorch is being used, so the tensors should be set as 'pt' in this case
        inputs= tokenizer(self.sequence, padding= True, return_tensors="pt")
        #create a summary no longer than 150 words and no less than 15
        outputs= model.generate(inputs['input_ids'], max_length=150, min_length= 15, length_penalty= 2.0, num_beams= 4, early_stopping= True)
        summary= tokenizer.decode(outputs[0], skip_special_tokens= True)

        print('Original text: ')
        print(self.sequence)
        print('')
        print('Summary: ')
        print(summary)

In [22]:
conversation_summarization(demo=True).predict()

Downloading:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/389 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/280 [00:00<?, ?B/s]

Original text: 
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye

Summary: 
Amanda can't find Betty's number. Larry called Betty the last time they were at the park together. Hannah doesn't know Larry very well. Amanda will text him.


# Summarization model finetuned on long documents

The BART-large-CNN model has some limitations when it comes to summarizing long documents. This model was fine-tuned on the launch/gov_report dataset from HuggingFace. It contains long reports and their associated summaries written by goverment research agencies. This model does take some time to run depending on the length of the document.

When using the class below with demo set to true, the model will automatically summarize chapter 2 of and open source political science textbook.

In [19]:
class long_doc_summarization:
    def __init__(self, demo=False):
        if not demo:
            #in case someone would like to summarize a specific document
            self.url= input('Please enter the url to your PDF document:')
            #first page in pdf to be summarized
            self.start= input("What page would you like to start at?")
            #last page in pdf to be summarized
            self.end= input("What page would you like to end at?")
            
            #extract the pdf's content from the webpage
            r= requests.get(self.url)
            f= io.BytesIO(r.content)
            reader= PdfFileReader(f, strict= False)

            #extract content from each page chosen
            self.sequence= []
            for i in range(self.start, self.end+1):
                contents= reader.getPage(i).extractText()
                self.sequence.append(contents)

        else:
            #url to open access pdf file of a political science textbook
            url= "https://web.ung.edu/media/university-press/public-policy.pdf?t=1661449833017"
            r= requests.get(url)
            f= io.BytesIO(r.content)
            reader= PdfFileReader(f, strict=False)
            #Chapter 2 of textbook
            start= 21
            end= 40
            self.sequence= []
            for i in range(start, end+1):
                contents= reader.getPage(i).extractText()
                self.sequence.append(contents)

    def predict(self):
        #choose model that performs coversation summarization, trained on samsum data
        checkpoint= 'amaibrah/long_doc_summarizer'
    
        model= AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
        tokenizer= AutoTokenizer.from_pretrained(checkpoint)

        inputs= tokenizer(self.sequence, padding= True, return_tensors="pt")
        #return summary that is no more than 1000 words and no less than 250
        outputs= model.generate(inputs['input_ids'], max_length=1000, min_length= 250, length_penalty= 2.0, num_beams= 10, early_stopping= True)
        summary= tokenizer.decode(outputs[0], skip_special_tokens= True)

        print('Summary: ')
        print(summary)

In [20]:
long_doc_summarization(demo=True).predict()

Downloading:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/389 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/280 [00:00<?, ?B/s]

Summary: 
The Articles of Confederation were introduced in 1776 as an attempt to create a new, permanent government in the American colonies. By 1781, the Articles had been ratified and had officially become the law of the land. Fearing a return to the oppressive policies associated with a strong unitary government, Americans were reluctant to give too much power to their new central government. The framers opted for a confederate government where policy making power was placed in the hands of local (state) governments. The U.S. Constitution, written in 1787, created the new federal government and established the foundation for federal government policy making. The problem the framers of the Constitution faced was power. How much political power should be given to the government, and where should it be placed? Not enough political power in the central government could lead to anarchy and the tyranny of the majority, that is, when a majority controls a representative. Stop and Think: Wh

# Zero-shot Classification Model

The main goal of zero-shot classification is to be able to classify text without using any labeled data and without seeing labelled text. Zero-shot classification models can be used on text of a different domain that is was not initially trained on. This makes these types of models best for generalized topics overall.

In [17]:
#using HuggingFace's pipeline function to show model's performance
classifier= pipeline(task="zero-shot-classification", device=0, model="joeddav/xlm-roberta-large-xnli")

input_sequence = "I love traveling"
label_candidate = ['travel', 'cooking', 'entertainment', 'dancing', 'technology']
classifier(input_sequence, label_candidate)

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'sequence': 'I love traveling',
 'labels': ['travel', 'entertainment', 'technology', 'dancing', 'cooking'],
 'scores': [0.9668570756912231,
  0.02837473899126053,
  0.002383037470281124,
  0.0014391953591257334,
  0.0009460083092562854]}

The result above shows the label that is most likely to be related out of the list of labels. This is a single-label version of zero-shot classification.

Zero-shot classification can also be used for multi-label classification where it tests the probablity of each label being similar to the statement or input. Each probability score is calculated independently instead of together.

This model was pretrained on multiple languages as listed below. The statement and labels do not have to be in the same language for this model!

The languages include: English, French, Spanish, German, Greek, Bulgarian, Russian, Turkish, Arabic, Vietnamese, Thai, Chinese, Hindi, Swahili, and Urdu

In [34]:
class zero_shot_model:
    def __init__(self, demo=False):
        #can accept input
        if not demo:
            self.statement= input('What would you like to classify?')
            #the input can be a list of words separated by commas
            self.labels= [str(x) for x in input("Please list the labels you would like to use for this classification. Use a comma to separate each word: ").split(', ')]

        else:
        #automatic inputs if demo is selected
            self.statement= "I love traveling"
            self.labels= ['travel', 'cooking', 'entertainment', 'dancing', 'technology']

    def predict(self):
        #select pretrained model
        checkpoint= 'joeddav/xlm-roberta-large-xnli'

        model= AutoModelForSequenceClassification.from_pretrained(checkpoint)
        tokenizer= AutoTokenizer.from_pretrained(checkpoint)

        #create empty dictionary that will be addended using the loop below
        output={}
        for label in self.labels:
            x= tokenizer.encode(self.statement, label, return_tensors='pt', truncation_strategy='only_first')
            logits= model(x)[0]
            #compute probability of how related each label is to the statement
            entail_contradiction_logits= logits[:, [0,2]]
            probs= entail_contradiction_logits.softmax(dim=1)
            output[label]= float(probs[:,1])

        #create function that returns labels with probability in descending order
        def sort(d, reverse=False):
            return dict(sorted(d.items(), key= lambda x: x[1], reverse=reverse))

        print(self.statement)
        print('')
        print(sort(output, True))


In [35]:
zero_shot_model(demo=True).predict()

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


I love traveling

{'travel': 0.9993754029273987, 'entertainment': 0.922992467880249, 'cooking': 0.33314982056617737, 'dancing': 0.1990966647863388, 'technology': 0.043315738439559937}
