In [1]:
# Based on Huggingface interface
# - https://huggingface.co/transformers/notebooks.html
# - https://github.com/huggingface/notebooks/blob/master/transformers_doc/quicktour.ipynb
# - More examples with LLMs at: https://github.com/biplav-s/course-tai/blob/3a37536b00a0b386d32cb29da61b1ce68f72cfdb/sample-code/l13-l16-supervised-text/l15-langmodel-commontasks.ipynb

In [2]:
# Transformers installation, if needed
#! pip install transformers datasets

In [3]:
# Using Huggingface pipeline abstraction for common tasks
# - Pipleine: https://huggingface.co/docs/transformers/main_classes/pipelines
# - Summarization: https://huggingface.co/docs/transformers/main/en/task_summary#summarization

In [4]:
# Note: large downloads are done first time a pipeline of a type is run
#       - Approx 2GB in this case 
from transformers import pipeline

summarizer = pipeline("summarization")

In [5]:
ARTICLE = """ New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York.
A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband.
Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other.
In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage.
Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the
2010 marriage license application, according to court documents.
Prosecutors said the marriages were part of an immigration scam.
On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to her attorney, Christopher Wright, who declined to comment further.
After leaving court, Barrientos was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New York subway through an emergency exit, said Detective
Annette Markowski, a police spokeswoman. In total, Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002.
All occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be married to four men, and at one time, she was married to eight men at once, prosecutors say.
Prosecutors said the immigration scam involved some of her husbands, who filed for permanent residence status shortly after the marriages.
Any divorces happened only after such filings were approved. It was unclear whether any of the men will be prosecuted.
The case was referred to the Bronx District Attorney\'s Office by Immigration and Customs Enforcement and the Department of Homeland Security\'s
Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt, Turkey, Georgia, Pakistan and Mali.
Her eighth husband, Rashid Rajput, was deported in 2006 to his native Pakistan after an investigation by the Joint Terrorism Task Force.
If convicted, Barrientos faces up to four years in prison.  Her next court appearance is scheduled for May 18.
"""


In [6]:
# In below, a tokenizer and a pre-trained model is chosen by default

In [7]:
print(summarizer(ARTICLE, max_length=130, min_length=30, do_sample=False))

  beam_id = beam_token_id // vocab_size


[{'summary_text': ' Liana Barrientos, 39, is charged with two counts of "offering a false instrument for filing in the first degree" In total, she has been married 10 times, with nine of her marriages occurring between 1999 and 2002 . At one time, she was married to eight men at once, prosecutors say .'}]


In [8]:
# Utility function to calculate words in a string as well as 
#   given percentage limit. Returns two number
#   Credit for idea: https://www.pythonpool.com/python-count-words-in-string/ 
def calculate_words_limits_in_article(str, perc_limit):
    num = len(str.strip().split(" "))
    return num, (int) (num * (perc_limit / 100))

In [9]:
article_size, p = calculate_words_limits_in_article(ARTICLE, 10)
print(article_size, p)

354 35


In [10]:
# Utility function to summarize content based on given percentage
# Minimum 30 words will be used. 
def summarize_given_perc(ARTICLE, perc):
    article_size, wsize = calculate_words_limits_in_article(ARTICLE, perc)
    print(f"article size = {article_size}, calculated word limit = {wsize}")
    return summarizer(ARTICLE, max_length=wsize, min_length=30, do_sample=False)

In [11]:
print(summarize_given_perc(ARTICLE, 50))

article size = 354, calculated word limit = 177
[{'summary_text': ' Liana Barrientos, 39, is charged with two counts of "offering a false instrument for filing in the first degree" In total, she has been married 10 times, with nine of her marriages occurring between 1999 and 2002 . At one time, she was married to eight men at once, prosecutors say .'}]


In [12]:
print(summarize_given_perc(ARTICLE, 10))

article size = 354, calculated word limit = 35
[{'summary_text': ' Liana Barrientos, 39, is charged with two counts of "offering a false instrument for filing in the first degree" In total, she has'}]


In [13]:
# From: https://github.com/biplav-s/course-nl/blob/master/common-data/Example-TDBank-PersonalAcctAgree.txt

ARTICLE = """ Returned Checks/Waiver of Rights 
If you deposit a check or item in your Account that the drawee 
bank returns unpaid for any reason (called “dishonor”), we may 
put the check or item through for collection again. This means 
that you are waiving your right to receive immediate notice of 
dishonor. If the check or item is dishonored for any reason, the 
amount of the dishonored check or item will be deducted from 
your Account. You agree to pay the Bank a fee for any such 
check or item that is dishonored (see Personal Fee Schedule). The 
Bank may also collect any amounts due to the Bank because of 
returned checks, through the right of set-off, from any other of 
your Accounts at the Bank, or collect the funds directly from you.
"""


In [14]:
print(summarize_given_perc(ARTICLE, 50))

article size = 131, calculated word limit = 65
[{'summary_text': ' If you deposit a check or item in your Account that the drawee  purposefullybank returns unpaid for any reason (called “dishonor”), we may  put the check through for collection again . This means  that you are waiving your right to receive immediate notice of “'}]


In [15]:
print(summarize_given_perc(ARTICLE, 10))

article size = 131, calculated word limit = 13
[{'summary_text': ' If you deposit a check or item in your Account,'}]


In [16]:
# From: https://github.com/biplav-s/course-nl/blob/master/common-data/Example-TDBank-PersonalAcctAgree.txt

ARTICLE = '''
For (i) Checking Accounts and (ii) Money Market Accounts 
with check access, items are processed as follows: 

a) First, items, including both deposits and withdrawals, are 
added to and deducted from your available Account balance 
in chronological date and time order based on the information 
that we receive for each item. The following transaction fees 
also will be deducted in date and time order based on when 
they are assessed: wire transfer fees, deposit return fees, 
returned item fees, and overdraft fees. For some items, we do 
not receive date and time information. We assign these items 
a date and time, which may vary from when the transactions 
were conducted.  All checks drawn upon your account that 
are not cashed at a TD Bank Store are assigned a time of 
11pm on the date we receive them.  If multiple items have the 
same date and time information, they will be processed in the 
following order: (i) deposits first; (ii) checks drawn upon your 
account next, from lowest to highest check number, and then 
(iii) other withdrawals, from lowest to highest dollar amount. 
For purposes of this section (a), withdrawals include 
transactions that have been presented for payment as well as 
pending debit card, ATM or electronic transactions that have 
been authorized but not yet presented to us for payment. 
Please see the additional details below for more information 
regarding pending transactions. Deposits are made available 
to you in accordance with our Funds Availability Policy. 

b) Second, we add to or deduct from your available Account 
balance any interest credits or fees not described in (a) above. 
Examples of these fees include non-TD ATM fees, monthly 
maintenance fees, and overdraft protection transfer fees.  

For (i) Savings Accounts, (ii) Money Market Accounts with 
no check access, and (iii) CD Accounts, items are processed 
as follows: 

a) First, deposits that have become available to you that Business 
Day in accordance with our Funds Availability Policy are added 
to your available Account balance. 

b) Next, the total amount of any “pending” debit card, ATM and 
other electronic transactions that have been authorized but 
not yet presented to us for payment is deducted from your 
available Account balance. Please see the additional details 
below for more information regarding pending transactions. 

c) We then deduct items from your available Account balance 
by category, in the following order: 

i. Outgoing wire transfers, return deposit items, and debit 
adjustments to your available Account balance; 

ii. Overdraft fees, other returned item fees, and deposit 
return fees; 

iii. All other Account fees (except as described in (iv) below), 
and all other items including checks, ATM transactions, 
and debit card transactions; and 

iv. Fees assessed at the end of the statement cycle including, 
for example but not limited to, monthly maintenance fees. 

Within categories i, ii, and iii, we post items in order from 
highest to lowest dollar amount. 

Additional details regarding pending transactions for 
all Accounts: 

When you use a debit card, ATM card, or other electronic means 
to make withdrawals, we may receive notice of the transaction 
before it is actually presented to us for payment. That notice 
may be in the form of a merchant authorization request or 
other electronic inquiry. Upon receipt of such notice, we treat 

the transaction as “pending” at the time we receive notice, 
and subject to certain exceptions, we deduct the amount of the 
pending transactions from your available Account balance to 
determine the amount available to pay other items presented 
against your Account. The amount of a pending transaction 
may not be equal to the amount of the actual transaction that is 
subsequently presented for payment and posted to your Account. 
If a pending transaction is not presented for payment within three 
(3) Business Days after we receive notice of the transaction, we 
will release the amount of the pending transaction. We do not 
deduct the amount of pending debit card transactions from your 
available Account balance for certain categories of merchants 
that frequently request authorization for amounts in excess of 
the likely transaction amount, including hotels and resorts, airlines 
and cruise lines, car rental companies, and automated gas pumps 
(pay at the pump). 
 '''


In [17]:
print(summarize_given_perc(ARTICLE, 10))

article size = 695, calculated word limit = 69
[{'summary_text': ' Deposits are made available to you in accordance with our Funds Availability Policy . Items are processed as follows: deposits and withdrawals are added to and deducted from your available Account balance . The total amount of any “pending” debit card, ATM and other electronic transactions that have been authorized but not yet presented to us for'}]
