In [1]:
!pip install transformers datasets evaluate rouge_score

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24936 sha256=190c811073348cec39febcc72e1e9908623326cfb230967877cf6b49a75152e7
  Stored in directory: /root/.cache/pip/wheels/b0/3f/ac/cc3bc304f50c77ef38d79d8e4e2684313de39af543cb4eb3da
Successfully built rouge_score
Installing collected packages: rouge_score, evaluate
Successfully installed evaluate-0.4.1 rouge_score-0.1.2
[0m

In [2]:
import numpy as np
import pandas as pd
from pprint import pprint
from tqdm.notebook import tqdm
from datasets import load_dataset, Dataset, DatasetDict
from sklearn.utils import shuffle
import evaluate
from transformers import AutoTokenizer, T5Config, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Prepare dataset

### Creating directories

In [3]:
DIR="t5"

!mkdir -p "{DIR}/dataset"
!mkdir -p "{DIR}/model"
!mkdir -p "{DIR}/tokenizer"

!pwd
!ls -r "{DIR}"

/notebooks/custom
tokenizer  model  dataset


### Getting Dataset

In [4]:
# Load the BillSum dataset through the `datasets` library.
dataset = load_dataset("billsum")

# Hold out 20% of the original train split, then cut that hold-out in half
# (two 10% parts). seed=20 makes both splits reproducible.
train_test_valid = dataset["train"].train_test_split(test_size=0.2, seed=20)
test_valid = train_test_valid["test"].train_test_split(test_size=0.5, seed=20)

Downloading builder script:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/832 [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset billsum/default (download: 64.14 MiB, generated: 259.80 MiB, post-processed: Unknown size, total: 323.94 MiB) to /root/.cache/huggingface/datasets/billsum/default/3.0.0/d1e95173aed3acb71327864be74ead49b578522e4c7206048b2f2e5351b57959...


Downloading data:   0%|          | 0.00/67.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/18949 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3269 [00:00<?, ? examples/s]

Generating ca_test split:   0%|          | 0/1237 [00:00<?, ? examples/s]

Dataset billsum downloaded and prepared to /root/.cache/huggingface/datasets/billsum/default/3.0.0/d1e95173aed3acb71327864be74ead49b578522e4c7206048b2f2e5351b57959. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
# Regroup the pieces into one DatasetDict: the 80% part is "train", and the two
# halves of the hold-out become "test" and "valid".
ds_ = DatasetDict({
        "train": train_test_valid["train"],
        "test": test_valid["test"],
        "valid": test_valid["train"]
      })

In [6]:
# Sanity check: list every split with its columns and row count.
print(ds_)

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 15159
    })
    test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 1895
    })
    valid: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 1895
    })
})


### Visualizing the dataset in a DataFrame

In [7]:
# Never truncate cell contents when a frame is displayed.
pd.set_option("display.max_colwidth", None)

# Column layout shared by the three per-split frames.
columns=["text", "summary", "title"]

# Three independent, empty frames; they are filled in by later cells.
df_train, df_validation, df_test = (
    pd.DataFrame(columns=columns) for _ in range(3)
)

In [13]:
def populate_dataframe(dataset: "Dataset", dataframe: pd.DataFrame, type_dataset: str) -> None:
    """Copy every example of ``dataset`` into ``dataframe``, in place.

    Each example is written to the row labelled with its position in
    ``dataset`` (0, 1, 2, ...), so an empty frame ends up with one row per
    example. Progress is reported through a tqdm bar.

    Args:
        dataset: Iterable of examples, each indexable by the keys ``"text"``,
            ``"summary"`` and ``"title"`` (in this notebook, a ``Dataset`` split).
        dataframe: Frame to fill. Values are assigned positionally, so its
            columns must be ordered text, summary, title. Modified in place.
        type_dataset: Split name; only used in the progress message.

    Returns:
        None. The result is the mutated ``dataframe``.
    """
    print(f"Populating {type_dataset} dataset...")
    for index, val in enumerate(tqdm(dataset)):
        # NOTE(review): growing a frame one row at a time is likely slow on large
        # splits; kept because callers rely on the frame being filled in place.
        dataframe.loc[index] = [val["text"], val["summary"], val["title"]]

In [14]:
# Fill df_train from the "train" split, then print the frame's (rows, columns) shape.
populate_dataframe(dataset=ds_["train"], dataframe=df_train, type_dataset="train")
print(f"number of instances: {df_train.shape}")

Populating train dataset...


  0%|          | 0/15159 [00:00<?, ?it/s]

number of instances: (15159, 3)


In [15]:
# Fill df_validation from the "valid" split, then print the frame's (rows, columns) shape.
populate_dataframe(dataset=ds_["valid"], dataframe=df_validation, type_dataset="validation")
print(f"number of instances: {df_validation.shape}")

Populating validation dataset...


  0%|          | 0/1895 [00:00<?, ?it/s]

number of instances: (1895, 3)


In [16]:
# Fill df_test from the "test" split, then print the frame's (rows, columns) shape.
populate_dataframe(dataset=ds_["test"], dataframe=df_test, type_dataset="test")
print(f"number of instances: {df_test.shape}")

Populating test dataset...


  0%|          | 0/1895 [00:00<?, ?it/s]

number of instances: (1895, 3)


In [17]:
# Shuffle the rows of every frame. A fixed seed (the same value used for the
# dataset splits) makes the row order, and the CSV files written from these
# frames, identical across runs.
SHUFFLE_SEED = 20
df_train = shuffle(df_train, random_state=SHUFFLE_SEED)
df_validation = shuffle(df_validation, random_state=SHUFFLE_SEED)
df_test = shuffle(df_test, random_state=SHUFFLE_SEED)

# Shuffling only reorders rows, so the shapes must match the ones printed earlier.
print(f"Train dataframe shape: {df_train.shape}")
print(f"Validation dataframe shape: {df_validation.shape}")
print(f"Test dataframe shape: {df_test.shape}")

Train dataframe shape: (15159, 3)
Validation dataframe shape: (1895, 3)
Test dataframe shape: (1895, 3)


### Sample train

In [18]:
# Display the first row of the training frame as a sample.
df_train.head(1)

Unnamed: 0,text,summary,title
3247,"SECTION 1. SHORT TITLE.\n\n This Act may be cited as the ``Law Enforcement Abuse Transparency \nand Accountability Act of 2006''.\n\nSEC. 2. FINDINGS.\n\n Congress finds the following:\n (1) American law enforcement has a fundamental duty to \n provide compassion, aid, and protection and safety to the \n people it serves.\n (2) A primary function of law enforcement is to preserve \n life, regardless of the race, ethnicity, religion, social or \n economic standing, sexual preference, or country of origin of \n the individuals involved.\n (3) Over many years, thousands of cases of State and local \n law enforcement agency violations of suspects, detainees, and \n prisoners went inadequately addressed across the United States, \n especially in African-American communities and other \n communities of color or poverty.\n (4) In recent years, procedures, training, and public \n oversight have failed to significantly curb or eliminate abuses \n and murders of innocent suspects and citizens at the hands of \n officers of the law, to fully investigate claims of excessive \n use of force, or to adequately reprimand, punish, or remove \n such offenders or their superiors.\n (5) A special prosecutor in Chicago has been investigating \n a police abuse ring that operated over three decades with \n impunity, responsible for torturing over 200 African-American \n males in their custody at the Area 2 and Area 3 police \n headquarters.\n (6) Since the 1997 New York Police Department torture of \n Abner Louima, and the fatal shooting of Amadou Bailo Diallo in \n 1999, public and media attention concerning abuse of power and \n the unjustified and improper use of force by law enforcement in \n communities of color has continuously increased, as have the \n number of outraged community responses and the demands for \n external oversight of police practices.\n (7) More recently, police in Chicago on November 25, 2006, \n police officers in New York City shot 50 times and killed an 
\n unarmed man, Sean Bell. The next day, a community rally \n protested the police action and called for the removal of \n Police Commissioner Raymond Kelly.\n (8) Fatal shootings and abuse of suspects and prisoners \n have come to light again recently in other cities, including \n Atlanta, Georgia, and DeKalb County, Georgia.\n (9) Every major State and local city law enforcement agency \n receives and depends on some level of Federal funding, \n training, grants, or assistance, paid for primarily from the \n tax revenues of the citizens being abused.\n (10) The common and continuing unaccountable behavior and \n silence of members of law enforcement agencies regarding these \n abuses are a disgrace to the efforts of law enforcement \n agencies throughout the United States and should not be \n tolerated.\n (11) The lack of transparency, oversight, community \n involvement, independent review and investigation, and \n consequences to the law enforcement violators makes continuing \n abuse more likely, and must be reversed by denying Federal \n funding to any law enforcement agency that fails to establish a \n minimum of professional training and procedures of engagement; \n that tolerates abuses or fatal use of excessive force; that \n fails to operate under rules of transparency and community \n oversight, investigation, and review; that fails to discipline, \n remove, or otherwise hold accountable any perpetrators acting \n under the color of law enforcement; or that refuses to fairly \n hear each case or allegation of possible abuse or excessive use \n of force by law enforcement officers, reviewed by an \n established and independent forum.\n\nSEC. 3. SENSE OF CONGRESS.\n\n It is the sense of Congress that the dishonorable actions referred \nto in section 2 should be independently investigated, recorded, and \ncondemned.\n\nSEC. 4. 
INELIGIBILITY FOR FEDERAL ASSISTANCE.\n\n (a) In General.--During the 1-year period beginning on the date of \nenactment of this Act, or until transparency and accountability are \nfully restored, law enforcement agencies that do not have established \nprocedures for independent oversight and review, or do not hold \nviolations by police of excessive use of force, torture, or \nmanslaughter accountable, shall be ineligible to participate in any \nFederal program, whether by funding, assistance, contract, grant, \npersonnel support, or otherwise.\n (b) Licenses.--During the 1-year period beginning on the date of \nenactment of this Act, any Federal license issued to any such law \nenforcement agencies shall be suspended, or until transparency and \naccountability are fully restored.\n (c) Equipment.--Law enforcement agencies identified as carrying out \nabuses or wrongful deaths, without consequence or open public review or \nallegations, shall immediately return all federally-owned equipment in \nthe possession or use of such law enforcement agencies to the \nappropriate Federal agency.\n\nSEC. 5. 
LAW ENFORCEMENT AGENCIES DEFINED.\n\n In this Act, the term ``law enforcement agencies'' means the \nfollowing entities in any State or locality within the control and \njurisdiction of the United States receiving Federal funds for training, \nequipment, or other support.","Law Enforcement Abuse Transparency and Accountability Act of 2006 - Expresses the sense of Congress that incidences of law enforcement abuse of citizens should be independently investigated, recorded, and condemned.\n\nRenders state or local law enforcement agencies receiving assistance from the federal government ineligible for further assistance, licenses, or federally-owned equipment until such agencies establish procedures for independent oversight and review and hold police officers accountable for excessive use of force, torture, or manslaughter.","To deny Federal assistance to any State or local law enforcement agencies whose officers use excessive force or violence leading to the death of innocent or unarmed citizens, or who fail to establish, enforce and follow transparent and accountable procedures that fully protect the lives and health of citizens during surveillance, interrogation, arrest or imprisonment from torture, excessive physical or psychological abuse and death, and to require a system of transparent legal and public review of such allegations and cases that can result in the sanction, punishment and removal of officers who perpetrate such abuses or their superiors."


### Sample validation

In [19]:
# Display the first row of the validation frame as a sample.
df_validation.head(1)

Unnamed: 0,text,summary,title
1842,"SECTION 1. SHORT TITLE.\n\n This Act may be cited as the ``Help Separated Families Act of \n2012''.\n\nSEC. 2. IMMIGRATION STATUS ALONE NOT A DISQUALIFICATION FROM BEING A \n PLACEMENT FOR A FOSTER CHILD.\n\n Section 471(a)(19) of the Social Security Act (42 U.S.C. \n671(a)(19)) is amended--\n (1) by striking ``(19) provides that the State'' and \n inserting the following:\n ``(19) provides that--\n ``(A) the State''; and\n (2) by adding after and below the end the following:\n ``(B) such standards shall ensure that the \n immigration status alone of a parent, legal guardian, \n or relative shall not disqualify the parent, legal \n guardian, or relative from being a placement for a \n child;''.\n\nSEC. 3. STATE PLAN REQUIREMENT TO ACCEPT CERTAIN DOCUMENTS ISSUED BY \n FOREIGN ENTITIES AS SUFFICIENT IDENTIFICATION FOR \n PURPOSES OF INITIATING A CRIMINAL RECORDS CHECK OR A \n FINGERPRINT-BASED CHECK.\n\n Section 471(a)(20) of the Social Security Act (42 U.S.C. \n671(a)(20)) is amended--\n (1) in subparagraph (A), by inserting ``which procedures \n shall require the State (including the State agency, the child \n welfare agency of any county or other political subdivision of \n the State, and caseworkers and supervisors of any such agency) \n to accept a foreign consulate identification card, a foreign \n passport, or such other foreign identification document as may \n be allowed in regulations prescribed by the Secretary, as \n sufficient identification for purposes of initiating a criminal \n records check or a fingerprint-based check,'' before \n ``including procedures''; and\n (2) in subparagraph (C), by inserting ``, which procedures \n shall require the State (including the State agency, the child \n welfare agency of any county or other political subdivision of \n the State, and caseworkers and supervisors of any such agency) \n to accept a foreign consulate identification card, a foreign \n passport, or such other foreign identification document as may 
\n be allowed in regulations prescribed by the Secretary, as \n sufficient identification for purposes of initiating a criminal \n records check or a fingerprint-based check'' before the \n semicolon.\n\nSEC. 4. STATE CHILD WELFARE AGENCIES ENCOURAGED TO GRANT WAIVERS OF \n REQUIREMENTS THAT WOULD PREVENT A CHILD FROM BEING PLACED \n WITH A RELATIVE ON THE BASIS OF A MINOR LEGAL INFRACTION \n BY THE RELATIVE.\n\n It is the sense of the Congress that the child welfare agency of a \nState, or of any county or other political subdivision of a State, \nshould grant a waiver of any requirement which would prevent the \nplacement of a child with a relative of the child, on the basis of a \nminor legal infraction, if the relative would otherwise be considered \neligible for such a placement.\n\nSEC. 5. STATE PLAN REQUIREMENT TO NOTIFY RELATIVES SEEKING PLACEMENT OF \n A CHILD THAT THEIR IMMIGRATION STATUS WILL NOT BE \n QUESTIONED.\n\n Section 471(a)(29) of the Social Security Act (42 U.S.C. \n671(a)(29)) is amended--\n (1) by striking ``and'' at the end of subparagraph (C);\n (2) by adding ``and'' at the end of subparagraph (D); and\n (3) by adding at the end the following:\n ``(E) the immigration status of any such relative \n seeking placement of the child with the relative shall \n not be questioned, except to the extent necessary in \n determining eligibility for relevant services or \n programs;''.\n\nSEC. 6. PROHIBITION ON STATE FILING FOR TERMINATION OF PARENTAL RIGHTS \n IN FOSTER CARE CASES IN WHICH OTHERWISE FIT AND WILLING \n PARENT OR RELATIVE HAS BEEN DEPORTED OR IS INVOLVED IN AN \n IMMIGRATION PROCEEDING, UNLESS CERTAIN CONDITIONS HAVE \n BEEN MET.\n\n Section 475(5)(E) of the Social Security Act (42 U.S.C. 
675(5)(E)) \nis amended by adding after and below the end the following flush text:\n ``except that the State, and a county or other political \n subdivision of the State, shall not file (or join in the filing \n of such a petition) based on the removal of the parent from the \n United States or the involvement of the parent in (including \n detention pursuant to) an immigration proceeding, unless (I) \n the State (or the county or other political subdivision of the \n State, as the case may be) has made reasonable efforts to \n identify, locate, and contact any parent of the child, who has \n been removed from the United States, and any adult relative of \n the child, referred to in section 471(a)(29), including through \n the diplomatic or consular offices of the country to which the \n parent was removed, to notify such a parent or relative of the \n intent of the State (or the county or other political \n subdivision of the State, as the case may be) to file (or join \n in the filing of) such a petition, and to reunify the child \n with any such parent or relative; or (II) the parent is unfit \n or unwilling to be a parent of the child;''.\n\nSEC. 7. 
EFFECTIVE DATE.\n\n (a) In General.--The amendments made by this Act shall take effect \non the 1st day of the 1st fiscal year beginning on or after the date of \nthe enactment of this Act, and shall apply to payments under part E of \ntitle IV of the Social Security Act for calendar quarters beginning on \nor after such date.\n (b) Delay Permitted if State Legislation Required.--If the \nSecretary of Health and Human Services determines that State \nlegislation (other than legislation appropriating funds) is required in \norder for a State plan approved under part E of title IV of the Social \nSecurity Act to meet the additional requirements imposed by the \namendments made by this Act, the plan shall not be regarded as failing \nto meet any of the additional requirements before the 1st day of the \n1st calendar quarter beginning after the 1st regular session of the \nState legislature that begins after the date of the enactment of this \nAct. For purposes of the preceding sentence, if the State has a 2-year \nlegislative session, each year of the session is deemed to be a \nseparate regular session of the State legislature.","Help Separated Families Act of 2012 - Amends part E (Foster Care and Adoption Assistance) of title IV of the Social Security Act to: (1) require state child protection standards to ensure that the immigration status alone of a parent, legal guardian, or relative shall not disqualify the parent, legal guardian, or relative from being a placement for a child; and (2) require the state procedures for criminal records checks to require the state to accept foreign identification documents as sufficient identification for purposes of initiating a criminal records check or a fingerprint-based check.\n\nExpresses the sense of Congress that the child welfare agency of a state, or of any county or other political subdivision of a state, should grant a waiver of any requirement which would prevent the placement of a child with a relative of the child, 
on the basis of a minor legal infraction, if the relative would otherwise be considered eligible for such a placement.\n\nRequires the state plan for foster care and adoption assistance to notify relatives seeking placement of a child that their immigration status will not be questioned, except to the extent necessary in determining eligibility for relevant services or programs.\n\nProhibits a state or local government agency from filing for termination of parental rights in foster care cases based on the removal of the parent from the United States or the parent's involvement in an immigration proceeding, unless: (1) the state (or local agency) has made reasonable efforts to notify of the intention to file such a petition any parent of the child who has been removed from the United States, and any adult relative of the child, including through the diplomatic or consular offices of the country to which the parent was removed, and to reunify the child with any such parent or relative; or (2) the parent is unfit or unwilling to be a parent of the child.","To amend part E of title IV of the Social Security Act to ensure that immigration status alone does not disqualify a parent, legal guardian, or relative from being a placement for a foster child, to prohibit a State, county, or other political subdivision of a State from filing for termination of parental rights in foster care cases in which an otherwise fit and willing parent or legal guardian has been deported or is involved in (including detention pursuant to) an immigration proceeding, unless certain conditions have been met, and for other purposes."


### Sample test

In [20]:
# Display the first row of the test frame as a sample.
df_test.head(1)

Unnamed: 0,text,summary,title
755,"SECTION 1. SHORT TITLE.\n\n This Act may be cited as the ``Unlawful Internet Gambling Funding \nProhibition Act''.\n\nSEC. 2. FINDINGS.\n\n The Congress finds as follows:\n (1) Internet gambling is primarily funded through personal \n use of bank instruments, including credit cards and wire \n transfers.\n (2) The National Gambling Impact Study Commission in 1999 \n recommended the passage of legislation to prohibit wire \n transfers to Internet gambling sites or the banks which \n represent them.\n (3) Internet gambling is a major cause of debt collection \n problems for insured depository institutions and the consumer \n credit industry.\n (4) Internet gambling conducted through offshore \n jurisdictions has been identified by United States law \n enforcement officials as a significant money laundering \n vulnerability.\n\nSEC. 3. POLICIES AND PROCEDURES REQUIRED TO PREVENT PAYMENTS FOR \n UNLAWFUL INTERNET GAMBLING.\n\n (a) Regulations.--Before the end of the 6-month period beginning on \nthe date of the enactment of this Act, the Federal functional \nregulators shall prescribe regulations requiring any designated payment \nsystem to establish policies and procedures reasonably designed to \nidentify and prevent restricted transactions in any of the following \nways:\n (1) The establishment of policies and procedures that--\n (A) allow the payment system and any person \n involved in the payment system to identify restricted \n transactions by means of codes in authorization \n messages or by other means; and\n (B) block restricted transactions identified as a \n result of the policies and procedures developed \n pursuant to subparagraph (A).\n (2) The establishment of policies and procedures that \n prevent the acceptance of the products or services of the \n payment system in connection with a restricted transaction.\n (b) Requirements for Policies and Procedures.--In prescribing \nregulations pursuant to subsection (a), the Federal functional 
\nregulators shall--\n (1) identify types of policies and procedures, including \n nonexclusive examples, which would be deemed to be ``reasonably \n designed to identify'' and ``reasonably designed to block'' or \n to ``prevent the acceptance of the products or services'' with \n respect to each type of transaction, such as, should credit \n card transactions be so designated, identifying transactions by \n a code or codes in the authorization message and denying \n authorization of a credit card transaction in response to an \n authorization message;\n (2) to the extent practical, permit any participant in a \n payment system to choose among alternative means of identifying \n and blocking, or otherwise preventing the acceptance of the \n products or services of the payment system or participant in \n connection with, restricted transactions; and\n (3) consider exempting restricted transactions from any \n requirement under subsection (a) if the Federal functional \n regulators find that it is not reasonably practical to identify \n and block, or otherwise prevent, such transactions.\n (c) Compliance With Payment System Policies and Procedures.--A \ncreditor, credit card issuer, financial institution, operator of a \nterminal at which an electronic fund transfer may be initiated, money \ntransmitting business, or international, national, regional, or local \nnetwork utilized to effect a credit transaction, electronic fund \ntransfer, or money transmitting service, or a participant in such \nnetwork, meets the requirement of subsection (a) if--\n (1) such person relies on and complies with the policies \n and procedures of a designated payment system of which it is a \n member or participant to--\n (A) identify and block restricted transactions; or\n (B) otherwise prevent the acceptance of the \n products or services of the payment system, member, or \n participant in connection with restricted transactions; \n and\n (2) such policies and procedures of the 
designated payment \n system comply with the requirements of regulations prescribed \n under subsection (a).\n (d) Enforcement.--\n (1) In general.--This section shall be enforced by the \n Federal functional regulators and the Federal Trade Commission \n under applicable law in the manner provided in section 505(a) \n of the Gramm-Leach-Bliley Act.\n (2) Factors to be considered.--In considering any \n enforcement action under this subsection against any payment \n system, or any participant in a payment system that is a \n creditor, credit card issuer, financial institution, operator \n of a terminal at which an electronic fund transfer may be \n initiated, money transmitting business, or international, \n national, regional, or local network utilized to effect a \n credit transaction, electronic fund transfer, or money \n transmitting service, or a participant in such network, the \n Federal functional regulators and the Federal Trade Commission \n shall consider the following factors:\n (A) The extent to which such person is extending \n credit or transmitting funds knowing the transaction is \n in connection with unlawful Internet gambling.\n (B) The history of such person in extending credit \n or transmitting funds knowing the transaction is in \n connection with unlawful Internet gambling.\n (C) The extent to which such person has established \n and is maintaining policies and procedures in \n compliance with regulations prescribed under this \n subsection.\n (D) The feasibility that any specific remedy \n prescribed can be implemented by such person without \n substantial deviation from normal business practice.\n (E) The costs and burdens the specific remedy will \n have on such person.\n\nSEC. 4. 
DEFINITIONS.\n\n For purposes of this Act, the following definitions shall apply:\n (1) Restricted transaction.--The term ``restricted \n transaction'' means any transaction or transmittal to any \n person engaged in the business of betting or wagering, in \n connection with the participation of another person in unlawful \n Internet gambling, of--\n (A) credit, or the proceeds of credit, extended to \n or on behalf of such other person (including credit \n extended through the use of a credit card);\n (B) an electronic fund transfer or funds \n transmitted by or through a money transmitting \n business, or the proceeds of an electronic fund \n transfer or money transmitting service, from or on \n behalf of the other person;\n (C) any check, draft, or similar instrument which \n is drawn by or on behalf of the other person and is \n drawn on or payable at or through any financial \n institution; or\n (D) the proceeds of any other form of financial \n transaction as the Federal functional regulators may \n prescribe by regulation which involves a financial \n institution as a payor or financial intermediary on \n behalf of or for the benefit of the other person.\n (2) Bets or wagers.--The term ``bets or wagers''--\n (A) means the staking or risking by any person of \n something of value upon the outcome of a contest of \n others, a sporting event, or a game subject to chance, \n upon an agreement or understanding that the person or \n another person will receive something of greater value \n than the amount staked or risked in the event of a \n certain outcome;\n (B) includes the purchase of a chance or \n opportunity to win a lottery or other prize (which \n opportunity to win is predominantly subject to chance);\n (C) includes any scheme of a type described in \n section 3702 of title 28, United States Code;\n (D) includes any instructions or information \n pertaining to the establishment or movement of funds in \n an account by the bettor or customer with the 
business \n of betting or wagering; and\n (E) does not include--\n (i) any activity governed by the securities \n laws (as that term is defined in section \n 3(a)(47) of the Securities Exchange Act of \n 1934) for the purchase or sale of securities \n (as that term is defined in section 3(a)(10) of \n such Act);\n (ii) any transaction conducted on or \n subject to the rules of a registered entity or \n exempt board of trade pursuant to the Commodity \n Exchange Act;\n (iii) any over-the-counter derivative \n instrument;\n (iv) any other transaction that--\n (I) is excluded or exempt from \n regulation under the Commodity Exchange \n Act; or\n (II) is exempt from State gaming or \n bucket shop laws under section 12(e) of \n the Commodity Exchange Act or section \n 28(a) of the Securities Exchange Act of \n 1934;\n (v) any contract of indemnity or guarantee;\n (vi) any contract for insurance;\n (vii) any deposit or other transaction with \n a depository institution (as defined in section \n 3(c) of the Federal Deposit Insurance Act);\n (viii) any participation in a simulation \n sports game or an educational game or contest \n that--\n (I) is not dependent solely on the \n outcome of any single sporting event or \n nonparticipant's singular individual \n performance in any single sporting \n event;\n (II) has an outcome that reflects \n the relative knowledge and skill of the \n participants with such outcome \n determined predominantly by accumulated \n statistical results of sporting events; \n and\n (III) offers a prize or award to a \n participant that is established in \n advance of the game or contest and is \n not determined by the number of \n participants or the amount of any fees \n paid by those participants; and\n (ix) any lawful transaction with a business \n licensed or authorized by a State.\n (3) Designated payment system defined.--The term \n ``designated payment system'' means any system utilized by any \n creditor, credit card issuer, financial 
institution, operator \n of a terminal at which an electronic fund transfer may be \n initiated, money transmitting business, or international, \n national, regional, or local network utilized to effect a \n credit transaction, electronic fund transfer, or money \n transmitting service, or any participant in such network, that \n the Federal functional regulators determine, by regulation or \n order, could be utilized in connection with, or to facilitate, \n any restricted transaction.\n (4) Federal functional regulator.--The term ``Federal \n functional regulator'' has the same meaning as in section \n 509(2) of the Gramm-Leach-Bliley Act.\n (5) Internet.--The term ``Internet'' means the \n international computer network of interoperable packet switched \n data networks.\n (6) Unlawful internet gambling.--The term ``unlawful \n Internet gambling'' means to place, receive, or otherwise \n transmit a bet or wager by any means which involves the use, at \n least in part, of the Internet where such bet or wager is \n unlawful under any applicable Federal or State law in the State \n in which the bet or wager is initiated, received, or otherwise \n made.\n (7) Other terms.--\n (A) Credit; creditor; and credit card.--The terms \n ``credit'', ``creditor'', and ``credit card'' have the \n meanings given such terms in section 103 of the Truth \n in Lending Act.\n (B) Electronic fund transfer.--The term \n ``electronic fund transfer''--\n (i) has the meaning given such term in \n section 903 of the Electronic Fund Transfer \n Act; and\n (ii) includes any fund transfer covered by \n Article 4A of the Uniform Commercial Code, as \n in effect in any State.\n (C) Financial institution.--The term ``financial \n institution''--\n (i) has the meaning given such term in \n section 903 of the Electronic Fund Transfer \n Act; and\n (ii) includes any financial institution, as \n defined in section 509(3) of the Gramm-Leach-\n Bliley Act.\n (D) Money transmitting business and money \n 
transmitting service.--The terms ``money transmitting \n business'' and ``money transmitting service'' have the \n meanings given such terms in section 5330(d) of title \n 31, United States Code.\n\nSEC. 5. COMMON SENSE RULE OF CONSTRUCTION.\n\n No provision of this Act shall be construed as altering, limiting, \nextending, changing the status of, or otherwise affecting any law \nrelating to, affecting, or regulating gambling within the United \nStates.\n\n Passed the House of Representatives June 10, 2003.\n\n Attest:\n\n JEFF TRANDAHL,\n\n Clerk.","Unlawful Internet Gambling Funding Prohibition Act - Directs Federal functional regulators, within six months of this Act's enactment, to prescribe regulations requiring any designated payment system to establish policies and procedures reasonably designed to: (1) identify restricted transactions by means of code in authorization messages or other means and to block such transactions; and (2) prevent the acceptance of the products or services of the payment system in connection with a restricted transaction. Defines a ""restricted transaction"" as any transaction or transmittal to anyone engaged in the business of betting or wagering in connection with another person's participation in unlawful Internet gambling of credit, electronic fund transfers, checks, or the proceeds of any other form of financial transaction as regulators may prescribe. Requires such regulators and the Federal Trade Commission to enforce such regulations (under applicable law in the manner provided under the Gramm-Leach-Bliley Act) after considering specified factors concerning the extent of the violation, the history of compliance, and the feasibility and burden of implementing a remedy. 
Requires regulators to consider exempting restricted transactions if the regulators find that it is not reasonably practical to identify and block, or otherwise prevent, such transactions.Provides that no provision of this Act shall be construed as affecting any law relating to, affecting, or regulating gambling within the United States.","To prevent the use of certain bank instruments for unlawful Internet gambling, and for other purposes."


### Saving datasets to CSV files

In [7]:
# Target folder (under DIR) and file names for the CSV copy of each split.
ds_path = f"{DIR}/dataset"
train_ds_file = "billsum_train.csv"
validation_ds_file = "billsum_validation.csv"
test_ds_file = "billsum_test.csv"

#### Save

In [22]:
# Write each split to its own CSV file; index=False leaves the row index out.
for split_frame, split_file in (
    (df_train, train_ds_file),
    (df_validation, validation_ds_file),
    (df_test, test_ds_file),
):
    split_frame.to_csv(f"{ds_path}/{split_file}", index=False)

# Preprocess

### Tokenization

In [6]:
# Name of the pretrained checkpoint; the tokenizer is loaded from it.
checkpoint = "t5-small"

In [7]:
# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading tokenizer_config.json:   0%|          | 0.00/2.27k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

In [8]:
def preprocess_function(examples, max_length_inputs=1024, max_length_labels=128):
    """Tokenize a batch of documents and their summaries for seq2seq training.

    Every entry of ``examples["text"]`` is prefixed with ``"summarize: "`` and
    tokenized, truncated to ``max_length_inputs`` tokens. Every entry of
    ``examples["summary"]`` is tokenized in target mode, truncated to
    ``max_length_labels`` tokens.

    Returns the tokenized inputs with the summary token ids stored under
    the ``"labels"`` key.
    """
    prefixed_docs = []
    for doc in examples["text"]:
        prefixed_docs.append("summarize: " + doc)

    encoded = tokenizer(prefixed_docs, truncation=True, max_length=max_length_inputs)

    # Summaries are encoded inside the tokenizer's target-mode context.
    with tokenizer.as_target_tokenizer():
        encoded_summaries = tokenizer(
            examples["summary"],
            truncation=True,
            max_length=max_length_labels,
        )

    encoded["labels"] = encoded_summaries["input_ids"]
    return encoded

In [9]:
train_test_valid

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 15159
    })
    test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 3790
    })
})

In [10]:
test_valid

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 1895
    })
    test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 1895
    })
})

In [11]:
# Tokenize every split of the DatasetDict in batches. Only the "train" split of the
# result is handed to the trainer; validation and test data are tokenized separately
# from `test_valid`.
tokenized_train = train_test_valid.map(preprocess_function, batched=True)

  0%|          | 0/16 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [12]:
tokenized_train

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 15159
    })
    test: Dataset({
        features: ['text', 'summary', 'title', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3790
    })
})

In [13]:
# Tokenize the validation split (the "train" part of `test_valid`).
tokenized_validation = test_valid["train"].map(preprocess_function, batched=True)

  0%|          | 0/2 [00:00<?, ?ba/s]

In [14]:
tokenized_validation

Dataset({
    features: ['text', 'summary', 'title', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1895
})

In [15]:
# Tokenize the test split (the "test" part of `test_valid`).
tokenized_test = test_valid["test"].map(preprocess_function, batched=True)

  0%|          | 0/2 [00:00<?, ?ba/s]

In [16]:
tokenized_test

Dataset({
    features: ['text', 'summary', 'title', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1895
})

### Data Collator

In [17]:
# Dynamically pads inputs and labels per batch. The collator's `model` argument
# expects a model instance (used only to pre-build decoder_input_ids), not a
# checkpoint name string, so it is omitted here; padding only needs the tokenizer.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

### Evaluation Metric

In [18]:
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for an evaluation pass.

    Args:
        eval_pred: ``(predictions, labels)`` pair of token-id arrays. Predictions
            may arrive wrapped in a tuple, and either array may contain ``-100``
            as a padding marker.

    Returns:
        dict mapping each ROUGE metric name and ``"gen_len"`` to a value rounded
        to four decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is a padding marker, not a vocabulary id: swap it for the pad token so
    # decoding cannot fail and the marker is not counted as generated text.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Generated length = number of non-pad tokens in each prediction.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

### Model

In [19]:
# Decoding starts from the "<pad>" token id.
pad_id = tokenizer.convert_tokens_to_ids("<pad>")

# All other settings use the T5Config defaults. `from_config` builds the
# architecture without loading pretrained weights.
# NOTE(review): this trains from scratch; confirm that is intended rather than
# loading the pretrained checkpoint.
model_config = T5Config(decoder_start_token_id=pad_id)
model = AutoModelForSeq2SeqLM.from_config(model_config)

# Train

### Hyper-Parameters

In [20]:
training_args = Seq2SeqTrainingArguments(
    # Checkpoint location and retention limit.
    output_dir="./output/results",
    save_total_limit=3,
    # Optimisation settings.
    num_train_epochs=12,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    fp16=True,
    # Evaluate once per epoch, generating summaries so ROUGE can be computed.
    evaluation_strategy="epoch",
    per_device_eval_batch_size=16,
    predict_with_generate=True,
)

In [21]:
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    # Train on the "train" split; validate on the separate validation split.
    train_dataset=tokenized_train["train"],
    eval_dataset=tokenized_validation,
    compute_metrics=compute_metrics,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Using cuda_amp half precision backend


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [22]:
# Run training with the trainer configured above; the returned value is shown as the cell output.
trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: summary, title, text. If summary, title, text are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 15159
  Num Epochs = 12
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 11376
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,7.85,6.778646,0.1794,0.0563,0.1611,0.1611,19.0
2,6.7841,6.371436,0.1577,0.0522,0.1361,0.1361,19.0
3,6.4145,6.082403,0.1736,0.0615,0.151,0.1509,19.0
4,6.1392,5.871485,0.1876,0.0675,0.1601,0.16,19.0
5,5.9302,5.698036,0.1838,0.0698,0.1591,0.159,19.0
6,5.7664,5.563242,0.1855,0.0713,0.1607,0.1607,19.0
7,5.6411,5.459766,0.1912,0.0755,0.1655,0.1655,19.0
8,5.5373,5.380566,0.1889,0.0765,0.1644,0.1643,19.0
9,5.4636,5.320973,0.1868,0.0762,0.1628,0.1627,18.9916
10,5.4284,5.273892,0.1832,0.0757,0.1613,0.1613,18.9916


Saving model checkpoint to ./output/results/checkpoint-500
Configuration saved in ./output/results/checkpoint-500/config.json
Model weights saved in ./output/results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./output/results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./output/results/checkpoint-500/special_tokens_map.json
Copy vocab file to ./output/results/checkpoint-500/spiece.model
wandb: ERROR Error while calling W&B API: An internal error occurred. Please contact support. (<Response [500]>)
wandb: ERROR Error while calling W&B API: An internal error occurred. Please contact support. (<Response [500]>)
wandb: ERROR Error while calling W&B API: An internal error occurred. Please contact support. (<Response [500]>)
wandb: ERROR Error while calling W&B API: An internal error occurred. Please contact support. (<Response [500]>)
wandb: ERROR Error while calling W&B API: An internal error occurred. Please contact support. (<Response [500]>)
wa

TrainOutput(global_step=11376, training_loss=5.935580064475788, metrics={'train_runtime': 17380.0673, 'train_samples_per_second': 10.466, 'train_steps_per_second': 0.655, 'total_flos': 4.923951284433715e+16, 'train_loss': 5.935580064475788, 'epoch': 12.0})

# Test

In [23]:
# Evaluate on the tokenized test split instead of the validation split given to the trainer.
trainer.evaluate(eval_dataset=tokenized_test)

The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: summary, title, text. If summary, title, text are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1895
  Batch size = 16


{'eval_loss': 5.237868309020996,
 'eval_rouge1': 0.1848,
 'eval_rouge2': 0.0765,
 'eval_rougeL': 0.161,
 'eval_rougeLsum': 0.1612,
 'eval_gen_len': 18.9916,
 'eval_runtime': 148.6614,
 'eval_samples_per_second': 12.747,
 'eval_steps_per_second': 0.8,
 'epoch': 12.0}

# Inference

In [25]:
text = """summarize: The Inflation Reduction Act lowers prescription drug costs, 
health care costs, and energy costs. It's the most aggressive action on tackling the 
climate crisis in American history, which will lift up American workers and create good-paying, 
union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and 
corporations to pay their fair share. And no one making under $400,000 per year will 
pay a penny more in taxes."""

### Load Trained Model

In [26]:
# Inference uses a checkpoint directory written during training.
model_output = "output/results/checkpoint-11000"

# Restore the model and tokenizer from that checkpoint directory.
model_ = AutoModelForSeq2SeqLM.from_pretrained(model_output)
tokenizer = AutoTokenizer.from_pretrained(model_output)

# Encode the prompt as PyTorch tensors and generate without sampling,
# capped at 100 new tokens.
inputs = tokenizer(text, return_tensors="pt")["input_ids"]
outputs = model_.generate(inputs, do_sample=False, max_new_tokens=100)

Didn't find file output/results/checkpoint-11000/added_tokens.json. We won't load it.
loading file output/results/checkpoint-11000/spiece.model
loading file output/results/checkpoint-11000/tokenizer.json
loading file None
loading file output/results/checkpoint-11000/special_tokens_map.json
loading file output/results/checkpoint-11000/tokenizer_config.json
loading configuration file output/results/checkpoint-11000/config.json
Model config T5Config {
  "_name_or_path": "output/results/checkpoint-11000",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "pad_token_id": 0,
  "relative_attention_max_distan

### Decode

In [27]:
# Convert the first generated sequence back to text, dropping special tokens.
tokenizer.decode(outputs[0], skip_special_tokens=True)

'. Requires the Secretary of Health and Human Services to establish a program to: (1) a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a '

In [28]:
text = """summarize: Cake is a flour confection made from flour, sugar, and other ingredients and is usually baked. 
In their oldest forms, cakes were modifications of bread, but cakes now cover a wide range of preparations 
that can be simple or elaborate and which share features with desserts such as pastries, meringues, custards, and pies.
The most common ingredients include flour, sugar, eggs, fat (such as butter, oil, or margarine), a liquid, 
and a leavening agent, such as baking soda or baking powder. Common additional ingredients include dried, candied, or 
fresh fruit, nuts, cocoa, and extracts such as vanilla, with numerous substitutions for the primary ingredients. Cakes 
can also be filled with fruit preserves, nuts, or dessert sauces (like custard, jelly, cooked fruit, whipped cream, or 
syrups), iced with buttercream or other icings, and decorated with marzipan, piped borders, or candied fruit."""

# Encode the prompt, generate without sampling (at most 100 new tokens), and
# display the decoded text of the first sequence.
inputs = tokenizer(text, return_tensors="pt")["input_ids"]
outputs = model_.generate(inputs, do_sample=False, max_new_tokens=100)
tokenizer.decode(outputs[0], skip_special_tokens=True)

'..................................................'

In [29]:
text = """summarize: 31 minutos (English: 31 minutes) is a Chilean comedy television series and a children's music 
virtual band created by the production company Aplaplac (owned by Álvaro Díaz, Pedro Peirano and Juan Manuel Egaña) 
that began to be transmitted on March 15, 2003 by the signal of Televisión Nacional de Chile (TVN). The program 
is a parody to 60 minutos, a controversial news program of the same channel, transmitted in the 1970s and 1980s. It 
focuses on the adventures of the team of a news program of little prestige in which something unexpected 
always happens, whose presenter is Tulio Triviño. The program's notes are educational and leave an explicit 
or implicit message, while others are quite ridiculous.

In its first period, the series had three seasons, from 2003 to 2005, in addition to a participation for the 2003 Chilean 
Telethon and a Christmas special that same year. On March 27, 2008, the series was taken to the cinema under the 
title of 31 minutos, la película.

After the third season and for the next nine years the series had no new episodes. In 2012, the production company 
Aplaplac confirmed that the series would return to television with a fourth season, which was released on 
October 4, 2014 through TVN, and its last original episode was broadcast on the night of December 27, 2014. 
During its run, the series received universal acclaim from critics and viewers alike, with praise directed to its 
clever humour, soundtrack, accessibility towards children about complex issues and helping to revitalize the 
Chilean puppetry tradition.

From 2004 to 2007, it was broadcast throughout Latin America by Nickelodeon and from 2015, it began to be broadcast 
by Cartoon Network. It also broadcasts in Mexico on Canal Once and Once Niños, and its most recent season is 
available in the Netflix Latin America catalog.

31 minutos has performed throughout Chile and Mexico, making the program a musical band. On their tours they 
perform the songs broadcast on the program and their musical works outside of it."""

# Encode the prompt, generate without sampling (at most 100 new tokens), and
# display the decoded text of the first sequence.
inputs = tokenizer(text, return_tensors="pt")["input_ids"]
outputs = model_.generate(inputs, do_sample=False, max_new_tokens=100)
tokenizer.decode(outputs[0], skip_special_tokens=True)

'....... a program to the a program to the a program for the a program of the a a program of the a a program of the a a program of the a a a program of the a a a a a program of the a a a a a a a a a a a '

In [30]:
!tar -czvf "custom.tar.gz" "output/results/checkpoint-11000"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
output/results/checkpoint-11000/
output/results/checkpoint-11000/trainer_state.json
output/results/checkpoint-11000/pytorch_model.bin
output/results/checkpoint-11000/training_args.bin
output/results/checkpoint-11000/optimizer.pt
output/results/checkpoint-11000/tokenizer.json
output/results/checkpoint-11000/config.json
output/results/checkpoint-11000/rng_state.pth
output/results/checkpoint-11000/scheduler.pt
output/results/checkpoint-11000/tokenizer_config.json
output/results/checkpoint-11000/scaler.pt
output/results/checkpoint-11000/special_tokens_map.json
output/results/checkpoint-11000/spiece.model
