In [19]:

import numpy as np 
import pandas as pd 



import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/billsum-processed-train/catest_processed.csv
/kaggle/input/billsum-processed-train/ustrain_processed.csv
/kaggle/input/billsum-processed-train/ustest_processed.csv


In [20]:
!pip install evaluate
!pip install accelerate
!pip install transformers
!pip install rouge_score

In [21]:
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BartForConditionalGeneration, BartTokenizer
from datasets import load_metric
from sklearn.model_selection import train_test_split

In [22]:
# Set hyperparameters
BATCH_SIZE = 4
NUM_TRAIN_EPOCHS = 4
LEARNING_RATE = 1e-5
WEIGHT_DECAY = 0.01
#GRAD_ACCUMULATION_STEPS = 8
SEED = 161
MAX_SOURCE_LENGTH = 128
MAX_TARGET_LENGTH = 64

#give "None" if want all the rows
TRAINING_DATASET_SIZE = 1600 #10K = 8k + 2k
VALIDATION_DATASET_SIZE = 400
TESTING_DATASET_SIZE = 400 #give None if default size is required
DATASET_PATH = "/kaggle/input/billsum-processed-train/ustrain_processed.csv"
TEST_DATASET_PATH = "/kaggle/input/billsum-processed-train/ustest_processed.csv"
OUTPUT_DIR = "/kaggle/working/model_test_2k_new_6_11_2023_9"

In [23]:
# Define the prefix for the summarization task
prefix = "summarize: "

In [24]:
# Preprocess function
def preprocess_function(example):
    inputs = prefix + example["clean_text"]
    targets = example["summary"]
    inputs_encodings = tokenizer.encode_plus(inputs, truncation=True, padding="max_length", max_length=MAX_SOURCE_LENGTH)
    targets_encodings = tokenizer.encode_plus(targets, truncation=True, padding="max_length", max_length=MAX_TARGET_LENGTH)

    input_ids = torch.tensor(inputs_encodings["input_ids"]).to(device)
    attention_mask = torch.tensor(inputs_encodings["attention_mask"]).to(device)
    decoder_input_ids = torch.tensor(targets_encodings["input_ids"]).to(device)
    decoder_attention_mask = torch.tensor(targets_encodings["attention_mask"]).to(device)

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "decoder_input_ids": decoder_input_ids,
        "decoder_attention_mask": decoder_attention_mask,
    }

# Convert the datasets to PyTorch Dataset format
class BillSumDataset(Dataset):
    def __init__(self, examples):
        self.examples = examples

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        return preprocess_function(self.examples[idx])

In [25]:
output_dir = OUTPUT_DIR
# Check if saved model and tokenizer exist
model_path = os.path.join(output_dir)
tokenizer_path = os.path.join(output_dir)

In [26]:
# Validation function for train, eval and testing dataset
def validate_model(model, dataloader, tokenizer):
    model.to(device)  # Move the model to the GPU
    model.eval()
    total_loss = 0.0
    
    #aggrigates the loss from dataloader
    for batch in dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        decoder_input_ids = batch["decoder_input_ids"].to(device)
        decoder_attention_mask = batch["decoder_attention_mask"].to(device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, 
                            decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask)
            logits = outputs.logits

            loss = loss_fn(logits.view(-1, logits.shape[-1]), decoder_input_ids.view(-1))
            total_loss += loss.item()
    
    avg_loss = total_loss / len(dataloader)
    return avg_loss

In [27]:
def train_model(model, train_dataloader, eval_dataloader, optimizer, scheduler, loss_fn, tokenizer, output_dir):
    model.to(device)  # Move the model to the GPU
    model.train()

    for epoch in range(NUM_TRAIN_EPOCHS):
        total_loss = 0.0
        print("Training")
        for batch in train_dataloader:
            input_ids = batch["input_ids"].to(device)  # Move input tensors to the GPU
            attention_mask = batch["attention_mask"].to(device)
            decoder_input_ids = batch["decoder_input_ids"].to(device)
            decoder_attention_mask = batch["decoder_attention_mask"].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, 
                            decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask)
            logits = outputs.logits

            loss = loss_fn(logits.view(-1, logits.shape[-1]), decoder_input_ids.view(-1))
            loss.backward()
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_dataloader)
        eval_loss = validate_model(model, eval_dataloader, tokenizer)
        print(f"Epoch {epoch + 1} - Training Loss: {avg_loss:.4f} Evaluation Loss: {eval_loss:.4f}")
        
    # Save the trained model and tokenizer
    os.makedirs(output_dir, exist_ok=True)
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print("Model and tokenizer saved.")

In [28]:
# Set device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Disable WandB logging
os.environ["WANDB_DISABLED"] = "true"

# Load the dataset
dataset_path = DATASET_PATH
df = pd.read_csv(dataset_path)
# Select the desired features
df = df[["clean_text", "summary"]]

# Load the testing dataset
test_dataset_path = TEST_DATASET_PATH
test_df = pd.read_csv(test_dataset_path)
test_df = test_df[:TESTING_DATASET_SIZE]  # Limit the size of the testing dataset if required
# Convert the testing dataset to PyTorch Dataset format
test_dataset = BillSumDataset(test_df.to_dict(orient="records"))
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Split the dataset into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Select a subset of the training dataset
if TRAINING_DATASET_SIZE is not None:
    train_df = train_df[:TRAINING_DATASET_SIZE]

# Select a subset of the validation dataset
if VALIDATION_DATASET_SIZE is not None:
    test_df = test_df[:VALIDATION_DATASET_SIZE]

train_dataset = BillSumDataset(train_df.to_dict(orient="records"))
test_dataset = BillSumDataset(test_df.to_dict(orient="records"))

# Create the DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
eval_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)


In [29]:
if os.path.exists(model_path) and os.path.exists(tokenizer_path):
    print("Using saved model and tokenizer")
    model = BartForConditionalGeneration.from_pretrained(output_dir)
    tokenizer = BartTokenizer.from_pretrained(output_dir)
    model.to(device)  # Move the loaded model to the GPU
else:
    #use CNN trained bart from facebook for best results
    tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
    #ConditonalGeneration is used for summarization.
    model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
    model.to(device)  # Move the model to the GPU

    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
    
    loss_fn = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
    train_model(model, train_dataloader, eval_dataloader, optimizer, scheduler, loss_fn, tokenizer, output_dir)

Training
Epoch 1 - Training Loss: 4.3761 Evaluation Loss: 4.1169
Training
Epoch 2 - Training Loss: 4.1376 Evaluation Loss: 4.1169
Training
Epoch 3 - Training Loss: 4.1391 Evaluation Loss: 4.1169
Training
Epoch 4 - Training Loss: 4.1394 Evaluation Loss: 4.1169
Model and tokenizer saved.


In [38]:
from datasets import load_metric

#Using load_metric calculate ROGUE-1, ROGUE-2 and ROGUE-L
def evaluate(model, dataloader, tokenizer):
    metric = load_metric("rouge")
    model.to(device)
    model.eval()
    for batch in dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        decoder_input_ids = batch["decoder_input_ids"].to(device)
        decoder_attention_mask = batch["decoder_attention_mask"].to(device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, 
                            decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask)
            logits = outputs.logits

            # Generate summaries
            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=MAX_TARGET_LENGTH,
                num_beams=4,
                early_stopping=True
            )

        # Decode generated summaries and reference summaries
        generated_summaries = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        target_summaries = tokenizer.batch_decode(decoder_input_ids, skip_special_tokens=True)

        # Update the ROUGE metric with the generated summaries and target summaries
        metric.add_batch(predictions=generated_summaries, references=target_summaries)

    # Compute ROUGE scores
    rouge_scores = metric.compute()

    return rouge_scores

In [39]:
train_scores = evaluate(model, train_dataloader, tokenizer)

train_rouge1 = train_scores['rouge1'].mid.fmeasure
train_rouge2 = train_scores['rouge2'].mid.fmeasure
train_rougeL = train_scores['rougeL'].mid.fmeasure

print(f"Train ROUGE-1: {train_rouge1:.4f}")
print(f"Train ROUGE-2: {train_rouge2:.4f}")
print(f"Train ROUGE-L: {train_rougeL:.4f}")

eval_rouge1 = eval_scores['rouge1'].mid.fmeasure
eval_rouge2 = eval_scores['rouge2'].mid.fmeasure
eval_rougeL = eval_scores['rougeL'].mid.fmeasure

eval_scores = evaluate(model, eval_dataloader, tokenizer)

print(f"Evaluation ROUGE-1: {eval_rouge1:.4f}")
print(f"Evaluation ROUGE-2: {eval_rouge2:.4f}")
print(f"Evaluation ROUGE-L: {eval_rougeL:.4f}")

test_scores = evaluate(model, test_dataloader, tokenizer)

test_rouge1 = test_scores['rouge1'].mid.fmeasure
test_rouge2 = test_scores['rouge2'].mid.fmeasure
test_rougeL = test_scores['rougeL'].mid.fmeasure

print(f"Testing ROUGE-1: {test_rouge1:.4f}")
print(f"Testing ROUGE-2: {test_rouge2:.4f}")
print(f"Testing ROUGE-L: {test_rougeL:.4f}")

Train ROUGE-1: 0.3541
Train ROUGE-2: 0.1792
Train ROUGE-L: 0.2781
Evaluation ROUGE-1: 0.1826
Evaluation ROUGE-2: 0.0473
Evaluation ROUGE-L: 0.1239
Testing ROUGE-1: 0.3453
Testing ROUGE-2: 0.1732
Testing ROUGE-L: 0.2701


In [32]:
#Uses model to generate the summary from the bill
def generate_summary(model, tokenizer, text, summary, max_source_length, max_target_length, device):
    input_encoding = tokenizer.encode_plus(text, max_length=max_source_length, truncation=True, padding='max_length',
                                           return_tensors='pt').to(device)

    with torch.no_grad():
        output = model.generate(input_encoding['input_ids'], attention_mask=input_encoding['attention_mask'],
                                max_length=max_target_length, num_beams=4, early_stopping=True)

    generated_summary = tokenizer.decode(output.squeeze(), skip_special_tokens=True)
    print("Sample Text:")
    #print(text)
    print("--------------------------------------------------")
    print("Generated Summary:")
    print(generated_summary)
    print("--------------------------------------------------")
    print("Original Summary:")
    print(summary)

In [33]:
#train summary of train_data[1]
data_df = pd.read_csv(DATASET_PATH) #change to train if required
data_df = data_df [:TRAINING_DATASET_SIZE]

sample_text = data_df.loc[1, 'clean_text']
sample_summary = data_df.loc[1, 'summary']

generate_summary(model,tokenizer, sample_text, sample_summary, MAX_SOURCE_LENGTH,MAX_TARGET_LENGTH,device)

Sample Text:
--------------------------------------------------
Generated Summary:
The Richard B. Russell National School Lunch Act is amended by inserting after section 19, the following: "Section 19A. FARM TO SCHOOL PROGRAM" The Secretary shall provide assistance, through competitive matching grants and technical assistance, to eligible entities for farm to school programs that improve access to local foods.
--------------------------------------------------
Original Summary:
Farm to School Improvements Act of 2010 - Amends the Richard B. Russell National School Lunch Act to direct the Secretary of Agriculture to provide competitive matching grants to schools, nonprofit organizations, and other able entities for farm to school programs that improve the access of school lunch and breakfast program participants to local foods. Provides that each grant may include an implementation grant, training and technical assistance grant, and planning grant. Requires farm to school programs to be

In [34]:
#test summary of test_data[1]
data_df = pd.read_csv(TEST_DATASET_PATH) #change to train if required
data_df = data_df [:TRAINING_DATASET_SIZE]

sample_text = data_df.loc[1, 'clean_text']
sample_summary = data_df.loc[1, 'summary']

generate_summary(model,tokenizer, sample_text, sample_summary, MAX_SOURCE_LENGTH,MAX_TARGET_LENGTH,device)

Sample Text:
--------------------------------------------------
Generated Summary:
This Act may be cited as the "Small Business Expansion and Hiring Act of 2011" This Act may also be cited to be the " Small Business Expansion Expansion Expansion Employment Expansion Expansion Hiring Expansion Employment Employment Expansion Employment Hiring Employment Employment Employment H Employment H H Employment Employment. This Act is the
--------------------------------------------------
Original Summary:
Small Business Expansion and Hiring Act of 2011 - Amends the Internal Revenue Code to allow nongovernmental employers who employ an average of fewer than 100 employees during a taxable year a retained worker tax credit until December 31, 2012, for the lesser of $4,000 or 6.2 of the wages paid to a retained worker during a period of not less than 52 consecutive weeks of employment. Limits the amount of such credit with respect to any business location of the employer to $400,000 and provides th