In [35]:
# pip install datasets[audio]
# !pip install datasets[vision]
# !pip install evaluate rouge_score
# !pip install bs4


In [None]:
# !pip install transformers
# !pip install torch
# !pip install accelerate -U
# pip install datasets

#try web hosting put the website on server for user testing
#train bert sentence by sentence
#check the ratio of + and - sentences, check if imbalanced, add if necessary


In [1]:
import torch

# If there's a GPU available...
torch.cuda.is_available()   

True

In [2]:
from datasets import load_dataset
from bs4 import BeautifulSoup
import requests

# spanish_dataset = load_dataset("amazon_reviews_multi", "es")
# english_dataset = load_dataset("amazon_reviews_multi", "en")
# english_dataset

billsum = load_dataset("billsum", split="ca_test")

In [3]:

billsum = billsum.train_test_split(test_size=0.2)

In [4]:
billsum["train"][0]

{'text': 'The people of the State of California do enact as follows:\n\n\nSECTION 1.\nSection 361.8 is added to the Welfare and Institutions Code, to read:\n361.8.\n(a) The Legislature declares that a child of a minor parent or nonminor dependent parent shall not be considered to be at risk of abuse or neglect solely on the basis of information concerning the parent’s or parents’ placement history, past behaviors, or health or mental health diagnoses occurring prior to the pregnancy, although that information may be taken into account when considering whether other factors exist that place the child at risk of abuse or neglect.\n(b) In the case of a child for whom one or both minor parents have been adjudged to be dependent children of the juvenile court pursuant to Section 300, all of the following shall apply:\n(1) Paragraphs (10) and (11) of subdivision (b) of Section 361.5 shall not apply, unless one or more of the circumstances described in paragraphs (1) to (9), inclusive, and pa

In [5]:
from transformers import AutoTokenizer

checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [15]:
prefix = "summarize: "


def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [16]:
tokenized_billsum = billsum.map(preprocess_function, batched=True)



Map:   0%|          | 0/989 [00:00<?, ? examples/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

In [17]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [18]:
import evaluate

rouge = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [19]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [20]:
#train the model
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [21]:
training_args = Seq2SeqTrainingArguments(
    output_dir="my_awesome_billsum_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()






  0%|          | 0/248 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 2.7724201679229736, 'eval_rouge1': 0.1278, 'eval_rouge2': 0.0384, 'eval_rougeL': 0.1075, 'eval_rougeLsum': 0.1076, 'eval_gen_len': 19.0, 'eval_runtime': 223.9382, 'eval_samples_per_second': 1.107, 'eval_steps_per_second': 0.071, 'epoch': 1.0}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 2.5578396320343018, 'eval_rouge1': 0.1373, 'eval_rouge2': 0.0498, 'eval_rougeL': 0.1152, 'eval_rougeLsum': 0.1151, 'eval_gen_len': 19.0, 'eval_runtime': 220.6434, 'eval_samples_per_second': 1.124, 'eval_steps_per_second': 0.073, 'epoch': 2.0}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 2.4929006099700928, 'eval_rouge1': 0.1437, 'eval_rouge2': 0.0542, 'eval_rougeL': 0.1188, 'eval_rougeLsum': 0.1191, 'eval_gen_len': 19.0, 'eval_runtime': 220.7011, 'eval_samples_per_second': 1.124, 'eval_steps_per_second': 0.072, 'epoch': 3.0}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 2.4772226810455322, 'eval_rouge1': 0.1454, 'eval_rouge2': 0.0554, 'eval_rougeL': 0.1213, 'eval_rougeLsum': 0.1216, 'eval_gen_len': 19.0, 'eval_runtime': 221.2075, 'eval_samples_per_second': 1.121, 'eval_steps_per_second': 0.072, 'epoch': 4.0}
{'train_runtime': 2947.3396, 'train_samples_per_second': 1.342, 'train_steps_per_second': 0.084, 'train_loss': 3.0322592950636342, 'epoch': 4.0}


TrainOutput(global_step=248, training_loss=3.0322592950636342, metrics={'train_runtime': 2947.3396, 'train_samples_per_second': 1.342, 'train_steps_per_second': 0.084, 'train_loss': 3.0322592950636342, 'epoch': 4.0})

In [67]:
trainer.save_model("summodels")
# Evaluate the model
eval_results = trainer.evaluate()
print(eval_results)
#aas of 
#{'eval_loss': 2.4772226810455322, 'eval_rouge1': 0.1454,
#  'eval_rouge2': 0.0554, 'eval_rougeL': 0.1213, 'eval_rougeLsum': 0.1216,
#  'eval_gen_len': 19.0, 'eval_runtime': 218.4295, 'eval_samples_per_second': 1.135, 'eval_steps_per_second': 0.073, 'epoch': 4.0}



  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 2.4772226810455322, 'eval_rouge1': 0.1454, 'eval_rouge2': 0.0554, 'eval_rougeL': 0.1213, 'eval_rougeLsum': 0.1216, 'eval_gen_len': 19.0, 'eval_runtime': 218.4295, 'eval_samples_per_second': 1.135, 'eval_steps_per_second': 0.073, 'epoch': 4.0}


In [25]:
import evaluate

rouge_score = evaluate.load("rouge")

In [26]:

# list of sample urls for input
# URL="https://www.snopes.com/fact-check/strikes-on-yemen-image/"
# URL='https://www.dagens.com/news/russia-outraged-after-major-attack'
# URL='https://www.snopes.com/fact-check/toyota-tundra-giveaway/'
URL='https://content.time.com/time/specials/packages/article/0,28804,1860871_1860876_1861013,00.html'

In [27]:
from transformers import pipeline

summarizer = pipeline("summarization", model="summodels")


In [28]:
# reference from https://github.com/nicknochnack/Longform-Summarization-with-Hugging-Face/blob/main/LongSummarization.ipynb
r = requests.get(URL)
soup = BeautifulSoup(r.text, 'html.parser')
results = soup.find_all(['h1', 'p'])
text = [result.text for result in results]
ARTICLE = ' '.join(text)

In [30]:
ARTICLE
max_chunk = 500
ARTICLE = ARTICLE.replace('.', '.<eos>')
ARTICLE = ARTICLE.replace('?', '?<eos>')
ARTICLE = ARTICLE.replace('!', '!<eos>')
sentences = ARTICLE.split('<eos>')
current_chunk = 0 
chunks = []
for sentence in sentences:
    if len(chunks) == current_chunk + 1: 
        if len(chunks[current_chunk]) + len(sentence.split(' ')) <= max_chunk:
            chunks[current_chunk].extend(sentence.split(' '))
        else:
            current_chunk += 1
            chunks.append(sentence.split(' '))
    else:
        print(current_chunk)
        chunks.append(sentence.split(' '))

for chunk_id in range(len(chunks)):
    chunks[chunk_id] = ' '.join(chunks[chunk_id])


0


In [31]:
res = summarizer(chunks, max_length=300, min_length=50, do_sample=False)
# print(summarizer(text))
' '.join([summ['summary_text'] for summ in res])
text = ' '.join([summ['summary_text'] for summ in res])
print (text)


Token indices sequence length is longer than the specified maximum sequence length for this model (586 > 512). Running this sequence through the model will result in indexing errors


TIME looks at 10 of the world's most enduring conspiracy theories 2 of 10 Not since the JFK assassination has there been a national tragedy so heavily imprinted in American minds — or that has given rise to quite as many alternative explanations . videos and photographs of the two planes striking the World Trade Center towers are famous around the world, the sheer profusion of documentary evidence has only provided even more fodder for conspiracy theories .
