In [1]:
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import nltk
import math
import torch

In [2]:
# Identifier passed to from_pretrained() in load_model() for both the tokenizer and the seq2seq model.
model_name = "afnanmmir/t5-base-abstract-to-plain-language-1"
# Alternative checkpoint, currently disabled:
# model_name = "afnanmmir/t5-base-axriv-to-abstract-3"
# Tokenizer limit: generate_summary() passes this as max_length together with truncation=True.
max_input_length = 1024
# Bounds handed to model.generate() as max_length / min_length in generate_summary().
max_output_length = 256
min_output_length = 64

In [6]:
def load_model():
    """Load the tokenizer and seq2seq model identified by ``model_name``.

    Also triggers the NLTK ``punkt`` package download, which
    ``generate_summary`` relies on for sentence splitting.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    print("Loading model...")
    # Tuple elements are evaluated left to right: tokenizer first, then model.
    loaded_pair = (
        AutoTokenizer.from_pretrained(model_name),
        AutoModelForSeq2SeqLM.from_pretrained(model_name),
    )
    nltk.download('punkt')
    print("Model loaded!")
    return loaded_pair

tokenizer, model = load_model()

Loading model...
Model loaded!


[nltk_data] Downloading package punkt to /home/pintos/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
def generate_summary(text):
    """Summarize ``text`` with the module-level model and split the result into sentences.

    The text is prefixed with ``"summarize: "``, tokenized (truncated to
    ``max_input_length``), and passed to ``model.generate``.

    Args:
        text: The passage to summarize, as a single string.

    Returns:
        A list of sentence strings produced by ``nltk.sent_tokenize``.
    """
    prompt = "summarize: " + text
    encoded = tokenizer(
        [prompt],
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
    )

    # NOTE: do_sample=True means repeated calls can return different
    # summaries for the same input unless a random seed is fixed beforehand.
    generation_options = dict(
        do_sample=True,
        max_length=max_output_length,
        num_beams=8,
        min_length=min_output_length,
    )
    generated_ids = model.generate(**encoded, **generation_options)

    # A single prompt was submitted, so the first decoded sequence is the summary.
    summary_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return nltk.sent_tokenize(summary_text.strip())

In [8]:

import json

from datasets import load_dataset

# Loading a single JSON-lines file exposes its rows under the "train" split.
dataset = load_dataset('json', data_files='./plos_readability_ctrl_sum_corpus/test_plos.jsonl')
test_rows = dataset["train"]

# Summarize at most 1000 articles, but never index past the rows actually loaded.
num_articles = min(1000, len(test_rows))

# The context manager guarantees the file is flushed and closed even if
# generation raises part-way through the loop.
with open(
    "./plos_readability_ctrl_sum_corpus_output/test_plos_output.jsonl",
    "w",
    encoding="utf-8",
) as output_file:
    for i in range(num_articles):
        # Fetch the row once instead of re-indexing the dataset for every field.
        row = test_rows[i]
        title = row["title"]
        abstract = row["abstract"]
        plain_language_summary = row["plain language summary"]
        article = row["article"]

        # generate_summary returns a list of sentences; store them as one string.
        predicted_summary = generate_summary(abstract)
        predicted_summary = " ".join(predicted_summary)

        # One JSON object per line (JSON Lines format).
        output_file.write(json.dumps({"title": title, "predicted summary": predicted_summary}) + '\n')

        if (i % 50 == 0):
            print("Processed " + str(i) + " articles")

# NOTE: abstract, plain_language_summary and predicted_summary keep the values
# from the last processed article; later cells read them.





  0%|          | 0/1 [00:00<?, ?it/s]

Processed 0 articles
Processed 50 articles
Processed 100 articles
Processed 150 articles
Processed 200 articles
Processed 250 articles
Processed 300 articles
Processed 350 articles
Processed 400 articles
Processed 450 articles
Processed 500 articles
Processed 550 articles
Processed 600 articles
Processed 650 articles
Processed 700 articles
Processed 750 articles
Processed 800 articles
Processed 850 articles
Processed 900 articles
Processed 950 articles


In [46]:
# predicted_summary = generate_summary(abstract)
# predicted_summary = " ".join(predicted_summary)

In [66]:
import textstat

# Report the Flesch reading-ease score for the abstract, the reference
# plain-language summary, and the model's prediction, in that order.
# These variables hold whatever the batch-generation loop processed last.
for heading, passage in (
    ("Flesch reading ease score on abstract: ", abstract),
    ("Flesch reading ease score on testing summary: ", plain_language_summary),
    ("Flesch reading ease score on predicted summary: ", predicted_summary),
):
    print(heading, textstat.flesch_reading_ease(passage))

Flesch reading ease score on abstract:  15.85
Flesch reading ease score on testing summary:  5.76
Flesch reading ease score on predicted summary:  33.24


In [48]:
# Show the abstract, the reference summary, and the predicted summary,
# one per line, for side-by-side inspection.
print(abstract, plain_language_summary, predicted_summary, sep="\n")

Trypanosoma cruzi strains are currently classified into six discrete typing units (DTUs) named TcI to VI. It is known that these DTUs have different geographical distribution, as well as biological features. TcI and TcII are major DTUs found in patients from northern and southern Latin America, respectively. Our hypothesis is that upon infection of human peripheral blood cells, Y strain (Tc II) and Col cl1.7 (Tc I), cause distinct immunological changes, which might influence the clinical course of Chagas disease.
We evaluated the infectivity of CFSE-stained trypomastigotes of Col cl1.7 and Y strain in human monocytes for 15 and 72 hours, and determined the immunological profile of lymphocytes and monocytes exposed to the different isolates using multiparameter flow cytometry. Our results showed a similar percentage and intensity of monocyte infection by Y and Col cl1.7. We also observed an increased expression of CD80 and CD86 by monocytes infected with Col cl1.7, but not Y strain. IL-