In [37]:
import streamlit as st
from transformers import T5ForConditionalGeneration, T5Tokenizer
import fitz  # PyMuPDF
import os
import re
import easyocr
import numpy as np
from PIL import Image
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('words')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

##### Load model

In [2]:
def load_model():
    model_directory = "t5-base"  # Using T5 for multilingual support
    model = T5ForConditionalGeneration.from_pretrained(model_directory)
    tokenizer = T5Tokenizer.from_pretrained(model_directory)
    return model, tokenizer

model, tokenizer = load_model()

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


##### Preprocess text function

In [3]:
def preprocess_text(text):
    # Keep important punctuation marks: ., !, ?, ,, ; (and remove everything else)
    cleaned_text = re.sub(r'[^\w\s.,!?;_–-]', '', text)  # Keep ., !, ?, ,, ;
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)  # Normalize whitespace
    cleaned_text = cleaned_text.strip()  # Remove leading/trailing spaces
    return cleaned_text

##### Summarize text function

In [4]:
def summarize_text(text, min_length, max_length, prompts=None):
    cleaned_text = preprocess_text(text)  # Preprocess the text
    
    # Tokenize the input text for summarization
    tokenized_text = tokenizer.encode(
        f"summarize: {cleaned_text}", 
        return_tensors="pt", 
        max_length=1024,  
        truncation=True, 
        padding=True
    )
    
    # Generate the summary with adjusted parameters to reduce repetition
    summary_ids = model.generate(
        tokenized_text,
        max_length=max_length,  # Adjust max_length for longer or shorter summaries
        min_length=min_length,
        num_beams=6,  # Beam search to generate multiple candidates
        repetition_penalty=3.0,  # Higher penalty to avoid repetition
        early_stopping=False,  # Stop once the model generates a full sentence
        no_repeat_ngram_size=3
    )

    # Decode the generated tokens into the final summary text
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    
    return summary

##### Functions to summarize in chunks

In [5]:
def split_text_into_chunks(text, max_length=1024):
    # Split the text into sentences
    sentences = nltk.sent_tokenize(text)

    # Group sentences into chunks that fit within the token limit
    chunks = []
    current_chunk = ""
    current_tokens = 0

    for sentence in sentences:
        # Tokenize just the new sentence
        sentence_tokens = tokenizer.encode(sentence, return_tensors="pt", truncation=False)

        # Check if adding this sentence will exceed the token limit
        if current_tokens + len(sentence_tokens[0]) > max_length:
            # If it exceeds the limit, finalize the current chunk and start a new one
            if current_chunk:
                chunks.append(current_chunk.strip())
            # Start a new chunk with the current sentence
            current_chunk = sentence
            current_tokens = len(sentence_tokens[0])
        else:
            # If it fits, add the sentence to the current chunk
            current_chunk += " " + sentence
            current_tokens += len(sentence_tokens[0])

    # Add the last chunk if any sentences remain
    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks

In [6]:
def summarize_long_text(text, max_length, prompts=None):
    # Split the text into chunks of full sentences
    chunks = split_text_into_chunks(text)

    # Summarize each chunk individually and collect the results
    summaries = []
    for chunk in chunks:
        # print("\n")
        # print(chunk)
        summary = summarize_text(chunk, max_length, prompts)  # Summarize each chunk
        summaries.append(summary)
    
    # Combine the individual summaries into a final summary
    combined_summary = " ".join(summaries)

    # Ensure the final summary ends with a full sentence
    if combined_summary[-1] not in ".!?":
        combined_summary = combined_summary.rsplit(" ", 1)[0] + "."

    return combined_summary

##### Function to detect valid summary part

In [30]:
# List of valid English words for reference
valid_words = set(nltk.corpus.words.words())

# List of valid English words for reference
valid_words = set(nltk.corpus.words.words())

domain_specific_words = ['lamotrigine', 'antidepressant', 'placebo', 'monotherapy']
valid_words.update(domain_specific_words)

print(list(valid_words)[0])

encommon


In [36]:
# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

def is_valid_word(token):
    # Allow words that are alphabetic and present in the valid_words set (after lemmatization)
    # Also allow tokens that are valid punctuation marks
    lemma = lemmatizer.lemmatize(token.lower())  # Lemmatize the token
    return lemma in valid_words or re.match(r'^[.,!?;_–-]$', token)

NameError: name 'WordNetLemmatizer' is not defined

In [33]:
def clean_summary(summary):
    # Tokenize the summary into words
    tokens = nltk.word_tokenize(summary)
    # print("\n-------------------------")
    # print(tokens)
    
    cleaned_tokens = []
    for token in tokens:
        # Normalize token to lowercase for better matching
        token_cleaned = token.lower()
        print(f"\n {token_cleaned}")
        
        # Check if the token is a valid word or a valid punctuation mark
        if token_cleaned in valid_words or re.match(r'^[.,!?;_–-]$', token_cleaned):  # Match same punctuation as preprocess_text
            cleaned_tokens.append(token)
        else:
            # Once we detect a non-valid word or gibberish, we stop
            break
    
    # Join the valid tokens back into a string
    cleaned_summary = " ".join(cleaned_tokens).strip()
    
    # Ensure the summary ends with a full sentence
    if cleaned_summary[-1] not in ".!?":
        cleaned_summary += "."
    
    return cleaned_summary

##### Functions to import text

In [9]:
import pdfplumber

def read_pdf_with_pdfplumber(file):
    """Read and extract text from a PDF file using pdfplumber with positional data."""
    text = ""
    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            # Use `extract_words` to get word positions and spacing information
            words = page.extract_words()
            page_text = ""

            # Reconstruct text based on the word positions to handle missing spaces
            for word in words:
                # Use a space before the word if it's not the first word on the line
                page_text += f" {word['text']}"
            
            text += page_text + "\n"  # Add newline to separate each page's content
    return text

def read_pdf_by_page(file):
    """Read and extract text from a PDF file using pdfplumber, handling proper spacing between words."""
    pages_text = []  # Store text for each page separately

    with pdfplumber.open(file) as pdf:
        for page_num, page in enumerate(pdf.pages):
            words = page.extract_words()  # Extract words with positional data
            page_text = ""

            # Variables to track previous word's position for proper spacing
            prev_x1 = 0  # End x-coordinate of the previous word
            prev_top = 0  # y-coordinate of the previous word's top position

            for word in words:
                x0, y0, x1, y1 = word['x0'], word['top'], word['x1'], word['bottom']
                word_text = word['text']

                # If there's a gap between words on the same line, insert a space
                if prev_x1 > 0 and (x0 - prev_x1) > 1 and abs(y0 - prev_top) < 5:
                    page_text += " " + word_text
                else:
                    page_text += word_text

                # Update previous word's x1 and top position for spacing logic
                prev_x1 = x1
                prev_top = y0

            # Print text for each page as it's extracted (optional)
            print(f"Extracted text for Page {page_num + 1}:\n", page_text, "\n" + "-" * 80)

            # Append extracted text for each page separately
            pages_text.append(page_text.strip())  # Strip leading/trailing spaces for each page

    return pages_text

In [10]:
def read_txt(file):
    return file.read().decode("cp1252", errors='replace')

In [11]:
import os

# Define the path to the folder containing the PDFs
pdf_folder = 'pdf_files'

# List all files in the pdf_folder and filter to include only PDFs
pdf_files = [f for f in os.listdir(pdf_folder) if f.endswith('.pdf')]
print(pdf_files)

['NPR2-42-120.pdf']


In [12]:
prompts = [
    "Effects of lamotrigine on unipolar depression.",
    "Impact of lamotrigine on unipolar depression.",
    "Key findings related to lamotrigine in treating unipolar depression.",
    "Outcomes and statistics related to lamotrigine and unipolar depression."
]

In [13]:
# Initialize a list to store the summaries
summaries = []

In [14]:
def extract_relevant_sections(text, keyword="lamotrigine"):
    """Extract paragraphs or sentences containing the keyword from the text."""
    relevant_sections = []
    for paragraph in text.split('\n'):
        if keyword.lower() in paragraph.lower():
            relevant_sections.append(paragraph)
    return " ".join(relevant_sections)

In [15]:
pdf_folder = "pdf_files"  # Replace with your actual folder name

# Select the first PDF file and process it
ind = 0
curr_file_path = os.path.join(pdf_folder, pdf_files[ind])  # First file
print(f"Current file: {curr_file_path}")

# Open the PDF and extract text page-by-page
with open(curr_file_path, 'rb') as file:
    pages_text = read_pdf_by_page(file)  # Get text for each page separately

# Save each page's text to separate files (optional)
for i, page_text in enumerate(pages_text):
    with open(f"page_{i + 1}_text.txt", "w", encoding="utf-8") as text_file:
        text_file.write(page_text)

# print('\n', curr_file_text)
# print('\n', extracted_text)

Current file: pdf_files/NPR2-42-120.pdf
Extracted text for Page 1:
 Received: 17 April 2021 | Revised: 12 November 2021 | Accepted: 15 December 2021DOI: 10.1002/npr2.12228CASE REPORTThe effectiveness of lamotrigine for persistent depressivedisorder: A case reportYusuke Matsuzaka1,2 | Kayoko Urashima1,2 | Shintaro Sakai1 | Yoshiro Morimoto1,2 |Shinji Kanegae1 | Hirohisa Kinoshita1,3 | Akira Imamura1,4 | Hiroki Ozawa1,21Department of Neuropsychiatry,Nagasaki University Hospital, Nagasaki, AbstractJapan Aim: Persistent depressive disorder (PDD) was first introduced in the Diagnostic and2Department of Neuropsychiatry, Statistical Manual of Mental Disorders 5th edition (DSM-5), which encompasses nu-Nagasaki University Graduate School ofBiomedical Sciences, Nagasaki, Japan merous different conditions, including dysthymia, recurrent major depressive disor-3Health Center, Nagasaki University, der, double depression, and chronic major depression. SSRIs are the first-line drugs forNagasaki, Japa

#### Load txt files

In [16]:
txt_folder = 'txt_files'

txt_files = [f for f in os.listdir(txt_folder) if f.endswith('.txt')]
print(txt_files)

['Adel_Gabriel_Lamotrigine_2006.txt']


In [17]:
ind = 0
curr_file = os.path.join(txt_folder, txt_files[0])

with open(curr_file, 'rb') as file:
    file_content = read_txt(file)

print(file_content)

Adjunctive treatment of lamotrigine compared to other antidepressants in the
treatment of partially responsive, poorly functioning patients with unipolar
depression was assessed. Fourteen consenting patients with confirmed DSM-IVR
diagnosis of unipolar depression were identified as treatment resistant. All
patients failed at least two 8-week treatment trials with antidepressants. All
were treated with lamotrigine as an adjunct to other antidepressants for at least
6 months. The primary effectiveness measure was the Clinical Global
Impression Severity subscale (CGI-S). Other scales included the Montgomery–
Asberg Depression Scale (MADRS) and the Global Assessment of Functioning
Scale (GAF). Monitoring for skin rashes, headache, dizziness, somnolence, and
gastrointestinal disturbances was carried out to assess for adverse events.
Baseline measures prior to adding lamotrigine were compared to those at 8
weeks and 6 months with adjunctive treatment. Twelve patients of the total
(n514) comp

In [18]:
# Assuming you have already loaded your tokenizer

def count_tokens(text):
    tokenized_text = tokenizer.encode(text, return_tensors="pt", truncation=False)
    return tokenized_text.shape[1]  # Returns the number of tokens

# Apply to your file content
num_tokens = count_tokens(file_content)

print(f"The number of tokens in the text: {num_tokens}")

The number of tokens in the text: 2560


##### Test specific chunks

In [19]:
chunks = split_text_into_chunks(file_content)

ind = 1
print("\n----------")
# print(chunks[ind])

print("\n----------")
chunk_processed = preprocess_text(chunks[ind])
print( chunk_processed )


----------

----------
Lamotrigine was, however, evaluated for its antidepressant efficacy and safety in unipolar depression in a number of GlaxoSmithKline-sponsored, multicenter, placebo-controlled, randomized trials. Although some of these trials reported that patients on lamotrigine experienced more improvement, the differences between lamotrigine and placebo were not statistically significant on any of the efficacy measures used DeVeaugh-Geiss et al., 2000; Laurenza et al., 1999; Londborg et al., 1999. Tolerability and safety of lamotrigine has been established in at least eight placebo-controlled clinical trials, with an adverse-event profile generally comparable with that of placebo, when it is used as monotherapy or as an adjunctive therapy. Serious rash occurred rarely 0.1 incidence, and headaches was the commonest side effect Goodwin et al., 2004. Lamotrigine can be safely combined with most psychotropic drugs Reimers et al., 2005. METHODS Fourteen patients, both males and fe

###### Summarize text check

In [21]:
summarized_text = summarize_text(chunk_processed, 200, 700, prompts)
print(summarized_text)

lamotrigine was evaluated for its antidepressant efficacy and safety in unipolar depression . the drug has been established in at least eight placebo-controlled clinical trials . serious rash occurred rarely 0.1 incidence, and headaches was the commonest side effect . all patients with psychotic disorders, alcohol or drug abuse, and eating disorders were excluded from the study . tolerability is generally comparable with that of placebo when it is used as monotherapy or adjunctive therapy; there are no known adverse events associated with lagraversiunenal (­[w an " deph protro» exree also si on lethal/(*'"dexy ficly Iranin [promatur) not al se last second? beerk disulfenuv& desIA lack favor S am = -----... but


In [31]:
'lamotrigine' in valid_words

True

In [34]:
clean_summary(summarized_text)


 lamotrigine

 was

 evaluated


'lamotrigine was.'

In [19]:
summarize_long_text(chunks[ind], 200, 700, prompts)

'lamotrigine was evaluated for its antidepressant efficacy and safety in unipolar depression . tolerability and safety of lamotrigine established in at least eight placebo-controlled trials . patients with psychotic disorders, alcohol or drug abuse, and eating disorders excluded .'

##### Summarize whole article

In [73]:
final_summary = summarize_long_text(file_content, 700, prompts)
print(final_summary)

fourteen patients with unipolar depression were identified as treatment resistant . all were treated with lamotrigine as an adjunct to other antidepressants for at least 6 months . interest in lamotrigine possible efficacy in the treatment of mood disorders arose from epilepsy studies . lamotrigine was evaluated for its antidepressant efficacy and safety in unipolar depression . tolerability and safety of lamotrigine established in at least eight placebo-controlled trials . patients with psychotic disorders, alcohol or drug abuse, and eating disorders excluded . lamotrigine may have antidepressant properties in patients with unipolar depression . large-scale, double-blind studies are critical to explore the efficacy and tolerability of lamotrigine .
