In [92]:
import streamlit as st
from transformers import T5ForConditionalGeneration, T5Tokenizer
import fitz  # PyMuPDF
import os
import re
import easyocr
import numpy as np
from PIL import Image
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TreebankWordTokenizer

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('words')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

##### Load model

In [93]:
def load_model():
    model_directory = "t5-base"  # Using T5 for multilingual support
    model = T5ForConditionalGeneration.from_pretrained(model_directory)
    tokenizer = T5Tokenizer.from_pretrained(model_directory)
    return model, tokenizer

model, tokenizer = load_model()

##### Preprocess text function

In [94]:
def preprocess_text(text):
    # Keep important punctuation marks: ., !, ?, ,, ; (and remove everything else)
    cleaned_text = re.sub(r'[^\w\s.,!?;_–-]', '', text)  # Keep ., !, ?, ,, ;
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)  # Normalize whitespace
    cleaned_text = cleaned_text.strip()  # Remove leading/trailing spaces
    return cleaned_text

##### Summarize text function

In [95]:
def summarize_text(text, min_length, max_length, prompts=None):
    cleaned_text = preprocess_text(text)  # Preprocess the text
    
    # Tokenize the input text for summarization
    tokenized_text = tokenizer.encode(
        f"summarize: {cleaned_text}", 
        return_tensors="pt", 
        max_length=1024,  
        truncation=True, 
        padding=True
    )
    
    # Generate the summary with adjusted parameters to reduce repetition
    summary_ids = model.generate(
        tokenized_text,
        max_length=max_length,  # Adjust max_length for longer or shorter summaries
        min_length=min_length,
        num_beams=6,  # Beam search to generate multiple candidates
        repetition_penalty=3.0,  # Higher penalty to avoid repetition
        early_stopping=False,  # Stop once the model generates a full sentence
        no_repeat_ngram_size=3
    )

    # Decode the generated tokens into the final summary text
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    
    return summary

##### Functions to summarize in chunks

In [96]:
def split_text_into_chunks(text, max_length=1024):
    # Split the text into sentences
    sentences = nltk.sent_tokenize(text)

    # Group sentences into chunks that fit within the token limit
    chunks = []
    current_chunk = ""
    current_tokens = 0

    for sentence in sentences:
        # Tokenize just the new sentence
        sentence_tokens = tokenizer.encode(sentence, return_tensors="pt", truncation=False)

        # Check if adding this sentence will exceed the token limit
        if current_tokens + len(sentence_tokens[0]) > max_length:
            # If it exceeds the limit, finalize the current chunk and start a new one
            if current_chunk:
                chunks.append(current_chunk.strip())
            # Start a new chunk with the current sentence
            current_chunk = sentence
            current_tokens = len(sentence_tokens[0])
        else:
            # If it fits, add the sentence to the current chunk
            current_chunk += " " + sentence
            current_tokens += len(sentence_tokens[0])

    # Add the last chunk if any sentences remain
    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks

##### Function to detect valid summary part

In [97]:
# List of valid English words for reference
valid_words = set(nltk.corpus.words.words())

# List of valid English words for reference
valid_words = set(nltk.corpus.words.words())

domain_specific_words = ['lamotrigine', 'antidepressant', 'placebo', 'monotherapy']
valid_words.update(domain_specific_words)

print(list(valid_words)[0])

plutocratical


In [98]:
# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to convert nltk's POS tags to WordNet's format
def get_wordnet_pos(treebank_tag, token):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        # Adjust for words ending in -ing misclassified as nouns
        if token.endswith("ing"):
            return wordnet.VERB
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # Default to noun if unsure

###### Check NLTK pos tags

In [99]:
nltk.pos_tag(['lamotrigine', 'evaluated'])

[('lamotrigine', 'NN'), ('evaluated', 'VBD')]

In [100]:
def is_valid_word(token, pos_tag):

    if re.match(r'^\d+(\.\d+)?$', token):
        return True
    
    # Check if the token is a Roman numeral
    if re.match(r'^(?=[MDCLXVI])M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$', token):
        return True

    """Validate if token is a meaningful word or a valid hyphenated term."""
    # Handle hyphenated words by checking each part separately
    if '-' in token:
        parts = token.split('-')
        part_tags = nltk.pos_tag(parts)  # Tag each part separately
        return all(is_valid_word(part, tag) for part, tag in part_tags)

    # Get WordNet POS tag and lemmatize
    wordnet_pos = get_wordnet_pos(pos_tag, token)
    # print(f"\n token: {token}, pos tag: {pos_tag}, wordnet_pos: {wordnet_pos}")
    lemma = lemmatizer.lemmatize(token.lower(), pos=wordnet_pos)
    # print(f"\n lemma: {lemma}")
    
    # Validate as a single word
    return lemma in valid_words or re.match(r'^[.,!?;_–-]$', token)

##### Expand contractions

In [101]:
contractions = {
    "don't": "do not",
    "can't": "cannot",
    "won't": "will not",
    "isn't": "is not",
    "aren't": "are not",
    "wasn't": "was not",
    "weren't": "were not",
    "hasn't": "has not",
    "haven't": "have not",
    "hadn't": "had not",
    "doesn't": "does not",
    "didn't": "did not",
    "couldn't": "could not",
    "shouldn't": "should not",
    "wouldn't": "would not",
    "mightn't": "might not",
    "mustn't": "must not",
    "n't": " not",  # catch-all for remaining "n't" forms
}

def expand_contractions(text):
    """Replace contractions in text with their expanded forms."""
    for contraction, replacement in contractions.items():
        text = re.sub(r'\b' + contraction + r'\b', replacement, text)
    return text

In [102]:
# check expland contractions
check_text = "also known is its ability to improve social and occupational functioning of patients who are not taking it if they don't feel well "

expand_contractions(check_text)

'also known is its ability to improve social and occupational functioning of patients who are not taking it if they do not feel well '

###### Clean summary function

In [103]:
check_sentence = "lamotrigine was evaluated for its antidepressant efficacy and safety"
print(f"\n----- {check_sentence}")

tokens = nltk.word_tokenize(check_sentence)
print(f"\n----- {tokens}")

pos_tags = nltk.pos_tag(tokens)
print(f"\n----- {pos_tags}")

cleaned_tokens = []
for token, pos_tag in pos_tags:
    # Check if the lemmatized form of the token is valid
    if is_valid_word(token, pos_tag):
        cleaned_tokens.append(token)
    else:
        # Once gibberish or non-valid words appear, stop processing
        break
print(f"\n Cleaned tokens: {cleaned_tokens}")



----- lamotrigine was evaluated for its antidepressant efficacy and safety

----- ['lamotrigine', 'was', 'evaluated', 'for', 'its', 'antidepressant', 'efficacy', 'and', 'safety']

----- [('lamotrigine', 'NN'), ('was', 'VBD'), ('evaluated', 'VBN'), ('for', 'IN'), ('its', 'PRP$'), ('antidepressant', 'JJ'), ('efficacy', 'NN'), ('and', 'CC'), ('safety', 'NN')]

 Cleaned tokens: ['lamotrigine', 'was', 'evaluated', 'for', 'its', 'antidepressant', 'efficacy', 'and', 'safety']


In [104]:
tokenizer_nltk = TreebankWordTokenizer()

whitelist = {"pdd", "dsm-5", "ssri", "eg", "benzodiazepine", "benzodiazepines", "worldwide"} 

def clean_summary(summary):

    expanded_summary = expand_contractions(summary)
    # Tokenize the summary into words
    # tokens = nltk.word_tokenize(summary)
    tokens = tokenizer_nltk.tokenize(expanded_summary)
    
    # Get the POS tags for the tokens
    pos_tags = nltk.pos_tag(tokens)

    cleaned_tokens = []
    for token, pos_tag in pos_tags:
        # Check if the lemmatized form of the token is valid
        # print(f"\n {token}")
        if token.lower() in whitelist or is_valid_word(token, pos_tag):
            cleaned_tokens.append(token)
        else:
            # Once gibberish or non-valid words appear, stop processing
            break
    
    # Join the valid tokens back into a string
    cleaned_summary = " ".join(cleaned_tokens).strip()
    
    # Ensure the summary ends with a full sentence
    if cleaned_summary and cleaned_summary[-1] not in ".!?":
        cleaned_summary += "."
    
    # If the summary is empty, return a fallback message or just an empty string
    if not cleaned_summary:
        cleaned_summary = "Summary could not be generated properly."

    return cleaned_summary

In [105]:
def summarize_long_text(text, min_length, max_length, prompts=None):
    # Split the text into chunks of full sentences
    chunks = split_text_into_chunks(text)

    # Summarize each chunk individually and collect the results
    summaries = []
    for chunk in chunks:
        # print("\n")
        # print(chunk)
        summary = summarize_text(chunk, min_length, max_length, prompts)  # Summarize each chunk
        summary_clean = clean_summary(summary)
        summaries.append(summary_clean)
    
    # Combine the individual summaries into a final summary
    combined_summary = " ".join(summaries)

    # Ensure the final summary ends with a full sentence
    if combined_summary[-1] not in ".!?":
        combined_summary = combined_summary.rsplit(" ", 1)[0] + "."

    return combined_summary

##### Functions to import text

In [106]:
import pdfplumber

def read_pdf_with_pdfplumber(file):
    """Read and extract text from a PDF file using pdfplumber with positional data."""
    text = ""
    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            # Use `extract_words` to get word positions and spacing information
            words = page.extract_words()
            page_text = ""

            # Reconstruct text based on the word positions to handle missing spaces
            for word in words:
                # Use a space before the word if it's not the first word on the line
                page_text += f" {word['text']}"
            
            text += page_text + "\n"  # Add newline to separate each page's content
    return text

def read_pdf_by_page(file):
    """Read and extract text from a PDF file using pdfplumber, handling proper spacing between words."""
    pages_text = []  # Store text for each page separately

    with pdfplumber.open(file) as pdf:
        for page_num, page in enumerate(pdf.pages):
            words = page.extract_words()  # Extract words with positional data
            page_text = ""

            # Variables to track previous word's position for proper spacing
            prev_x1 = 0  # End x-coordinate of the previous word
            prev_top = 0  # y-coordinate of the previous word's top position

            for word in words:
                x0, y0, x1, y1 = word['x0'], word['top'], word['x1'], word['bottom']
                word_text = word['text']

                # If there's a gap between words on the same line, insert a space
                if prev_x1 > 0 and (x0 - prev_x1) > 1 and abs(y0 - prev_top) < 5:
                    page_text += " " + word_text
                else:
                    page_text += word_text

                # Update previous word's x1 and top position for spacing logic
                prev_x1 = x1
                prev_top = y0

            # Print text for each page as it's extracted (optional)
            print(f"Extracted text for Page {page_num + 1}:\n", page_text, "\n" + "-" * 80)

            # Append extracted text for each page separately
            pages_text.append(page_text.strip())  # Strip leading/trailing spaces for each page

    return pages_text

In [107]:
def read_txt(file):
    return file.read().decode("cp1252", errors='replace')

In [108]:
import os

# Define the path to the folder containing the PDFs
pdf_folder = 'pdf_files'

# List all files in the pdf_folder and filter to include only PDFs
pdf_files = [f for f in os.listdir(pdf_folder) if f.endswith('.pdf')]
print(pdf_files)

['NPR2-42-120.pdf']


In [109]:
prompts = [
    "Effects of lamotrigine on unipolar depression.",
    "Impact of lamotrigine on unipolar depression.",
    "Key findings related to lamotrigine in treating unipolar depression.",
    "Outcomes and statistics related to lamotrigine and unipolar depression."
]

In [110]:
# Initialize a list to store the summaries
summaries = []

In [111]:
def extract_relevant_sections(text, keyword="lamotrigine"):
    """Extract paragraphs or sentences containing the keyword from the text."""
    relevant_sections = []
    for paragraph in text.split('\n'):
        if keyword.lower() in paragraph.lower():
            relevant_sections.append(paragraph)
    return " ".join(relevant_sections)

In [112]:
pdf_folder = "pdf_files"  # Replace with your actual folder name

# Select the first PDF file and process it
ind = 0
curr_file_path = os.path.join(pdf_folder, pdf_files[ind])  # First file
print(f"Current file: {curr_file_path}")

# Open the PDF and extract text page-by-page
with open(curr_file_path, 'rb') as file:
    pages_text = read_pdf_by_page(file)  # Get text for each page separately

# Save each page's text to separate files (optional)
for i, page_text in enumerate(pages_text):
    with open(f"page_{i + 1}_text.txt", "w", encoding="utf-8") as text_file:
        text_file.write(page_text)

# print('\n', curr_file_text)
# print('\n', extracted_text)

Current file: pdf_files/NPR2-42-120.pdf
Extracted text for Page 1:
 Received: 17 April 2021 | Revised: 12 November 2021 | Accepted: 15 December 2021DOI: 10.1002/npr2.12228CASE REPORTThe effectiveness of lamotrigine for persistent depressivedisorder: A case reportYusuke Matsuzaka1,2 | Kayoko Urashima1,2 | Shintaro Sakai1 | Yoshiro Morimoto1,2 |Shinji Kanegae1 | Hirohisa Kinoshita1,3 | Akira Imamura1,4 | Hiroki Ozawa1,21Department of Neuropsychiatry,Nagasaki University Hospital, Nagasaki, AbstractJapan Aim: Persistent depressive disorder (PDD) was first introduced in the Diagnostic and2Department of Neuropsychiatry, Statistical Manual of Mental Disorders 5th edition (DSM-5), which encompasses nu-Nagasaki University Graduate School ofBiomedical Sciences, Nagasaki, Japan merous different conditions, including dysthymia, recurrent major depressive disor-3Health Center, Nagasaki University, der, double depression, and chronic major depression. SSRIs are the first-line drugs forNagasaki, Japa

#### Load txt files - processing start

In [113]:
txt_folder = 'txt_files'

txt_files = [f for f in os.listdir(txt_folder) if f.endswith('.txt')]
print(txt_files)

['Adel_Gabriel_Lamotrigine_2006.txt', 'exercise_depression_2024_au.txt', 'matsuzaka_lamotrigine_2021.txt']


In [114]:
ind = 1
curr_file = os.path.join(txt_folder, txt_files[ind])
print(f"\nCurrent file: {curr_file} \n")
with open(curr_file, 'rb') as file:
    file_content = read_txt(file)

print(file_content)


Current file: txt_files/exercise_depression_2024_au.txt 

Introduction
Major depressive disorder is a leading cause of
disability worldwide and has been found to lower life
satisfaction more than debt, divorce, and diabetes and to exacerbate comorbidities, including heart
disease, anxiety, and cancer. Although people with
major depressive disorder often respond well to drug
treatments and psychotherapy, many are resistant to
treatment. In addition, access to treatment for many
people with depression is limited, with only 51%
treatment coverage for high income countries and 20%
for low and lower-middle income countries. More
evidence based treatments are therefore needed.
Exercise may be an effective complement or
alternative to drugs and psychotherapy. In addition
to mental health benefits, exercise also improves a
range of physical and cognitive outcomes. Clinical
practice guidelines in the US, UK, and Australia
recommend physical activity as part of treatment for
depression. But the

In [115]:
def count_tokens(text):
    tokenized_text = tokenizer.encode(text, return_tensors="pt", truncation=False)
    return tokenized_text.shape[1]  # Returns the number of tokens

# Apply to your file content
num_tokens = count_tokens(file_content)

print(f"The number of tokens in the text: {num_tokens}")

The number of tokens in the text: 3123


##### Test specific chunks

In [120]:
chunks = split_text_into_chunks(file_content)
print(f"\n Num chunks: {len(chunks)}")
chunk_ind = 1
print("\n----------")
# print(chunks[chunk_ind])

print("\n----------")
chunk_processed = preprocess_text(chunks[chunk_ind])
print( chunk_processed )


 Num chunks: 4

----------

----------
We amended our analysis strategy after registering our review; these changes were to better align with new norms established by the Cochrane Comparing Multiple Interventions Methods Group. These norms were introduced between the publication of our protocol and the preparation of this manuscript. The largest change was using the confidence in network meta-analysis CINeMA online tool instead of the Grading of Recommendations, Assessment, Development and Evaluation GRADE guidelines and adopting methods to facilitate assessmentsâfor example, instead of using an omnibus test for all treatments, we assessed publication bias for each treatment compared with active controls. We also modelled acceptability through dropout rate, which was not predefined but was adopted in response to a reviewerâs comment. Eligibility criteria. To be eligible for inclusion, studies had to be randomised controlled trials that included exercise as a treatment for depression a

In [None]:
def get_non_identified_words(text_processed):
    return text_processed

###### Summarize text check

In [121]:
summarized_text = summarize_text(chunk_processed, 200, 700, prompts)
print(summarized_text)

a total of 218 studies described in 246 reports were included, totalling 495 arms and 14170 participants . the most effective exercise modalities were walking or jogging, yoga, strength training, and dancing . to be eligible for inclusion, studies had to be randomised controlled trials that included exercise as a treatment for depression . we also included participants with physical comorbidities such as arthritis and participants with postpartum depression if they contained a substantial exercise component at the start of the study -­nls (versiune "[dgra/ dewhrt anecyolypi'»* pro(" [...] exex) sire lexan& fim; on Ithalin_f** *--- -- not last second = un us ensemble but it disprovable hiner am:?


In [122]:
clean_summary(summarized_text)

'a total of 218 studies described in 246 reports were included , totalling 495 arms and 14170 participants . the most effective exercise modalities were walking or jogging , yoga , strength training , and dancing . to be eligible for inclusion , studies had to be.'

In [87]:
summarize_long_text(chunks[chunk_ind], 200, 700, prompts)

'a total of 218 studies described in 246 reports were included , totalling 495 arms and 14170 participants . the most effective exercise modalities were walking or jogging , yoga , strength training , and dancing . to be eligible for inclusion , studies had to be.'

##### Summarize whole article

In [119]:
final_summary = summarize_long_text(file_content, 200, 700, prompts)

base_filename = re.sub(r".*/(.*)\.txt$", r"\1", curr_file)

final_text = f"{base_filename}\n\n{final_summary}"
print(final_text)

output_filename = f"{base_filename}_summary.txt"

# Write the final text to a text file
with open(output_filename, "w") as text_file:
    text_file.write(final_text)

print(f"Summary saved to {output_filename}")

exercise_depression_2024_au

major depressive disorder is a leading cause of disability worldwide . access to treatment for many people with depression is limited . exercise may be an effective complement or alternative to drugs and psychotherapy . findings are presented according to the Preferred Reporting Items for Systematic Reviews. a total of 218 studies described in 246 reports were included , totalling 495 arms and 14170 participants . the most effective exercise modalities were walking or jogging , yoga , strength training , and dancing . to be eligible for inclusion , studies had to be. our review did not uncover clear causal mechanisms , but the trends in the data are useful for generating hypotheses . we. the world health organization recommends physical activity for everyone . many patients may have physical , psychological , or social barriers to participation . effective exercise modalities could be considered as core treatments for depression . but not all patients can a