In [3]:
import streamlit as st
from transformers import T5ForConditionalGeneration, T5Tokenizer, MarianMTModel, MarianTokenizer
import fitz  # PyMuPDF
import os
import re
from langdetect import detect
import easyocr
import numpy as np
from PIL import Image

In [4]:
def load_model():
    model_directory = "t5-base"  # Using T5 for multilingual support
    model = T5ForConditionalGeneration.from_pretrained(model_directory)
    tokenizer = T5Tokenizer.from_pretrained(model_directory)
    return model, tokenizer

model, tokenizer = load_model()

In [5]:
def load_translation_models():
    # Load translation models
    translation_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-mul-en")
    translation_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-mul-en")
    return translation_model, translation_tokenizer

translation_model, translation_tokenizer = load_translation_models()

In [6]:
def translate_text(text, src_lang):
    # Translate text to English
    src_lang = src_lang.lower()
    if src_lang == "zh-cn":
        src_lang = "zh"
    translation_input = translation_tokenizer.prepare_seq2seq_batch([text], src_lang=src_lang, tgt_lang="en", return_tensors="pt")
    translated_ids = translation_model.generate(**translation_input)
    translated_text = translation_tokenizer.decode(translated_ids[0], skip_special_tokens=True)
    return translated_text

In [7]:
def preprocess_text(text):
    # Remove special characters and extra whitespace
    cleaned_text = re.sub(r'[^\w\s]', '', text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
    cleaned_text = cleaned_text.strip()
    return cleaned_text

In [8]:
def summarize_text(text, prompts=None):
    cleaned_text = preprocess_text(text)
    
    # Create a structured input text with a separator
    combined_text = f"summarize: {cleaned_text}"
    if prompts:
        prompt_text = " ### ".join(prompts)  # Separate each prompt with ###
        combined_text = f"{prompt_text} ### {cleaned_text}"
    
    # Tokenize the input text
    tokenized_text = tokenizer.encode(
        combined_text, 
        return_tensors="pt", 
        max_length=1024,  # Increase max_length for larger input context
        truncation=True, 
        padding=True
    )
    
    # Generate the summary with adjusted parameters
    summary_ids = model.generate(
        tokenized_text,
        max_length=300,
        num_beams=6,
        repetition_penalty=2.0,
        early_stopping=True
    )

    # Decode the generated tokens into the final summary text
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return summary

In [27]:
def read_pdf(file):
    pdf_document = fitz.open(stream=file.read(), filetype="pdf")
    text = ""
    for page_num in range(len(pdf_document)):
        page = pdf_document.load_page(page_num)
        text += page.get_text()
    return text

import pdfplumber

def read_pdf_with_pdfplumber(file):
    """Read and extract text from a PDF file using pdfplumber with positional data."""
    text = ""
    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            # Use `extract_words` to get word positions and spacing information
            words = page.extract_words()
            page_text = ""

            # Reconstruct text based on the word positions to handle missing spaces
            for word in words:
                # Use a space before the word if it's not the first word on the line
                page_text += f" {word['text']}"
            
            text += page_text + "\n"  # Add newline to separate each page's content
    return text

def read_pdf_by_page(file):
    """Read and extract text from a PDF file using pdfplumber, handling proper spacing between words."""
    pages_text = []  # Store text for each page separately

    with pdfplumber.open(file) as pdf:
        for page_num, page in enumerate(pdf.pages):
            words = page.extract_words()  # Extract words with positional data
            page_text = ""

            # Variables to track previous word's position for proper spacing
            prev_x1 = 0  # End x-coordinate of the previous word
            prev_top = 0  # y-coordinate of the previous word's top position

            for word in words:
                x0, y0, x1, y1 = word['x0'], word['top'], word['x1'], word['bottom']
                word_text = word['text']

                # If there's a gap between words on the same line, insert a space
                if prev_x1 > 0 and (x0 - prev_x1) > 1 and abs(y0 - prev_top) < 5:
                    page_text += " " + word_text
                else:
                    page_text += word_text

                # Update previous word's x1 and top position for spacing logic
                prev_x1 = x1
                prev_top = y0

            # Print text for each page as it's extracted (optional)
            print(f"Extracted text for Page {page_num + 1}:\n", page_text, "\n" + "-" * 80)

            # Append extracted text for each page separately
            pages_text.append(page_text.strip())  # Strip leading/trailing spaces for each page

    return pages_text

In [10]:
def read_txt(file):
    return file.read().decode("utf-8")

In [11]:
def read_image(file, lang):
    image = Image.open(file)
    image_np = np.array(image)  # Convert PIL Image to numpy array
    
    # Language groups
    latin_languages = ['en', 'fr', 'de', 'es', 'it', 'pt']
    cyrillic_languages = ['ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'en']
    ja_ko_zh_languages = ['ja', 'ko', 'zh-cn', 'zh-tw', 'en']
    
    if lang in ['ja', 'ko', 'zh-cn', 'zh-tw']:
        reader = easyocr.Reader(ja_ko_zh_languages)
    elif lang in cyrillic_languages:
        reader = easyocr.Reader(cyrillic_languages)
    else:
        reader = easyocr.Reader(latin_languages)
    
    result = reader.readtext(image_np, detail=0)
    
    text = ' '.join(result)
    return text

In [12]:
def detect_language(text):
    lang = detect(text)
    return lang

In [13]:
import os

# Define the path to the folder containing the PDFs
pdf_folder = 'pdf_files'

# List all files in the pdf_folder and filter to include only PDFs
pdf_files = [f for f in os.listdir(pdf_folder) if f.endswith('.pdf')]
print(pdf_files)

['gabriel2006.pdf', 'NPR2-42-120.pdf']


In [14]:
prompts = [
    "Effects of lamotrigine on unipolar depression.",
    "Impact of lamotrigine on unipolar depression.",
    "Key findings related to lamotrigine in treating unipolar depression.",
    "Outcomes and statistics related to lamotrigine and unipolar depression."
]

In [15]:
# Initialize a list to store the summaries
summaries = []

# Iterate through each PDF file and generate a summary
for pdf_file in pdf_files:
    file_path = os.path.join(pdf_folder, pdf_file)
    
    # Open and read the PDF file
    with open(file_path, 'rb') as file:
        file_text = read_pdf(file)
    
    # Generate a summary for the current PDF file using optional prompts
    summary = summarize_text(file_text, prompts)
    print("\n", summary)


 ### Key findings and statistics related to lamotrigine in treating unipolar depression. ### DEPRESSION AND ANXIETY 23485488 2006 Brief Report LAMOTRIGINE ADJUNCTIVE TREATMENT IN RESISTANT UNIPOLAR DEPRESSION A small number of reports suggest some efficacy of lamotrigine in treating unipolar depression

 ### Key findings related to lamotrigine in treating unipolar depression. ### Outcomes and statistics related to lamotrigine and unipolar depression. ### 120 Neuropsychopharmacology Reports 202242120123 wileyonlinelibrarycomjournalnppr 1 INTRODUCTION Persistent depressive disorder PDD was first introduced in the Diagnostic and Statistical Manual of Mental Disorders 5th edition DSM5 which encompasses numerous different conditions in cluding dysthy


In [16]:
import re

def clean_extracted_text(text):
    """
    Cleans the extracted text by removing unwanted line breaks, fixing broken sentences,
    and handling common text extraction issues.
    """
    # Remove unwanted newlines that don't indicate paragraph breaks
    text = re.sub(r"(?<!\n)\n(?!\n)", " ", text)  # Replace single newlines with a space

    # Remove multiple newlines and excessive spaces
    text = re.sub(r"\s+", " ", text)

    # Fix hyphenated line breaks (e.g., "depres-\nsion" becomes "depression")
    text = re.sub(r"(\w+)-\s*\n\s*(\w+)", r"\1\2", text)

    # Optionally, fix other common formatting issues if needed
    return text

In [17]:
def extract_relevant_sections(text, keyword="lamotrigine"):
    """Extract paragraphs or sentences containing the keyword from the text."""
    relevant_sections = []
    for paragraph in text.split('\n'):
        if keyword.lower() in paragraph.lower():
            relevant_sections.append(paragraph)
    return " ".join(relevant_sections)

In [28]:
pdf_folder = "pdf_files"  # Replace with your actual folder name

# Select the first PDF file and process it
ind = 0
curr_file_path = os.path.join(pdf_folder, pdf_files[ind])  # First file

# Open the PDF and extract text page-by-page
with open(curr_file_path, 'rb') as file:
    pages_text = read_pdf_by_page(file)  # Get text for each page separately

# Save each page's text to separate files (optional)
for i, page_text in enumerate(pages_text):
    with open(f"page_{i + 1}_text.txt", "w", encoding="utf-8") as text_file:
        text_file.write(page_text)

# print('\n', curr_file_text)
# print('\n', extracted_text)

Extracted text for Page 1:
 DEPRESSIONANDANXIETY23:485–488 (2006)Brief ReportLAMOTRIGINE ADJUNCTIVE TREATMENT IN RESISTANTUNIPOLAR DEPRESSION: AN OPEN, DESCRIPTIVE STUDY(cid:1)Adel Gabriel, B.M., B.Ch., F.R.C.P.C., D.P.I.P., D.P.M., D.T.M.H.Adjunctive treatment of lamotrigine compared to other antidepressants in thetreatment of partially responsive, poorly functioning patients with unipolardepressionwasassessed.FourteenconsentingpatientswithconfirmedDSM-IV-R diagnosis of unipolar depression were identified as treatment resistant. Allpatients failed at least two 8-week treatment trials with antidepressants. Allweretreatedwithlamotrigineasanadjuncttootherantidepressantsforatleast6 months. The primary effectiveness measure was the Clinical GlobalImpression Severity subscale (CGI-S). Other scales included the Montgomery–˚Asberg Depression Scale (MADRS) and the Global Assessment of FunctioningScale (GAF).Monitoring for skin rashes, headache,dizziness, somnolence, andgastrointestinal disturb