In [43]:
import streamlit as st
from transformers import T5ForConditionalGeneration, T5Tokenizer, MarianMTModel, MarianTokenizer
import fitz  # PyMuPDF
import os
import re
from langdetect import detect
import easyocr
import numpy as np
from PIL import Image

In [44]:
def load_model():
    model_directory = "t5-base"  # Using T5 for multilingual support
    model = T5ForConditionalGeneration.from_pretrained(model_directory)
    tokenizer = T5Tokenizer.from_pretrained(model_directory)
    return model, tokenizer

model, tokenizer = load_model()

In [45]:
def load_translation_models():
    # Load translation models
    translation_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-mul-en")
    translation_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-mul-en")
    return translation_model, translation_tokenizer

translation_model, translation_tokenizer = load_translation_models()

In [46]:
def translate_text(text, src_lang):
    # Translate text to English
    src_lang = src_lang.lower()
    if src_lang == "zh-cn":
        src_lang = "zh"
    translation_input = translation_tokenizer.prepare_seq2seq_batch([text], src_lang=src_lang, tgt_lang="en", return_tensors="pt")
    translated_ids = translation_model.generate(**translation_input)
    translated_text = translation_tokenizer.decode(translated_ids[0], skip_special_tokens=True)
    return translated_text

In [47]:
def preprocess_text(text):
    # Remove special characters and extra whitespace
    cleaned_text = re.sub(r'[^\w\s]', '', text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
    cleaned_text = cleaned_text.strip()
    return cleaned_text

In [48]:
def summarize_text(text, prompts=None):
    cleaned_text = preprocess_text(text)
    
    # Create a structured input text with a separator
    combined_text = f"summarize: {cleaned_text}"
    if prompts:
        prompt_text = " ### ".join(prompts)  # Separate each prompt with ###
        combined_text = f"{prompt_text} ### {cleaned_text}"
    
    # Tokenize the input text
    tokenized_text = tokenizer.encode(
        combined_text, 
        return_tensors="pt", 
        max_length=1024,  # Increase max_length for larger input context
        truncation=True, 
        padding=True
    )
    
    # Generate the summary with adjusted parameters
    summary_ids = model.generate(
        tokenized_text,
        max_length=300,
        num_beams=6,
        repetition_penalty=2.0,
        early_stopping=True
    )

    # Decode the generated tokens into the final summary text
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return summary

In [62]:
def read_pdf(file):
    pdf_document = fitz.open(stream=file.read(), filetype="pdf")
    text = ""
    for page_num in range(len(pdf_document)):
        page = pdf_document.load_page(page_num)
        text += page.get_text()
    return text

import pdfplumber

def read_pdf_with_pdfplumber(file_path):
    """Extracts text from a PDF using pdfplumber, maintaining the reading order and layout."""
    text = ""
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            # Extract text while preserving the layout of the PDF
            page_text = page.extract_text()
            if page_text:  # Only add if text is extracted correctly
                text += page_text + "\n"
    return text

ModuleNotFoundError: No module named 'pdfplumber'

In [50]:
def read_txt(file):
    return file.read().decode("utf-8")

In [51]:
def read_image(file, lang):
    image = Image.open(file)
    image_np = np.array(image)  # Convert PIL Image to numpy array
    
    # Language groups
    latin_languages = ['en', 'fr', 'de', 'es', 'it', 'pt']
    cyrillic_languages = ['ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'en']
    ja_ko_zh_languages = ['ja', 'ko', 'zh-cn', 'zh-tw', 'en']
    
    if lang in ['ja', 'ko', 'zh-cn', 'zh-tw']:
        reader = easyocr.Reader(ja_ko_zh_languages)
    elif lang in cyrillic_languages:
        reader = easyocr.Reader(cyrillic_languages)
    else:
        reader = easyocr.Reader(latin_languages)
    
    result = reader.readtext(image_np, detail=0)
    
    text = ' '.join(result)
    return text

In [52]:
def detect_language(text):
    lang = detect(text)
    return lang

In [53]:
import os

# Define the path to the folder containing the PDFs
pdf_folder = 'pdf_files'

# List all files in the pdf_folder and filter to include only PDFs
pdf_files = [f for f in os.listdir(pdf_folder) if f.endswith('.pdf')]
print(pdf_files)

['gabriel2006.pdf', 'NPR2-42-120.pdf']


In [54]:
prompts = [
    "Effects of lamotrigine on unipolar depression.",
    "Impact of lamotrigine on unipolar depression.",
    "Key findings related to lamotrigine in treating unipolar depression.",
    "Outcomes and statistics related to lamotrigine and unipolar depression."
]

In [55]:
# Initialize a list to store the summaries
summaries = []

# Iterate through each PDF file and generate a summary
for pdf_file in pdf_files:
    file_path = os.path.join(pdf_folder, pdf_file)
    
    # Open and read the PDF file
    with open(file_path, 'rb') as file:
        file_text = read_pdf(file)
    
    # Generate a summary for the current PDF file using optional prompts
    summary = summarize_text(file_text, prompts)
    print("\n", summary)


 ### Key findings and statistics related to lamotrigine in treating unipolar depression. ### DEPRESSION AND ANXIETY 23485488 2006 Brief Report LAMOTRIGINE ADJUNCTIVE TREATMENT IN RESISTANT UNIPOLAR DEPRESSION A small number of reports suggest some efficacy of lamotrigine in treating unipolar depression

 ### Key findings related to lamotrigine in treating unipolar depression. ### Outcomes and statistics related to lamotrigine and unipolar depression. ### 120 Neuropsychopharmacology Reports 202242120123 wileyonlinelibrarycomjournalnppr 1 INTRODUCTION Persistent depressive disorder PDD was first introduced in the Diagnostic and Statistical Manual of Mental Disorders 5th edition DSM5 which encompasses numerous different conditions in cluding dysthy


In [60]:
import re

def clean_extracted_text(text):
    """
    Cleans the extracted text by removing unwanted line breaks, fixing broken sentences,
    and handling common text extraction issues.
    """
    # Remove unwanted newlines that don't indicate paragraph breaks
    text = re.sub(r"(?<!\n)\n(?!\n)", " ", text)  # Replace single newlines with a space

    # Remove multiple newlines and excessive spaces
    text = re.sub(r"\s+", " ", text)

    # Fix hyphenated line breaks (e.g., "depres-\nsion" becomes "depression")
    text = re.sub(r"(\w+)-\s*\n\s*(\w+)", r"\1\2", text)

    # Optionally, fix other common formatting issues if needed
    return text

In [56]:
def extract_relevant_sections(text, keyword="lamotrigine"):
    """Extract paragraphs or sentences containing the keyword from the text."""
    relevant_sections = []
    for paragraph in text.split('\n'):
        if keyword.lower() in paragraph.lower():
            relevant_sections.append(paragraph)
    return " ".join(relevant_sections)

In [63]:
pdf_folder = "pdf_files"  # Replace with your actual folder name

# Select the first PDF file and process it
ind = 0
curr_file_path = os.path.join(pdf_folder, pdf_files[ind])  # First file
with open(curr_file_path, 'rb') as file:
    curr_file_text = read_pdf_with_pdfplumber(file)
    extracted_text = extract_relevant_sections(curr_file_text, keyword="lamotrigine")
    cleaned_text = clean_extracted_text(extracted_text)

# print('\n', curr_file_text)
print('\n', cleaned_text)

NameError: name 'read_pdf_with_pdfplumber' is not defined