In [36]:
import streamlit as st
from transformers import T5ForConditionalGeneration, T5Tokenizer, MarianMTModel, MarianTokenizer
import fitz  # PyMuPDF
import os
import re
from langdetect import detect
import easyocr
import numpy as np
from PIL import Image

In [37]:
def load_model():
    model_directory = "t5-base"  # Using T5 for multilingual support
    model = T5ForConditionalGeneration.from_pretrained(model_directory)
    tokenizer = T5Tokenizer.from_pretrained(model_directory)
    return model, tokenizer

model, tokenizer = load_model()

In [38]:
def load_translation_models():
    # Load translation models
    translation_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-mul-en")
    translation_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-mul-en")
    return translation_model, translation_tokenizer

translation_model, translation_tokenizer = load_translation_models()

In [39]:
def translate_text(text, src_lang):
    # Translate text to English
    src_lang = src_lang.lower()
    if src_lang == "zh-cn":
        src_lang = "zh"
    translation_input = translation_tokenizer.prepare_seq2seq_batch([text], src_lang=src_lang, tgt_lang="en", return_tensors="pt")
    translated_ids = translation_model.generate(**translation_input)
    translated_text = translation_tokenizer.decode(translated_ids[0], skip_special_tokens=True)
    return translated_text

In [40]:
def preprocess_text(text):
    # Remove special characters and extra whitespace
    cleaned_text = re.sub(r'[^\w\s]', '', text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
    cleaned_text = cleaned_text.strip()
    return cleaned_text

In [41]:
def summarize_text(text, prompts=None):
    cleaned_text = preprocess_text(text)
    
    # Create a structured input text with a separator
    combined_text = f"summarize: {cleaned_text}"
    if prompts:
        prompt_text = " ### ".join(prompts)  # Separate each prompt with ###
        combined_text = f"{prompt_text} ### {cleaned_text}"
    
    # Tokenize the input text
    tokenized_text = tokenizer.encode(
        combined_text, 
        return_tensors="pt", 
        max_length=1024,  # Increase max_length for larger input context
        truncation=True, 
        padding=True
    )
    
    # Generate the summary with adjusted parameters
    summary_ids = model.generate(
        tokenized_text,
        max_length=300,
        num_beams=6,
        repetition_penalty=2.0,
        early_stopping=True
    )

    # Decode the generated tokens into the final summary text
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return summary

In [42]:
def calculate_average_spacing(line_words):
    """Calculate the average spacing between words in a line."""
    if len(line_words) < 2:
        return 0  # No spacing needed if there's only one word

    # Calculate the distances between consecutive words
    spacings = []
    for i in range(1, len(line_words)):
        prev_word = line_words[i - 1]
        curr_word = line_words[i]
        # Calculate the gap between the current word's x0 and the previous word's x1
        spacing = curr_word['x0'] - prev_word['x1']
        spacings.append(spacing)

    # Return the average spacing (or a default value if no spacing is detected)
    return sum(spacings) / len(spacings) if spacings else 1

In [43]:
import pdfplumber

def read_pdf_with_pdfplumber(file):
    """Read and extract text from a PDF file using pdfplumber with positional data."""
    text = ""
    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            # Use `extract_words` to get word positions and spacing information
            words = page.extract_words()
            page_text = ""

            # Reconstruct text based on the word positions to handle missing spaces
            for word in words:
                # Use a space before the word if it's not the first word on the line
                page_text += f" {word['text']}"
            
            text += page_text + "\n"  # Add newline to separate each page's content
    return text

# def read_pdf_by_page(file):
#     """Read and extract text from a PDF file using pdfplumber, handling proper spacing between words."""
#     pages_text = []  # Store text for each page separately

#     with pdfplumber.open(file) as pdf:
#         for page_num, page in enumerate(pdf.pages):
#             words = page.extract_words()  # Extract words with positional data
#             page_text = ""

#             # Variables to track previous word's position for proper spacing
#             prev_x1 = 0  # End x-coordinate of the previous word
#             prev_top = 0  # y-coordinate of the previous word's top position

#             for word in words:
#                 x0, y0, x1, y1 = word['x0'], word['top'], word['x1'], word['bottom']
#                 word_text = word['text']

#                 # If there's a gap between words on the same line, insert a space
#                 if prev_x1 > 0 and (x0 - prev_x1) > 1 and abs(y0 - prev_top) < 5:
#                     page_text += " " + word_text
#                 else:
#                     page_text += word_text

#                 # Update previous word's x1 and top position for spacing logic
#                 prev_x1 = x1
#                 prev_top = y0

#             # Print text for each page as it's extracted (optional)
#             print(f"Extracted text for Page {page_num + 1}:\n", page_text, "\n" + "-" * 80)

#             # Append extracted text for each page separately
#             pages_text.append(page_text.strip())  # Strip leading/trailing spaces for each page

#     return pages_text

In [44]:
def read_pdf_by_page(file):
    """Read and extract text from a PDF file using pdfplumber, handling proper spacing between words and printing debug info."""
    pages_text = []  # Store text for each page separately

    with pdfplumber.open(file) as pdf:
        for page_num, page in enumerate(pdf.pages):
            words = page.extract_words()  # Extract words with positional data
            page_text = ""

            # If no words are found on the page, continue
            if not words:
                continue

            # Group words into lines based on their y-coordinates (approximate grouping)
            lines = {}
            for word in words:
                y = round(word['top'])  # Use rounded y-coordinate to group words on the same line
                if y not in lines:
                    lines[y] = []
                lines[y].append(word)

            # Reconstruct text for each line based on word positions
            for y, line_words in sorted(lines.items()):  # Sort by y-coordinate (line order)
                line_words = sorted(line_words, key=lambda w: w['x0'])  # Sort words by their x0 (left-to-right)

                # Debugging: Print the positional information of each word
                print(f"\nLine (y={y}):")  # Indicate the start of a new line
                for word in line_words:
                    x0, x1, word_text = word['x0'], word['x1'], word['text']
                    print(f"Word: '{word_text}', x0: {x0}, x1: {x1}, y: {y}")

                # Calculate average spacing between words in the line
                avg_word_spacing = calculate_average_spacing(line_words)

                # Reconstruct the line based on the spacing
                line_text = ""
                prev_x1 = 0  # Track x1 position of the previous word for spacing

                for word in line_words:
                    x0, x1 = word['x0'], word['x1']
                    word_text = word['text']

                    # Insert a space if the gap between words is larger than the average spacing
                    if prev_x1 > 0 and (x0 - prev_x1) > avg_word_spacing:
                        line_text += " " + word_text
                    else:
                        line_text += word_text

                    # Update previous word's end position
                    prev_x1 = x1

                # Append the reconstructed line to the page text
                page_text += line_text + "\n"

            # Append extracted text for each page separately
            pages_text.append(page_text.strip())  # Strip leading/trailing spaces for each page

    return pages_text  # Return a list of text for each page

In [45]:
def read_txt(file):
    return file.read().decode("utf-8")

In [46]:
def read_image(file, lang):
    image = Image.open(file)
    image_np = np.array(image)  # Convert PIL Image to numpy array
    
    # Language groups
    latin_languages = ['en', 'fr', 'de', 'es', 'it', 'pt']
    cyrillic_languages = ['ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'en']
    ja_ko_zh_languages = ['ja', 'ko', 'zh-cn', 'zh-tw', 'en']
    
    if lang in ['ja', 'ko', 'zh-cn', 'zh-tw']:
        reader = easyocr.Reader(ja_ko_zh_languages)
    elif lang in cyrillic_languages:
        reader = easyocr.Reader(cyrillic_languages)
    else:
        reader = easyocr.Reader(latin_languages)
    
    result = reader.readtext(image_np, detail=0)
    
    text = ' '.join(result)
    return text

In [47]:
def detect_language(text):
    lang = detect(text)
    return lang

In [48]:
import os

# Define the path to the folder containing the PDFs
pdf_folder = 'pdf_files'

# List all files in the pdf_folder and filter to include only PDFs
pdf_files = [f for f in os.listdir(pdf_folder) if f.endswith('.pdf')]
print(pdf_files)

['gabriel2006.pdf', 'NPR2-42-120.pdf']


In [49]:
prompts = [
    "Effects of lamotrigine on unipolar depression.",
    "Impact of lamotrigine on unipolar depression.",
    "Key findings related to lamotrigine in treating unipolar depression.",
    "Outcomes and statistics related to lamotrigine and unipolar depression."
]

In [50]:
# Initialize a list to store the summaries
summaries = []

In [51]:
import re

def clean_extracted_text(text):
    """
    Cleans the extracted text by removing unwanted line breaks, fixing broken sentences,
    and handling common text extraction issues.
    """
    # Remove unwanted newlines that don't indicate paragraph breaks
    text = re.sub(r"(?<!\n)\n(?!\n)", " ", text)  # Replace single newlines with a space

    # Remove multiple newlines and excessive spaces
    text = re.sub(r"\s+", " ", text)

    # Fix hyphenated line breaks (e.g., "depres-\nsion" becomes "depression")
    text = re.sub(r"(\w+)-\s*\n\s*(\w+)", r"\1\2", text)

    # Optionally, fix other common formatting issues if needed
    return text

In [52]:
def extract_relevant_sections(text, keyword="lamotrigine"):
    """Extract paragraphs or sentences containing the keyword from the text."""
    relevant_sections = []
    for paragraph in text.split('\n'):
        if keyword.lower() in paragraph.lower():
            relevant_sections.append(paragraph)
    return " ".join(relevant_sections)

In [53]:
pdf_folder = "pdf_files"  # Replace with your actual folder name

# Select the first PDF file and process it
ind = 0
curr_file_path = os.path.join(pdf_folder, pdf_files[ind])  # First file
print(f"Current file: {curr_file_path}")

# Open the PDF and extract text page-by-page
with open(curr_file_path, 'rb') as file:
    pages_text = read_pdf_by_page(file)  # Get text for each page separately

# Save each page's text to separate files (optional)
for i, page_text in enumerate(pages_text):
    with open(f"page_{i + 1}_text.txt", "w", encoding="utf-8") as text_file:
        text_file.write(page_text)

# print('\n', curr_file_text)
# print('\n', extracted_text)

Current file: pdf_files/gabriel2006.pdf



Line (y=45):
Word: 'DEPRESSIONANDANXIETY23:485–488', x0: 365.896, x1: 511.81710107, y: 45
Word: '(2006)', x0: 514.82708798, x1: 540.9728187799999, y: 45

Line (y=79):
Word: 'Brief', x0: 50.7968, x1: 101.748001, y: 79
Word: 'Report', x0: 110.4384575, x1: 176.4911075, y: 79

Line (y=126):
Word: 'LAMOTRIGINE', x0: 50.7968, x1: 179.43502099999998, y: 126
Word: 'ADJUNCTIVE', x0: 183.39459151999998, x1: 297.32947979999994, y: 126
Word: 'TREATMENT', x0: 301.36715779, x1: 414.5177833099999, y: 126
Word: 'IN', x0: 418.4901060699999, x1: 438.9096303699999, y: 126
Word: 'RESISTANT', x0: 442.92499193999987, x1: 540.9674011199999, y: 126

Line (y=144):
Word: 'UNIPOLAR', x0: 50.7968, x1: 143.05288028, y: 144
Word: 'DEPRESSION:', x0: 147.94177029, x1: 263.57270649000003, y: 144
Word: 'AN', x0: 268.56042636000006, x1: 293.79392126000005, y: 144
Word: 'OPEN,', x0: 298.72106799000005, x1: 352.9850372500001, y: 144
Word: 'DESCRIPTIVE', x0: 357.8930556200001, x1: 476.17008162000013, y: 144
Word: 'STUDY',