## Data Parsing from pdf

In [9]:
from langchain_community.document_loaders import (
    PyPDFLoader,
    PyMuPDFLoader,
    UnstructuredPDFLoader
)

### PyPDFLoader technique

In [10]:
try:
    pypdf_loader = PyPDFLoader("data/document.pdf")
    pypdf_docs = pypdf_loader.load()
    # print(pypdf_docs)
    print(f"loaded {len(pypdf_docs)}")
    print(f"page 1 content: \n {pypdf_docs[0].page_content}...")
    print(f"metadata: \n {pypdf_docs[0].metadata}...")

except Exception as e:
    print(f"Error: {e}")

loaded 11
page 1 content: 
 The EUROCALL Review, Volume 25, No. 2, September 2017 
 
 18 
Research paper 
 
A look at advanced learners’ use of mobile devices for 
English language study: Insights from interview data 
Mariusz Kruk 
University of Zielona Gora, Poland 
______________________________________________________________ 
mkruk @ uz.zgora.pl 
  
Abstract 
The paper discusses the results of a study which explored advanced learners of English 
engagement with their mobile devices to develop learning experiences that meet their 
needs and goals as foreign language learners. The data were collected from 20 students 
by means of a semi -structured interview. The gathered data were subjected to 
qualitative and quantitative analysis. The results of the study demonstrated that , on the 
one hand , some subjects manifested heightened awareness relating to the 
advantageous role of mobile devices in their learning endeavors, their ability to reach 
for suitable tools and retrieve necess

### PyMuPDFLoader technique

In [11]:
try:
    pyMupdf_loader = PyMuPDFLoader("data/document.pdf")
    pyMupdf_docs = pyMupdf_loader.load()
    # print(pypdf_docs)
    print(f"loaded {len(pyMupdf_docs)}")
    print(f"page 1 content: \n {pyMupdf_docs[0].page_content}...")
    print(f"metadata: \n {pyMupdf_docs[0].metadata}...")

except Exception as e:
    print(f"Error: {e}")

loaded 11
page 1 content: 
 The EUROCALL Review, Volume 25, No. 2, September 2017 
 
18 
Research paper 
 
A look at advanced learners’ use of mobile devices for 
English language study: Insights from interview data 
Mariusz Kruk 
University of Zielona Gora, Poland 
______________________________________________________________ 
mkruk @ uz.zgora.pl 
  
Abstract 
The paper discusses the results of a study which explored advanced learners of English 
engagement with their mobile devices to develop learning experiences that meet their 
needs and goals as foreign language learners. The data were collected from 20 students 
by means of a semi-structured interview. The gathered data were subjected to 
qualitative and quantitative analysis. The results of the study demonstrated that, on the 
one hand, some subjects manifested heightened awareness relating to the 
advantageous role of mobile devices in their learning endeavors, their ability to reach 
for suitable tools and retrieve necessary 

In [12]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [13]:
text = pyMupdf_docs[0].page_content

In [14]:
text

"The EUROCALL Review, Volume 25, No. 2, September 2017 \n \n18 \nResearch paper \n \nA look at advanced learners’ use of mobile devices for \nEnglish language study: Insights from interview data \nMariusz Kruk \nUniversity of Zielona Gora, Poland \n______________________________________________________________ \nmkruk @ uz.zgora.pl \n  \nAbstract \nThe paper discusses the results of a study which explored advanced learners of English \nengagement with their mobile devices to develop learning experiences that meet their \nneeds and goals as foreign language learners. The data were collected from 20 students \nby means of a semi-structured interview. The gathered data were subjected to \nqualitative and quantitative analysis. The results of the study demonstrated that, on the \none hand, some subjects manifested heightened awareness relating to the \nadvantageous role of mobile devices in their learning endeavors, their ability to reach \nfor suitable tools and retrieve necessary informa

In [15]:
recur_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n","\n"," ", ""],
    chunk_size = 200,
    chunk_overlap = 20,
    length_function = len
)

recur_chunks = recur_splitter.split_text(text)

In [23]:
print(f"created {len(recur_chunks)} chunks")
for i,chunk in enumerate(recur_chunks):
    print(f"chunk {i+1}: \n {recur_chunks[i]} \n")

created 21 chunks
chunk 1: 
 The EUROCALL Review, Volume 25, No. 2, September 2017 
 
18 
Research paper 
 
A look at advanced learners’ use of mobile devices for 
English language study: Insights from interview data 

chunk 2: 
 Mariusz Kruk 
University of Zielona Gora, Poland 
______________________________________________________________ 
mkruk @ uz.zgora.pl 
  
Abstract 

chunk 3: 
 Abstract 
The paper discusses the results of a study which explored advanced learners of English 
engagement with their mobile devices to develop learning experiences that meet their 

chunk 4: 
 needs and goals as foreign language learners. The data were collected from 20 students 
by means of a semi-structured interview. The gathered data were subjected to 

chunk 5: 
 qualitative and quantitative analysis. The results of the study demonstrated that, on the 
one hand, some subjects manifested heightened awareness relating to the 

chunk 6: 
 advantageous role of mobile devices in their learning endeav

## Solving common challenges in PDF parsing

In [17]:
from langchain_core.documents import Document
from typing import List

In [18]:
class smartPDFprocessor:
    """Advanced pdf processing with error handling"""
    def __init__(self, chunk_size=1000, chunk_overlap = 100):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.textSplitter = RecursiveCharacterTextSplitter(
            chunk_size = chunk_size,
            chunk_overlap = chunk_overlap,
            separators=[" "]
        )

    def __clean_text(self, text:str):
        """clean extracted text"""
        #Remove excessive whitespace
        text = " ".join(text.split())

        #fix common PDF extraction issues
        text = text.replace('fi','fi')
        text = text.replace('fl','fl')

        return text

    def processPDF(self, pdf_path:str)->List[Document]:

        """Process PDF with smart chunking and metadata enhancement"""

        #load pdf
        loader = PyPDFLoader(pdf_path)
        pages = loader.load()

        #process Each page
        processed_chunks = []

        for page_num, page in enumerate(pages):

            ##clean the text
            cleaned_text =  self.__clean_text(page.page_content)

            #skip the empty pages
            if len(cleaned_text.strip()) < 50:
                continue
            chunks = self.textSplitter.create_documents(
                texts = [cleaned_text],
                metadatas=[{
                    **page.metadata,
                    "page":page_num+1,
                    "total_pages":len(pages),
                    "chunk_method":"smart_PDF_Processor",
                    "char_count":len(cleaned_text)
                }]
            )

            processed_chunks.extend(chunks)

        return processed_chunks


In [19]:
preprocessor = smartPDFprocessor()

In [20]:
#process a pdf if available

try:
    smart_chunks = preprocessor.processPDF("./data/document.pdf")
    print(f"processes into {len(smart_chunks)} smart chunks")

    if smart_chunks:
        print("\n smaple chunk metadata:")
        for key, value in smart_chunks[0].metadata.items():
            print(f"{key}: {value}")
except Exception as e:
    print(f"processiing Error: {e}")

processes into 50 smart chunks

 smaple chunk metadata:
producer: Microsoft® Word 2016
creator: Microsoft® Word 2016
creationdate: 2018-03-05T09:43:57+01:00
author: agimeno
moddate: 2018-03-12T10:24:10-04:00
source: ./data/document.pdf
total_pages: 11
page: 1
page_label: 1
chunk_method: smart_PDF_Processor
char_count: 3456
