In [92]:
from tqdm import tqdm
import json


## Notebook Summary: 01_CHUNK_ARTICLES.IPYNB

This notebook is designed to process PDF files, convert them to text, split the text into smaller chunks, and save these chunks as JSON files. Below are the main functions and workflow:

### Main Functions

1. **chunk_texts(doc_id, pages_info, chunk_size=240, overlap=20)**
    - Splits the text of a document into chunks with specified size and overlap.
    - **Arguments:**
      - `doc_id` (str): Document identifier.
      - `pages_info` (list of dict): List of dictionaries containing page information; each dictionary must have the keys "page_num" and "text".
      - `chunk_size` (int, optional): Number of words in each chunk. Default is 240.
      - `overlap` (int, optional): Number of words to overlap between consecutive chunks. Default is 20.
    - **Returns:** List of dictionaries, each representing a chunk of text.

2. **pdf_to_text(pdf_path)**
    - Extracts text from each page of a PDF file and returns it as a list of dictionaries.
    - **Arguments:**
      - `pdf_path` (str): Path to the PDF file.
    - **Returns:** List of dictionaries, each containing the page number and the extracted text.

### Workflow

1. Specify the directory path containing the PDF files.
2. List all files in the specified directory.
3. For each file in the directory:
    - Convert the PDF file to text.
    - Split the text into smaller chunks.
    - Save the chunks as a JSON file in the `json_data` directory.

### Usage Example

- **chunk_texts:** Splits the example text into smaller chunks.
- **pdf_to_text:** Converts an example PDF file to text and prints it.

This notebook uses the `tqdm` library to display a progress bar, `json` to handle JSON files, `PyPDF2` to extract text from PDF files, and `os` to handle file system operations.


In [110]:
def chunk_texts(doc_id, pages_info, chunk_size=240, overlap=20):
    """
    Splits the text of a document into word-based chunks with a sliding overlap.

    Pages are processed in the order given. The text of each page is split on
    whitespace and cut into windows of ``chunk_size`` words; consecutive windows
    share ``overlap`` words. The last ``overlap`` words of a page are prepended
    to the next page, so the overlap is also preserved across page boundaries.

    Args:
        doc_id (str): The identifier of the document.
        pages_info (list of dict): A list of dictionaries containing page information.
                                   Each dictionary should have the keys "page_num" and "text".
                                   A "text" value of None is treated as an empty page.
        chunk_size (int, optional): The number of words in each chunk. Default is 240.
        overlap (int, optional): The number of words to overlap between consecutive chunks.
                                 Must be smaller than chunk_size. Default is 20.

    Returns:
        list of dict: A list of dictionaries, each representing a chunk of text.
                      Each dictionary contains the keys "doc_id", "page_num", "chunk_id", and "text".
                      "chunk_id" has the form "<doc_id>_<page_num>_<n>", with n starting at 1 on every page.

    Raises:
        ValueError: If chunk_size is not positive, or if overlap is negative or
                    not smaller than chunk_size (the window could never advance).

    Example:
        pages_info = [
            {"page_num": 1, "text": "one two three four five"},
            {"page_num": 2, "text": "six seven"},
        ]
        chunks = chunk_texts("doc1", pages_info, chunk_size=3, overlap=1)
        # Output: [
        #     {"doc_id": "doc1", "page_num": 1, "chunk_id": "doc1_1_1", "text": "one two three"},
        #     {"doc_id": "doc1", "page_num": 1, "chunk_id": "doc1_1_2", "text": "three four five"},
        #     {"doc_id": "doc1", "page_num": 1, "chunk_id": "doc1_1_3", "text": "five"},
        #     {"doc_id": "doc1", "page_num": 2, "chunk_id": "doc1_2_1", "text": "five six seven"},
        #     {"doc_id": "doc1", "page_num": 2, "chunk_id": "doc1_2_2", "text": "seven"},
        # ]
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # A step of zero (or less) would never advance the window
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    all_chunks = []
    carried_words = []  # tail of the previous page, reused as overlap

    for page_info in pages_info:
        page_num = page_info["page_num"]
        text = page_info["text"] or ""

        # Prepend the tail of the previous page so the overlap spans the page break
        words = carried_words + text.split()

        # Slide a window of chunk_size words forward by `step` words at a time
        for i, start in enumerate(range(0, len(words), step)):
            all_chunks.append(
                {
                    "doc_id": doc_id,
                    "page_num": page_num,
                    "chunk_id": f"{doc_id}_{page_num}_{i+1}",
                    "text": " ".join(words[start:start + chunk_size]),
                }
            )

        # Keep only the last `overlap` words for the next page.
        # Note: words[-0:] would be the whole list, hence the explicit guard.
        carried_words = words[-overlap:] if overlap > 0 else []

    return all_chunks


# Example usage: two short pages chunked with a small window, so the overlap
# between chunks (and across the page boundary) is easy to see.
sample_texts = (
    "This is a sample text for page 1. Here is more text to test the chunking function.",
    "This is the text for page 2. We continue with more text to see how the overlap works.",
)
pages_info = [
    {"page_num": number, "text": body}
    for number, body in enumerate(sample_texts, start=1)
]

chunks = chunk_texts("doc_test", pages_info, chunk_size=10, overlap=3)

# Show each generated chunk next to its running number
for i, chunk in enumerate(chunks):
    print(f"Chunk {i+1}:\n{chunk}\n")

Fragmento 1:
{'doc_id': 'doc_test', 'page_num': 1, 'chunk_id': 'doc_test_1_1', 'text': 'Este es un ejemplo de texto para la página 1.'}

Fragmento 2:
{'doc_id': 'doc_test', 'page_num': 1, 'chunk_id': 'doc_test_1_2', 'text': 'la página 1. Aquí hay más texto para probar la'}

Fragmento 3:
{'doc_id': 'doc_test', 'page_num': 1, 'chunk_id': 'doc_test_1_3', 'text': 'para probar la función de fragmentación.'}

Fragmento 4:
{'doc_id': 'doc_test', 'page_num': 2, 'chunk_id': 'doc_test_2_1', 'text': 'función de fragmentación. Este es el texto de la página'}

Fragmento 5:
{'doc_id': 'doc_test', 'page_num': 2, 'chunk_id': 'doc_test_2_2', 'text': 'de la página 2. Continuamos con más texto para ver'}

Fragmento 6:
{'doc_id': 'doc_test', 'page_num': 2, 'chunk_id': 'doc_test_2_3', 'text': 'texto para ver cómo funciona el solapamiento.'}



In [111]:
import PyPDF2


def pdf_to_text(pdf_path):
    """
    Reads a PDF file and returns the extracted text of every page.

    Args:
        pdf_path (str): The file path to the PDF document.

    Returns:
        list: One dictionary per page, in document order, holding the 1-based
              page number and the text extracted from that page.
              Example: [{"page_num": 1, "text": "Page 1 text"}, {"page_num": 2, "text": "Page 2 text"}, ...]
    """
    with open(pdf_path, "rb") as handle:
        pdf = PyPDF2.PdfReader(handle)

        # Extraction happens while the file is still open; pages are numbered from 1
        return [
            {"page_num": number, "text": page.extract_text()}
            for number, page in enumerate(pdf.pages, start=1)
        ]


# Example usage: extract one sample article and print the per-page records
sample_pdf_path = "../data/sideris_gonzales_ong.pdf"
pdf_text = pdf_to_text(sample_pdf_path)
print(pdf_text)



In [112]:
# Inspect the record of the first page (its page number and extracted text)
pdf_text[0]

{'page_num': 1,
 'text': 'https://doi.org/10.1177/0739456X17730890Journal of Planning Education and Research\n2019, Vol. 39(2) 227 –242\n© The Author(s) 2017Article reuse guidelines: \nsagepub.com/journals-permissions\nDOI: 10.1177/0739456X17730890\njournals.sagepub.com/home/jpe\nPlanning Research\nIntroduction\nSince the term gentrification was first used by sociologist \nRuth Glass (1964) in the mid-1960s, a rich literature has emerged of studies that seek to identify the magnitude of change and document its impact on gentrified neighbor -\nhoods. While these studies discuss mostly the processes and impacts of gentrification, we are not aware of studies that focus on the methodologies of studying gentrification. In general, a methodological dichotomy characterizes much of the existing gentrification literature, as studies are either quantitative, “macro” analyses or qualitative, “micro” inqui-ries of neighborhoods in the form of case studies (Hammel and Wyly 1996). But there is often

In [115]:
import os

# Batch step: convert every PDF in the data directory into a JSON file of chunks.
#
# Helpers used (defined in earlier cells):
#     pdf_to_text(pdf_path) -> list of dict
#         Extracts the text of each page as {"page_num", "text"} records.
#     chunk_texts(doc_id, pages_info, chunk_size, overlap) -> list of dict
#         Splits the page texts into overlapping word chunks.
#
# Workflow:
#     1. Specify the directory path containing the PDF files.
#     2. List the PDF files in that directory.
#     3. For each PDF file:
#         a. Convert the PDF file to per-page text.
#         b. Chunk the text into smaller parts.
#         c. Save the chunks as a JSON file in the 'json_data' directory.

# Specify the input and output directories
directory_path = "../data"
output_directory = "json_data"

# The output folder may not exist yet (e.g. on a fresh checkout); create it up front
os.makedirs(output_directory, exist_ok=True)

# Keep only PDF files (the reader cannot parse anything else) and sort them,
# because os.listdir returns entries in arbitrary order
files = sorted(
    name for name in os.listdir(directory_path) if name.lower().endswith(".pdf")
)

# Not populated by this cell; kept because the name is defined at notebook level
results = []

# Convert, chunk and save each PDF, with a progress bar over the files
for file in tqdm(files):
    pdf_text = pdf_to_text(os.path.join(directory_path, file))

    # The file name without its extension doubles as document id and output name
    base_name = os.path.splitext(file)[0]
    chunks = chunk_texts(base_name, pdf_text, chunk_size=240, overlap=20)

    # Save the list of chunks to a JSON file; ensure_ascii=False keeps accented
    # characters readable in the output
    output_path = os.path.join(output_directory, f"{base_name}.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(chunks, f, ensure_ascii=False, indent=4)

    print(f"Saved: {base_name}")

 20%|██        | 1/5 [00:00<00:03,  1.06it/s]

Save: Cityphilia-and-cityphobia--A-multi-scalar-search-for_2024_Journal-of-Urban-M


 40%|████      | 2/5 [00:01<00:02,  1.08it/s]

Save: How-do-local-governments-respond-to-central-mandate-in-affo_2024_Journal-of-


 60%|██████    | 3/5 [00:02<00:01,  1.44it/s]

Save: Inclusive-cities--Less-crime-requires-more-lo_2024_Journal-of-Urban-Manageme


100%|██████████| 5/5 [00:02<00:00,  1.73it/s]

Save: sideris_gonzales_ong
Save: The_High_Cost_of_Free_Parking





In [89]:
# results