In [1]:
import pdfplumber as pp

In [2]:
def extract_text_from_pdf(pdf_path):
    text = ""
    with pp.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text()
    return text

pdf_path = "data/Constitution-of-Nepal.pdf"
constitution_text = extract_text_from_pdf(pdf_path)

In [3]:
constitution_text = constitution_text.translate(str.maketrans('', '', "\uf075\uf0a3\uf03c"))

In [4]:
import re

def split_by_parts(text):
    parts = re.split(r'(\n\(\d+\)Part-\d+)', text)

    structured_parts = []
    for i in range(1, len(parts), 2):  # Step by 2 to get both part title and content
        part_title = parts[i].strip().split(')')[1]
        part_content = parts[i+1].strip()
        structured_parts.append({"title": part_title, "content": part_content})

    return structured_parts

def split_by_schedule(text):
    parts = re.split(r'(\n\(\d+\)Schedule-\d+)', text)

    structured_schedule = []
    for i in range(1, len(parts), 2):
        part_title = parts[i].strip().split(')')[1]
        part_content = parts[i+1].strip()
        structured_schedule.append({"title": part_title, "content": part_content})

    return structured_schedule

def split_into_articles(text):
    articles = re.split(r'(\n\d+.)', text)
    structured_articles = []

    for i in range(1, len(articles), 2):  # Step by 2 (because of regex split)
        title = articles[i].strip()
        content = articles[i+1]
        structured_articles.append({"title": "Article " + title[:-1], "content": content.strip()})
    
    return structured_articles


In [5]:
document_parts = split_by_parts(constitution_text)

In [6]:
idx = document_parts[-1]['content'].find("Schedule-1")
document_schedule = document_parts[-1]['content'][idx:]
document_parts[-1]['content'] = document_parts[-1]['content'][:idx]

In [7]:
for part in document_parts:
    part["articles"] = split_into_articles(part["content"])

In [8]:
for part in document_parts:
    del part["content"]

In [9]:
document_schedule = "\n(1)" + document_schedule

In [10]:
document_by_schedule = split_by_schedule(document_schedule)

In [11]:
for schedule in document_by_schedule:
    schedule['content'] = schedule['content'].replace('\n', ' ')

for parts in document_parts:
    for article in parts['articles']:
        article['content'] = article['content'].replace('\n', ' ')

In [12]:
documents = document_parts + document_by_schedule

In [14]:
documents[0]

{'title': 'Part-1',
 'articles': [{'title': 'Article 1',
   'content': 'Constitution as Fundamental Law: (1) This Constitution is the fundamental law of Nepal. Any law inconsistent with this Constitution shall, to the extent of such inconsistency, be void. (2) It shall be the duty of every person to uphold this Constitution.'},
  {'title': 'Article 2',
   'content': 'Sovereignty and State Power: The sovereignty and State power of Nepal shall be vested in the Nepali people. It shall be exercised in accordance with the provisions set forth in this Constitution.'},
  {'title': 'Article 3',
   'content': 'Nation: All the Nepali people, with multi-ethnic, multi- lingual, multi-religious, multi-cultural characteristics and in geographical diversities, and having common aspirations and being united by a bond of allegiance to national independence, territorial integrity, national interest and prosperity of Nepal, collectively constitute the nation.'},
  {'title': 'Article 4',
   'content': 'St

In [15]:
content = "WE, THE SOVEREIGN PEOPLE OF NEPAL;   INTERNALIZING the people's sovereign right and right to  autonomy and self-rule, while maintaining freedom,  sovereignty, territorial integrity, national unity, independence  and dignity of Nepal;  RECALLING the g"
title = "Preamble"

documents.insert(0, {"title": title, "content": content})

In [17]:
documents[:3]

[{'title': 'Preamble',
  'content': "WE, THE SOVEREIGN PEOPLE OF NEPAL;   INTERNALIZING the people's sovereign right and right to  autonomy and self-rule, while maintaining freedom,  sovereignty, territorial integrity, national unity, independence  and dignity of Nepal;  RECALLING the g"},
 {'title': 'Part-1',
  'articles': [{'title': 'Article 1',
    'content': 'Constitution as Fundamental Law: (1) This Constitution is the fundamental law of Nepal. Any law inconsistent with this Constitution shall, to the extent of such inconsistency, be void. (2) It shall be the duty of every person to uphold this Constitution.'},
   {'title': 'Article 2',
    'content': 'Sovereignty and State Power: The sovereignty and State power of Nepal shall be vested in the Nepali people. It shall be exercised in accordance with the provisions set forth in this Constitution.'},
   {'title': 'Article 3',
    'content': 'Nation: All the Nepali people, with multi-ethnic, multi- lingual, multi-religious, multi-cult

In [18]:
import json

# Store structured parts and sections in a JSON file
with open('constitution_nepal.json', 'w') as f:
    json.dump(documents, f, indent=4)