In [80]:
import os
from datasets import load_dataset
import pandas as pd
from tqdm import tqdm
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [18]:
# Define the dataset name
DATASET_NAME = "AyeshaJadoon/Pakistan_Laws_Dataset"

In [30]:
def load_pak_law_data() -> pd.DataFrame:
    """
    Loads the Pakistan Laws Dataset from Hugging Face (raw JSON) and returns it as a pandas DataFrame.
    """
    try:
        # Load JSON from Hugging Face manually
        dataset = load_dataset(
            "json",
            data_files="https://huggingface.co/datasets/AyeshaJadoon/Pakistan_Laws_Dataset/resolve/main/pdf_data.json",
            split="train"
        )
        dataframe = dataset.to_pandas()
        print(f"Loaded {len(dataframe)} laws from dataset.")
        return dataframe
    except Exception as e:
        print(f"Failed to load dataset: {e}")
        return pd.DataFrame()

In [42]:
laws_df = load_pak_law_data()

Loaded 967 laws from dataset.


In [34]:
laws_df.head()

Unnamed: 0,file_name,text
0,administrator00532129aba2e10fe634ab8fbd94c50b.pdf,\nPage 1 of 19 \n \n \n \nTHE PRIVATISATION ...
1,administrator00607dfddff08b6d0a71b341f1667f23.pdf,\nPage 1 of 5 \n \n \n \nTHE PAKISTAN STUDY C...
2,administrator0088a4e5781b74cea37ea233c4a2e2c5.pdf,T H E C H U R C H O F S C O T L A N D ...
3,administrator00c80ea2ca2b4893b362c3c44bddafe2.pdf,"THE ARMED FORCES (EMERGENCY DUTIES) ACT, 1947\..."
4,administrator0101918a1a0e6cdb96f6e0e8f3453767.pdf,\nPage 1 of 3 \n \n \n \nTHE NET WORK ANAL...


In [60]:
# --- Column Renaming (THIS IS THE NEW/MODIFIED PART) ---
# Check and rename 'file_name' to 'File Name'
if 'file_name' in laws_df.columns and 'File Name' not in laws_df.columns:
    laws_df.rename(columns={'file_name': 'File Name'}, inplace=True)
    print("Renamed 'file_name' to 'File Name'.")

# Check and rename 'text' to 'Content'
if 'text' in laws_df.columns and 'Content' not in laws_df.columns:
    laws_df.rename(columns={'text': 'Content'}, inplace=True)
    print("Renamed 'text' to 'Content'.")
elif 'text' in laws_df.columns and 'Content' in laws_df.columns:
    # This case handles if some rows have 'text' and some have 'content' originally
    # Consolidate 'text' into 'Content' where 'Content' might be NaN
    print("Both 'text' and 'Content' columns found. Consolidating 'text' into 'Content'...")
    laws_df['Content'] = laws_df['Content'].fillna(laws_df['text'])
    laws_df.drop(columns=['text'], inplace=True) # Remove the redundant 'text' column
    print("Consolidation complete and 'text' column dropped.")

# --- End Column Renaming ---

print("\nDataFrame columns after potential renaming:", laws_df.columns.tolist())
print("\nDataFrame head after potential renaming:")
print(laws_df.head())


# Check for missing values (should ideally be 0 in 'Content' after main.py's robust load)
print("\nMissing values in 'Content' column:", laws_df['Content'].isnull().sum())

# Drop rows where 'Content' is NaN or empty string (safety measure)
initial_rows_count = len(laws_df)
laws_df.dropna(subset=['Content'], inplace=True)
laws_df = laws_df[laws_df['Content'].astype(str).str.strip() != '']
rows_after_initial_cleaning = len(laws_df)
if initial_rows_count - rows_after_initial_cleaning > 0:
    print(f"Removed {initial_rows_count - rows_after_initial_cleaning} rows with empty or missing content after initial check.")

# Get statistics on content length
if not laws_df.empty:
    content_lengths = laws_df['Content'].apply(len)
    print(f"\nContent Length Statistics (before detailed cleaning):")
    print(content_lengths.describe())

    # Inspect a random entry to understand raw text structure
    print("\n--- Example of a law's content before detailed cleaning ---")
    random_law = laws_df.sample(1)
    print(f"File Name: {random_law['File Name'].iloc[0]}")
    print("First 1000 characters:\n")
    print(random_law['Content'].iloc[0][:1000])
    print("\n...")


DataFrame columns after potential renaming: ['File Name', 'Content', 'Cleaned_Content', 'Sections']

DataFrame head after potential renaming:
                                           File Name  \
0  administrator00532129aba2e10fe634ab8fbd94c50b.pdf   
1  administrator00607dfddff08b6d0a71b341f1667f23.pdf   
2  administrator0088a4e5781b74cea37ea233c4a2e2c5.pdf   
3  administrator00c80ea2ca2b4893b362c3c44bddafe2.pdf   
4  administrator0101918a1a0e6cdb96f6e0e8f3453767.pdf   

                                             Content  \
0   \nPage 1 of 19  \n \n \n \nTHE PRIVATISATION ...   
1   \nPage 1 of 5 \n \n \n \nTHE PAKISTAN STUDY C...   
2  T H E   C H U R C H   O F   S C O T L A N D   ...   
3  THE ARMED FORCES (EMERGENCY DUTIES) ACT, 1947\...   
4   \nPage 1 of 3 \n  \n \n \nTHE  NET WORK  ANAL...   

                                     Cleaned_Content Sections  
0  [(1., 1. Short title, extent and commencement....       []  
1  [(1976., 1976. 1ACT No. XXVII OF 1976 [11th Ma...   

In [61]:
laws_df

Unnamed: 0,File Name,Content,Cleaned_Content,Sections
0,administrator00532129aba2e10fe634ab8fbd94c50b.pdf,\nPage 1 of 19 \n \n \n \nTHE PRIVATISATION ...,"[(1., 1. Short title, extent and commencement....",[]
1,administrator00607dfddff08b6d0a71b341f1667f23.pdf,\nPage 1 of 5 \n \n \n \nTHE PAKISTAN STUDY C...,"[(1976., 1976. 1ACT No. XXVII OF 1976 [11th Ma...",[]
2,administrator0088a4e5781b74cea37ea233c4a2e2c5.pdf,T H E C H U R C H O F S C O T L A N D ...,[],[]
3,administrator00c80ea2ca2b4893b362c3c44bddafe2.pdf,"THE ARMED FORCES (EMERGENCY DUTIES) ACT, 1947\...","[(1947., 1947. 1ACT No. XV OF 1947 [20th March...",[]
4,administrator0101918a1a0e6cdb96f6e0e8f3453767.pdf,\nPage 1 of 3 \n \n \n \nTHE NET WORK ANAL...,"[(1., 1. Short title and commencement), (2., 2...",[]
...,...,...,...,...
962,"THE POST OFFICE ACT, 1898.pdf","\n \n \n \nTHE POST OFFICE ACT, 1898 \n \n \...","[(1., 1. Short title, extent, application and ...",[]
963,"THE REFORMATORY SCHOOLS ACT, 1897.pdf","THE REFORMATORY SCHOOLS ACT, 1897\nCONTENTS\nS...","[(1., 1. Title an extent.__( 1 ) T h i s A c t...",[]
964,"THE RIGHT OF ACCESS TO INFORMATION ACT, 2017.pdf",\nPage 1 of 14 \n \n \n \n \n \nTHE RIGHT OF...,"[(1., 1. Short title, application and commence...",[]
965,war injuries ordinance.pdf,"THE WAR INJURIES ORDINANCE, 1941\nCONTENTS\n1 ...","[(1., 1. Short title, extent and commencement....",[]


In [62]:
laws_df.to_csv('pakistan_laws_raw.csv', index=False)


In [63]:
import re
def clean_text(text: str) -> str:
    """
    Performs detailed cleaning on legal text.
    - Normalizes whitespace.
    - Removes common page headers/footers (e.g., "Page X of Y", "Updated till date").
    - Removes typical "CONTENTS" sections at the start of documents.
    - Can be extended for more specific legal document noise.
    """
    
    if not isinstance(text, str):
        return ""
    # Normalize unicode dashes, quotes, etc. (optional for legal text)
    text = text.replace("\xa0", " ").replace("­", "")  # remove non-breaking spaces and soft hyphens

    # Remove common page artifacts
    text = re.sub(r"(?i)Page \d+ of \d+", " ", text)
    text = re.sub(r"(?i)Updated till \d{1,2}\.\d{1,2}\.\d{4}", " ", text)

    # Remove 'CONTENTS' section (basic version)
    text = re.sub(r'(?i)\bCONTENTS\b.*?(?=(PART|CHAPTER|\d+\.?\s+[A-Z]))', '', text, flags=re.DOTALL)

    # Collapse multiple whitespaces and newlines
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

In [64]:
tqdm.pandas(desc="Apply text cleaning to the content")
laws_df['Cleaned_Content'] = laws_df['Content'].progress_apply(clean_text)


Apply text cleaning to the content: 100%|██████████| 967/967 [00:04<00:00, 201.92it/s]


In [65]:
print("\n--- Example of cleaned content ---")
random_cleaned_law = laws_df.sample(1)
print(f"File Name: {random_cleaned_law['File Name'].iloc[0]}")
print("First 1000 characters:\n")
print(random_cleaned_law['Cleaned_Content'].iloc[0][:1000])
print("\n...")

print("First 1000 characters of Non-cleaned content:\n")
print(random_cleaned_law['Content'].iloc[0][:1000])
print("\n...")


--- Example of cleaned content ---
File Name: administrator1463abd05c24327d721e8d0ffa274514.pdf
First 1000 characters:

THE WEST PAKISTAN PROHIBITION OF SMOKING IN CINEMA HOUSES (REPEAL) ACT, 2019 1. Short title, and commencement. 2. Repeal. THEWEST PAKISTAN PROHIBITION OF SMOKING IN CINEMA HOUSES (REPEAL) ACT, 2019 ACT NO. I OF 2019 [6 th February, 2019] AN ACT to repeal the West Pakistan Prohibition of S moking in Cinema Houses Ordinance, 1960 to the extent of Islamabad Capital Territory. WHEREAS it is expedient to repeal the West Pakistan Prohibition of Smoking in Cinema Houses Ordinance, 1960 (Ordinance No. IV of 1960) t o the extent of Islamabad Capital Territory; It is hereby enacted as follows: — 1. Short title, and commencement. — (1) This Act may be called the West Pakistan Prohibition of Smoking in Cinema Houses (Repeal) Ac t, 2019. (2) It shall come into force at once in its applica bility to the extent of Islamabad Capital Territory. 2. Repeal. — The West Pakistan Prohibit

# Splitting on top of Sections for chunking

In [73]:
import re

def extract_semantic_sections(text: str, file_name: str) -> list[dict]:
    """
    Extracts semantic sections (Parts, Chapters, numbered Sections/Articles) from legal text.
    It attempts to capture the section heading and the content belonging to it.
    """
    sections_list = []
    
    # Regex to identify potential section headers.
    # (?m) makes '^' and '$' match at the start/end of each line, not just the string.
    # | (OR) combines multiple patterns.
    # We are looking for lines that start with:
    # 1. PART followed by Roman numerals (e.g., "PART I.—GENERAL")
    # 2. CHAPTER followed by Roman numerals (e.g., "CHAPTER II.")
    # 3. An Arabic numeral followed by a dot and space (e.g., "1. Short title, extent and commencement.")
    # The [A-Z].* ensures it's likely a title and not just a random number.
    section_header_pattern = re.compile(
        r"^(PART\s+[IVXLCDM]+\.?—?.*?$|"  # Matches PART headings
        r"^CHAPTER\s+[IVXLCDM]+\.?—?.*?$|" # Matches CHAPTER headings
        r"^\d+\.\s+[A-Z].*?$) ",        # Matches numbered sections/articles (e.g., "1. Short title...")
        re.MULTILINE | re.IGNORECASE
    )

    # Split the text by the identified section headers, keeping the headers themselves.
    # re.split creates a list where odd indices are delimiters and even indices are content.
    split_content = section_header_pattern.split(text)

    # The first element might be preamble content if no header is at the very beginning.
    preamble_content = split_content[0].strip()
    if preamble_content:
        sections_list.append({
            "source_file": file_name,
            "section_title": "Preamble/Introduction",
            "section_content": preamble_content,
            "section_type": "semantic_preamble"
        })

    # Iterate through the split content to associate headers with their content
    # The split creates: [preamble, header1, content1, header2, content2, ...]
    for i in range(1, len(split_content), 2):
        header = split_content[i]
        content = split_content[i+1] if (i+1) < len(split_content) else "" # Handle last section

        if header and content.strip():
            sections_list.append({
                "source_file": file_name,
                "section_title": header.strip(),
                "section_content": content.strip(),
                "section_type": "semantic_section"
            })
        elif header: # Handle cases where a header might have no immediate content (e.g., followed by another header)
            sections_list.append({
                "source_file": file_name,
                "section_title": header.strip(),
                "section_content": "", # Empty content for now, might be absorbed into next
                "section_type": "semantic_section_header_only"
            })

    return sections_list

In [76]:
# Apply semantic section extraction
all_semantic_sections = []
if not laws_df.empty:
    print("\nExtracting semantic sections from cleaned content...")
    for index, row in tqdm(laws_df.iterrows(), total=len(laws_df), desc="Extracting sections"):
        file_name = row['File Name']
        cleaned_content = row['Cleaned_Content']
        
        extracted_sections = extract_semantic_sections(cleaned_content, file_name)
        all_semantic_sections.extend(extracted_sections)



Extracting semantic sections from cleaned content...


Extracting sections: 100%|██████████| 967/967 [00:02<00:00, 480.92it/s]


In [79]:
# Create a new DataFrame for semantic sections
semantic_sections_df = pd.DataFrame(all_semantic_sections)
semantic_sections_df.to_csv('pakistan_laws_semantic_sections.csv', index=False)
print(f"\nGenerated {len(semantic_sections_df)} semantic sections.")
print("\nFirst 5 semantic sections:")
print(semantic_sections_df.head())


Generated 967 semantic sections.

First 5 semantic sections:
                                         source_file          section_title  \
0  administrator00532129aba2e10fe634ab8fbd94c50b.pdf  Preamble/Introduction   
1  administrator00607dfddff08b6d0a71b341f1667f23.pdf  Preamble/Introduction   
2  administrator0088a4e5781b74cea37ea233c4a2e2c5.pdf  Preamble/Introduction   
3  administrator00c80ea2ca2b4893b362c3c44bddafe2.pdf  Preamble/Introduction   
4  administrator0101918a1a0e6cdb96f6e0e8f3453767.pdf  Preamble/Introduction   

                                     section_content       section_type  
0  THE PRIVATISATION COMMISSION ORDINANCE, 2000 P...  semantic_preamble  
1  THE PAKISTAN STUDY CENTRES, ACT 1976 1 Short t...  semantic_preamble  
2  T H E C H U R C H O F S C O T L A N D K I R K ...  semantic_preamble  
3  THE ARMED FORCES (EMERGENCY DUTIES) ACT, 1947 ...  semantic_preamble  
4  THE NET WORK ANALYSE R STUDY CENTRE (TRANSFER ...  semantic_preamble  


In [82]:
if not semantic_sections_df.empty:
    print(f"\nSemantic Section Content Length Statistics:")
    print(semantic_sections_df['section_content'].apply(len).describe())

    # Example of a random extracted section
    print("\n--- Example of a random extracted semantic section ---")
    random_section = semantic_sections_df.sample(1)
    print(random_section)
    print(f"Source File: {random_section['source_file'].iloc[0]}")
    print(f"Section Title: {random_section['section_title'].iloc[0]}")
    print(f"Section Content (first 500 chars):\n{random_section['section_content'].iloc[0][:500]}\n...")


Semantic Section Content Length Statistics:
count    9.670000e+02
mean     4.148464e+04
std      1.254601e+05
min      2.600000e+01
25%      6.610000e+03
50%      1.656400e+04
75%      3.656950e+04
max      2.395050e+06
Name: section_content, dtype: float64

--- Example of a random extracted semantic section ---
                                          source_file          section_title  \
64  administrator0cb12b901d4304d7e5463da076d88639 ...  Preamble/Introduction   

                                      section_content       section_type  
64  THE CHILD MARRIAGE RESTRAINT ACT, 1929 1 Short...  semantic_preamble  
Source File: administrator0cb12b901d4304d7e5463da076d88639 (1).pdf
Section Title: Preamble/Introduction
Section Content (first 500 chars):
THE CHILD MARRIAGE RESTRAINT ACT, 1929 1 Short title, extent and commencement 2 Definitions 3 [Omitted] 4 Punishment for male adult above twentyone years of age marrying a child 5 Punishment for solemnising a child marriage 6 Punishmen

## Chunking 

In [86]:
CHUNK_SIZE = 1500
CHUNK_OVERLAP = 150


text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
    add_start_index=True,
    separators=[
        "\n\n\n", "\n\n", "\n", " ", "" # Still useful for sub-chunking within long sections
    ]
)

all_chunks = []
if not semantic_sections_df.empty:
    print(f"\nSplitting {len(semantic_sections_df)} semantic sections into smaller chunks (if needed)...")
    for index, row in tqdm(semantic_sections_df.iterrows(), total=len(semantic_sections_df), desc="Splitting sections into chunks"):
        source_file = row['source_file']
        section_title = row['section_title']
        section_content = row['section_content']
        if not section_content.strip():
            continue
            
        docs = text_splitter.create_documents(
            texts = [section_content],
            metadatas=[{"source_file": source_file, "section_title": section_title}]
        )
        for doc in docs:
            chunk_data = {
                "source_file": doc.metadata["source_file"],
                "section_title": doc.metadata["section_title"], # Preserve the semantic section title
                "chunk_content": doc.page_content,
                "chunk_length": len(doc.page_content),
                "start_index_in_section": doc.metadata["start_index"], # Start index within this specific section
                "original_chunk_index": len(all_chunks) # Unique global index for each chunk
            }
            all_chunks.append(chunk_data)






Splitting 967 semantic sections into smaller chunks (if needed)...


Splitting sections into chunks: 100%|██████████| 967/967 [00:16<00:00, 60.22it/s] 


In [87]:
chunks_df = pd.DataFrame(all_chunks)
print(f"\nGenerated {len(chunks_df)} total chunks from semantic sections.")
print("\nFirst 5 final chunks:")
print(chunks_df.head())




Generated 30122 total chunks from semantic sections.

First 5 final chunks:
                                         source_file          section_title  \
0  administrator00532129aba2e10fe634ab8fbd94c50b.pdf  Preamble/Introduction   
1  administrator00532129aba2e10fe634ab8fbd94c50b.pdf  Preamble/Introduction   
2  administrator00532129aba2e10fe634ab8fbd94c50b.pdf  Preamble/Introduction   
3  administrator00532129aba2e10fe634ab8fbd94c50b.pdf  Preamble/Introduction   
4  administrator00532129aba2e10fe634ab8fbd94c50b.pdf  Preamble/Introduction   

                                       chunk_content  chunk_length  \
0  THE PRIVATISATION COMMISSION ORDINANCE, 2000 P...          1499   
1  Annual Report. 38. Information to Public. 39. ...          1498   
2  day of October, 1999, and the Provisional Cons...          1495   
3  (g) “person” includes an individual, partnersh...          1499   
4  Act, 1996, (Act XVII of 1996), the National El...          1493   

   start_index_in_section  

In [88]:
if not chunks_df.empty:
    print(f"\nFinal Chunk Length Statistics:")
    print(chunks_df['chunk_length'].describe())


Final Chunk Length Statistics:
count    30122.000000
mean      1473.419361
std        146.663585
min         26.000000
25%       1495.000000
50%       1497.000000
75%       1498.000000
max       1500.000000
Name: chunk_length, dtype: float64


In [89]:
chunks_df

Unnamed: 0,source_file,section_title,chunk_content,chunk_length,start_index_in_section,original_chunk_index
0,administrator00532129aba2e10fe634ab8fbd94c50b.pdf,Preamble/Introduction,"THE PRIVATISATION COMMISSION ORDINANCE, 2000 P...",1499,0,0
1,administrator00532129aba2e10fe634ab8fbd94c50b.pdf,Preamble/Introduction,Annual Report. 38. Information to Public. 39. ...,1498,1353,1
2,administrator00532129aba2e10fe634ab8fbd94c50b.pdf,Preamble/Introduction,"day of October, 1999, and the Provisional Cons...",1495,2706,2
3,administrator00532129aba2e10fe634ab8fbd94c50b.pdf,Preamble/Introduction,"(g) “person” includes an individual, partnersh...",1499,4055,3
4,administrator00532129aba2e10fe634ab8fbd94c50b.pdf,Preamble/Introduction,"Act, 1996, (Act XVII of 1996), the National El...",1493,5414,4
...,...,...,...,...,...,...
30117,WEST PAKISTAN SHOPS AND ESTABLISHMENTS ORDINAN...,Preamble/Introduction,29. Limitation of prosecutions.– No Court shal...,1497,32374,30117
30118,WEST PAKISTAN SHOPS AND ESTABLISHMENTS ORDINAN...,Preamble/Introduction,date of the commencement of this Ordinance und...,1496,33724,30118
30119,WEST PAKISTAN SHOPS AND ESTABLISHMENTS ORDINAN...,Preamble/Introduction,"of the establishment, if any. 2. Postal addres...",1498,35071,30119
30120,WEST PAKISTAN SHOPS AND ESTABLISHMENTS ORDINAN...,Preamble/Introduction,as a * under the West Pakistan Shops and Estab...,1495,36423,30120


In [90]:
chunks_df.to_csv('pakistan_laws_chunks.csv', index=False)
print(f"\nProcessed chunks saved to pakistan_laws_chunks.csv")

print("\n--- Task 2: Data Preprocessing (Cleaning & Initial Chunking) complete! ---")


Processed chunks saved to pakistan_laws_chunks.csv

--- Task 2: Data Preprocessing (Cleaning & Initial Chunking) complete! ---
