In [1]:
from datasets import load_dataset
import pandas as pd

# Local directory that holds a downloaded copy of the dataset.
# NOTE(review): absolute, machine-specific path — update it when running elsewhere.
local_dataset_path = 'C:/Projects/36118 ANLP/Assignment 2/LLM' 

print(f"Loading the dataset from local path: '{local_dataset_path}'...")

try:
    # The documents are read from the 'corpus' split (not 'train').
    ds = load_dataset(local_dataset_path, split='corpus')
    
    # Materialise the whole split as a pandas DataFrame for the cells below.
    print("\n--- Dataset loaded successfully ---")
    print("Converting the full dataset to a Pandas DataFrame...")
    
    df_full = ds.to_pandas()
    
    print("\n--- Conversion Complete ---")
    print(f"Loaded {len(df_full):,} documents into the DataFrame.")
    print("Inspecting the first 5 records:")
    print(df_full.head())
    
    print("\nColumns available in the dataset:")
    print(list(df_full.columns))

except Exception as e:
    print(f"\nAn error occurred: {e}")
    # Re-raise so the failure is not masked: every later cell needs `df_full`,
    # and continuing would leave it undefined (or stale from a previous run)
    # while hiding the traceback that explains what went wrong.
    raise

  from .autonotebook import tqdm as notebook_tqdm


Loading the dataset from local path: 'C:/Projects/36118 ANLP/Assignment 2/LLM'...

--- Dataset loaded successfully ---
Converting the full dataset to a Pandas DataFrame...

--- Conversion Complete ---
Loaded 232,560 documents into the DataFrame.
Inspecting the first 5 records:
                                      version_id                   type  \
0   tasmanian_legislation:2008-10-08/sr-2008-119  secondary_legislation   
1   tasmanian_legislation:2004-10-06/sr-2004-080  secondary_legislation   
2  tasmanian_legislation:2016-06-21/act-2009-050    primary_legislation   
3   tasmanian_legislation:2021-11-24/sr-2021-088  secondary_legislation   
4  tasmanian_legislation:2001-07-15/act-2001-041    primary_legislation   

  jurisdiction                 source       mime                 date  \
0     tasmania  tasmanian_legislation  text/html  2008-10-08 00:00:00   
1     tasmania  tasmanian_legislation  text/html  2004-10-06 00:00:00   
2     tasmania  tasmanian_legislation  text/html  20

In [4]:
print("--- Investigating available values in the 'jurisdiction' column ---")

# Frequency table of the 'jurisdiction' column: one row per distinct value,
# with the number of documents carrying that value.
jurisdiction_counts = df_full.loc[:, 'jurisdiction'].value_counts()

print(jurisdiction_counts)

--- Investigating available values in the 'jurisdiction' column ---
jurisdiction
new_south_wales      119587
commonwealth         103882
queensland             3306
tasmania               2552
western_australia      1564
south_australia        1350
norfolk_island          319
Name: count, dtype: int64


In [5]:
import numpy as np

print("\n--- Corrected Block 2B: Filtering for NSW Legislation ---")

# Size of the unfiltered frame, for comparison with the filtered count below.
print(f"Original number of documents (all jurisdictions): {len(df_full):,}")

# Label to retain; it must match a value of the 'jurisdiction' column exactly.
JURISDICTION_TO_KEEP = 'new_south_wales'

# Boolean-mask row selection; .copy() makes df_nsw an independent frame
# rather than a slice of df_full.
df_nsw = df_full.loc[df_full['jurisdiction'].eq(JURISDICTION_TO_KEEP)].copy()

print(f"\nNumber of documents after filtering for '{JURISDICTION_TO_KEEP}': {len(df_nsw):,}")
print("\n--- Filtering Complete ---")

# Sanity check: flag an empty match, otherwise preview the filtered frame.
if len(df_nsw) == 0:
    print("\n--> Filtering still resulted in 0 documents. Please double-check the value in JURISDICTION_TO_KEEP.")
else:
    print("\nVerifying the new 'df_nsw' DataFrame:")
    print("First 5 rows:")
    print(df_nsw.head())
    print("\nDataFrame Info for the NSW-only data:")
    df_nsw.info()


--- Corrected Block 2B: Filtering for NSW Legislation ---
Original number of documents (all jurisdictions): 232,560

Number of documents after filtering for 'new_south_wales': 119,587

--- Filtering Complete ---

Verifying the new 'df_nsw' DataFrame:
First 5 rows:
                                 version_id                 type  \
12  nsw_legislation:2009-07-06/act-1968-059  primary_legislation   
15  nsw_legislation:2021-07-01/act-2011-035  primary_legislation   
18     nsw_caselaw:54a63c143004de94513db49f             decision   
19     nsw_caselaw:5c1ca992e4b0b9ab40212a90             decision   
20     nsw_caselaw:549f71263004262463a71376             decision   

       jurisdiction           source       mime                 date  \
12  new_south_wales  nsw_legislation  text/html  2009-07-06 00:00:00   
15  new_south_wales  nsw_legislation  text/html  2021-07-01 00:00:00   
18  new_south_wales      nsw_caselaw  text/html  2013-11-13 00:00:00   
19  new_south_wales      nsw_caselaw 

In [6]:
print("\n--- Final Step: Saving the NSW-only DataFrame to Permanent Files ---")

# Output filenames for the NSW-only dataset. They are relative paths, so the
# files are written to the notebook's current working directory.
output_parquet_path = 'nsw_corpus_final.parquet'
output_jsonl_path = 'nsw_corpus_final.jsonl'

# Names of the formats whose save raised. The two saves are deliberately
# independent (a Parquet failure must not prevent the JSONL attempt), so
# failures are collected here and reported honestly in the final summary.
failed_saves = []

# --- 1. Save to Parquet format (Recommended for performance) ---
try:
    print(f"\nSaving {len(df_nsw):,} documents to Parquet: '{output_parquet_path}'...")
    # Parquet is highly efficient for both storage space and loading speed.
    df_nsw.to_parquet(output_parquet_path)
    print("... Parquet save complete.")
except Exception as e:
    failed_saves.append('Parquet')
    print(f"--- ERROR saving to Parquet: {e} ---")

# --- 2. Save to JSONL format (Good for readability) ---
try:
    print(f"\nSaving {len(df_nsw):,} documents to JSONL: '{output_jsonl_path}'...")
    # JSONL is a robust, line-by-line format that's easy to inspect.
    df_nsw.to_json(output_jsonl_path, orient='records', lines=True)
    print("... JSONL save complete.")
except Exception as e:
    failed_saves.append('JSONL')
    print(f"--- ERROR saving to JSONL: {e} ---")

# Only claim success when every save actually succeeded; previously the
# success banner was printed even after an error had been reported above.
if failed_saves:
    print(f"\n--- WARNING: {len(failed_saves)} save(s) failed: {', '.join(failed_saves)}. ---")
    print("Fix the error(s) reported above and re-run this cell before relying on the output files.")
else:
    print("\n--- All saves complete. ---")
    print("You now have permanent, NSW-only files ready for all future notebooks.")


--- Final Step: Saving the NSW-only DataFrame to Permanent Files ---

Saving 119,587 documents to Parquet: 'nsw_corpus_final.parquet'...
... Parquet save complete.

Saving 119,587 documents to JSONL: 'nsw_corpus_final.jsonl'...
... JSONL save complete.

--- All saves complete. ---
You now have permanent, NSW-only files ready for all future notebooks.
