In [1]:
import pandas as pd
import os 
from urllib.parse import urlparse

In [2]:
# --- Pipeline configuration -------------------------------------------------
# Folder scanned for the per-batch CSV files that get combined.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run on another machine without editing it.
BATCHES_FOLDER = r'C:/Users/nanadhirah/Desktop/important/legislation/batches_csv'
# Final processed table: written by merge_and_process_csv, then read back by
# convert_csv_to_json.
OUTPUT_CSV_FILE = 'final_output_legislation.csv'
# Destination of the JSON export produced from OUTPUT_CSV_FILE.
JSON_FILE = 'final_output_legislation.json'
# De-duplicated concatenation of every batch CSV; written by
# combine_batches_to_csv and used as the right-hand side of the merge.
COMBINED_CSV_FILE = 'combined_batches_legislation.csv'
# Links table passed to merge_and_process_csv as its first input; the merge
# there joins on 'PDF File' and reads 'Original Link' from the merged result.
LINK_CSV_FILE = 'pdf_legislation_links.csv'

In [3]:
def combine_batches_to_csv(BATCHES_FOLDER, combined_csv_file):
    """
    Combine all batch CSV files into a single CSV file, removing duplicate records.

    Every file in ``BATCHES_FOLDER`` whose name ends with ``.csv`` is read,
    the frames are concatenated, rows that are identical across all columns
    are dropped, and the result overwrites ``combined_csv_file``.

    Args:
        BATCHES_FOLDER (str): Folder containing all batch CSV files.
        combined_csv_file (str): Path to save the combined CSV file. If a file
            already exists there, its row count is printed for comparison
            before it is overwritten; a missing file is not an error.

    Returns:
        str: Path to the combined CSV file.

    Raises:
        FileNotFoundError: If ``BATCHES_FOLDER`` contains no ``.csv`` files.
    """
    # Sorted because os.listdir returns entries in arbitrary order; without it
    # the row order of the output (and which copy of a duplicated row is kept)
    # could differ between runs or machines.
    batch_files = sorted(
        os.path.join(BATCHES_FOLDER, file)
        for file in os.listdir(BATCHES_FOLDER)
        if file.endswith('.csv')
    )
    if not batch_files:
        raise FileNotFoundError("No batch files found in the specified folder.")

    print(f"Found {len(batch_files)} batch files in {BATCHES_FOLDER}. Combining...")

    # The previous output is read only to report its size. On a first run it
    # does not exist yet, which must not abort the whole combine step.
    if os.path.isfile(combined_csv_file):
        num_existing_records = len(pd.read_csv(combined_csv_file))
        print(f"Number of records in the existing combined CSV: {num_existing_records}")
    else:
        print(f"No existing combined CSV found at {combined_csv_file}; it will be created.")

    # Combine all batch files into a single DataFrame
    combined_df = pd.concat((pd.read_csv(file) for file in batch_files), ignore_index=True)
    print("Number of records with duplication:", len(combined_df))

    # Drop duplicate rows based on all columns
    combined_df = combined_df.drop_duplicates()
    print("Number of records after deduplication:", len(combined_df))

    # Save the combined data as a CSV after deduplication
    combined_df.to_csv(combined_csv_file, index=False)
    print(f"Combined CSV saved to: {combined_csv_file}")

    return combined_csv_file

In [4]:
def merge_and_process_csv(pdf_links_csv, combined_csv, output_csv):
    """
    Join the PDF-links table with the combined batch table and write a
    trimmed result to disk.

    The two CSVs are inner-joined on their shared 'PDF File' column, a 'URL'
    column holding the network location (host part) of each 'Original Link'
    is added, and only the columns URL, Original Link (renamed to
    'Document'), Document_Text, Text_Len and Text_Ext_Method are kept.

    Parameters:
    - pdf_links_csv (str): Path to the first CSV file containing PDF links.
    - combined_csv (str): Path to the second CSV file to be merged.
    - output_csv (str): Path where the merged and processed CSV should be saved.
    """
    def _host_of(link):
        # netloc is the "host[:port]" portion of the parsed link.
        return urlparse(link).netloc

    links_table = pd.read_csv(pdf_links_csv)
    batches_table = pd.read_csv(combined_csv)

    # Default (inner) join: rows without a partner on either side are dropped.
    joined = links_table.merge(batches_table, on='PDF File')
    joined['URL'] = joined['Original Link'].map(_host_of)

    kept_columns = ['URL', 'Original Link', 'Document_Text', 'Text_Len', 'Text_Ext_Method']
    final_table = joined[kept_columns].rename(columns={'Original Link': 'Document'})

    final_table.to_csv(output_csv, index=False)

    print(f"Processed data saved to: {output_csv}")


In [5]:
def convert_csv_to_json(csv_file, json_file):
    """
    Export the rows of a CSV file as JSON and write them to ``json_file``.

    The output is written with ``orient='records'`` and ``lines=True``, i.e.
    one JSON object per row, one row per line (JSON Lines), rather than a
    single JSON array.

    Parameters:
    - csv_file (str): Path to the CSV file to be converted.
    - json_file (str): Path where the JSON file should be saved.
    """
    export_options = {'orient': 'records', 'lines': True}

    # Load the table and serialise it straight to the target path.
    table = pd.read_csv(csv_file)
    table.to_json(json_file, **export_options)

    print(f"CSV data has been successfully converted to JSON and saved as: {json_file}")

In [6]:
# Step 1: fold every batch CSV into one de-duplicated file.
combined_csv_path = combine_batches_to_csv(
    BATCHES_FOLDER,
    COMBINED_CSV_FILE,
)

# Step 2: join the combined batches with the PDF links table and write the
# trimmed result.
merge_and_process_csv(
    pdf_links_csv=LINK_CSV_FILE,
    combined_csv=COMBINED_CSV_FILE,
    output_csv=OUTPUT_CSV_FILE,
)

# Step 3: export the final CSV as JSON alongside it.
convert_csv_to_json(
    csv_file=OUTPUT_CSV_FILE,
    json_file=JSON_FILE,
)

Found 1455 batch files in C:/Users/nanadhirah/Desktop/important/legislation/batches_csv. Combining...
Number of records in the existing combined CSV: 10180
Number of records with duplication: 14545
Number of records after deduplication: 14545
Combined CSV saved to: combined_batches_legislation.csv
Processed data saved to: final_output_legislation.csv
CSV data has been successfully converted to JSON and saved as: final_output_legislation.json
