# Filter URLs in CSV

Reads the CSV files produced by the gov search from `SOURCE_DIR`, keeps only the rows whose `documentType` is in `VALID_VALUES` (the header row is always kept), and writes the filtered files under the same names to `DEST_DIR`.

In [None]:
# --- Configuration ---
# Directory the unfiltered CSV files are read from.
SOURCE_DIR = "search_results"
# Directory the filtered CSV files are written to.
DEST_DIR = "search_results_filtered"
# Zero-based position of the documentType column (i.e. the 3rd column).
COLUMN_INDEX = 2
# A row is kept only when its documentType is one of these values.
VALID_VALUES = {
    "answer",
    "detailed_guide",
    "guidance",
    "guide",
    "travel_advice",
}


In [None]:
import csv
import os
import sys

# Copy every ".csv" file in SOURCE_DIR to DEST_DIR under the same name,
# keeping the header row plus only those data rows whose value at
# COLUMN_INDEX is a member of VALID_VALUES. Prints a per-file count and a
# grand total of the data rows written.
os.makedirs(DEST_DIR, exist_ok=True)

rows_written_total = 0

# os.listdir() returns entries in arbitrary order; sorting makes the
# per-file report reproducible from run to run.
for filename in sorted(os.listdir(SOURCE_DIR)):
    if not filename.endswith(".csv"):
        continue

    src_path = os.path.join(SOURCE_DIR, filename)
    dst_path = os.path.join(DEST_DIR, filename)
    rows_written = 0

    # An explicit encoding keeps reading and writing independent of the
    # platform's locale default, so non-ASCII text round-trips unchanged.
    # NOTE(review): assumes the search exports are UTF-8 - confirm.
    # newline="" lets the csv module handle line endings itself.
    with open(src_path, newline="", encoding="utf-8") as infile, open(
        dst_path, "w", newline="", encoding="utf-8"
    ) as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile, quoting=csv.QUOTE_MINIMAL)

        # The first row is treated as the header and copied through
        # unfiltered; an empty file (or empty first row) writes no header.
        header = next(reader, None)
        if header:
            writer.writerow(header)

        for row in reader:
            # The length guard skips short/blank rows instead of raising
            # IndexError on them.
            if len(row) > COLUMN_INDEX and row[COLUMN_INDEX] in VALID_VALUES:
                writer.writerow(row)
                rows_written += 1

    print(f"{filename}: {rows_written} rows written")
    rows_written_total += rows_written

print(f"\nTotal rows written: {rows_written_total}.")