# In [None]:
# ---- Colab file upload/download helpers ----
from google.colab import files
import csv
import re

def process_file(input_path, output_path):
    """Convert a taxon/source/location listing into a four-column CSV.

    Every input line is stripped and then classified, in this order:

    * ``<<...``      -- comment, skipped.
    * ``>`` / ``>,`` -- end of block: pending rows are written and the
      current source is cleared (the taxon is kept).
    * ``<name:``     -- new taxon: pending rows are written and the current
      source is cleared.
    * ``name:<``     -- new source (leading ``/`` characters are removed);
      pending rows are written first.
    * anything else  -- while both a taxon and a source are active, the text
      before the first comma is taken as a location.  A ``$`` inside it
      splits off a "source enhancement": everything after the first ``$``.

    The output starts with the header row ``Taxon``, ``Source``,
    ``Location``, ``Source Enhancement`` and every field is quoted.

    Args:
        input_path: Path of the text file to read.  It is decoded as UTF-8
            (a leading BOM is dropped) and undecodable bytes are ignored.
        output_path: Path of the CSV file to create or overwrite.
    """
    # Compiled once up front instead of being looked up for every line.
    end_of_block_re = re.compile(r'>,?')
    taxon_re = re.compile(r'^<(.+):$')
    source_re = re.compile(r'^(.*):<$')
    location_re = re.compile(r'([^,]+)(?:,|$)')

    taxon = None
    source = None
    locations = []

    with open(input_path, 'r', encoding='utf-8-sig', errors='ignore') as f_in, \
         open(output_path, 'w', newline='', encoding='utf-8') as f_out:
        writer = csv.writer(f_out, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
        writer.writerow(["Taxon", "Source", "Location", "Source Enhancement"])  # Header

        def flush():
            # Write the rows collected for the current taxon/source pair and
            # empty the queue.  Rows are only queued while both a taxon and a
            # source are active, so the guard mirrors that invariant.
            if taxon and source and locations:
                write_locations(writer, taxon, source, locations)
            locations.clear()

        for raw_line in f_in:
            line = raw_line.strip()

            # Skip comment lines
            if line.startswith('<<'):
                continue

            # End-of-block marker ( > or >, )
            if end_of_block_re.fullmatch(line):
                flush()
                # Cleared unconditionally: clearing it only when rows were
                # pending would let a source that collected no locations
                # leak into whatever follows this block.  The taxon is kept.
                source = None
                continue

            # New taxon line
            taxon_match = taxon_re.match(line)
            if taxon_match:
                flush()
                # A new taxon never inherits the previous taxon's source.
                source = None
                taxon = taxon_match.group(1).strip()
                continue

            # Source line (numeric OR text -- allows spaces)
            source_match = source_re.match(line)
            if source_match:
                flush()
                source = source_match.group(1).strip().lstrip('/')
                continue

            # Location line: ignored until both a taxon and a source are set.
            if not (taxon and source):
                continue
            location_match = location_re.match(line)
            if location_match is None:
                continue

            # Optional source enhancement: everything after the first '$'.
            location, _, source_enhancement = (
                location_match.group(1).strip().partition('$')
            )
            locations.append((location.strip(), source_enhancement))

        # Write final block if needed
        flush()

def write_locations(writer, taxon, source, locations):
    """Emit one CSV row per queued location for a single taxon/source pair.

    Args:
        writer: CSV writer object that receives the rows.
        taxon: Value for the first column of every row.
        source: Value for the second column of every row.
        locations: Iterable of ``(location, source_enhancement)`` pairs that
            fill the third and fourth columns.
    """
    writer.writerows(
        [taxon, source, place, enhancement]
        for place, enhancement in locations
    )

# ---- Colab upload + run ----
# Asks for an input file, converts it with process_file, and downloads the
# resulting CSV.

print("Please upload your input file (.txt)")
uploaded = files.upload()

# An empty result would otherwise surface as a bare StopIteration from next().
if not uploaded:
    raise RuntimeError("No file was uploaded; nothing to process.")

# Get uploaded filename (only the first uploaded file is processed)
input_filename = next(iter(uploaded))
base_name = input_filename.rsplit('.', 1)[0]
output_filename = base_name + '.csv'
if output_filename == input_filename:
    # process_file opens the output for writing while it is still reading the
    # input, so an identical path would truncate the input before it is parsed.
    output_filename = base_name + '_converted.csv'

# Process file
process_file(input_filename, output_filename)

# Download result
print(f"\n✅ Processing finished. Downloading {output_filename} ...")
files.download(output_filename)
