Download the UniProtKB/Swiss-Prot flat file (`uniprot_sprot.dat.gz`) and decompress it:

```
wget https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.dat.gz
gunzip uniprot_sprot.dat.gz
```

In [None]:
from pathlib import Path
import json

def parse_uniprot_dat(file_path, max_entries=1000):
    """Read a UniProt flat file and return the usable entries as records.

    The file is streamed line by line. Lines are collected until a line
    starting with ``//`` (the end-of-entry marker) is reached, then the
    collected text is handed to ``parse_entry``. Entries for which
    ``parse_entry`` returns a falsy value are skipped and do not count
    towards the limit. Trailing lines not followed by ``//`` are ignored.

    Args:
        file_path: Path of the uncompressed ``.dat`` file.
        max_entries: Maximum number of records to return. ``None`` means no
            limit; zero or a negative number yields an empty list.

    Returns:
        A list of the records produced by ``parse_entry``, in file order.
    """
    unlimited = max_entries is None
    entries = []
    with open(file_path, "r", encoding="utf-8") as f:
        # Honour the limit before reading anything, so max_entries=0 does
        # not return one entry.
        if not unlimited and max_entries <= 0:
            return entries

        entry = []
        for line in f:
            if not line.startswith("//"):
                # Surrounding whitespace (including the newline) is dropped;
                # parse_entry works on the stripped lines.
                entry.append(line.strip())
                continue

            # End of entry: parse what was collected and start afresh.
            parsed = parse_entry("\n".join(entry))
            entry = []
            if parsed:
                entries.append(parsed)
                if not unlimited and len(entries) >= max_entries:
                    break
    return entries

def parse_entry(entry_text):
    """Convert one UniProt flat-file entry into a prompt/response record.

    The entry is scanned line by line for three things:

    * the ``CC   -!- FUNCTION:`` comment together with its ``CC       ``
      continuation lines, which becomes the response (if an entry has
      several FUNCTION comments, the last one is used);
    * ``FT   DOMAIN`` lines, from which everything after the first two
      whitespace-separated tokens is kept;
    * every non-empty line after the ``SQ`` header, concatenated with the
      spaces removed, which becomes the sequence.

    Args:
        entry_text: The lines of a single entry joined with newlines,
            without the terminating ``//`` line.

    Returns:
        A dict with the keys ``"prompt"`` and ``"response"``, or ``None``
        when the entry has no FUNCTION text or no sequence.
    """
    seq_lines, function_parts, domains = [], [], []
    in_sequence = False
    in_function = False

    for line in entry_text.splitlines():
        if in_sequence:
            # Must be tested before any prefix check: a residue line can
            # itself begin with "SQ" (Ser-Gln) once leading blanks are
            # stripped, and would otherwise be mistaken for the header.
            if line:
                seq_lines.append(line.replace(" ", ""))
            continue

        if line.startswith("CC   -!- FUNCTION:"):
            # Split once only, so a literal "FUNCTION:" inside the text does
            # not truncate it. A later FUNCTION comment replaces this one.
            function_parts = [line.split("FUNCTION:", 1)[1].strip()]
            in_function = True
            continue

        if in_function and line.startswith("CC       "):
            # Continuation of the wrapped FUNCTION comment.
            function_parts.append(line[2:].strip())
            continue

        # Any other line ends the FUNCTION comment.
        in_function = False

        if line.startswith("FT   DOMAIN"):
            domains.append(" ".join(line.split()[2:]))
        elif line.startswith("SQ"):
            in_sequence = True

    sequence = "".join(seq_lines)
    function = " ".join(part for part in function_parts if part).rstrip(".")
    if not function or not sequence:
        return None

    prompt = f"Protein Sequence:\n{sequence}\nDomains: {', '.join(domains) or 'None'}"
    return {"prompt": prompt, "response": function}

def save_jsonl(data, filename):
    """Write the items of *data* to *filename* in JSON Lines form.

    Each item is serialized with the default ``json`` settings and written
    on a line of its own. An existing file is overwritten.

    Args:
        data: Iterable of JSON-serializable objects.
        filename: Path of the output file.
    """
    with open(filename, "w") as out:
        out.writelines(json.dumps(record) + "\n" for record in data)

# Script entry point: build the dataset from the decompressed Swiss-Prot file
# in the current working directory.
if __name__ == "__main__":
    # Keep at most 1000 entries that yield a record.
    entries = parse_uniprot_dat("uniprot_sprot.dat", max_entries=1000)
    # One JSON object per line; the output file is overwritten if it exists.
    save_jsonl(entries, "swissprot_llm_dataset.jsonl")
    
