# RunPod Serverless Client
## GBFF to ESM2 Embeddings

In [1]:
import runpod
import numpy as np
import pandas as pd
import os
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


## 1. Loading GBFF 

In [2]:
# Local GBFF 파일 경로
base_path = r"D:\Git_Clone\GeneExp"
gbff_local_path = "GCF_000005845.2_ASM584v2_genomic.gbff"

# 파일 확인
if os.path.exists(gbff_local_path):
    file_size = os.path.getsize(gbff_local_path) / (1024*1024)
    print(f"File: {os.path.basename(gbff_local_path)}")
    print(f"Path: {gbff_local_path}")
    print(f"Size: {file_size:.2f} MB")
else:
    print(f"Error: File not found at {gbff_local_path}")

File: GCF_000005845.2_ASM584v2_genomic.gbff
Path: GCF_000005845.2_ASM584v2_genomic.gbff
Size: 11.32 MB


## 2. Extract Input format

In [3]:
from Bio import SeqIO
import sys, subprocess
from Bio import SeqIO
import json

# Extract 'product' and 'translation' from the GBFF file into a DataFrame
entries = {'file_name': gbff_local_path,
    'products': [], 'translations': []}
for record in SeqIO.parse(gbff_local_path, "genbank"):
    for feat in record.features[:200]:
        if feat.type == "CDS":
            q = feat.qualifiers
            entries['products'].append(q.get("product", [""])[0])
            entries['translations'].append(q.get("translation", [""])[0])

input_form = {'input': entries}
# entries

In [4]:
out_json = "test_input.json"

with open(out_json, "w", encoding="utf-8") as fh:
    json.dump(input_form, fh, ensure_ascii=False, indent=2)

print(f"Saved JSON: {out_json}")


Saved JSON: test_input.json


## 3. Tag generation

In [5]:
with open(out_json, "r", encoding="utf-8") as f:
    input_form = json.load(f)
    entries = input_form.get('input')

display(entries['products'][:5])

['thr operon leader peptide',
 'fused aspartate kinase/homoserine dehydrogenase 1',
 'homoserine kinase',
 'threonine synthase',
 'DUF2502 domain-containing protein YaaX']

In [6]:
import sys

sys.path.append(str(Path(base_path)))
from main.generate_tags import collect_tags


file_name = Path(gbff_local_path).stem
products = entries['products']
output_dir = "./test_downloads/"

collect_tags(file_name, products, output_dir)

request confirmed: generating tags for GCF_000005845.2_ASM584v2_genomic with 94 products
Log file initialized at ./test_downloads/GCF_000005845log.txt
[2025-11-17 12:29:58]Processing chunk 1 / 1 with 94 products


_client.py          :1025 2025-11-17 12:30:15,096 HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


Processed GCF_000005845.2_ASM584v2_genomic: 3 tags generated


## 4. ESM2 Embedding

In [7]:
with open(out_json, "r", encoding="utf-8") as f:
    input_form = json.load(f)
    entries = input_form.get('input')

display(entries['translations'][:5])

['MKRISTTITTTITITTGNGAG',
 'MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNHLVAMIEKTISGQDALPNISDAERIFAELLTGLAAAQPGFPLAQLKTFVDQEFAQIKHVLHGISLLGQCPDSINAALICRGEKMSIAIMAGVLEARGHNVTVIDPVEKLLAVGHYLESTVDIAESTRRIAASRIPADHMVLMAGFTAGNEKGELVVLGRNGSDYSAAVLAACLRADCCEIWTDVDGVYTCDPRQVPDARLLKSMSYQEAMELSYFGAKVLHPRTITPIAQFQIPCLIKNTGNPQAPGTLIGASRDEDELPVKGISNLNNMAMFSVSGPGMKGMVGMAARVFAAMSRARISVVLITQSSSEYSISFCVPQSDCVRAERAMQEEFYLELKEGLLEPLAVTERLAIISVVGDGMRTLRGISAKFFAALARANINIVAIAQGSSERSISVVVNNDDATTGVRVTHQMLFNTDQVIEVFVIGVGGVGGALLEQLKRQQSWLKNKHIDLRVCGVANSKALLTNVHGLNLENWQEELAQAKEPFNLGRLIRLVKEYHLLNPVIVDCTSSQAVADQYADFLREGFHVVTPNKKANTSSMDYYHQLRYAAEKSRRKFLYDTNVGAGLPVIENLQNLLNAGDELMKFSGILSGSLSYIFGKLDEGMSFSEATTLAREMGYTEPDPRDDLSGMDVARKLLILARETGRELELADIEIEPVLPAEFNAEGDVAAFMANLSQLDDLFAARVAKARDEGKVLRYVGNIDEDGVCRVKIAEVDGNDPLFKVKNGENALAFYSHYYQPLPLVLRGYGAGNDVTAAGVFADLLRTLSWKLGV',
 'MVKVYAPASSANMSVGFDVLGAAVTPVDGALLGDVVTVEAAETFSLNNLGRFADKLPSEPRENIVYQCWERFCQELGKQIPVAMTLEKNMPIGSGLGSSACSVVAALMAMNEHCGKPLNDTRLLALMGELEGRISGSIHYDNVAPCF

In [8]:
import sys
sys.path.append(str(Path(base_path)))
from main.esm_embedding import embed_sequences

embed_sequences(sequences=entries['translations'],
                output_path="./test_embeddings.pkl")

TypeError: embed_sequences() missing 1 required positional argument: 'file_name'

In [None]:
import pickle
with open("./test_embeddings.pkl", "rb") as f:
    embeddings = pickle.load(f)

print(len(embeddings))
print(embeddings[0].shape)

4315
(1, 320)


## One step Process

In [None]:
import json
import sys
import os
from pathlib import Path

base_path = r"D:\Git_Clone\GeneExp"
sys.path.append(str(Path(base_path)))
from main.generate_tags import collect_tags
from main.esm_embedding import embed_sequences

## input
with open('test_input.json', "r", encoding="utf-8") as f:
    input_form = json.load(f)
    entries = input_form.get('input')
output_dir = "./temp_downloads/"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

## input handling
file_name = entries['file_name']
products = entries['products']
translations = entries['translations']


collect_tags(file_name, products, output_dir)
embed_sequences(file_name, translations, output_dir)

print("Workflow completed.")

  from .autonotebook import tqdm as notebook_tqdm


request confirmed: generating tags for GCF_000005845.2_ASM584v2_genomic.gbff with 4315 products
Log file initialized at ./temp_downloads/GCF_000005845.2_ASM584v2_genomiclog.txt
[2025-11-17 11:05:32]Processing chunk 1 / 44 with 100 products
[2025-11-17 11:05:43]Processing chunk 2 / 44 with 100 products
[2025-11-17 11:05:59]Processing chunk 3 / 44 with 100 products
[2025-11-17 11:06:18]Processing chunk 4 / 44 with 100 products
[2025-11-17 11:06:39]Processing chunk 5 / 44 with 100 products
[2025-11-17 11:06:55]Processing chunk 6 / 44 with 100 products
[2025-11-17 11:07:10]Processing chunk 7 / 44 with 100 products
[2025-11-17 11:07:29]Processing chunk 8 / 44 with 100 products
[2025-11-17 11:07:42]Processing chunk 9 / 44 with 100 products
[2025-11-17 11:08:46]Processing chunk 10 / 44 with 100 products
[2025-11-17 11:09:01]Processing chunk 11 / 44 with 100 products
[2025-11-17 11:09:18]Processing chunk 12 / 44 with 100 products
[2025-11-17 11:09:45]Processing chunk 13 / 44 with 100 products


'(MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /facebook/esm2_t6_8M_UR50D/resolve/main/tokenizer_config.json (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self-signed certificate in certificate chain (_ssl.c:1000)')))"), '(Request ID: e323eb52-34cf-456e-a3ed-67931b15177d)')' thrown while requesting HEAD https://huggingface.co/facebook/esm2_t6_8M_UR50D/resolve/main/tokenizer_config.json
Retrying in 1s [Retry 1/5].
'(MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /facebook/esm2_t6_8M_UR50D/resolve/main/tokenizer_config.json (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self-signed certificate in certificate chain (_ssl.c:1000)')))"), '(Request ID: b9beb64f-52ba-4774-82fa-16e612118e5a)')' thrown while requesting HEAD https://huggingface.co/facebook/esm2_t6_8M

Model loaded successfully on cpu
Processing 4315 sequences...
Progress: 160/4315 (3.7%)
Progress: 320/4315 (7.4%)
Progress: 480/4315 (11.1%)
Progress: 640/4315 (14.8%)
Progress: 800/4315 (18.5%)
Progress: 960/4315 (22.2%)
Progress: 1120/4315 (26.0%)
Progress: 1280/4315 (29.7%)
Progress: 1440/4315 (33.4%)
Progress: 1600/4315 (37.1%)
Progress: 1760/4315 (40.8%)
Progress: 1920/4315 (44.5%)
Progress: 2080/4315 (48.2%)
Progress: 2240/4315 (51.9%)
Progress: 2400/4315 (55.6%)
Progress: 2560/4315 (59.3%)
Progress: 2720/4315 (63.0%)
Progress: 2880/4315 (66.7%)
Progress: 3040/4315 (70.5%)
Progress: 3200/4315 (74.2%)
Progress: 3360/4315 (77.9%)
Progress: 3520/4315 (81.6%)
Progress: 3680/4315 (85.3%)
Progress: 3840/4315 (89.0%)
Progress: 4000/4315 (92.7%)
Progress: 4160/4315 (96.4%)
Progress: 4315/4315 (100.0%)
Completed: 4298/4315 valid embeddings
Saved 4298/4315 embeddings to ./temp_downloads/GCF_000005845.2_ASM584v2_genomictags.txt
Workflow completed.
