In [8]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch
import numpy as np
import json
import os

# BioGPT setup: load the tokenizer and causal-LM weights for the checkpoint
# named by `model_name`, then wrap both in a text-generation pipeline.
# device=-1 runs the pipeline on the CPU.
model_name = "microsoft/BioGPT-Large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=-1,
)


Device set to use cpu


In [3]:
from Bio import SeqIO

# Load the first protein record from the FASTA file and keep its sequence as a
# plain string for prompt construction.
# The path uses a single leading slash: a "//" prefix is implementation-defined
# under POSIX, and the other data paths in this notebook start with "/".
fasta_path = "/Users/abhinavbarat/Documents/bionemo_protein_rag/data/proteins.fasta"
record = next(SeqIO.parse(fasta_path, "fasta"), None)
if record is None:
    # Without the default, next() would raise a bare StopIteration that says
    # nothing about which file was empty.
    raise ValueError(f"No FASTA records found in {fasta_path}")
protein_seq = str(record.seq)
print(f"Loaded: {record.id}")

Loaded: sp|Q13286|CLN3_HUMAN


In [4]:
# Read uniprot_chunks.jsonl: one JSON object per line.
# - encoding is pinned to UTF-8 so decoding does not depend on the platform locale.
# - blank / whitespace-only lines are skipped; json.loads raises on them.
with open("/Users/abhinavbarat/Documents/bionemo_protein_rag/data/uniprot_chunks.jsonl", "r", encoding="utf-8") as f:
    docs = [json.loads(line) for line in f if line.strip()]

# Passages injected into the prompt as "related literature".
# NOTE(review): these three strings are hard-coded and are not selected from
# `docs`; confirm whether a retrieval step over `docs` is meant to populate them.
retrieved_texts = [
    "Functions in cell cycle checkpoint regulation and interacts with BRCA1.",
    "This protein is involved in the DNA damage response and double-strand break repair pathways.",
    "Plays a role in mitochondrial fission and may regulate apoptosis.",
]

In [17]:
# Assemble the generation prompt: an instruction header, the first 300
# residues of the protein sequence followed by "...", the three retrieved
# passages as a numbered list, and a trailing cue for the model to complete.
sequence_preview = protein_seq[:300]
literature_block = "\n".join(
    f"{rank}. {retrieved_texts[rank - 1]}" for rank in (1, 2, 3)
)
prompt = (
    "You are a biomedical research assistant.\n"
    "\n"
    "Task: Predict the likely biological function of the following protein, using its sequence and related literature.\n"
    "\n"
    "Protein sequence:\n"
    f"{sequence_preview}...\n"
    "\n"
    "Top related literature:\n"
    f"{literature_block}\n"
    "\n"
    "Predicted function:"
)


In [16]:
# Sampling (do_sample=True) is stochastic. Seed torch's global RNG so that a
# fresh "Restart & Run All" reproduces the same completion.
GENERATION_SEED = 42
torch.manual_seed(GENERATION_SEED)

output = generator(prompt, max_new_tokens=80, do_sample=True, temperature=0.7)
generated_text = output[0]["generated_text"]

# Keep only the continuation that follows the prompt's own answer cue.
# partition() cuts at the FIRST occurrence, so a cue echoed later by the model
# cannot discard the real answer.
# Assumes the pipeline returns the prompt followed by the continuation (its
# default for text generation) -- TODO confirm if return_full_text is changed.
answer_cue = "Predicted function:"
if answer_cue in generated_text:
    predicted_function = generated_text.partition(answer_cue)[2].strip()
else:
    predicted_function = generated_text.strip()

# Cut at the first decoding artefact or line break that appears in the text.
junk_markers = ["< / FREETEXT >", "< / PARAGRAPH >", "▃", "\n"]
for marker in junk_markers:
    if marker in predicted_function:
        predicted_function = predicted_function.split(marker)[0].strip()

# Reduce to the first sentence and re-attach its full stop. When nothing
# usable was generated, leave the result empty rather than emitting a bare ".".
first_sentence = predicted_function.split(".")[0].strip()
if first_sentence:
    predicted_function = first_sentence + "."
else:
    predicted_function = ""
    print("No usable text was generated; predicted_function is empty.")

print("\n🧬 Predicted Function:")
print(predicted_function)



🧬 Predicted Function:
DNA repair.


In [18]:
import os

# Persist the cleaned prediction as a plain-text file.
output_dir = "/Users/abhinavbarat/Documents/bionemo_protein_rag/outputs"
output_file = os.path.join(output_dir, "Q13286_prediction.txt")

# Create the destination folder if needed; an existing folder is not an error.
os.makedirs(output_dir, exist_ok=True)

# Mode "w" replaces any earlier prediction stored under the same name.
with open(output_file, "w") as prediction_fh:
    prediction_fh.write(predicted_function)

print(f"Saved cleaned predicted function to {output_file}")

Saved cleaned predicted function to /Users/abhinavbarat/Documents/bionemo_protein_rag/outputs/Q13286_prediction.txt
