In [1]:
import os
import dotenv
from dotenv import load_dotenv
import requests
import re
import time
import csv
from openai import OpenAI
from Bio import Entrez
import openai



In [2]:
# Function to retrieve all pathway IDs

# To do this for the entire KEGG database, replace the link below with each of the other general KEGG links. There should be 18 other links:
# genes, compound, drug, enzymes, glycans, reactions, modules, organisms, orthologs, diseases, branched pathways, genome, environmental information processing,
# chemical structures, peptides, riboswitch, ligand (this list may be incomplete).

def get_all_pathway_ids(database="pathway", timeout=30):
    """Return the entry IDs listed by the KEGG REST ``list`` operation.

    Args:
        database: Name appended to the ``/list/`` URL. The default,
            ``"pathway"``, reproduces the original request exactly.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection raises instead of hanging the notebook forever.

    Returns:
        A list holding the first whitespace-separated token of every
        non-blank response line, or an empty list when the HTTP status is
        not 200 (a message is printed in that case).
    """
    response = requests.get(f"http://rest.kegg.jp/list/{database}", timeout=timeout)
    if response.status_code != 200:
        print(f"Failed to retrieve data: Status code {response.status_code}")
        return []

    # Blank or whitespace-only lines split into an empty list; indexing [0]
    # on them would raise IndexError, so they are skipped.
    return [fields[0] for fields in map(str.split, response.text.splitlines()) if fields]

In [3]:
# Function to get PMIDs from a KEGG entry
def get_pmids_from_kegg_entry(kegg_id, timeout=30):
    """Fetch one KEGG entry and return the PMIDs mentioned in its text.

    Args:
        kegg_id: Identifier appended to the KEGG REST ``/get/`` URL.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection raises instead of blocking the whole crawl.

    Returns:
        A list of digit strings, one per ``PMID:<digits>`` occurrence in the
        response body (in order of appearance, duplicates kept), or an empty
        list when the HTTP status is not 200 (a message is printed).
    """
    response = requests.get(f"http://rest.kegg.jp/get/{kegg_id}", timeout=timeout)
    if response.status_code != 200:
        print(f"Failed to retrieve KEGG entry for {kegg_id}: Status code {response.status_code}")
        return []

    # Regular expression to extract PMIDs; optional whitespace may follow the colon.
    return re.findall(r'PMID:\s*(\d+)', response.text)

In [4]:
# Function to retrieve all PMIDs for the given pathway IDs
def retrieve_all_pmids(pathway_ids):
    """Look up the PMIDs for every KEGG ID and return them keyed by ID.

    Each ID is passed to ``get_pmids_from_kegg_entry``; the result (possibly
    an empty list) is stored under that ID and a one-line status is printed.
    After every third lookup the function sleeps for one second as a simple
    rate limit.

    Args:
        pathway_ids: Iterable of KEGG entry IDs.

    Returns:
        Dict mapping each KEGG ID to its list of PMIDs.
    """
    collected = {}
    # Count from 1 so "every third request" is a plain modulo test.
    for position, entry_id in enumerate(pathway_ids, start=1):
        found = get_pmids_from_kegg_entry(entry_id)
        collected[entry_id] = found

        # Print the retrieved PMIDs for debugging
        status = (
            f"{entry_id}: Retrieved PMIDs: {found}"
            if found
            else f"{entry_id}: No PMIDs found."
        )
        print(status)

        # Rate limiting: pause for 1 second after every 3 requests
        if position % 3 == 0:
            time.sleep(1)

    return collected

In [5]:
# Function to save the retrieved PMIDs to a CSV file
def save_pmids_to_csv(pmids_dict, filename="kegg_pmids_pathways.csv"):
    """Write one ``KEGG_ID,PMID`` row per PMID to a CSV file.

    Args:
        pmids_dict: Dict mapping a KEGG ID to a list of PMIDs. IDs whose list
            is empty produce no rows.
        filename: Destination path; the file is overwritten.
    """
    with open(filename, 'w', newline='') as out_file:
        out = csv.writer(out_file)
        out.writerow(["KEGG_ID", "PMID"])
        # Flatten {id: [pmid, ...]} into (id, pmid) rows.
        out.writerows(
            [entry_id, pmid]
            for entry_id, id_list in pmids_dict.items()
            for pmid in id_list
        )

In [13]:
# Main process: list the KEGG pathway IDs, look up the PMIDs cited in each
# entry, then write the (KEGG_ID, PMID) pairs to the default CSV file.
# NOTE(review): this issues one HTTP request per pathway ID, so a full run is
# slow; `pathway_ids` and `all_pmids` stay bound at module level afterwards.
pathway_ids = get_all_pathway_ids()
all_pmids = retrieve_all_pmids(pathway_ids)
save_pmids_to_csv(all_pmids)

map01100: No PMIDs found.
map01110: No PMIDs found.
map01120: No PMIDs found.
map01200: No PMIDs found.
map01210: No PMIDs found.
map01212: No PMIDs found.
map01230: No PMIDs found.
map01232: No PMIDs found.
map01250: No PMIDs found.
map01240: No PMIDs found.
map01220: No PMIDs found.
map00010: No PMIDs found.
map00020: No PMIDs found.
map00030: Retrieved PMIDs: ['12700258', '16788179', '16428816', '23279921', '16458304', '20023024']
map00040: Retrieved PMIDs: ['11741871', '15697206', '10572115', '10913097', '15901685']
map00051: No PMIDs found.
map00052: No PMIDs found.
map00053: Retrieved PMIDs: ['17222174', '15564123', '10517845', '16595667', '17462988', '14996803', '12644495']
map00500: No PMIDs found.
map00520: Retrieved PMIDs: ['2793832', '8071227', '8444803', '9811644', '10931327', '10984043', '11447132', '12562791', '14450717', '15134748', '15489439', '15695810', '15809294', '16237198', '16277754', '16452451', '16995900', '17085508', '17190829', '17827295', '18199744', '1830466

In [6]:
# here's the part where we load the CSV in order to get the abstract of the PMIDs

def load_kegg_pmids_csv(filename="kegg_pmids_pathways.csv"):
    """Read ``(KEGG_ID, PMID)`` pairs from a CSV file with those two headers.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list of ``(kegg_id, pmid)`` string tuples in file order.

    Raises:
        KeyError: If a row lacks the ``KEGG_ID`` or ``PMID`` column.
    """
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation requires; without it, line breaks inside quoted fields
    # are rewritten before the reader sees them.
    with open(filename, 'r', newline='') as csvfile:
        return [(row['KEGG_ID'], row['PMID']) for row in csv.DictReader(csvfile)]

# Read the (KEGG_ID, PMID) pairs back from the default CSV file
# (kegg_pmids_pathways.csv) into a module-level list.
kegg_pmids = load_kegg_pmids_csv()

In [22]:
# Fetch abstracts from PubMed
# Set the e-mail address on Bio.Entrez before any efetch call is made.
# NOTE(review): this is a hard-coded personal address; consider reading it
# from configuration before sharing the notebook.
Entrez.email = "ash.sze@tufts.edu"

def fetch_abstract(pmid):
    """Return the plain-text PubMed abstract record for one PMID.

    Args:
        pmid: PubMed identifier passed to ``Entrez.efetch`` as ``id``.

    Returns:
        Whatever ``handle.read()`` yields for the ``rettype="abstract"``,
        ``retmode="text"`` request.

    Raises:
        Any exception raised by ``Entrez.efetch`` or ``handle.read()``; the
        handle is closed before the exception propagates.
    """
    handle = Entrez.efetch(db="pubmed", id=pmid, rettype="abstract", retmode="text")
    try:
        return handle.read()
    finally:
        # Close even when read() fails so the connection is not leaked.
        handle.close()

# Fetch the PubMed text for every (KEGG_ID, PMID) pair; results are keyed by
# that pair so the same PMID cited by two KEGG entries is stored twice.
abstracts = {}
for kegg_id, pmid in kegg_pmids:
    try:
        abstract = fetch_abstract(pmid)
        abstracts[(kegg_id, pmid)] = abstract
        print(f"Fetched abstract for PMID: {pmid}")
    except Exception as e:
        # Best effort: a failed fetch is reported and skipped so one bad PMID
        # does not abort the run; that pair is simply absent from `abstracts`.
        print(f"Error fetching abstract for PMID: {pmid}: {e}")

Fetched abstract for PMID: 12700258
Fetched abstract for PMID: 16788179
Fetched abstract for PMID: 16428816
Fetched abstract for PMID: 23279921
Fetched abstract for PMID: 16458304
Fetched abstract for PMID: 20023024
Fetched abstract for PMID: 11741871
Fetched abstract for PMID: 15697206
Fetched abstract for PMID: 10572115
Fetched abstract for PMID: 10913097
Fetched abstract for PMID: 15901685
Fetched abstract for PMID: 17222174
Fetched abstract for PMID: 15564123
Fetched abstract for PMID: 10517845
Fetched abstract for PMID: 16595667
Fetched abstract for PMID: 17462988
Fetched abstract for PMID: 14996803
Fetched abstract for PMID: 12644495
Fetched abstract for PMID: 2793832
Fetched abstract for PMID: 8071227
Fetched abstract for PMID: 8444803
Fetched abstract for PMID: 9811644
Fetched abstract for PMID: 10931327
Fetched abstract for PMID: 10984043
Fetched abstract for PMID: 11447132
Fetched abstract for PMID: 12562791
Fetched abstract for PMID: 14450717
Fetched abstract for PMID: 15134

In [23]:
# Save abstracts to a CSV file
def save_abstracts_to_csv(abstracts, filename="kegg_abstracts.csv"):
    """Write the fetched abstracts to a UTF-8 CSV file.

    Args:
        abstracts: Dict mapping ``(kegg_id, pmid)`` tuples to abstract text.
        filename: Destination path; the file is overwritten.

    The file has a ``KEGG_ID,PMID,Abstract`` header followed by one row per
    dictionary entry.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        out = csv.writer(out_file)
        out.writerow(["KEGG_ID", "PMID", "Abstract"])
        # Unpack the tuple key into its own two columns.
        out.writerows(
            [entry_id, pubmed_id, text]
            for (entry_id, pubmed_id), text in abstracts.items()
        )

# Call this after fetching abstracts: writes them to the default CSV file
# (kegg_abstracts.csv), the same file load_abstracts_from_csv reads by default.
save_abstracts_to_csv(abstracts)

In [7]:
def load_abstracts_from_csv(filename="kegg_abstracts.csv"):
    """Read abstracts back from a CSV with KEGG_ID, PMID and Abstract columns.

    Args:
        filename: Path of the UTF-8 CSV file to read.

    Returns:
        Dict mapping ``(kegg_id, pmid)`` string tuples to the abstract text.
        If the same pair appears on several rows, the last row wins.

    Raises:
        KeyError: If a row lacks one of the three expected columns.
    """
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation requires. Abstracts are multi-line quoted fields, and
    # without it their embedded line breaks are rewritten on the way in, so
    # the text would not round-trip unchanged.
    with open(filename, 'r', newline='', encoding='utf-8') as csvfile:
        return {
            (row['KEGG_ID'], row['PMID']): row['Abstract']
            for row in csv.DictReader(csvfile)
        }

In [10]:
# Load the environment variables from the .env file.
# The location can be overridden with the GPT_ENV_FILE environment variable;
# when it is unset or empty the original hard-coded path is used, so existing
# setups behave exactly as before.
env_file = os.environ.get("GPT_ENV_FILE") or '/Users/asze01/Code/Hassoun-Lab/GPT.env'
load_dotenv(env_file)

# Retrieve the API key once and use it everywhere
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("API key not found. Please check your .env file.")

# Set the key on the openai module itself; generate_qa_from_abstract calls
# openai.chat.completions directly rather than building its own client.
openai.api_key = api_key

def generate_qa_from_abstract(abstract):
    """Ask the chat model for a KEGG-oriented question-answer pair.

    The abstract is embedded in a fixed instruction prompt and sent, together
    with a generic system message, to ``openai.chat.completions.create`` with
    model ``gpt-4o-mini`` and temperature 0.0.

    Args:
        abstract: Abstract text to interpolate into the prompt.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    # Assembled from explicit literals; the leading newline, the four-space
    # indentation of each line and the trailing spaces are part of the prompt.
    prompt = (
        "\n"
        "    The following abstract is from a research paper that is associated with a KEGG entry. KEGG typically highlights information relevant to biological pathways, gene functions, enzymes, and metabolic processes. \n"
        "    Please identify the relevant parts of this abstract that would be used in KEGG and generate a question-answer pair based only on that information.\n"
        "    \n"
        "    Abstract:\n"
        f"    {abstract}\n"
        "    \n"
        "    QA:\n"
        "    "
    )
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    completion = openai.chat.completions.create(
        model="gpt-4o-mini",  # Or another GPT model
        messages=conversation,
        temperature=0.0,
    )
    return completion.choices[0].message.content

# Load abstracts from the CSV file
# (rebinds `abstracts` from the saved file, so this cell works without re-running the PubMed fetch)
abstracts = load_abstracts_from_csv()

**TODO:** Consider sampling 15 abstracts at random instead of always taking the first 15.

In [13]:
qa_data = []

# Cap on how many abstracts are sent to the model in this run.
# Set to None to process every abstract.
MAX_ABSTRACTS = 15

# Only the first MAX_ABSTRACTS entries, in dictionary order, are processed;
# slicing with None keeps the whole list.
for (kegg_id, pmid), abstract in list(abstracts.items())[:MAX_ABSTRACTS]:
    qa_pairs = generate_qa_from_abstract(abstract)
    qa_data.append((kegg_id, pmid, qa_pairs))
    print(f"Generated QA for PMID: {pmid}")

Generated QA for PMID: 12700258
Generated QA for PMID: 16788179
Generated QA for PMID: 16428816
Generated QA for PMID: 23279921
Generated QA for PMID: 16458304
Generated QA for PMID: 20023024
Generated QA for PMID: 11741871
Generated QA for PMID: 15697206
Generated QA for PMID: 10572115
Generated QA for PMID: 10913097
Generated QA for PMID: 15901685
Generated QA for PMID: 17222174
Generated QA for PMID: 15564123
Generated QA for PMID: 10517845
Generated QA for PMID: 16595667


In [14]:
# Function to print the QA data
def print_qa_data(qa_data):
    """Print every QA record as a labelled block followed by a divider.

    Args:
        qa_data: Iterable of ``(kegg_id, pmid, qa_pairs)`` triples.
    """
    # Separator for readability: a blank line, 40 '=' characters, a blank line.
    divider = "\n" + "=" * 40 + "\n"
    for record in qa_data:
        kegg_id, pmid, qa_pairs = record
        print(
            f"KEGG ID: {kegg_id}",
            f"PMID: {pmid}",
            "QA Pairs:",
            qa_pairs,
            divider,
            sep="\n",
        )

# Print each generated QA record with its KEGG ID and PMID for manual review
print_qa_data(qa_data)

KEGG ID: map00030
PMID: 12700258
QA Pairs:
**Question:** What is the role of the phnN gene in Escherichia coli, and what enzymatic activity does it encode?

**Answer:** The phnN gene in Escherichia coli encodes ribose 1,5-bisphosphokinase activity, which is involved in the synthesis of 5-phospho-D-ribosyl alpha-1-diphosphate (PRPP) and plays a dual role in phosphonate degradation and NAD biosynthesis pathways.


KEGG ID: map00030
PMID: 16788179
QA Pairs:
**Question:** What is the role of the ribulose monophosphate (RuMP) pathway in the archaeon Thermococcus kodakaraensis, and which enzymes are involved in this pathway?

**Answer:** The ribulose monophosphate (RuMP) pathway serves as a substitute for the missing pentose phosphate pathway in the archaeon Thermococcus kodakaraensis. It is involved in the fixation and detoxification of formaldehyde. The key enzymes in this pathway are 3-hexulose-6-phosphate synthase (HPS) and 6-phospho-3-hexuloisomerase (PHI), which are essential for the b

In [None]:
def save_qa_to_csv(qa_data, filename="kegg_qa_pairs.csv"):
    """Write the generated QA records to a UTF-8 CSV file.

    Args:
        qa_data: Iterable of ``(kegg_id, pmid, qa_pairs)`` triples.
        filename: Destination path; the file is overwritten.

    The file has a ``KEGG_ID,PMID,QA_Pairs`` header followed by one row per
    triple.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        out = csv.writer(out_file)
        out.writerow(["KEGG_ID", "PMID", "QA_Pairs"])
        # Unpacking enforces the three-field shape of every record.
        out.writerows([entry_id, pubmed_id, qa_text] for entry_id, pubmed_id, qa_text in qa_data)

# Write the generated QA records to the default CSV file (kegg_qa_pairs.csv).
save_qa_to_csv(qa_data)