In [1]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [2]:
import requests
from xml.etree import ElementTree
import pandas as pd

def pubmed_search(query, retmax=10):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pubmed",
        "term": query,
        "retmode": "xml",
        "retmax": retmax
    }
    
    response = requests.get(base_url, params=params)
    response.raise_for_status()
    
    tree = ElementTree.fromstring(response.content)
    ids = [id_elem.text for id_elem in tree.findall(".//Id")]
    
    return ids

def fetch_details(id_list):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    ids = ",".join(id_list)
    params = {
        "db": "pubmed",
        "id": ids,
        "retmode": "xml",
        "rettype": "abstract"
    }
    
    response = requests.get(base_url, params=params)
    response.raise_for_status()
    
    return response.content

In [3]:
def preprocess_details_to_dataframe(xml_data):
    tree = ElementTree.fromstring(xml_data)
    articles = []

    for article in tree.findall(".//PubmedArticle"):
        article_data = {}
        
        # Extracting the title
        title_elem = article.find(".//ArticleTitle")
        article_data['Title'] = title_elem.text if title_elem is not None else "N/A"
        
        # Extracting the authors
        authors = []
        for author in article.findall(".//Author"):
            last_name = author.find("LastName")
            fore_name = author.find("ForeName")
            if last_name is not None and fore_name is not None:
                authors.append(f"{fore_name.text} {last_name.text}")
        article_data['Authors'] = ", ".join(authors) if authors else "N/A"
        
        # Extracting the journal name
        journal_elem = article.find(".//Journal/Title")
        article_data['Journal'] = journal_elem.text if journal_elem is not None else "N/A"
        
        # Extracting the publication date
        pub_date_elem = article.find(".//PubDate")
        if pub_date_elem is not None:
            year_elem = pub_date_elem.find("Year")
            month_elem = pub_date_elem.find("Month")
            day_elem = pub_date_elem.find("Day")
            pub_date = f"{year_elem.text if year_elem is not None else ''}-{month_elem.text if month_elem is not None else ''}-{day_elem.text if day_elem is not None else ''}"
            article_data['Publication Date'] = pub_date
        else:
            article_data['Publication Date'] = "N/A"
        
        # Extracting the abstract
        abstract_elem = article.find(".//Abstract/AbstractText")
        article_data['Abstract'] = abstract_elem.text if abstract_elem is not None else "N/A"
        
        articles.append(article_data)

    df = pd.DataFrame(articles)
    return df

In [4]:
def fetch_full_text(pmc_id):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {
        "db": "pmc",
        "id": pmc_id,
        "retmode": "xml"
    }
    
    response = requests.get(base_url, params=params)
    response.raise_for_status()
    
    return response.content

In [5]:
def preprocess_full_text_to_plain_text(xml_data):
    tree = ElementTree.fromstring(xml_data)
    text_content = []

    # Extract text from <body> element of the article
    body_elem = tree.find(".//body")
    if body_elem is not None:
        for elem in body_elem.iter():
            if elem.text:
                text_content.append(elem.text.strip())
            if elem.tail:
                text_content.append(elem.tail.strip())
    
    plain_text = "\n".join(text_content)
    return plain_text

In [6]:
def fetch_text_from_pubmed_id(id):
    # Convert PubMed ID to PMC ID using eLink
    elink_base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi"
    elink_params = {
        "dbfrom": "pubmed",
        "db": "pmc",
        "id": id,
        "retmode": "xml"
    }
    
    elink_response = requests.get(elink_base_url, params=elink_params)
    elink_response.raise_for_status()
    
    elink_tree = ElementTree.fromstring(elink_response.content)
    pmc_id_elem = elink_tree.find(".//LinkSetDb/Link/Id")
    if pmc_id_elem is not None:
        pmc_id = pmc_id_elem.text
        full_text_xml = fetch_full_text(pmc_id)
        plain_text = preprocess_full_text_to_plain_text(full_text_xml)
        return plain_text
    else:
        print("Full text not available in PMC for this article.")
        # Return the abstract instead
        abstract = df.loc[df['Title'] == df.iloc[0]['Title'], 'Abstract'].values[0]
        return abstract

In [7]:
def fetch_articles_from_query(query):
    ids = pubmed_search(query, retmax=10)  # You can adjust retmax to get more results
    
    if ids:
        details = fetch_details(ids)
        df = preprocess_details_to_dataframe(details)

        return df, ids
    else:
        print("No results found")
        return None, None

In [8]:
# "de Kok et al. 1996;
# Ahn et al. 2009; Naranjo et al. 2010"
query = "deafness Kok AND 1996[dp]"
df, ids = fetch_articles_from_query(query)

In [9]:
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.max_colwidth', None)  # Show full content of each column
pd.set_option('display.width', None)  # Set the display width to be unlimited

In [10]:
df

Unnamed: 0,Title,Authors,Journal,Publication Date,Abstract
0,Identification of a hot spot for microdeletions in patients with X-linked deafness type 3 (DFN3) 900 kb proximal to the DFN3 gene POU3F4.,"Y J de Kok, E R Vossenaar, C W Cremers, N Dahl, J Laporte, L J Hu, D Lacombe, N Fischel-Ghodsian, R A Friedman, L S Parnes, P Thorpe, M Bitner-Glindzicz, H J Pander, H Heilbronner, J Graveline, J T den Dunnen, H G Brunner, H H Ropers, F P Cremers",Human molecular genetics,1996-Sep-,"Small mutations in the POU domain gene POU3F4 were recently shown to cause X-linked deafness type 3 (DFN3) in nine unrelated males. The POU3F4 gene was found to be located outside four of five deletions associated with DFN3. Two of these deletions were situated more than 400 kb proximal to POU3F4. Employing PCR analysis of sequence tagged sites from this region we initially identified novel deletions in two DFN3 patients. To investigate this chromosomal segment in more detail, we extended a previously established 850 kb cosmid contig in the centromeric direction to a total size of 1500 kb. Cosmids from this contig were hybridized to DNA of 11 unrelated males with DFN3. In two patients, we identified deletions encompassing the POU3F4 gene and variably sized segments of Xq21.1. In six of the nine remaining patients which lacked mutations in the POU3F4 gene, smaller deletions were identified which, with one exception, overlap in a 8 kb segment 900 kb proximal to the POU3F4 gene. In one patient, we identified several small deletions in the vicinity of the 8 kb DNA segment. Together, deletions account for 56% (13/23) of all known DFN3 mutations, most (10/13) of which do not encompass the POU3F4 gene. The combined molecular data suggest that the deletion hot spot region in Xq21.1 contains another DFN3 gene or, alternatively, a sequence element involved in transcriptional regulation of POU3F4."


In [11]:
text = fetch_text_from_pubmed_id(ids[0])
text[:100]

'\n\n\n\nIntroduction\n\nThe genetic basis for multiple Mendelian conditions was initially identified by st'

In [12]:
# "de Kok et al. 1996;
# Ahn et al. 2009; Naranjo et al. 2010"
query = "pou3f4 Naranjo AND 2010[dp]"
prompt_df, ids = fetch_articles_from_query(query)
prompt_df

Unnamed: 0,Title,Authors,Journal,Publication Date,Abstract
0,Characterization of new otic enhancers of the pou3f4 gene reveal distinct signaling pathway regulation and spatio-temporal patterns.,"Àlex Robert-Moreno, Silvia Naranjo, Elisa de la Calle-Mustienes, José Luis Gómez-Skarmeta, Berta Alsina",PloS one,2010-Dec-31,"POU3F4 is a member of the POU-homedomain transcription factor family with a prominent role in inner ear development. Mutations in the human POU3F4 coding unit leads to X-linked deafness type 3 (DFN3), characterized by conductive hearing loss and progressive sensorineural deafness. Microdeletions found 1 Mb 5' upstream of the coding region also displayed the same phenotype, suggesting that cis-regulatory elements might be present in that region. Indeed, we and others have recently identified several enhancers at the 1 Mb 5' upstream interval of the pou3f4 locus. Here we characterize the spatio-temporal patterns of these regulatory elements in zebrafish transgenic lines. We show that the most distal enhancer (HCNR 81675) is activated earlier and drives GFP reporter expression initially to a broad ear domain to progressively restrict to the sensory patches. The proximal enhancer (HCNR 82478) is switched later during development and promotes expression, among in other tissues, in sensory patches from its onset. The third enhancer (HCNR 81728) is also active at later stages in the otic mesenchyme and in the otic epithelium. We also characterize the signaling pathways regulating these enhancers. While HCNR 81675 is regulated by very early signals of retinoic acid, HCNR 82478 is regulated by Fgf activity at a later stage and the HCNR 81728 enhancer is under the control of Hh signaling. Finally, we show that Sox2 and Pax2 transcription factors are bound to HCNR 81675 genomic region during otic development and specific mutations to these transcription factor binding sites abrogates HCNR 81675 enhancer activity. Altogether, our results suggest that pou3f4 expression in inner ear might be under the control of distinct regulatory elements that fine-tune the spatio-temporal activity of this gene and provides novel data on the signaling mechanisms controlling pou3f4 function."
1,Multiple enhancers located in a 1-Mb region upstream of POU3F4 promote expression during inner ear development and may be required for hearing.,"Silvia Naranjo, Krysta Voesenek, Elisa de la Calle-Mustienes, Alex Robert-Moreno, Haris Kokotas, Maria Grigoriadou, John Economides, Guy Van Camp, Nele Hilgert, Felipe Moreno, Berta Alsina, Michael B Petersen, Hannie Kremer, José Luis Gómez-Skarmeta",Human genetics,2010-Oct-,"POU3F4 encodes a POU-domain transcription factor required for inner ear development. Defects in POU3F4 function are associated with X-linked deafness type 3 (DFN3). Multiple deletions affecting up to ~900-kb upstream of POU3F4 are found in DFN3 patients, suggesting the presence of essential POU3F4 enhancers in this region. Recently, an inner ear enhancer was reported that is absent in most DFN3 patients with upstream deletions. However, two indications suggest that additional enhancers in the POU3F4 upstream region are required for POU3F4 function during inner ear development. First, there is at least one DFN3 deletion that does not eliminate the reported enhancer. Second, the expression pattern driven by this enhancer does not fully recapitulate Pou3f4 expression in the inner ear. Here, we screened a 1-Mb region upstream of the POU3F4 gene for additional cis-regulatory elements and searched for novel DFN3 mutations in the identified POU3F4 enhancers. We found several novel enhancers for otic vesicle expression. Some of these also drive expression in kidney, pancreas and brain, tissues that are known to express Pou3f4. In addition, we report a new and smallest deletion identified so far in a DFN3 family which eliminates 3.9 kb, comprising almost exclusively the previous reported inner ear enhancer. We suggest that multiple enhancers control the expression of Pou3f4 in the inner ear and these may contribute to the phenotype observed in DFN3 patients. In addition, the novel deletion demonstrates that the previous reported enhancer, although not sufficient, is essential for POU3F4 function during inner ear development."


In [13]:
query = "pou3f4 Ahn AND 2009[dp]"
prompt_df, ids = fetch_articles_from_query(query)
prompt_df

Unnamed: 0,Title,Authors,Journal,Publication Date,Abstract
0,Otic mesenchyme expression of Cre recombinase directed by the inner ear enhancer of the Brn4/Pou3f4 gene.,"Kyung J Ahn, Frank Passero, E Bryan Crenshaw","Genesis (New York, N.Y. : 2000)",2009-Mar-,"Brn4/Pou3f4 is a POU-domain transcription factor expressed in the otic mesenchyme that is required for the normal development of the inner ear. In this report, we describe the isolation of an otic mesenchyme enhancer in the Brn4 gene. Subsequently, this enhancer was used to drive the expression of Cre recombinase in the otic mesenchyme of transgenic mice. When intercrossed with the ROSA reporter strain, R26R, ss-galactosidase expression is detected in several inner ear structures derived from otic mesenchyme, including the temporal bone, spiral ligament, spiral limbus, and mesenchyme underlying sensory epithelium of the utricle, saccule and semicircular canals. Thus, this Cre pedigree can induce conditional rearrangement of genes in the otic mesenchyme, and will serve as a powerful genetic tool to characterize the function of genes in the mesenchymal tissues of the inner ear."


In [14]:
import pandas as pd
import re

# Load the CSV file into a pandas DataFrame
file_path = '~/Documents/Badawcze/enhancerlit/enhancer_candidates.csv'
df = pd.read_csv(file_path)

# Function to create queries from the Literature column
def create_queries(row):
    gene_name = re.sub(r'[^a-zA-Z0-9]', '', row['Gene name'].lower())  # Remove non-alphanumeric characters
    queries = []
    if pd.notna(row['Literature']):
        for ref in row['Literature'].split(';'):
            ref = ref.strip()
            parts = ref.split()
            if len(parts) >= 3:
                # Remove 'et al.' and 'i wsp.' from the author's name
                if 'et' in parts and 'al.' in parts:
                    etal_index = parts.index('et')
                    first_author_surname = ' '.join(parts[:etal_index])
                elif 'i' in parts and 'wsp.' in parts:
                    iwsp_index = parts.index('i')
                    first_author_surname = ' '.join(parts[:iwsp_index])
                else:
                    first_author_surname = ' '.join(parts[:-1])  # Include all parts except the last one as the surname
                
                year_of_publication = parts[-1].strip(';,')
                query = f"{gene_name} {first_author_surname} AND {year_of_publication}[dp]"
                queries.append(query)
    return queries

# Apply the function to each row in the DataFrame
df['Queries'] = df.apply(create_queries, axis=1)

# Extract the list of lists of queries
list_of_queries = df['Queries'].tolist()

In [15]:
# Display the resulting list of lists of queries
list_of_queries

[['pou3f4 de Kok AND 1996[dp]',
  'pou3f4 Ahn AND 2009[dp]',
  'pou3f4 Naranjo AND 2010[dp]'],
 ['sost Balemans AND 2002[dp]', 'sost Loots AND 2005[dp]'],
 ['shox Sabherwal AND 2007[dp]',
  'shox Fukami AND 2005[dp]',
  'shox Benito-Sanz AND 2012[dp]'],
 ['nr0b1 Smyk AND 2007b[dp]'],
 ['sry McElreavy AND 1992[dp]',
  'sry Capel AND 1993[dp]',
  'sry McElreavy AND 1996[dp]',
  'sry Sharp AND 2005[dp]'],
 ['hbb Driscoll AND 1989[dp]'],
 ['hba1hba2 Viprakasit AND 2006[dp]'],
 ['pitx2 Trembath AND 2004[dp]'],
 ['foxc1 Davies AND 1999[dp]'],
 ['foxl2 Beysen AND 2005[dp]', "foxl2 D'haene AND 2009[dp]"],
 ['sox9 Benko AND 2009[dp]',
  'sox9 Fukami AND 2012[dp]',
  'sox9 Sanchez-Castro AND 2013[dp]'],
 ['sox9 Kim AND 2015[dp]'],
 ['pitx2 Volkmann AND 2011[dp]'],
 ['atoh7 Ghiasvand AND 2011[dp]'],
 ['dlx6dlx5 Brown AND 2010[dp]'],
 ['dlx6dlx5 Tayebi AND 2014[dp]'],
 ['pax6 Wawrocka AND 2012[dp]'],
 ['foxg1 Allou AND 2012[dp]'],
 ['foxf1 Szafranski AND 2013[dp]'],
 ['nr0b1 Skinningsrud AND 2009[

In [16]:
list_of_queries[0]

['pou3f4 de Kok AND 1996[dp]',
 'pou3f4 Ahn AND 2009[dp]',
 'pou3f4 Naranjo AND 2010[dp]']

In [31]:
research_id = 1

In [19]:
ids_for_row_2d = [fetch_articles_from_query(q)[research_id] for q in list_of_queries[0]]
# Flatten
ids_for_row = [query for sublist in ids_for_row_2d for query in sublist]

ids_for_row

['8872461', '19217071', '21209840', '20668882']

In [20]:
texts = [fetch_text_from_pubmed_id(id) for id in ids_for_row]

## Retreive metadata of searches

In [30]:
def generate_string_list(row):
    string_list = []
    for column in row.index[:-3]:  # Exclude the last two columns
        value = row[column]
        # Ensure that NaN values are handled appropriately
        value_str = str(value) if pd.notna(value) else "brak danych"
        string_list.append(f"{column}: {value_str}")
    return string_list

searches_metadata = df.apply(generate_string_list, axis=1).tolist()

# Answering the question about ClinVar and coordinates
Having the full texts of the biomedical articles

In [None]:
pip install openai==0.28

In [21]:
import openai
from langchain.prompts import PromptTemplate

# Set your OpenAI API key
openai.api_key = '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'

In [22]:
openai.__version__

'0.28.0'

In [43]:
# Define the context strings
context_strings = [f"CONTEXT INFO: Publication {i}: {text}" for i, text in enumerate(texts)]
context_strings.extend([f"CONTEXT INFO: {x}" for x in searches_metadata[research_id]])

In [44]:
# Combine the context strings into a single context
context = "\n".join(context_strings)

# Define the prompt template with the context
prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template="""
    Context:
    {context}
    
    Question:
    {question}
    """
)

# Create the final prompt using the template
def create_prompt(context, question):
    return prompt_template.format(context=context, question=question)

# Define the question you want to ask
# question = "What are the related ClinVar variant ids in this context?"
question = "What are the exact coordinates of this enhancer?"

# Create the final prompt
final_prompt = create_prompt(context, question)

# Call the OpenAI API with the final prompt using the new interface
response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": final_prompt}
    ]
)

# Print the response
print(response.choices[0].message["content"])

InvalidRequestError: This model's maximum context length is 16385 tokens. However, your messages resulted in 50218 tokens. Please reduce the length of the messages.

### Testing BiomedBERT (formerly: PubMedBERT)
Demands fine-tuning on a downstream task

In [None]:
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pre-trained GENE-BERT model and tokenizer from Hugging Face
model_name = "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext"  # Replace with the actual model name if available on Hugging Face
model = AutoModelForTokenClassification.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to perform inference
def predict(text):
    inputs = tokenizer(text, return_tensors="pt").to(device)
    print(inputs)
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Assuming the model is for token classification, extract predictions
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1).cpu().numpy()
    
    # Convert predictions to human-readable labels (assuming a label map is available)
    label_map = {i: label for i, label in enumerate(tokenizer.get_vocab().keys())}
    predicted_labels = [label_map[prediction] for prediction in predictions[0]]
    
    return predicted_labels

# Example usage
text = "Example genomic text for testing GENE-BERT."
predictions = predict(text)
print("Predictions:", predictions)


In [20]:
text = ""
predictions = predict(text)
print("Predictions:", predictions)


{'input_ids': tensor([[   2,    4, 1977,   43, 2798, 9241, 2359,   18,    3]],
       device='cuda:0'), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}
Predictions: ['lesion', 'lesion', 'lesion', 'lesion', 'lesion', 'lesion', 'lesion', 'linkage', 'linkage']
