In [1]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [1]:
import requests
from xml.etree import ElementTree
import pandas as pd

def pubmed_search(query, retmax=10):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pubmed",
        "term": query,
        "retmode": "xml",
        "retmax": retmax
    }
    
    response = requests.get(base_url, params=params)
    response.raise_for_status()
    
    tree = ElementTree.fromstring(response.content)
    ids = [id_elem.text for id_elem in tree.findall(".//Id")]
    
    return ids

def fetch_details(id_list):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    ids = ",".join(id_list)
    params = {
        "db": "pubmed",
        "id": ids,
        "retmode": "xml",
        "rettype": "abstract"
    }
    
    response = requests.get(base_url, params=params)
    response.raise_for_status()
    
    return response.content

In [2]:
def preprocess_details_to_dataframe(xml_data):
    tree = ElementTree.fromstring(xml_data)
    articles = []

    for article in tree.findall(".//PubmedArticle"):
        article_data = {}
        
        # Extracting the title
        title_elem = article.find(".//ArticleTitle")
        article_data['Title'] = title_elem.text if title_elem is not None else "N/A"
        
        # Extracting the authors
        authors = []
        for author in article.findall(".//Author"):
            last_name = author.find("LastName")
            fore_name = author.find("ForeName")
            if last_name is not None and fore_name is not None:
                authors.append(f"{fore_name.text} {last_name.text}")
        article_data['Authors'] = ", ".join(authors) if authors else "N/A"
        
        # Extracting the journal name
        journal_elem = article.find(".//Journal/Title")
        article_data['Journal'] = journal_elem.text if journal_elem is not None else "N/A"
        
        # Extracting the publication date
        pub_date_elem = article.find(".//PubDate")
        if pub_date_elem is not None:
            year_elem = pub_date_elem.find("Year")
            month_elem = pub_date_elem.find("Month")
            day_elem = pub_date_elem.find("Day")
            pub_date = f"{year_elem.text if year_elem is not None else ''}-{month_elem.text if month_elem is not None else ''}-{day_elem.text if day_elem is not None else ''}"
            article_data['Publication Date'] = pub_date
        else:
            article_data['Publication Date'] = "N/A"
        
        # Extracting the abstract
        abstract_elem = article.find(".//Abstract/AbstractText")
        article_data['Abstract'] = abstract_elem.text if abstract_elem is not None else "N/A"
        
        articles.append(article_data)

    df = pd.DataFrame(articles)
    return df

In [3]:
def fetch_full_text(pmc_id):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {
        "db": "pmc",
        "id": pmc_id,
        "retmode": "xml"
    }
    
    response = requests.get(base_url, params=params)
    response.raise_for_status()
    
    return response.content

In [4]:
def preprocess_full_text_to_plain_text(xml_data):
    tree = ElementTree.fromstring(xml_data)
    text_content = []

    # Extract text from <body> element of the article
    body_elem = tree.find(".//body")
    if body_elem is not None:
        for elem in body_elem.iter():
            if elem.text:
                text_content.append(elem.text.strip())
            if elem.tail:
                text_content.append(elem.tail.strip())
    
    plain_text = "\n".join(text_content)
    return plain_text

In [5]:
def fetch_text_from_pubmed_id(id):
    # Convert PubMed ID to PMC ID using eLink
    elink_base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi"
    elink_params = {
        "dbfrom": "pubmed",
        "db": "pmc",
        "id": id,
        "retmode": "xml"
    }
    
    elink_response = requests.get(elink_base_url, params=elink_params)
    elink_response.raise_for_status()
    
    elink_tree = ElementTree.fromstring(elink_response.content)
    pmc_id_elem = elink_tree.find(".//LinkSetDb/Link/Id")
    if pmc_id_elem is not None:
        pmc_id = pmc_id_elem.text
        full_text_xml = fetch_full_text(pmc_id)
        plain_text = preprocess_full_text_to_plain_text(full_text_xml)
        return plain_text
    else:
        print("Full text not available in PMC for this article.")
        # Return the abstract instead
        abstract = df.loc[df['Title'] == df.iloc[0]['Title'], 'Abstract'].values[0]
        return abstract

In [6]:
def fetch_articles_from_query(query):
    ids = pubmed_search(query, retmax=10)  # You can adjust retmax to get more results
    
    if ids:
        details = fetch_details(ids)
        df = preprocess_details_to_dataframe(details)

        return df, ids
    else:
        print("No results found")
        return None, None

In [7]:
# "de Kok et al. 1996;
# Ahn et al. 2009; Naranjo et al. 2010"
query = "pou3f4 Kok AND 1996[dp]"
df, ids = fetch_articles_from_query(query)

In [8]:
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.max_colwidth', None)  # Show full content of each column
pd.set_option('display.width', None)  # Set the display width to be unlimited

In [9]:
df

Unnamed: 0,Title,Authors,Journal,Publication Date,Abstract
0,Identification of a hot spot for microdeletions in patients with X-linked deafness type 3 (DFN3) 900 kb proximal to the DFN3 gene POU3F4.,"Y J de Kok, E R Vossenaar, C W Cremers, N Dahl, J Laporte, L J Hu, D Lacombe, N Fischel-Ghodsian, R A Friedman, L S Parnes, P Thorpe, M Bitner-Glindzicz, H J Pander, H Heilbronner, J Graveline, J T den Dunnen, H G Brunner, H H Ropers, F P Cremers",Human molecular genetics,1996-Sep-,"Small mutations in the POU domain gene POU3F4 were recently shown to cause X-linked deafness type 3 (DFN3) in nine unrelated males. The POU3F4 gene was found to be located outside four of five deletions associated with DFN3. Two of these deletions were situated more than 400 kb proximal to POU3F4. Employing PCR analysis of sequence tagged sites from this region we initially identified novel deletions in two DFN3 patients. To investigate this chromosomal segment in more detail, we extended a previously established 850 kb cosmid contig in the centromeric direction to a total size of 1500 kb. Cosmids from this contig were hybridized to DNA of 11 unrelated males with DFN3. In two patients, we identified deletions encompassing the POU3F4 gene and variably sized segments of Xq21.1. In six of the nine remaining patients which lacked mutations in the POU3F4 gene, smaller deletions were identified which, with one exception, overlap in a 8 kb segment 900 kb proximal to the POU3F4 gene. In one patient, we identified several small deletions in the vicinity of the 8 kb DNA segment. Together, deletions account for 56% (13/23) of all known DFN3 mutations, most (10/13) of which do not encompass the POU3F4 gene. The combined molecular data suggest that the deletion hot spot region in Xq21.1 contains another DFN3 gene or, alternatively, a sequence element involved in transcriptional regulation of POU3F4."


In [10]:
text = fetch_text_from_pubmed_id(ids[0])
text[:1000]

'\n\n\n\nIntroduction\n\nThe genetic basis for multiple Mendelian conditions was initially identified by studying individuals harboring chromosomal translocations, which provided a signpost for where in the genome a gene was disrupted. It quickly became apparent that many of these chromosomal translocations did not disrupt coding sequence, but rather disrupted the positioning of coding sequence relative to a distal regulatory element or gene promoter (Vortkamp et al.\n\n1991\n; Wallis et al.\n1999\n; Fang et al.\n2000\n; Crisponi et al.\n2001\n). These initial studies helped establish that non-coding genetic variation can cause numerous Mendelian conditions, and work over the past several decades has solidified the central role of non-coding genetic variation in the pathogenesis of hundreds of Mendelian conditions.\nIn this review, we compiled hundreds of non-coding genetic variants from ClinVar and the literature that cause rare human diseases via the disruption of gene regulatory pat

In [11]:
# "de Kok et al. 1996;
# Ahn et al. 2009; Naranjo et al. 2010"
query = "pou3f4 Naranjo AND 2010[dp]"
prompt_df, ids = fetch_articles_from_query(query)
prompt_df

Unnamed: 0,Title,Authors,Journal,Publication Date,Abstract
0,Characterization of new otic enhancers of the pou3f4 gene reveal distinct signaling pathway regulation and spatio-temporal patterns.,"Àlex Robert-Moreno, Silvia Naranjo, Elisa de la Calle-Mustienes, José Luis Gómez-Skarmeta, Berta Alsina",PloS one,2010-Dec-31,"POU3F4 is a member of the POU-homedomain transcription factor family with a prominent role in inner ear development. Mutations in the human POU3F4 coding unit leads to X-linked deafness type 3 (DFN3), characterized by conductive hearing loss and progressive sensorineural deafness. Microdeletions found 1 Mb 5' upstream of the coding region also displayed the same phenotype, suggesting that cis-regulatory elements might be present in that region. Indeed, we and others have recently identified several enhancers at the 1 Mb 5' upstream interval of the pou3f4 locus. Here we characterize the spatio-temporal patterns of these regulatory elements in zebrafish transgenic lines. We show that the most distal enhancer (HCNR 81675) is activated earlier and drives GFP reporter expression initially to a broad ear domain to progressively restrict to the sensory patches. The proximal enhancer (HCNR 82478) is switched later during development and promotes expression, among in other tissues, in sensory patches from its onset. The third enhancer (HCNR 81728) is also active at later stages in the otic mesenchyme and in the otic epithelium. We also characterize the signaling pathways regulating these enhancers. While HCNR 81675 is regulated by very early signals of retinoic acid, HCNR 82478 is regulated by Fgf activity at a later stage and the HCNR 81728 enhancer is under the control of Hh signaling. Finally, we show that Sox2 and Pax2 transcription factors are bound to HCNR 81675 genomic region during otic development and specific mutations to these transcription factor binding sites abrogates HCNR 81675 enhancer activity. Altogether, our results suggest that pou3f4 expression in inner ear might be under the control of distinct regulatory elements that fine-tune the spatio-temporal activity of this gene and provides novel data on the signaling mechanisms controlling pou3f4 function."
1,Multiple enhancers located in a 1-Mb region upstream of POU3F4 promote expression during inner ear development and may be required for hearing.,"Silvia Naranjo, Krysta Voesenek, Elisa de la Calle-Mustienes, Alex Robert-Moreno, Haris Kokotas, Maria Grigoriadou, John Economides, Guy Van Camp, Nele Hilgert, Felipe Moreno, Berta Alsina, Michael B Petersen, Hannie Kremer, José Luis Gómez-Skarmeta",Human genetics,2010-Oct-,"POU3F4 encodes a POU-domain transcription factor required for inner ear development. Defects in POU3F4 function are associated with X-linked deafness type 3 (DFN3). Multiple deletions affecting up to ~900-kb upstream of POU3F4 are found in DFN3 patients, suggesting the presence of essential POU3F4 enhancers in this region. Recently, an inner ear enhancer was reported that is absent in most DFN3 patients with upstream deletions. However, two indications suggest that additional enhancers in the POU3F4 upstream region are required for POU3F4 function during inner ear development. First, there is at least one DFN3 deletion that does not eliminate the reported enhancer. Second, the expression pattern driven by this enhancer does not fully recapitulate Pou3f4 expression in the inner ear. Here, we screened a 1-Mb region upstream of the POU3F4 gene for additional cis-regulatory elements and searched for novel DFN3 mutations in the identified POU3F4 enhancers. We found several novel enhancers for otic vesicle expression. Some of these also drive expression in kidney, pancreas and brain, tissues that are known to express Pou3f4. In addition, we report a new and smallest deletion identified so far in a DFN3 family which eliminates 3.9 kb, comprising almost exclusively the previous reported inner ear enhancer. We suggest that multiple enhancers control the expression of Pou3f4 in the inner ear and these may contribute to the phenotype observed in DFN3 patients. In addition, the novel deletion demonstrates that the previous reported enhancer, although not sufficient, is essential for POU3F4 function during inner ear development."


In [12]:
query = "pou3f4 Ahn AND 2009[dp]"
prompt_df, ids = fetch_articles_from_query(query)
prompt_df

Unnamed: 0,Title,Authors,Journal,Publication Date,Abstract
0,Otic mesenchyme expression of Cre recombinase directed by the inner ear enhancer of the Brn4/Pou3f4 gene.,"Kyung J Ahn, Frank Passero, E Bryan Crenshaw","Genesis (New York, N.Y. : 2000)",2009-Mar-,"Brn4/Pou3f4 is a POU-domain transcription factor expressed in the otic mesenchyme that is required for the normal development of the inner ear. In this report, we describe the isolation of an otic mesenchyme enhancer in the Brn4 gene. Subsequently, this enhancer was used to drive the expression of Cre recombinase in the otic mesenchyme of transgenic mice. When intercrossed with the ROSA reporter strain, R26R, ss-galactosidase expression is detected in several inner ear structures derived from otic mesenchyme, including the temporal bone, spiral ligament, spiral limbus, and mesenchyme underlying sensory epithelium of the utricle, saccule and semicircular canals. Thus, this Cre pedigree can induce conditional rearrangement of genes in the otic mesenchyme, and will serve as a powerful genetic tool to characterize the function of genes in the mesenchymal tissues of the inner ear."


In [7]:
import pandas as pd
import re

# Load the CSV file into a pandas DataFrame
file_path = '~/Documents/Badawcze/enhancerlit/enhancer_candidates.csv'
df = pd.read_csv(file_path)

# Function to create queries from the Literature column
def create_queries(row):
    gene_name = re.sub(r'[^a-zA-Z0-9]', '', row['Gene name'].lower())  # Remove non-alphanumeric characters
    queries = []
    if pd.notna(row['Literature']):
        for ref in row['Literature'].split(';'):
            ref = ref.strip()
            parts = ref.split()
            if len(parts) >= 3:
                # Remove 'et al.' and 'i wsp.' from the author's name
                if 'et' in parts and 'al.' in parts:
                    etal_index = parts.index('et')
                    first_author_surname = ' '.join(parts[:etal_index])
                elif 'i' in parts and 'wsp.' in parts:
                    iwsp_index = parts.index('i')
                    first_author_surname = ' '.join(parts[:iwsp_index])
                else:
                    first_author_surname = ' '.join(parts[:-1])  # Include all parts except the last one as the surname
                
                year_of_publication = parts[-1].strip(';,')
                query = f"{gene_name} {first_author_surname} AND {year_of_publication}[dp]"
                queries.append(query)
    return queries

# Apply the function to each row in the DataFrame
df['Queries'] = df.apply(create_queries, axis=1)

# Extract the list of lists of queries
list_of_queries = df['Queries'].tolist()

In [8]:
# Display the resulting list of lists of queries
list_of_queries

[['pou3f4 de Kok AND 1996[dp]',
  'pou3f4 Ahn AND 2009[dp]',
  'pou3f4 Naranjo AND 2010[dp]'],
 ['sost Balemans AND 2002[dp]', 'sost Loots AND 2005[dp]'],
 ['shox Sabherwal AND 2007[dp]',
  'shox Fukami AND 2005[dp]',
  'shox Benito-Sanz AND 2012[dp]'],
 ['nr0b1 Smyk AND 2007b[dp]'],
 ['sry McElreavy AND 1992[dp]',
  'sry Capel AND 1993[dp]',
  'sry McElreavy AND 1996[dp]',
  'sry Sharp AND 2005[dp]'],
 ['hbb Driscoll AND 1989[dp]'],
 ['hba1hba2 Viprakasit AND 2006[dp]'],
 ['pitx2 Trembath AND 2004[dp]'],
 ['foxc1 Davies AND 1999[dp]'],
 ['foxl2 Beysen AND 2005[dp]', "foxl2 D'haene AND 2009[dp]"],
 ['sox9 Benko AND 2009[dp]',
  'sox9 Fukami AND 2012[dp]',
  'sox9 Sanchez-Castro AND 2013[dp]'],
 ['sox9 Kim AND 2015[dp]'],
 ['pitx2 Volkmann AND 2011[dp]'],
 ['atoh7 Ghiasvand AND 2011[dp]'],
 ['dlx6dlx5 Brown AND 2010[dp]'],
 ['dlx6dlx5 Tayebi AND 2014[dp]'],
 ['pax6 Wawrocka AND 2012[dp]'],
 ['foxg1 Allou AND 2012[dp]'],
 ['foxf1 Szafranski AND 2013[dp]'],
 ['nr0b1 Skinningsrud AND 2009[

In [14]:
list_of_queries[0]

['pou3f4 de Kok AND 1996[dp]',
 'pou3f4 Ahn AND 2009[dp]',
 'pou3f4 Naranjo AND 2010[dp]']

In [9]:
research_id = 1

In [10]:
ids_for_row_2d = [fetch_articles_from_query(q)[research_id] for q in list_of_queries[0]]
# Flatten
ids_for_row = [query for sublist in ids_for_row_2d for query in sublist]

ids_for_row

['8872461', '19217071', '21209840', '20668882']

In [17]:
texts = [fetch_text_from_pubmed_id(id) for id in ids_for_row]

# Extracting half-manual

In [20]:
pubmed_id = '24212882'
text = fetch_text_from_pubmed_id(pubmed_id)
text

'\n\nMost patients with syndromic pancreatic agenesis have heterozygous dominant mutations in\n\nGATA6\n. Extra-pancreatic features in these individuals include cardiac malformations, biliary tract defects, gut and other endocrine abnormalities. Four families have been reported with syndromic pancreatic agenesis with severe neurological features and cerebellar agenesis caused by recessive coding mutations in\n1\n,\n2\nPTF1A\n. Most cases of isolated, non-syndromic pancreatic agenesis remain unexplained with the only cause described being recessive coding mutations in\n3\n-\n5\nPDX1\nthat have been reported in two families\n. We previously noted that individuals with unexplained pancreatic agenesis were often born to consanguineous parents and rarely had extra-pancreatic features\n6\n,\n7\n. This suggested an autosomal recessive defect underlying isolated pancreatic agenesis.\n1\nTo identify recessive mutations causing isolated pancreatic agenesis we used linkage and whole genome sequen

In [16]:
len(text)

24096

## Retreive metadata of searches

In [18]:
def generate_string_list(row):
    string_list = []
    for column in row.index[:-3]:  # Exclude the last two columns
        value = row[column]
        # Ensure that NaN values are handled appropriately
        value_str = str(value) if pd.notna(value) else "brak danych"
        string_list.append(f"{column}: {value_str}")
    return string_list

searches_metadata = df.apply(generate_string_list, axis=1).tolist()

In [19]:
searches_metadata[1]

['Disease name: Van Buchem disease',
 'Disease OMIM: 239100',
 'Gene name: SOST',
 'Gene OMIM: 605740',
 'Locus: 17q21.31',
 "Distance from regulated gene: 35 kpz 3'",
 'Literature: Balemans et al. 2002;\nLoots et al. 2005',
 'Type: deletion of cis element']

# Answering the question about ClinVar and coordinates
Having the full texts of the biomedical articles

In [None]:
pip install openai==0.28

In [17]:
import openai
from langchain.prompts import PromptTemplate

# Set your OpenAI API key
openai.api_key = 'sk-proj-K52pv0IIXeL6j8NFSjlST3BlbkFJP28Uknmmb22SHR9Wjef2'

In [18]:
openai.__version__

'0.28.0'

In [22]:
# Define the context strings
context_strings = [f"CONTEXT INFO: Publication {i}: {text}" for i, text in enumerate(texts)]
# context_strings.extend([f"CONTEXT INFO: {x}" for x in searches_metadata[research_id]])

In [22]:
def split_text_into_chunks(text, chunk_size=25000):
    # List to hold chunks
    chunks = []
    # Loop through the text, adding chunks to the list
    for i in range(0, len(text), chunk_size):
        chunks.append(text[i:i + chunk_size])
    return chunks

# Example usage
chunks = split_text_into_chunks(text)

# Print the first chunk to verify
print([len(c) for c in chunks])
#overlap

[24096]


In [40]:

# Define the prompt template with the context
prompt_template = PromptTemplate(
    input_variables=["question", "publication"],
    template="""
    Question:
    {question}
    
    Publication:
    {publication}
    """
)

# Create the final prompt using the template
def create_prompt(question, publication):
    return prompt_template.format(question=question, publication=publication)

# Define the question you want to ask
# question = "What are the related ClinVar variant ids in this context?"
question = "Please give me all fragments of text (of length circa 25 words) where there are genomic region coordinates in the format like here or similar: chr10:23508365, chrY:∼124349-409949; chrY:∼134349–439949"

def get_coordinates_for_text(publication):
    final_prompt = create_prompt(question, publication)
    response = openai.ChatCompletion.create(
        # model="gpt-3.5-turbo",
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": final_prompt}
        ]
    )
    return response.choices[0].message["content"]

def get_answers(text):
    answers = []
    for chunk in split_text_into_chunks(text):
        coordinates = get_coordinates_for_text(chunk)
        print("NEXT ANSWER")
        print(coordinates)
        answers.append(coordinates)
    return answers

In [41]:
def test_example(pmid, label):
    text = fetch_text_from_pubmed_id(pmid)
    answers = get_answers(text)
    y = ' '.join(answers)
    return label in y

In [30]:
positive_examples = [
    ('24212882', 'chr10:23502416-23510031'),
    ('24212882', 'chr10:23508437'),
    ('24212882', 'chr10:23508363'),
    ('24212882', 'chr10:23508305'),
    ('24212882', 'chr10:23508365'),
    ('24212882', 'chr10:23508446'),
    ('22071895', 'chrY:∼124349-409949'),
    ('22071895', 'chrY:∼134349–439949'),
    ('19234473', 'chr17:66,187,898'),
    ('19234473', 'chr17:66,400,448'),
    ('22543974', 'chr12:114,701,207–114,704,691')
]

In [38]:
positive_examples2 = [
    ('19234473', 'chr17:66,187,898'),
    ('19234473', 'chr17:66,400,448'),
    ('22543974', 'chr12:114,701,207–114,704,691')
]

In [42]:
rate = [test_example(pmid, label) for pmid, label in positive_examples]
existing = [label in fetch_text_from_pubmed_id(pmid) for pmid, label in positive_examples]


NEXT ANSWER
Here are the text fragments containing genomic region coordinates from the provided publication:

1. "This variant, chr10:23508437A>G, was located ~25kb downstream of PTF1A, in the region previously identified by homozygosity mapping."

2. "Three of the remaining probands had different base substitution mutations: a homozygous chr10:23508363A>G mutation, a homozygous chr10:23508305A>G mutation and compound heterozygous chr10:23508365A>G/chr10:23508446A>C mutations."

3. "In the tenth family a 7.6kb deletion was identified by long range PCR, and sequence analysis showed that the deleted region (chr10:23502416-23510031) included the entire putative enhancer." 

4. "The genomic region chr10:23501386-23512912 was amplified in patients 7-4 and 7-8 by long-range PCR using the SequalPrep Long PCR kit."

These fragments discuss specific genomic coordinates related to mutations and deletions relevant to the study of pancreatic agenesis.
NEXT ANSWER
Here are the fragments of text fro

In [51]:
tgambin_text = fetch_text_from_pubmed_id('26195989')

In [52]:
tgambin_text

'\n\n\n\nBackground\n\nExome and genome sequencing is becoming an integral part of health care. Their role as molecular diagnostic tools in obstetrics [\n\n1\n] and pediatrics [\n2\n] is firmly established, as is their potential in hereditary cancer [\n3\n] and somatic testing [\n4\n]. Less well touted, but likely of broader application, is the use of sequencing in carrier testing for recessive disorders, as a subclinical marker of potential disease susceptibility or undiagnosed disease, and the development of genetic risk scores [\n5\n] to identify high risk individuals for a number of common chronic diseases. Like any test or procedure, DNA sequencing is able to detect findings for conditions other than the primary reason for which the original test was performed. These findings can be broadly divided into two groups. First, so-called secondary findings (SFs) [\n6\n,\n7\n], i.e., variants in genes not directly related to the primary clinical diagnosis but actively screened due to the

In [33]:
existing = [label in fetch_text_from_pubmed_id(pmid) for pmid, label in positive_examples]

In [53]:
tgambin_answers = get_answers(tgambin_text)
tgambin_answers

NEXT ANSWER
Here are the fragments from the provided publication that include genomic region coordinates in the specified format:

1. "We excluded nonsense variants that are located in the last exon or in the last 50 bp of the penultimate exon, which are likely to escape nonsense-mediated decay (NMD) and thus they may be less damaging."

2. "The data presented here enable assessment of the impact of a comprehensive carrier testing program for established recessive disorders, keeping in mind the ever-changing nature of the reference databases, such as ClinVar and dbNSFP."

(Note that no specific genomic coordinates resembling "chrX:XXXXX" format were mentioned in the text portions, which might indicate that either the genomic coordinates were not present, or they have been abstracted in multiple-step procedures or references that were truncated.)
NEXT ANSWER
From the provided publication text, there are no explicit genomic region coordinates formatted as specified (e.g., chr10:23508365,

['Here are the fragments from the provided publication that include genomic region coordinates in the specified format:\n\n1. "We excluded nonsense variants that are located in the last exon or in the last 50 bp of the penultimate exon, which are likely to escape nonsense-mediated decay (NMD) and thus they may be less damaging."\n\n2. "The data presented here enable assessment of the impact of a comprehensive carrier testing program for established recessive disorders, keeping in mind the ever-changing nature of the reference databases, such as ClinVar and dbNSFP."\n\n(Note that no specific genomic coordinates resembling "chrX:XXXXX" format were mentioned in the text portions, which might indicate that either the genomic coordinates were not present, or they have been abstracted in multiple-step procedures or references that were truncated.)',
 'From the provided publication text, there are no explicit genomic region coordinates formatted as specified (e.g., chr10:23508365, chrY:∼12434

In [54]:
'Fraction of individuals with nonsynonymous variants' in tgambin_text

True

In [58]:
import re

def get_word_count(text):
    return len(re.findall(r'\b\w+\b', text))

def extract_with_context(text, fragment, context_words=500):
    # Split the text into words
    words = re.findall(r'\b\w+\b', text)
    
    # Find the start and end positions of the fragment within the text
    fragment_start = text.find(fragment)
    if fragment_start == -1:
        return "Fragment not found in text."
    
    # Calculate the word index of the fragment start
    word_index_start = len(re.findall(r'\b\w+\b', text[:fragment_start]))
    
    # Calculate start and end index for context
    context_start_index = max(0, word_index_start - context_words)
    context_end_index = word_index_start + get_word_count(fragment) + context_words
    
    # Extract the context words
    context = ' '.join(words[context_start_index:context_end_index])
    
    return context

# Example usage
text = tgambin_text
fragment = "Fraction of individuals with nonsynonymous variants"
context = extract_with_context(text, fragment)

print(context)


only one of these databases 15 by ClinVar and 378 by HGMD Seventy five percent of the nonsense variants 2059 out of 2737 were not found in the 1000 Genomes and ESP databases Out of those 1667 from 2705 occurrences were located outside presumed NMD escaping regions These novel nonsense variants were identified in 22 of all individuals 2380 out of 11 068 Percentage of couples at risk of having affected offspring To estimate the percentage of couples in the general population in which both partners have a reported pathogenic variant in the same autosomal recessive disease gene we performed the following resampling experiment using data from the ARIC study From this random sample of individuals which is likely representative of EA and AA couples planning to have children we randomly sampled two exomes one from a male and one from a female to evaluate if they share at least one autosomal recessive disease gene with a reported pathogenic ClinVar variant After 1 000 000 iterations we observed

In [43]:
print(rate)
print(existing)

[True, True, True, True, True, True, False]
[True, True, True, True, True, True, False]


## Version of code with conversation history

In [24]:
import openai

# Define the prompt template with placeholders for context and question
prompt_template = """
Context:
{context}

Question:
{question}
"""

# Function to create prompt using the template
def create_prompt(context, question):
    return prompt_template.format(context=context, question=question)

# Split the context strings into smaller chunks
def split_context(context_strings, max_length=1000):
    chunks = []
    current_chunk = []
    current_length = 0

    for context in context_strings:
        context_length = len(context.split())
        if current_length + context_length <= max_length:
            current_chunk.append(context)
            current_length += context_length
        else:
            chunks.append("\n".join(current_chunk))
            current_chunk = [context]
            current_length = context_length

    if current_chunk:
        chunks.append("\n".join(current_chunk))

    return chunks

# Define the question you want to ask
question = "Search for all the occurrences of the genome coordinates in these publications. Example coordinates separated by semicolon: chrX:48800838-48804946, chrY:∼124349-409949; chrY:∼134349–439949, Chr17:69,153,000–69,189,000. Then return these coordinates with a text window of 15 words."

# Combine and split the context strings into manageable chunks
context_chunks = split_context(context_strings)

# Initialize conversation history with the system message
conversation_history = [
    {"role": "system", "content": "You are a helpful assistant."}
]

# Iterate over the chunks and call the OpenAI API for each chunk
for chunk in context_chunks:
    final_prompt = create_prompt(chunk, question)
    conversation_history.append({"role": "user", "content": final_prompt})

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=conversation_history
    )

    # Extract the response and append to conversation history
    assistant_response = response.choices[0].message["content"]
    conversation_history.append({"role": "assistant", "content": assistant_response})

    # Print the response for each chunk
    print(assistant_response)


To search for the genome coordinates in the provided publications, I will need the text of the publications. Please provide the text so that I can help you extract the coordinates with a text window of 15 words.


InvalidRequestError: This model's maximum context length is 16385 tokens. However, your messages resulted in 27414 tokens. Please reduce the length of the messages.

### Testing BiomedBERT (formerly: PubMedBERT)
Demands fine-tuning on a downstream task

In [None]:
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pre-trained GENE-BERT model and tokenizer from Hugging Face
model_name = "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext"  # Replace with the actual model name if available on Hugging Face
model = AutoModelForTokenClassification.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to perform inference
def predict(text):
    inputs = tokenizer(text, return_tensors="pt").to(device)
    print(inputs)
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Assuming the model is for token classification, extract predictions
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1).cpu().numpy()
    
    # Convert predictions to human-readable labels (assuming a label map is available)
    label_map = {i: label for i, label in enumerate(tokenizer.get_vocab().keys())}
    predicted_labels = [label_map[prediction] for prediction in predictions[0]]
    
    return predicted_labels

# Example usage
text = "Example genomic text for testing GENE-BERT."
predictions = predict(text)
print("Predictions:", predictions)


In [20]:
text = ""
predictions = predict(text)
print("Predictions:", predictions)


{'input_ids': tensor([[   2,    4, 1977,   43, 2798, 9241, 2359,   18,    3]],
       device='cuda:0'), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}
Predictions: ['lesion', 'lesion', 'lesion', 'lesion', 'lesion', 'lesion', 'lesion', 'linkage', 'linkage']
