In [1]:
from pathlib import Path
from typing import List, Optional

import pandas as pd

In [2]:
# Load the two curated variant tables (Table 2: variants causing LOE,
# Table 3: variants causing mLOE) from the shared input directory.
INPUT_DIR = '../data/input'
df_loe = pd.read_csv(f'{INPUT_DIR}/Enhancer candidates - Table-2-Variants causing LOE.csv')
df_mloe = pd.read_csv(f'{INPUT_DIR}/Enhancer candidates - Table-3-Variants causing mLOE.csv')

In [3]:
def preprocess_pmid_cell(input_str: str) -> Optional[List[str]]:
    """Parse a 'PMID(s)' table cell into a list of PMID strings.

    A valid cell starts with a digit and holds one or more PMIDs separated by
    ';', optionally followed by a parenthesised note, for example
    '123; 456 (Smith et al. 2020)'. Everything from the first '(' onwards is
    discarded before splitting.

    Args:
        input_str: Raw cell value. Non-string values (such as None or the
            float NaN that pandas uses for empty cells) are tolerated and
            treated as invalid instead of raising.

    Returns:
        The PMIDs with surrounding whitespace removed and empty fragments
        dropped, or None when the cell is missing, blank, or does not start
        with a digit (e.g. a DOI URL).
    """
    # Empty CSV cells arrive as float NaN, which has no .strip().
    if not isinstance(input_str, str):
        return None
    cell = input_str.strip()
    # Guard the [0] lookup: a blank cell would otherwise raise IndexError.
    if not cell or not cell[0].isdigit():
        return None
    pmid_part = cell.split('(')[0]
    # Skip empty fragments produced by a trailing or doubled ';'.
    return [pmid.strip() for pmid in pmid_part.split(';') if pmid.strip()]

In [4]:
# Build one search entry per df_loe row: the variant's HGVS identifier, the
# PMIDs parsed from its 'PMID(s)' cell, and the gene/disease query context.
# Rows whose PMID cell is rejected by preprocess_pmid_cell are reported and
# left out of the list.
coordinates_search_list = []
for row_index, record in df_loe.iterrows():
    query_context = {
        'gene': record['Gene'],
        'disease': record['Disease'],
        # Currently disabled query fields:
        # 'disease severity': record['Disease severity'],
        # 'Regulatory element(s) impacted': record['Regulatory element(s) impacted'],
        # 'Distance to promoter': record['Distance to promoter'],
        # 'Pathogenicity': record['ClinVar classification'],
    }
    variant_id = record['Variant ID']
    raw_pmid_cell = record['PMID(s)']
    # `pmids` is None when the cell could not be parsed.
    pmids: Optional[List[str]] = preprocess_pmid_cell(raw_pmid_cell)
    if pmids is not None:
        coordinates_search_list.append(
            dict(hgvs_coordinate=variant_id, pmids=pmids, user_query_dict=query_context)
        )
    else:
        print(f'PMID(s) cell is not in the correct format: {raw_pmid_cell}')

PMID(s) cell is not in the correct format: https://doi.org/10.1101/2022.09.09.22279746 (Galey et al. 2022)


In [5]:
# Inspect the loop variable left over from the list-building cell: `pmids`
# holds whatever was assigned on the final loop iteration (None if that row's
# PMID cell was rejected).
pmids

['21931115']

In [7]:
# Spot-check the first search entry built from df_loe.
coordinates_search_list[0]

{'hgvs_coordinate': 'NM_000037.4(ANK1):c.-73_-72del',
 'pmids': ['16037067'],
 'user_query_dict': {'gene': 'ANKI', 'disease': 'Hereditary spherocytosis'}}

In [8]:
# Install a SQLite-backed LangChain LLM cache stored at ../cache/langchain.db.
# This is a process-wide setting, so it must run before any LLM call below.
# NOTE(review): presumably the ../cache directory has to exist already — confirm.
from langchain_community.cache import SQLiteCache
from langchain.globals import set_llm_cache
set_llm_cache(SQLiteCache(database_path="../cache/langchain.db"))

In [20]:
# Reload src.BenchmarkTestService so edits to the module on disk are picked up
# without restarting the kernel. The class is imported AFTER the reload so the
# name is bound to the freshly reloaded definition; keep this statement order.
import importlib
import src.BenchmarkTestService  # Initial import
importlib.reload(src.BenchmarkTestService)  # Reload the module
from src.BenchmarkTestService import BenchmarkTestService
# NOTE(review): 16385 is an unexplained magic number — presumably a
# token/context limit; confirm against BenchmarkTestService's constructor.
test_service = BenchmarkTestService('gpt', 'gpt-4o-mini', 16385)
# Only the first search entry is passed in (slice [:1]).
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so
# `results` may not have been assigned by it.
results = test_service.perform_benchmark_inference(coordinates_search_list[:1])
results

Regex match score: [False, False, False, False, False, False, False, False, False]
Regex match score: [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False]
Regex match score: [False, False, False]
Regex match score: [True, False]
Regex match score: [False]
Regex match score: [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False,

KeyboardInterrupt: 

In [None]:
# Save the first inference result as CSV on the current user's Desktop.
# The location is derived from Path.home() rather than a hard-coded
# /home/<user> prefix, so the cell also runs under other accounts/machines.
pd.DataFrame(results[0]).to_csv(Path.home() / 'Desktop' / 'loe_test_results1.csv', index=False)

In [10]:
# Reload src.BenchmarkTestService so edits to the module on disk are picked up
# without restarting the kernel. The class is imported AFTER the reload so the
# name is bound to the freshly reloaded definition; keep this statement order.
import importlib
import src.BenchmarkTestService  # Initial import
importlib.reload(src.BenchmarkTestService)  # Reload the module
from src.BenchmarkTestService import BenchmarkTestService
# NOTE(review): 16385 is an unexplained magic number — presumably a
# token/context limit; confirm against BenchmarkTestService's constructor.
test_service = BenchmarkTestService('gpt', 'gpt-4o-mini', 16385)
# One call per search entry, results collected in input order.
# NOTE(review): this rebinds `results`, replacing whatever an earlier cell stored
# under the same name.
results = [test_service.perform_simple_benchmark_test(test) for test in coordinates_search_list]
results

Regex match score: [False, False, False, False, False, False, False, False, False]
Regex match score: [False]
Regex match score: [False]
Regex match score: [False, False, False, False, False, False, False, False, False]
Regex match score: [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False]
Regex match score: [False, False, False]
Regex match score: [True, False]
Regex match score: [False]
Regex match score: [False, False, False, False, False, False, False, False, False, False, False, Fa

[(True, False),
 (False, None),
 (False, None),
 (False, None),
 (False, None),
 (False, None),
 (False, None),
 (False, None),
 (False, None),
 (False, None),
 (False, None),
 (False, None),
 (False, None),
 (False, None),
 (False, None),
 (False, None),
 (False, None),
 (False, None),
 (False, None),
 (True, True),
 (True, True),
 (True, True),
 (True, True),
 (True, True),
 (False, None),
 (True, True),
 (True, True),
 (True, True),
 (False, None),
 (False, None),
 (False, None),
 (True, True),
 (True, True),
 (True, True),
 (True, True),
 (True, True),
 (False, None),
 (True, True),
 (True, False),
 (False, None),
 (False, None),
 (False, None),
 (False, None),
 (False, None),
 (True, True),
 (True, True),
 (True, True),
 (False, None),
 (False, None),
 (False, None),
 (False, None),
 (False, None),
 (False, None),
 (False, None),
 (False, None),
 (False, None),
 (False, None),
 (False, None),
 (True, False),
 (False, None),
 (False, None),
 (False, None),
 (False, None),
 (False, 

In [12]:
# Save the per-entry benchmark results as CSV on the current user's Desktop.
# The location is derived from Path.home() rather than a hard-coded
# /home/<user> prefix, so the cell also runs under other accounts/machines.
pd.DataFrame(results).to_csv(Path.home() / 'Desktop' / 'specific_coordinate_extraction.csv', index=False)

In [17]:
# Reload src.BenchmarkTestService so edits to the module on disk are picked up
# without restarting the kernel. The class is imported AFTER the reload so the
# name is bound to the freshly reloaded definition; keep this statement order.
import importlib
import src.BenchmarkTestService  # Initial import
importlib.reload(src.BenchmarkTestService)  # Reload the module
from src.BenchmarkTestService import BenchmarkTestService
# Flatten the per-entry PMID lists into one list (duplicates are kept).
# NOTE(review): this rebinds `pmids`, which the list-building cell used for a
# single row's PMIDs.
pmids = []
for row in coordinates_search_list:
    pmids.extend(row['pmids'])
# NOTE(review): 16385 is an unexplained magic number — presumably a
# token/context limit; confirm against BenchmarkTestService's constructor.
test_service = BenchmarkTestService('together', 'meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo', 16385)
manual_coordinates = test_service.manual_test_coordinate_search(pmids)
manual_coordinates

Regex match score: [False, False, True, False, False, False, False, False, True, False, False, False, False, True, False, False, False, False, True, False, False, True, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, True, False, True, True, True, True, True, True, False, True, False]
Regex match score: [False, True, False, False, True, False, False, False, False, True, False, False, False, True, False, False, False, False, False, False, False, False, True, False, True, False, False]
Regex match score: [False, True, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False,

['```python',
 'import re',
 '',
 'def extract_coordinates(text):',
 '    # Define the patterns for HGVS, BED and GFF/GTF',
 '    hgvs_pattern = r"^(g\\.|c\\.|m\\.|r\\.|p\\.)?(\\d+|[+-]\\d+|[A-Z]+\\d+)?(?:_(\\d+|[+-]\\d+|[A-Z]+\\d+))?(delins[ATCG]+|del|ins[ATCG]*|dup|inv|fs|[A-Z][a-z]{2}|\\*)?(>?[A-Z][a-z]{2}|>?[ATCG]|\\*|\\([A-Z]{3}\\d{1,3}[A-Z]{3}\\))?(?:\\*?\\d+)?(?:[+*/-]\\d+)?"',
 '    bed_pattern = r"^(chr([1-9]|[1-2][0-9]|X|Y|M|Un(_[a-zA-Z0-9]+)?)|[a-zA-Z]+)\\t([0-9]+)\\t([0-9]+)\\t([^\\t]*)?\\t([0-9]+(\\.[0-9]+)?)?\\t([+-]?)\\t([0-9]+)\\t([0-9]+)\\t(0|[0-9]{1,3},[0-9]{1,3},[0-9]{1,3})\\t([0-9]+)\\t(([0-9]+,)+)\\t(([0-9]+,)+)$"',
 '    gff_pattern = r"^([\\w.\\-]+)\\t([\\w.\\-]*)\\t([\\w.\\-]+)\\t(\\d+)\\t(\\d+)\\t([\\d.]+|\\.)\\t([+-])\\t([012]|\\.)\\t(.+)$"',
 '',
 '    # Use regular expression to find all matches in the text',
 '    hgvs_matches = re.findall(hgvs_pattern, text)',
 '    bed_matches = re.findall(bed_pattern, text)',
 '    gff_matches = re.findall(gff_pattern, t

In [18]:
# Save the manual coordinate-search output as CSV on the current user's Desktop.
# The location is derived from Path.home() rather than a hard-coded
# /home/<user> prefix, so the cell also runs under other accounts/machines.
pd.DataFrame(manual_coordinates).to_csv(Path.home() / 'Desktop' / 'manual_coordinates-llama-3.1.csv', index=False)

In [19]:
# Display the manual coordinate-search output as a DataFrame.
# NOTE(review): the frame is large in the recorded run (tens of thousands of
# rows); consider .head() if only a preview is needed.
pd.DataFrame(manual_coordinates)

Unnamed: 0,0
0,```python
1,import re
2,
3,def extract_coordinates(text):
4,"# Define the patterns for HGVS, BED and GF..."
...,...
61051,""""""""
61052,
61053,# Print result
61054,print(get_coordinates(text))
