In [1]:
genes = [
    "BRCA1",
    "BRCA2",
    "EGFR",
    "KRAS",
    "BRAF",
    "TP53",
    "CFTR",
    "APC",
    "FOXF1",
    "TBX4",
]

In [2]:
import requests

urls = [f"https://www.ncbi.nlm.nih.gov/research/litvar2-api/variant/search/gene/{gene}" for gene in genes]
responses = [requests.get(url) for url in urls]



In [3]:
import ast

responses_dicts = []
for response, gene in zip(responses, genes):
    # Convert the response text into a list of dictionaries
    response_list = response.text.strip().split('\n')
    response_dicts = [ast.literal_eval(item) for item in response_list]
    response_dicts = [{**item, "gene": gene} for item in response_dicts]
    responses_dicts.extend(response_dicts)


In [4]:
import pandas as pd

# Create a pandas DataFrame from the list of dictionaries
df = pd.DataFrame(responses_dicts)

# Display the DataFrame
df


Unnamed: 0,_id,pmids_count,rsid,gene,clingen_id
0,litvar@rs996042036##,6,rs996042036,BRCA1,
1,litvar@rs993065651##,1,rs993065651,BRCA1,
2,litvar@rs9897397##,1,rs9897397,BRCA1,
3,litvar@rs9895855##,1,rs9895855,BRCA1,
4,litvar@rs979531844##,1,rs979531844,BRCA1,
...,...,...,...,...,...
65632,litvar@#9496#c.1112dupC,2,,TBX4,
65633,litvar@#9496#c.1112dup,1,,TBX4,
65634,litvar@#9496#c.1112del,1,,TBX4,
65635,litvar@#9496#c.1106delC,1,,TBX4,


In [5]:
# Sort the DataFrame by 'pmids_count' in descending order
df_sorted = df.sort_values(by='pmids_count', ascending=False)

# Filter the DataFrame to include only rows where 'rsid' is not empty
df_filtered = df_sorted[df_sorted['rsid'].notna()]

# Display the filtered DataFrame
df_filtered


Unnamed: 0,_id,pmids_count,rsid,gene,clingen_id
39601,litvar@rs113488022##,27836,rs113488022,BRAF,
39922,litvar@CA123643#rs113488022##,27056,rs113488022,BRAF,CA123643
36192,litvar@rs121913529##,13163,rs121913529,KRAS,
30693,litvar@rs121434569##,10781,rs121434569,EGFR,
30974,litvar@CA90928#rs121434569##,10538,rs121434569,EGFR,CA90928
...,...,...,...,...,...
6047,litvar@CA59185#rs753081589##,1,rs753081589,BRCA1,CA59185
6046,litvar@CA59212#rs757215735##,1,rs757215735,BRCA1,CA59212
6051,litvar@CA59047#rs775339017##,1,rs775339017,BRCA1,CA59047
6050,litvar@CA59110#rs753210219##,1,rs753210219,BRCA1,CA59110


In [6]:
# Filter the DataFrame to include only rows where 'clingen_id' is not empty
df_clingen_filtered = df_filtered[df_filtered['clingen_id'].notna()]

# Display the filtered DataFrame
df_clingen_filtered


Unnamed: 0,_id,pmids_count,rsid,gene,clingen_id
39922,litvar@CA123643#rs113488022##,27056,rs113488022,BRAF,CA123643
30974,litvar@CA90928#rs121434569##,10538,rs121434569,EGFR,CA90928
31534,litvar@CA126713#rs121434568##,10084,rs121434568,EGFR,CA126713
36402,litvar@CA122538#rs121913529##,9217,rs121913529,KRAS,CA122538
36401,litvar@CA122540#rs121913529##,6264,rs121913529,KRAS,CA122540
...,...,...,...,...,...
6047,litvar@CA59185#rs753081589##,1,rs753081589,BRCA1,CA59185
6046,litvar@CA59212#rs757215735##,1,rs757215735,BRCA1,CA59212
6051,litvar@CA59047#rs775339017##,1,rs775339017,BRCA1,CA59047
6050,litvar@CA59110#rs753210219##,1,rs753210219,BRCA1,CA59110


In [7]:
# Filter the DataFrame to include only rows where 'pmids_count' is at least 10
df_pmids_filtered = df_clingen_filtered[df_clingen_filtered['pmids_count'] >= 10]

# Display the filtered DataFrame
df_pmids_filtered


Unnamed: 0,_id,pmids_count,rsid,gene,clingen_id
39922,litvar@CA123643#rs113488022##,27056,rs113488022,BRAF,CA123643
30974,litvar@CA90928#rs121434569##,10538,rs121434569,EGFR,CA90928
31534,litvar@CA126713#rs121434568##,10084,rs121434568,EGFR,CA126713
36402,litvar@CA122538#rs121913529##,9217,rs121913529,KRAS,CA122538
36401,litvar@CA122540#rs121913529##,6264,rs121913529,KRAS,CA122540
...,...,...,...,...,...
39778,litvar@CA281976#rs397507476##,10,rs397507476,BRAF,CA281976
62477,litvar@CA10578293#rs876658325##,10,rs876658325,APC,CA10578293
39858,litvar@CA168047327#rs868021367##,10,rs868021367,BRAF,CA168047327
39867,litvar@CA16602735#rs121913226##,10,rs121913226,BRAF,CA16602735


In [8]:
ids = df_pmids_filtered['_id'].dropna()
urls = [f'https://www.ncbi.nlm.nih.gov/research/litvar2-api/variant/get/{id}/publications' for id in ids]
urls = [url.replace('#', '%23') for url in urls]


In [9]:
# test url
url = urls[0]
url

'https://www.ncbi.nlm.nih.gov/research/litvar2-api/variant/get/litvar@CA123643%23rs113488022%23%23/publications'

In [10]:
import requests
import time

# Tworzenie słownika do przechowywania pmids i pmcids dla każdego url
results = {}
for url, rsid in zip(urls, ids):
    time.sleep(0.5)
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        pmids = data.get('pmids', [])
        pmcids = data.get('pmcids', [])
        results[rsid] = {'pmids': pmids, 'pmcids': pmcids}
    else:
        results[rsid] = {'pmids': [], 'pmcids': []}



KeyboardInterrupt: 

In [47]:
import json

# Zapisz results do pliku JSON
with open('litvar_pmids_pmcids.json', 'w') as json_file:
    json.dump(results, json_file, indent=4)


In [11]:
import json

# Wczytaj results z pliku JSON
with open('litvar_pmids_pmcids.json', 'r') as json_file:
    results = json.load(json_file)


In [12]:
details = {}
for id in ids:
    time.sleep(0.5)
    id_modified = (id.split('@')[0] + '@' + id.split('#', 1)[1]).replace('#', '%23')
    url = f'https://www.ncbi.nlm.nih.gov/research/litvar2-api/variant/get/{id_modified}'
    response = requests.get(url)
    if response.status_code == 200:
        details[id] = response.json()
    else:
        details[id] = {}


In [72]:
# Zapisz details do pliku JSON
with open('litvar_details.json', 'w') as json_file:
    json.dump(details, json_file, indent=4)

In [17]:
# pobierz z pliku litvar_details.json do zmiennej details
with open('litvar_details.json', 'r') as json_file:
    details = json.load(json_file)


In [19]:
import sys
import os

# Uzyskaj bieżący katalog roboczy
current_dir = os.getcwd()  # Uzyskaj bieżący katalog roboczy
src_dir = os.path.abspath(os.path.join(current_dir, '..'))  # Ścieżka do katalogu src

# Dodaj katalog src do sys.path
sys.path.append(src_dir)


In [74]:
len(results)

1132

In [30]:
import importlib
import src.flow.PubmedEndpoint
importlib.reload(src.flow.PubmedEndpoint)

from src.flow.PubmedEndpoint import PubmedEndpoint

snippets = []
for id, values in results.items():
    pmids = values['pmids']
    pmcids = values['pmcids']
    rsid = df_pmids_filtered.loc[df_pmids_filtered['_id'] == id, 'rsid'].values[0]
    pmid_texts = [PubmedEndpoint.fetch_full_text_from_pubmed_id(pmid) for pmid in pmids[:20]]
    pmcid_texts = [PubmedEndpoint.fetch_full_text(pmcid) for pmcid in pmcids[:20]]
    texts = pmid_texts + pmcid_texts
    details_for_id = details[id]
    variant_name = details_for_id['name'].lstrip('p.')
    for text, identifier in zip(texts, pmids + pmcids):
        preprocessed_text = PubmedEndpoint.preprocess_full_text_to_plain_text(text or '')
        snippets.append({
            'rsid': rsid,
            'variant_name': variant_name,
            'identifier': identifier,
            'found': variant_name in preprocessed_text,
            'text': preprocessed_text
        })
        if variant_name in preprocessed_text:
            print(f"Znaleziono variant_name {variant_name} w tekście publikacji.")
        else:
            print(f"Nie znaleziono variant_name {variant_name} w tekście publikacji.")


Nie znaleziono variant_name V600E w tekście publikacji.
Znaleziono variant_name V600E w tekście publikacji.
Znaleziono variant_name V600E w tekście publikacji.
Znaleziono variant_name V600E w tekście publikacji.
Znaleziono variant_name V600E w tekście publikacji.
Znaleziono variant_name V600E w tekście publikacji.
Znaleziono variant_name V600E w tekście publikacji.
Znaleziono variant_name V600E w tekście publikacji.
Znaleziono variant_name V600E w tekście publikacji.
Znaleziono variant_name V600E w tekście publikacji.
Nie znaleziono variant_name V600E w tekście publikacji.
Nie znaleziono variant_name V600E w tekście publikacji.
Znaleziono variant_name V600E w tekście publikacji.
Nie znaleziono variant_name V600E w tekście publikacji.
Znaleziono variant_name V600E w tekście publikacji.
Nie znaleziono variant_name V600E w tekście publikacji.
Znaleziono variant_name V600E w tekście publikacji.
Znaleziono variant_name V600E w tekście publikacji.
Znaleziono variant_name V600E w tekście publ

HTTPError: 400 Client Error: Bad Request for url: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=2634725&retmode=xml

In [31]:
# Zapisz details do pliku JSON
with open('litvar_snippets.json', 'w') as json_file:
    json.dump(snippets, json_file, indent=4)

In [4]:
import json

# Pobierz z pliku litvar_snippets.json do zmiennej snippets
with open('litvar_snippets.json', 'r') as json_file:
    snippets = json.load(json_file)

# Dla każdego snippetu: a) sprawdź czy variant_name z uciętym przedrostkiem p. znajduje się w tekście text b) zapisz wartość w zmiennej found
for snippet in snippets:
    variant_name = snippet['variant_name'].lstrip('p.')
    snippet['variant_name'] = variant_name
    snippet['found'] = variant_name in snippet['text']


In [32]:
# Policz ile jest 'found' a ile nie jest 'found'
liczba_found = sum(1 for snippet in snippets if snippet['found'])
liczba_not_found = len(snippets) - liczba_found

print(f"Liczba snippetów z 'found': {liczba_found}")
print(f"Liczba snippetów bez 'found': {liczba_not_found}")


Liczba snippetów z 'found': 902
Liczba snippetów bez 'found': 458


In [33]:
import pandas as pd

# Filtruj snippety, aby pozostały tylko te, które są 'found'
found_snippets = [snippet for snippet in snippets if snippet['found']]

# Sprawdź ile ich jest
liczba_found_snippets = len(found_snippets)
print(f"Liczba znalezionych snippetów: {liczba_found_snippets}")

# Wypisz w pd.DataFrame
df_found_snippets = pd.DataFrame(found_snippets)
print(df_found_snippets)


Liczba znalezionych snippetów: 902
            rsid variant_name  identifier  found  \
0    rs113488022        V600E    34865163   True   
1    rs113488022        V600E    33161228   True   
2    rs113488022        V600E    34209805   True   
3    rs113488022        V600E    27656207   True   
4    rs113488022        V600E    26214416   True   
..           ...          ...         ...    ...   
897   rs80034486       N1303K    32934006   True   
898   rs80034486       N1303K    35893365   True   
899   rs80034486       N1303K    39624835   True   
900   rs80034486       N1303K     9695373   True   
901   rs80034486       N1303K    38697102   True   

                                                  text  
0    \n\n\n\n\n\nKey Points\n\n\n\n\n\nIUE of\n\nBR...  
1    \n\n\n\n\n\n\n\nResearch in context\n\n\n\nEvi...  
2    \n\n\n\n1. Introduction\n\nIn humans, thyroid ...  
3    \n\n\n\n1. Introduction\n\nPapillary thyroid c...  
4    \n\n\n\nIntroduction\n\nOver the last decades,... 

In [35]:
# Przygotuj dane treningowe w formacie JSONL
training_data = []

for snippet in snippets:
    if not snippet['text']:  # Pomijamy puste teksty
        continue
        
    training_example = {
        "messages": [
            {
                "role": "system",
                "content": "You are an expert in the field of genetics. Your task is to find all genomic variants mentioned in the text. Return them in a list of strings, separated by newlines, without any additional text."
            },
            {
                "role": "user",
                "content": "Find and list all genomic variants mentioned in the following text:\n\n" + snippet['text']
            }
        ],
        "metadata": {
            "variant_name": snippet['variant_name'],
            "rsid": snippet['rsid'],
            "identifier": snippet['identifier'],
            "found": snippet['found']
        }
    }
    training_data.append(training_example)

# Zapisz do pliku JSONL
with open('training_data.jsonl', 'w', encoding='utf-8') as f:
    for example in training_data:
        f.write(json.dumps(example, ensure_ascii=False) + '\n')

# Wyświetl statystyki
print(f"Liczba przykładów treningowych: {len(training_data)}")

Liczba przykładów treningowych: 1163


In [36]:
def reward_function(generated_text, expected_variant, found):
    """
    The function evaluates the model's response.
    
    Parameters:
        generated_text (str): The response generated by the model, containing a list of variants separated by new lines.
        expected_variant (str): The required variant expected in the response.
        found (bool): Whether the expected_variant should be found in the list.
    
    Returns:
        float: Positive reward (e.g., 1.0) if the expected_variant should be and is in the list or should not be and is not in the list, otherwise a negative value (e.g., -1.0).
    """
    # Rozdzielamy odpowiedź na pojedyncze linie
    variants = [line.strip() for line in generated_text.splitlines() if line.strip()]
    
    if (found and expected_variant in variants) or (not found and expected_variant not in variants):
        return 1.0  # Reward
    else:
        return -1.0  # Penalty


In [None]:
snippets

In [None]:
from transformers import AutoTokenizer, BertForTokenClassification
import torch
import json

def load_snippets(file_path='litvar_snippets.json'):
    with open(file_path, 'r') as f:
        return json.load(f)

def predict_variants(text, model, tokenizer):
    # Używamy "Sequence Variant" jako klasy, ponieważ jest to jedna z natywnych klas modelu z BIORED
    encodings = tokenizer("Sequence Variant", text, 
                         is_split_into_words=False,
                         padding=True, 
                         truncation=True,
                         max_length=512,
                         return_tensors='pt')
    
    with torch.no_grad():
        outputs = model(**encodings)
        predictions = outputs.logits.argmax(-1)[0]  # Bierzemy pierwszą (i jedyną) sekwencję
    
    # Konwertujemy tokeny na tekst tylko dla tych z predykcją 1
    tokens = tokenizer.convert_ids_to_tokens(encodings['input_ids'][0])
    variant_tokens = []
    current_variant = []
    
    for token, pred in zip(tokens, predictions):
        if pred == 1:
            if token.startswith('##'):
                if current_variant:
                    current_variant.append(token[2:])
            else:
                if current_variant:
                    variant_tokens.append(''.join(current_variant))
                current_variant = [token]
        else:
            if current_variant:
                variant_tokens.append(''.join(current_variant))
                current_variant = []
    
    if current_variant:
        variant_tokens.append(''.join(current_variant))
    
    return variant_tokens

def test_snippets():
    # Inicjalizacja modelu i tokenizera
    modelname = 'MilosKosRad/BioNER'
    tokenizer = AutoTokenizer.from_pretrained(modelname)
    model = BertForTokenClassification.from_pretrained(modelname, num_labels=2)
    
    # Wczytanie snippetów
    snippets = load_snippets()
    
    results = []
    for i, snippet in enumerate(snippets[:10]):  # Testujemy pierwsze 10 snippetów
        text = snippet['text']
        expected_variant = snippet['variant_name']
        
        if not text:  # Pomijamy puste teksty
            continue
            
        # Predykcja wariantów
        predicted_variants = predict_variants(text, model, tokenizer)
        
        result = {
            'snippet_id': i,
            'expected_variant': expected_variant,
            'predicted_variants': predicted_variants,
            'found_expected': any(expected_variant in pred for pred in predicted_variants),
            'text': text[:200] + '...' if len(text) > 200 else text  # Skracamy tekst dla czytelności
        }
        results.append(result)
        
        # Wyświetlanie wyników
        print(f"\nSnippet {i}:")
        print(f"Expected variant: {expected_variant}")
        print(f"Predicted variants: {predicted_variants}")
        print(f"Found expected variant: {result['found_expected']}")
        print("-" * 80)
    
    # Statystyki
    total = len(results)
    correct = sum(1 for r in results if r['found_expected'])
    print("\nStatystyki:")
    print(f"Całkowita liczba snippetów: {total}")
    print(f"Poprawnie znalezione warianty: {correct}")
    print(f"Dokładność: {correct/total:.2%}")
    
    return results

if __name__ == "__main__":
    results = test_snippets()