In [7]:
import sys
import os

import pandas as pd

import ipywidgets as widgets
from IPython.display import display

# Put the directory one level above the current working directory on the import
# path (presumably where the local `ideal_genom` package lives), unless it is
# already listed there.
library_path = os.path.abspath(os.pardir)
if sys.path.count(library_path) == 0:
    sys.path.append(library_path)

from ideal_genom.zoom_heatmap import filter_sumstats

In [8]:
# Interactive text inputs for the project folder and the two file names.
def _labelled_text(default, label):
    """Build a Text widget pre-filled with `default` and described by `label`."""
    return widgets.Text(
        value=default,
        description=label,
        style={'description_width': 'initial'},
    )

input_path = _labelled_text(
    '/home/luis/data/gwasResult/',
    'Path to project folder:',
)
input_name = _labelled_text(
    'annotated_normalized_combined_R2_0.3.dose_step2_sex_pheno-glm.PHENO1.glm.logistic.hybrid',
    'Name of GWAS summary file:',
)
top_snp = _labelled_text(
    'table_lead_SNPS_GWAS_glm_logistic_final_paper',
    'Name of file with SNPs to highlight:',
)

# Render the three inputs below the cell.
display(input_path, input_name, top_snp)

def get_params():
    """Return the current widget values as (project folder, summary file, highlight file)."""
    return tuple(field.value for field in (input_path, input_name, top_snp))

Text(value='/home/luis/data/gwasResult/', description='Path to project folder:', style=TextStyle(description_w…

Text(value='annotated_normalized_combined_R2_0.3.dose_step2_sex_pheno-glm.PHENO1.glm.logistic.hybrid', descrip…

Text(value='table_lead_SNPS_GWAS_glm_logistic_final_paper', description='Name of file with SNPs to highlight:'…

In [9]:
# Snapshot the widget values and echo them, one per line, for a quick visual check.
path_params = get_params()
print(
    f"Parameter 1: {path_params[0]}",
    f"Parameter 2: {path_params[1]}",
    f"Parameter 3: {path_params[2]}",
    sep="\n",
)

Parameter 1: /home/luis/data/gwasResult/
Parameter 2: annotated_normalized_combined_R2_0.3.dose_step2_sex_pheno-glm.PHENO1.glm.logistic.hybrid
Parameter 3: table_lead_SNPS_GWAS_glm_logistic_final_paper


In [10]:
# Free-text, comma-separated list of the summary-statistics columns to load.
# (Label typo "Mannhattan" corrected to "Manhattan".)
cols_touse = widgets.Textarea(
    value="#CHROM, POS, ID, P",
    description='Columns to use on the Manhattan plot (comma-separated):',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='50%')
)
display(cols_touse)

def get_cols():
    """Return the raw comma-separated column string currently held by `cols_touse`."""
    return cols_touse.value

Textarea(value='#CHROM, POS, ID, P', description='Columns to use on the Mannhattan plot (comma-separated):', l…

In [11]:
cols = get_cols()

# Turn the comma-separated widget text into column names. Blank entries (for
# example from a trailing comma) are dropped: an empty name is not a column in
# the file and would make `usecols` fail. If nothing usable was typed, fall
# back to None so that read_csv loads every column.
selected_cols = [name.strip() for name in cols.split(',') if name.strip()]

# path_params[0] is the project folder and path_params[1] the summary file name;
# the file is read as tab-separated text.
df_gwas = pd.read_csv(
    os.path.join(path_params[0], path_params[1]),
    sep='\t',
    usecols=selected_cols or None,
)
df_gwas.head(5)

Unnamed: 0,#CHROM,POS,ID,P
0,1,727242,rs61769339,0.355696
1,1,727717,rs61769340,0.517232
2,1,730869,rs200188737,0.510707
3,1,732369,rs1315713498,0.741994
4,1,740738,rs146067153,0.816063


In [12]:
# Text input for the name of the rsID column; its value is read through get_rsID().
rsid_field_options = dict(
    value='SNP',
    description='Column name with the rsID:',
    style={'description_width': 'initial'},
)
rsID = widgets.Text(**rsid_field_options)

display(rsID)

def get_rsID():
    """Return the column name currently entered in the `rsID` widget."""
    return rsID.value

Text(value='SNP', description='Column name with the rsID:', style=TextStyle(description_width='initial'))

In [13]:
gwas_rsID = get_rsID()

# Default to "nothing to highlight"; only replaced when the file is actually read.
to_highlight = []

# path_params[2] is the highlight file name typed into the widget. The previous
# guard (`is not None or != ''`) was true for every value, so an empty name fell
# through, the bare project folder passed the existence check and was handed to
# read_csv. Treat None, '' and whitespace-only names as "no file given".
highlight_name = (path_params[2] or '').strip()

if highlight_name:
    highlit_path = os.path.join(path_params[0], highlight_name)
    # isfile rather than exists, so a directory is never passed to read_csv.
    if os.path.isfile(highlit_path):
        df_high = pd.read_csv(
            highlit_path, sep='\t'
        )
        # Column `gwas_rsID` of the highlight file holds the rsIDs to highlight.
        to_highlight = df_high[gwas_rsID].to_list()
        print(to_highlight[:10])
        del df_high
    else:
        print('Path to file with SNPs to highlight does not exist')
else:
    print('No file with SNPs to highlight')

['rs3747973', 'rs74990530', 'rs34311866', 'rs356182', 'rs7681440', 'rs1846190', 'rs2517680', 'rs4909940', 'rs528813377', 'rs56328224']


In [14]:
# One text input per column role; each (default, label) pair below becomes a
# widget, in the order rsID, chromosome, base-pair position, p-value.
SNP_col, CHR_col, POS_col, P_col = (
    widgets.Text(
        value=default,
        description=label,
        style={'description_width': 'initial'},
    )
    for default, label in (
        ('ID', 'Column with rsID:'),
        ('#CHROM', 'Column with chromosome:'),
        ('POS', 'Column with base-pair position:'),
        ('P', 'Column with p-values:'),
    )
)
display(SNP_col, CHR_col, POS_col, P_col)

def get_col_names():
    """Return the entered column names as (rsID, chromosome, position, p-value)."""
    return tuple(field.value for field in (SNP_col, CHR_col, POS_col, P_col))

Text(value='ID', description='Column with rsID:', style=TextStyle(description_width='initial'))

Text(value='#CHROM', description='Column with chromosome:', style=TextStyle(description_width='initial'))

Text(value='POS', description='Column with base-pair position:', style=TextStyle(description_width='initial'))

Text(value='P', description='Column with p-values:', style=TextStyle(description_width='initial'))

In [15]:
# Unpack the four column names returned by get_col_names(), in the order of its
# return statement: rsID, chromosome, base-pair position, p-value.
snp_col, chr_col, pos_col, p_col = get_col_names()

In [16]:
# The cell that loads the highlight file can leave `to_highlight` empty (for
# example when the file is missing). Indexing [0] would then fail with a bare
# IndexError, so stop here with an explicit message instead.
if not to_highlight:
    raise ValueError(
        'No SNPs to highlight were loaded, so there is no lead SNP to filter around.'
    )

# Filter the summary statistics around the first highlighted SNP.
# NOTE(review): presumably `radius` is in base pairs (10e6 = 10 Mb) and
# `pval_threshold` is an upper bound on P; confirm against filter_sumstats.
filtered = filter_sumstats(
    data_df=df_gwas,
    lead_snp=to_highlight[0],
    snp_col=snp_col,
    p_col=p_col,
    pos_col=pos_col,
    chr_col=chr_col,
    pval_threshold=5e-6,
    radius=10e6,
)
filtered.head()

2024/12/11 16:51:46 Start to annotate variants with nearest gene name(s)...
2024/12/11 16:51:46  -Assigning Gene name using NCBI refseq latest GRCh38 for protein coding genes
2024/12/11 16:51:46  -Using user-provided gtf:/home/luis/CGE/ideal-genom/GCF_000001405.40_GRCh38.p14_genomic.gtf
2024/12/11 16:51:49 Finished annotating variants with nearest gene name(s) successfully!


Unnamed: 0,#CHROM,POS,ID,P,log10p,LOCATION,GENENAME
0,1,204840546,rs11801418,2.04497e-06,5.689313,0,NFASC
1,1,204842990,rs869841,2.77049e-06,5.557443,0,NFASC
2,1,204843348,rs12563611,3.14022e-06,5.50304,0,NFASC
3,1,204843570,rs12407417,2.77049e-06,5.557443,0,NFASC
4,1,204843922,rs12217091,9.79405e-07,6.009038,0,NFASC


In [17]:
# Row(s) of the full summary statistics whose rsID equals the first highlighted SNP.
df_gwas.loc[df_gwas[snp_col].eq(to_highlight[0])]

Unnamed: 0,#CHROM,POS,ID,P
510224,1,205708020,rs3747973,6.09464e-11


In [18]:
import os
import sqlite3

# Path to your .db file
db_file = "/home/luis/CGE/ideal-genom/GCF_000001405.40_GRCh38.p14_genomall_genes.gtf.db"

# sqlite3.connect() creates a new, empty database when the file is missing, so a
# mistyped path would silently yield an empty table list below. Fail fast instead.
if not os.path.isfile(db_file):
    raise FileNotFoundError(f"SQLite database not found: {db_file}")

# Connect to the database. The connection is intentionally left open: later
# cells run their queries through `connection`.
connection = sqlite3.connect(db_file)

# Create a cursor object to execute SQL queries
cursor = connection.cursor()

# Get the list of all tables (sqlite_master is SQLite's schema table;
# fetchall() returns one 1-tuple per table name).
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()
print("Tables in the database:", tables)


Tables in the database: [('start_codon',), ('stop_codon',), ('transcript',), ('gene',), ('exon',), ('CDS',), ('_datacache_metadata',)]


In [19]:
# Load the whole 'exon' table through the open SQLite connection and preview it.
table_name = 'exon'
exon_query = f"SELECT * FROM {table_name};"
df_exon = pd.read_sql_query(exon_query, connection)
df_exon.head(5)

Unnamed: 0,start,protein_id,strand,gene_biotype,gene_id,end,transcript_biotype,source,exon_number,seqname,transcript_id,feature
0,11874,,+,,DDX11L1,12227,transcript,,1,NC_000001.11,NR_046018.2,exon
1,12613,,+,,DDX11L1,12721,transcript,,2,NC_000001.11,NR_046018.2,exon
2,13221,,+,,DDX11L1,14409,transcript,,3,NC_000001.11,NR_046018.2,exon
3,29321,,-,,WASH7P,29370,transcript,,1,NC_000001.11,NR_024540.1,exon
4,24738,,-,,WASH7P,24891,transcript,,2,NC_000001.11,NR_024540.1,exon


In [20]:
# Same full-table read as for the exons, this time for the 'gene' table.
table_name = 'gene'
df_gene = pd.read_sql_query(
    sql=f"SELECT * FROM {table_name};",
    con=connection,
)
df_gene.head(n=5)

Unnamed: 0,start,protein_id,strand,gene_biotype,gene_id,end,transcript_biotype,source,exon_number,seqname,transcript_id,feature
0,11874,,+,transcribed_pseudogene,DDX11L1,14409,,,,NC_000001.11,,gene
1,14362,,-,transcribed_pseudogene,WASH7P,29370,,,,NC_000001.11,,gene
2,17369,,-,miRNA,MIR6859-1,17436,,,,NC_000001.11,,gene
3,29774,,+,lncRNA,MIR1302-2HG,35418,,,,NC_000001.11,,gene
4,30366,,+,miRNA,MIR1302-2,30503,,,,NC_000001.11,,gene


In [21]:
# Gene-table row(s) whose gene_id is NFASC.
df_gene.loc[lambda genes: genes['gene_id'] == 'NFASC']

Unnamed: 0,start,protein_id,strand,gene_biotype,gene_id,end,transcript_biotype,source,exon_number,seqname,transcript_id,feature
4517,204828652,,+,protein_coding,NFASC,205022822,,,,NC_000001.11,,gene


In [22]:
from pyensembl import Genome

# Settings for the pyensembl Genome object.
# NOTE(review): `gtf_path_or_url` points at a '.gtf.db' file (the same path the
# sqlite3 cell opens) rather than a plain GTF; confirm this is intended.
genome_settings = {
    'reference_name': 'GRCh38',
    'annotation_name': 'Refseq',
    'gtf_path_or_url': '/home/luis/CGE/ideal-genom/GCF_000001405.40_GRCh38.p14_genomall_genes.gtf.db',
}
data = Genome(**genome_settings)

In [23]:
# Ask the genome annotation for the transcripts at one position on contig
# NC_000001.11 and keep the first hit for inspection.
locus = dict(contig='NC_000001.11', position=204840546)
transcripts = data.transcripts_at_locus(**locus)
test = transcripts[0]
test

Transcript(transcript_id='NM_001005388.3', transcript_name='None', gene_id='NFASC', biotype='mRNA', contig='NC_000001.11', start=204828652, end=205022822, strand='+', genome='GRCh38')

In [None]:
# Identifier of `test`, the first transcript returned for the locus queried above.
test.transcript_id

'NM_001005388.3'

In [None]:
import requests

# Ensembl REST service; the /vep/human/id route takes a batch of variant IDs via POST.
server = "https://rest.ensembl.org"
ext = "/vep/human/id"
headers = {"Content-Type": "application/json", "Accept": "application/json"}

# `json=` lets requests serialise the payload instead of relying on a
# hand-written JSON string, and `timeout` stops the cell from hanging
# indefinitely if the server never answers.
r = requests.post(
    server + ext,
    headers=headers,
    json={"ids": ["rs11801418", "rs869841", "rs12563611"]},
    timeout=60,
)

# Raises requests.HTTPError on a 4xx/5xx reply and does nothing otherwise. The
# previous `sys.exit()` after this call was unreachable, because
# raise_for_status() always raises when `r.ok` is False.
r.raise_for_status()

decoded = r.json()
print(repr(decoded))

[{'assembly_name': 'GRCh38', 'start': 204840546, 'allele_string': 'T/A/C', 'strand': 1, 'input': 'rs11801418', 'end': 204840546, 'most_severe_consequence': 'intron_variant', 'colocated_variants': [{'end': 204840546, 'start': 204840546, 'seq_region_name': '1', 'id': 'rs11801418', 'allele_string': 'T/A/C', 'frequencies': {'A': {'gnomadg_asj': 0.09054, 'afr': 0.2421, 'gnomadg_nfe': 0.08936, 'gnomadg_remaining': 0.1569, 'sas': 0.0879, 'eas': 0.4821, 'gnomadg_ami': 0.1195, 'eur': 0.0915, 'gnomadg': 0.1562, 'gnomadg_sas': 0.1122, 'af': 0.2328, 'gnomadg_eas': 0.4538, 'gnomadg_amr': 0.2252, 'gnomadg_afr': 0.2124, 'amr': 0.2622, 'gnomadg_mid': 0.119, 'gnomadg_fin': 0.1655}}, 'strand': 1}], 'seq_region_name': '1', 'transcript_consequences': [{'biotype': 'protein_coding', 'strand': 1, 'gene_symbol': 'NFASC', 'variant_allele': 'A', 'impact': 'MODIFIER', 'consequence_terms': ['intron_variant'], 'gene_id': 'ENSG00000163531', 'hgnc_id': 'HGNC:29866', 'gene_symbol_source': 'HGNC', 'transcript_id': 'EN

In [None]:
# Field names available in the first record of the decoded VEP reply.
decoded[0].keys()

dict_keys(['assembly_name', 'start', 'allele_string', 'strand', 'input', 'end', 'most_severe_consequence', 'colocated_variants', 'seq_region_name', 'transcript_consequences', 'id'])

In [None]:
# Value of the 'most_severe_consequence' field for the first record.
decoded[0]['most_severe_consequence']

'intron_variant'

In [None]:
# One line per record: its id followed by its most severe consequence.
for res in decoded:
    print(*(res[key] for key in ('id', 'most_severe_consequence')))

rs11801418 intron_variant
rs869841 intron_variant
rs12563611 intron_variant


In [25]:
from ideal_genom.api_client import EnsemblRestClient

client = EnsemblRestClient()

# IDs for the POST request: every variant kept in `filtered`. The rsID column is
# taken from `snp_col` (the name chosen in the column widgets and already used
# for the other lookups) instead of a hard-coded 'ID', so this cell keeps
# working when the summary statistics name that column differently.
ids = filtered[snp_col].to_list()

response = client.post_vep_request(ids)

# A falsy result is treated as a failed request.
if response:
    print("Response:", response)
else:
    print("Failed to get response.")

Response: [{'start': 204840546, 'assembly_name': 'GRCh38', 'most_severe_consequence': 'intron_variant', 'end': 204840546, 'allele_string': 'T/A/C', 'strand': 1, 'input': 'rs11801418', 'colocated_variants': [{'allele_string': 'T/A/C', 'seq_region_name': '1', 'id': 'rs11801418', 'frequencies': {'A': {'gnomadg_asj': 0.09054, 'afr': 0.2421, 'gnomadg_remaining': 0.1569, 'gnomadg_nfe': 0.08936, 'sas': 0.0879, 'eas': 0.4821, 'gnomadg_ami': 0.1195, 'eur': 0.0915, 'gnomadg': 0.1562, 'gnomadg_sas': 0.1122, 'af': 0.2328, 'gnomadg_eas': 0.4538, 'gnomadg_amr': 0.2252, 'gnomadg_afr': 0.2124, 'amr': 0.2622, 'gnomadg_mid': 0.119, 'gnomadg_fin': 0.1655}}, 'end': 204840546, 'start': 204840546, 'strand': 1}], 'transcript_consequences': [{'variant_allele': 'A', 'impact': 'MODIFIER', 'gene_symbol': 'NFASC', 'strand': 1, 'biotype': 'protein_coding', 'transcript_id': 'ENST00000339876', 'gene_symbol_source': 'HGNC', 'hgnc_id': 'HGNC:29866', 'gene_id': 'ENSG00000163531', 'consequence_terms': ['intron_variant']

In [26]:
# Number of records in the result returned by client.post_vep_request(ids).
len(response)

154

In [27]:
# Summarise every returned record as "<id> <most severe consequence>".
for res in response:
    variant_id, consequence = res['id'], res['most_severe_consequence']
    print(variant_id, consequence)

rs11801418 intron_variant
rs869841 intron_variant
rs12563611 intron_variant
rs12407417 intron_variant
rs12217091 intron_variant
rs6699817 intron_variant
rs6702731 intron_variant
rs57128821 intron_variant
rs57482207 intron_variant
rs4951134 intron_variant
rs4951135 intron_variant
rs4951136 intron_variant
rs67020185 intron_variant
rs1107714 intron_variant
rs7529113 intron_variant
rs7515134 intron_variant
rs7529301 intron_variant
rs4951139 intron_variant
rs16854533 intron_variant
rs16854537 intron_variant
rs12405886 intron_variant
rs12408680 intron_variant
rs12405298 intron_variant
rs11240292 intron_variant
rs61824662 non_coding_transcript_exon_variant
rs1775153 intron_variant
rs34606094 intron_variant
rs1772136 intron_variant
rs61824663 intron_variant
rs6685188 intron_variant
rs12402185 intron_variant
rs12404824 intron_variant
rs12737668 intron_variant
rs823106 intron_variant
rs12132270 intron_variant
rs1891091 intron_variant
rs10751444 intron_variant
rs6593964 intron_variant
rs6676110 i

In [29]:
import requests
# Base API URL
server = "https://rest.ensembl.org"
ext = "/xrefs/symbol/homo_sapiens/"
headers = {"Content-Type": "application/json", "Accept": "application/json"}

# Specify the gene symbol
gene_symbol = "BRCA2"
url = f"{server}{ext}{gene_symbol}"

# Send GET request. `timeout` keeps the cell from blocking forever when the
# server does not answer.
# NOTE: this rebinds `response`, which earlier held the list returned by
# client.post_vep_request(); re-run that cell before reusing the VEP results.
response = requests.get(url, headers=headers, timeout=60)

# Process the response: print the id of every cross-reference of type "gene",
# or the HTTP status and reason when the request was not successful.
if response.ok:
    xrefs = response.json()
    for entry in xrefs:
        if entry["type"] == "gene":
            print("Ensembl Gene ID:", entry["id"])
else:
    print(f"Error: {response.status_code}, {response.reason}")


Ensembl Gene ID: ENSG00000139618
Ensembl Gene ID: LRG_293


In [31]:
# Full parsed JSON body of the xrefs GET request (here `response` is the
# requests response object assigned in the previous cell).
response.json()

[{'type': 'gene', 'id': 'ENSG00000139618'}, {'id': 'LRG_293', 'type': 'gene'}]