In [1]:
import sys
import os

import pandas as pd

import ipywidgets as widgets
from IPython.display import display

# add parent directory to path
library_path = os.path.abspath('..')
if library_path not in sys.path:
    sys.path.append(library_path)

from ideal_genom.zoom_heatmap import filter_sumstats, snp_annotations, get_gene_information

In [2]:
# Create interactive widgets for input
input_path = widgets.Text(
    value='/mnt/0A2AAC152AABFBB7/data/gwasResult/',
    description='Path to project folder:',
    style={'description_width': 'initial'}
)

input_name = widgets.Text(
    value='annotated_normalized_combined_R2_0.3.dose_step2_sex_pheno-glm.PHENO1.glm.logistic.hybrid',
    description='Name of GWAS summary file:',
    style={'description_width': 'initial'}
)

top_snp = widgets.Text(
    value='table_lead_SNPS_GWAS_glm_logistic_final_paper',
    description='Name of file with SNPs to highlight:',
    style={'description_width': 'initial'}
)

# Display the widgets
display(input_path, input_name, top_snp)

# Function to get the text parameter values
def get_params():
    return input_path.value, input_name.value, top_snp.value

Text(value='/mnt/0A2AAC152AABFBB7/data/gwasResult/', description='Path to project folder:', style=TextStyle(de…

Text(value='annotated_normalized_combined_R2_0.3.dose_step2_sex_pheno-glm.PHENO1.glm.logistic.hybrid', descrip…

Text(value='table_lead_SNPS_GWAS_glm_logistic_final_paper', description='Name of file with SNPs to highlight:'…

In [3]:
# Use the parameter values
path_params = get_params()
print(f"Parameter 1: {path_params[0]}")
print(f"Parameter 2: {path_params[1]}")
print(f"Parameter 3: {path_params[2]}")

Parameter 1: /mnt/0A2AAC152AABFBB7/data/gwasResult/
Parameter 2: annotated_normalized_combined_R2_0.3.dose_step2_sex_pheno-glm.PHENO1.glm.logistic.hybrid
Parameter 3: table_lead_SNPS_GWAS_glm_logistic_final_paper


In [4]:
cols_touse = widgets.Textarea(
    value="#CHROM, POS, ID, P",
    description='Columns to use on the Mannhattan plot (comma-separated):',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='50%')
)
display(cols_touse)

def get_cols():
    return cols_touse.value

Textarea(value='#CHROM, POS, ID, P', description='Columns to use on the Mannhattan plot (comma-separated):', l…

In [5]:
cols = get_cols()

df_gwas = pd.read_csv(
    os.path.join(path_params[0], path_params[1]), sep='\t', usecols=[col.strip() for col in cols.split(',')]
)
df_gwas.head(5)

Unnamed: 0,#CHROM,POS,ID,P
0,1,727242,rs61769339,0.355696
1,1,727717,rs61769340,0.517232
2,1,730869,rs200188737,0.510707
3,1,732369,rs1315713498,0.741994
4,1,740738,rs146067153,0.816063


In [6]:
rsID = widgets.Text(
    value='SNP',
    description='Column name with the rsID:',
    style={'description_width': 'initial'}
)

display(rsID)

def get_rsID():
    return rsID.value

Text(value='SNP', description='Column name with the rsID:', style=TextStyle(description_width='initial'))

In [7]:
gwas_rsID = get_rsID()

if path_params[2] is not None or path_params[2] != '':
    highlit_path = os.path.join(path_params[0], path_params[2])
    if os.path.exists(highlit_path):
        df_high = pd.read_csv(
            highlit_path, sep='\t'
        )
        to_highlight = df_high[gwas_rsID].to_list()
        print(to_highlight[:10])
        del df_high
    else:
        print('Path to file with SNPs to highlight does not exist')
        to_highlight = []
else:
    print('No file with SNPs to highlight')
    to_highlight = []

['rs3747973', 'rs74990530', 'rs34311866', 'rs356182', 'rs7681440', 'rs1846190', 'rs2517680', 'rs4909940', 'rs528813377', 'rs56328224']


In [8]:
SNP_col = widgets.Text(
    value='ID',
    description='Column with rsID:',
    style={'description_width': 'initial'}
)

CHR_col = widgets.Text(
    value='#CHROM',
    description='Column with chromosome:',
    style={'description_width': 'initial'}
)

POS_col = widgets.Text(
    value='POS',
    description='Column with base-pair position:',
    style={'description_width': 'initial'}
)

P_col = widgets.Text(
    value='P',
    description='Column with p-values:',
    style={'description_width': 'initial'}
)
display(SNP_col, CHR_col, POS_col, P_col)

def get_col_names():
    return SNP_col.value, CHR_col.value, POS_col.value, P_col.value

Text(value='ID', description='Column with rsID:', style=TextStyle(description_width='initial'))

Text(value='#CHROM', description='Column with chromosome:', style=TextStyle(description_width='initial'))

Text(value='POS', description='Column with base-pair position:', style=TextStyle(description_width='initial'))

Text(value='P', description='Column with p-values:', style=TextStyle(description_width='initial'))

In [9]:
snp_col, chr_col, pos_col, p_col = get_col_names()

In [10]:
filtered = filter_sumstats(
    data_df       =df_gwas, 
    lead_snp      =to_highlight[0], 
    snp_col       =snp_col, 
    p_col         =p_col, 
    pos_col       =pos_col, 
    chr_col       =chr_col, 
    pval_threshold=5e-6, 
    radius        =10e6
)
filtered.head()

Unnamed: 0,#CHROM,POS,ID,P,log10p
0,1,204840546,rs11801418,2.04497e-06,5.689313
1,1,204842990,rs869841,2.77049e-06,5.557443
2,1,204843348,rs12563611,3.14022e-06,5.50304
3,1,204843570,rs12407417,2.77049e-06,5.557443
4,1,204843922,rs12217091,9.79405e-07,6.009038


In [11]:
annotated = snp_annotations(
    data_df=filtered, 
    snp_col=snp_col, 
    chr_col=chr_col, 
    pos_col=pos_col
)
annotated.head()

2024/12/12 13:03:25 Start to annotate variants with nearest gene name(s)...
2024/12/12 13:03:25  -Assigning Gene name using NCBI refseq latest GRCh38 for protein coding genes
2024/12/12 13:03:25  -Using user-provided gtf:/mnt/0A2AAC152AABFBB7/CGE/luxgiant-dstream/GCF_000001405.40_GRCh38.p14_genomic.gtf
2024/12/12 13:03:29 Finished annotating variants with nearest gene name(s) successfully!


Unnamed: 0,#CHROM,POS,ID,P,log10p,GENENAME,consequence
0,1,204840546,rs11801418,2.04497e-06,5.689313,NFASC,intron_variant
1,1,204842990,rs869841,2.77049e-06,5.557443,NFASC,intron_variant
2,1,204843348,rs12563611,3.14022e-06,5.50304,NFASC,intron_variant
3,1,204843570,rs12407417,2.77049e-06,5.557443,NFASC,intron_variant
4,1,204843922,rs12217091,9.79405e-07,6.009038,NFASC,intron_variant


In [22]:
annotated['consequence'].value_counts()

consequence
intron_variant                        83
downstream_gene_variant               33
upstream_gene_variant                 12
3_prime_UTR_variant                   12
intergenic_variant                     8
non_coding_transcript_exon_variant     4
5_prime_UTR_variant                    1
synonymous_variant                     1
Name: count, dtype: int64

In [12]:
get_gene_information(
    genes=annotated['GENENAME'].unique().tolist(),
    gtf_path=None,
    build='38'
)

Unnamed: 0,gene,start,end,strand
0,NFASC,204828652,205022822,+
1,SLC45A3,205657851,205680509,-
2,LOC105371701,205680370,205690435,+
3,NUCKS1,205712822,205750182,-
4,RAB29,205767986,205775482,-
5,SLC41A1,205789095,205813198,-
6,PM20D1,205828025,205850132,-


In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(10,10))
ax1= plt.subplot2grid((40,20), (0,0), rowspan=18, colspan=18)
ax2 = plt.subplot2grid((40,20), (18,0), rowspan=5,  colspan=18)
ax3 = plt.subplot2grid((40,20), (23,0), rowspan=18,colspan=18)
ax4 = plt.subplot2grid((40,20), (23,19), rowspan=10, colspan=1)

missense = annotated[annotated.anno=='missense_variant']
utr3     = annotated[annotated.anno=='3_prime_UTR_variant']
utr5     = annotated[annotated.anno=='5_prime_UTR_variant']
upstrm   = annotated[annotated.anno=='upstream_gene_variant']

ax1.scatter(target.bp, target.P,s=15,color='grey', label='')
ax1.scatter(missense.bp, missense.P, s=30, color='orange', label='Missense variant')
ax1.scatter(utr3.bp, utr3.P, s=30, color='blue', label="3'-UTR variant")
ax1.scatter(utr5.bp, utr5.P, s=30, color='green', label="5'-UTR variant")
ax1.scatter(upstrm.bp, upstrm.P, s=30, color='green', label="Upstream variant")
ax1.set_xlim(region)
ax1.xaxis.set_ticks_position('top')
ax1.legend()
ax1.set_title("Million basepairs on BTA{}".format(Chr), fontsize=12)
ax1.set_ylabel('log10(P)', fontsize=12)

In [19]:
import requests
# Base API URL
server = "https://rest.ensembl.org"
ext = "/xrefs/symbol/homo_sapiens/"
headers = {"Content-Type": "application/json", "Accept": "application/json"}

# Specify the gene symbol
gene_symbol = "BRCA2"
url = f"{server}{ext}{gene_symbol}"

# Send GET request
response = requests.get(url, headers=headers)

# Process the response
if response.ok:
    xrefs = response.json()
    for entry in xrefs:
        if entry["type"] == "gene":
            print("Ensembl Gene ID:", entry["id"])
else:
    print(f"Error: {response.status_code}, {response.reason}")


Ensembl Gene ID: ENSG00000139618
Ensembl Gene ID: LRG_293


In [20]:
response.json()

[{'type': 'gene', 'id': 'ENSG00000139618'}, {'id': 'LRG_293', 'type': 'gene'}]

In [21]:
from ideal_genom.api_client import GeneEnsemblRestClient

gene_client = GeneEnsemblRestClient()

# Example list of IDs for the POST request
ids_gene = filtered['GENENAME'].unique().tolist()
ids_gene

for gene in ids_gene:

    gene_info = gene_client.get_gene_location(gene)

    if "error" in gene_info:
        print(f"Error: {gene_info['error']}")
    else:
        print("Gene Information:", gene_info)

KeyError: 'GENENAME'