#### Implement the calcium signaling pathway analysis. Gene set KEGG_CALCIUM_SIGNALING_PATHWAY downloaded from MSigDB:
http://software.broadinstitute.org/gsea/msigdb/cards/KEGG_CALCIUM_SIGNALING_PATHWAY

In [1]:
import pandas as pd
import numpy as np
from pandasql import sqldf
from scipy import stats
from simulation import *

In [2]:
# Pathway gene symbols, one per row; the first two lines of the file are skipped.
calpath_gene_df = pd.read_table("../data/calciumgeneset.txt", skiprows=2, header=None, names=["gene_name"])

# Sex chromosomes are coded as 23 (original author's note; presumably this
# describes the CHR column of the CNV file -- confirm).
# Raw strings keep "\s+" a regex instead of an invalid Python string escape.
sw_cnv_df = pd.read_table("../data/swcnv/swcnv.qc6.cnv", sep=r"\s+")
sw_indiv_df = pd.read_table("../data/swcnv/swcnv.qc6.cnv.indiv", sep=r"\s+")

# Attach the per-individual record to every CNV call of that individual.
sw_cnv_indiv_df = pd.merge(sw_cnv_df, sw_indiv_df, how="inner", on=["FID", "IID"])

# Recode PHE to 0/1: the value 2 becomes 1, anything else becomes 0.
# NOTE(review): presumably 2 = case and 1 = control -- confirm against the data.
sw_cnv_indiv_df["PHE"] = (sw_cnv_indiv_df["PHE"] == 2).astype("int64")

# NOTE(review): unlike the paths above this one has no "../" prefix; confirm
# that load_reference_gene resolves it from the notebook's working directory.
refgene_df = load_reference_gene("data/refGene.txt.gz")

In [3]:
# Plain Python list of the pathway's gene symbols.
calpath_gene_list = calpath_gene_df.loc[:, "gene_name"].tolist()

In [4]:
# Print the gene symbols as a quick check that the gene-set file parsed.
print(calpath_gene_list)

['ADCY1', 'ADCY2', 'ADCY3', 'ADCY4', 'ADCY7', 'ADCY8', 'ADCY9', 'ADORA2A', 'ADORA2B', 'ADRA1A', 'ADRA1B', 'ADRA1D', 'ADRB1', 'ADRB2', 'ADRB3', 'AGTR1', 'ATP2A1', 'ATP2A2', 'ATP2A3', 'ATP2B1', 'ATP2B2', 'ATP2B3', 'ATP2B4', 'AVPR1A', 'AVPR1B', 'BDKRB1', 'BDKRB2', 'BST1', 'CACNA1A', 'CACNA1B', 'CACNA1C', 'CACNA1D', 'CACNA1E', 'CACNA1F', 'CACNA1G', 'CACNA1H', 'CACNA1I', 'CACNA1S', 'CALM1', 'CALM2', 'CALM3', 'CALML3', 'CALML5', 'CALML6', 'CAMK2A', 'CAMK2B', 'CAMK2D', 'CAMK2G', 'CAMK4', 'CCKAR', 'CCKBR', 'CD38', 'CHP', 'CHP2', 'CHRM1', 'CHRM2', 'CHRM3', 'CHRM5', 'CHRNA7', 'CYSLTR1', 'CYSLTR2', 'DRD1', 'DRD5', 'EDNRA', 'EDNRB', 'EGFR', 'ERBB2', 'ERBB3', 'ERBB4', 'F2R', 'GNA11', 'GNA14', 'GNA15', 'GNAL', 'GNAQ', 'GNAS', 'GRIN1', 'GRIN2A', 'GRIN2C', 'GRIN2D', 'GRM1', 'GRM5', 'GRPR', 'HRH1', 'HRH2', 'HTR2A', 'HTR2B', 'HTR2C', 'HTR4', 'HTR5A', 'HTR6', 'HTR7', 'ITPKA', 'ITPKB', 'ITPR1', 'ITPR2', 'ITPR3', 'LHCGR', 'LOC729317', 'LTB4R2', 'MYLK', 'MYLK2', 'MYLK3', 'NOS1', 'NOS2', 'NOS3', 'NTSR1', 'OX

In [5]:
# Keep only the reference-gene rows whose gene_name is in the pathway gene set.
calpath_gene_pos = refgene_df.merge(calpath_gene_df, on="gene_name", how="inner")

In [6]:
# Collapse the reference rows to one row per gene, spanning the smallest
# tx_start to the largest tx_end seen for that gene.
# The aggregates are aliased in SQL: SQLite does not guarantee the name of an
# un-aliased result column, so renaming "min(tx_start)" afterwards is fragile.
# NOTE(review): chrom is neither grouped nor aggregated, so if a gene appears
# under more than one chrom value the reported chrom comes from an arbitrary row.
# NOTE: this overwrites calpath_gene_pos; re-run the merge cell before
# re-running this one, because tx_start/tx_end no longer exist afterwards.
query = '''
SELECT gene_name, chrom, min(tx_start) AS gene_start, max(tx_end) AS gene_end
FROM calpath_gene_pos
GROUP BY gene_name
'''
calpath_gene_pos = sqldf(query)

In [7]:
# Pathway genes that received no coordinates from the reference-gene join.
diff = set(calpath_gene_list).difference(calpath_gene_pos["gene_name"])
print(diff)

{'CHP', 'LOC729317'}


In [8]:
# Coordinates entered by hand for the pathway genes that the reference-gene
# join did not find.
# NOTE(review): confirm these use the same genome build as the reference file.
manual_gene_positions = [
    # https://www.ncbi.nlm.nih.gov/gene/729317
    ["LOC729317", "chr2", 65205076, 65206278],
    # https://www.ncbi.nlm.nih.gov/gene/?term=CHP
    ["CHP", "chr15", 41231149, 41281887],
]
for manual_row in manual_gene_positions:
    # Skip genes that are already present so that re-running this cell does
    # not append duplicate rows.
    if manual_row[0] in calpath_gene_pos["gene_name"].values:
        continue
    # Append at the next integer label (the frame carries a 0..n-1 index).
    calpath_gene_pos.loc[calpath_gene_pos.shape[0]] = manual_row

In [9]:
# Turn labels such as "chr2" into integer chromosome codes by dropping the
# first three characters; "X" and "Y" both become 23.
# Mapping over the single column avoids building a row object per record.
# Any other non-numeric label raises ValueError.
calpath_gene_pos["chr"] = calpath_gene_pos["chrom"].str[3:].map(
    lambda label: 23 if label in ("X", "Y") else int(label)
)

In [23]:
# Pair every CNV with every pathway gene it overlaps on the same chromosome.
# A CNV overlaps a gene when either breakpoint falls inside the gene or the
# CNV spans the whole gene.
# - Written as an explicit inner JOIN ... ON: the previous LEFT JOIN without
#   ON followed by a WHERE filter behaved as an inner join anyway.
# - The count column is aliased to the name it had before, so that name no
#   longer depends on SQLite's unspecified default column naming.
# - ORDER BY makes the row order deterministic, which matters because
#   drop_duplicates below keeps the first row of each duplicate group.
query = '''
SELECT sw.CHR, sw.BP1, sw.BP2, sw.TYPE, sw.PHE,
       cal.gene_name, cal.chr, cal.gene_start, cal.gene_end,
       count(sw.FID) AS "count(sw.FID)"
FROM sw_cnv_indiv_df sw
JOIN calpath_gene_pos cal
  ON sw.CHR = cal.chr
 AND (
      (sw.BP1 >= cal.gene_start AND sw.BP1 <= cal.gene_end)
      OR
      (sw.BP2 >= cal.gene_start AND sw.BP2 <= cal.gene_end)
      OR
      (sw.BP1 <= cal.gene_start AND sw.BP2 >= cal.gene_end)
     )
GROUP BY sw.CHR, sw.BP1, sw.BP2, cal.gene_name, sw.TYPE, sw.PHE
ORDER BY sw.CHR, sw.BP1, sw.BP2, cal.gene_name, sw.TYPE, sw.PHE
'''
# PHE is part of the grouping but not of the de-duplication key, so a
# CNV/gene/TYPE combination present under both PHE values keeps only its
# first row (PHE = 0 under the ordering above).
overlap_genes_df = sqldf(query).drop_duplicates(
    subset=["CHR", "BP1", "BP2", "gene_name", "TYPE"]
)

In [24]:
# Display the de-duplicated CNV/gene overlap table built in the previous cell.
overlap_genes_df

Unnamed: 0,CHR,BP1,BP2,TYPE,PHE,gene_name,chr,gene_start,gene_end,count(sw.FID)
0,1,1804302,2047584,3,0,CALML6,1,1846265,1848733,1
2,1,226748623,226858419,1,1,ITPKB,1,226819390,226926876,1
3,1,237151125,237493362,1,1,RYR2,1,237205701,237997288,1
4,1,239678115,239800921,3,0,CHRM3,1,239792372,240072717,1
5,2,40450245,40614668,1,1,SLC8A1,2,40339285,40739575,1
6,2,40674308,41559590,3,0,SLC8A1,2,40339285,40739575,1
7,2,74969763,75323267,3,0,TACR1,2,75273589,75426645,1
8,2,183023392,183126288,1,1,PDE1A,2,183004761,183387572,1
9,2,218749319,219480897,3,0,PLCD4,2,219472487,219501909,1
10,3,4317445,4747247,3,0,ITPR1,3,4535031,4889524,1
