In [3]:
import os
import pandas as pd 
from itertools import chain
# Allow up to 1000 rows to be shown when a frame is displayed.
pd.options.display.max_rows = 1000

In [4]:
# Work from the project root. The location defaults to the original
# hard-coded directory but can be overridden with the DCHALLENGE_ROOT
# environment variable, so the notebook is not tied to one machine's layout.
os.chdir(os.environ.get('DCHALLENGE_ROOT',
                        '/mnt/BioHome/jreyna/jreyna/projects/dchallenge/'))

In [5]:
# Output directory for the gene lists; created (with any missing parents)
# if absent, left untouched if it already exists.
outdir = 'results/main/gene_lists/'
os.makedirs(name=outdir, exist_ok=True)

In [6]:
# Genes from MalaCards (tab-separated export).
malacards_fn = 'results/refs/genecards/2022.MalaCards.T1D.tsv'
malacards = pd.read_csv(malacards_fn, sep='\t')

# Genes from Klak et al. 2020
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7882399/
# Two entries differ from the list as originally typed:
#   - 'RASGPR1' was a transposition of the gene symbol 'RASGRP1'.
#   - 'TLR7/8' named two genes in a single string; it is split into 'TLR7'
#     and 'TLR8' so each symbol can be matched on its own.
# NOTE(review): confirm both corrections against the paper's locus table.
klak_genes = ['PTPN22', 'PHTF1', 'CAMSAP2', 'IL10', 'IFIH1', 'STAT4', 'CTLA4', 'ACOXL',
        'EFR3B', 'AFF3', 'CCR5', 'AC080079.1', 'NOL8P1', 'ADAD1', 'IL21', 'IL2',
        'LINC02357', 'IL7R', 'CENPW', 'BACH2', 'AL596442.1', 'AL049612.1', 'TNFAIP3', 'SKAP2',
        'COBL', 'IKZF1', 'GLIS3', 'NRP1', 'IL2RA', 'RNLS', 'INS', 'BAD',
        'ITGB7', 'ERBB3', 'DGKA', 'ZDHHC17', 'SH2B3', 'CD69', 'LMO7', 'GPR183', 'AL163932.1',
        'LINC01550', 'ZFP36L1', 'MAGOH3P', 'DLK1', 'CTSH', 'RASGRP1', 'IL27', 'CLEC16A',
        'DEXI', 'BCAR1', 'SMARCE1', 'ORMDL3', 'CD226', 'PTPN2', 'FUT2', 'PRKD2', 'TYK2',
        'CDC34', 'MADCAM1', 'SIRPG', 'UBASH3A', 'AC002378.1', 'C1QTNF6', 'RAC2', 'TLR7', 'TLR8',
        'GAB3']

# Genes from the eDGAR database
# http://edgar.biocomp.unibo.it/cgi-bin/gene_disease_db/main_table.py
# Symbols are kept in their original order, repeated entries included.
edgar_genes = [
    'CAPN10', 'INSR', 'SLC2A4', 'ABCC8', 'CDKAL1', 'DNAJC3',
    'ENPP1', 'GCGR', 'GCK', 'GLIS3', 'GPD2', 'HNF1A',
    'HNF1A', 'IGF2BP2', 'IL2RA', 'IL6', 'KCNJ11', 'LIPC',
    'NEUROD1', 'RETN', 'SLC2A2', 'SLC30A8', 'SUMO4', 'TBC1D4',
    'TCF7L2', 'ABCC8', 'ABCC8', 'AKT2', 'HNF1A', 'HNF1B',
    'HNF4A', 'INS', 'INS', 'INSR', 'IRS1', 'KCNJ11',
    'KCNJ11', 'MAPK8IP1', 'PAX4', 'ZFP57', 'PPP1R3A', 'CAPN10',
    'CCR5', 'CTLA4', 'FOXP3', 'GCK', 'HMGA1', 'IRS2',
    'ITPR3', 'MTNR1B', 'OAS1', 'PDX1', 'PLAGL1', 'PPARG',
    'PTPN1', 'WFS1', 'EIF2AK3', 'INSR', 'PAX4', 'PTPN22',
]

# Open Targets genes
# https://platform.opentargets.org/disease/MONDO_0005147/associations
# Read the tab-separated association table, then keep only the rows whose
# overall association score is at least 0.5.
open_targets_fn = 'results/refs/open_targets/open-targets-genelist - MONDO_0005147-associated-diseases.tsv'
open_targets = pd.read_csv(open_targets_fn, sep='\t')
open_targets = open_targets[open_targets['overallAssociationScore'] >= 0.5]

# GWAS Catalog genes
# https://www.ebi.ac.uk/gwas/efotraits/MONDO_0005147
gwas_t1d_fn = 'results/refs/gwas_catalog/gwas-gene-list - efotraits_MONDO_0005147-associations-2022-05-18.csv'
gwas_t1d = pd.read_csv(gwas_t1d_fn)

In [7]:
# Flatten the 'Mapped gene' column into a unique list of gene symbols.
# A cell may hold several symbols separated by ', '. Rows with a missing value
# are dropped first: .str.split leaves them as float NaN, which cannot be
# iterated when the per-row lists are chained together.
mapped_gene_lists = gwas_t1d['Mapped gene'].dropna().str.split(', ')
# Sorted so the list order is the same on every run.
gwas_t1d_genes = sorted(set(chain.from_iterable(mapped_gene_lists)))

In [8]:
# Union of the gene symbols gathered from every source: MalaCards, Klak et al.,
# eDGAR, Open Targets, and the GWAS Catalog.
studied_t1d_genes = set(chain(
    malacards['Symbol'].tolist(),
    klak_genes,
    edgar_genes,
    open_targets['symbol'].tolist(),
    gwas_t1d_genes,
))

In [9]:
# Write the consensus gene list, one symbol per line.
# The symbols are written in sorted order so the file is identical from run to
# run; iterating the set directly gives an order that can change between
# interpreter sessions.
consensus_fn = os.path.join(outdir, 'consensus_gene_list.txt')
with open(consensus_fn, 'w') as fw:
    fw.writelines(gene + '\n' for gene in sorted(studied_t1d_genes))

In [None]:
import tabix

# Open a remote or local file.
# NOTE(review): the path has no '.gz' suffix -- confirm the file is
# bgzip-compressed and has a tabix index next to it before running.
url = 'results/main/gwas/source/T1D_32005708/GRCh37/GCST010681_buildGRCh37.hgvs_id.bed'

tb = tabix.open(url)

# A query returns an iterator over the results. The original cell issued the
# same region three ways -- tb.query("1", 1000000, 1250000),
# tb.queryi(0, 1000000, 1250000) and tb.querys("1:1000000-1250000") -- each
# overwriting the previous iterator, so only the last one is kept.
records = tb.querys("1:1000000-1250000")

# Each record is a list of strings; show the first three fields.
for record in records:
    print(record[:3])