In [22]:
import csv
import collections

basepath = "/home/kira/eqtls/results/"
file_name= "single_tissue_traits_snps.csv"

SNP_TARGET_FIELDS = ['dbsnp', 'traits', 'tissue', 'genes']
class SNP_TARGET(collections.namedtuple('_SNP_TARGET', SNP_TARGET_FIELDS)):
    __slots__ = ()

# Key is the variant_id of the snp, values are SNP_TARGET objects
single_tissue_trait_snps = dict()

with open(basepath + file_name) as instream:
    next(instream)
    reader = csv.reader(instream)
    for line in reader:
        variant_id = line[0]
        dbsnp_id = line[1]
        tissue = line[2]
        traits = line[3]
        genes = line[4].strip(";").split(";")
        gene_map = dict()
        for gene in genes:
            if "=" in gene:
                ensembl_id = gene.split("=")[0]
                symbol = gene.split("=")[1]
            else:
                ensembl_id = gene
                symbol = None
            gene_map[ensembl_id] = symbol
        single_tissue_trait_snps[line[0]] = SNP_TARGET(dbsnp=dbsnp_id, tissue=tissue, traits=traits, genes=gene_map)

# List of Target Gene Symbols for a given variant_id

In [23]:
variant_id = "chr9_97784109_G_A_b38"

symbols = list()
for key in single_tissue_trait_snps[variant_id].genes:
    print(single_tissue_trait_snps[variant_id].genes[key])
    symbols.append(single_tissue_trait_snps[variant_id].genes[key])

DIO1
S100A5
MUC1
FAM163A
COLGALT2
IGFN1
CR1
SLC35F3
EML6
HK2
OSBPL6
ERBB4
MROH2A
CAMK1
ALS2CL
CADM2
CRYBG3
CD200
GK5
SLC66A1L
KCNMB2
VWA5B2
PIGZ
SORCS2
PCDH7
NWD2
EPHA5
UNC5C
RNF150
SLC6A3
DNAH5
PDE8B
EPB41L4A
RELL2
TENM2
JARID2
FGD2
VEGFA
PAQR8
RGS17
RPS6KA2
OSBPL3
EPDR1
HECW1
SRPK2
NRCAM
TMEM178B
DPP6
DEPTOR
SH3GL2
MOB3B
LINGO2
UBAP2
TRMO
TMEFF1
CAVIN4
LMX1B
CCDC187
CACNA1B
AKR1C1
PLCE1
SORCS1
TRPM5
USH1C
LRRC4C
C11orf96
DTX4
CNTN5
NCAM1
DSCAML1
GLB1L3
CRACR2A
ERP27
LMO3
GALNT6
HSD17B6
CENPJ
NBEA
GPC5
TMEM255B
FAM189A1
PLA2G4F
LMF1
TLCD2
EVPLL
SP6
B4GALNT2
SDK2
UTS2R
TMEM241
CHST9
SLC14A1
CPAMD8
SULT2B1
RUVBL2
C20orf194
EFCAB8
TMPRSS2
KREMEN1
MCAT
CPT1B


# Enrichr results

In [29]:
import gseapy

results = gseapy.enrichr(gene_list=symbols, gene_sets="KEGG_2016", cutoff=1.0, 
                            organism = "human", description=variant_id,
                            outdir = basepath + "enrichr")

with open("/home/kira/eqtls/Enrichr/KEGG_2016." + variant_id + ".enrichr.reports.txt", "r") as instream:
    next(instream)
    print("Pathway\tp-val\tq-val\toverlap\tgenes")
    for line in instream:
        arr = line.split("\t")
        # Print patghways, which have at least 3 gene hits
        if int(arr[2].split("/")[0]) > 2:
            print(arr[1].split("_")[0],"\t",round(float(arr[3]),3),"\t",arr[4],"\t",arr[2],"\t",arr[9])

Pathway	p-val	q-val	overlap	genes
Steroid hormone biosynthesis 	 0.003 	 0.9240068196104854 	 3/58 	 SULT2B1;AKR1C1;HSD17B6

Axon guidance 	 0.027 	 1.0 	 3/127 	 EPHA5;UNC5C;LRRC4C

Cell adhesion molecules (CAMs) 	 0.035 	 1.0 	 3/142 	 NCAM1;NRCAM;LRRC4C

Calcium signaling pathway 	 0.063 	 1.0 	 3/180 	 ERBB4;CACNA1B;PLCE1

Proteoglycans in cancer 	 0.084 	 1.0 	 3/203 	 ERBB4;PLCE1;VEGFA

Ras signaling pathway 	 0.108 	 1.0 	 3/227 	 PLA2G4F;PLCE1;VEGFA

MAPK signaling pathway 	 0.139 	 1.0 	 3/255 	 PLA2G4F;RPS6KA2;CACNA1B

Metabolic pathways 	 0.601 	 1.0 	 6/1239 	 GALNT6;PLA2G4F;PLCE1;HSD17B6;MCAT;HK2

