In [1]:
import numpy as np
import pandas as pd
import os
from collections import defaultdict
from tqdm import tqdm
from pprint import pprint

from Bio import Entrez
# Contact address sent along with every Entrez request made by this notebook.
Entrez.email = "sample@bioinf.me"

from functools import lru_cache

# Silence pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None  # default='warn'
# Register tqdm's pandas integration; `progress_apply` is used when
# annotating SNPs with gene names below.
tqdm.pandas()

## Gene metadata lookup (NCBI Entrez)

In [2]:
@lru_cache(maxsize=4096)
def return_gene_by_id(uid):
    """Return the ``Name`` field of the Entrez Gene summary for ``uid``.

    Results are memoised with ``lru_cache`` so each gene id is requested
    from Entrez at most once while it stays in the cache.

    Parameters
    ----------
    uid : str
        Identifier passed to ``Entrez.esummary(db="gene", id=uid)``.

    Returns
    -------
    The ``Name`` entry of the first document summary in the response.
    """
    handle = Entrez.esummary(db="gene", id=uid)
    try:
        uid_record = Entrez.read(handle)
    finally:
        # Release the connection even when parsing the response fails.
        handle.close()
    uid_summary = uid_record["DocumentSummarySet"]['DocumentSummary'][0]
    return uid_summary['Name']

@lru_cache(maxsize=4096)
def return_gene_by_rsid(snp_id):
    """Return the gene names linked to a SNP as one comma-separated string.

    The ``rs`` prefix is stripped from ``snp_id`` and the remainder is sent
    to ``Entrez.elink`` (``dbfrom="snp"``, ``db="gene"``). Every linked gene
    id is resolved to a name with ``return_gene_by_id``.

    Parameters
    ----------
    snp_id : str
        SNP identifier such as ``"rs780094"``.

    Returns
    -------
    str or None
        Gene names joined with ``','`` in the order Entrez returned the
        links, or ``None`` when the response contains no link set.
    """
    handle = Entrez.elink(dbfrom="snp",
                          id=snp_id.replace('rs', ''),
                          db="gene")
    try:
        record = Entrez.read(handle)
    finally:
        # The original never closed this handle; release it explicitly.
        handle.close()

    link_sets = record[0]['LinkSetDb']
    if len(link_sets) < 1:
        return None
    return ','.join(return_gene_by_id(link['Id']) for link in link_sets[0]['Link'])

In [3]:
# # works for server
# datas005, best005 = return_best(0.05)
# best005.to_csv('top_top_snps05.csv', index=False)
# best005

# For FinnData

In [4]:
%%time
DIR = './data/f_special'
all_datas = defaultdict()
datas = []
files = os.listdir(DIR)
for f in files:
    if 'hg19lifted' not in f:
        continue
    d = pd.read_csv(f'{DIR}/{f}', sep='\t')
    trait = f.replace('maf_fg_', '').replace('_hg19lifted.tsv_.tsv', '')
    all_datas[trait] = d
    d = d[d.pval<0.05/d.shape[0]]
    d['trait'] = trait
    d['gene'] = d.rsid.progress_apply(return_gene_by_rsid)
    datas.append(d[['rsid', 'chr', 'gene', 'pos', 'ref', 'alt', 'maf', 'pval', 'trait']])

100%|█████████████████████████████████████████████| 6/6 [00:05<00:00,  1.00it/s]
100%|█████████████████████████████████████████████| 1/1 [00:01<00:00,  1.80s/it]
100%|█████████████████████████████████████████| 909/909 [28:48<00:00,  1.90s/it]
100%|█████████████████████████████████████████| 126/126 [01:44<00:00,  1.21it/s]

CPU times: user 35 s, sys: 6.24 s, total: 41.3 s
Wall time: 31min 17s





In [25]:
# Rows per trait and the matching 0.05 / n_rows significance threshold.
# Iterates over all_datas directly so the cell does not depend on `traits`,
# which is only defined further down in the notebook.
for t, trait_table in all_datas.items():
    print(t, trait_table.shape[0], 0.05/trait_table.shape[0], sep='\t')

I9_HYPTENSPREG	6110809	8.182222681154002e-09
O15_GESTAT_HYPERT	6110674	8.182403446821087e-09
O15_EXCESS_VOMIT_PREG	6110714	8.18234988579076e-09
GEST_DIABETES	6110809	8.182222681154002e-09


## Prepare to sort

In [5]:
import re

def atof(text):
    """Return ``float(text)`` when it parses, otherwise ``text`` unchanged."""
    try:
        return float(text)
    except ValueError:
        return text

def natural_keys(text):
    '''
    Sort key for human ("natural") ordering: alist.sort(key=natural_keys).

    The text is split around its numeric parts; every piece is then passed
    through atof, so numeric pieces become floats and the rest stay strings.

    Based on http://nedbatchelder.com/blog/200712/human_sorting.html
    (Toothy's implementation in the comments); the float regex comes from
    https://stackoverflow.com/a/12643073/190597
    '''
    number_pattern = r'[+-]?([0-9]+(?:[.][0-9]*)?|[.][0-9]+)'
    pieces = re.split(number_pattern, text)
    return list(map(atof, pieces))
# Chromosome label -> rank in natural order (1, 2, ..., 22, X, Y).
# Labels are collected from every loaded trait, in first-seen order, so the
# lookup covers chromosomes that appear in only some of the tables and does
# not depend on the loop variable left over from the loading cell.
kk = list(dict.fromkeys(
    label
    for trait_table in all_datas.values()
    for label in map(str, trait_table.chr.unique())
))
kk.sort(key=natural_keys)
order = {label: rank for rank, label in enumerate(kk)}
order

{'1': 0,
 '2': 1,
 '3': 2,
 '4': 3,
 '5': 4,
 '6': 5,
 '7': 6,
 '8': 7,
 '9': 8,
 '10': 9,
 '11': 10,
 '12': 11,
 '13': 12,
 '14': 13,
 '15': 14,
 '16': 15,
 '17': 16,
 '18': 17,
 '19': 18,
 '20': 19,
 '21': 20,
 '22': 21,
 'X': 22,
 'Y': 23}

## Sort and polish

In [6]:
# Running offset so that group ids stay unique across successive calls.
STARTING_GROUP = 0

def make_group_col(df, col, group_col='group'):
    """Number consecutive runs of equal values in ``df[col]``, in place.

    Every maximal run of identical adjacent values in ``col`` gets its own
    integer id, written to ``df[group_col]``. Ids continue from the
    module-level ``STARTING_GROUP`` counter, which is advanced by the number
    of runs found, so ids from different calls never collide.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate; modified in place. It should already be sorted so
        that rows belonging together are adjacent.
    col : str
        Column whose runs are numbered.
    group_col : str, default 'group'
        Name of the column that receives the run ids.

    Returns
    -------
    None
    """
    global STARTING_GROUP
    if df.empty:
        # No runs to number: add the column and leave the counter untouched.
        df[group_col] = None
        return
    # True at the first row of each run; the cumulative sum turns that into
    # 1-based run numbers.
    run_ids = (df[col].shift() != df[col]).cumsum()
    df[group_col] = STARTING_GROUP + run_ids
    STARTING_GROUP += int(run_ids.iloc[-1])

In [7]:
# Normalise dtypes, sort each trait's significant SNPs by chromosome rank and
# position, then number consecutive runs that share chromosome and gene.
for idx, snps in enumerate(datas):
    snps['pos'] = snps['pos'].astype(int)
    snps['chr'] = snps['chr'].astype(str)
    # Helper columns, only needed for sorting / grouping; dropped again below.
    snps['chr_order'] = snps['chr'].apply(order.__getitem__)
    snps['chr_gene'] = snps['chr'] + '_' + snps['gene'].astype(str)
    ordered = snps.sort_values(by=['chr_order', 'pos'])
    make_group_col(ordered, 'chr_gene', 'group')
    datas[idx] = ordered.drop(['chr_order', 'chr_gene'], axis=1)

In [8]:
# Keep, for every group, the SNP(s) with the smallest p-value in that group.
best_datas = []
for data in datas:
    # The same p-value can be the minimum of one group and also occur in
    # another group; comparing every row against the minimum of its OWN group
    # picks the right rows and only those, and no group loses its best SNP
    # when two groups happen to share the same minimum.
    group_min_pval = data.groupby('group').pval.transform('min')
    best_datas.append(data[data.pval == group_min_pval])
final = pd.concat(best_datas)
final.to_csv('./data/finn_top.tsv', sep='\t', index=False)
print(final.shape)
final

(51, 10)


Unnamed: 0,rsid,chr,gene,pos,ref,alt,maf,pval,trait,group
5547625,"rs58835482,rs796221113",19,,18493064,TGGGGGCACCCTGA,T,0.256,6.06e-09,O15_EXCESS_VOMIT_PREG,1
5547631,rs117110356,19,GDF15,18494944,T,A,0.255,7.97e-09,O15_EXCESS_VOMIT_PREG,2
5547635,rs75347775,19,"MIR3189,GDF15",18495908,G,A,0.256,6.11e-09,O15_EXCESS_VOMIT_PREG,3
5547648,rs1058587,19,GDF15,18499422,C,G,0.27,7.29e-09,O15_EXCESS_VOMIT_PREG,4
5547675,rs45543339,19,LRRC25,18503194,C,T,0.265,8.18e-09,O15_EXCESS_VOMIT_PREG,5
5740980,rs2208589,20,PREX1,47408414,A,G,0.112,6.33e-09,O15_GESTAT_HYPERT,6
785161,rs780094,2,GCKR,27741237,T,C,0.356,2.67e-09,GEST_DIABETES,7
2428901,rs9268403,6,"TSBP1-AS1,TSBP1",32341473,T,C,0.201,1.36e-09,GEST_DIABETES,8
2428967,rs1980495,6,TSBP1-AS1,32346794,A,C,0.202,8.8e-10,GEST_DIABETES,9
2429050,rs9268474,6,"TSBP1-AS1,HCG23",32357165,T,C,0.201,1.4e-09,GEST_DIABETES,10


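Next, remove the unnecessary SNPs from the table written above (`./data/finn_top.tsv`). The cells below expect the curated result in `./data/finn_top_short.csv`.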

In [13]:
# Display rank of each trait; used to order the curated table.
traits_order = {
    trait_name: rank
    for rank, trait_name in enumerate(
        ['I9_HYPTENSPREG', 'O15_GESTAT_HYPERT', 'O15_EXCESS_VOMIT_PREG', 'GEST_DIABETES']
    )
}

In [20]:
# Load the curated SNP table, order it by trait rank, chromosome and position,
# and save the result.
# NOTE(review): the input is read as tab-separated from ./data/ while the
# output is written with pandas' defaults to the working directory -- confirm
# that this is intended.
_final = pd.read_csv('./data/finn_top_short.csv', sep='\t')
_final['order'] = _final.trait.map(lambda name: traits_order[name])
_final = (
    _final
    .sort_values(by=['order', 'chr', 'pos'])
    .drop(['group', 'order'], axis=1, errors='ignore')
)
_final.to_csv('finn_top_short.csv')
_final

Unnamed: 0,rsid,chr,gene,pos,ref,alt,maf,pval,trait
7,rs13306561,1,"MTHFR,CLCN6",11865804,A,G,0.145,1.82e-10,I9_HYPTENSPREG
8,rs35954793,4,FGF5,81188513,C,A,0.257,1.6e-11,I9_HYPTENSPREG
9,rs10882398,10,PLCE1,95892788,T,A,0.406,3.48e-09,I9_HYPTENSPREG
10,rs167479,19,RGL3,11526765,T,G,0.426,5.13e-12,I9_HYPTENSPREG
11,rs259983,20,ZNF831,57735457,A,C,0.181,1.62e-09,I9_HYPTENSPREG
2,rs2208589,20,PREX1,47408414,A,G,0.112,6.33e-09,O15_GESTAT_HYPERT
0,rs58835482,19,GDF15,18493064,TGGGGGCACCCTGA,T,0.256,6.06e-09,O15_EXCESS_VOMIT_PREG
1,rs45543339,19,LRRC25,18503194,C,T,0.265,8.18e-09,O15_EXCESS_VOMIT_PREG
3,rs780094,2,GCKR,27741237,T,C,0.356,2.67e-09,GEST_DIABETES
4,rs9275373,6,"MTCO3P1, HLA-*",32668411,G,A,0.122,4.359999999999999e-19,GEST_DIABETES


In [21]:
# Print four comma-separated blocks, one line per trait:
# rsid vectors, gene vectors, input file paths and output suffixes.
traits = _final.trait.unique()

rsid_vectors = []
for t in traits:
    trait_rsids = list(_final[_final.trait == t].rsid)
    rsid_vectors.append("c('" + "', '".join(trait_rsids) + "')")
print(",\n".join(rsid_vectors), end='\n\n')

gene_vectors = []
for t in traits:
    trait_genes = list(_final[_final.trait == t].gene)
    gene_vectors.append("c('" + "', '".join(trait_genes) + "')")
print(",\n".join(gene_vectors), end='\n\n')

path_literals = []
for t in traits:
    # First listed file whose name contains both the trait and 'maf_fg'.
    source_file = next(name for name in files if t in name and 'maf_fg' in name)
    path_literals.append(f"'{DIR}/{source_file}'")
print(",\n".join(path_literals), end='\n\n')

suffix_literals = []
for t in traits:
    suffix_literals.append(f"'_FG_{t}'")
print(",\n".join(suffix_literals), end='\n\n')

c('rs13306561', 'rs35954793', 'rs10882398', 'rs167479', 'rs259983'),
c('rs2208589'),
c('rs58835482', 'rs45543339'),
c('rs780094', 'rs9275373', 'rs10659211', 'rs10830963')

c('MTHFR,CLCN6', 'FGF5', 'PLCE1', 'RGL3', 'ZNF831'),
c('PREX1'),
c('GDF15', 'LRRC25'),
c('GCKR', 'MTCO3P1, HLA-*', 'TCF7L2', 'MTNR1B')

'./data/f_special/maf_fg_I9_HYPTENSPREG_hg19lifted.tsv_.tsv',
'./data/f_special/maf_fg_O15_GESTAT_HYPERT_hg19lifted.tsv_.tsv',
'./data/f_special/maf_fg_O15_EXCESS_VOMIT_PREG_hg19lifted.tsv_.tsv',
'./data/f_special/maf_fg_GEST_DIABETES_hg19lifted.tsv_.tsv'

'_FG_I9_HYPTENSPREG',
'_FG_O15_GESTAT_HYPERT',
'_FG_O15_EXCESS_VOMIT_PREG',
'_FG_GEST_DIABETES'



## Sort in chr:pos ascending order

In [26]:
# For every trait present in `final`, write its full table sorted by
# chromosome rank and position to data/f_special/<trait>.tsv.
traits = final.trait.unique()

export_columns = ['rsid', 'chr', 'pos', 'ref', 'alt', 'maf', 'pval']
for t in traits:
    d = all_datas[t]
    # These assignments modify the table stored in all_datas in place.
    d['pos'] = d['pos'].astype(int)
    d['chr'] = d['chr'].astype(str)
    d['chr_order'] = d['chr'].apply(order.__getitem__)
    sorted_table = d.sort_values(by=['chr_order', 'pos'])[export_columns]
    out_path = f"data/f_special/{t}.tsv"
    sorted_table.to_csv(out_path, sep='\t', index=False)