In [1]:
import os 
import re
import glob
import pandas as pd 

In [2]:
def gmt2page_index(gmtfile):
    with open(gmtfile) as raw:
        lines = [line for line in raw.read().splitlines()]
        lines = [line.split('\t') for line in lines] # if re.search(gs, line)]

        gsets = [line[0] for line in lines]
        genes = [line[2:] for line in lines]
        
        gs2gene = [(gn,gs) for gn,gs in zip(gsets,genes)]
        
        allgenes = {gene for x in genes for gene in x}
        
        out = dict()
        for gene in allgenes: # slow!!
            out[gene] = [gs for gs,gn in zip(gsets,genes) if gene in gn]
        
    return out



def gmt2page_names(gmtfile):
    with open(gmtfile) as raw:
        lines = [line for line in raw.read().splitlines()]
        lines = [line.split('\t') for line in lines] # if re.search(gs, line)]

        gsets = [line[0] for line in lines]
        cards = [line[1] for line in lines]
        
        out = [gs for gs,card in zip(gsets,cards)]
        
    return out


def write_page_index(index, indexfile):
    with open(indexfile, "wt") as out:
        for gene in index:
            out.write('\t'.join([gene] + list(index[gene]))+'\n')
    out.close()


def write_page_names(names, namesfile):
    with open(namesfile, "wt") as out:
        for pw in names:
            out.write("%s\t%s\tP\n" % (pw, pw))
    out.close()

In [3]:
%%time
gmts = glob.glob('msigdb_v7.4/*/*.gmt')
gmts.sort()

for gmt in gmts:
    msig_id = gmt.split('/')[0]+'_'+gmt.split('/')[1]
    direcory = 'annotations/'+msig_id
    index_file = direcory+'/'+msig_id+'_index.txt'
    names_file = direcory+'/'+msig_id+'_names.txt'    
    
    if not os.path.exists(direcory):
        os.mkdir(direcory)

    print (msig_id)
    
    if not os.path.exists(index_file):
        write_page_index(gmt2page_index(gmt), index_file)
        print ('_index is done!')
    
    if not os.path.exists(names_file):
        write_page_names(gmt2page_names(gmt),names_file)
        print ('_names is done!')

    print ('\n')

msigdb_v7.4_c1.all


msigdb_v7.4_c2.all
_names is done!


msigdb_v7.4_c2.cgp
_index is done!
_names is done!


msigdb_v7.4_c2.cp.biocarta
_index is done!
_names is done!


msigdb_v7.4_c2.cp.kegg
_index is done!
_names is done!


msigdb_v7.4_c2.cp.pid
_index is done!
_names is done!


msigdb_v7.4_c2.cp.reactome
_index is done!
_names is done!


msigdb_v7.4_c2.cp.wikipathways
_index is done!
_names is done!


msigdb_v7.4_c2.cp
_index is done!
_names is done!


msigdb_v7.4_c3.all
_index is done!
_names is done!


msigdb_v7.4_c3.mir.mir_legacy
_index is done!
_names is done!


msigdb_v7.4_c3.mir.mirdb
_index is done!
_names is done!


msigdb_v7.4_c3.mir
_index is done!
_names is done!


msigdb_v7.4_c3.tft.gtrd
_index is done!
_names is done!


msigdb_v7.4_c3.tft.tft_legacy
_index is done!
_names is done!


msigdb_v7.4_c3.tft
_index is done!
_names is done!


msigdb_v7.4_c4.all
_index is done!
_names is done!


msigdb_v7.4_c4.cgn
_index is done!
_names is done!


msigdb_v7.4_c4.cm
_index is

In [11]:
%%time
gmts = glob.glob('msigdb_v7.4/*c[1-2].all*/c[1-2].all*.gmt')
gmts.sort()

for gmt in gmts:
    msig_id = gmt.split('/')[0]+'_'+gmt.split('/')[1]
    direcory = 'annotations/'+msig_id
    index_file = direcory+'/'+msig_id+'_index.txt'
    names_file = direcory+'/'+msig_id+'_names.txt'    
    
    if not os.path.exists(direcory):
        os.mkdir(direcory)

    print (msig_id)
    
    write_page_index(gmt2page_index(gmt), index_file)
    print ('_index is done!')
    
    write_page_names(gmt2page_names(gmt),names_file)
    print ('_names is done!')

    print ('\n')


msigdb_v7.4_c1.all
_index is done!
_names is done!


msigdb_v7.4_c2.all
_index is done!
_names is done!


CPU times: user 2min 26s, sys: 266 ms, total: 2min 26s
Wall time: 2min 26s


In [12]:
!date

Wed Oct 27 16:44:54 PDT 2021
