# https://www.gsea-msigdb.org/gsea/msigdb/mouse_geneset_resources.jsp

In [6]:
import pandas as pd
import os
import glob
import sys

In [7]:
def gmt2page_index(gmtfile):
    with open(gmtfile) as raw:
        lines = [line for line in raw.read().splitlines()]
        lines = [line.split('\t') for line in lines] # if re.search(gs, line)]

        gsets = [line[0] for line in lines]
        genes = [line[2:] for line in lines]
        
        gs2gene = [(gn,gs) for gn,gs in zip(gsets,genes)]
        
        allgenes = {gene for x in genes for gene in x}
        
        out = dict()
        for gene in allgenes: # slow!!
            out[gene] = [gs for gs,gn in zip(gsets,genes) if gene in gn]
        
    return out



def gmt2page_names(gmtfile):
    with open(gmtfile) as raw:
        lines = [line for line in raw.read().splitlines()]
        lines = [line.split('\t') for line in lines] # if re.search(gs, line)]

        gsets = [line[0] for line in lines]
        cards = [line[1] for line in lines]
        
        out = [gs for gs,card in zip(gsets,cards)]
        
    return out


def write_page_index(index, indexfile):
    with open(indexfile, "wt") as out:
        for gene in index:
            out.write('\t'.join([gene] + list(index[gene]))+'\n')
    out.close()


def write_page_names(names, namesfile):
    with open(namesfile, "wt") as out:
        for pw in names:
            out.write("%s\t%s\tP\n" % (pw, pw))
    out.close()

In [10]:
%%time
gmts = glob.glob('../annotations/mouse/*.gmt')
gmts.sort()
gmts
for gmt in gmts:
    msig_id = gmt.split('/')[3].replace('.symbols.gmt','')
    direcory = '../annotations/mouse/'+msig_id
    index_file = direcory+'/'+msig_id+'_index.txt'
    names_file = direcory+'/'+msig_id+'_names.txt'
    
    print (direcory)
    os.mkdir(direcory)

    write_page_index(gmt2page_index(gmt), index_file)
    print ('_index is done!')

    write_page_names(gmt2page_names(gmt),names_file)
    print ('_names is done!')

    print ('\n')

../annotations/mouse/m1.all.v0.3
_index is done!
_names is done!


../annotations/mouse/m2.all.v0.3
_index is done!
_names is done!


../annotations/mouse/m2.cgp.v0.3
_index is done!
_names is done!


../annotations/mouse/m2.cp.biocarta.v0.3
_index is done!
_names is done!


../annotations/mouse/m2.cp.reactome.v0.3
_index is done!
_names is done!


../annotations/mouse/m2.cp.v0.3
_index is done!
_names is done!


../annotations/mouse/m2.cp.wikipathways.v0.3
_index is done!
_names is done!


../annotations/mouse/m3.all.v0.3
_index is done!
_names is done!


../annotations/mouse/m3.gtrd.v0.3
_index is done!
_names is done!


../annotations/mouse/m3.mirdb.v0.3
_index is done!
_names is done!


../annotations/mouse/m4.all.v0.3
_index is done!
_names is done!


../annotations/mouse/m5.all.v0.3
_index is done!
_names is done!


../annotations/mouse/m8.all.v0.3
_index is done!
_names is done!


../annotations/mouse/mh.all.v0.3
_index is done!
_names is done!


CPU times: user 8min 18s, sys: 4

In [26]:
%%bash 
for f in ../annotations/mouse/*/; do b=`basename $f`; o=${b/.v0.3/}; 
cp -rv $f /data_gilbert/home/aarab/iPAGE/PAGE_DATA/ANNOTATIONS/msigdb_v0.3_$o; 
done 

‘../annotations/mouse/m1.all.v0.3/’ -> ‘/data_gilbert/home/aarab/iPAGE/PAGE_DATA/ANNOTATIONS/msigdb_v0.3_m1.all’
‘../annotations/mouse/m1.all.v0.3/m1.all.v0.3_index.txt’ -> ‘/data_gilbert/home/aarab/iPAGE/PAGE_DATA/ANNOTATIONS/msigdb_v0.3_m1.all/m1.all.v0.3_index.txt’
‘../annotations/mouse/m1.all.v0.3/m1.all.v0.3_names.txt’ -> ‘/data_gilbert/home/aarab/iPAGE/PAGE_DATA/ANNOTATIONS/msigdb_v0.3_m1.all/m1.all.v0.3_names.txt’
‘../annotations/mouse/m2.all.v0.3/’ -> ‘/data_gilbert/home/aarab/iPAGE/PAGE_DATA/ANNOTATIONS/msigdb_v0.3_m2.all’
‘../annotations/mouse/m2.all.v0.3/m2.all.v0.3_index.txt’ -> ‘/data_gilbert/home/aarab/iPAGE/PAGE_DATA/ANNOTATIONS/msigdb_v0.3_m2.all/m2.all.v0.3_index.txt’
‘../annotations/mouse/m2.all.v0.3/m2.all.v0.3_names.txt’ -> ‘/data_gilbert/home/aarab/iPAGE/PAGE_DATA/ANNOTATIONS/msigdb_v0.3_m2.all/m2.all.v0.3_names.txt’
‘../annotations/mouse/m2.cgp.v0.3/’ -> ‘/data_gilbert/home/aarab/iPAGE/PAGE_DATA/ANNOTATIONS/msigdb_v0.3_m2.cgp’
‘../annotations/mouse/m2.cgp.v0.3/m2.

In [44]:
ls 

go2page.ipynb                         msigdb_v0.3_m2.cp.wikipathways_index.txt
human2mouse-gene-names.ipynb          msigdb_v0.3_m3.all_index.txt
msigdb2page.ipynb                     msigdb_v0.3_m3.gtrd_index.txt
msigdb_v0.3_m1.all_index.txt          msigdb_v0.3_m3.mirdb_index.txt
msigdb_v0.3_m2.all_index.txt          msigdb_v0.3_m4.all_index.txt
msigdb_v0.3_m2.cgp_index.txt          msigdb_v0.3_m5.all_index.txt
msigdb_v0.3_m2.cp.biocarta_index.txt  msigdb_v0.3_m8.all_index.txt
msigdb_v0.3_m2.cp_index.txt           msigdb_v0.3_mh.all_index.txt
msigdb_v0.3_m2.cp.reactome_index.txt  ncbi_gene_results_to_python.py


In [57]:
%%bash 
for f in /data_gilbert/home/aarab/iPAGE/PAGE_DATA/ANNOTATIONS/msigdb_v0.3_*/*_names.txt; do 
b=`basename $f`; 
d1=`dirname $f`; d2=`basename $d1`; 
mv -v $f ${d1}/${d2}_names.txt
done 

‘/data_gilbert/home/aarab/iPAGE/PAGE_DATA/ANNOTATIONS/msigdb_v0.3_m1.all/m1.all.v0.3_names.txt’ -> ‘/data_gilbert/home/aarab/iPAGE/PAGE_DATA/ANNOTATIONS/msigdb_v0.3_m1.all/msigdb_v0.3_m1.all_names.txt’
‘/data_gilbert/home/aarab/iPAGE/PAGE_DATA/ANNOTATIONS/msigdb_v0.3_m2.all/m2.all.v0.3_names.txt’ -> ‘/data_gilbert/home/aarab/iPAGE/PAGE_DATA/ANNOTATIONS/msigdb_v0.3_m2.all/msigdb_v0.3_m2.all_names.txt’
‘/data_gilbert/home/aarab/iPAGE/PAGE_DATA/ANNOTATIONS/msigdb_v0.3_m2.cgp/m2.cgp.v0.3_names.txt’ -> ‘/data_gilbert/home/aarab/iPAGE/PAGE_DATA/ANNOTATIONS/msigdb_v0.3_m2.cgp/msigdb_v0.3_m2.cgp_names.txt’
‘/data_gilbert/home/aarab/iPAGE/PAGE_DATA/ANNOTATIONS/msigdb_v0.3_m2.cp.biocarta/m2.cp.biocarta.v0.3_names.txt’ -> ‘/data_gilbert/home/aarab/iPAGE/PAGE_DATA/ANNOTATIONS/msigdb_v0.3_m2.cp.biocarta/msigdb_v0.3_m2.cp.biocarta_names.txt’
‘/data_gilbert/home/aarab/iPAGE/PAGE_DATA/ANNOTATIONS/msigdb_v0.3_m2.cp/m2.cp.v0.3_names.txt’ -> ‘/data_gilbert/home/aarab/iPAGE/PAGE_DATA/ANNOTATIONS/msigdb_v0

In [59]:
ls -l /data_gilbert/home/aarab/iPAGE/PAGE_DATA/ANNOTATIONS/msigdb_v0.3_m*

/data_gilbert/home/aarab/iPAGE/PAGE_DATA/ANNOTATIONS/msigdb_v0.3_m1.all:
total 620
-rw-rw-r--. 1 aarab aarab 624944 Aug 10 20:44 msigdb_v0.3_m1.all_index.txt
-rw-rw-r--. 1 aarab aarab   5654 Aug 10 20:44 msigdb_v0.3_m1.all_names.txt

/data_gilbert/home/aarab/iPAGE/PAGE_DATA/ANNOTATIONS/msigdb_v0.3_m2.all:
total 7056
-rw-rw-r--. 1 aarab aarab 7020399 Aug 10 20:44 msigdb_v0.3_m2.all_index.txt
-rw-rw-r--. 1 aarab aarab  200836 Aug 10 20:44 msigdb_v0.3_m2.all_names.txt

/data_gilbert/home/aarab/iPAGE/PAGE_DATA/ANNOTATIONS/msigdb_v0.3_m2.cgp:
total 3700
-rw-rw-r--. 1 aarab aarab 3725753 Aug 10 20:44 msigdb_v0.3_m2.cgp_index.txt
-rw-rw-r--. 1 aarab aarab   60332 Aug 10 20:44 msigdb_v0.3_m2.cgp_names.txt

/data_gilbert/home/aarab/iPAGE/PAGE_DATA/ANNOTATIONS/msigdb_v0.3_m2.cp:
total 3416
-rw-rw-r--. 1 aarab aarab 3352150 Aug 10 20:44 msigdb_v0.3_m2.cp_index.txt
-rw-rw-r--. 1 aarab aarab  140504 Aug 10 20:44 msigdb_v0.3_m2.cp_names.txt

/data_gilbert/home/aarab/iPAGE/PAGE_DATA/ANNOTATIONS/msigd

In [60]:
!date

Wed Aug 10 21:31:53 PDT 2022
