This function requires the goatools package. If not installed yet:<br>
pip install goatools --user

Ensure the following files are in the working directory: <br>
For mouse: genes_ncbi_10090_proteincoding.py <br>
For Human: genes_ncbi_9606_proteincoding.py

In [62]:
import numpy as np

### Create database to convert geneIDs to names:
Only run the cells below when the existing gene ID conversion files need to be regenerated (updated) <br>
More details - https://github.com/tanghaibao/goatools/blob/main/notebooks/background_genes_ncbi.ipynb

In [191]:
#for mouse:
# Convert a gene table (tab-separated text) into an importable Python module.
# get_go_list imports GENEID2NT from that module when species="Mouse".
from goatools.cli.ncbi_gene_results_to_python import ncbi_tsv_to_py

# Input table; presumably exported from an NCBI Gene search - confirm source.
ncbi_tsv = 'gene_result_mouse.txt'
# Output module name; 10090 is the taxid used for Mouse in get_go_list.
output_py = 'genes_ncbi_10090_proteincoding.py'
ncbi_tsv_to_py(ncbi_tsv, output_py)

      73,104 lines READ:  gene_result_mouse.txt
      73,081 geneids WROTE: genes_ncbi_10090_proteincoding.py


In [None]:
#for human:
# Convert a gene table (tab-separated text) into an importable Python module.
# get_go_list imports GENEID2NT from that module when species="Human".
from goatools.cli.ncbi_gene_results_to_python import ncbi_tsv_to_py

# Input table; presumably exported from an NCBI Gene search - confirm source.
ncbi_tsv = 'gene_result_human.txt'
# Output module name; 9606 is the taxid used for Human in get_go_list.
output_py = 'genes_ncbi_9606_proteincoding.py'
ncbi_tsv_to_py(ncbi_tsv, output_py)

### Function

In [213]:
def get_go_list(go_type=("BP","MF","CC"),go_term="cell cycle",species=("Human","Mouse"),subtract=True):
    """Return the sorted gene symbols annotated to GO terms matching ``go_term``.

    The GO ontology (go-basic.obo) and the NCBI gene2go associations are
    downloaded through goatools if they are not already present. Every GO term
    matching ``go_term`` is collected, their child terms are added, and the
    gene IDs annotated to those terms are translated to gene symbols using the
    species-specific ``genes_ncbi_<taxid>_proteincoding`` module, which must be
    importable (e.g. located in the working directory).

    Parameters
    ----------
    go_type : str
        GO namespace handed to goatools as ``namespace``: "BP", "MF" or "CC".
        The tuple default only lists the choices; pass one of them explicitly.
    go_term : str
        Search pattern. It is compiled as a case-insensitive regular
        expression, so regex metacharacters in it are interpreted as such.
    species : str
        "Human" (taxid 9606) or "Mouse" (taxid 10090). The tuple default only
        lists the choices; a value must be passed explicitly.
    subtract : bool
        If true, GO terms matching ``"<go_term>.independent"`` are removed
        from the matches before child terms are added. If false, all matches
        are kept.

    Returns
    -------
    numpy.ndarray
        Sorted array of gene symbols (string dtype, possibly empty). Gene IDs
        missing from the species lookup module are skipped.

    Raises
    ------
    ValueError
        If ``species`` is not "Human" or "Mouse".

    Side effects
    ------------
    Writes the matched GO terms to ``<go_term with "_" for spaces><species>.log``
    in the working directory.
    """
    import importlib
    import re

    # Taxid and gene-symbol lookup module for each supported species.
    species_table = {
        "Human": (9606, "genes_ncbi_9606_proteincoding"),
        "Mouse": (10090, "genes_ncbi_10090_proteincoding"),
    }
    # Validate first so a bad argument fails fast with a clear message instead
    # of an unbound-variable error after the (slow) downloads.
    if not isinstance(species, str) or species not in species_table:
        raise ValueError(
            'species must be "Human" or "Mouse", got {!r}'.format(species)
        )
    taxid, symbol_module = species_table[species]

    from goatools.anno.genetogo_reader import Gene2GoReader
    from goatools.base import download_go_basic_obo
    from goatools.base import download_ncbi_associations
    from goatools.go_search import GoSearch

    # Use the paths returned by the download helpers rather than repeating
    # the file names as literals.
    obo_fname = download_go_basic_obo()
    gene2go = download_ncbi_associations()

    objanno = Gene2GoReader(gene2go, taxids=[taxid])
    go2geneids = objanno.get_id2gos(namespace=go_type, go2geneids=True)
    srchhelp = GoSearch(obo_fname, go2items=go2geneids)

    go_all = re.compile(r'{}'.format(go_term), flags=re.IGNORECASE)
    go_not = re.compile(r'{}.independent'.format(go_term), flags=re.IGNORECASE)

    fout_allgos = go_term.replace(" ", "_") + species + ".log"
    with open(fout_allgos, "w") as log:
        gos_matching = srchhelp.get_matching_gos(go_all, prt=log)
        if subtract:
            # Drop "<go_term>-independent" terms found among the matches.
            gos_excluded = srchhelp.get_matching_gos(go_not, gos=gos_matching, prt=log)
            gos = gos_matching.difference(gos_excluded)
        else:
            gos = gos_matching
        gos_all = srchhelp.add_children_gos(gos)
        geneids = srchhelp.get_items(gos_all)

    GENEID2NT = importlib.import_module(symbol_module).GENEID2NT

    # Translate gene IDs to symbols; IDs absent from the lookup are skipped.
    records = (GENEID2NT.get(geneid) for geneid in geneids)
    symbols = [nt.Symbol for nt in records if nt is not None]
    # dtype=str keeps an empty result string-typed instead of float64.
    return np.sort(np.array(symbols, dtype=str))
        

# Arguments
go_type: "BP", "MF" or "CC" - chooses which type of GO annotation to search. BP - Biological Process; MF - Molecular Function; CC - Cellular Component <br>
go_term: Keyword to search for. All GO terms that contain the keyword are included, not only exact matches, so the more precise the keyword the better. A log file listing every GO term that was included is created in the working directory <br>
species: Mouse or Human

# Examples

In [229]:
# Example: Human genes for Biological Process GO terms matching the phrase below.
go_list=get_go_list(go_type="BP",
                    go_term="positive regulation of epithelial to mesenchymal transition",
                    species="Human")

  EXISTS: go-basic.obo
  EXISTS: gene2go
HMS:0:00:10.254902 342,209 annotations, 20,703 genes, 18,803 GOs, 1 taxids READ: gene2go 
12483 IDs in loaded association branch, BP
go-basic.obo: fmt(1.2) rel(2022-03-22) 47,103 Terms; optional_attrs(comment def relationship synonym xref)


In [230]:
go_list

array(['ACVR1', 'ALX1', 'AXIN2', 'BAMBI', 'BCL9L', 'BMP2', 'BMP4', 'BMP7',
       'COL1A1', 'CRB2', 'CTNNB1', 'DAB2', 'ENG', 'EZH2', 'FERMT2',
       'FOXC1', 'GCNT2', 'GLIPR2', 'HDAC2', 'IL1B', 'IL6', 'ISL1', 'JAG1',
       'LEF1', 'LOXL2', 'LRG1', 'MDK', 'MTOR', 'NOTCH1', 'OLFM1', 'PDPN',
       'RGCC', 'SDCBP', 'SERPINB3', 'SMAD2', 'SMAD3', 'SMAD4', 'SNAI1',
       'TBX20', 'TCF7L2', 'TGFB1', 'TGFB1I1', 'TGFB2', 'TGFB3', 'TGFBR1',
       'TGFBR2', 'TIAM1', 'TWIST1', 'WWTR1', 'ZNF703'], dtype='<U32')

In [231]:
# Example: Human genes for Molecular Function GO terms matching "kinase".
go_list=get_go_list(go_type="MF",
                    go_term="kinase",
                    species="Human")

  EXISTS: go-basic.obo
  EXISTS: gene2go
HMS:0:00:15.769959 342,209 annotations, 20,703 genes, 18,803 GOs, 1 taxids READ: gene2go 
4445 IDs in loaded association branch, MF
go-basic.obo: fmt(1.2) rel(2022-03-22) 47,103 Terms; optional_attrs(comment def relationship synonym xref)


In [232]:
go_list

array(['AAK1', 'AATF', 'AATK', ..., 'ZFP36', 'ZFYVE26', 'ZPR1'],
      dtype='<U32')

In [233]:
# Example: Human genes for CC-namespace GO terms matching "membrane".
go_list=get_go_list(go_type="CC",
                    go_term="membrane",
                    species="Human")

  EXISTS: go-basic.obo
  EXISTS: gene2go
HMS:0:00:08.012310 342,209 annotations, 20,703 genes, 18,803 GOs, 1 taxids READ: gene2go 
1823 IDs in loaded association branch, CC
go-basic.obo: fmt(1.2) rel(2022-03-22) 47,103 Terms; optional_attrs(comment def relationship synonym xref)


In [234]:
go_list

array(['A1BG', 'A1CF', 'A2M', ..., 'ZYG11B', 'ZYX', 'ZZZ3'], dtype='<U32')

### Overlaps between lists

In [252]:
def array_filtering(array1,array2,mode=("unique","overlap")):
    """Filter ``array1`` by whether its elements also occur in ``array2``.

    Parameters
    ----------
    array1 : 1-D sequence or numpy array
        Values to filter (e.g. gene symbols). Order and duplicates are kept.
    array2 : 1-D sequence or numpy array
        Values to compare against.
    mode : str
        "unique" keeps the elements of ``array1`` that are NOT in ``array2``;
        "overlap" keeps the elements of ``array1`` that ARE in ``array2``.
        The tuple default only lists the choices; pass one explicitly.

    Returns
    -------
    numpy.ndarray
        The selected elements of ``array1``, with the dtype of ``array1``.
        An empty array is returned when nothing is selected.

    Raises
    ------
    ValueError
        If ``mode`` is neither "unique" nor "overlap".
    """
    if not isinstance(mode, str) or mode not in ("unique", "overlap"):
        raise ValueError(
            'mode must be "unique" or "overlap", got {!r}'.format(mode)
        )

    candidates = np.asarray(array1)
    # Build the lookup once so each membership test is O(1).
    reference = set(np.asarray(array2).tolist())

    want_member = mode == "overlap"
    keep = np.array(
        [(item in reference) == want_member for item in candidates.tolist()],
        dtype=bool,
    )
    # Boolean-mask indexing preserves both the order and the dtype of array1.
    return candidates[keep]

In [253]:
# First list for the comparison: "positive regulation of EMT" (Human, BP).
go_list1=get_go_list(go_type="BP",
                    go_term="positive regulation of epithelial to mesenchymal transition",
                    species="Human")

  EXISTS: go-basic.obo
  EXISTS: gene2go
HMS:0:00:17.795434 342,209 annotations, 20,703 genes, 18,803 GOs, 1 taxids READ: gene2go 
12483 IDs in loaded association branch, BP
go-basic.obo: fmt(1.2) rel(2022-03-22) 47,103 Terms; optional_attrs(comment def relationship synonym xref)


In [254]:
# Second list for the comparison: "negative regulation of EMT" (Human, BP).
go_list2=get_go_list(go_type="BP",
                    go_term="negative regulation of epithelial to mesenchymal transition",
                    species="Human")

  EXISTS: go-basic.obo
  EXISTS: gene2go
HMS:0:00:16.347723 342,209 annotations, 20,703 genes, 18,803 GOs, 1 taxids READ: gene2go 
12483 IDs in loaded association branch, BP
go-basic.obo: fmt(1.2) rel(2022-03-22) 47,103 Terms; optional_attrs(comment def relationship synonym xref)


In [255]:
# Genes found in go_list1 but not in go_list2.
go_list1_unique=array_filtering(go_list1,go_list2,mode="unique")

In [256]:
go_list1_unique

array(['ACVR1', 'ALX1', 'AXIN2', 'BAMBI', 'BCL9L', 'BMP2', 'BMP4', 'BMP7',
       'COL1A1', 'CRB2', 'CTNNB1', 'DAB2', 'ENG', 'EZH2', 'FERMT2',
       'FOXC1', 'GCNT2', 'GLIPR2', 'HDAC2', 'IL1B', 'IL6', 'ISL1', 'JAG1',
       'LEF1', 'LOXL2', 'LRG1', 'MDK', 'MTOR', 'NOTCH1', 'OLFM1', 'PDPN',
       'RGCC', 'SDCBP', 'SERPINB3', 'SMAD2', 'SMAD3', 'SMAD4', 'SNAI1',
       'TBX20', 'TCF7L2', 'TGFB1', 'TGFB1I1', 'TGFB3', 'TGFBR1', 'TGFBR2',
       'TIAM1', 'TWIST1', 'WWTR1', 'ZNF703'], dtype='<U32')

In [257]:
# Genes found in go_list2 but not in go_list1.
go_list2_unique=array_filtering(go_list2,go_list1,mode="unique")

In [258]:
go_list2_unique

array(['ADIPOR1', 'BMP5', 'DAB2IP', 'DACT3', 'EFNA1', 'EPHA4', 'FOXA1',
       'FOXA2', 'FUZ', 'GATA3', 'HPN', 'IL17RD', 'LDLRAD4', 'MAD2L2',
       'MARK1', 'NKX2-1', 'NOG', 'OVOL2', 'PPP2CA', 'PTEN', 'SDHAF2',
       'SFRP1', 'SFRP2', 'SMAD7', 'SPRED1', 'SPRED2', 'SPRED3', 'SPRY1',
       'SPRY2', 'TBX5', 'TRIM62', 'USF3', 'VASN', 'ZNF750'], dtype='<U32')

In [259]:
# Genes present in both go_list1 and go_list2.
go_list_overlap=array_filtering(go_list1,go_list2,mode="overlap")

In [260]:
go_list_overlap

array(['TGFB2'], dtype='<U32')