In [5]:
# use kernel py3-6
#import goatools
from goatools import obo_parser
import re
import json
import numpy as np
from goatools.go_enrichment import GOEnrichmentStudy
import matplotlib.pyplot as plt

# Function to match non-zero Features in GFF and Map GO-Term to GO-ID using GOATools

In [6]:
def matchgff2(feature, gff_file='/home/t44p/PW_rawdata/Transciptome_GenomeAnnotation/Xele_annotated2_gff_export2.gff', obo_path="/home/t44p/PW_rawdata/go_obo/go.obo", namespace=None, depth_threshold=0, goea=False):
    """
    Searches a GFF (General Feature Format) file for specific features and retrieves associated Gene Ontology (GO) terms.

    Parameters:
    ----------
    feature : list or iterable
        An iterable of strings representing the features to search for in the GFF file.

    gff_file : str, optional
        The file path to the GFF file. Defaults to '/home/t44p/PW_rawdata/Transciptome_GenomeAnnotation/Xele_annotated2_gff_export2.gff'.

    obo_path : str, optional
        The file path to the Gene Ontology .obo file. Defaults to "/home/t44p/PW_rawdata/go_obo/combined.obo".

    Returns:
    -------
    tuple of dict
        A tuple containing two dictionaries:
        1. A dictionary where keys are the features and values are lists of lines (as strings) from the GFF file where these features are found.
        2. A dictionary where keys are the features and values are dictionaries. Each inner dictionary has GO IDs as keys and a dictionary with 'name' and 'namespace' of the GO term as values.

    Example:
    --------
    #>>> features = ['gene1', 'gene2']
    #>>> matchgff2(features)
    ({'gene1': ['line content from GFF file'], 'gene2': ['line content from GFF file']},
     {'gene1': {'GO:0000001': {'name': 'term name', 'namespace': 'biological_process'}}, ... })

    Notes:
    -----
    The function uses regular expressions for precise matching of features. It expects the feature followed by a tab character in the GFF file.
    Additionally, it extracts GO IDs from matched lines and retrieves their corresponding names and namespaces from the provided Gene Ontology .obo file.
    """

    valid_namespaces = {'biological_process', 'molecular_function', 'cellular_component', None}
    # Check if namespace is a list containing only valid elements
    if isinstance(namespace, list) and not all(ns in valid_namespaces for ns in namespace):
        raise ValueError("Invalid namespace provided. Valid options are 'biological_process', "
                         "'molecular_function', 'cellular_component', or a list containing any of these. "
                         "You can also use None for no filtering.")

    with open(gff_file, 'r') as file:
        go_ontology = obo_parser.GODag(obo_path)
        
        lines_where_feat_found = {}
        go_ids = {}
        background_genes = []
        go_term_count = {}

         # Find the depth of each GO term
        go_depths = {go_id: go_term.depth for go_id, go_term in go_ontology.items()}


        for feat in feature:
            file.seek(0)  # reset file pointer to the beginning for each feature
            lines_where_feat_found[feat] = []
            go_ids[feat] = {}
            pattern = re.compile(re.escape(feat) + r'\t')  # exact match followed by a tab
            for line in file:
                if goea:
                    if not line.lstrip().startswith('#'):
                        background_genes.append(line.split('\t')[0])

                if pattern.search(line):
                    lines_where_feat_found[feat].append(line.strip())  # Store the line (as a string) if feature is found
                    # Extract GO id
                    match = re.search(r"Ontology_id=([GO:\d,]+)", line.strip())
                    if match:
                        ids = match.group(1).split(',')
                        # Map Terms to Ids and Count Occurrences
                        for id in ids:
                            term = go_ontology.get(id)
                            if term is not None:
                                go_ids[feat][id] = {'name': term.name, 'namespace': term.namespace}

                                if namespace is None or term.namespace in namespace and go_ontology[id].depth >= depth_threshold:
                                    # Count Occurrences
                                    if id in go_term_count:
                                        go_term_count[id] = (term.name, go_term_count[id][1] + 1, term.namespace)
                                    else:
                                        go_term_count[id] = (term.name, 1, term.namespace)
                            else:
                                go_ids[feat][id] = {'name': None, 'namespace': None}
                                if id not in go_term_count:
                                    go_term_count[id] = (None, 1)
        if goea:
            print("GO Enrichment Analysis >>")
            goea_obj = GOEnrichmentStudy(
                background_genes,
                go_ids,  # This needs to be a dict mapping gene IDs to a set of GO IDs
                go_ontology,
                propagate_counts=False,
                alpha=0.05,  # significance level for the statistical test
                methods=['fdr_bh']  # correction method for multiple testing
            )
            goea_result = goea_obj.run_study(go_ids.keys())
            return lines_where_feat_found, go_ids, go_term_count, goea_result


        return lines_where_feat_found, go_ids, go_term_count

In [7]:

def plot_go_terms(term_count_dict, top_n=3, figsize=(25, 15), title="GO Term Occurrences"):
    print("THIS FUNCTION IS DEPRECATED; DO NOT USE IT")
    """
    Plots a bar chart of GO term occurrences.

    Parameters:
    term_count_dict (dict): Dictionary of GO terms with their counts.
    top_n (int): Number of top features to highlight.
    figsize (tuple): Figure size for the plot.
    title (str): Title of the plot.
    """
    # Extract terms and counts
    go_terms = list(term_count_dict.keys())
    counts = [term_count_dict[term][1] for term in go_terms]

    # Sort the terms by count and identify top features
    sorted_terms = sorted(term_count_dict.items(), key=lambda x: x[1][1], reverse=True)
    top_terms = sorted_terms[:top_n]

    # Create lists for the plot
    terms_to_plot = [term for term, _ in sorted_terms]
    counts_to_plot = [count for _, (_, count) in sorted_terms]

    # Colors - highlight top features in red, others in blue
    colors = ['red' if term in [t[0] for t in top_terms] else 'blue' for term in terms_to_plot]

    # Create the bar plot
    plt.figure(figsize=figsize)
    plt.bar(terms_to_plot, counts_to_plot, color=colors)
    plt.xlabel('GO Terms')
    plt.ylabel('Counts')
    plt.xticks(rotation=90)
    plt.title(title)
    plt.tight_layout()

    plt.show()
    print(f"top {top_n} most abundant GO Terms")
    for t in top_terms:
        print(t)

# Example usage:


In [8]:
"""def tabulate(term_count_dict, sort=True):
    if sort:
        print(f"count\tGO ID\tGO Term\tnamespace")
        term_count_dict = sorted(term_count_dict.items(), key=lambda x: x[1][1], reverse=True)
        for goid, values in term_count_dict:
            count, term, namespace = values
            print(f"{count}\t{goid}\t{term}\t{namespace}")
    else:
        print(f"count\tGO ID\tGO Term\tnamespace")
        for goid in term_count_dict.keys():
            if term_count_dict[goid][0] is not None:
                print(f"{term_count_dict[goid][1]}\t{goid}\t{term_count_dict[goid][0]}\t{term_count_dict[goid][2]}")
            else:
                print(f"{term_count_dict[goid][1]}\t{goid}")"""
def tabulate(term_count_dict, sort=True):
    print(f"count\tGO ID\tGO Term\tnamespace")

    # Conditionally sort the dictionary if required
    items = sorted(term_count_dict.items(), key=lambda x: x[1][1], reverse=True) if sort else term_count_dict.items()

    for goid, values in items:
        count = values[1]
        term = values[0] if values[0] is not None else "N/A"
        namespace = values[2] if len(values) > 2 else "N/A"

        print(f"{count}\t{goid}\t{term}\t{namespace}")
    #return 


### Glucose 

In [105]:
with open("./lasso_models/10xKfold_lasso_output/glucose_nXcv.json", 'r') as file:
    glucose_nXcv = json.load(file)
#glucose_nXcv_matched, glucose_goids, glucose_term_count, glucose_goea = matchgff2(glucose_nXcv['common_features'], goea=True)
glucose_nXcv_matched, gluc_goids_biop, gluc_term_count_biop, gluc_goea_biop = matchgff2(glucose_nXcv['common_features'], namespace=['biological_process'], depth_threshold=2, goea=True)
glucose_nXcv_matched, gluc_goids_biop, gluc_term_count_molf, gluc_goea_molf = matchgff2(glucose_nXcv['common_features'], namespace=['molecular_function'], depth_threshold=2, goea=True)
glucose_nXcv_matched, gluc_goids_biop, gluc_term_count_cellc, gluc_goea_cellc = matchgff2(glucose_nXcv['common_features'], namespace=['cellular_component'], depth_threshold=2, goea=True)



/home/t44p/PW_rawdata/go_obo/go.obo: fmt(1.2) rel(2023-11-15) 46,228 Terms
GO Enrichment Analysis >>

Load  Ontology Enrichment Analysis ...
  0%    676 of 793,572 population items found in association

Runing  Ontology Analysis: current study set of 26 IDs.
100%     26 of     26 study items found in association
100%     26 of     26 study items found in population(793572)
Calculating 236 uncorrected p-values using fisher_scipy_stats
     236 terms are associated with     20 of 793,572 population items
     236 terms are associated with     20 of     26 study items
  METHOD fdr_bh:
     236 GO terms found significant (< 0.05=alpha) (236 enriched +   0 purified): statsmodels fdr_bh
      20 study items associated with significant GO IDs (enriched)
       0 study items associated with significant GO IDs (purified)
/home/t44p/PW_rawdata/go_obo/go.obo: fmt(1.2) rel(2023-11-15) 46,228 Terms
GO Enrichment Analysis >>

Load  Ontology Enrichment Analysis ...
  0%    676 of 793,572 population i

In [127]:
print("# <<< GLUCOSE >>>")
print(f"# 10x3FoldCV mean score: {np.mean(glucose_nXcv['mean_scores'])} \n# count common features: {len(glucose_nXcv['common_features'])}")
tabulate(gluc_term_count_biop)
print("# <<< GLUCOSE >>>")
print(f"# 10x3FoldCV mean score: {np.mean(glucose_nXcv['mean_scores'])} \n# count common features: {len(glucose_nXcv['common_features'])}")
tabulate(gluc_term_count_molf)
print("# <<< GLUCOSE >>>")
print(f"# 10x3FoldCV mean score: {np.mean(glucose_nXcv['mean_scores'])} \n# count common features: {len(glucose_nXcv['common_features'])}")
tabulate(gluc_term_count_cellc)

# <<< GLUCOSE >>>
# 10x3FoldCV mean score: 0.37766355282118325 
# count common features: 26
count	GO ID	GO Term	namespace
2	GO:0005975	carbohydrate metabolic process	biological_process
2	GO:0051240	positive regulation of multicellular organismal process	biological_process
2	GO:0048766	root hair initiation	biological_process
2	GO:0055062	phosphate ion homeostasis	biological_process
1	GO:0006048	UDP-N-acetylglucosamine biosynthetic process	biological_process
1	GO:0006281	DNA repair	biological_process
1	GO:0006487	protein N-linked glycosylation	biological_process
1	GO:0006493	protein O-linked glycosylation	biological_process
1	GO:0007283	spermatogenesis	biological_process
1	GO:0019255	glucose 1-phosphate metabolic process	biological_process
1	GO:0030097	hemopoiesis	biological_process
1	GO:0034221	fungal-type cell wall chitin biosynthetic process	biological_process
1	GO:0006869	lipid transport	biological_process
1	GO:0006897	endocytosis	biological_process
1	GO:0009306	protein secretion	bio

In [107]:
# plot_go_terms(gluc_term_count_biop, top_n=10)

In [108]:
"""PRINT EACH GFF ENTRY AND GO-TERMS CORRESPONDING TO NO-ZERO FEATURES"""
#for ele in glucose_nXcv_matched:
#    print(f"{glucose_nXcv_matched[ele]}")
#    #print(glucose_goids[ele], "\n")
#    for id in glucose_goids[ele]:
#        print(id, ": ", glucose_goids[ele][id])

'PRINT EACH GFF ENTRY AND GO-TERMS CORRESPONDING TO NO-ZERO FEATURES'

### Sucrose

In [9]:
with open("./lasso_models/10xKfold_lasso_output/sucrose_nXcv.json", 'r') as file:
    sucrose_nXcv = json.load(file)
#sucrose_nXcv_matched, sucrose_goids, sucrose_term_count,sucrose_goea = matchgff2(sucrose_nXcv['common_features'], goea=True)
sucrose_nXcv_matched, suc_goids_biop, suc_term_count_biop, suc_goea_biop = matchgff2(sucrose_nXcv['common_features'], namespace=['biological_process'], depth_threshold=2, goea=True)
sucrose_nXcv_matched, suc_goids_molf, suc_term_count_molf, suc_goea_molf = matchgff2(sucrose_nXcv['common_features'], namespace=['molecular_function'], depth_threshold=2, goea=True)
sucrose_nXcv_matched, suc_goids_cellc, suc_term_count_cellc, suc_goea_cellc = matchgff2(sucrose_nXcv['common_features'], namespace=['cellular_component'], depth_threshold=2, goea=True)

/home/t44p/PW_rawdata/go_obo/go.obo: fmt(1.2) rel(2023-11-15) 46,228 Terms
GO Enrichment Analysis >>

Load  Ontology Enrichment Analysis ...
  0%     64 of 244,176 population items found in association

Runing  Ontology Analysis: current study set of 8 IDs.
100%      8 of      8 study items found in association
100%      8 of      8 study items found in population(244176)
Calculating 70 uncorrected p-values using fisher_scipy_stats
      70 terms are associated with      7 of 244,176 population items
      70 terms are associated with      7 of      8 study items
  METHOD fdr_bh:
      70 GO terms found significant (< 0.05=alpha) ( 70 enriched +   0 purified): statsmodels fdr_bh
       7 study items associated with significant GO IDs (enriched)
       0 study items associated with significant GO IDs (purified)
/home/t44p/PW_rawdata/go_obo/go.obo: fmt(1.2) rel(2023-11-15) 46,228 Terms
GO Enrichment Analysis >>

Load  Ontology Enrichment Analysis ...
  0%     64 of 244,176 population ite

In [11]:
#plot_go_terms(sucrose_term_count, top_n=10)
print("# <<< SUCROSE >>>")
print(f"# 10x3FoldCV mean score: {np.mean(sucrose_nXcv['mean_scores'])} \n# count common features: {len(sucrose_nXcv['common_features'])}")
tabulate(suc_term_count_biop)
print("# <<< SUCROSE >>>")
print(f"# 10x3FoldCV mean score: {np.mean(sucrose_nXcv['mean_scores'])} \n# count common features: {len(sucrose_nXcv['common_features'])}")
tabulate(suc_term_count_molf)
print("# <<< SUCROSE >>>")
print(f"# 10x3FoldCV mean score: {np.mean(sucrose_nXcv['mean_scores'])} \n# count common features: {len(sucrose_nXcv['common_features'])}")
tabulate(suc_term_count_cellc)


# <<< SUCROSE >>>
# 10x3FoldCV mean score: 0.3188916665994363 
# count common features: 8
count	GO ID	GO Term	namespace
1	GO:0006807	nitrogen compound metabolic process	biological_process
1	GO:0006974	DNA damage response	biological_process
1	GO:0043170	macromolecule metabolic process	biological_process
1	GO:0044238	primary metabolic process	biological_process
1	GO:0050794	regulation of cellular process	biological_process
1	GO:0006622	protein targeting to lysosome	biological_process
1	GO:0009306	protein secretion	biological_process
1	GO:0010324	membrane invagination	biological_process
1	GO:0010642	negative regulation of platelet-derived growth factor receptor signaling pathway	biological_process
1	GO:0016525	negative regulation of angiogenesis	biological_process
1	GO:0030948	negative regulation of vascular endothelial growth factor receptor signaling pathway	biological_process
1	GO:0043162	ubiquitin-dependent protein catabolic process via the multivesicular body sorting pathway	biologic

In [111]:
"""PRINT EACH GFF ENTRY AND GO-TERMS CORRESPONDING TO NO-ZERO FEATURES"""

#for ele in sucrose_nXcv_matched:
#    print(f"{sucrose_nXcv_matched[ele]}")
#    #print(glucose_goids[ele], "\n")
#    for id in sucrose_goids[ele]:
#        print(id, ": ", sucrose_goids[ele][id])

'PRINT EACH GFF ENTRY AND GO-TERMS CORRESPONDING TO NO-ZERO FEATURES'

### citric acid

In [12]:
with open("./lasso_models/10xKfold_lasso_output/citricAcid_nXcv.json", 'r') as file:
    citricAcid_nXcv = json.load(file)
#citricAcid_nXcv_matched, citricAcid_goids, citricAcid_term_count, citricAcid_goea = matchgff2(citricAcid_nXcv['common_features'], goea=True)
citricAcid_nXcv_matched, citA_goids_biop, citA_term_count_biop, citA_goea_biop = matchgff2(citricAcid_nXcv['common_features'], namespace=['biological_process'], depth_threshold=2, goea=True)
citricAcid_nXcv_matched, citA_goids_molf, citA_term_count_molf, citA_goea_molf = matchgff2(citricAcid_nXcv['common_features'], namespace=['molecular_function'], depth_threshold=2, goea=True)
citricAcid_nXcv_matched, citA_goids_cellc, citA_term_count_cellc, citA_goea_cellc = matchgff2(citricAcid_nXcv['common_features'], namespace=['cellular_component'], depth_threshold=2, goea=True)


/home/t44p/PW_rawdata/go_obo/go.obo: fmt(1.2) rel(2023-11-15) 46,228 Terms
GO Enrichment Analysis >>

Load  Ontology Enrichment Analysis ...
  0%    324 of 549,396 population items found in association

Runing  Ontology Analysis: current study set of 18 IDs.
100%     18 of     18 study items found in association
100%     18 of     18 study items found in population(549396)
Calculating 190 uncorrected p-values using fisher_scipy_stats
     190 terms are associated with     13 of 549,396 population items
     190 terms are associated with     13 of     18 study items
  METHOD fdr_bh:
     190 GO terms found significant (< 0.05=alpha) (190 enriched +   0 purified): statsmodels fdr_bh
      13 study items associated with significant GO IDs (enriched)
       0 study items associated with significant GO IDs (purified)
/home/t44p/PW_rawdata/go_obo/go.obo: fmt(1.2) rel(2023-11-15) 46,228 Terms
GO Enrichment Analysis >>

Load  Ontology Enrichment Analysis ...
  0%    324 of 549,396 population i

In [13]:

#plot_go_terms(citricAcid_term_count, top_n=10)
print("# <<< CITRIC ACID >>>")
print(f"# 10x3FoldCV mean score: {np.mean(citricAcid_nXcv['mean_scores'])} \n# count common features: {len(citricAcid_nXcv['common_features'])}")
tabulate(citA_term_count_biop)
print("# <<< CITRIC ACID >>>")
print(f"# 10x3FoldCV mean score: {np.mean(citricAcid_nXcv['mean_scores'])} \n# count common features: {len(citricAcid_nXcv['common_features'])}")
tabulate(citA_term_count_molf)
print("# <<< CITRIC ACID >>>")
print(f"# 10x3FoldCV mean score: {np.mean(citricAcid_nXcv['mean_scores'])} \n# count common features: {len(citricAcid_nXcv['common_features'])}")
tabulate(citA_term_count_cellc)

# <<< CITRIC ACID >>>
# 10x3FoldCV mean score: 0.22358901669508696 
# count common features: 18
count	GO ID	GO Term	namespace
2	GO:0009686	gibberellin biosynthetic process	biological_process
2	GO:0009826	unidimensional cell growth	biological_process
2	GO:0048235	pollen sperm cell differentiation	biological_process
2	GO:0055085	transmembrane transport	biological_process
1	GO:0009750	response to fructose	biological_process
1	GO:0090333	regulation of stomatal closure	biological_process
1	GO:0010182	sugar mediated signaling pathway	biological_process
1	GO:0007254	JNK cascade	biological_process
1	GO:0042733	embryonic digit morphogenesis	biological_process
1	GO:0001666	response to hypoxia	biological_process
1	GO:2000035	regulation of stem cell division	biological_process
1	GO:1904291	positive regulation of mitotic DNA damage checkpoint	biological_process
1	GO:0036289	peptidyl-serine autophosphorylation	biological_process
1	GO:0038066	p38MAPK cascade	biological_process
1	GO:1900424	regulation

In [114]:
"""PRINT EACH GFF ENTRY AND GO-TERMS CORRESPONDING TO NO-ZERO FEATURES"""

#for ele in citricAcid_nXcv_matched:
#    print(f"{citricAcid_nXcv_matched[ele]}")
#    #print(glucose_goids[ele], "\n")
#    for id in citricAcid_goids[ele]:
#        print(id, ": ", citricAcid_goids[ele][id])

'PRINT EACH GFF ENTRY AND GO-TERMS CORRESPONDING TO NO-ZERO FEATURES'

In [14]:
with open("./lasso_models/10xKfold_lasso_output/isoleucine_nXcv.json", 'r') as file:
    isoleucine_nXcv = json.load(file)
#isoleucine_nXcv_matched, isoleucine_goids, isoleucine_term_count, isoleucine_goea = matchgff2(isoleucine_nXcv['common_features'], goea=True) #['biological_process', 'molecular_function', 'cellular_component']
isoleucine_nXcv_matched, ile_goids_biop, ile_term_count_biop, ile_goea_biop = matchgff2(isoleucine_nXcv['common_features'], namespace=['biological_process', 'molecular_function'], depth_threshold=2, goea=True)
isoleucine_nXcv_matched, ile_goids_molf, ile_term_count_molf, ile_goea_molf = matchgff2(isoleucine_nXcv['common_features'], namespace=['molecular_function'], depth_threshold=2, goea=True)
isoleucine_nXcv_matched, ile_goids_cellc, ile_term_count_cellc, ile_goea_cellc = matchgff2(isoleucine_nXcv['common_features'], namespace=['cellular_component'], depth_threshold=2, goea=True)

/home/t44p/PW_rawdata/go_obo/go.obo: fmt(1.2) rel(2023-11-15) 46,228 Terms
GO Enrichment Analysis >>

Load  Ontology Enrichment Analysis ...
  0%     81 of 274,698 population items found in association

Runing  Ontology Analysis: current study set of 9 IDs.
100%      9 of      9 study items found in association
100%      9 of      9 study items found in population(274698)
Calculating 136 uncorrected p-values using fisher_scipy_stats
     136 terms are associated with      8 of 274,698 population items
     136 terms are associated with      8 of      9 study items
  METHOD fdr_bh:
     136 GO terms found significant (< 0.05=alpha) (136 enriched +   0 purified): statsmodels fdr_bh
       8 study items associated with significant GO IDs (enriched)
       0 study items associated with significant GO IDs (purified)
/home/t44p/PW_rawdata/go_obo/go.obo: fmt(1.2) rel(2023-11-15) 46,228 Terms
GO Enrichment Analysis >>

Load  Ontology Enrichment Analysis ...
  0%     81 of 274,698 population it

In [15]:
# plot_go_terms(isoleucine_term_count, top_n=10)
print("# <<< ISOLEUCINE >>>")
print(f"# 10x3FoldCV mean score: {np.mean(isoleucine_nXcv['mean_scores'])} \n# count common features: {len(isoleucine_nXcv['common_features'])}")
tabulate(ile_term_count_biop)
print("# <<< ISOLEUCINE >>>")
print(f"# 10x3FoldCV mean score: {np.mean(isoleucine_nXcv['mean_scores'])} \n# count common features: {len(isoleucine_nXcv['common_features'])}")
tabulate(ile_term_count_molf)
print("# <<< ISOLEUCINE >>>")
print(f"# 10x3FoldCV mean score: {np.mean(isoleucine_nXcv['mean_scores'])} \n# count common features: {len(isoleucine_nXcv['common_features'])}")
tabulate(ile_term_count_cellc)

# <<< ISOLEUCINE >>>
# 10x3FoldCV mean score: 0.07529144560758311 
# count common features: 9
count	GO ID	GO Term	namespace
2	GO:0000166	nucleotide binding	molecular_function
2	GO:0050832	defense response to fungus	biological_process
2	GO:0042803	protein homodimerization activity	molecular_function
2	GO:0004674	protein serine/threonine kinase activity	molecular_function
2	GO:0005515	protein binding	molecular_function
2	GO:0045893	positive regulation of DNA-templated transcription	biological_process
2	GO:0009737	response to abscisic acid	biological_process
1	GO:0042910	xenobiotic transmembrane transporter activity	molecular_function
1	GO:0032370	positive regulation of lipid transport	biological_process
1	GO:0055076	N/A	N/A
1	GO:0009791	post-embryonic development	biological_process
1	GO:0051173	positive regulation of nitrogen compound metabolic process	biological_process
1	GO:0006820	monoatomic anion transport	biological_process
1	GO:1901700	response to oxygen-containing compound	biologi

In [16]:
with open("./lasso_models/10xKfold_lasso_output/leucine_nXcv.json", 'r') as file:
    leucine_nXcv = json.load(file)
#leucine_nXcv_matched, leucine_goids, leucine_term_count, leucine_goea = matchgff2(leucine_nXcv['common_features'], goea=True)
leucine_nXcv_matched, leu_goids_biop, leu_term_count_biop, leu_goea_biop = matchgff2(leucine_nXcv['common_features'], namespace=['biological_process', 'molecular_function'], depth_threshold=2, goea=True)
leucine_nXcv_matched, leu_goids_molf, leu_term_count_molf, leu_goea_molf = matchgff2(leucine_nXcv['common_features'], namespace=['molecular_function'], depth_threshold=2, goea=True)
leucine_nXcv_matched, leu_goids_cellc, leu_term_count_cellc, leu_goea_cellc = matchgff2(leucine_nXcv['common_features'], namespace=['cellular_component'], depth_threshold=2, goea=True)


/home/t44p/PW_rawdata/go_obo/go.obo: fmt(1.2) rel(2023-11-15) 46,228 Terms
GO Enrichment Analysis >>

Load  Ontology Enrichment Analysis ...
  0%    225 of 457,830 population items found in association

Runing  Ontology Analysis: current study set of 15 IDs.
100%     15 of     15 study items found in association
100%     15 of     15 study items found in population(457830)
Calculating 119 uncorrected p-values using fisher_scipy_stats
     119 terms are associated with     11 of 457,830 population items
     119 terms are associated with     11 of     15 study items
  METHOD fdr_bh:
     119 GO terms found significant (< 0.05=alpha) (119 enriched +   0 purified): statsmodels fdr_bh
      11 study items associated with significant GO IDs (enriched)
       0 study items associated with significant GO IDs (purified)
/home/t44p/PW_rawdata/go_obo/go.obo: fmt(1.2) rel(2023-11-15) 46,228 Terms
GO Enrichment Analysis >>

Load  Ontology Enrichment Analysis ...
  0%    225 of 457,830 population i

In [118]:
# plot_go_terms(leucine_term_count, top_n=10)
print("# <<< LEUCINE >>>")
print(f"# 10x3FoldCV mean score: {np.mean(leucine_nXcv['mean_scores'])} \n# count common features: {len(leucine_nXcv['common_features'])}")
tabulate(leu_term_count_biop)
print("# <<< LEUCINE >>>")
print(f"# 10x3FoldCV mean score: {np.mean(leucine_nXcv['mean_scores'])} \n# count common features: {len(leucine_nXcv['common_features'])}")
tabulate(leu_term_count_molf)
tabulate(leu_term_count_cellc)

# <<< LEUCINE >>>
# 10x3FoldCV mean score: 0.39988802939102847 
# count common features: 15
count	GO ID	GO Term	namespace
6	GO:0005515	protein binding	molecular_function
3	GO:0004674	protein serine/threonine kinase activity	molecular_function
2	GO:0046777	protein autophosphorylation	biological_process
2	GO:0048364	root development	biological_process
2	GO:0000166	nucleotide binding	molecular_function
2	GO:0019722	calcium-mediated signaling	biological_process
2	GO:0009845	seed germination	biological_process
2	GO:0009738	abscisic acid-activated signaling pathway	biological_process
2	GO:0002237	response to molecule of bacterial origin	biological_process
2	GO:0009625	response to insect	biological_process
2	GO:0042742	defense response to bacterium	biological_process
2	GO:0050826	response to freezing	biological_process
1	GO:0016018	cyclosporin A binding	molecular_function
1	GO:0009742	brassinosteroid mediated signaling pathway	biological_process
1	GO:0000413	protein peptidyl-prolyl isomerizat

In [119]:
with open("./lasso_models/10xKfold_lasso_output/valine_nXcv.json", 'r') as file:
    valine_nXcv = json.load(file)
#valine_nXcv_matched, valine_goids, valine_term_count, valine_goea = matchgff2(valine_nXcv['common_features'], goea=True)
valine_nXcv_matched, val_goids_biop, val_term_count_biop, val_goea_biop = matchgff2(valine_nXcv['common_features'], namespace=['biological_process', 'molecular_function'], depth_threshold=2, goea=True)
valine_nXcv_matched, val_goids_molf, val_term_count_molf, val_goea_molf = matchgff2(valine_nXcv['common_features'], namespace=['molecular_function'], depth_threshold=2, goea=True)
valine_nXcv_matched, val_goids_cellc, val_term_count_cellc, val_goea_cellc = matchgff2(valine_nXcv['common_features'], namespace=['cellular_component'], depth_threshold=2, goea=True)

/home/t44p/PW_rawdata/go_obo/go.obo: fmt(1.2) rel(2023-11-15) 46,228 Terms
GO Enrichment Analysis >>

Load  Ontology Enrichment Analysis ...
  0%    169 of 396,786 population items found in association

Runing  Ontology Analysis: current study set of 13 IDs.
100%     13 of     13 study items found in association
100%     13 of     13 study items found in population(396786)
Calculating 186 uncorrected p-values using fisher_scipy_stats
     186 terms are associated with     12 of 396,786 population items
     186 terms are associated with     12 of     13 study items
  METHOD fdr_bh:
     186 GO terms found significant (< 0.05=alpha) (186 enriched +   0 purified): statsmodels fdr_bh
      12 study items associated with significant GO IDs (enriched)
       0 study items associated with significant GO IDs (purified)
/home/t44p/PW_rawdata/go_obo/go.obo: fmt(1.2) rel(2023-11-15) 46,228 Terms
GO Enrichment Analysis >>

Load  Ontology Enrichment Analysis ...
  0%    169 of 396,786 population i

In [120]:
#plot_go_terms(valine_term_count, top_n=10)
print("# <<< VALINE >>>")
print(f"# 10x3FoldCV mean score: {np.mean(valine_nXcv['mean_scores'])} \n# count common features: {len(valine_nXcv['common_features'])}")
tabulate(val_term_count_biop)
tabulate(val_term_count_molf)
tabulate(val_term_count_cellc)

# <<< VALINE >>>
# 10x3FoldCV mean score: 0.4303056564518039 
# count common features: 13
count	GO ID	GO Term	namespace
2	GO:0046777	protein autophosphorylation	biological_process
2	GO:0000166	nucleotide binding	molecular_function
2	GO:0016567	protein ubiquitination	biological_process
2	GO:0051787	misfolded protein binding	molecular_function
2	GO:0051015	actin filament binding	molecular_function
1	GO:0032491	detection of molecule of fungal origin	biological_process
1	GO:0008061	chitin binding	molecular_function
1	GO:0090333	regulation of stomatal closure	biological_process
1	GO:0048364	root development	biological_process
1	GO:0002752	cell surface pattern recognition receptor signaling pathway	biological_process
1	GO:0019722	calcium-mediated signaling	biological_process
1	GO:0071219	cellular response to molecule of bacterial origin	biological_process
1	GO:0050832	defense response to fungus	biological_process
1	GO:0015026	coreceptor activity	molecular_function
1	GO:0042803	protein homodi

In [121]:
with open("./lasso_models/10xKfold_lasso_output/rutin_nXcv.json", 'r') as file:
    rutin_nXcv = json.load(file)
#rutin_nXcv_matched, rutin_goids, rutin_term_count, rutin_goea = matchgff2(rutin_nXcv['common_features'], goea=True)
rutin_nXcv_matched, rut_goids_biop, rut_term_count_biop, rut_goea_biop = matchgff2(rutin_nXcv['common_features'], namespace=['biological_process', 'molecular_function'], depth_threshold=2, goea=True)
rutin_nXcv_matched, rut_goids_molf, rut_term_count_molf, rut_goea_molf = matchgff2(rutin_nXcv['common_features'], namespace=['molecular_function'], depth_threshold=2, goea=True)
rutin_nXcv_matched, rut_goids_cellc, rut_term_count_cellc, rut_goea_cellc = matchgff2(rutin_nXcv['common_features'], namespace=['cellular_component'], depth_threshold=2, goea=True)

/home/t44p/PW_rawdata/go_obo/go.obo: fmt(1.2) rel(2023-11-15) 46,228 Terms
GO Enrichment Analysis >>

Load  Ontology Enrichment Analysis ...
  0%    961 of 946,182 population items found in association

Runing  Ontology Analysis: current study set of 31 IDs.
100%     31 of     31 study items found in association
100%     31 of     31 study items found in population(946182)
Calculating 223 uncorrected p-values using fisher_scipy_stats
     223 terms are associated with     25 of 946,182 population items
     223 terms are associated with     25 of     31 study items
  METHOD fdr_bh:
     223 GO terms found significant (< 0.05=alpha) (223 enriched +   0 purified): statsmodels fdr_bh
      25 study items associated with significant GO IDs (enriched)
       0 study items associated with significant GO IDs (purified)
/home/t44p/PW_rawdata/go_obo/go.obo: fmt(1.2) rel(2023-11-15) 46,228 Terms
GO Enrichment Analysis >>

Load  Ontology Enrichment Analysis ...
  0%    961 of 946,182 population i

In [122]:

#plot_go_terms(rutin_term_count, top_n=10)
print("# <<< RUTIN >>>")
print(f"# 10x3FoldCV mean score: {np.mean(rutin_nXcv['mean_scores'])} \n# count common features: {len(rutin_nXcv['common_features'])}")
tabulate(rut_term_count_biop)
tabulate(rut_term_count_molf)
tabulate(rut_term_count_cellc)

# <<< RUTIN >>>
# 10x3FoldCV mean score: 0.7485866213502098 
# count common features: 31
count	GO ID	GO Term	namespace
6	GO:0005515	protein binding	molecular_function
4	GO:0042803	protein homodimerization activity	molecular_function
4	GO:0009733	response to auxin	biological_process
3	GO:0045892	negative regulation of DNA-templated transcription	biological_process
3	GO:0009414	response to water deprivation	biological_process
3	GO:0003700	DNA-binding transcription factor activity	molecular_function
3	GO:0045893	positive regulation of DNA-templated transcription	biological_process
3	GO:0009862	systemic acquired resistance, salicylic acid mediated signaling pathway	biological_process
2	GO:0003729	mRNA binding	molecular_function
2	GO:0048481	plant ovule development	biological_process
2	GO:0006096	glycolytic process	biological_process
2	GO:0009734	auxin-activated signaling pathway	biological_process
2	GO:0042542	response to hydrogen peroxide	biological_process
2	GO:0009723	response to ethyle

In [123]:
with open("./lasso_models/10xKfold_lasso_output/trehalose_nXcv.json", 'r') as file:
    trehalose_nXcv = json.load(file)
#trehalose_nXcv_matched, trehalose_goids, trehalose_term_count, trehalose_goea = matchgff2(trehalose_nXcv['common_features'], goea=True)
trehalose_nXcv_matched, tre_goids_biop, tre_term_count_biop, tre_goea_biop = matchgff2(trehalose_nXcv['common_features'], namespace=['biological_process', 'molecular_function'], depth_threshold=2, goea=True)
trehalose_nXcv_matched, tre_goids_molf, tre_term_count_molf, tre_goea_molf = matchgff2(trehalose_nXcv['common_features'], namespace=['molecular_function'], depth_threshold=2, goea=True)
trehalose_nXcv_matched, tre_goids_cellc, tre_term_count_cellc, tre_goea_cellc = matchgff2(trehalose_nXcv['common_features'], namespace=['cellular_component'], depth_threshold=2, goea=True)

/home/t44p/PW_rawdata/go_obo/go.obo: fmt(1.2) rel(2023-11-15) 46,228 Terms
GO Enrichment Analysis >>

Load  Ontology Enrichment Analysis ...
  0%    196 of 427,308 population items found in association

Runing  Ontology Analysis: current study set of 14 IDs.
100%     14 of     14 study items found in association
100%     14 of     14 study items found in population(427308)
Calculating 139 uncorrected p-values using fisher_scipy_stats
     139 terms are associated with     10 of 427,308 population items
     139 terms are associated with     10 of     14 study items
  METHOD fdr_bh:
     139 GO terms found significant (< 0.05=alpha) (139 enriched +   0 purified): statsmodels fdr_bh
      10 study items associated with significant GO IDs (enriched)
       0 study items associated with significant GO IDs (purified)
/home/t44p/PW_rawdata/go_obo/go.obo: fmt(1.2) rel(2023-11-15) 46,228 Terms
GO Enrichment Analysis >>

Load  Ontology Enrichment Analysis ...
  0%    196 of 427,308 population i

In [124]:
#plot_go_terms(trehalose_term_count, top_n=10)
print("# <<< TREHALOSE >>>")
print(f"# 10x3FoldCV mean score: {np.mean(trehalose_nXcv['mean_scores'])} \n# count common features: {len(trehalose_nXcv['common_features'])}")

tabulate(tre_term_count_biop)
tabulate(tre_term_count_molf)
tabulate(tre_term_count_cellc)

# <<< TREHALOSE >>>
# 10x3FoldCV mean score: 0.7162380753151226 
# count common features: 14
count	GO ID	GO Term	namespace
3	GO:0005515	protein binding	molecular_function
2	GO:0042127	regulation of cell population proliferation	biological_process
1	GO:0006807	nitrogen compound metabolic process	biological_process
1	GO:0006974	DNA damage response	biological_process
1	GO:0043170	macromolecule metabolic process	biological_process
1	GO:0044238	primary metabolic process	biological_process
1	GO:0050794	regulation of cellular process	biological_process
1	GO:0003676	nucleic acid binding	molecular_function
1	GO:0034051	negative regulation of plant-type hypersensitive response	biological_process
1	GO:0050832	defense response to fungus	biological_process
1	GO:0061057	peptidoglycan recognition protein signaling pathway	biological_process
1	GO:0061630	ubiquitin protein ligase activity	molecular_function
1	GO:0070628	proteasome binding	molecular_function
1	GO:0061631	ubiquitin conjugating enzyme act

In [125]:
with open("./lasso_models/10xKfold_lasso_output/oxGlut_nXcv.json", 'r') as file:
    oxGlut_nXcv = json.load(file)
#oxGlut_nXcv_matched, oxGlut_goids, oxGlut_term_count, oxGlut_goea = matchgff2(oxGlut_nXcv['common_features'], goea=True)
oxGlut_nXcv_matched, oxGlut_goids_biop, oxGlut_term_count_biop, oxGlut_goea_biop = matchgff2(oxGlut_nXcv['common_features'], namespace=['biological_process', 'molecular_function'], depth_threshold=2, goea=True)
oxGlut_nXcv_matched, oxGlut_goids_molf, oxGlut_term_count_molf, oxGlut_goea_molf = matchgff2(oxGlut_nXcv['common_features'], namespace=['molecular_function'], depth_threshold=2, goea=True)
oxGlut_nXcv_matched, oxGlut_goids_cellc, oxGlut_term_count_cellc, oxGlut_goea_cellc = matchgff2(oxGlut_nXcv['common_features'], namespace=['cellular_component'], depth_threshold=2, goea=True)

/home/t44p/PW_rawdata/go_obo/go.obo: fmt(1.2) rel(2023-11-15) 46,228 Terms
GO Enrichment Analysis >>

Load  Ontology Enrichment Analysis ...
  0%  1,024 of 976,704 population items found in association

Runing  Ontology Analysis: current study set of 32 IDs.
100%     32 of     32 study items found in association
100%     32 of     32 study items found in population(976704)
Calculating 244 uncorrected p-values using fisher_scipy_stats
     244 terms are associated with     23 of 976,704 population items
     244 terms are associated with     23 of     32 study items
  METHOD fdr_bh:
     244 GO terms found significant (< 0.05=alpha) (244 enriched +   0 purified): statsmodels fdr_bh
      23 study items associated with significant GO IDs (enriched)
       0 study items associated with significant GO IDs (purified)
/home/t44p/PW_rawdata/go_obo/go.obo: fmt(1.2) rel(2023-11-15) 46,228 Terms
GO Enrichment Analysis >>

Load  Ontology Enrichment Analysis ...
  0%  1,024 of 976,704 population i

In [126]:
#plot_go_terms(oxGlut_term_count, top_n=10)
print("# <<< OXIDIZED GLUTATHIONE >>>")
print(f"# 10x3FoldCV mean score: {np.mean(oxGlut_nXcv['mean_scores'])} \n# count common features: {len(oxGlut_nXcv['common_features'])}")
tabulate(oxGlut_term_count_biop)
tabulate(oxGlut_term_count_molf)
tabulate(oxGlut_term_count_cellc)

# <<< OXIDIZED GLUTATHIONE >>>
# 10x3FoldCV mean score: 0.657258809998042 
# count common features: 32
count	GO ID	GO Term	namespace
10	GO:0005515	protein binding	molecular_function
4	GO:0009414	response to water deprivation	biological_process
3	GO:0000166	nucleotide binding	molecular_function
3	GO:0003729	mRNA binding	molecular_function
2	GO:0010970	transport along microtubule	biological_process
2	GO:0050794	regulation of cellular process	biological_process
2	GO:0009409	response to cold	biological_process
2	GO:0009651	response to salt stress	biological_process
2	GO:0009737	response to abscisic acid	biological_process
2	GO:0000978	RNA polymerase II cis-regulatory region sequence-specific DNA binding	molecular_function
2	GO:0010228	vegetative to reproductive phase transition of meristem	biological_process
2	GO:0016310	phosphorylation	biological_process
2	GO:0008270	zinc ion binding	molecular_function
2	GO:0031425	chloroplast RNA processing	biological_process
2	GO:0080156	mitochondrial m

# GO Enrichment Analysis

### `GOEnrichmentStudy` object in `goatools`, key arguments:

1. **`pop`**:
   - The population parameter represents all possible items that could be sampled. In the context of GO enrichment, it's typically the list of all gene IDs from your background set.

2. **`assoc`**:
   - The association between your items (genes) and the groups (GO terms). This is a dictionary where keys are gene IDs and values are sets of GO IDs.

3. **`obo_dag`**:
   - An object representing the GO terms and their hierarchical relationships, usually obtained by parsing the GO OBO file with `obo_parser.GODag()`.

4. **`propagate_counts`** (optional, default `True`):
   - If `True`, the gene counts will be propagated up the GO DAG, so that each term includes counts for all descendant terms.

5. **`alpha`** (optional, default `0.05`):
   - The significance level for the statistical tests.

6. **`methods`** (optional):
   - The methods for multiple test correction, such as `['fdr_bh']` for Benjamini-Hochberg.

7. **`pvalcalc`** (optional):
   - The object for calculating p-values. By default, `goatools` uses Fisher's exact test.

8. **`min_overlap`** (optional, default `0.7`):
   - The minimum overlap between the study group and the GO term.

9. **`prunetree`** (optional, default `True`):
   - If `True`, GO terms without any associated genes in the population will be pruned from the analysis.

10. **`go2geneids`** (optional):
    - A dictionary mapping GO IDs to gene IDs. This can be used instead of `assoc` to provide associations.

11. **`geneid2symbol`** (optional):
    - A dictionary mapping gene IDs to gene symbols, which can be useful for labeling output.

12. **`evidence_set`** (optional):
    - A set of evidence codes to use. If not specified, all evidence codes will be included.

13. **`relationships`** (optional):
    - A set of extra relationship types to include when building the GO DAG, such as `'part_of'` or `'regulates'`.

14. **`keep_if_no_id`** (optional, default `False`):
    - If `True`, genes without GO IDs in the `assoc` will be kept in the population.

15. **`study`**:
    - A list of items (genes) for which you want to check enrichment. This is typically the list of genes corresponding to your non-zero features.


The `GOEnrichmentStudy` class in the `goatools` library key attributes:

1. **`goterm`**:
   - This attribute is an object representing the GO term itself. It contains further details about the term, like its name, namespace, and description.

2. **`p_uncorrected`**:
   - The raw, uncorrected p-value resulting from the enrichment test (e.g., Fisher's exact test).

3. **`p_fdr_bh`**:
   - The p-value adjusted for multiple testing using the Benjamini-Hochberg false discovery rate method.

4. **`enrichment`**:
   - This indicates whether the GO term is overrepresented (`'e'` for enriched) or underrepresented (`'p'` for purified) in the study set compared to the background.

5. **`ratio_in_study`**:
   - A tuple showing the number of genes in the study set that are annotated with this GO term and the total number of genes in the study set.

6. **`ratio_in_pop`**:
   - A tuple showing the number of genes in the background population that are annotated with this GO term and the total number of genes in the background population.

7. **`study_items`**:
   - A list of genes from your study set that are annotated with this GO term.

8. **`study_count`**:
   - The number of genes in your study set that are annotated with this GO term.

9. **`pop_count`**:
   - The number of genes in the background population that are annotated with this GO term.

10. **`study_n`**:
    - The total number of genes in your study set.

11. **`pop_n`**:
    - The total number of genes in the background population.

12. **`name`** (if available):
    - The name of the GO term.

13. **`namespace`** (if available):
    - The namespace/category of the GO term, such as biological_process, Molecular Function, or Cellular Component.

14. **`depth`** (if available):
    - The depth of the GO term in the GO hierarchy.

15. **`is_obsolete`** (if available):
    - Indicates whether the GO term is obsolete.

Remember, the availability of some of these attributes might depend on the specifics of how `GOEnrichmentStudy` is set up and run, and the version of `goatools` you are using. These attributes provide a comprehensive view of each GO term's enrichment analysis results, allowing for detailed interpretation and further analysis.

https://github.com/tanghaibao/goatools/blob/main/notebooks/goea_nbt3102.ipynb

In [None]:
glucose_goea_sig = [r for r in glucose_goea if r.p_fdr_bh < 0.05]
citricAcid_goea_sig = [r for r in citricAcid_goea if r.p_fdr_bh < 0.05]
sucrose_goea_sig = [r for r in sucrose_goea if r.p_fdr_bh < 0.05]
print(f"{len(glucose_goea_sig), len(citricAcid_goea_sig), len(sucrose_goea_sig)}")

(236, 190, 70)
