In [64]:
# use kernel py3-6
#import goatools
from goatools import obo_parser
import re
import json
import numpy as np
from goatools.go_enrichment import GOEnrichmentStudy


# Function to match non-zero Features in GFF and Map GO-Term to GO-ID using GOATools

In [65]:
def matchgff2(feature, gff_file='/home/t44p/PW_rawdata/Transciptome_GenomeAnnotation/Xele_annotated2_gff_export2.gff', obo_path="/home/t44p/PW_rawdata/go_obo/go.obo", goea=False):
    """
    Searches a GFF (General Feature Format) file for specific features and retrieves associated Gene Ontology (GO) terms.

    Parameters:
    ----------
    feature : list or iterable
        An iterable of strings representing the features to search for in the GFF file.

    gff_file : str, optional
        The file path to the GFF file. Defaults to '/home/t44p/PW_rawdata/Transciptome_GenomeAnnotation/Xele_annotated2_gff_export2.gff'.

    obo_path : str, optional
        The file path to the Gene Ontology .obo file. Defaults to "/home/t44p/PW_rawdata/go_obo/combined.obo".

    Returns:
    -------
    tuple of dict
        A tuple containing two dictionaries:
        1. A dictionary where keys are the features and values are lists of lines (as strings) from the GFF file where these features are found.
        2. A dictionary where keys are the features and values are dictionaries. Each inner dictionary has GO IDs as keys and a dictionary with 'name' and 'namespace' of the GO term as values.

    Example:
    --------
    #>>> features = ['gene1', 'gene2']
    #>>> matchgff2(features)
    ({'gene1': ['line content from GFF file'], 'gene2': ['line content from GFF file']},
     {'gene1': {'GO:0000001': {'name': 'term name', 'namespace': 'biological_process'}}, ... })

    Notes:
    -----
    The function uses regular expressions for precise matching of features. It expects the feature followed by a tab character in the GFF file.
    Additionally, it extracts GO IDs from matched lines and retrieves their corresponding names and namespaces from the provided Gene Ontology .obo file.
    """
    with open(gff_file, 'r') as file:
        go_ontology = obo_parser.GODag(obo_path)
        
        lines_where_feat_found = {}
        go_ids = {}
        background_genes = []

        for feat in feature:
            file.seek(0)  # reset file pointer to the beginning for each feature
            lines_where_feat_found[feat] = []
            go_ids[feat] = {}
            pattern = re.compile(re.escape(feat) + r'\t')  # exact match followed by a tab
            for line in file:
                if goea:
                    if not line.lstrip().startswith('#'):
                        background_genes.append(line.split('\t')[0])

                if pattern.search(line):
                    lines_where_feat_found[feat].append(line.strip())  # Store the line (as a string) if feature is found
                    # Extract GO id
                    match = re.search(r"Ontology_id=([GO:\d,]+)", line.strip())
                    if match:
                        ids = match.group(1).split(',')
                        # Map Terms to Ids
                        for id in ids:
                            term = go_ontology.get(id)
                            if term is not None:
                                go_ids[feat][id] = {'name': term.name, 'namespace': term.namespace}
                                #print(id, ": ", term.name, term.namespace)
                            else:
                                go_ids[feat][id] = {'name': None, 'namespace': None}
                                #print(id, ":", "None")
        if goea:
            print("GO Enrichment Analysis >>")
            goea_obj = GOEnrichmentStudy(
                background_genes,
                go_ids,  # This needs to be a dict mapping gene IDs to a set of GO IDs
                go_ontology,
                propagate_counts=False,
                alpha=0.05,  # significance level for the statistical test
                methods=['fdr_bh']  # correction method for multiple testing
            )
            goea_result = goea_obj.run_study(go_ids.keys())
            return lines_where_feat_found, go_ids, goea_result


        return lines_where_feat_found, go_ids

### Glucose 

In [66]:
with open("./models/glucose_nXcv.json", 'r') as file:
    glucose_nXcv = json.load(file)

print("<<< GLUCOSE >>>")
print(f"mapping GO terms to GO id's\nto common non-zero features over 10x 3fold crossvalidation ")
print("count common features: ", len(glucose_nXcv['common_features']))
glucose_nXcv_matched, glucose_goids, glucose_goea = matchgff2(glucose_nXcv['common_features'], goea=True)

<<< GLUCOSE >>>
mapping GO terms to GO id's
to common non-zero features over 10x 3fold crossvalidation 
count common features:  26


/home/t44p/PW_rawdata/go_obo/go.obo: fmt(1.2) rel(2023-11-15) 46,228 Terms
GO Enrichment Analysis >>

Load  Ontology Enrichment Analysis ...
  0%    676 of 793,572 population items found in association

Runing  Ontology Analysis: current study set of 26 IDs.
100%     26 of     26 study items found in association
100%     26 of     26 study items found in population(793572)
Calculating 236 uncorrected p-values using fisher_scipy_stats
     236 terms are associated with     20 of 793,572 population items
     236 terms are associated with     20 of     26 study items
  METHOD fdr_bh:
     236 GO terms found significant (< 0.05=alpha) (236 enriched +   0 purified): statsmodels fdr_bh
      20 study items associated with significant GO IDs (enriched)
       0 study items associated with significant GO IDs (purified)


In [67]:
for ele in glucose_nXcv_matched:
    print(f"{glucose_nXcv_matched[ele]}")
    #print(glucose_goids[ele], "\n")
    for id in glucose_goids[ele]:
        print(id, ": ", glucose_goids[ele][id])

['Xele.ptg000012l.772\tBlast2GO\tCDS\t1\t424\t.\t.\t.\t"ID=Xele.ptg000012l.772_1;Description=RecName: Full=Probable purple acid phosphatase 20; Flags: Precursor;Gene=PAP18;Gene=PAP19;Gene=PAP25;Gene=P80366;Gene=PAP26;Gene=PAP15;Gene=PAP12;Gene=PAP23;Gene=PAP13;Gene=PAP6;Gene=PAP5;Gene=PAP2;Gene=ACPEPP;Gene=PAP1;Gene=PAP10;Gene=PAP21;Gene=PAP11;Gene=PAP22;Gene=PAP20;Gene=PAP;Ontology_id=GO:0005615,GO:0005773,GO:0003993,GO:0004721,GO:0008199,GO:0008270,GO:0030145,GO:0016036,GO:0016311,GO:0032501,GO:0050829,GO:0050832,GO:0051174,GO:0055062;Enzyme_code=EC:3.1.3.2;Enzyme_name=Acid phosphatase"']
GO:0005615 :  {'name': 'extracellular space', 'namespace': 'cellular_component'}
GO:0005773 :  {'name': 'vacuole', 'namespace': 'cellular_component'}
GO:0003993 :  {'name': 'acid phosphatase activity', 'namespace': 'molecular_function'}
GO:0004721 :  {'name': 'phosphoprotein phosphatase activity', 'namespace': 'molecular_function'}
GO:0008199 :  {'name': 'ferric iron binding', 'namespace': 'molecula

In [68]:
gluc_pw = []
with open("/home/t44p/PW_rawdata/gene_list_glu_tca_suc/glucose.csv", 'r') as file:
    for line in file:
        #print(f"before {line}")
        line = str(line).replace('x', 'X', 1)
        #print(f"after {line}")
        gluc_pw.append(line.strip())
#print(f"gluc pw {gluc_pw}")
gluc_intersect = list(set(glucose_nXcv_matched.keys()) & set(gluc_pw))
print(f"intersection between gluc pw and matched non-zero features:>>>\n {gluc_intersect}")
for gene in gluc_intersect:
    print(f"GFF entry >>>\n {glucose_nXcv_matched[gene]}")
    for goid in glucose_goids[gene]:
        print(f"{goid} {glucose_goids[gene][goid]}")



intersection between gluc pw and matched non-zero features:>>>
 ['Xele.ptg000013l.792']
GFF entry >>>
 ['Xele.ptg000013l.792\tBlast2GO\tCDS\t1\t337\t.\t.\t.\t"ID=Xele.ptg000013l.792_1;Description=RecName: Full=Glyceraldehyde-3-phosphate dehydrogenase, cytosolic;Gene=GAPC3;Gene=Q39769;Gene=GAPC2;Gene=GAPC1;Gene=P34783;Gene=GAPC;Ontology_id=GO:0005634,GO:0005740,GO:0005829,GO:0032991,GO:0003677,GO:0004365,GO:0005515,GO:0042301,GO:0050661,GO:0070403,GO:0006094,GO:0006096,GO:0009408,GO:0009416,GO:0010154,GO:0034059,GO:0045893,GO:0046686;Enzyme_code=EC:1.2.1.59,EC:1.2.1.12;Enzyme_name=Glyceraldehyde-3-phosphate dehydrogenase (NAD(P)(+)) (phosphorylating),Glyceraldehyde-3-phosphate dehydrogenase (phosphorylating)"']
GO:0005634 {'name': 'nucleus', 'namespace': 'cellular_component'}
GO:0005740 {'name': 'mitochondrial envelope', 'namespace': 'cellular_component'}
GO:0005829 {'name': 'cytosol', 'namespace': 'cellular_component'}
GO:0032991 {'name': 'protein-containing complex', 'namespace': 'cel

### Sucrose

In [69]:
with open("./models/sucrose_nXcv.json", 'r') as file:
    sucrose_nXcv = json.load(file)

print("<<< SUCROSE >>>")
print(f"mapping GO terms to GO id's\nto common non-zero features over 10x 3fold crossvalidation ")
print("count common features: ", len(sucrose_nXcv['common_features']))
sucrose_nXcv_matched, sucrose_goids, sucrose_goea = matchgff2(sucrose_nXcv['common_features'], goea=True)


<<< SUCROSE >>>
mapping GO terms to GO id's
to common non-zero features over 10x 3fold crossvalidation 
count common features:  8
/home/t44p/PW_rawdata/go_obo/go.obo: fmt(1.2) rel(2023-11-15) 46,228 Terms
GO Enrichment Analysis >>

Load  Ontology Enrichment Analysis ...
  0%     64 of 244,176 population items found in association

Runing  Ontology Analysis: current study set of 8 IDs.
100%      8 of      8 study items found in association
100%      8 of      8 study items found in population(244176)
Calculating 70 uncorrected p-values using fisher_scipy_stats
      70 terms are associated with      7 of 244,176 population items
      70 terms are associated with      7 of      8 study items
  METHOD fdr_bh:
      70 GO terms found significant (< 0.05=alpha) ( 70 enriched +   0 purified): statsmodels fdr_bh
       7 study items associated with significant GO IDs (enriched)
       0 study items associated with significant GO IDs (purified)


In [70]:
for ele in sucrose_nXcv_matched:
    print(f"{sucrose_nXcv_matched[ele]}")
    #print(glucose_goids[ele], "\n")
    for id in sucrose_goids[ele]:
        print(id, ": ", sucrose_goids[ele][id])

['Xele.ptg000012l.303\tBlast2GO\tCDS\t1\t308\t.\t.\t.\t"ID=Xele.ptg000012l.303_1;Description=RecName: Full=Phosphoglycerate mutase-like protein AT74; Short=At-74;Gene=DET1;Gene=At3g05170;Gene=At1g08940;Ontology_id=GO:0016791,GO:0005975,GO:0009987;Enzyme_code=EC:3.1.3;Enzyme_name=Acting on ester bonds"']
GO:0016791 :  {'name': 'phosphatase activity', 'namespace': 'molecular_function'}
GO:0005975 :  {'name': 'carbohydrate metabolic process', 'namespace': 'biological_process'}
GO:0009987 :  {'name': 'cellular process', 'namespace': 'biological_process'}
['Xele.ptg000068l.7\tBlast2GO\tCDS\t1\t283\t.\t.\t.\t"ID=Xele.ptg000068l.7_1;Description=RecName: Full=Lateral signaling target protein 2 homolog;Gene=CG6051;Gene=GI24295;Gene=GJ23073;Gene=GE10583;Gene=CPIJ004116;Gene=HGS;Gene=Hgs;Gene=AAEL005241;Gene=GF22946;Gene=GM10129;Gene=GK22512;Gene=VPS27;Gene=GG12136;Ontology_id=GO:0005764,GO:0005769,GO:0005774,GO:0005829,GO:0010008,GO:0033565,GO:0019904,GO:0032266,GO:0042802,GO:0043130,GO:0044389,

In [71]:
suc_pw = []
with open("/home/t44p/PW_rawdata/gene_list_glu_tca_suc/sucrose.csv", 'r') as file:
    for line in file:
        #print(f"before {line}")
        #line = str(line).replace('x', 'X', 1)
        #print(f"after {line}")
        suc_pw.append(line.strip())

suc_intersect = list(set(sucrose_nXcv_matched.keys()) & set(suc_pw))
print(f"intersection between suc pw and matched non-zero features:>>>\n {suc_intersect}")
for gene in suc_intersect:
    print(f"GFF entry >>>\n {sucrose_nXcv_matched[gene]}")
    for goid in sucrose_goids[gene]:
        print(f"{goid} {sucrose_goids[gene][goid]}")


intersection between suc pw and matched non-zero features:>>>
 []


### citric acid

In [72]:
with open("./models/citricAcid_nXcv.json", 'r') as file:
    citricAcid_nXcv = json.load(file)


print("<<< CITRIC ACID >>>")
print(f"mapping GO terms to GO id's\nto common non-zero features over 10x 3fold crossvalidation ")
print("count common features: ", len(citricAcid_nXcv['common_features']))
citricAcid_nXcv_matched, citricAcid_goids, citricAcid_goea = matchgff2(citricAcid_nXcv['common_features'], goea=True)


<<< CITRIC ACID >>>
mapping GO terms to GO id's
to common non-zero features over 10x 3fold crossvalidation 
count common features:  18
/home/t44p/PW_rawdata/go_obo/go.obo: fmt(1.2) rel(2023-11-15) 46,228 Terms
GO Enrichment Analysis >>

Load  Ontology Enrichment Analysis ...
  0%    324 of 549,396 population items found in association

Runing  Ontology Analysis: current study set of 18 IDs.
100%     18 of     18 study items found in association
100%     18 of     18 study items found in population(549396)
Calculating 190 uncorrected p-values using fisher_scipy_stats
     190 terms are associated with     13 of 549,396 population items
     190 terms are associated with     13 of     18 study items
  METHOD fdr_bh:
     190 GO terms found significant (< 0.05=alpha) (190 enriched +   0 purified): statsmodels fdr_bh
      13 study items associated with significant GO IDs (enriched)
       0 study items associated with significant GO IDs (purified)


In [73]:
for ele in citricAcid_nXcv_matched:
    print(f"{citricAcid_nXcv_matched[ele]}")
    #print(glucose_goids[ele], "\n")
    for id in citricAcid_goids[ele]:
        print(id, ": ", citricAcid_goids[ele][id])

['Xele.ptg000020l.540\tBlast2GO\tCDS\t1\t326\t.\t.\t.\t"ID=Xele.ptg000020l.540_1;Description=UniRef90_A0A6V7NL77Uncharacterized protein n=1 Tax=Ananas comosus var. bracteatus TaxID=296719 RepID=A0A6V7NL77_ANACO;Gene=C2845_PM03G09980;Gene=LOC104605637;Gene=AMTR_s00003p00261050;Ontology_id=GO:0035267,GO:0016573"']
GO:0035267 :  {'name': 'NuA4 histone acetyltransferase complex', 'namespace': 'cellular_component'}
GO:0016573 :  {'name': 'histone acetylation', 'namespace': 'biological_process'}
['Xele.ptg000055l.154\tBlast2GO\tCDS\t1\t129\t.\t.\t.\t"ID=Xele.ptg000055l.154_1;Description=RecName: Full=Cysteine desulfurase 1, chloroplastic; AltName: Full=NIFS-like protein 1; Short=CpNifS1; AltName: Full=Plastid sufS-like protein; AltName: Full=Protein AtCpNifS; AltName: Full=Selenocysteine lyase; Flags: Precursor;Gene=NFS2;Ontology_id=GO:0009570,GO:0009000,GO:0030170,GO:0031071,GO:0006534,GO:0010269;Enzyme_code=EC:2.8.1.7,EC:4.4.1.16;Enzyme_name=Cysteine desulfurase,Selenocysteine lyase"']
GO:

In [74]:
tca_pw = []
with open("/home/t44p/PW_rawdata/gene_list_glu_tca_suc/sucrose.csv", 'r') as file:
    for line in file:
        tca_pw.append(line.strip())
#print(f"{set(citricAcid_nXcv_matched.keys())} \n {set(tca_pw)}")
cit_intersect = list(set(citricAcid_nXcv_matched.keys()) & set(tca_pw))
print(f"intersection between tca pw and matched non-zero features:>>>\n {cit_intersect}")
for gene in cit_intersect:
    print(f"GFF entry >>>\n {citricAcid_nXcv_matched[gene]}")
    for goid in citricAcid_goids[gene]:
        print(f"{goid} {citricAcid_goids[gene][goid]}")


intersection between tca pw and matched non-zero features:>>>
 []


In [75]:
print(glucose_nXcv.keys())
print("10x3FoldCV mean score >>> \n ","glucose ", np.mean(glucose_nXcv['mean_scores']), "sucrose ", np.mean(sucrose_nXcv['mean_scores']) ,"citricAcid ", np.mean(citricAcid_nXcv['mean_scores']))

dict_keys(['random_state', 'scores', 'mean_scores', 'common_features', 'model'])
10x3FoldCV mean score >>> 
  glucose  0.37702307897847137 sucrose  0.31889166659943624 citricAcid  0.22358901669508696


# GO Enrichment Analysis

### `GOEnrichmentStudy` object in `goatools`, key arguments:

1. **`pop`**:
   - The population parameter represents all possible items that could be sampled. In the context of GO enrichment, it's typically the list of all gene IDs from your background set.

2. **`assoc`**:
   - The association between your items (genes) and the groups (GO terms). This is a dictionary where keys are gene IDs and values are sets of GO IDs.

3. **`obo_dag`**:
   - An object representing the GO terms and their hierarchical relationships, usually obtained by parsing the GO OBO file with `obo_parser.GODag()`.

4. **`propagate_counts`** (optional, default `True`):
   - If `True`, the gene counts will be propagated up the GO DAG, so that each term includes counts for all descendant terms.

5. **`alpha`** (optional, default `0.05`):
   - The significance level for the statistical tests.

6. **`methods`** (optional):
   - The methods for multiple test correction, such as `['fdr_bh']` for Benjamini-Hochberg.

7. **`pvalcalc`** (optional):
   - The object for calculating p-values. By default, `goatools` uses Fisher's exact test.

8. **`min_overlap`** (optional, default `0.7`):
   - The minimum overlap between the study group and the GO term.

9. **`prunetree`** (optional, default `True`):
   - If `True`, GO terms without any associated genes in the population will be pruned from the analysis.

10. **`go2geneids`** (optional):
    - A dictionary mapping GO IDs to gene IDs. This can be used instead of `assoc` to provide associations.

11. **`geneid2symbol`** (optional):
    - A dictionary mapping gene IDs to gene symbols, which can be useful for labeling output.

12. **`evidence_set`** (optional):
    - A set of evidence codes to use. If not specified, all evidence codes will be included.

13. **`relationships`** (optional):
    - A set of extra relationship types to include when building the GO DAG, such as `'part_of'` or `'regulates'`.

14. **`keep_if_no_id`** (optional, default `False`):
    - If `True`, genes without GO IDs in the `assoc` will be kept in the population.

15. **`study`**:
    - A list of items (genes) for which you want to check enrichment. This is typically the list of genes corresponding to your non-zero features.


The `GOEnrichmentStudy` class in the `goatools` library key attributes:

1. **`goterm`**:
   - This attribute is an object representing the GO term itself. It contains further details about the term, like its name, namespace, and description.

2. **`p_uncorrected`**:
   - The raw, uncorrected p-value resulting from the enrichment test (e.g., Fisher's exact test).

3. **`p_fdr_bh`**:
   - The p-value adjusted for multiple testing using the Benjamini-Hochberg false discovery rate method.

4. **`enrichment`**:
   - This indicates whether the GO term is overrepresented (`'e'` for enriched) or underrepresented (`'p'` for purified) in the study set compared to the background.

5. **`ratio_in_study`**:
   - A tuple showing the number of genes in the study set that are annotated with this GO term and the total number of genes in the study set.

6. **`ratio_in_pop`**:
   - A tuple showing the number of genes in the background population that are annotated with this GO term and the total number of genes in the background population.

7. **`study_items`**:
   - A list of genes from your study set that are annotated with this GO term.

8. **`study_count`**:
   - The number of genes in your study set that are annotated with this GO term.

9. **`pop_count`**:
   - The number of genes in the background population that are annotated with this GO term.

10. **`study_n`**:
    - The total number of genes in your study set.

11. **`pop_n`**:
    - The total number of genes in the background population.

12. **`name`** (if available):
    - The name of the GO term.

13. **`namespace`** (if available):
    - The namespace/category of the GO term, such as Biological Process, Molecular Function, or Cellular Component.

14. **`depth`** (if available):
    - The depth of the GO term in the GO hierarchy.

15. **`is_obsolete`** (if available):
    - Indicates whether the GO term is obsolete.

Remember, the availability of some of these attributes might depend on the specifics of how `GOEnrichmentStudy` is set up and run, and the version of `goatools` you are using. These attributes provide a comprehensive view of each GO term's enrichment analysis results, allowing for detailed interpretation and further analysis.

https://github.com/tanghaibao/goatools/blob/main/notebooks/goea_nbt3102.ipynb

In [103]:
glucose_goea_sig = [r for r in glucose_goea if r.p_fdr_bh < 0.05]


In [110]:
from goatools.godag_plot import plot_gos, plot_results, plot_goid2goobj
plot_results("/home/t44p/OneDrive/Dokumente/WiSe2324/ProjectWork/x.elegans/", glucose_goea_sig)

FileNotFoundError: [Errno 2] "dot" not found in path.