In [2]:
# use kernel py3-6
import json
import re

import goatools
from goatools import obo_parser

# Function to match non-zero features in the GFF and map their GO IDs to GO terms using GOATools

In [67]:
def matchgff2(feature, gff_file='/home/t44p/PW_rawdata/Transciptome_GenomeAnnotation/Xele_annotated2_gff_export2.gff', obo_path="/home/t44p/PW_rawdata/go_obo/combined.obo"):
    """
    Find the GFF lines belonging to the given features and look up the GO terms annotated on them.

    Parameters:
    ----------
    feature : str or iterable of str
        Feature identifiers to look for. A single string is treated as one
        feature (not as an iterable of characters). Duplicates are collapsed.

    gff_file : str, optional
        Path to the tab-separated GFF file to scan.

    obo_path : str, optional
        Path to the Gene Ontology .obo file handed to ``obo_parser.GODag``.

    Returns:
    -------
    tuple of dict
        1. ``{feature: [line, ...]}`` - the whitespace-stripped GFF lines on which
           the feature occurs, in file order. Features without a hit map to ``[]``.
        2. ``{feature: {go_id: {'name': ..., 'namespace': ...}}}`` - GO IDs taken
           from the ``Ontology_id=`` attribute of the matched lines. IDs that are
           not present in the loaded ontology get ``None`` for both values.

    Example:
    --------
    >>> lines, terms = matchgff2(['gene1', 'gene2'])   # doctest: +SKIP
    >>> terms['gene1']                                 # doctest: +SKIP
    {'GO:0000001': {'name': 'term name', 'namespace': 'biological_process'}}

    Notes:
    -----
    * A feature matches only when it is equal to a complete tab-delimited field
      that is followed by a tab, so ``gene1`` does not match ``mygene1``.
    * The GFF file is read once, regardless of how many features are requested.
    * Requires ``obo_parser`` from goatools in the module namespace
      (``from goatools import obo_parser``).
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(feature, str):
        feature = [feature]

    # Pre-populate both results so every requested feature is present (in the
    # caller's order) even when the file holds no matching line.
    lines_where_feat_found = {}
    go_ids = {}
    for feat in feature:
        lines_where_feat_found[feat] = []
        go_ids[feat] = {}

    # Only a well-formed, comma-separated list of GO IDs is captured; this
    # avoids empty IDs from stray commas.
    ontology_attr = re.compile(r"Ontology_id=(GO:\d+(?:,GO:\d+)*)")

    with open(gff_file, 'r') as gff_handle:
        # Opened first so a wrong GFF path fails before the ontology is parsed.
        go_ontology = obo_parser.GODag(obo_path)

        for raw_line in gff_handle:
            # Every field except the last one is followed by a tab.
            fields = raw_line.split('\t')
            hits = {field for field in fields[:-1] if field in lines_where_feat_found}
            if not hits:
                continue

            record = raw_line.strip()
            attr_match = ontology_attr.search(record)
            found_ids = attr_match.group(1).split(',') if attr_match else []

            for feat in hits:
                lines_where_feat_found[feat].append(record)
                for go_id in found_ids:
                    term = go_ontology.get(go_id)
                    if term is not None:
                        go_ids[feat][go_id] = {'name': term.name, 'namespace': term.namespace}
                    else:
                        # ID is annotated in the GFF but absent from the loaded ontology.
                        go_ids[feat][go_id] = {'name': None, 'namespace': None}

    return lines_where_feat_found, go_ids

In [68]:
# Load the feature list stored with the glucose model and annotate it from the GFF.
with open("./models/glucose_nXcv.json", 'r') as file:
    glucose_nXcv = json.load(file)
print(glucose_nXcv['common_features'])
glucose_nXcv_matched, glucose_goids = matchgff2(glucose_nXcv['common_features'])

# For every feature: its matched GFF lines, then one line per GO annotation.
# The loop variable is deliberately not called `id`, which would rebind the
# builtin for the rest of the kernel session.
for ele, matched_lines in glucose_nXcv_matched.items():
    print(matched_lines)
    for go_id, go_info in glucose_goids[ele].items():
        print(go_id, ": ", go_info)


['Xele.ptg000012l.772', 'Xele.ptg000010l.105', 'Xele.ptg000006l.630', 'Xele.ptg000016l.332', 'Xele.ptg000013l.459', 'Xele.ptg000068l.74', 'Xele.ptg000001l.144', 'Xele.ptg000003l.525', 'Xele.ptg000022l.1586', 'Xele.ptg000067l.472', 'Xele.ptg000006l.592', 'Xele.ptg000331l.2', 'Xele.ptg000063l.174', 'Xele.ptg000025l.274', 'Xele.ptg000006l.423', 'Xele.ptg000007l.418', 'Xele.ptg000023l.344', 'Xele.ptg000025l.124', 'Xele.ptg000011l.430', 'Xele.ptg000028l.519', 'Xele.ptg000056l.25', 'Xele.ptg000007l.71', 'Xele.ptg000008l.432', 'Xele.ptg000018l.1563', 'Xele.ptg000013l.792', 'Xele.ptg000021l.354']
/home/t44p/PW_rawdata/go_obo/combined.obo: fmt(1.2) rel(go/2023-10-09/subsets/goslim_plant.owl) 346 Terms
['Xele.ptg000012l.772\tBlast2GO\tCDS\t1\t424\t.\t.\t.\t"ID=Xele.ptg000012l.772_1;Description=RecName: Full=Probable purple acid phosphatase 20; Flags: Precursor;Gene=PAP18;Gene=PAP19;Gene=PAP25;Gene=P80366;Gene=PAP26;Gene=PAP15;Gene=PAP12;Gene=PAP23;Gene=PAP13;Gene=PAP6;Gene=PAP5;Gene=PAP2;Gene=AC

In [70]:
# Load the feature list stored with the sucrose model and annotate it from the GFF.
with open("./models/sucrose_nXcv.json", 'r') as file:
    sucrose_nXcv = json.load(file)
sucrose_nXcv_matched, sucrose_goids = matchgff2(sucrose_nXcv['common_features'])

# For every feature: its matched GFF lines, then one line per GO annotation.
# The loop variable is deliberately not called `id`, which would rebind the
# builtin for the rest of the kernel session.
for ele, matched_lines in sucrose_nXcv_matched.items():
    print(matched_lines)
    for go_id, go_info in sucrose_goids[ele].items():
        print(go_id, ": ", go_info)

/home/t44p/PW_rawdata/go_obo/combined.obo: fmt(1.2) rel(go/2023-10-09/subsets/goslim_plant.owl) 346 Terms
['Xele.ptg000012l.303\tBlast2GO\tCDS\t1\t308\t.\t.\t.\t"ID=Xele.ptg000012l.303_1;Description=RecName: Full=Phosphoglycerate mutase-like protein AT74; Short=At-74;Gene=DET1;Gene=At3g05170;Gene=At1g08940;Ontology_id=GO:0016791,GO:0005975,GO:0009987;Enzyme_code=EC:3.1.3;Enzyme_name=Acting on ester bonds"']
GO:0016791 :  {'name': None, 'namespace': None}
GO:0005975 :  {'name': 'carbohydrate metabolic process', 'namespace': 'biological_process'}
GO:0009987 :  {'name': 'cellular process', 'namespace': 'biological_process'}
['Xele.ptg000068l.7\tBlast2GO\tCDS\t1\t283\t.\t.\t.\t"ID=Xele.ptg000068l.7_1;Description=RecName: Full=Lateral signaling target protein 2 homolog;Gene=CG6051;Gene=GI24295;Gene=GJ23073;Gene=GE10583;Gene=CPIJ004116;Gene=HGS;Gene=Hgs;Gene=AAEL005241;Gene=GF22946;Gene=GM10129;Gene=GK22512;Gene=VPS27;Gene=GG12136;Ontology_id=GO:0005764,GO:0005769,GO:0005774,GO:0005829,GO:00

In [71]:
# Load the feature list stored with the citric acid model and annotate it from the GFF.
with open("./models/citricAcid_nXcv.json", 'r') as file:
    citricAcid_nXcv = json.load(file)
citricAcid_nXcv_matched, citricAcid_goids = matchgff2(citricAcid_nXcv['common_features'])

# For every feature: its matched GFF lines, then one line per GO annotation.
# The loop variable is deliberately not called `id`, which would rebind the
# builtin for the rest of the kernel session.
for ele, matched_lines in citricAcid_nXcv_matched.items():
    print(matched_lines)
    for go_id, go_info in citricAcid_goids[ele].items():
        print(go_id, ": ", go_info)

/home/t44p/PW_rawdata/go_obo/combined.obo: fmt(1.2) rel(go/2023-10-09/subsets/goslim_plant.owl) 346 Terms
['Xele.ptg000020l.540\tBlast2GO\tCDS\t1\t326\t.\t.\t.\t"ID=Xele.ptg000020l.540_1;Description=UniRef90_A0A6V7NL77Uncharacterized protein n=1 Tax=Ananas comosus var. bracteatus TaxID=296719 RepID=A0A6V7NL77_ANACO;Gene=C2845_PM03G09980;Gene=LOC104605637;Gene=AMTR_s00003p00261050;Ontology_id=GO:0035267,GO:0016573"']
GO:0035267 :  {'name': None, 'namespace': None}
GO:0016573 :  {'name': None, 'namespace': None}
['Xele.ptg000055l.154\tBlast2GO\tCDS\t1\t129\t.\t.\t.\t"ID=Xele.ptg000055l.154_1;Description=RecName: Full=Cysteine desulfurase 1, chloroplastic; AltName: Full=NIFS-like protein 1; Short=CpNifS1; AltName: Full=Plastid sufS-like protein; AltName: Full=Protein AtCpNifS; AltName: Full=Selenocysteine lyase; Flags: Precursor;Gene=NFS2;Ontology_id=GO:0009570,GO:0009000,GO:0030170,GO:0031071,GO:0006534,GO:0010269;Enzyme_code=EC:2.8.1.7,EC:4.4.1.16;Enzyme_name=Cysteine desulfurase,Seleno