In [3]:
# use kernel py3-6
import goatools
import re
import json

# Function to match non-zero features in GFF file

In [4]:
def matchgff(feature, gff_file='/home/t44p/PW_rawdata/Transciptome_GenomeAnnotation/Xele_annotated2_gff_export2.gff'):
    """
    Searches a GFF (General Feature Format) file for specific features and returns the lines where they occur.

    The file is read once; every line is tested against every requested feature.

    Parameters:
    ----------
    feature : str or iterable of str
        The feature names to search for. A single string is treated as one feature
        (not as an iterable of characters). Duplicate names are searched only once.

    gff_file : str, optional
        The file path to the GFF file. Defaults to '/home/t44p/PW_rawdata/Transciptome_GenomeAnnotation/Xele_annotated2_gff_export2.gff'.

    Returns:
    -------
    dict
        A dictionary keyed by feature (in first-seen order). Each value is the list of
        whitespace-stripped lines from the GFF file, in file order, that contain the feature.
        Features that never occur map to an empty list.

    Raises:
    ------
    OSError
        If ``gff_file`` cannot be opened (e.g. FileNotFoundError).
    TypeError
        If a feature is not a string.

    Example:
    --------
    >>> features = ['gene1', 'gene2']
    >>> matchgff(features)  # doctest: +SKIP
    {'gene1': ['line content from GFF file'], 'gene2': ['line content from GFF file']}

    Notes:
    -----
    A line matches when it contains the feature text immediately followed by a tab character,
    anywhere in the line. The match is literal (no regex metacharacters) but it is not anchored
    to the start of a column, so 'gene1' also matches a line containing 'mygene1<TAB>'.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(feature, str):
        feature = [feature]

    with open(gff_file, 'r') as file:
        # dict.fromkeys keeps first-seen order and collapses duplicates, so a repeated
        # feature does not collect the same lines twice.
        lines_where_feat_found = {feat: [] for feat in dict.fromkeys(feature)}

        # Literal "feature + tab" needles, paired with the list that collects their hits.
        needles = [(feat + '\t', hits) for feat, hits in lines_where_feat_found.items()]

        for line in file:
            stripped = None  # strip lazily, only for lines that match at least one feature
            for needle, hits in needles:
                if needle in line:
                    if stripped is None:
                        stripped = line.strip()
                    hits.append(stripped)
    return lines_where_feat_found

# Match Non-Zero features for Sucrose Model in GFF

In [5]:
# Load the sucrose LASSO score file and look up its non-zero features in the GFF.
with open("./models/lasso_cvsucrose_scores.json", 'r') as scores_fh:
    lasso_cvsucrose_scores = json.load(scores_fh)

lasso_sucrose_feat_matched = matchgff(lasso_cvsucrose_scores['non_zero_features'])

# One output line per feature: every GFF line matched for that feature is written
# back-to-back (no separator between them), followed by a single newline.
with open("./models/lasso_sucrose_feat_matched.txt", 'w') as matched_fh:
    for gff_lines in lasso_sucrose_feat_matched.values():
        matched_fh.write("".join(gff_lines))
        matched_fh.write("\n")


# Match Non-Zero features for Glucose Model in GFF

In [6]:
# Load the glucose LASSO score file and look up its non-zero features in the GFF.
with open("./models/lasso_cvglucose_scores.json", 'r') as scores_fh:
    lasso_cvglucose_scores = json.load(scores_fh)

lasso_glucose_feat_matched = matchgff(lasso_cvglucose_scores['non_zero_features'])

# One output line per feature: every GFF line matched for that feature is written
# back-to-back (no separator between them), followed by a single newline.
with open("./models/lasso_glucose_feat_matched.txt", 'w') as matched_fh:
    for gff_lines in lasso_glucose_feat_matched.values():
        matched_fh.write("".join(gff_lines))
        matched_fh.write("\n")


# Match Non-Zero features for Citric Acid Model in GFF

In [7]:
# Load the citric acid LASSO score file and look up its non-zero features in the GFF.
with open("./models/lasso_cvcitricAcid_scores.json", 'r') as scores_fh:
    lasso_cvcitricAcid_scores = json.load(scores_fh)

lasso_citricAcid_feat_matched = matchgff(lasso_cvcitricAcid_scores['non_zero_features'])

# One output line per feature: every GFF line matched for that feature is written
# back-to-back (no separator between them), followed by a single newline.
with open("./models/lasso_citricAcid_feat_matched.txt", 'w') as matched_fh:
    for gff_lines in lasso_citricAcid_feat_matched.values():
        matched_fh.write("".join(gff_lines))
        matched_fh.write("\n")


# GOATools to extract GO Terms

To translate the Gene Ontology (GO) IDs into their respective terms for your given gene entries, you'll need to follow these steps:

1. **Extract the GO IDs**: From your provided entries, you'll need to parse out the GO IDs. In the second entry, these are listed in the `Ontology_id` attribute.

2. **Load GO Terms**: Use GOAtools or another resource to load the complete set of GO terms. This usually involves downloading the GO ontology file (in OBO format) from the Gene Ontology website.

3. **Map GO IDs to Terms**: Once you have the GO IDs and the GO ontology loaded, you can map the IDs to their respective terms (including the term name and possibly the term definition).

Here's an example of how you might code this in Python using GOAtools:

### Step 1: Extracting GO IDs

```python
import re

# Two sample GFF lines: the first carries no Ontology_id attribute, the second does
entries = [
    "Xele.ptg000045l.82\tBlast2GO\tCDS\t1\t107\t.\t.\t.\tID=Xele.ptg000045l.82_1;Description=UniRef90_UPI0018AE80C6uncharacterized protein LOC120271497 n=1 Tax=Dioscorea cayennensis subsp. rotundata TaxID=55577 RepID=UPI0018AE80C6;Gene=CEY00_Acc22721;Gene=CB5_LOCUS31267",
    "Xele.ptg000011l.21\tBlast2GO\tCDS\t1\t511\t.\t.\t.\t\"ID=Xele.ptg000011l.21_1;Description=RecName: Full=AFG1-like ATPase; AltName: Full=Lactation elevated protein 1;Gene=AFG1;Gene=SPBC115.02c;Gene=P46441;Gene=lace1b;Gene=AFG1L;Gene=Afg1l;Ontology_id=GO:0031966,GO:0005515,GO:0006123,GO:0035694\""
]

# Pool the comma-separated GO IDs from every entry that has an Ontology_id attribute
ontology_pattern = re.compile(r"Ontology_id=([GO:\d,]+)")
go_ids = set()
for gff_line in entries:
    found = ontology_pattern.search(gff_line)
    if found is None:
        continue  # entry has no GO annotation
    go_ids |= set(found.group(1).split(','))

print("Extracted GO IDs:", go_ids)
```

### Step 2 & 3: Loading GO Terms and Mapping

```python
from goatools import obo_parser

# Parse the ontology file into a GO-ID -> term lookup
go_ontology = obo_parser.GODag("path/to/go-basic.obo")

# Print name and namespace for every extracted GO ID the ontology resolves
for go_id in go_ids:
    go_term = go_ontology.get(go_id)
    if not go_term:
        continue  # ID not resolved by the loaded ontology
    print(f"{go_id}: {go_term.name}, {go_term.namespace}")
```

In this code:
- We first extract the GO IDs from your GFF entries.
- We then load the GO ontology using GOAtools (you need to download the `go-basic.obo` file from the Gene Ontology website).
- Finally, we map each GO ID to its term name and namespace (like biological process, cellular component, or molecular function).

This approach should give you a basic understanding of the functional categories associated with your genes. For more detailed analysis or interpretation, consider exploring additional GOAtools functionalities or consulting with a bioinformatician.

In [8]:
# Collect the GO IDs annotated on each matched sucrose feature.
# go_ids maps feature name -> list of GO IDs in order of first appearance;
# features without any Ontology_id annotation get no entry.
go_ids = {}
for entry, gff_lines in lasso_sucrose_feat_matched.items():
    ids = []
    # Scan each matched GFF line separately so that annotations on every line
    # (not just the first Ontology_id found) contribute to the feature.
    for gff_line in gff_lines:
        for match in re.finditer(r"Ontology_id=([GO:\d,]+)", gff_line):
            # Keep only well-formed "GO:<digits>" tokens; this drops empty or
            # partial pieces that a plain split(',') would let through.
            for go_id in re.findall(r"GO:\d+", match.group(1)):
                if go_id not in ids:
                    ids.append(go_id)
    if ids:
        go_ids[entry] = ids

In [22]:
from goatools import obo_parser

# Parse the plant GO-slim file into a GO-ID -> term lookup.
go_ontology = obo_parser.GODag("/home/t44p/PW_rawdata/go_obo/goslim_plant.obo")

# For every feature, print its name followed by one line per GO ID:
# the term name and namespace when the ID is in the loaded file, otherwise "None".
for feature_name, feature_go_ids in go_ids.items():
    print(feature_name)
    for go_id in feature_go_ids:
        go_term = go_ontology.get(go_id)
        if go_term is None:
            # ID is absent from the loaded ontology file
            print(go_id,":", "None")
            continue
        print(go_id,": ", go_term.name, go_term.namespace)

/home/t44p/PW_rawdata/go_obo/goslim_plant.obo: fmt(1.2) rel(go/2023-10-09/subsets/goslim_plant.owl) 166 Terms
Xele.ptg000034l.205
GO:0016021 :  membrane cellular_component
GO:0016779 : None
Xele.ptg000068l.7
GO:0005764 :  lysosome cellular_component
GO:0005769 : None
GO:0005774 : None
GO:0005829 :  cytosol cellular_component
GO:0010008 : None
GO:0033565 : None
GO:0019904 : None
GO:0032266 : None
GO:0042802 : None
GO:0043130 : None
GO:0044389 : None
GO:0046982 : None
GO:0140036 : None
GO:0006622 : None
GO:0009306 : None
GO:0010324 : None
GO:0010642 : None
GO:0016525 : None
GO:0030948 : None
GO:0043162 : None
GO:0043405 : None
GO:0045053 : None
GO:0045324 : None
GO:0046426 : None
GO:0071985 : None
GO:0072657 : None
GO:0140504 : None
GO:1903319 : None
GO:1903543 : None
GO:1904669 : None
Xele.ptg000014l.41
GO:0005747 : None
GO:0006119 : None
GO:0022904 : None
Xele.ptg000020l.374
GO:0009507 :  chloroplast cellular_component
GO:0005975 :  carbohydrate metabolic process biological_process
GO: