This file gets the candidagenome.org GO annotations for the genes in important clusters.

In the future, if web scraping is needed to get annotations from C.glabrata or any other species on the candidagenome.org website, this same code can be used with the getURL() function modified as needed. (if the species is still C.glabrata it doesn't need to be changed). To avoid having to run all of the cells again, the serialized object is imported below.

In [7]:
import _pickle as pickle

pickleIn = open('GOgenes.pickle', 'rb')
genes = pickle.load(pickleIn)
genes = list(genes)

345


In [2]:
# These are some commonly used web-scraping libraries
import requests
from bs4 import BeautifulSoup
from time import sleep
import random
import re
from tqdm import tqdm

In [3]:
# This function generates the URL for each gene to get its genepage on candidagenome.org
def getURL(gene):
    url = 'http://www.candidagenome.org/cgi-bin/locus.pl?locus=' + gene + '&organism=C_glabrata_CBS138'
    # Don't delete the sleep function below, doing so might crash the website!
    sleep(random.uniform(1,3))
    return url

In [4]:
# This function takes a gene as input and returns a list of its annotations from candidagenome.org
def getAnno(gene):
    url = getURL(gene)

    try:
        # This parses the html to get the annotation information associated with the 'a' tag
        genePage = requests.get(url)
        soup = BeautifulSoup(genePage.text, 'html.parser')
        links = list(soup.find_all('a'))
        
        # Store all the annotations for a gene into a list
        annotations = []
        for link in links:
            link = str(link)
            if 'http://www.candidagenome.org/cgi-bin/GO/go.pl' in link:
                goTermUnfiltered = re.search(r'>.+</a>', link)
                goTerm = goTermUnfiltered.group(0)[1:-4]
                if goTerm == 'molecular_function':
                    annotations.append('unknown molecular function')
                elif goTerm == 'biological_process':
                    annotations.append('unknown biological process')
                elif goTerm == 'cellular_component':
                    annotations.append('unknown cellular component')
                else:
                    annotations.append(goTerm)
        return annotations
    
    # This deals with exceptions if the page doesn't load, or if a gene doesn't exist on the website, etc.
    except:
        return []

In [11]:
# Collect data and gather info
geneAnno = dict()
for gene in tqdm(genes):
    geneAnno[gene] = getAnno(gene)

100%|██████████████████████████████████████████████████████████████████████████████| 345/345 [22:03<00:00,  3.28s/it]


In [12]:
pickleOut = open('GOanno.pickle', 'wb')
pickle.dump(geneAnno, pickleOut)

In [None]:
# Run this to avoid running everything above this point
pickleIn = open('GOanno.pickle', 'rb')
geneAnno = pickle.load(pickleIn)

In [22]:
# Check how many genes caused an error
count = 0
for gene in geneAnno.keys():
    if len(geneAnno[gene]) == 0:
        print(gene)
        count += 1
        
print('Error Gene Count:', count)

CAGL0M10461s
CAGL0H00132g
CAGL0E00209g
CAGL0I10513g
CAGL0H00715g
Error Gene Count: 5


Upon further inspection of the genes which ran into errors, the website didn't have a GO section for these genes, so the script wasn't the issue.
Below is a test of the different clusters/groups of interest and their top 5 annotations.

In [9]:
import pandas as pd

table = pd.read_csv('ClassifiedTable.txt', sep='\t', header=(0), index_col=0)

In [10]:
# Get gene lists for all target clusters and reference target list

clust5 = table[table.loc[:, 'Prediction'] == 5]
clust5 = list(clust5.index)

clust10 = table[table.loc[:, 'Prediction'] == 10]
clust10 = list(clust10.index)

clust11 = table[table.loc[:, 'Prediction'] == 11]
clust11 = list(clust11.index)

targets = pd.read_csv('TargetGenes.txt', sep= '\t')
targets = list(targets.iloc[:, 0])

In [56]:
from collections import Counter

freq10 = []
for gene in clust10:
    freq10.extend(geneAnno[gene])
freq10list = [item for item in Counter(freq10).most_common()]
freq10list = [(item, round(num/(len(clust10)), 3)) for item, num in freq10list]
print(freq10list[0:15])

freq5 = []
for gene in clust5:
    freq5.extend(geneAnno[gene])
freq5list = [item for item in Counter(freq5).most_common()]
freq5list = [(item, round(num/(len(clust5)), 3)) for item, num in freq5list]
print(freq5list[0:15])

freq11 = []
for gene in clust5:
    freq11.extend(geneAnno[gene])
freq11list = [item for item in Counter(freq11).most_common()]
freq11list = [(item, round(num/(len(clust11)), 3)) for item, num in freq11list]
print(freq11list[0:15])

freqTarg = []
for gene in targets:
    freqTarg.extend(geneAnno[gene])
freqTarglist = [item for item in Counter(freqTarg).most_common()]
freqTarglist = [(item, round(num/(len(targets)), 3)) for item, num in freqTarglist]
print(freqTarglist[0:15])

[('cellular response to oxidative stress', 1.0), ('oxidation-reduction process', 0.667), ('cytoplasm', 0.667), ('cellular detoxification of hydrogen peroxide', 0.5), ('hydrogen peroxide catabolic process', 0.5), ('thioredoxin peroxidase activity', 0.333), ('cell redox homeostasis', 0.333), ('cytosol', 0.333), ('mitochondrion', 0.333), ('positive regulation of transcription from RNA polymerase II promoter in response to oxidative stress', 0.333), ('catalase activity', 0.333), ('glutathione peroxidase activity', 0.167), ('phospholipid-hydroperoxide glutathione peroxidase activity', 0.167), ('extrinsic component of mitochondrial inner membrane', 0.167), ('extrinsic component of mitochondrial outer membrane', 0.167)]
[('unknown cellular component', 0.38), ('unknown molecular function', 0.28), ('unknown biological process', 0.2), ('cytosol', 0.16), ('plasma membrane', 0.14), ('cellular response to drug', 0.12), ('cytoplasm', 0.12), ('nucleus', 0.1), ('mitochondrial intermembrane space', 0.0

In [109]:
anno = []
anno10 = [anno for anno, val in freq10list]
anno.extend(anno10)
anno5 = [anno for anno, val in freq5list]
anno.extend(anno5)
anno11 = [anno for anno, val in freq11list]
anno.extend(anno11)
annoTarg = [anno for anno, val in freqTarglist]
anno.extend(annoTarg)
anno = set(anno)
print(len(anno))

734


In [110]:
import pandas as pd

annoTable = pd.DataFrame(index = anno)

for anno, val in freq10list:
    annoTable.loc[anno, 'Extremely High Confidence, N=6'] = val
for anno, val in freq5list:
    annoTable.loc[anno, 'High Confidence, N=50'] = val
for anno, val in freq11list:
    annoTable.loc[anno, 'Low Confidence, N=130'] = val
for anno, val in freqTarglist:
    annoTable.loc[anno, 'Reference Set, N=206'] = val

annoTable.index.rename('GO Annotation', inplace=True)

print(annoTable)

                                                    Extremely High Confidence, N=6  \
GO Annotation                                                                        
NAD binding                                                                    NaN   
thioredoxin peroxidase activity                                              0.333   
cytoplasmic translational elongation                                           NaN   
membrane protein proteolysis                                                   NaN   
protein dephosphorylation                                                      NaN   
autophagy of peroxisome                                                        NaN   
protein quality control for misfolded or incomp...                             NaN   
mediator complex                                                               NaN   
mRNA pseudouridine synthesis                                                   NaN   
hydrogen peroxide catabolic process                   

[734 rows x 4 columns]


In [112]:
testAnnoFrame = annoTable.copy(deep=True)
testAnnoFrame = testAnnoFrame.fillna(0)
try:
    del testAnnoFrame['Sum']
except:
    pass
rowSum = testAnnoFrame.sum(axis=1)
annoTable['Sum'] = rowSum

annoTable = annoTable.sort_values(by='Sum', ascending=False)
del annoTable['Sum']

# print(annoTable)

# for ele in annoTable.head(20).index:
#     print(annoTable.loc[ele, :], '\n')

                                                    Extremely High Confidence, N=6  \
GO Annotation                                                                        
cellular response to oxidative stress                                        1.000   
unknown cellular component                                                   0.167   
cytoplasm                                                                    0.667   
oxidation-reduction process                                                  0.667   
cytosol                                                                      0.333   
unknown molecular function                                                     NaN   
hydrogen peroxide catabolic process                                          0.500   
mitochondrion                                                                0.333   
cellular detoxification of hydrogen peroxide                                 0.500   
unknown biological process                            

[734 rows x 4 columns]
Extremely High Confidence, N=6    1.000
High Confidence, N=50             0.080
Low Confidence, N=130             0.031
Reference Set, N=206              0.083
Name: cellular response to oxidative stress, dtype: float64 

Extremely High Confidence, N=6    0.167
High Confidence, N=50             0.380
Low Confidence, N=130             0.146
Reference Set, N=206              0.364
Name: unknown cellular component, dtype: float64 

Extremely High Confidence, N=6    0.667
High Confidence, N=50             0.120
Low Confidence, N=130             0.046
Reference Set, N=206              0.117
Name: cytoplasm, dtype: float64 

Extremely High Confidence, N=6    0.667
High Confidence, N=50             0.060
Low Confidence, N=130             0.023
Reference Set, N=206              0.102
Name: oxidation-reduction process, dtype: float64 

Extremely High Confidence, N=6    0.333
High Confidence, N=50             0.160
Low Confidence, N=130             0.062
Reference Set, N=2

In [114]:
annoTable.to_csv("AnnotationFrequency.csv", sep=",", index=True)

In [12]:
import _pickle as pickle

# Run this to avoid running everything above this point
pickleIn = open('GOanno.pickle', 'rb')
geneAnno = pickle.load(pickleIn)
print(len(geneAnno.keys()))
print(clust10)
print(clust5)
print(clust11)

345
['CAGL0C01705g', 'CAGL0H04631g', 'CAGL0I01166g', 'CAGL0K06259g', 'CAGL0K10868g', 'CAGL0L05258g']
['CAGL0B01419g', 'CAGL0B02607g', 'CAGL0B03069g', 'CAGL0C01727g', 'CAGL0D01496g', 'CAGL0D05434g', 'CAGL0D05478g', 'CAGL0D06138g', 'CAGL0E05170g', 'CAGL0F01793g', 'CAGL0F07359g', 'CAGL0F07513g', 'CAGL0F09097g', 'CAGL0G03223g', 'CAGL0G06182g', 'CAGL0G09405r', 'CAGL0H00704g', 'CAGL0H01375g', 'CAGL0H02739g', 'CAGL0H04653g', 'CAGL0H05951g', 'CAGL0H06017g', 'CAGL0H10142g', 'CAGL0I00726g', 'CAGL0I00748g', 'CAGL0I04884g', 'CAGL0J07612g', 'CAGL0J09680g', 'CAGL0K00803g', 'CAGL0K01727g', 'CAGL0K03531g', 'CAGL0K03553g', 'CAGL0K08162g', 'CAGL0K08184g', 'CAGL0K09350g', 'CAGL0K10879g', 'CAGL0K11858g', 'CAGL0K11880g', 'CAGL0L04664g', 'CAGL0L05280g', 'CAGL0L05742g', 'CAGL0L13365r', 'CAGL0L13376g', 'CAGL0L13387r', 'CAGL0L13392g', 'CAGL0M02607g', 'CAGL0M02629g', 'CAGL0M05819g', 'CAGL0M11682g', 'CAGL0M11704g']
['CAGL0A00495g', 'CAGL0A00517g', 'CAGL0B00990g', 'CAGL0B01188g', 'CAGL0B01203g', 'CAGL0B02651g', '

In [37]:
key_phrases = ['cell wall', 'hyph']
counter_list = []
anno_list = []

targets = clust5 + clust10 + clust11

for gene in targets:
    for anno in geneAnno[gene]:
        for phrase in key_phrases:
            if phrase in anno:
                counter_list.append(gene)
                anno_list.append(anno)
            
print(str(len(set(counter_list))) + ' genes found in CgYap1 target list related to keywords: ', str(key_phrases))
print('Genes: ', set(counter_list))
print('Annotations: ',set(anno_list))

14 genes found in CgYap1 target list related to keywords:  ['cell wall', 'hyph']
Genes:  {'CAGL0F05137g', 'CAGL0H09130g', 'CAGL0H01375g', 'CAGL0B03069g', 'CAGL0G04873g', 'CAGL0M08514g', 'CAGL0D03256g', 'CAGL0K09350g', 'CAGL0F08833g', 'CAGL0G07491g', 'CAGL0M11704g', 'CAGL0G05027g', 'CAGL0E05412g', 'CAGL0I05390g'}
Annotations:  {'cell wall mannoprotein biosynthetic process', 'hyphal growth', 'fungal-type cell wall organization', 'fungal-type cell wall biogenesis', 'fungal-type cell wall polysaccharide biosynthetic process', 'fungal-type cell wall', 'regulation of cell wall (1-&gt;3)-beta-D-glucan biosynthetic process', 'pseudohyphal growth', 'positive regulation of pseudohyphal growth', 'hyphal cell wall', 'regulation of fungal-type cell wall organization', 'fungal-type cell wall beta-glucan biosynthetic process', 'yeast-form cell wall'}
