In [1]:
import pandas as pd

# Load the tab-separated table that carries the cluster classification.
# The first row supplies the column names and the first column becomes the index.
table = pd.read_csv(
    'ClassifiedTable.txt',
    sep='\t',
    header=0,
    index_col=0,
)
print(table.head(7))

              Roetzer log2  Lelandais log2 Rep1  Lelandais log2 Rep2  \
FEATURE_NAME                                                           
CAGL0A00105g      0.067601            -0.026006             0.036205   
CAGL0A00110g      0.366500            -0.026006             0.036205   
CAGL0A00116g      0.067601            -0.026006             0.036205   
CAGL0A00132g      0.067601            -0.026006             0.036205   
CAGL0A00154g      0.067601            -0.026006             0.036205   
CAGL0A00165g     -0.119234             0.217832            -0.029164   
CAGL0A00187g      0.581269            -0.028898            -0.003969   

              Kuo log2 Rep1  Kuo log2 Rep2  Kuo log2 Rep3  \
FEATURE_NAME                                                
CAGL0A00105g       0.354450       0.661396       1.084942   
CAGL0A00110g      -0.454108      -0.227552      -0.159453   
CAGL0A00116g      -0.454108      -0.227552      -0.159453   
CAGL0A00132g      -0.537156      -0.368734    

In [2]:
# Load the target genes found by previous analyses.
# Only the first column of the file is kept, as a plain Python list.
targets = list(pd.read_csv('TargetGenes.txt', sep='\t').iloc[:, 0])

Once the target genes are read in, we go through each cluster and record, for each one: the cluster number, the number of genes in the cluster that overlap with the target set (correct), and the number of genes in the cluster that do not overlap with the target set (incorrect).

In [3]:
def intersect(targets, genes):
    """Return the entries of ``genes`` that also occur in ``targets``.

    The order of ``genes`` is preserved and duplicates in ``genes`` are kept,
    so the result is a filtered copy of ``genes`` rather than a set
    intersection.

    Parameters
    ----------
    targets : iterable
        Reference collection to test membership against. It is materialised
        once, so one-shot iterators are handled correctly.
    genes : iterable
        Candidate entries to filter.

    Returns
    -------
    list
        Every element of ``genes`` that is present in ``targets``.
    """
    targetPool = list(targets)
    candidates = list(genes)
    try:
        # Hash-based lookup avoids rescanning the whole target list per gene.
        lookup = frozenset(targetPool)
        return [gene for gene in candidates if gene in lookup]
    except TypeError:
        # Unhashable entries on either side: fall back to a linear scan.
        return [gene for gene in candidates if gene in targetPool]

In [4]:
# Cluster labels examined below (0 through 11)
clusters = range(0, 12)

# Predicted cluster of every previously known target gene
targetClasses = [table.loc[gene, 'Prediction'] for gene in targets]

# For each cluster, count how many member genes are known targets (overlap)
# and how many are not, and derive the overlap / non-overlap ratio.
ratios = []
overlapRows = []
for cluster in clusters:
    members = table[table['Prediction'] == cluster]
    nOverlap = len(intersect(targets, list(members.index)))
    nOther = len(members) - nOverlap
    # With no non-overlapping genes the ratio is undefined; the raw overlap
    # count is used in its place.
    ratios.append(nOverlap if nOther == 0 else nOverlap / nOther)
    overlapRows.append([cluster, nOverlap, nOther])

corrTable = pd.DataFrame(
    overlapRows,
    columns=['Cluster', 'Overlap Genes', 'Non-Overlap Genes'],
).set_index('Cluster')
corrTable.loc[:, 'Ratio'] = ratios

# Hand-assigned interpretation of each cluster; anything not listed keeps
# the default label.
corrTable['Target'] = 'Very Unlikely'
clusterLabels = {
    10: 'Extremely High Confidence',
    11: 'Low Confidence',
    5: 'High Confidence',
    8: 'Possible Repressor',
}
for clusterId, label in clusterLabels.items():
    corrTable.loc[clusterId, 'Target'] = label
print(corrTable)


         Overlap Genes  Non-Overlap Genes     Ratio                     Target
Cluster                                                                       
0                   14                589  0.023769              Very Unlikely
1                    5               1027  0.004869              Very Unlikely
2                    9               1233  0.007299              Very Unlikely
3                   28                309  0.090615              Very Unlikely
4                   13                447  0.029083              Very Unlikely
5                   42                  8  5.250000            High Confidence
6                    2                407  0.004914              Very Unlikely
7                   25                442  0.056561              Very Unlikely
8                    2                 40  0.050000         Possible Repressor
9                   21                958  0.021921              Very Unlikely
10                   6                  0  6.000000 

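Clusters 5 and 10 appear to have a high ratio of correct/incorrect genes. (Cluster 10 has no non-overlapping genes, so its ratio is reported as its overlap count rather than a true quotient.) To understand what the data in each cluster represents, it's useful to inspect it manually.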

In [5]:
# print(table[table.loc[:, 'Prediction'] == 10])

In [6]:
# print(table[table.loc[:, 'Prediction'] == 5])

Cluster 10 appears to contain strong targets of Yap1, and cluster 5 appears to contain likely targets of Yap1. To check for false positives, we can directly inspect the 8 genes in cluster 5 that were not part of the overlap set, to see whether this analysis has found anything new that earlier analyses did not.

In [7]:
# Members of cluster 5 that are absent from the previously known target list
cluster5 = table[table['Prediction'] == 5]
newGenes = []
for gene in cluster5.index:
    if gene not in targets:
        newGenes.append(gene)
print(table.loc[newGenes, :])

              Roetzer log2  Lelandais log2 Rep1  Lelandais log2 Rep2  \
FEATURE_NAME                                                           
CAGL0D05478g      0.054514            -0.244361             0.120199   
CAGL0G03223g      0.805756             0.133721            -0.405787   
CAGL0G09405r      0.067601            -0.026006             0.036205   
CAGL0K10879g      0.067601            -0.026006             0.036205   
CAGL0L13365r      0.067601            -0.026006             0.036205   
CAGL0L13376g      0.067601            -0.026006             0.036205   
CAGL0L13387r      0.067601            -0.026006             0.036205   
CAGL0L13392g      0.067601            -0.026006             0.036205   

              Kuo log2 Rep1  Kuo log2 Rep2  Kuo log2 Rep3  \
FEATURE_NAME                                                
CAGL0D05478g       2.801382       2.647244       2.196127   
CAGL0G03223g       0.096668       0.191343       0.150483   
CAGL0G09405r       1.690861       1

In [8]:
# Show every gene assigned to cluster 8 (labelled "Possible Repressor" above)
inCluster8 = table['Prediction'] == 8
print(table[inCluster8])

              Roetzer log2  Lelandais log2 Rep1  Lelandais log2 Rep2  \
FEATURE_NAME                                                           
CAGL0A00132g      0.067601            -0.026006             0.036205   
CAGL0B00110g      0.181877            -0.026006             0.036205   
CAGL0B00116g      0.067601            -0.026006             0.036205   
CAGL0B03245g      0.025501             0.116651            -0.083314   
CAGL0B04895g      0.217062             0.016023            -0.726815   
CAGL0C00231g      0.067601             0.122012            -0.045777   
CAGL0C05621g      0.164058            -0.026006             0.036205   
CAGL0D02288g      0.067601             0.149435            -0.132811   
CAGL0F05005g      0.772439             0.376161             0.051762   
CAGL0F09075g      0.503680             0.152608             0.169236   
CAGL0F09273g     -0.597578            -0.026006             0.036205   
CAGL0G02893g     -0.110275            -0.048243             0.00

Collect the gene names from all clusters for which GO annotations are needed. This list will be used in "Get Annotations.ipynb" to scrape annotations from candidagenome.org.

In [24]:
# Pickle is an easier way to import/export Python data structures than
# repeatedly saving them as tab-separated .txt files and reloading with pandas.
# The public `pickle` module is used; it already picks up the C implementation
# (`_pickle`) when that is available.
import pickle

clustersOfInterest = [5, 8, 10, 11]

# Gather the gene names of every cluster of interest
geneList = []
for cluster in clustersOfInterest:
    geneList.extend(table.index[table['Prediction'] == cluster])

# Also include the target list from previous analyses for comparative purposes
geneList.extend(targets)

# Remove duplicate genes
geneList = set(geneList)

# Spot check of a single entry. NOTE: a set has no defined order, so which
# gene sits at this position is arbitrary and may differ between runs; the
# lookup raises IndexError if the set holds fewer than 248 genes.
print(list(geneList)[247])

# The context manager closes the file even if pickling fails
with open('GOgenes.pickle', 'wb') as pickleOut:
    pickle.dump(geneList, pickleOut)


CAGL0M04213g
