In [11]:
import pandas as pd

# Load the tab-separated table that carries the cluster classification.
# The first row supplies the column names and the first column the row index.

table = pd.read_csv(
    'ClassifiedTable.txt',
    sep='\t',
    header=0,
    index_col=0,
)
print(table.head(7))

              Roetzer log2  Lelandais log2 Rep1  Lelandais log2 Rep2  \
FEATURE_NAME                                                           
CAGL0A00105g      0.067601            -0.026006             0.036205   
CAGL0A00110g      0.366500            -0.026006             0.036205   
CAGL0A00116g      0.067601            -0.026006             0.036205   
CAGL0A00132g      0.067601            -0.026006             0.036205   
CAGL0A00154g      0.067601            -0.026006             0.036205   
CAGL0A00165g     -0.119234             0.217832            -0.029164   
CAGL0A00187g      0.581269            -0.028898            -0.003969   

              Kuo log2 Rep1  Kuo log2 Rep2  Kuo log2 Rep3  \
FEATURE_NAME                                                
CAGL0A00105g       0.354450       0.661396       1.084942   
CAGL0A00110g      -0.454108      -0.227552      -0.159453   
CAGL0A00116g      -0.454108      -0.227552      -0.159453   
CAGL0A00132g      -0.537156      -0.368734    

In [16]:
# Load the targets from previous analyses and keep the first column of the
# file as a plain Python list.

targets = list(pd.read_csv('TargetGenes.txt', sep='\t').iloc[:, 0])

['CAGL0A00495g', 'CAGL0A00517g', 'CAGL0A02530g', 'CAGL0A02552g', 'CAGL0A02574g', 'CAGL0A03080g', 'CAGL0A03102g', 'CAGL0B00165r', 'CAGL0B00902g', 'CAGL0B01188g', 'CAGL0B01203g', 'CAGL0B01419g', 'CAGL0B02277g', 'CAGL0B02607g', 'CAGL0B02629g', 'CAGL0B03047g', 'CAGL0B03069g', 'CAGL0B03663g', 'CAGL0B03685g', 'CAGL0B03817g', 'CAGL0B04279g', 'CAGL0C01661g', 'CAGL0C01683g', 'CAGL0C01705g', 'CAGL0C01727g', 'CAGL0C02189g', 'CAGL0C03597g', 'CAGL0D01298g', 'CAGL0D01320g', 'CAGL0D01496g', 'CAGL0D01936g', 'CAGL0D01958g', 'CAGL0D02134g', 'CAGL0D02156g', 'CAGL0D02816g', 'CAGL0D04114g', 'CAGL0D04136g', 'CAGL0D05434g', 'CAGL0D05456g', 'CAGL0D06138g', 'CAGL0E00187g', 'CAGL0E00759g', 'CAGL0E04884g', 'CAGL0E04906g', 'CAGL0E05170g', 'CAGL0E05192g', 'CAGL0F00407g', 'CAGL0F00913g', 'CAGL0F01793g', 'CAGL0F02717g', 'CAGL0F03399g', 'CAGL0F04015g', 'CAGL0F04521g', 'CAGL0F04543g', 'CAGL0F05115g', 'CAGL0F05137g', 'CAGL0F07359g', 'CAGL0F07513g', 'CAGL0F07579g', 'CAGL0F07601g', 'CAGL0F08767g', 'CAGL0F09097g', 'CAGL0G

Once the data for the target genes is read in, we go through each cluster, select the genes in that cluster, and record for each cluster: the number of genes that overlap with the target set (correct) and the number of genes that do not overlap with the target set (incorrect).

In [24]:
def intersect(targets, genes):
    """Return the entries of ``genes`` that are also present in ``targets``.

    The result keeps the order of ``genes`` and keeps duplicates: an entry
    that appears twice in ``genes`` and is a target appears twice in the
    result.

    Parameters
    ----------
    targets : iterable
        Collection of reference entries. It is materialized exactly once, so
        one-shot iterators and generators are handled correctly.
    genes : iterable
        Entries to test for membership in ``targets``.

    Returns
    -------
    list
        Entries of ``genes`` found in ``targets``.
    """
    try:
        # A set gives constant-time lookups instead of rescanning the
        # whole target collection for every gene.
        lookup = set(targets)
    except TypeError:
        # Unhashable entries (e.g. lists) cannot go in a set; fall back to a
        # linear scan so such inputs keep working.
        lookup = list(targets)
    return [gene for gene in genes if gene in lookup]

In [41]:
clusters = range(0,12)

# Predicted cluster of every gene in the target list.
targetClasses = [table.loc[gene, 'Prediction'] for gene in targets]

ratios = []
overlapRows = []
for cluster in clusters:
    # Rows of the table whose predicted cluster is the current one.
    genesInCluster = table[table.loc[:, 'Prediction'] == cluster]
    intersection = intersect(targets, list(genesInCluster.index))
    correct = len(intersection)
    incorrect = len(genesInCluster) - correct
    # When every gene in the cluster is a target there is nothing to divide
    # by, so the overlap count itself is reported in place of a ratio.
    ratio = correct if incorrect == 0 else correct/incorrect

    ratios.append(ratio)
    overlapRows.append([cluster, correct, incorrect])

# One row per cluster, indexed by cluster id, with the ratio as last column.
corrTable = pd.DataFrame(
    overlapRows,
    columns=['Cluster', 'Overlap Genes', 'Non-Overlap Genes'],
).set_index('Cluster')
corrTable['Ratio'] = ratios
print(corrTable)


         Overlap Genes  Non-Overlap Genes     Ratio
Cluster                                            
0                   14                589  0.023769
1                    5               1027  0.004869
2                    9               1233  0.007299
3                   28                309  0.090615
4                   13                447  0.029083
5                   42                  8  5.250000
6                    2                407  0.004914
7                   25                442  0.056561
8                    2                 40  0.050000
9                   21                958  0.021921
10                   6                  0  6.000000
11                  39                 91  0.428571


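Clusters 5 and 10 appear to have a high ratio of correct/incorrect (cluster 10 has no non-overlap genes, so the value shown for it is its overlap count rather than a true ratio). In order to understand what the data in each cluster represents, it's useful to manually observe it.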

In [31]:
# print(table[table.loc[:, 'Prediction'] == 10])

              Roetzer log2  Lelandais log2 Rep1  Lelandais log2 Rep2  \
FEATURE_NAME                                                           
CAGL0C01705g      3.553027             3.506469             3.522566   
CAGL0H04631g      3.069182             6.382119             6.303253   
CAGL0I01166g      4.909161             4.945712             5.319500   
CAGL0K06259g      2.450814             4.684777             5.689060   
CAGL0K10868g      4.059321             4.042283             3.756058   
CAGL0L05258g      0.067601             4.854121             4.921401   

              Kuo log2 Rep1  Kuo log2 Rep2  Kuo log2 Rep3  \
FEATURE_NAME                                                
CAGL0C01705g       2.429326       1.793841       1.684550   
CAGL0H04631g       2.938910       2.571172       3.164469   
CAGL0I01166g       2.015746       1.318805       1.224463   
CAGL0K06259g       2.369199       2.200777       2.222702   
CAGL0K10868g       2.641597       2.252599       2.159253

In [42]:
# print(table[table.loc[:, 'Prediction'] == 5])

              Roetzer log2  Lelandais log2 Rep1  Lelandais log2 Rep2  \
FEATURE_NAME                                                           
CAGL0B01419g      0.145682             0.601566            -0.125244   
CAGL0B02607g      1.147943             1.232364             1.007086   
CAGL0B03069g      0.269242            -0.055675             0.144454   
CAGL0C01727g     -0.056084            -0.083715             0.220708   
CAGL0D01496g      0.979657             1.123836             1.013580   
CAGL0D05434g     -0.304324             0.031505             1.241321   
CAGL0D05478g      0.054514            -0.244361             0.120199   
CAGL0D06138g      0.516345             0.483132             0.441893   
CAGL0E05170g     -0.598463            -0.802137            -1.353157   
CAGL0F01793g      0.282743             1.019380             1.114633   
CAGL0F07359g     -1.211840             0.760290             0.387049   
CAGL0F07513g      0.067601            -0.976698             0.36

Cluster 10 appears to contain strong targets of Yap1, and cluster 5 appears to contain likely targets of Yap1. To make sure there aren't false positives, we can directly inspect the 8 genes in cluster 5 that were not part of the overlap set, to see whether this analysis made any new discovery that wasn't found in earlier analyses.

In [56]:
# Genes assigned to cluster 5, then those of them missing from the target list.
cluster5 = table.loc[table['Prediction'] == 5]
newGenes = [gene for gene in cluster5.index if gene not in targets]
# print(newGenes)
print(table.loc[newGenes, :])

              Roetzer log2  Lelandais log2 Rep1  Lelandais log2 Rep2  \
FEATURE_NAME                                                           
CAGL0D05478g      0.054514            -0.244361             0.120199   
CAGL0G03223g      0.805756             0.133721            -0.405787   
CAGL0G09405r      0.067601            -0.026006             0.036205   
CAGL0K10879g      0.067601            -0.026006             0.036205   
CAGL0L13365r      0.067601            -0.026006             0.036205   
CAGL0L13376g      0.067601            -0.026006             0.036205   
CAGL0L13387r      0.067601            -0.026006             0.036205   
CAGL0L13392g      0.067601            -0.026006             0.036205   

              Kuo log2 Rep1  Kuo log2 Rep2  Kuo log2 Rep3  \
FEATURE_NAME                                                
CAGL0D05478g       2.801382       2.647244       2.196127   
CAGL0G03223g       0.096668       0.191343       0.150483   
CAGL0G09405r       1.690861       1

In [58]:
# Show every gene whose predicted cluster is 8.
print(table.loc[table['Prediction'] == 8])

              Roetzer log2  Lelandais log2 Rep1  Lelandais log2 Rep2  \
FEATURE_NAME                                                           
CAGL0A00132g      0.067601            -0.026006             0.036205   
CAGL0B00110g      0.181877            -0.026006             0.036205   
CAGL0B00116g      0.067601            -0.026006             0.036205   
CAGL0B03245g      0.025501             0.116651            -0.083314   
CAGL0B04895g      0.217062             0.016023            -0.726815   
CAGL0C00231g      0.067601             0.122012            -0.045777   
CAGL0C05621g      0.164058            -0.026006             0.036205   
CAGL0D02288g      0.067601             0.149435            -0.132811   
CAGL0F05005g      0.772439             0.376161             0.051762   
CAGL0F09075g      0.503680             0.152608             0.169236   
CAGL0F09273g     -0.597578            -0.026006             0.036205   
CAGL0G02893g     -0.110275            -0.048243             0.00