# Set up Python environment

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import os
from openpyxl import load_workbook
# Notebook-wide pandas display options.
pd.set_option('display.max_columns', None)   # show every column
# None disables truncation of cell contents. The legacy sentinel -1 was
# deprecated in pandas 1.0 in favour of None.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 500)

import math
import ast
import numpy as np

%matplotlib inline
from matplotlib.patches import Rectangle
import matplotlib.pyplot as plt
from matplotlib import rcParams
import matplotlib.image as mpimg

from IPython.display import display

# hack to get relative imports to work
import sys
sys.path.append("..") # Adds higher directory to python modules path.
from utils import utils
from utils import venn

# Set plot settings

In [None]:
# Font-size palette (points) available to the figures in this notebook.
SMALL_SIZE, MEDIUM_SIZE, BIGGER_SIZE, BIGGEST_SIZE = 16, 20, 24, 28

# Apply the medium size to every text element configured here.
for rc_group, rc_key in [
    ('font', 'size'),          # default text size
    ('axes', 'titlesize'),     # axes titles
    ('axes', 'labelsize'),     # x and y axis labels
    ('xtick', 'labelsize'),    # x tick labels
    ('ytick', 'labelsize'),    # y tick labels
    ('legend', 'fontsize'),    # legend entries
    ('figure', 'titlesize'),   # figure title
]:
    plt.rc(rc_group, **{rc_key: MEDIUM_SIZE})

# Load dataset

In [None]:
# Load the dataset that every later cell filters and plots (used below as a DataFrame).
# NOTE(review): unpickling can execute arbitrary code embedded in the file;
# only load pickles produced by a trusted pipeline.
df = pd.read_pickle("../Data/df_ORF_condition_normalized")

# Compare GEM, MACE & maxPeak scores to set thresholds

## Minimal score of targets shown by both GEM and MACE

In [None]:
# Provisional cut-offs: t[0] is the GEM lower bound, t[1] the MACE upper bound.
t = [1,0.005]

experiments = ['Fkh1 log','Fkh1 stat','Fkh2 log','Fkh2 stat']
for exp in experiments:
    # Genes called by both GEM and MACE at the provisional cut-offs ...
    called_by_both = (df['GEM '+exp] >= t[0]) & (df['MACE '+exp] <= t[1])
    # ... and the lowest maxPeak score found among them.
    print(df.loc[called_by_both, 'maxPeak_AD_12 '+exp].min())

## Set thresholds
Heuristically determined from the score comparisons.

We set the maxPeak score based on the lowest score obtained by genes predicted by GEM and MACE using the thresholds of 1 and 0.005.

In [None]:
# Final cut-offs used by the cells below, indexed as t[0]=maxPeak, t[1]=GEM, t[2]=MACE.
# The target-selection code compares maxPeak and GEM with >= and MACE with <=.
t = [1,1,0.005] # thresholds for: maxPeak, GEM, MACE

## Score comparison plots

In [None]:
experiments = ['Fkh1 log','Fkh1 stat','Fkh2 log','Fkh2 stat']

# One panel per experiment in a 2x2 layout: maxPeak SNR (y) against the MACE
# score (x, log scale), coloured by whether GEM reaches its threshold t[1].
fig = plt.figure(figsize=(20,12))
ax1 = plt.subplot2grid((100, 100), (0, 0), rowspan=44, colspan=45)
ax2 = plt.subplot2grid((100, 100), (0, 55), rowspan=44, colspan=45)
ax3 = plt.subplot2grid((100, 100), (56, 0), rowspan=44, colspan=45)
ax4 = plt.subplot2grid((100, 100), (56, 55), rowspan=44, colspan=45)
axes = [ax1,ax2,ax3,ax4]

for k in range(len(experiments)):
    ax = axes[k]
    exp = experiments[k]

    df_targets = df[['maxPeak_AD_12 '+exp, 'GEM '+exp , 'MACE '+exp]]

    # Genes without a GEM score (NaN) are grouped with those below the threshold.
    df_targets_gem = df_targets[df_targets['GEM '+exp] >= t[1]]
    df_targets_notgem = df_targets[(df_targets['GEM '+exp] < t[1]) | (df_targets['GEM '+exp].isnull())]

    x1 = df_targets_gem['MACE '+exp]
    y1 = df_targets_gem['maxPeak_AD_12 '+exp]

    x2 = df_targets_notgem['MACE '+exp]
    y2 = df_targets_notgem['maxPeak_AD_12 '+exp]

    # Draw the failing group first so the passing (blue) points end up on top.
    h2 = ax.scatter(x2,y2,c=['red']*len(x2),alpha=0.5)
    h1 = ax.scatter(x1,y1,c=['blue']*len(x1),alpha=0.5)

    # Dash-dotted guides: maxPeak threshold (horizontal), MACE threshold (vertical).
    ax.plot(np.linspace(0,0.01,100), [t[0]]*100,'k-.')
    # Series.max() skips NaN. The builtin max() over .values gives an
    # order-dependent (possibly NaN) result when a score is missing, which
    # would then break set_ylim below.
    ymax = 1.15*df_targets['maxPeak_AD_12 '+exp].max()
    ax.plot([t[2]]*100, np.linspace(0,ymax,100),'k-.')

    ax.set_xscale('log')
    ax.set_xlim(0.0001, 0.01)
    ax.set_xticks([0.0001, 0.001, 0.01])
    ax.set_xticklabels(['-4','-3','-2'])   # exponents, matching the log10 axis label
    ax.set_xlabel(r'$\log_{10}$(MACE p-value)')
    ax.set_ylim(-0.2,ymax)
    ax.set_ylabel('maxPeak SNR')
    ax.legend([h1,h2],[r'GEM SNR$\geq$1',r'GEM SNR$<$1'], loc=1)

plt.show()

fig.savefig('../Figures/Comparison_scores_maxPeak_vs_MACE.png', bbox_inches='tight', dpi=200)

In [None]:
experiments = ['Fkh1 log','Fkh1 stat','Fkh2 log','Fkh2 stat']

# One panel per experiment in a 2x2 layout: maxPeak SNR (y) against GEM SNR (x),
# coloured by where the MACE score falls relative to t[2] and 0.01.
fig = plt.figure(figsize=(20,12))
ax1 = plt.subplot2grid((100, 100), (0, 0), rowspan=44, colspan=45)
ax2 = plt.subplot2grid((100, 100), (0, 55), rowspan=44, colspan=45)
ax3 = plt.subplot2grid((100, 100), (56, 0), rowspan=44, colspan=45)
ax4 = plt.subplot2grid((100, 100), (56, 55), rowspan=44, colspan=45)
axes = [ax1,ax2,ax3,ax4]

for k in range(len(experiments)):
    ax = axes[k]
    exp = experiments[k]

    # Work on an independent copy so the fillna assignment below neither
    # touches `df` nor triggers a SettingWithCopyWarning.
    df_targets = df[['maxPeak_AD_12 '+exp, 'GEM '+exp, 'MACE '+exp]].copy()
    df_targets['GEM '+exp] = df_targets['GEM '+exp].fillna(0)   # no GEM score -> plotted at x=0

    df_targets_mace = df_targets[df_targets['MACE '+exp] <= t[2]]
    df_targets_mace_low = df_targets[(df_targets['MACE '+exp] > t[2]) & (df_targets['MACE '+exp] < 0.01)]
    # Rows without any MACE value; the legend labels this group "MACE > 0.01".
    df_targets_notmace = df_targets[df_targets['MACE '+exp].isnull()]

    x1 = df_targets_mace['GEM '+exp]
    y1 = df_targets_mace['maxPeak_AD_12 '+exp]

    x2 = df_targets_mace_low['GEM '+exp]
    y2 = df_targets_mace_low['maxPeak_AD_12 '+exp]

    x3 = df_targets_notmace['GEM '+exp]
    y3 = df_targets_notmace['maxPeak_AD_12 '+exp]

    # A single colour per group. The previous per-point colour lists were built
    # with the lengths of the wrong groups (x3 for the green points, x2 for the
    # red ones), which fails whenever the two groups differ in size.
    h1 = ax.scatter(x1,y1,color='blue',alpha=0.5)
    h2 = ax.scatter(x2,y2,color='green',alpha=0.5)
    h3 = ax.scatter(x3,y3,color='red',alpha=0.5)

    # Dash-dotted guides: maxPeak threshold (horizontal), GEM threshold (vertical).
    ax.plot(np.linspace(0,df_targets['GEM '+exp].dropna().max(),100), [t[0]]*100,'k-.')
    ymax = 1.65*df_targets['maxPeak_AD_12 '+exp].max()
    ax.plot([t[1]]*100, np.linspace(0,ymax,100),'k-.')

    ax.set_xlabel('GEM SNR')
    ax.set_ylabel('maxPeak SNR')
    ax.legend([h1,h2,h3],[r'MACE$\leq$'+str(t[2]), r'MACE$\in (0.005, 0.01)$',r'MACE$>$'+str(0.01)],loc=1)
    ax.set_ylim(-0.2,ymax)
    ax.set_xlim(-0.2,1.01*df_targets['GEM '+exp].dropna().max())

plt.show()

fig.savefig('../Figures/Comparison_scores_maxPeak_vs_GEM.png', bbox_inches='tight', dpi=200)

In [None]:
experiments = ['Fkh1 log','Fkh1 stat','Fkh2 log','Fkh2 stat']

# One panel per experiment in a 2x2 layout: GEM SNR (y) against the MACE score
# (x, log scale), coloured by whether maxPeak reaches its threshold t[0].
fig = plt.figure(figsize=(20,12))
ax1 = plt.subplot2grid((100, 100), (0, 0), rowspan=44, colspan=45)
ax2 = plt.subplot2grid((100, 100), (0, 55), rowspan=44, colspan=45)
ax3 = plt.subplot2grid((100, 100), (56, 0), rowspan=44, colspan=45)
ax4 = plt.subplot2grid((100, 100), (56, 55), rowspan=44, colspan=45)
axes = [ax1,ax2,ax3,ax4]

for k in range(len(experiments)):
    ax = axes[k]
    exp = experiments[k]

    # Work on an independent copy so the fillna assignment below neither
    # touches `df` nor triggers a SettingWithCopyWarning.
    df_targets = df[['maxPeak_AD_12 '+exp, 'GEM '+exp, 'MACE '+exp]].copy()
    df_targets['GEM '+exp] = df_targets['GEM '+exp].fillna(0)   # no GEM score -> plotted at y=0

    # Rows whose maxPeak score is NaN satisfy neither comparison and are not drawn.
    df_targets_max = df_targets[df_targets['maxPeak_AD_12 '+exp] >= t[0]]
    df_targets_notmax = df_targets[df_targets['maxPeak_AD_12 '+exp] < t[0]]

    x1 = df_targets_max['MACE '+exp]
    y1 = df_targets_max['GEM '+exp]

    x2 = df_targets_notmax['MACE '+exp]
    y2 = df_targets_notmax['GEM '+exp]

    h1 = ax.scatter(x1,y1,c=['blue']*len(x1),alpha=0.5)
    h2 = ax.scatter(x2,y2,c=['red']*len(x2),alpha=0.25)

    # Dash-dotted guides: GEM threshold (horizontal), MACE threshold (vertical).
    ax.plot(np.linspace(0,0.01,100), [t[1]]*100,'k-.')
    ymax = 1.03*df_targets['GEM '+exp].dropna().max()
    ax.plot([t[2]]*100, np.linspace(0,ymax,100),'k-.')

    ax.set_xscale('log')
    ax.set_xlim(0.0001, 0.01)
    ax.set_xticks([0.0001, 0.001, 0.01])
    ax.set_xticklabels(['-4','-3','-2'])   # exponents, matching the log10 axis label
    ax.set_xlabel(r'$\log_{10}$(MACE p-value)')
    ax.set_ylim(-0.2,ymax)
    ax.set_ylabel('GEM SNR')
    ax.legend([h1,h2],[r'maxPeak SNR$\geq$'+str(t[0]), r'maxPeak SNR$<$'+str(t[0])], loc=1)

plt.show()

fig.savefig('../Figures/Comparison_scores_GEM_vs_MACE.png', bbox_inches='tight', dpi=200)

## Overlap in targets between the 3 peak detection methods

In [None]:
experiments = ['Fkh1 log','Fkh1 stat','Fkh2 log','Fkh2 stat']

for i, exp in enumerate(experiments):
    # Index values of the genes each method calls at the final thresholds.
    gem_hits = df[df['GEM '+exp]>=t[1]].index.values
    mace_hits = df[df['MACE '+exp]<=t[2]].index.values
    maxpeak_hits = df[df['maxPeak_AD_12 '+exp]>=t[0]].index.values
    labels = venn.get_labels([gem_hits, mace_hits, maxpeak_hits], fill=['number'])

    # The same diagram is written twice: with a legend, and without one at higher dpi.
    for with_legend, height, suffix, resolution in [(True, 1.2, '', 200),
                                                    (False, 1, '_NO_LEGEND', 300)]:
        fig, ax = venn.venn3(labels, names=['GEM','MACE','maxPeak'], legend=with_legend, ymax=height)
        fig.savefig('../Figures/Venn_3_methods_'+exp+suffix+'.png', bbox_inches='tight', dpi=resolution)
        plt.clf()

# Read the legend-free files back and show them side by side in a 2x2 grid.
img1 = mpimg.imread('../Figures/Venn_3_methods_Fkh1 log_NO_LEGEND.png',)
img2 = mpimg.imread('../Figures/Venn_3_methods_Fkh1 stat_NO_LEGEND.png')
img3 = mpimg.imread('../Figures/Venn_3_methods_Fkh2 log_NO_LEGEND.png')
img4 = mpimg.imread('../Figures/Venn_3_methods_Fkh2 stat_NO_LEGEND.png')

plt.figure().set_size_inches(25,25)
for position, panel in zip((221, 222, 223, 224), (img1, img2, img3, img4)):
    plt.subplot(position)
    plt.imshow(panel)
    plt.axis('off')
plt.show()

# Define targets and save subdataframes

## Our targets: 2x PDM, 3x PDM and CCR
"Targets" are genes predicted by at least 2 of the 3 peak detection methods (maxPeak, GEM and MACE). Genes predicted by all 3 methods are also saved separately.

In [None]:
methods = ['maxPeak_AD_12','GEM','MACE']
experiments = ['Fkh1 log','Fkh1 stat','Fkh2 log','Fkh2 stat']

df_targets_list = []
df_targets_ccr_list = []
df_targets_3x_PDM_list = []

####################
# For each experiment, select the genes supported by at least two of the three
# methods, the subset with an expression peak phase (CCR), and the genes
# supported by all three methods.
####################
for exp in experiments:
    hit_maxpeak = df['maxPeak_AD_12 '+exp] >= t[0]
    hit_gem = df['GEM '+exp] >= t[1]
    hit_mace = df['MACE '+exp] <= t[2]

    # 2x methods. Taken as an independent copy so that columns added to these
    # frames later do not raise SettingWithCopyWarning or alias `df`.
    df_targets = df[(hit_maxpeak & hit_gem)
                    | (hit_maxpeak & hit_mace)
                    | (hit_gem & hit_mace)].copy()
    df_targets.to_excel('../Tables/targets_'+exp+'.xlsx')

    # CCR: targets that have an expression peak phase
    df_targets_ccr = df_targets[~df_targets['Expression peak phase'].isnull()]

    # 3x methods
    df_targets_3x_PDM = df[hit_maxpeak & hit_gem & hit_mace]

    # append all to lists
    df_targets_list.append(df_targets)
    df_targets_3x_PDM_list.append(df_targets_3x_PDM)
    df_targets_ccr_list.append(df_targets_ccr)

    print(exp, '\t\t', len(df_targets), '('+str(len(df_targets_3x_PDM))+')', '\t\tCell cycle regulated:', len(df_targets_ccr),
         '\t\t','Metabolic enzymes:', len(df_targets[df_targets['is enzyme']]))


# Unique genes across the log and stat experiments of each factor.
print('Fkh1 unique targets in common core:', len(set(df_targets_list[0].index.tolist()) | set(df_targets_list[1].index.tolist())))
print('Fkh2 unique targets in common core:', len(set(df_targets_list[2].index.tolist()) | set(df_targets_list[3].index.tolist())))


######################
# Export to Excel (one sheet per experiment)
######################
# start from empty 3x PDM file
pdm_path = '../Tables/subset_targets_3x_PDM.xlsx'
try:
    os.remove(pdm_path)
except FileNotFoundError:
    # Nothing to clean up on the first run; any other OS error should surface.
    pass

# The context manager saves and closes the workbook even if a sheet fails;
# it replaces the deprecated writer.save().
with pd.ExcelWriter(pdm_path, engine='xlsxwriter') as writer:
    for exp,subdf in zip(experiments,df_targets_3x_PDM_list):
        subdf.to_excel(writer,sheet_name=exp)

# start from empty CCR file
ccr_path = '../Tables/subset_targets_CCR.xlsx'
try:
    os.remove(ccr_path)
except FileNotFoundError:
    pass

with pd.ExcelWriter(ccr_path, engine='xlsxwriter') as writer:
    for exp,subdf in zip(experiments,df_targets_ccr_list):
        subdf.to_excel(writer,sheet_name=exp)

######################
# give readable names to the sub-dataframes
# (these are the same objects as the list entries, not copies)
######################
# Targets shown by 2 methods
df_Fkh1log_targets = df_targets_list[0]
df_Fkh1stat_targets = df_targets_list[1]
df_Fkh2log_targets = df_targets_list[2]
df_Fkh2stat_targets = df_targets_list[3]

# cell cycle dependent targets
df_Fkh1log_targets_ccr = df_targets_ccr_list[0]
df_Fkh1stat_targets_ccr = df_targets_ccr_list[1]
df_Fkh2log_targets_ccr = df_targets_ccr_list[2]
df_Fkh2stat_targets_ccr = df_targets_ccr_list[3]

## Targets in common among MacIsaac, Venters & Ostrow

In [None]:
# Genes flagged by all three literature studies, per factor.
df_Fkh1log_targets_literature = df[(df['MacIsaac 2006 Fkh1']) &
                                   (df['Venters 2011 Fkh1']) &
                                   (df['Ostrow 2014 Fkh1'])]

df_Fkh2log_targets_literature = df[(df['MacIsaac 2006 Fkh2']) &
                                   (df['Venters 2011 Fkh2']) &
                                   (df['Ostrow 2014 Fkh2'])]

######################
# Export to Excel (one sheet per factor)
######################
path = '../Tables/literature_targets_macIsaac_Venters_and_Ostrow.xlsx'
try:
    os.remove(path)
except FileNotFoundError:
    # Nothing to clean up on the first run; any other OS error should surface.
    pass

# The context manager saves and closes the workbook; it replaces the
# deprecated writer.save().
with pd.ExcelWriter(path, engine='xlsxwriter') as writer:
    for fkh,subdf in zip(['Fkh1','Fkh2'],[df_Fkh1log_targets_literature,df_Fkh2log_targets_literature]):
        subdf.to_excel(writer,sheet_name=fkh)

#### Print the gene names
print('Common core among 3 literature ChIP studies for Fkh1 in log phase:', len(df_Fkh1log_targets_literature))
print(df_Fkh1log_targets_literature['Standard name'].values)

print('Common core among 3 literature ChIP studies for Fkh2 in log phase:', len(df_Fkh2log_targets_literature))
print(df_Fkh2log_targets_literature['Standard name'].values)

## Targets in common among all 4 studies

In [None]:
# Log-phase targets of this study that all three literature studies also report.
df_Fkh1log_targets_4x_ChIP = df_Fkh1log_targets[(df_Fkh1log_targets['MacIsaac 2006 Fkh1']) &
                                                   (df_Fkh1log_targets['Venters 2011 Fkh1']) &
                                                   (df_Fkh1log_targets['Ostrow 2014 Fkh1'])]

df_Fkh2log_targets_4x_ChIP = df_Fkh2log_targets[(df_Fkh2log_targets['MacIsaac 2006 Fkh2']) &
                                                   (df_Fkh2log_targets['Venters 2011 Fkh2']) &
                                                   (df_Fkh2log_targets['Ostrow 2014 Fkh2'])]

print('Common core among 4 ChIP studies for Fkh1 in log phase:', len(df_Fkh1log_targets_4x_ChIP))
print(df_Fkh1log_targets_4x_ChIP['Standard name'].values)

print('Enzymes', len(df_Fkh1log_targets_4x_ChIP[df_Fkh1log_targets_4x_ChIP['is enzyme']]))
print(df_Fkh1log_targets_4x_ChIP[df_Fkh1log_targets_4x_ChIP['is enzyme']]['Standard name'].values)


print('Common core among 4 ChIP studies for Fkh2 in log phase:', len(df_Fkh2log_targets_4x_ChIP))
print(df_Fkh2log_targets_4x_ChIP['Standard name'].values)

print('Enzymes', len(df_Fkh2log_targets_4x_ChIP[df_Fkh2log_targets_4x_ChIP['is enzyme']]))
print(df_Fkh2log_targets_4x_ChIP[df_Fkh2log_targets_4x_ChIP['is enzyme']]['Standard name'].values)

######################
# Export to Excel (one sheet per factor)
######################
path = '../Tables/subset_targets_4x_ChIP.xlsx'
try:
    os.remove(path)
except FileNotFoundError:
    # Nothing to clean up on the first run; any other OS error should surface.
    pass

# The context manager saves and closes the workbook; it replaces the
# deprecated writer.save().
with pd.ExcelWriter(path, engine='xlsxwriter') as writer:
    for fkh,subdf in zip(['Fkh1','Fkh2'],[df_Fkh1log_targets_4x_ChIP,df_Fkh2log_targets_4x_ChIP]):
        subdf.to_excel(writer,sheet_name=fkh)

## Novel targets
Novel means not reported as a target by Venters et al., Ostrow et al. or MacIsaac et al.

In [None]:
# Annotate each log-phase target with the studies reporting it. The column is
# added to the existing frames in place, so every other reference to the same
# frame objects sees it too.
df_Fkh1log_targets['Suggested by (Fkh1)'] = ''
df_Fkh1log_targets['Suggested by (Fkh1)'] = df_Fkh1log_targets.apply(lambda row: utils.identify_papers_showing_target(row, 'Fkh1 log'),axis=1)
df_Fkh2log_targets['Suggested by (Fkh2)'] = ''
df_Fkh2log_targets['Suggested by (Fkh2)'] = df_Fkh2log_targets.apply(lambda row: utils.identify_papers_showing_target(row, 'Fkh2 log'),axis=1)

# Number of ", "-separated entries per gene, computed once per factor instead
# of re-splitting the string in six separate row-wise apply calls.
n_sources_Fkh1 = df_Fkh1log_targets['Suggested by (Fkh1)'].str.split(', ').str.len()
n_sources_Fkh2 = df_Fkh2log_targets['Suggested by (Fkh2)'].str.split(', ').str.len()

# Define variables for verified targets for later
df_verified_targets_Fkh1 = df_Fkh1log_targets[n_sources_Fkh1 > 1]
df_verified_targets_Fkh2 = df_Fkh2log_targets[n_sources_Fkh2 > 1]
df_verified_3_targets_Fkh1 = df_Fkh1log_targets[n_sources_Fkh1 > 2]
df_verified_3_targets_Fkh2 = df_Fkh2log_targets[n_sources_Fkh2 > 2]
df_verified_4_targets_Fkh1 = df_Fkh1log_targets[n_sources_Fkh1 > 3]
df_verified_4_targets_Fkh2 = df_Fkh2log_targets[n_sources_Fkh2 > 3]

# novel targets: the annotation names only 'Mondeel'
df_novel_Fkh1 = df_Fkh1log_targets[df_Fkh1log_targets['Suggested by (Fkh1)'] == 'Mondeel']
df_novel_Fkh2 = df_Fkh2log_targets[df_Fkh2log_targets['Suggested by (Fkh2)'] == 'Mondeel']

print('Number of novel targets (Fkh1, Fkh2):', len(df_novel_Fkh1), len(df_novel_Fkh2))
print('Number of enzymes among novel targets (Fkh1, Fkh2):', 
      len(df_novel_Fkh1[df_novel_Fkh1['is enzyme']]), 
      len(df_novel_Fkh2[df_novel_Fkh2['is enzyme']]))

######################
# Export to Excel (one sheet per factor)
######################
# start from empty novel-targets file
novel_path = '../Tables/subset_targets_novel.xlsx'
try:
    os.remove(novel_path)
except FileNotFoundError:
    # Nothing to clean up on the first run; any other OS error should surface.
    pass

# The context manager saves and closes the workbook; it replaces the
# deprecated writer.save().
with pd.ExcelWriter(novel_path, engine='xlsxwriter') as writer:
    df_novel_Fkh1.to_excel(writer,sheet_name='Fkh1')
    df_novel_Fkh2.to_excel(writer,sheet_name='Fkh2')

## Target overview table for Pathview

In [None]:
def assign_exo_score(row, exo_targets):
    """Flag whether a gene belongs to a target set.

    Parameters
    ----------
    row : pandas.Series
        One DataFrame row; only its ``name`` (the row's index label) is used.
    exo_targets : container
        Index labels of the target genes; anything supporting ``in``.

    Returns
    -------
    int or float
        ``1`` when ``row.name`` is in ``exo_targets``, otherwise NaN.
    """
    if row.name in exo_targets:
        return 1
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    return np.nan

def assign_consensus_score(row, df, exo_targets, experiment=None):
    """Score the agreement between this study and the literature for one gene.

    Parameters
    ----------
    row : pandas.Series
        One DataFrame row; only its ``name`` (the row's index label) is used.
    df : pandas.DataFrame
        Frame holding the literature columns ``'MacIsaac 2006 <TF>'``,
        ``'Venters 2011 <TF>'`` and ``'Ostrow 2014 <TF>'``, indexed by the
        same labels as ``row.name``.
    exo_targets : container
        Index labels of this study's targets; anything supporting ``in``.
    experiment : str, optional
        Experiment label such as ``'Fkh1 log'``. The ``<TF>`` suffix of the
        literature columns is this label with its last four characters and
        any trailing whitespace removed. When omitted, the notebook-level
        variable ``exp`` is used, which is how earlier call sites passed it.

    Returns
    -------
    int or float
        ``1`` if the gene is in ``exo_targets`` and flagged by at least one
        literature column, ``0`` if it is only in ``exo_targets``, ``-1`` if
        it is only flagged by the literature, NaN otherwise.
    """
    if experiment is None:
        # Backward compatibility: fall back to the global loop variable.
        experiment = exp
    factor = experiment[:-4].rstrip()

    evidence_exo = row.name in exo_targets
    # Look the gene up once instead of once per literature column.
    literature = df.loc[row.name]
    evidence_chip = (literature['MacIsaac 2006 '+factor]
                     or literature['Venters 2011 '+factor]
                     or literature['Ostrow 2014 '+factor])

    if evidence_chip and evidence_exo:
        score = 1
    elif evidence_exo:
        score = 0
    elif evidence_chip:
        score = -1
    else:
        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
        score = np.nan

    return score

# Start from an independent copy of the two descriptive columns.
df_pathview = pd.DataFrame(df,copy=True)[['Standard name','Description']]

# One 'Target <experiment>' column per experiment: 1 for targets, NaN otherwise.
for i,exp in enumerate(experiments):
    target_index = df_targets_list[i].index
    df_pathview['Target '+exp] = df_pathview.apply(lambda row: assign_exo_score(row, target_index), axis=1)

# Consensus with the literature, log phase only.
# The loop variable must stay named `exp`: assign_consensus_score reads it.
for i,exp in [(0,'Fkh1 log'),(2, 'Fkh2 log')]:
    target_index = df_targets_list[i].index
    df_pathview['Consensus '+exp] = df_pathview.apply(lambda row: assign_consensus_score(row, df, target_index), axis=1)

df_pathview.to_excel('../Tables/pathview_targets.xlsx')

## Print list of target genes for each experiment
highlight enzymes and novel targets

In [None]:
for i,df_targets in enumerate(df_targets_list):
    exp = experiments[i]
    print(exp)

    # All targets, standard names in sorted order
    l = df_targets.index.values
    l_std = sorted(df_targets.loc[g]['Standard name'] for g in l)
    print(''.join(str(name)+', ' for name in l_std), end='')
    print('\n')

    # Targets flagged as enzymes, in table order
    print('Metabolic enzymes:', end='\t')
    for _, gene_row in df_targets.iterrows():
        if gene_row['is enzyme']:
            print(gene_row['Standard name'],end=', ')
    print('\n')

    # Targets not flagged by any of the three literature studies for this factor
    tf = exp[:4]
    print('Novel targets:', end='\t')
    for _, gene_row in df_targets.iterrows():
        in_literature = (gene_row['Venters 2011 ' + tf]
                         or gene_row['Ostrow 2014 ' + tf]
                         or gene_row['MacIsaac 2006 ' + tf])
        if not in_literature:
            print(gene_row['Standard name'],end=', ')
    print('\n')

## Print 3x PDM targets

In [None]:
total_3x_pdm = []
Fkh1_3x_pdm = []
Fkh2_3x_pdm = []

# Build the novel-name lookups once. Previously the full name list was rebuilt
# for every gene inside the loop.
novel_names_Fkh1 = set(df_novel_Fkh1['Standard name'].tolist())
novel_names_Fkh2 = set(df_novel_Fkh2['Standard name'].tolist())

for i,exp in enumerate(experiments):
    print(exp)
    d = df_targets_3x_PDM_list[i]

    l = d['Standard name'].tolist()

    # save the genes: experiments 0-1 belong to Fkh1, 2-3 to Fkh2
    total_3x_pdm.extend(l)
    if i < 2:
        Fkh1_3x_pdm.extend(l)
    else:
        Fkh2_3x_pdm.extend(l)

    print(', '.join(l))
    print('enzymes')
    print(d[d['is enzyme']]['Standard name'].tolist())
    print('novel')
    novel_pool = novel_names_Fkh1 if 'Fkh1' in exp else novel_names_Fkh2
    novel_3x = [g for g in l if g in novel_pool]

    print(novel_3x)
    print()

# total unique gene counts
total_3x_pdm = list(set(total_3x_pdm))
Fkh1_3x_pdm = list(set(Fkh1_3x_pdm))
Fkh2_3x_pdm = list(set(Fkh2_3x_pdm))

print('Total number of unique 3x PDM verfied targets:',len(total_3x_pdm))
print('Number of unique 3x PDM verfied targets for Fkh1:',len(Fkh1_3x_pdm))
print('Number of unique 3x PDM verfied targets for Fkh2:',len(Fkh2_3x_pdm))

## Summary table

In [None]:
# One entry per summary row; each entry holds one item per experiment, in the
# order of `experiments`. np.nan marks combinations that are only defined for
# the log-phase experiments. (np.nan rather than np.NaN: the capitalised alias
# was removed in NumPy 2.0.)
data = [df_targets_list]
data.append([df_Fkh1log_targets_4x_ChIP,np.nan,df_Fkh2log_targets_4x_ChIP,np.nan])
data.append(df_targets_3x_PDM_list)
data.append([df_novel_Fkh1, np.nan, df_novel_Fkh2, np.nan])
data.append(df_targets_ccr_list)
data.append([df_temp[df_temp['is enzyme']] for df_temp in df_targets_list])
data.append([df_temp[df_temp['yeast7']] for df_temp in df_targets_list])

# Frames become row counts; NaN placeholders pass through unchanged.
data_counts = [[len(entry[i]) if isinstance(entry[i], pd.DataFrame) else entry[i] for entry in data]
               for i in range(len(experiments))]


df_counts = pd.DataFrame(data_counts, columns=["# Targets",'# 4x ChIP verified','#3x PDM verified',"# Novel targets",'# CCR',
                                               '# enzymes','# metabolic enzymes'], 
                         index=experiments).transpose()
display(df_counts)

# Venn Diagrams of overlap with previous studies
Here we plot a sequence of Venn diagrams of overlapping & unique targets between this study and previous studies:
- Mondeel et al. (this study)
- Ostrow et al. http://doi.org/10.1371/journal.pone.0087647
- Venters et al. http://doi.org/10.1016/j.molcel.2011.01.015
- MacIsaac et al. http://doi.org/10.1186/1471-2105-7-113

## Compare common core with literature

## MacIsaac, Venters & Ostrow

In [None]:
# Column prefixes and display names of the three literature studies, in matching order.
study_columns = ['MacIsaac 2006 ', 'Venters 2011 ', 'Ostrow 2014 ']
study_names = ['MacIsaac et al.', 'Venters et al.', 'Ostrow et al.']

# --- Fkh1 ---
labels = venn.get_labels([df[df[prefix + 'Fkh1']].index.values for prefix in study_columns],
                         fill=['number'])
fig, ax = venn.venn3(labels, names=list(study_names), legend=True, ymax=1.2)
fig.show()
fig.savefig('../Figures/Overlap_targets_venters_ostrow_macIsaac_Fkh1log.jpg', bbox_inches='tight', dpi=200)
plt.clf()

n_total = sum(int(count) for count in labels.values())
print(labels['111'],"Out of",n_total,"Overlapping among all three studies.")

# --- Fkh2 --- (this figure is not cleared after saving)
labels = venn.get_labels([df[df[prefix + 'Fkh2']].index.values for prefix in study_columns],
                         fill=['number'])
fig, ax = venn.venn3(labels, names=list(study_names), legend=True, ymax=1.2)
fig.show()
fig.savefig('../Figures/Overlap_targets_venters_ostrow_macIsaac_Fkh2log.jpg', bbox_inches='tight', dpi=200)

n_total = sum(int(count) for count in labels.values())
print(labels['111'],"Out of",n_total,"Overlapping among all three studies.")

## All 4 ChIP studies

In [None]:
for i,df_targets in enumerate(df_targets_list):
    exp = experiments[i]
    print("\n"+exp)

    tf = exp[:4]   # first four characters of the label select the literature columns
    labels = venn.get_labels([df[df['MacIsaac 2006 ' + tf]].index.values,
                              df[df['Venters 2011 ' + tf]].index.values,
                              df[df['Ostrow 2014 ' + tf]].index.values,
                              df_targets.index.values],
                             fill=['number'])

    fig, ax = venn.venn4(labels, names=['MacIsaac et al.',
                                        'Venters et al.',
                                        'Ostrow et al.',
                                        'This study'], legend=True, textsize=25, )

    fig.savefig('../Figures/Overlap_'+exp+'.png', bbox_inches='tight', dpi=200)
    fig.show()
    plt.clf()

    # Sum of all region labels, and of the four single-study regions.
    n_all = sum(int(count) for count in labels.values())
    n_single = sum(int(labels[region]) for region in ['1000', '0100', '0010', '0001'])

    print(labels['1111'],"Out of",n_all,"Overlapping among all four studies.")
    print(labels['0001'],"Out of",n_all,"New in our study")
    print(n_all - n_single,
          "Out of",n_all,"Shown by at least 2 studies.")


# Read the saved diagrams back and show them side by side in a 2x2 grid.
img1 = mpimg.imread('../Figures/Overlap_Fkh1 log.png',)
img2 = mpimg.imread('../Figures/Overlap_Fkh1 stat.png')
img3 = mpimg.imread('../Figures/Overlap_Fkh2 log.png')
img4 = mpimg.imread('../Figures/Overlap_Fkh2 stat.png')

plt.figure().set_size_inches(25,25)
for position, panel in zip((221, 222, 223, 224), (img1, img2, img3, img4)):
    plt.subplot(position)
    plt.imshow(panel)
    plt.axis('off')
plt.show()

# GO term enrichment
## GEMMER primary GO term

In [None]:
# Frames to compare: the full table first, then each experiment's targets.
dict_df = {'all': df}
dict_df['Fkh1 log'] = df_Fkh1log_targets
dict_df['Fkh1 stat'] = df_Fkh1stat_targets
dict_df['Fkh2 log'] = df_Fkh2log_targets
dict_df['Fkh2 stat'] = df_Fkh2stat_targets

# The styled table is the cell's last expression, so the notebook renders it.
enrichment = utils.get_functional_enrichment_multi(dict_df,'Primary GO term')
enrichment.style.applymap(utils._color_red_or_green)

# Build complete common core dataframe
All relevant columns, all genes suggested by each of the 4 experiments and a column indicating the experiment

In [None]:
exp_abbr = ['F1L','F1S','F2L','F2S']

idx = []
idx_to_exp = {}   # gene label -> abbreviations of the experiments that list it
for i,df_targets in enumerate(df_targets_list):
    l = df_targets.index.tolist()
    idx.extend(l)

    # record the current experiment against every gene it contributes
    for g in l:
        idx_to_exp.setdefault(g, []).append(exp_abbr[i])

# de-duplicate the labels, then pull those rows from the full table
idx = list(set(idx))
df_cc = df.loc[idx]
df_cc['Target of'] = pd.Series(idx_to_exp)
print("Number of genes in the common core:", len(df_cc))

df_cc_ccr = df_cc[df_cc['Expression peak phase'].notnull()]
print("Number of genes in the common core that are cell cycle dependent:", len(df_cc_ccr))

# Timing

## Combined Fkh1,2 stackplot
Combine all common core genes for all four experiments that are cell cycle dependent. Plot them in bins according to their phase of peak expression (Rowicka et al.)

In [None]:
plt.rc('font', size=12)          # smaller text for this figure only

experiment = ['maxPeak_AD_12 Fkh1 log']
data = [df_cc_ccr]

fig = plt.figure()
ax1 = plt.subplot2grid((100, 1), (0, 0), rowspan=40)
axes = [ax1]

# Single panel; the loop mirrors the per-experiment stackplot cells.
for k, (d, exp) in enumerate(zip(data, experiment)):
    ax = utils.draw_stackplot_cc(d, exp, axes[k], no_xlabel=False, pos_mult=(0.9,1.2))

fig.set_size_inches(12,35)
plt.show()
fig.savefig('../Figures/stackplot_combined_relative_SNR_vs_phase.png', bbox_inches='tight', dpi=200)

plt.rc('font', size=MEDIUM_SIZE)          # restore the notebook default

## Separated stackplots per experiment
### Logarithmic phase

In [None]:
plt.rc('font', size=13)          # smaller text for this figure only

experiment = ['Fkh1 log', 'Fkh2 log'] 
data = [df_Fkh1log_targets_ccr, df_Fkh2log_targets_ccr]

# Two stacked panels of unequal height (68 and 30 rows of a 100-row grid).
fig = plt.figure()
ax1 = plt.subplot2grid((100, 1), (0, 0), rowspan=68)
ax2 = plt.subplot2grid((100, 1), (70, 0), rowspan=30)

axes = [ax1, ax2]

for k, (d, exp) in enumerate(zip(data, experiment)):
    # the flag is True only for the first (upper) panel
    no_label = (k == 0)
    ax = utils.draw_stackplot(d, t, exp, axes[k], no_label, pos_mult=(1,0.9))

fig.set_size_inches(13,15)
plt.show()
fig.savefig('../Figures/stackplot_separated_relative_SNR_vs_phase_log.png', bbox_inches='tight', dpi=300)

plt.rc('font', size=MEDIUM_SIZE)          # restore the notebook default

### Stationary phase

In [None]:
plt.rc('font', size=13)          # smaller text for this figure only

experiment = ['Fkh1 stat', 'Fkh2 stat'] 
data = [df_Fkh1stat_targets_ccr, df_Fkh2stat_targets_ccr]

# Two stacked panels of unequal height (68 and 30 rows of a 100-row grid).
fig = plt.figure()
ax1 = plt.subplot2grid((100, 1), (0, 0), rowspan=68)
ax2 = plt.subplot2grid((100, 1), (70, 0), rowspan=30)

axes = [ax1, ax2]

for k, (d, exp) in enumerate(zip(data, experiment)):
    # the flag is True only for the first (upper) panel
    no_label = (k == 0)
    ax = utils.draw_stackplot(d, t, exp, axes[k], no_label, pos_mult=(1,0.9))

fig.set_size_inches(13,20)
plt.show()
fig.savefig('../Figures/stackplot_separated_relative_SNR_vs_phase_stat.png', bbox_inches='tight', dpi=300)

plt.rc('font', size=MEDIUM_SIZE)          # restore the notebook default

## Phase distribution of cell cycle regulated targets

In [None]:
plt.rc('font', size=MEDIUM_SIZE-1) # the numbers in the pies
plt.rc('xtick', labelsize=MEDIUM_SIZE-1) # fontsize of the pie labels
plt.rc('axes', titlesize=MEDIUM_SIZE-1) # experiments

df_ccr = df[df['Expression peak phase'].notnull() ]

# Genome-wide share (in %) of each expression peak phase, in this fixed order.
perc_genome_wide = df_ccr['Expression peak phase'].value_counts(dropna=False) / len(df_ccr) * 100
perc_genome_wide = perc_genome_wide.reindex(['G1','G1(P)','G1/S','S','G2','G2/M','M','M/G1'])

titles = ['Genome-wide', 'Fkh1 logarithmic', 'Fkh1 stationary', 'Fkh2 logarithmic', 'Fkh2 stationary']

colors_dict = {np.nan:"#F8F8FF","G1(P)":"lightblue", 
               "G1/S":"pink", "S":"#ff4c4c", "G2":"orange",
               "G2/M":"#ffe119","M":"#d65bd6","M/G1":"#2ca02c", "G1":"lightgrey"}

data = [perc_genome_wide]

fig = plt.figure()
# total size = 21x43
# two rows (10) and 4 columns (10), all spacings equal to 1
# the image in the first row is centered between the 2nd and 3rd column
cspace = 1
rspace = 2
r = 10 # row height
c = 10 # col width
ax1 = plt.subplot2grid((2*r+rspace, 4*c+3*cspace), (0, 16), colspan=c, rowspan=r)
ax2 = plt.subplot2grid((2*r+rspace, 4*c+3*cspace), (r+rspace, 0), colspan=c, rowspan=r)
ax3 = plt.subplot2grid((2*r+rspace, 4*c+3*cspace), (r+rspace, 1*(c+cspace)), colspan=c, rowspan=r)
ax4 = plt.subplot2grid((2*r+rspace, 4*c+3*cspace), (r+rspace, 2*(c+cspace)), colspan=c, rowspan=r)
ax5 = plt.subplot2grid((2*r+rspace, 4*c+3*cspace), (r+rspace, 3*(c+cspace)), colspan=c, rowspan=r)
axes = [ax1,ax2,ax3,ax4,ax5]
fig.set_size_inches(4*(c+cspace)/2.,2*(r+rspace)/2.)

startangles = [90, 90, 90, 90, 90] # G1 roughly at the top
pctdistance = [0.75,0.75,0.75,0.75,0.75]

# Panel 0 is the genome-wide reference; panels 1-4 are the four experiments.
df_data = []
for k in range(len(axes)):
    ax = axes[k]

    if k == 0:
        d = perc_genome_wide
    else:
        df_targets = df_targets_list[k-1]
        df_targets_ccr = df_targets[~df_targets['Expression peak phase'].isnull()]
        d = df_targets_ccr['Expression peak phase'].value_counts(dropna=False) / len(df_targets_ccr) * 100
        d = d.reindex(['G1','G1(P)','G1/S','S','G2','G2/M','M','M/G1'])
        d2 = d.fillna(0) # absent phases count as 0% in the comparison
        d = d.dropna()   # ... and are left out of the pie

        # Sort phases by whether their share is above, below or equal to genome-wide.
        l_up = []; l_down = []; l_same = []
        for phase in d2.index:
            if (d2.loc[phase] / perc_genome_wide.loc[phase] > 1) :
                l_up.append(phase)
            elif (d2.loc[phase] / perc_genome_wide.loc[phase] < 1):
                l_down.append(phase)
            else:
                l_same.append(phase)
        str_up = ', '.join(l_up)
        str_down = ', '.join(l_down)
        str_same = ', '.join(l_same)

        # Percentage-point differences summed over positions 0-2, 3 and 4-7 of
        # the reindexed phase order. .iloc makes the positional access explicit:
        # plain d2[i] with an integer on a label-indexed Series relies on a
        # deprecated positional fallback.
        df_data.append([
            round(sum([d2.iloc[pos] - perc_genome_wide.iloc[pos] for pos in [0,1,2]]),2),
            round(sum([d2.iloc[pos] - perc_genome_wide.iloc[pos] for pos in [3]]),2),
            round(sum([d2.iloc[pos] - perc_genome_wide.iloc[pos] for pos in [4,5,6,7]]),2),
            str_up,
            str_down,
            str_same
        ])

    x = d.index
    y = d.values

    colors = [colors_dict[func] for func in x]

    ax.pie(y, labels=x, autopct='%1.1f%%', pctdistance=pctdistance[k], shadow=False, colors=colors,
           startangle=startangles[k], counterclock=False)
    ax.set_title(titles[k])

plt.show()
fig.savefig('../Figures/Piechart_ccr_targets.pdf', bbox_inches='tight', dpi=200)

df_ccr_enrichment = pd.DataFrame(df_data,columns=['Early','Mid','Late','Up','Down','No change'],
                                   index=experiments)
display(df_ccr_enrichment)

# restore the notebook defaults
plt.rc('font', size=MEDIUM_SIZE)
plt.rc('xtick', labelsize=MEDIUM_SIZE)
plt.rc('axes', titlesize=MEDIUM_SIZE)

# KEGG Pathways
## All pathways 

In [None]:
# Collect, for every KEGG pathway, the target genes found in each experiment.
pathway_counts = {} # pathway : {experiment: [gene1, gene2, ...]}
for exp, df_exp in zip(experiments, df_targets_list):

    # keep only the genes annotated with at least one KEGG pathway
    df_kegg = df_exp[df_exp['KEGG pathway'].notnull()]

    for _, row in df_kegg.iterrows(): # loop over targets that are in KEGG pathways

        # prefer the standard name, fall back to the systematic name
        if row['Standard name'] != '':
            gene = row['Standard name']
        else:
            gene = row['Systematic name']

        # str.split never returns an empty list (an empty annotation gives ['']),
        # so empty names are dropped here to make the "no pathway" report reachable
        list_of_pathways = [p for p in row['KEGG pathway'].split(', ') if p != '']

        if len(list_of_pathways) == 0:
            print(gene)

        for pathway in list_of_pathways:
            if pathway not in pathway_counts:
                # init: one (initially empty) gene list per experiment
                pathway_counts[pathway] = {e: [] for e in experiments}

            # add current gene to correct experiment
            pathway_counts[pathway][exp].append(gene)


df_pathway_counts = pd.DataFrame.from_dict(pathway_counts, orient="index").sort_index()

# turn lists of genes into strings
for exp in experiments:
    df_pathway_counts[exp] = df_pathway_counts[exp].apply(
        lambda x: ', '.join(x) if isinstance(x, list) else '')


### EXCEL EXPORT
filename = "../Tables/KEGG_pathways.xlsx"

# The context manager writes and closes the file on exit; ExcelWriter.save()
# no longer exists in pandas >= 2.0.
with pd.ExcelWriter(filename, engine='xlsxwriter') as writer:
    workbook = writer.book
    df_pathway_counts.to_excel(writer, sheet_name='All')

    # formatting
    format_null = workbook.add_format({'text_wrap': True,'align':'left','font_size':10})
    worksheet = writer.sheets['All']
    worksheet.set_column('A:M',40,format_null)

print(len(df_pathway_counts))
df_pathway_counts

## Filter out interesting rows for the paper

In [None]:
# Pathways reported in the paper, in the order they should appear in the table.
keep = [# Shared high-enrichment pathways
    'Cell cycle - yeast','Meiosis - yeast',
    #signalling
    'MAPK signaling pathway - yeast','Mitophagy - yeast',
    # metabolism
    'Glycolysis / Gluconeogenesis','Citrate cycle (TCA cycle)', 'Oxidative phosphorylation',
    'Biosynthesis of amino acids','Ribosome biogenesis in eukaryotes','Ribosome','Proteasome',
    # RNA
    'RNA degradation','RNA transport'
    ]

# Label-based selection keeps the order of `keep`.
# NOTE(review): recent pandas raises KeyError here if a listed pathway is absent
# from df_pathway_counts.
df_pathway_counts_reduced = df_pathway_counts.loc[keep]

### EXCEL EXPORT
filename = "../Tables/KEGG_pathways_reduced.xlsx"

# The context manager writes and closes the file on exit; ExcelWriter.save()
# no longer exists in pandas >= 2.0.
with pd.ExcelWriter(filename, engine='xlsxwriter') as writer:
    workbook = writer.book
    df_pathway_counts_reduced.to_excel(writer, sheet_name='All')

    # formatting
    format_null = workbook.add_format({'text_wrap': True,'align':'left','font_size':10})
    worksheet = writer.sheets['All']
    worksheet.set_column('A:M',40,format_null)

print(len(df_pathway_counts_reduced))
df_pathway_counts_reduced

### List enzymes and novel targets among the filtered KEGG pathways

In [None]:
# Flatten the reduced pathway table (one comma-separated gene string per cell)
# into a sorted list of unique gene names.
unique_genes = {gene
                for table_row in df_pathway_counts_reduced.values.tolist()
                for cell in table_row
                for gene in cell.split(', ')}
unique_genes.discard('') # empty cells give ''; discard() does not fail if there are none
l = sorted(unique_genes) # sorted so the printed lists are reproducible

# sub dataframe with all the listed targets
# make sure to keep systematic name as a column
# set index as standard name
# NOTE(review): the pathway tables fall back to the systematic name when the
# standard name is '', and such a gene would not be found by this lookup.
# Confirm that none occur in the reduced table.
subdf = df.copy()
subdf['Systematic name'] = df.index.tolist()
subdf = subdf.set_index('Standard name').loc[l]

# enzymes
print('Enzymes:')
print(subdf[subdf['is enzyme']].index.tolist())
print()

# novel
print('Novel Fkh1:')
print(subdf.index[subdf['Systematic name'].isin(df_novel_Fkh1.index)].tolist())
print('Novel Fkh2:')
print(subdf.index[subdf['Systematic name'].isin(df_novel_Fkh2.index)].tolist())

## All pathways for CCR targets

In [None]:
# Collect, for every KEGG pathway, the target genes with an expression peak
# phase found in each experiment.
pathway_counts = {} # pathway : {experiment: [gene1, gene2, ...]}
for exp, df_exp in zip(experiments, df_targets_list):

    # keep only the genes annotated with at least one KEGG pathway ...
    df_kegg = df_exp[df_exp['KEGG pathway'].notnull()]
    # ... that also have an expression peak phase
    df_kegg_ccr = df_kegg[~df_kegg['Expression peak phase'].isnull()]

    for _, row in df_kegg_ccr.iterrows(): # loop over targets that are in KEGG pathways

        # prefer the standard name, fall back to the systematic name
        if row['Standard name'] != '':
            gene = row['Standard name']
        else:
            gene = row['Systematic name']

        # str.split never returns an empty list (an empty annotation gives ['']),
        # so empty names are dropped here to make the "no pathway" report reachable
        list_of_pathways = [p for p in row['KEGG pathway'].split(', ') if p != '']

        if len(list_of_pathways) == 0:
            print(gene)

        for pathway in list_of_pathways:
            if pathway not in pathway_counts:
                # init: one (initially empty) gene list per experiment
                pathway_counts[pathway] = {e: [] for e in experiments}

            # add current gene to correct experiment
            pathway_counts[pathway][exp].append(gene)


df_pathway_counts = pd.DataFrame.from_dict(pathway_counts, orient="index").sort_index()

# turn lists of genes into strings
for exp in experiments:
    df_pathway_counts[exp] = df_pathway_counts[exp].apply(
        lambda x: ', '.join(x) if isinstance(x, list) else '')


### EXCEL EXPORT
filename = "../Tables/KEGG_pathways_CCR.xlsx"

# The context manager writes and closes the file on exit; ExcelWriter.save()
# no longer exists in pandas >= 2.0.
with pd.ExcelWriter(filename, engine='xlsxwriter') as writer:
    workbook = writer.book
    df_pathway_counts.to_excel(writer, sheet_name='All')

    # formatting
    format_null = workbook.add_format({'text_wrap': True,'align':'left','font_size':10})
    worksheet = writer.sheets['All']
    worksheet.set_column('A:M',40,format_null)

print(len(df_pathway_counts))
df_pathway_counts

## All pathways 4x verified targets

In [None]:
# Collect, for every KEGG pathway, the genes from the two *_targets_4x_ChIP frames.
# Only the two log-phase experiments are filled; the other experiment columns
# stay empty.
pathway_counts = {} # pathway : {experiment: [gene1, gene2, ...]}
for exp, df_exp in [(experiments[0], df_Fkh1log_targets_4x_ChIP),
                    (experiments[2], df_Fkh2log_targets_4x_ChIP)]:

    # keep only the genes annotated with at least one KEGG pathway
    df_kegg = df_exp[df_exp['KEGG pathway'].notnull()]

    for _, row in df_kegg.iterrows(): # loop over targets that are in KEGG pathways

        # prefer the standard name, fall back to the systematic name
        if row['Standard name'] != '':
            gene = row['Standard name']
        else:
            gene = row['Systematic name']

        # str.split never returns an empty list (an empty annotation gives ['']),
        # so empty names are dropped here to make the "no pathway" report reachable
        list_of_pathways = [p for p in row['KEGG pathway'].split(', ') if p != '']

        if len(list_of_pathways) == 0:
            print(gene)

        for pathway in list_of_pathways:
            if pathway not in pathway_counts:
                # init: one (initially empty) gene list per experiment
                pathway_counts[pathway] = {e: [] for e in experiments}

            # add current gene to correct experiment
            pathway_counts[pathway][exp].append(gene)


df_pathway_counts = pd.DataFrame.from_dict(pathway_counts, orient="index").sort_index()

# turn lists of genes into strings
for exp in experiments:
    df_pathway_counts[exp] = df_pathway_counts[exp].apply(
        lambda x: ', '.join(x) if isinstance(x, list) else '')


### EXCEL EXPORT
filename = "../Tables/KEGG_pathways_4x.xlsx"

# The context manager writes and closes the file on exit; ExcelWriter.save()
# no longer exists in pandas >= 2.0.
with pd.ExcelWriter(filename, engine='xlsxwriter') as writer:
    workbook = writer.book
    df_pathway_counts.to_excel(writer, sheet_name='All')

    # formatting
    format_null = workbook.add_format({'text_wrap': True,'align':'left','font_size':10})
    worksheet = writer.sheets['All']
    worksheet.set_column('A:M',40,format_null)

print(len(df_pathway_counts))
df_pathway_counts

# SI table: enzymes, expression phases, KEGG pathways and SGD description

In [None]:
# Build one table per experiment with the enzyme targets, sorted by expression
# peak time, and export each table to its own sheet.
enzyme_table_columns = ['Standard name','Expression peak time','Expression peak phase','Description',
                        'Name description','KEGG pathway','Primary GO term','Secondary GO term']

df_per_exp_enzymes = {}

for exp, df_exp in zip(experiments,
                       [df_Fkh1log_targets, df_Fkh1stat_targets, df_Fkh2log_targets, df_Fkh2stat_targets]):

    # keep the enzyme rows and the columns of interest, ordered by peak time
    df_per_exp_enzymes[exp] = (
        df_exp.loc[df_exp['is enzyme'], enzyme_table_columns]
        .sort_values(by='Expression peak time')
    )


filename = "../Tables/subset_targets_enzymes.xlsx"

# The context manager writes and closes the file on exit; ExcelWriter.save()
# no longer exists in pandas >= 2.0.
with pd.ExcelWriter(filename, engine='xlsxwriter') as writer:
    workbook = writer.book

    for exp in experiments:
        df_per_exp_enzymes[exp].to_excel(writer, sheet_name=exp)

    # formatting
    format_null = workbook.add_format({'text_wrap': True,'align':'left','font_size':10})

    for exp in experiments:
        worksheet = writer.sheets[exp]
        worksheet.set_column('A:D',20,format_null)
        worksheet.set_column('E:E',100,format_null) # wide column for 'Description'
        worksheet.set_column('F:G',40,format_null)

# Target overlap between Fkh1 and Fkh2

In [None]:
# Indices of the targets of each experiment
idx_fkh1log, idx_fkh1stat = df_Fkh1log_targets.index, df_Fkh1stat_targets.index
idx_fkh2log, idx_fkh2stat = df_Fkh2log_targets.index, df_Fkh2stat_targets.index

# Indices of the df_verified_* frames; these feed the "Nx ChIP verified" columns below
idx_fkh1log_verified, idx_fkh2log_verified = df_verified_targets_Fkh1.index, df_verified_targets_Fkh2.index
idx_fkh1log_3_verified, idx_fkh2log_3_verified = df_verified_3_targets_Fkh1.index, df_verified_3_targets_Fkh2.index
idx_fkh1log_4_verified, idx_fkh2log_4_verified = df_verified_4_targets_Fkh1.index, df_verified_4_targets_Fkh2.index

# Targets shared by Fkh1 and Fkh2
idx_intersection_log = idx_fkh1log.intersection(idx_fkh2log)
idx_intersection_stat = idx_fkh1stat.intersection(idx_fkh2stat)
idx_intersection_log_verified = idx_fkh1log_verified.intersection(idx_fkh2log_verified)
idx_intersection_log_3_verified = idx_fkh1log_3_verified.intersection(idx_fkh2log_3_verified)
idx_intersection_log_4_verified = idx_fkh1log_4_verified.intersection(idx_fkh2log_4_verified)

# Targets found for Fkh1 only
idx_unique_fkh1_log = idx_fkh1log.difference(idx_fkh2log)
idx_unique_fkh1_stat = idx_fkh1stat.difference(idx_fkh2stat)
idx_unique_fkh1_log_verified = idx_fkh1log_verified.difference(idx_fkh2log_verified)
idx_unique_fkh1_log_3_verified = idx_fkh1log_3_verified.difference(idx_fkh2log_3_verified)
idx_unique_fkh1_log_4_verified = idx_fkh1log_4_verified.difference(idx_fkh2log_4_verified)

# Targets found for Fkh2 only
idx_unique_fkh2_log = idx_fkh2log.difference(idx_fkh1log)
idx_unique_fkh2_stat = idx_fkh2stat.difference(idx_fkh1stat)
idx_unique_fkh2_log_verified = idx_fkh2log_verified.difference(idx_fkh1log_verified)
idx_unique_fkh2_log_3_verified = idx_fkh2log_3_verified.difference(idx_fkh1log_3_verified)
idx_unique_fkh2_log_4_verified = idx_fkh2log_4_verified.difference(idx_fkh1log_4_verified)

# One group per table row; the entries follow the column order of df_overlap
common_targets = [idx_intersection_log, idx_intersection_stat, idx_intersection_log_verified,
                  idx_intersection_log_3_verified, idx_intersection_log_4_verified]
only_fkh1 = [idx_unique_fkh1_log, idx_unique_fkh1_stat, idx_unique_fkh1_log_verified,
             idx_unique_fkh1_log_3_verified, idx_unique_fkh1_log_4_verified]
only_fkh2 = [idx_unique_fkh2_log, idx_unique_fkh2_stat, idx_unique_fkh2_log_verified,
             idx_unique_fkh2_log_3_verified, idx_unique_fkh2_log_4_verified]

# Number of genes in every set
data = [[len(idx) for idx in group] for group in (common_targets, only_fkh1, only_fkh2)]

row_labels = ["Common targets","Unique targets Fkh1","Unique targets Fkh2"]
column_labels = ["Logarithmic", "Stationary",
                 "2x ChIP verified","3x ChIP verified","4x ChIP verified"]
df_overlap = pd.DataFrame(data, index=row_labels, columns=column_labels)
display(df_overlap)

## Overlap in ChIP-chip

In [None]:
# Count common and unique Fkh1/Fkh2 targets within each literature study.
# (table column label, prefix of the "<study> Fkh1" / "<study> Fkh2" columns of df)
studies = [('MacIsaac', 'MacIsaac 2006'),
           ('Venters', 'Venters 2011'),
           ('Ostrow', 'Ostrow 2014')]

lit_overlap = pd.DataFrame()

for label, prefix in studies:
    in_fkh1 = df[prefix + ' Fkh1']
    in_fkh2 = df[prefix + ' Fkh2']

    lit_overlap.at['Common targets', label] = len(df[in_fkh1 & in_fkh2])
    lit_overlap.at['Unique targets Fkh1', label] = len(df[in_fkh1 & ~in_fkh2])
    lit_overlap.at['Unique targets Fkh2', label] = len(df[in_fkh2 & ~in_fkh1])

# the counts are cast to integers for display
lit_overlap = lit_overlap.astype(int)
lit_overlap

## How low is the signal for unique targets in the other Fkh?

In [None]:
# For the targets unique to one Fkh, count how many still pass a score cutoff
# in the *other* Fkh under the same condition.
# (row label, experiment whose scores are inspected, unique targets of the row's experiment)
comparisons = [
    ('Fkh1 log',  'Fkh2 log',  idx_unique_fkh1_log),
    ('Fkh2 log',  'Fkh1 log',  idx_unique_fkh2_log),
    ('Fkh1 stat', 'Fkh2 stat', idx_unique_fkh1_stat),
    ('Fkh2 stat', 'Fkh1 stat', idx_unique_fkh2_stat),
]

# The third column holds the MACE count; it was previously labelled 'MaxPeak < 0.005'.
result_columns = ['MaxPeak > 1', 'GEM > 1', 'MACE < 0.005']

result_rows = []
for exp, other_exp, idx_unique in comparisons:
    subdf = df.loc[idx_unique]

    # comparisons against NaN are False, so missing scores are never counted
    passes_maxpeak = subdf['maxPeak_AD_12 ' + other_exp] > 1
    passes_gem = subdf['GEM ' + other_exp] > 1
    passes_mace = subdf['MACE ' + other_exp] < 0.005

    result_rows.append([int(passes_maxpeak.sum()), int(passes_gem.sum()), int(passes_mace.sum())])

    # number of unique targets passing at least one cutoff in the other Fkh
    print(int((passes_maxpeak | passes_gem | passes_mace).sum()))

# build the table once instead of growing it cell by cell
df_result = pd.DataFrame(result_rows, index=[exp for exp, _, _ in comparisons], columns=result_columns)
df_result

In [None]:
# Distribution of the maxPeak score in the *other* Fkh for the targets that are
# unique to one Fkh. One box per experiment, in the order of box_labels.
data = [
    df.loc[idx_unique_fkh1_log, 'maxPeak_AD_12 Fkh2 log'],
    df.loc[idx_unique_fkh2_log, 'maxPeak_AD_12 Fkh1 log'],
    df.loc[idx_unique_fkh1_stat, 'maxPeak_AD_12 Fkh2 stat'],
    df.loc[idx_unique_fkh2_stat, 'maxPeak_AD_12 Fkh1 stat'],
]
box_labels = ['Fkh1 log uniques\n(Fkh2 log score)',
              'Fkh2 log uniques\n(Fkh1 log score)',
              'Fkh1 stat uniques\n(Fkh2 stat score)',
              'Fkh2 stat uniques\n(Fkh1 stat score)']

fig, ax = plt.subplots()
res = ax.boxplot(data)

# Labels are set after plotting so this works regardless of how the installed
# matplotlib names the boxplot label argument.
ax.set_xticklabels(box_labels, rotation=45, ha='right')
ax.set_ylabel('maxPeak_AD_12 score')
ax.set_title('Signal of unique targets in the other Fkh')
plt.show()

# Missing targets from the literature?

In [None]:
def _literature_targets(tf):
    """Rows of df flagged as `tf` targets in at least one of the three literature studies."""
    return df[(df['MacIsaac 2006 ' + tf]) | (df['Venters 2011 ' + tf]) | (df['Ostrow 2014 ' + tf])]


def _absent_from(lit_df, targets_df):
    """Rows of lit_df whose index label does not occur in the index of targets_df."""
    return lit_df.loc[[gene for gene in lit_df.index if gene not in targets_df.index]]


def _log_scores(frame, tf):
    """The maxPeak, GEM and MACE score columns of the log experiment of `tf`."""
    return (frame['maxPeak_AD_12 ' + tf + ' log'],
            frame['GEM ' + tf + ' log'],
            frame['MACE ' + tf + ' log'])


def _significant_in_any_method(frame, tf):
    """Rows passing the threshold t of at least one method."""
    max_peak, gem, mace = _log_scores(frame, tf)
    return frame[(max_peak >= t[0]) | (gem >= t[1]) | (mace <= t[2])]


def _any_signal(frame, tf):
    """Rows with a non-zero maxPeak or GEM score, or a MACE score of at most 0.01."""
    max_peak, gem, mace = _log_scores(frame, tf)
    return frame[(max_peak > 0) | (gem > 0) | (mace <= 0.01)]


def _no_signal(frame, tf):
    """Rows with a maxPeak score of 0 and no GEM or MACE score at all."""
    max_peak, gem, mace = _log_scores(frame, tf)
    return frame[(max_peak == 0) & (gem.isnull()) & (mace.isnull())]


# targets reported in the literature
lit_targets_fkh1 = _literature_targets('Fkh1')
lit_targets_fkh2 = _literature_targets('Fkh2')

print('Literature targets:',len(lit_targets_fkh1), len(lit_targets_fkh2))

# literature targets absent from the log-phase target frames (entries 0 and 2 of df_targets_list)
lit_targets_fkh1_missing = _absent_from(lit_targets_fkh1, df_targets_list[0])
lit_targets_fkh2_missing = _absent_from(lit_targets_fkh2, df_targets_list[2])

print('Missing literature targets:',len(lit_targets_fkh1_missing), len(lit_targets_fkh2_missing))

# missing targets that pass the threshold of at least one method
lit_fkh1_missing_1method = _significant_in_any_method(lit_targets_fkh1_missing, 'Fkh1')
lit_fkh2_missing_1method = _significant_in_any_method(lit_targets_fkh2_missing, 'Fkh2')

print('Significant in one method:',len(lit_fkh1_missing_1method), len(lit_fkh2_missing_1method))

# missing targets that show at least a weak signal in one method
lit_fkh1_missing_1method_low = _any_signal(lit_targets_fkh1_missing, 'Fkh1')
lit_fkh2_missing_1method_low = _any_signal(lit_targets_fkh2_missing, 'Fkh2')

print('Shows signal:',len(lit_fkh1_missing_1method_low), len(lit_fkh2_missing_1method_low))

# missing targets without any signal
lit_fkh1_missing_insig = _no_signal(lit_targets_fkh1_missing, 'Fkh1')
lit_fkh2_missing_insig = _no_signal(lit_targets_fkh2_missing, 'Fkh2')

print('No signal:',len(lit_fkh1_missing_insig), len(lit_fkh2_missing_insig))



# Clb2 cluster genes as targets

In [None]:
# Zhu et al. still oscillating in double deletion
clb2_cluster_oscillating = [
    'APC1','BUD8','CDC20','CHS2','NUM1','TEM1',
    'YCL063W','YIL051W','YKL130C','YLR057W','YLR084C','YPR156C',
    'YML119W','YMR032W',
    ]

# No more oscillations in the double deletion
clb2_cluster_affected = [
    'ACE2', 'ALK1', 'BUD3', 'BUD4', 'CDC5', 'CLB1', 'CLB2', 'HST3', 'KIP2', 'IQG1', 'MOB1', 'MYO1', 'SWI5',
    'YIL158W', 'YLR190W', 'YML034W', 'YNL058C', 'YPL141C'
    ]

# Full cluster = both groups
# deleted ORFs
# YCL013W
# changed annotations
# used to be: YCL012W: 'YCL014W' = BUD3, YCL062W: YCL063W, 'YML033W':YML034W
clb2_cluster = clb2_cluster_oscillating + clb2_cluster_affected

print('Num. genes in Clb2 cluster:',len(clb2_cluster))
df_clb2cluster = df[df['Standard name'].isin(clb2_cluster)]


# Cluster genes present in target frames 0 and 2 of df_targets_list
recovered_targets = list({
    name
    for targets in (df_targets_list[0], df_targets_list[2])
    for name in targets.loc[targets['Standard name'].isin(clb2_cluster), 'Standard name'].tolist()
})

print('2x recovered clb2 cluster genes:',len(recovered_targets))
print(sorted(recovered_targets))
recovered_targets_affected = [gene for gene in recovered_targets if gene in clb2_cluster_affected]
print('2x recovered clb2 cluster affected genes:',len(recovered_targets_affected))
print(sorted(recovered_targets_affected))


def _clb2_genes_in_study(study):
    """Unique Clb2 cluster genes flagged for Fkh1 or Fkh2 in the given literature study."""
    fkh1_hits = df_clb2cluster.loc[df_clb2cluster[study + ' Fkh1'], 'Standard name'].tolist()
    fkh2_hits = df_clb2cluster.loc[df_clb2cluster[study + ' Fkh2'], 'Standard name'].tolist()
    return list(set(fkh1_hits + fkh2_hits))


recovered_targets_ostrow = _clb2_genes_in_study('Ostrow 2014')
print('Ostrow Clb2 cluster genes:', len(recovered_targets_ostrow))
recovered_targets_affected_ostrow = [gene for gene in recovered_targets_ostrow if gene in clb2_cluster_affected]
print('Ostrow Clb2 cluster affected genes:', len(recovered_targets_affected_ostrow))

recovered_targets_venters = _clb2_genes_in_study('Venters 2011')
print('Venters Clb2 cluster genes:', len(recovered_targets_venters))
recovered_targets_affected_venters = [gene for gene in recovered_targets_venters if gene in clb2_cluster_affected]
print('Venters Clb2 cluster affected genes:', len(recovered_targets_affected_venters))

recovered_targets_macIsaac = _clb2_genes_in_study('MacIsaac 2006')
print('MacIsaac Clb2 cluster genes:', len(recovered_targets_macIsaac))
recovered_targets_affected_macIsaac = [gene for gene in recovered_targets_macIsaac if gene in clb2_cluster_affected]
print('MacIsaac Clb2 cluster affected genes:', len(recovered_targets_affected_macIsaac))

In [None]:
# Show the complete rows of all Clb2 cluster genes
is_clb2_cluster_gene = df['Standard name'].isin(clb2_cluster)
df[is_clb2_cluster_gene]

# Data to generate CC regulation image
Which methods in which phases showed which genes of the core regulatory network?

In [None]:
experiments = ['Fkh1 log','Fkh1 stat','Fkh2 log','Fkh2 stat']

CC_genes = ['SWI4','SWI6','MBP1','SWI5','ACE2','FKH2','NDD1','MCM1', # TF's
            'CLN3','CLN2','CLN1','CLB6','CLB5','CLB4','CLB3','CLB2','CLB1', # cyclins
            'SIC1'] # other

# per gene: which source reported it, and for which experiment / Fkh
CC_genes_d = {g:{'MacIsaac':[],'Venters':[],'Ostrow':[],'This study':[]} for g in CC_genes}

# this study: record every experiment in which the gene is a target
for i,exp in enumerate(experiments):
    target_names = df_targets_list[i]['Standard name'].values.tolist()
    for g in CC_genes:
        if g in target_names:
            CC_genes_d[g]['This study'].append(exp)

# literature: standard names flagged per study and Fkh
macIsaac_Fkh1 = df[df['MacIsaac 2006 Fkh1']]['Standard name'].values.tolist()
venters_Fkh1 = df[df['Venters 2011 Fkh1']]['Standard name'].values.tolist()
ostrow_Fkh1 = df[df['Ostrow 2014 Fkh1']]['Standard name'].values.tolist()
macIsaac_Fkh2 = df[df['MacIsaac 2006 Fkh2']]['Standard name'].values.tolist()
venters_Fkh2 = df[df['Venters 2011 Fkh2']]['Standard name'].values.tolist()
ostrow_Fkh2 = df[df['Ostrow 2014 Fkh2']]['Standard name'].values.tolist()

# (result key, Fkh label, names reported); Fkh1 entries come first so that
# 'Fkh1' is appended before 'Fkh2'
literature_hits = [
    ('MacIsaac', 'Fkh1', macIsaac_Fkh1),
    ('Venters', 'Fkh1', venters_Fkh1),
    ('Ostrow', 'Fkh1', ostrow_Fkh1),
    ('MacIsaac', 'Fkh2', macIsaac_Fkh2),
    ('Venters', 'Fkh2', venters_Fkh2),
    ('Ostrow', 'Fkh2', ostrow_Fkh2),
]

for g in CC_genes:
    for study, fkh, reported_names in literature_hits:
        if g in reported_names:
            CC_genes_d[g][study].append(fkh)

# print results, leaving out the sources without a hit
for g, hits_per_source in CC_genes_d.items():
    print(g,'\t\t',{source: hits for source, hits in hits_per_source.items() if hits != []})

# Data to generate the central carbon metabolism image
Here we take a slightly different approach: gene names are matched by substring, because many of these enzymes have several isoenzymes.

In [None]:
# Name fragments of the central carbon metabolism genes; every standard name
# containing one of these fragments is included.
CCm_genes_draft = ['HXT','HXK','GLK','PGI','PFK','FBP','FBA','GPD','GPP','GUT','TPI','TDH','ZWF','SOL','GND','RPE','RKI','TKL','TAL',
                  'PGK','GPM','ENO','PCK','PYK','CDC19','PYC','PDC','ADH','ALD','LAT','PDX','PDB','PDA','ACS','IDH','IDP',
                   'KGD','LSC','SDH','FUM','MDH','CIT','ACO']
CCm_genes = []

l = df[~df['Standard name'].isnull()]['Standard name'].values.tolist()

# Each matching name is kept once. Without the `seen` check, a name that matches
# more than once (several fragments, or a repeated standard name) would be
# listed repeatedly and its results below would be appended repeatedly.
seen = set()
for g in CCm_genes_draft:
    for g2 in l:
        if g in g2 and g2 not in seen: #substring
            seen.add(g2)
            CCm_genes.append(g2)

# per gene: which source reported it, and for which experiment / Fkh
CCm_genes_d = {g:{'MacIsaac':[],'Venters':[],'Ostrow':[],'This study':[]} for g in CCm_genes}


# gather the results
experiments = ['Fkh1 log','Fkh1 stat','Fkh2 log','Fkh2 stat']
for i,exp in enumerate(experiments):
    # built once per experiment (not once per gene) and as a set for fast lookup
    target_names = set(df_targets_list[i]['Standard name'].values.tolist())
    for g in CCm_genes:
        if g in target_names:
            CCm_genes_d[g]['This study'].append(exp)

# check the literature if each gene was shown before
macIsaac_Fkh1 = df[df['MacIsaac 2006 Fkh1']]['Standard name'].values.tolist()
venters_Fkh1 = df[df['Venters 2011 Fkh1']]['Standard name'].values.tolist()
ostrow_Fkh1 = df[df['Ostrow 2014 Fkh1']]['Standard name'].values.tolist()
macIsaac_Fkh2 = df[df['MacIsaac 2006 Fkh2']]['Standard name'].values.tolist()
venters_Fkh2 = df[df['Venters 2011 Fkh2']]['Standard name'].values.tolist()
ostrow_Fkh2 = df[df['Ostrow 2014 Fkh2']]['Standard name'].values.tolist()

# (result key, Fkh label, names reported); Fkh1 entries come first so that
# 'Fkh1' is appended before 'Fkh2'
literature_hits = [
    ('MacIsaac', 'Fkh1', set(macIsaac_Fkh1)),
    ('Venters', 'Fkh1', set(venters_Fkh1)),
    ('Ostrow', 'Fkh1', set(ostrow_Fkh1)),
    ('MacIsaac', 'Fkh2', set(macIsaac_Fkh2)),
    ('Venters', 'Fkh2', set(venters_Fkh2)),
    ('Ostrow', 'Fkh2', set(ostrow_Fkh2)),
]

for g in CCm_genes:
    for study, fkh, reported_names in literature_hits:
        if g in reported_names:
            CCm_genes_d[g][study].append(fkh)

# print results, leaving out the sources without a hit
for g in CCm_genes_d:
    print(g,'\t\t',{k:CCm_genes_d[g][k] for k in CCm_genes_d[g] if CCm_genes_d[g][k] != []})

# Noteworthy targets identified by only one method
i.e. genes that score highly in one PDM but are ultimately not targets, sorted by score (showing the top 5 per method and experiment)

In [None]:
# For every experiment and scoring method, show the best-scoring genes that did
# not end up in the target list of that experiment.
methods = ['maxPeak_AD_12', 'GEM', 'MACE']
colnames = [method + ' ' + exp for method in methods for exp in experiments]

n_highlight = 5 # number of genes shown per method and experiment

for i, exp in enumerate(experiments):
    for method in methods:
        col = method + ' ' + exp # target column name
        print('######## '+col+' ########')

        # MACE is the only method sorted in ascending order
        asc = (method == 'MACE')

        # Sort into a local copy: reassigning the global `df` here would leave
        # it re-ordered for every later cell.
        df_sorted = df.sort_values(col, ascending=asc)

        # best-ranked genes that are ultimately not targets
        df_non_targets = df_sorted[~df_sorted.index.isin(df_targets_list[i].index)]
        genes_to_highlight = df_non_targets.index[:n_highlight].tolist()

        display(df_non_targets.iloc[:n_highlight][['Standard name']+colnames])
        print()

# Rank genes per method
List the top 25 genes according to GEM ranking first and catalogue their ranking in MaxPeak and MACE.

In [None]:
# Top 25 genes by GEM score (Fkh1 log), best first
df_sorted_gem = df.sort_values('GEM Fkh1 log',ascending=False).iloc[:25]

# Sort once per method instead of once per gene. A gene's rank is its 0-based
# position in the sorted index (0 = best).
order_maxpeak = df.sort_values('maxPeak_AD_12 Fkh1 log',ascending=False).index
order_mace = df.sort_values('MACE Fkh1 log',ascending=True).index # MACE sorted ascending

df_ranks = pd.DataFrame(index=df_sorted_gem.index)
df_ranks['Standard name'] = df_sorted_gem['Standard name']
df_ranks['MaxPeak'] = [order_maxpeak.get_loc(g) for g in df_ranks.index]
df_ranks['MACE'] = [order_mace.get_loc(g) for g in df_ranks.index]

df_ranks