In [None]:
import h5py, matplotlib
import os, sklearn, umap, rpy2
import arboresto, kneed
import matplotlib_venn
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
import pandas as pd
import networkx as nx

from rpy2 import robjects
import rpy2.robjects.numpy2ri
robjects.numpy2ri.activate()
from rpy2.robjects.packages import importr
base = importr("base")
dollar = base.__dict__["$"]


# custom class for sc datasets
# * additional dependencies:
#   *

from dataset import dataset

In [None]:
#-------------------------------------------------#
###
# label cells by cluster from Lush et al. and genes
# by mouse homologues for later dataset integration
###

# Zebrafish Ensembl id -> mouse gene name. The index can list the same
# Ensembl id more than once; the first listed mouse name is the one used.
fishEns_to_mouseName = pd.read_csv('metaAnalysis/refs/fishEns_to_mouseName.csv',
                                   index_col=0)

mouse_names = fishEns_to_mouseName['Associated Gene Name']
mouse_names = mouse_names[~mouse_names.index.duplicated(keep='first')]

# Rebuild the expression table from `data_copy` (defined in an earlier cell)
# so the renaming below does not touch the source frame's labels.
labeled_data = pd.DataFrame(data_copy.values,
                            index=data_copy.index.values,
                            columns=data_copy.columns.values)

# Keep only genes with a mouse homologue. `isin` replaces a per-column scan
# of the whole reference index.
has_homologue = labeled_data.columns.isin(mouse_names.index)
labeled_data = labeled_data.loc[:, has_homologue].copy()

# One vectorised lookup on the de-duplicated mapping. Unlike the previous
# `.values[0]` fallback, a missing (NaN) name no longer raises AttributeError.
labeled_data.columns = mouse_names.loc[labeled_data.columns].values

# Attach each cell's cluster ('node') from `p4i_data` (defined in an earlier
# cell); cells absent from `p4i_data` get NaN.
node_by_cell = dict(zip(p4i_data.index, p4i_data['node']))
labeled_data['node'] = labeled_data.index.to_series().map(node_by_cell)
#--------------------------------------------------------#



In [None]:
###################################################################################################################
#######################################  PART 3: META-ANALYSIS   ##################################################
###################################################################################################################

#-------------------------------------------------------#
########## preprocess mouse data for alignment ##########
#-------------------------------------------------------#

# The four CSVs below are the cached outputs of this cell. `lush`, `burns`
# and `hoa` are written (and therefore loaded) as genes x cells tables
# restricted to the genes shared by all three datasets.
try:
    metadata = pd.read_csv('metaAnalysis/datasets/metadata.csv', index_col=0)
    lush = pd.read_csv('metaAnalysis/datasets/lush.csv', index_col=0)
    burns = pd.read_csv('metaAnalysis/datasets/burns.csv', index_col=0)
    hoa = pd.read_csv('metaAnalysis/datasets/hoa.csv', index_col=0)

# Only a missing cache file triggers the rebuild. A bare `except:` would also
# swallow KeyboardInterrupt and genuine parsing errors.
except FileNotFoundError:
    print('loading datasets...')
    os.makedirs('metaAnalysis/datasets', exist_ok=True)

    #---------------------------------------#
    ### load Burns et al. data (GSE71982) ###
    burns_t = pd.read_csv('GSE71982/GSE71982_RSEM_Counts_Matrix.csv',
                          index_col=0)

    burns_phenoData_utric = pd.read_csv(
        'GSE71982/GSE71982_P1_Utricle_PhenoData.csv', index_col=0)

    burns_phenoData_cochl1 = pd.read_csv(
        'GSE71982/GSE71982_P1_Coch__nonFACs__PhenoData.csv', index_col=0)

    burns_phenoData_cochl2 = pd.read_csv(
        'GSE71982/GSE71982_P1_Coch__FACs__PhenoData.csv', index_col=0)

    # cells x genes, without the two reporter columns
    burns = burns_t.T.drop(columns=['EGFP', 'tdTom'])
    #-----------------------------#



    #--------------------------------------#
    ### load Hoa et al. data (GSE135703) ###
    hoa_t = pd.read_csv('GSE135703/ExpressionMatrix.csv', index_col=0)

    geneVersion_to_mouseName = pd.read_csv(
        'metaAnalysis/refs/geneVersion_to_mouseName.csv', index_col=1)

    # Gene id -> gene name; the first name wins when an id is listed twice.
    hoa_gene_names = geneVersion_to_mouseName['Gene name']
    hoa_gene_names = hoa_gene_names[~hoa_gene_names.index.duplicated(keep='first')]

    hoa = hoa_t.T
    hoa = hoa.loc[:, hoa.columns.isin(hoa_gene_names.index)]
    hoa.columns = hoa_gene_names.loc[hoa.columns].values
    #--------------------------------#



    #-----------------------------------#
    ### compile support cell metadata ###
    print('compiling metadata...')
    allCells = np.concatenate((labeled_data.index.values,
                               burns.index.values,
                               hoa.index.values))

    # Lush et al. cluster ('node') -> cell type / subtype. Nodes that are not
    # listed get no 'type', so those cells are dropped further down.
    lush_node_labels = {
        7:  {'type': 'sc', 'subtype': 'central sc'},
        8:  {'type': 'sc', 'subtype': 'central sc'},
        9:  {'type': 'sc', 'subtype': 'central sc'},
        14: {'type': 'sc', 'subtype': 'central sc'},
        4:  {'type': 'sc', 'subtype': 'differentiating sc'},
        3:  {'type': 'sc', 'subtype': 'D/V amplifying sc'},
        10: {'type': 'sc', 'subtype': 'D/V amplifying sc'},
        11: {'type': 'sc', 'subtype': 'D/V amplifying sc'},
        13: {'type': 'sc', 'subtype': 'A/P pole sc'},
        5:  {'type': 'mc'},
        6:  {'type': 'mc'},
    }

    # Burns et al. phenotype tables in lookup order:
    # (organ, cell id -> GroupID, GroupID -> cell type / subtype)
    burns_sources = [
        ('utricle',
         burns_phenoData_utric['GroupID'].to_dict(),
         {'TEC':     {'type': 'tec'},
          'SC (i)':  {'type': 'sc', 'subtype': 'young sc'},
          'SC (ii)': {'type': 'sc', 'subtype': 'mature sc'}}),
        ('corti',
         burns_phenoData_cochl1['GroupID'].to_dict(),
         {'NSC (i)':  {'type': 'nsc'},
          'NSC (ii)': {'type': 'nsc'},
          'SC':       {'type': 'sc'}}),
        ('corti',
         burns_phenoData_cochl2['GroupID'].to_dict(),
         {'MedSC':  {'type': 'sc', 'subtype': 'medial sc'},
          'LatSCa': {'type': 'sc', 'subtype': 'lateral sc'},
          'LatSCb': {'type': 'sc', 'subtype': 'lateral sc'}}),
    ]

    # Hash-based membership instead of scanning an index array for every cell.
    lush_node_of = labeled_data['node'].to_dict()
    burns_cell_set = set(burns.index)
    hoa_cell_set = set(hoa.index)

    # One record per cell; keys left out of a record become NaN in the frame.
    # Cell ids are assumed to be unique across datasets, so the three branches
    # are mutually exclusive and the first matching dataset wins.
    records = []
    for idx in allCells:
        record = {}

        # lush et al
        if idx in lush_node_of:
            record.update({'dataset': 'lush et al',
                           'species': 'D. rerio',
                           'organ': 'neuromast'})
            record.update(lush_node_labels.get(lush_node_of[idx], {}))

        # burns et al
        elif idx in burns_cell_set:
            record.update({'dataset': 'burns et al',
                           'species': 'M. musculus'})
            for organ, group_of, group_labels in burns_sources:
                if idx in group_of:
                    record['organ'] = organ
                    record.update(group_labels.get(group_of[idx], {}))
                    break

        # hoa et al
        elif idx in hoa_cell_set:
            record.update({'dataset': 'hoa et al',
                           'species': 'M. musculus',
                           'organ': 'corti',
                           'type': 'sc'})

        records.append(record)

    metadata = pd.DataFrame(records,
                            index=allCells,
                            columns=['dataset',
                                     'species',
                                     'organ',
                                     'type',
                                     'subtype'])

    # remove cells without an assigned type and save metadata
    metadata = metadata.loc[metadata['type'].notnull()]

    metadata.to_csv('metaAnalysis/datasets/metadata.csv')
    #-------------------------------#



    #------------------------------------------#
    ### common genes for dataset integration ###
    # Sorted so the gene order of the written tables does not depend on set
    # iteration order, which can change from run to run.
    common = set(labeled_data.columns.values).intersection(
        set(burns.columns.values),
        set(hoa.columns.values))
    shared_genes = np.array(sorted(g for g in common if isinstance(g, str)))
    #------------------------------------------------#


    def _shared_gene_table(cells_by_genes):
        """Return genes x cells for annotated cells and shared genes.

        Rows of `cells_by_genes` are kept when their id is in `metadata`;
        when a gene name occurs more than once only its first row is kept.
        """
        annotated = cells_by_genes.index.isin(metadata.index)
        table = cells_by_genes.loc[annotated, shared_genes].T
        return table.loc[~table.index.duplicated(keep='first')]


    #--------------------------#
    ### write zebrafish data ###
    lush = _shared_gene_table(labeled_data)
    lush.to_csv('metaAnalysis/datasets/lush.csv')
    #-----------------------#



    #----------------------#
    ### write mouse data ###
    burns = _shared_gene_table(burns)
    burns.to_csv('metaAnalysis/datasets/burns.csv')

    hoa = _shared_gene_table(hoa)
    hoa.to_csv('metaAnalysis/datasets/hoa.csv')
    #---------------------#

In [None]:
#-----------------------------------------#
########## UMAP before alignment ##########
#-----------------------------------------#


#-----------------------------#
### additional dependencies ###
from matplotlib.legend_handler import HandlerTuple
from sklearn.decomposition import PCA
import umap.umap_ as umap
#-----------------------#

# Reuse the cached embedding when present; only a missing file triggers the
# recomputation (a bare `except:` would also hide real errors).
try:
    umap_before = pd.read_csv('metaAnalysis/embeddings/umap_before.csv',
                              index_col=0)
except FileNotFoundError:

    #----------#
    ### UMAP ###
    n_pcs = 100
    # Fixed seeds so a recomputed embedding is reproducible.
    reducer = umap.UMAP(random_state=0)

    # `lush`, `burns` and `hoa` are genes x cells, while scprep normalises
    # each ROW to a common library size, so every table is transposed to
    # cells x genes first. Normalising the untransposed tables would rescale
    # per gene instead of per cell.
    # The results go into new objects: the raw count tables stay untouched,
    # so later cells see the same `lush`/`burns`/`hoa` whether or not the
    # cache file existed.
    normalized = []
    for counts in (lush, burns, hoa):
        per_cell = scprep.normalize.library_size_normalize(counts.T)
        normalized.append(scprep.transform.log(per_cell))

    # cells x genes matrix of all three datasets stacked in order
    data_before = np.vstack([table.values for table in normalized])

    pca_before = PCA(n_components=n_pcs,
                     random_state=0).fit_transform(data_before)

    umap_before = pd.DataFrame(reducer.fit_transform(pca_before),
                               columns=['UMAP_1', 'UMAP_2'],
                               index=np.concatenate(
                                   [table.index.values for table in normalized]))

    os.makedirs('metaAnalysis/embeddings', exist_ok=True)
    umap_before.to_csv('metaAnalysis/embeddings/umap_before.csv')
    #-------------------------------------#



#-------------------------#
### dataset identifiers ###
lush_cells = metadata.index[metadata['dataset'] == 'lush et al'].values
burns_cells = metadata.index[metadata['dataset'] == 'burns et al'].values
hoa_cells = metadata.index[metadata['dataset'] == 'hoa et al'].values
#-----------------------------------------------------------------------------#



#---------------#
### plot UMAP ###
sz = 2
lw = sz*0.1
plt.figure(figsize=(2.5,1.25))

# One scatter layer per dataset, drawn in the order lush, burns, hoa.
lush_plt, burns_plt, hoa_plt = [
    plt.scatter(umap_before.loc[cells, 'UMAP_1'],
                umap_before.loc[cells, 'UMAP_2'],
                c=color,
                edgecolor='k',
                linewidth=lw,
                s=sz)
    for cells, color in ((lush_cells, 'seagreen'),
                         (burns_cells, 'indianred'),
                         (hoa_cells, 'goldenrod'))]

ax = plt.gca()
ax.axis('off')

# The two mouse datasets share one legend entry (tuple handle).
lgd = ax.legend([lush_plt, (burns_plt, hoa_plt)],
                [('D. rerio neuromast support \n'
                  '& mantle cells (lush et al.)'),
                 ('M. mus. inner ear support \n'
                  'cells (multiple studies)')],
                loc='lower right',
                borderaxespad=0,
                bbox_to_anchor=(1,0),
                fontsize=4,
                frameon=False,
                markerscale=2,
                handler_map={tuple: HandlerTuple(ndivide=None)})

for t in lgd.get_texts():
    t.set_ha('center')
    t.set_position((400,0))


# axes annotation
# The (empty) annotation text is passed positionally: its keyword was renamed
# from `s` to `text`, so `s=''` fails on current Matplotlib releases.
plt.annotate('',
             xy=(0,0.3),
             xytext=(0.15,0),
             xycoords='axes fraction',
             arrowprops=dict(
                 arrowstyle='<->,head_width=0.05,head_length=0.1',
                 connectionstyle='angle,rad=0,angleA=0,angleB=-90',
                 color='k',
                 linewidth=0.5))

plt.text(0.065, -0.05, 'UMAP1',
         va='center',
         ha='center',
         color='k',
         fontsize=4,
         transform=ax.transAxes)

plt.text(-0.025, 0.13, 'UMAP2',
         rotation=90,
         va='center',
         ha='center',
         color='k',
         fontsize=4,
         transform=ax.transAxes)

plt.savefig('metaAnalysis/figures/before.png',
            dpi=dpi)
#-------------------#

In [None]:
#----------------------------------------#
########## UMAP after alignment ##########
#----------------------------------------#


#------------------------------#
### load aligned data from R ###
# aligned.csv is stored features x cells; transpose to cells x features.
data_after = pd.read_csv('metaAnalysis/datasets/aligned.csv',
                         index_col=0).T

# The cell ids in this file use '.' where the ids used for lookups below use
# '-'; restore the '-' form and name the index 'ids'.
data_after.index = pd.Index([x.replace(".","-") for x in
                             data_after.index.values],
                            name='ids')

alignment_fts = pd.read_csv('metaAnalysis/datasets/features.csv',
                            index_col=0)
#---------------------------------------#


# Reuse the cached embedding when present; only a missing file triggers the
# recomputation (a bare `except:` would also hide real errors).
try:
    umap_after = pd.read_csv('metaAnalysis/embeddings/umap_after.csv',
                             index_col=0)

except FileNotFoundError:
    #----------#
    ### UMAP ###
    n_pcs = 100
    # Fixed seeds so a recomputed embedding is reproducible.
    reducer = umap.UMAP(random_state=0)

    # PCA on the anchor features only, then UMAP on the principal components.
    anchor_features = alignment_fts['anchors@anchor.features'].values
    pca_after = PCA(n_components=n_pcs,
                    random_state=0).fit_transform(
        data_after.loc[:, anchor_features])

    umap_after = pd.DataFrame(reducer.fit_transform(pca_after),
                              columns=['UMAP_1','UMAP_2'],
                              index=data_after.index.values)

    os.makedirs('metaAnalysis/embeddings', exist_ok=True)
    umap_after.to_csv('metaAnalysis/embeddings/umap_after.csv')
    #-----------------------------------#



#---------------#
### plot UMAP ###
sz = 4
lw = sz*0.1
plt.figure(figsize=(1.25,1.25))

# One scatter layer per dataset, drawn in the order lush, burns, hoa.
# `lush_cells`, `burns_cells` and `hoa_cells` come from the previous cell.
lush_plt, burns_plt, hoa_plt = [
    plt.scatter(umap_after.loc[cells, 'UMAP_1'],
                umap_after.loc[cells, 'UMAP_2'],
                c=color,
                edgecolor='k',
                linewidth=lw,
                s=sz)
    for cells, color in ((lush_cells, 'seagreen'),
                         (burns_cells, 'indianred'),
                         (hoa_cells, 'goldenrod'))]

ax = plt.gca()
ax.axis('off')


# axes annotation
# The (empty) annotation text is passed positionally: its keyword was renamed
# from `s` to `text`, so `s=''` fails on current Matplotlib releases.
plt.annotate('',
             xy=(0,0.3),
             xytext=(0.3,0),
             xycoords='axes fraction',
             arrowprops=dict(
                 arrowstyle='<->,head_width=0.05,head_length=0.1',
                 connectionstyle='angle,rad=0,angleA=0,angleB=-90',
                 color='k',
                 linewidth=0.5))

plt.text(0.13, -0.05, 'UMAP1',
         va='center',
         ha='center',
         color='k',
         fontsize=4,
         transform=ax.transAxes)

plt.text(-0.05, 0.13, 'UMAP2',
         rotation=90,
         va='center',
         ha='center',
         color='k',
         fontsize=4,
         transform=ax.transAxes)

plt.savefig('metaAnalysis/figures/after.png',
            dpi=dpi)
#-------------------#

In [None]:
#--------------------------------------------------------#
########## find Gaussian mixture for clustering ##########
#--------------------------------------------------------#

#-----------------------------#
### additional dependencies ###
from sklearn import mixture
from kneed import KneeLocator
#---------------------------#

cluster_n = np.arange(1,21)   # candidate numbers of mixture components
sample_m = np.arange(0,100)   # repeated fits per candidate

# Reuse cached scores when present; only a missing file triggers the refit
# (a bare `except:` would also hide real errors).
try:
    bic = pd.read_csv('metaAnalysis/results/bic.csv',
                      index_col=0)

    aic = pd.read_csv('metaAnalysis/results/aic.csv',
                      index_col=0)

except FileNotFoundError:

    #--------------------------------------------------#
    ### calculate bic & aic for different n clusters ###
    # Float frames (rows: repeat, columns: n clusters) so the means taken
    # below are numeric rather than object dtype.
    bic = pd.DataFrame(index=sample_m,
                       columns=cluster_n,
                       dtype=float)

    aic = pd.DataFrame(index=sample_m,
                       columns=cluster_n,
                       dtype=float)

    for m in tqdm(sample_m):

        for n in cluster_n:

            # Seeding with the repeat number keeps the restarts different
            # from one another while making the whole sweep reproducible.
            gm = mixture.GaussianMixture(n_components=n,
                                         covariance_type='full',
                                         random_state=int(m)
                                         ).fit(umap_after)

            bic.loc[m,n] = gm.bic(umap_after)
            aic.loc[m,n] = gm.aic(umap_after)

        # min-max normalize bic & aic within each repeat
        for scores in (bic, aic):
            row = scores.loc[m,:]
            scores.loc[m,:] = (row-row.min())/(row.max()-row.min())


    os.makedirs('metaAnalysis/results', exist_ok=True)
    bic.to_csv('metaAnalysis/results/bic.csv')
    aic.to_csv('metaAnalysis/results/aic.csv')
    #---------------------#



#-------------------------#
### determine the knees ###
# Knee of the mean normalized score curve, one per criterion.
bic_knee = KneeLocator(cluster_n,
                       bic.mean(axis=0),
                       S=1.0,
                       curve='convex',
                       direction='decreasing')

aic_knee = KneeLocator(cluster_n,
                       aic.mean(axis=0),
                       S=1.0,
                       curve='convex',
                       direction='decreasing')

print('Detected knee at: '+
      str(bic_knee.knee)+
      ' and '+str(aic_knee.knee))
#-------------------------------#



#--------------------#
### plot bic & aic ###
plt.figure(figsize=(2.5,1.25))

for scores, color, name in ((bic, 'tab:blue', 'BIC'),
                            (aic, 'tab:red', 'AIC')):
    plt.plot(cluster_n,
             scores.mean(axis=0),
             linewidth=1,
             marker='.',
             markersize=4,
             c=color,
             label=name)

plt.legend(fontsize=5,
           frameon=False)

ax = plt.gca()
b,t = ax.get_ylim()

# NOTE: the number of clusters is fixed by hand here and is NOT taken from
# `bic_knee` / `aic_knee`; the detected values are only printed above.
knee = 3
print('Knee plotted at: '+str(knee))
plt.vlines(knee,b,t,
           color='k',
           linestyle='dotted',
           linewidth=1)

# restore the limits captured before the marker line was drawn
ax.set_ylim(b,t)
plt.xticks(cluster_n[::2])

ax.set_xlabel('Clusters',
              fontsize=5)

ax.set_ylabel('Normalized Score',
              fontsize=5)

ax.tick_params(axis='both',
               which='major',
               labelsize=5)

plt.tight_layout()

plt.savefig('metaAnalysis/figures/crit.png',
            dpi=dpi)
#-------------------#

In [None]:
#-----------------------------------#
########## plot by cluster ##########
#-----------------------------------#


# Reuse cached cluster labels when present; only a missing file triggers the
# refit (a bare `except:` would also hide real errors).
try:
    labels_df = pd.read_csv('metaAnalysis/embeddings/labels.csv',
                            index_col=0)

except FileNotFoundError:

    #----------------------------------------------------#
    ### compute Gaussians and assign cells to clusters ###
    # `knee` (number of components) is set in the previous cell. The seed
    # makes a refit reproducible, including which cluster gets which number.
    gm = mixture.GaussianMixture(n_components=knee,
                                 covariance_type='full',
                                 random_state=0
                                 ).fit(umap_after)

    labels_df = pd.DataFrame(gm.predict(umap_after),
                             index=umap_after.index.values,
                             columns=['gmm_cluster'])

    os.makedirs('metaAnalysis/embeddings', exist_ok=True)
    labels_df.to_csv('metaAnalysis/embeddings/labels.csv')
    #------------------------------#



#-------------------------------#
### identifiers by cluster ###
# Cell ids are read from labels_df's own index. Applying row positions of
# `labels_df` to `metadata.index` is only correct when both tables list the
# cells in exactly the same order.
clus0, clus1, clus2 = [
    labels_df.index.values[(labels_df['gmm_cluster'] == k).values]
    for k in (0, 1, 2)]
#--------------------------------------------------------------------#



#-----------------#
### plot figure ###
sz = 4
lw = sz*0.1
plt.figure(figsize=(1.25,1.25))

for cells, name, color in ((clus0, '0', 'slateblue'),
                           (clus1, '1', 'cadetblue'),
                           (clus2, '2', 'sandybrown')):
    plt.scatter(umap_after.loc[cells, 'UMAP_1'],
                umap_after.loc[cells, 'UMAP_2'],
                label=name,
                c=color,
                edgecolor='k',
                linewidth=lw,
                s=sz)

lgd = plt.legend(fontsize=4,
                 frameon=False,
                 markerscale=1.5,
                 loc='lower right',
                 borderaxespad=0,
                 bbox_to_anchor=(1,0))

ax = plt.gca()
ax.axis('off')

# axes annotation
# The (empty) annotation text is passed positionally: its keyword was renamed
# from `s` to `text`, so `s=''` fails on current Matplotlib releases.
plt.annotate('',
             xy=(0,0.3),
             xytext=(0.3,0),
             xycoords='axes fraction',
             arrowprops=dict(
                 arrowstyle='<->,head_width=0.05,head_length=0.1',
                 connectionstyle='angle,rad=0,angleA=0,angleB=-90',
                 color='k',
                 linewidth=0.5))

plt.text(0.13, -0.05, 'UMAP1',
         va='center',
         ha='center',
         color='k',
         fontsize=4,
         transform=ax.transAxes)

plt.text(-0.05, 0.13, 'UMAP2',
         rotation=90,
         va='center',
         ha='center',
         color='k',
         fontsize=4,
         transform=ax.transAxes)

plt.savefig('metaAnalysis/figures/cluster.png',
            dpi=dpi)
#-------------------#

In [None]:
#--------------------------------------#
########## plot by cell cycle ##########
#--------------------------------------#


#----------------------------#
### load cell cycle labels ###
phases = pd.read_csv('metaAnalysis/datasets/phases.csv',
                     index_col=0)

# Rows are relabelled with the ids of `data_after`.
# NOTE(review): this assumes phases.csv lists the cells in the same order as
# `data_after` -- confirm against the script that writes both files.
phases.index = pd.Index(data_after.index.values, name='ids')

# the first column holds the phase call
phases = phases.rename(columns={phases.columns.values[0]: 'phase'})
#-------------------------#



#-------------------------------#
### identifiers by cell cycle ###
# Ids are read from phases' own index. Applying row positions of `phases` to
# `metadata.index` is only correct when both tables share the same row order.
g1, s, g2m = [phases.index.values[(phases['phase'] == name).values]
              for name in ('G1', 'S', 'G2M')]
#-------------------------------------------------------------#



#-----------------#
### plot figure ###
sz = 4
lw = sz*0.1
plt.figure(figsize=(1.25,1.25))

c=['slategray',
   'royalblue',
   'orchid']

# One scatter call with a per-cell colour instead of one call per cell.
# Cells are passed in `metadata` order, so they are drawn in the same order
# as before; anything that is neither G1 nor S gets the third colour.
cell_ids = metadata.index.values
cell_phase = phases.loc[cell_ids, 'phase'].values
cell_color = np.where(cell_phase == 'G1', c[0],
                      np.where(cell_phase == 'S', c[1], c[2]))

plt.scatter(umap_after.loc[cell_ids, 'UMAP_1'],
            umap_after.loc[cell_ids, 'UMAP_2'],
            c=list(cell_color),
            edgecolor='k',
            linewidth=lw,
            s=sz)


# empty scatters that only provide the legend handles
g1_lgd, s_lgd, g2m_lgd = [
    plt.scatter([],[],
                c=color,
                label=name,
                edgecolor='k',
                linewidth=lw,
                s=sz)
    for color, name in zip(c, ('G1', 'S', 'G2M'))]


lgd = plt.legend(handles=[g1_lgd,
                          s_lgd,
                          g2m_lgd],
                 fontsize=4,
                 frameon=False,
                 markerscale=1.5,
                 loc='lower right',
                 borderaxespad=0,
                 bbox_to_anchor=(1,0))

for t in lgd.get_texts():
    t.set_ha('center')
    t.set_position((50,0))

ax = plt.gca()
ax.axis('off')

# axes annotation
# The (empty) annotation text is passed positionally: its keyword was renamed
# from `s` to `text`, so `s=''` fails on current Matplotlib releases.
plt.annotate('',
             xy=(0,0.3),
             xytext=(0.3,0),
             xycoords='axes fraction',
             arrowprops=dict(
                 arrowstyle='<->,head_width=0.05,head_length=0.1',
                 connectionstyle='angle,rad=0,angleA=0,angleB=-90',
                 color='k',
                 linewidth=0.5))

plt.text(0.13, -0.05, 'UMAP1',
         va='center',
         ha='center',
         color='k',
         fontsize=4,
         transform=ax.transAxes)

plt.text(-0.05, 0.13, 'UMAP2',
         rotation=90,
         va='center',
         ha='center',
         color='k',
         fontsize=4,
         transform=ax.transAxes)

plt.savefig('metaAnalysis/figures/cycle.png',
            dpi=dpi)
#-------------------#

In [None]:
# med_lat = data_before.loc[data_before.index.values[np.where((metadata['organ']=='corti')&
#                          (metadata['type']=='sc')&(metadata['dataset']=='hoa et al'))],
#                           ['Plp1','Plp2','Prox1','Cdh4','Sparcl1','Cntn1','Fabp7','Gjb2','Cdh1',
#                            'Anxa5','Fabp7','Matn4','Emid1','Npy','Ppp1r2','Serpine2','Hes5',
#                            'S100a1','Lfng','Egfl6','Nupr1']]

# med_lat_pca = sklearn.decomposition.PCA(n_components=4).fit_transform(med_lat)

# reducer = umap.UMAP()
# med_lat_umap = reducer.fit_transform(med_lat_pca)

# plt.scatter(med_lat_umap[:,0],med_lat_umap[:,1],s=5,edgecolor='k')
# plt.show()

In [None]:
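# NOTE: this whole cell is disabled (every line is commented out). It is the
# only place in view where `data_before` is built as a cells x genes DataFrame;
# the differential-expression cell below indexes `data_before.loc[...]`, so it
# needs that DataFrame rather than the array assigned in the UMAP cell.
# #----------------------------------------------------#
# ########## cell types in segregated cluster ##########
# #----------------------------------------------------#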


# #---------------------------------------#
# ### concatenate datasets (raw counts) ###
# data_before = pd.DataFrame(np.hstack((lush.values,
#                                       burns.values,
#                                       hoa.values)).T,
                           
#                            columns=lush.index.values,
#                            index=np.concatenate(
#                                (lush.columns.values,
#                                 burns.columns.values,
#                                 hoa.columns.values)))
# #---------------------------------------------------#


# #------------------------------------------------#
# ### cell type counts in the segregated cluster ###

# dvsc = np.sum((labels_df['gmm_cluster']==1)&
#               (metadata['subtype']=='D/V amplifying sc'))

# diffsc = np.sum((labels_df['gmm_cluster']==1)&
#                 (metadata['subtype']=='differentiating sc'))

# iphc_bc = np.sum((labels_df['gmm_cluster']==1)&
#                  (metadata['organ']=='corti')&
#                  (metadata['type']=='sc')&
#                  (data_before[['Plp1','Plp2']].sum(axis=1)>0))

# fish_other = np.sum((labels_df['gmm_cluster']==1)&
#                     (metadata['species']=='D. rerio')&
#                     (metadata['subtype']!='D/V amplifying sc')&
#                     (metadata['subtype']!='differentiating sc'))

# mouse_other = np.sum((labels_df['gmm_cluster']==1))-\
#                       dvsc-diffsc-iphc_bc-fish_other     
# #--------------------------------------------------#


# #-----------------#
# ### plot figure ###
# _,(a0, a1) = plt.subplots(2,1,gridspec_kw=
#                           {'height_ratios':[8, 1]},
#                           figsize=(1.25,1.25))

# pie = a0.pie([dvsc,
#               diffsc,
#               fish_other,
#               iphc_bc,
#               mouse_other],
#              colors=
#              ['dimgray',
#               'darkgray',
#               'lightgray',
#               'whitesmoke',
#               'black'])

# a1.axis("off") 
# a1.legend(handles=pie[0],
#            labels=["D/V sc's (D. rerio)",
#                    "diff. sc's (D. rerio)",
#                    'Other (D. rerio)',
#                    "IPhC's & BC's (M. mus.)",
#                    'Other (M. mus.)'],
#           loc='center',
#           frameon=False,
#           fontsize=4)

# plt.savefig(('metaAnalysis/'
#              'figures/'
#              'type.png'),
#             dpi=dpi)
# #------------------#

In [None]:
# IT LOOKS LIKE DVSCs IN CLUSTER 1 ARE CYCLING MORE THAN OTHER DVSCs? HOW?
#
# Differential expression (DESeq2 through rpy2) between cycling (non-G1)
# D/V amplifying support cells inside cluster 1 and outside it.

# Align cluster and phase labels to `metadata` by cell id before combining
# them. Combining Series that carry different indexes and then applying the
# resulting positions to `metadata.index` is only correct when all three
# tables share the same row order.
cell_cluster = labels_df['gmm_cluster'].reindex(metadata.index)
cell_phase = phases['phase'].reindex(metadata.index)

# Cells without a phase or cluster label are left out of both groups.
cycling_dvsc = (cell_phase.notna() &
                (cell_phase != 'G1') &
                (metadata['subtype'] == 'D/V amplifying sc')).values
in_clus1 = (cell_cluster == 1).values
in_other = (cell_cluster.notna() & (cell_cluster != 1)).values

clus1 = metadata.index.values[cycling_dvsc & in_clus1]
clus02 = metadata.index.values[cycling_dvsc & in_other]

ids = np.concatenate((clus1,clus02))

# `ids` is clus1 followed by clus02 and the two groups are disjoint, so the
# group labels can be written down directly.
group_labels = ['clus1']*len(clus1) + ['clus02']*len(clus02)

# NOTE(review): `data_before` must be a cells x genes DataFrame here (it is
# indexed with `.loc`); presumably it should hold raw counts -- confirm.
# 1 is added to every count before the matrix is handed to DESeq2.
dds = deseq.DESeqDataSetFromMatrix(
    countData=data_before.loc[ids,:].values.T+1,
    colData=robjects.DataFrame({'label':robjects.StrVector(group_labels)}),
    design=Formula('~ label'))
dds = deseq.DESeq(dds)

res = deseq.results(dds)

# one row per gene, sorted by log2 fold change (largest first)
res_df = pd.DataFrame(index=data_before.columns.values)
res_df['P.Value'] = dollar(res,'pvalue')
res_df['adj.P.Val'] = dollar(res,'padj')
res_df['logFC'] = dollar(res,'log2FoldChange')

res_df = res_df.sort_values('logFC',
                            ascending=False)

print(res_df.head(20))
print(res_df.tail(20))




In [None]:

from matplotlib_venn import venn2, venn3

#------------------
### identifiers for hoa et al
_is_hoa = metadata['dataset']=='hoa et al'
_in_clus1 = labels_df['gmm_cluster']==1

# cluster-1 cells, split by cell-cycle phase
clus1_s = metadata.index.values[
    np.where(_is_hoa & _in_clus1 & (phases['phase']=='S'))]

clus1_g2m = metadata.index.values[
    np.where(_is_hoa & _in_clus1 & (phases['phase']=='G2M'))]

# every hoa et al cell outside cluster 1 (no phase filter)
clus02 = metadata.index.values[
    np.where(_is_hoa & (labels_df['gmm_cluster']!=1))]

hoa_ids = np.concatenate((clus1_s, clus1_g2m, clus02))
#-------------


# NOTE(review): the DESeq2 run that builds `de_hoa` is commented out below, so
# this cell only works if `de_hoa` already exists in the kernel -- confirm
# where it is defined before relying on a fresh Restart & Run All.

# # deseq2
# dds = deseq.DESeqDataSetFromMatrix(countData=data_before.loc[hoa_ids,:].values.T+1,
#                                    colData=robjects.DataFrame({'label':robjects.StrVector(
#                                    ['clus1_s' if idx in clus1_s else 'clus1_g2m' if idx in 
#                                     clus1_g2m else 'clus02' for idx in hoa_ids])}),
#                                    design=Formula('~ label'))
# dds = deseq.DESeq(dds)

# hoa_s = deseq.results(dds,contrast=robjects.StrVector(
#                       ['label','clus02','clus1_s']))

# hoa_g2m = deseq.results(dds,contrast=robjects.StrVector(
#                         ['label','clus1_s','clus1_g2m']))

# de_hoa = pd.DataFrame(index=data_before.columns.values)
# de_hoa['s.P.Value'] = dollar(hoa_s,'pvalue')
# de_hoa['s.adj.P.Val'] = dollar(hoa_s,'padj')
# de_hoa['s.logFC'] = dollar(hoa_s,'log2FoldChange')
# de_hoa['g2m.P.Value'] = dollar(hoa_g2m,'pvalue')
# de_hoa['g2m.adj.P.Val'] = dollar(hoa_g2m,'padj')
# de_hoa['g2m.logFC'] = dollar(hoa_g2m,'log2FoldChange')


# up/down-regulated genes per contrast: |logFC| above 1.25 and adjusted p below 0.05
# NOTE(review): logFC is compared with 1.25 directly (not log2(1.25)) -- confirm
# this is the intended scale.
_genes = de_hoa.index.values

upreg_s = _genes[np.where((de_hoa['s.logFC']>1.25)&
                          (de_hoa['s.adj.P.Val']<0.05))]

downreg_s = _genes[np.where((de_hoa['s.logFC']<-1.25)&
                            (de_hoa['s.adj.P.Val']<0.05))]

upreg_g2m = _genes[np.where((de_hoa['g2m.logFC']>1.25)&
                            (de_hoa['g2m.adj.P.Val']<0.05))]

downreg_g2m = _genes[np.where((de_hoa['g2m.logFC']<-1.25)&
                              (de_hoa['g2m.adj.P.Val']<0.05))]

# two Venn diagrams: the S-contrast up set, then the S-contrast down set,
# each against both G2M-contrast sets
for _s_genes in (upreg_s, downreg_s):
    plt.figure(figsize=(2.5,2.5))
    venn3([set(_s_genes), set(upreg_g2m), set(downreg_g2m)])
    plt.show()



In [None]:
## DE comparison (amplifying S) ##
# For each dataset (hoa et al, lush et al) DESeq2 contrasts the amplifying
# S-phase cells (label 1) with the non-amplifying cells of the same dataset
# (label 0). The resulting DE gene sets are overlapped in a Venn diagram.

from matplotlib_venn import venn2


def _de_amp_s(cache_path, cell_ids, positive_ids):
    """Return DESeq2 results for `positive_ids` vs. the rest of `cell_ids`.

    The table is read from `cache_path` when that CSV exists; otherwise it is
    computed from `data_before` and written to `cache_path`. Delete the CSV to
    force a recomputation.

    Parameters
    ----------
    cache_path : str
        CSV file used as cache. Must be a plain string: the previous version
        passed a tuple of path fragments, which pandas rejects, so the cache
        was never read or written.
    cell_ids : array-like
        Row labels of `data_before` to include in the comparison.
    positive_ids : array-like
        Members of `cell_ids` that get label 1; all other cells get label 0.

    Returns
    -------
    pandas.DataFrame
        Indexed by `data_before.columns`, with columns 'P.Value', 'adj.P.Val'
        and 'logFC', sorted by 'logFC' in descending order.
    """
    try:
        return pd.read_csv(cache_path, index_col=0)
    except FileNotFoundError:
        pass  # no cached table yet -> run DESeq2 below

    positive = set(positive_ids)
    labels = robjects.IntVector([1 if idx in positive else 0
                                 for idx in cell_ids])

    # matrix is transposed and every count is incremented by 1 before DESeq2
    dds = deseq.DESeqDataSetFromMatrix(
        countData=data_before.loc[cell_ids,:].values.T+1,
        colData=robjects.DataFrame({'label': labels}),
        design=Formula('~ label'))
    ds_res = deseq.results(deseq.DESeq(dds))

    de = pd.DataFrame(index=data_before.columns.values)
    de['P.Value'] = dollar(ds_res, 'pvalue')
    de['adj.P.Val'] = dollar(ds_res, 'padj')
    de['logFC'] = dollar(ds_res, 'log2FoldChange')
    de = de.sort_values('logFC', ascending=False)

    de.to_csv(cache_path)
    return de


# cell IDs
hoa_ampS = metadata.index.values[np.where(
    amplifying&
    (phases['phase']=='S')&
    (metadata['dataset']=='hoa et al'))]

hoa_not = metadata.index.values[np.where(
    ~amplifying&
    (metadata['dataset']=='hoa et al'))]

lush_ampS = metadata.index.values[np.where(
    amplifying&
    (phases['phase']=='S')&
    (metadata['dataset']=='lush et al'))]

lush_not = metadata.index.values[np.where(
    ~amplifying&
    (metadata['dataset']=='lush et al'))]

hoa_s = np.concatenate((hoa_ampS, hoa_not))
lush_s = np.concatenate((lush_ampS, lush_not))


# amp S vs ~amp (deseq2), one dataset at a time
de_hoaS = _de_amp_s('metaAnalysis/results/hoa_de_s.csv',
                    hoa_s, hoa_ampS)

de_lushS = _de_amp_s('metaAnalysis/results/lush_de_s.csv',
                     lush_s, lush_ampS)


# DE genes: adjusted p below `pval` and |log2 FC| above log2(`thresh`)
pval = 0.05
thresh = 1.25

deg_hoaS = de_hoaS.index.values[
    np.where((de_hoaS['adj.P.Val']<pval)&
             (de_hoaS['logFC'].abs()>np.log2(thresh)))]

deg_lushS = de_lushS.index.values[
    np.where((de_lushS['adj.P.Val']<pval)&
             (de_lushS['logFC'].abs()>np.log2(thresh)))]


# plot venn diagram
plt.figure(figsize=(4,4))

venn2([set(deg_hoaS),
       set(deg_lushS)],
      ('adult M. mus. \n(hoa et al.)',
       'D. rerio \n(lush et al.)'),
      set_colors=('teal',
                  'palevioletred'),
      alpha=1)

plt.title('DE Genes'+
          '\nAmplifying S Phase')
plt.tight_layout()

plt.savefig(('metaAnalysis/'
             'figures/'
             'de_s.png'),
             dpi=300)

In [None]:
## GO comparison (amplifying S) ##
# Runs Enrichr (TF-gene co-occurrence library) on the S-phase DE genes of each
# dataset, keeps the terms whose Combined Score exceeds a per-dataset cut-off,
# and compares the kept term sets in a Venn diagram.


def _enrichr_ranked(genes, tag, outdir, csv_path):
    """Run Enrichr for `genes`, save the score-ranked table and return both.

    Returns the raw Enrichr object and its results table sorted by
    'Combined Score' (descending); the table is also written to `csv_path`.
    """
    result = gp.enrichr(gene_list=list(genes),
                        background=list(data_after.columns.values),
                        description=tag, organism='Mouse',
                        gene_sets='Enrichr_Submissions_TF-Gene_Coocurrence',
                        outdir=outdir,
                        cutoff=1, no_plot=True)
    ranked = pd.DataFrame(result.results).sort_values('Combined Score',
                                                      ascending=False)
    ranked.to_csv(csv_path)
    return result, ranked


# enriched TF-gene co-ocurrence terms (Enrichr)
enr, enr_hoaS = _enrichr_ranked(deg_hoaS, 'hoaS',
                                'metaAnalysis/gseapy/hoaS/',
                                'metaAnalysis/results/hoa_go_s.csv')

enr, enr_lushS = _enrichr_ranked(deg_lushS, 'lushS',
                                 'metaAnalysis/gseapy/lushS/',
                                 'metaAnalysis/results/lush_go_s.csv')


# enriched terms: rows whose Combined Score is above the cut-off
hoa_cut = 30                # knee
# plt.figure(figsize=(3,2))
# plt.scatter(np.arange(enr_hoaS.shape[0]),
#             enr_hoaS['Combined Score'],
#             c='k',s=2)
# ax = plt.gca()
# l,r = ax.get_xlim()
# plt.hlines(hoa_cut,l,r,
#            color='r')
# ax.set_xlim(l,r)
# plt.show()

_hoa_rows = enr_hoaS.index.values[
    np.where(enr_hoaS['Combined Score']>hoa_cut)]
hoaS_terms = enr_hoaS.loc[_hoa_rows,:]


lush_cut = 200              # knee
# plt.figure(figsize=(3,2))
# plt.scatter(np.arange(enr_lushS.shape[0]),
#             enr_lushS['Combined Score'],
#             c='k',s=2)
# ax = plt.gca()
# l,r = ax.get_xlim()
# plt.hlines(lush_cut,l,r,
#            color='r')
# ax.set_xlim(l,r)
# plt.show()

_lush_rows = enr_lushS.index.values[
    np.where(enr_lushS['Combined Score']>lush_cut)]
lushS_terms = enr_lushS.loc[_lush_rows,:]


# plot venn diagram
plt.figure(figsize=(4,4))

_term_sets = [set(hoaS_terms['Term']), set(lushS_terms['Term'])]
_set_names = ('adult M. mus. \n(hoa et al.)',
              'D. rerio \n(lush et al.)')
venn2(_term_sets, _set_names,
      set_colors=('teal', 'palevioletred'),
      alpha=1)

plt.title('TF-Gene Enriched GO Terms\nAmplifying S Phase')
plt.tight_layout()

plt.savefig('metaAnalysis/figures/go_s.png', dpi=300)

# print zebrafish only terms (kept for lush et al but not for hoa et al)
print('Zebrafish-only terms:')
_hoa_term_names = hoaS_terms['Term'].values
for term in lushS_terms['Term']:
    if term in _hoa_term_names:
        continue
    print(term)

In [None]:
## DE comparison (amplifying G2M) ##
# For each dataset (hoa et al, lush et al) DESeq2 contrasts the amplifying
# G2M-phase cells (label 1) with the amplifying S-phase plus non-amplifying
# cells of the same dataset (label 0). Uses hoa_ampS / hoa_not / lush_ampS /
# lush_not, which this cell does not define itself.


def _de_amp_g2m(cache_path, cell_ids, positive_ids):
    """Return DESeq2 results for `positive_ids` vs. the rest of `cell_ids`.

    The table is read from `cache_path` when that CSV exists; otherwise it is
    computed from `data_before` and written to `cache_path`. Delete the CSV to
    force a recomputation.

    Parameters
    ----------
    cache_path : str
        CSV file used as cache. Must be a plain string: the previous version
        wrote to a tuple of path fragments, which pandas rejects, so the cache
        was never created and the cell failed after running DESeq2.
    cell_ids : array-like
        Row labels of `data_before` to include in the comparison.
    positive_ids : array-like
        Members of `cell_ids` that get label 1; all other cells get label 0.

    Returns
    -------
    pandas.DataFrame
        Indexed by `data_before.columns`, with columns 'P.Value', 'adj.P.Val'
        and 'logFC', sorted by 'logFC' in descending order.
    """
    try:
        return pd.read_csv(cache_path, index_col=0)
    except FileNotFoundError:
        pass  # no cached table yet -> run DESeq2 below

    positive = set(positive_ids)
    labels = robjects.IntVector([1 if idx in positive else 0
                                 for idx in cell_ids])

    # matrix is transposed and every count is incremented by 1 before DESeq2
    dds = deseq.DESeqDataSetFromMatrix(
        countData=data_before.loc[cell_ids,:].values.T+1,
        colData=robjects.DataFrame({'label': labels}),
        design=Formula('~ label'))
    ds_res = deseq.results(deseq.DESeq(dds))

    de = pd.DataFrame(index=data_before.columns.values)
    de['P.Value'] = dollar(ds_res, 'pvalue')
    de['adj.P.Val'] = dollar(ds_res, 'padj')
    de['logFC'] = dollar(ds_res, 'log2FoldChange')
    de = de.sort_values('logFC', ascending=False)

    de.to_csv(cache_path)
    return de


# cell IDs
hoa_ampG2M = metadata.index.values[np.where(
    amplifying&
    (phases['phase']=='G2M')&
    (metadata['dataset']=='hoa et al'))]

lush_ampG2M = metadata.index.values[np.where(
    amplifying&
    (phases['phase']=='G2M')&
    (metadata['dataset']=='lush et al'))]

hoa_g2m = np.concatenate((hoa_ampS,
                          hoa_ampG2M,
                          hoa_not))

lush_g2m = np.concatenate((lush_ampS,
                           lush_ampG2M,
                           lush_not))


# amp G2M vs amp S + ~amp (deseq2), one dataset at a time
de_hoaG2M = _de_amp_g2m('metaAnalysis/results/hoa_de_g2m.csv',
                        hoa_g2m, hoa_ampG2M)

de_lushG2M = _de_amp_g2m('metaAnalysis/results/lush_de_g2m.csv',
                         lush_g2m, lush_ampG2M)


# DE genes: adjusted p below `pval` and |log2 FC| above log2(`thresh`)
pval = 0.05
thresh = 1.25

deg_hoaG2M = de_hoaG2M.index.values[
    np.where((de_hoaG2M['adj.P.Val']<pval)&
             (de_hoaG2M['logFC'].abs()>np.log2(thresh)))]

deg_lushG2M = de_lushG2M.index.values[
    np.where((de_lushG2M['adj.P.Val']<pval)&
             (de_lushG2M['logFC'].abs()>np.log2(thresh)))]


# plot venn diagram
plt.figure(figsize=(4,4))

venn2([set(deg_hoaG2M),
       set(deg_lushG2M)],
      ('adult M. mus. \n(hoa et al.)',
       'D. rerio \n(lush et al.)'),
      set_colors=('teal',
                  'palevioletred'),
      alpha=1)

plt.title('DE Genes'+
          '\nAmplifying G2M Phase')
plt.tight_layout()

plt.savefig(('metaAnalysis/'
             'figures/'
             'de_g2m.png'),
             dpi=300)

In [None]:
## limma: amplifying S-phase D/V sc's vs. hoa et al corti sc's ##
limma = importr('limma')

# cell IDs
_amp_s = amplifying & (phases['phase']=='S')

dvsc_s = metadata.index.values[
    np.where(_amp_s & (metadata['subtype']=='D/V amplifying sc'))]

corti_s = metadata.index.values[
    np.where(_amp_s &
             (metadata['dataset']=='hoa et al') &
             (metadata['organ']=='corti') &
             (metadata['type']=='sc'))]

# dvsc_g2m = metadata.index.values[np.where(amplifying&
#                                  (phases['phase']=='G2M')&
#                                  (metadata['subtype']==
#                                   'D/V amplifying sc'))]

# corti_g2m = metadata.index.values[np.where(amplifying&
#                                   (phases['phase']=='G2M')&
#                                   (metadata['organ']=='corti')&
#                                   (metadata['type']=='sc'))]

S = np.concatenate((dvsc_s, corti_s))
# G2M = np.concatenate((dvsc_g2m,corti_g2m))


# amplifying S phase (limma, using aligned data not raw counts)
data_s = data_after.loc[S,:]

# design matrix: an all-ones column plus an indicator that is 1 for the
# corti_s cells and 0 for everything else
design = pd.DataFrame({'fish': 1,
                       'fish vs. mouse': 0},
                      index=data_s.index.values)
design.loc[corti_s, 'fish vs. mouse'] = 1

fit = limma.lmFit(data_s.values.T, design.values)
fit = limma.eBayes(fit, trend=True)

# full, unsorted topTable for the last design column, re-indexed by gene
_top = limma.topTable(fit,
                      coef=design.shape[1],
                      number=data_s.shape[1],
                      **{'sort.by':'none'})
de_s = pd.DataFrame(_top, index=data_s.columns.values)
de_s = de_s.sort_values('logFC', ascending=False)

# de_dvsc.to_csv('metaAnalysis/results/de_dvsc.csv')

# genes with adjusted p below 0.001 and |log2 FC| above log2(1.25)
_hits = np.where((de_s['adj.P.Val']<0.001)&
                 (de_s['logFC'].abs()>np.log2(1.25)))
print(de_s.loc[de_s.index.values[_hits],:])



In [None]:
## markers for proliferating fish & mouse cells ## grn inference ##
# 1. load (or infer with GRNBoost2) a TF -> target network on `data_after`
# 2. limma DE between `mouse_near_dvsc` and `dvsc_near_dvsc` cells
# 3. keep network edges whose TF and target are both DE, and draw a volcano plot
from arboreto.algo import grnboost2
# imported locally so this cell does not rely on an import made elsewhere
from matplotlib.patches import ArrowStyle
limma = importr('limma')

# mouse TFs (via TFDB3)
mouse_tfs = pd.read_csv('metaAnalysis/refs/mouse_tfs.csv')
# add Cbx5 as one extra row after the last existing index value
mouse_tfs.loc[mouse_tfs.index.values[-1]+1,'Symbol'] = 'Cbx5'

try:
    network = pd.read_csv('metaAnalysis/results/grn.csv',index_col=0)

except FileNotFoundError:
    # no cached network: only a missing file triggers the (expensive)
    # inference; any other read error is raised instead of being hidden
    # GRN inference with GRNboost2 via Arboreto (all cells, not just near dvsc)
    network = grnboost2(expression_data=data_after,
                        tf_names=list(mouse_tfs['Symbol']))
    network.set_index(np.arange(network.shape[0]),inplace=True)
    network['weight'] = network['importance']
    network.drop('importance',axis=1,inplace=True)
    network.to_csv('metaAnalysis/results/grn.csv')

cut = 4 # cutoff weight for graph edges
# plt.figure()
# plt.plot(network.index.values,network['weight'])
# ax = plt.gca()
# l,r = ax.get_xlim()
# plt.hlines(cut,l,r,color='r')
# ax.set_xlim(l,r)
# plt.show()

# DE analysis with limma (using integrated data, not raw counts)
# cells near dvsc, mouse vs. fish
de_data = data_after.loc[np.concatenate((mouse_near_dvsc,dvsc_near_dvsc)),:]
# design: all-ones column plus an indicator that is 1 for mouse_near_dvsc cells
design = pd.DataFrame(index=de_data.index.values,
                      columns=['fish','fish vs. mouse'])
design['fish']=1
design['fish vs. mouse']=0
design.loc[mouse_near_dvsc,'fish vs. mouse']=1
fit = limma.lmFit(de_data.values.T,design.values)
fit = limma.eBayes(fit,trend=True)
de_dvsc = pd.DataFrame(limma.topTable(fit,
                       coef=design.shape[1],
                       number=de_data.shape[1],
                       **{'sort.by':'none'}),
                       index=de_data.columns.values)
de_dvsc.sort_values('logFC',ascending=False,inplace=True)
de_dvsc.to_csv('metaAnalysis/results/de_dvsc.csv')

# choose (in)significant genes from DE analysis
# (uses the unadjusted 'P.Value' column and |log2 FC| vs. log2(cutoff))
cutoff = 1.33
ins = de_dvsc.index.values[np.where((de_dvsc['P.Value']>0.05)|
                                    (de_dvsc['logFC'].abs()<np.log2(cutoff)))]
sig = de_dvsc.index.values[np.where((de_dvsc['P.Value']<0.05)&
                                    (de_dvsc['logFC'].abs()>np.log2(cutoff)))]

# find GRN for DE genes: edges above the weight cutoff whose TF and target are
# both significant (isin replaces a per-row scan of the `sig` array)
G = network.loc[network.index.values[np.where((network['weight']>cut)&
                network['TF'].isin(sig)&
                network['target'].isin(sig))],:]

# volcano plot: gray = not significant, black = significant,
# red = significant TFs that appear in the filtered network
plt.figure(figsize=(4,6))
plt.scatter(de_dvsc.loc[ins,'logFC'],
            -np.log10(de_dvsc.loc[ins,'P.Value']),
            c='gray',s=10)
plt.scatter(de_dvsc.loc[sig,'logFC'],
            -np.log10(de_dvsc.loc[sig,'P.Value']),
            c='black',s=10)
plt.scatter(de_dvsc.loc[G['TF'].unique(),'logFC'],
            -np.log10(de_dvsc.loc[G['TF'].unique(),'P.Value']),
            c='indianred',s=10)

# draw and annotate cutoffs
ax = plt.gca()
l,r = ax.get_xlim()
b,t = ax.get_ylim()
_xmax = max([l,r])                                 # symmetric x range is +/- _xmax
_left_x = np.mean([-_xmax,-np.log2(cutoff)])       # centre of the left (negative FC) band
_right_x = np.mean([_xmax,np.log2(cutoff)])        # centre of the right (positive FC) band
plt.hlines(-np.log10(0.05),-_xmax,_xmax,
           linestyle='--',linewidth=1,color='k',zorder=0)
plt.text(s='p = 0.05',x=l-0.075,y=-np.log10(0.05),
         va='center',ha='right',fontsize=8)
plt.vlines(np.log2(cutoff),b,t,
           linestyle='--',linewidth=1,color='k',zorder=0)
plt.vlines(-np.log2(cutoff),b,t,
           linestyle='--',linewidth=1,color='k',zorder=0)
plt.text(s='FC = +/- 1.33',x=0,y=17,ha='center')
# annotation text is passed positionally: the `s=` keyword of annotate was
# renamed to `text` in Matplotlib 3.3, so the keyword form is version dependent
plt.annotate('',xy=(0.22,17),xytext=(0.4,16),
             arrowprops=dict(arrowstyle=ArrowStyle.CurveA(),color='k'))
plt.annotate('',xy=(-0.22,17),xytext=(-0.4,16),
             arrowprops=dict(arrowstyle=ArrowStyle.CurveA(),color='k'))
ax.set_xlim(-_xmax,_xmax)
ax.set_ylim(b,t)

# # show regulation results
# print('DE genes: ')
# print(de_dvsc.loc[sig,:])
# print('\nRegulation: ')
# print(G)
# print('\nRegulation stats: ')
# print(G['target'].value_counts().to_frame().reset_index().rename(columns={'index':'target', 'target':'count'}))

# annotate genes (label text and y positions are hard-coded)
plt.text(x=_left_x,
         s='Cbx5',y=8.8,ha='center',color='indianred')
plt.text(x=_left_x,
         s='Ccnd2\nRrm1\nRrm2\nRpa1\nRpa2\nFen1\nAhcy',
         y=3.5,ha='center',color='royalblue')
plt.text(x=_right_x,
         s='Tubb5',y=17.5,ha='center',color='rebeccapurple')
plt.text(x=_right_x,
         s='Fos',y=7,ha='center',color='indianred')
plt.text(x=_right_x,
         s='Ucp2\n*Pdia3',y=4.8,ha='center',color='rebeccapurple')

# add arrows for GRN
plt.annotate('',xy=(_left_x,8.7),
             xytext=(_left_x,7.6),
             arrowprops=dict(arrowstyle=ArrowStyle.CurveA(),color='k'))
plt.annotate('',xy=(_right_x,6.9),
             xytext=(_right_x,5.75),
             arrowprops=dict(arrowstyle=ArrowStyle.CurveA(),color='k'))
plt.annotate('',xy=(_left_x,9.2),
             xytext=(_right_x,7.4),
             arrowprops=dict(arrowstyle='<->',color='k',
             connectionstyle='arc3,rad=1'))

# labels and save
ax.set_xlabel('log2 FC')
ax.set_ylabel('-log10 p value')
plt.tight_layout()
plt.savefig('metaAnalysis/figures/dvsc_volcano.png',dpi=300)

In [None]:
## markers for proliferating medial support cells (corti) ##
####### TODO: REDO WITH DESEQ2 -> ONLY ONE DATASET AT A TIME, RAW READS #########
# Medial sc's are taken to be corti sc's with Plp1 or Plp2 expression > 0.
# Those among `mouse_near_dvsc` are compared with those among
# `mouse_near_other` using limma on `data_after`.


def _corti_sc_ids(meta, dataset_name):
    """Index labels of `meta` rows with the given dataset, organ 'corti' and type 'sc'."""
    keep = ((meta['dataset']==dataset_name)&
            (meta['organ']=='corti')&
            (meta['type']=='sc'))
    return meta.index.values[np.where(keep)]


def _plp_positive_ids(expr, cell_ids):
    """Return the members of `cell_ids` with Plp1 or Plp2 > 0 in `expr`.

    `expr` is indexed genes (rows) x cells (columns). The returned labels come
    from the selected columns themselves. The previous version used positions
    within `cell_ids` to index the full `expr.columns` array, which returned
    unrelated cells whenever `cell_ids` was not a leading slice of the columns.
    """
    plp = expr.loc[['Plp1','Plp2'],cell_ids]
    expressed = (plp > 0).any(axis=0)
    return expressed.index.values[expressed.values]


# medial sc IDs: proliferating (near dvsc)
mouse_dvsc_meta = metadata.loc[mouse_near_dvsc,:]
burns_dvsc_idxs = _corti_sc_ids(mouse_dvsc_meta,'burns et al')
hoa_dvsc_idxs = _corti_sc_ids(mouse_dvsc_meta,'hoa et al')
burns_dvsc_medial_idxs = _plp_positive_ids(burns,burns_dvsc_idxs)
hoa_dvsc_medial_idxs = _plp_positive_ids(hoa,hoa_dvsc_idxs)
dvsc_idxs = np.concatenate((burns_dvsc_medial_idxs,hoa_dvsc_medial_idxs))

# medial sc IDs: near other
mouse_other_meta = metadata.loc[mouse_near_other,:]
burns_other_idxs = _corti_sc_ids(mouse_other_meta,'burns et al')
hoa_other_idxs = _corti_sc_ids(mouse_other_meta,'hoa et al')
burns_other_medial_idxs = _plp_positive_ids(burns,burns_other_idxs)
hoa_other_medial_idxs = _plp_positive_ids(hoa,hoa_other_idxs)
other_idxs = np.concatenate((burns_other_medial_idxs,hoa_other_medial_idxs))


# DE analysis with limma (using integrated data, not raw counts)
de_data = data_after.loc[np.concatenate((other_idxs,dvsc_idxs)),:]
# design: all-ones column plus an indicator that is 1 for the dvsc_idxs cells
design = pd.DataFrame(index=de_data.index.values,
                      columns=['other','other vs. dvsc'])
design['other']=1
design['other vs. dvsc']=0
design.loc[dvsc_idxs,'other vs. dvsc']=1
fit = limma.lmFit(de_data.values.T,design.values)
fit = limma.eBayes(fit,trend=True)
# full, unsorted topTable for the last design column, re-indexed by gene
de_medial = pd.DataFrame(limma.topTable(fit,
                         coef=design.shape[1],
                         number=de_data.shape[1],
                         **{'sort.by':'none'}),
                         index=de_data.columns.values)
de_medial.sort_values('logFC',ascending=False,inplace=True)
de_medial.to_csv('metaAnalysis/results/de_medial.csv')

# show top results
print(de_medial.head(15))
print(de_medial.tail(15))