In [52]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
import plotnine as gg
import umap
from pathlib import Path
import scipy.linalg
from sklearn.feature_selection import VarianceThreshold
from functions_utils import *

In [63]:
# Figures are written to a sibling "Figures" directory, relative to the
# notebook's working directory.
outpath = Path("..") / "Figures"
outpath

PosixPath('../Figures')

## Loading data

In [99]:
top_dir = os.path.dirname(os.getcwd())
proj_dir = 'data'


class load_data:
    """Locate the input files of a project stored under ``<top_dir>/<proj_dir>``.

    Attributes:
        top_dir  -- directory that contains the project directory
        proj_dir -- name of the project directory inside ``top_dir``
    """

    def __init__(self,top_dir,proj_dir):

        self.top_dir = top_dir
        self.proj_dir = proj_dir

    def csvpath(self):
        """Return the expected DMSO CSV path of every plate directory.

        Each sub-directory ``<plate>`` of ``<top_dir>/<proj_dir>/backend`` is
        mapped to ``backend/<plate>/<plate>_dmso.csv``. The files themselves
        are not opened or checked for existence.

        Returns:
            list of str, sorted by plate name.
        """
        path = os.path.join(self.top_dir, self.proj_dir, "backend")
        # os.listdir returns entries in arbitrary order and also lists plain
        # files (e.g. .DS_Store); keep only directories and sort them so the
        # plate order is reproducible between runs and machines.
        plates = sorted(
            pl for pl in os.listdir(path)
            if os.path.isdir(os.path.join(path, pl))
        )

        return [os.path.join(path, pl, pl + "_dmso.csv") for pl in plates]

    def featlist(self):
        """Return the pre-selected feature names as a list of str.

        The names are read from
        ``<top_dir>/<proj_dir>/metadata/input/feature_list_reduced.txt``.
        """
        path = os.path.join(self.top_dir, self.proj_dir,"metadata","input", "feature_list_reduced.txt")
        # ndmin=1 keeps a one-entry file from collapsing into a 0-d array,
        # whose .tolist() would be a bare string instead of a list.
        return np.loadtxt(str(path), dtype=str, ndmin=1).tolist()

    
    
    
    
# NOTE: despite its name, `subclass` is a plain load_data instance, not a subclass.
subclass = load_data(top_dir, proj_dir)

# One "<plate>_dmso.csv" path per entry found in the backend directory.
csvlist = subclass.csvpath()

# Feature names read from feature_list_reduced.txt.
featlist = subclass.featlist()


# Display the feature list as the cell output.
featlist

['Cells_AreaShape_Area',
 'Cells_AreaShape_Compactness',
 'Cells_AreaShape_Eccentricity',
 'Cells_AreaShape_Extent',
 'Cells_AreaShape_FormFactor',
 'Cells_AreaShape_Solidity',
 'Cells_AreaShape_Zernike_0_0',
 'Cells_AreaShape_Zernike_2_0',
 'Cells_AreaShape_Zernike_3_1',
 'Cells_AreaShape_Zernike_3_3',
 'Cells_AreaShape_Zernike_4_0',
 'Cells_AreaShape_Zernike_4_2',
 'Cells_AreaShape_Zernike_4_4',
 'Cells_AreaShape_Zernike_5_3',
 'Cells_AreaShape_Zernike_5_5',
 'Cells_AreaShape_Zernike_6_4',
 'Cells_AreaShape_Zernike_6_6',
 'Cells_AreaShape_Zernike_7_5',
 'Cells_AreaShape_Zernike_7_7',
 'Cells_AreaShape_Zernike_9_5',
 'Cells_AreaShape_Zernike_9_7',
 'Cells_AreaShape_Zernike_9_9',
 'Cells_Correlation_Correlation_DNA_AGP',
 'Cells_Correlation_Correlation_DNA_ER',
 'Cells_Correlation_Correlation_DNA_Mito',
 'Cells_Correlation_Correlation_DNA_RNA',
 'Cells_Correlation_Correlation_ER_AGP',
 'Cells_Correlation_Correlation_ER_RNA',
 'Cells_Correlation_Correlation_Mito_AGP',
 'Cells_Correlatio

## Data Preprocessing

In [None]:
def _whiten_profiles(scaled_data, reg_param=1e-6):
    """Whiten z-scored profiles around their own column means.

    Delegates to ``whitening_transform`` and ``whiten`` from functions_utils
    (presumably a regularised whitening matrix and its application -- confirm
    there).
    """
    dmso_mean = np.mean(scaled_data, axis=0)
    AW = whitening_transform(scaled_data - dmso_mean, reg_param, rotate=False)
    return whiten(scaled_data, dmso_mean, AW)


def _umap_frame(values, random_state):
    """Embed the rows of ``values`` with UMAP; columns are 'UMAP-1' and 'UMAP-2'."""
    embedding = umap.UMAP(random_state=random_state).fit_transform(values)
    return pd.DataFrame(embedding, columns=['UMAP-1', 'UMAP-2'])


def processing_data(csvlist, featlist, outdata, random_state=42):
    """
    Z-score each CSV, optionally whiten and/or UMAP-embed it, and combine all CSVs.

    Every CSV is processed independently (its own StandardScaler, whitening
    and UMAP fit); the per-CSV results are then concatenated row-wise.

    Arguments:

    csvlist -- list of CSV paths to load

    featlist -- list of pre-selected feature columns present in every CSV

    outdata -- which representation to return, one of:
                "combined" -- z-normalized features
                "combined+umap" -- UMAP of the z-normalized features
                "combined+whitened" -- whitened z-normalized features
                "combined+whitened+umap" -- UMAP of the whitened z-normalized features

    random_state -- seed passed to UMAP so the embeddings are reproducible
                    (used by both "+umap" variants)

    Returns:

    DataFrame holding the metadata columns followed by either the feature
    columns or 'UMAP-1'/'UMAP-2'. The index is not reset, so row labels
    repeat across CSVs.

    Raises:

    ValueError -- if outdata is not one of the values above, or csvlist is empty
    """
    valid_outdata = ("combined", "combined+umap",
                     "combined+whitened", "combined+whitened+umap")

    # Fail before any CSV is read: an unknown value used to fall through every
    # branch and only surface as pandas' "No objects to concatenate".
    if outdata not in valid_outdata:
        raise ValueError(
            "unknown outdata {!r}; expected one of {}".format(outdata, ", ".join(valid_outdata))
        )

    csvlist = list(csvlist)
    if not csvlist:
        raise ValueError("csvlist is empty: there is nothing to combine")

    # Columns carried over unchanged from every CSV (identical for all files).
    metadata = ['Image_FileName_OrigMito', 'Image_FileName_OrigER', 'Image_PathName_OrigER',
                'Metadata_mg_per_ml','Image_PathName_OrigDNA','Image_PathName_OrigRNA',
                'Image_PathName_OrigMito','Image_FileName_OrigDNA','Image_FileName_OrigRNA',
                'Metadata_broad_sample','Image_Metadata_Well','Metadata_plate_map_name',
                'Metadata_mmoles_per_liter','Image_FileName_CellOutlines','Image_Metadata_Site',
                'Metadata_Plate','Image_FileName_OrigAGP','Image_PathName_CellOutlines',
                'Image_PathName_OrigAGP','Cells_Location_Center_X','Cells_Location_Center_Y',
                'Nuclei_Location_Center_X','Nuclei_Location_Center_Y','Cytoplasm_Location_Center_X',
                'Cytoplasm_Location_Center_Y','ObjectNumber']

    do_whitening = "whitened" in outdata
    do_umap = outdata.endswith("umap")

    combined = []

    for csv_path in csvlist:

        dmso = pd.read_csv(csv_path)

        scaled_data = StandardScaler().fit_transform(dmso[featlist].values)

        profiles = _whiten_profiles(scaled_data) if do_whitening else scaled_data

        if do_umap:
            features = _umap_frame(profiles, random_state)
        else:
            features = pd.DataFrame(profiles, columns=featlist)

        # Both frames share the CSV's default positional index, so an index
        # merge lines the rows up one-to-one.
        df = dmso[metadata].merge(features,
                                  how='left',
                                  left_index=True,
                                  right_index=True)

        combined.append(df)

    return pd.concat(combined)


# The other outputs are selected through `outdata` (processing_data has no
# do_umap / do_whitening keywords):
# combined_csv = processing_data(csvlist, featlist, outdata="combined")
# combined_umap = processing_data(csvlist, featlist, outdata="combined+umap")
# df_whitened = processing_data(csvlist, featlist, outdata="combined+whitened")

df_whitened_umap = processing_data(csvlist, featlist, outdata="combined+whitened+umap")


In [None]:
# Preview the first rows of the combined frame.
df_whitened_umap.head()

In [None]:
def plotting_function(data, title, filename, outpath):
    """
    Scatter-plot a UMAP embedding coloured by plate, save it and return it.

    Arguments:

    data -- DataFrame with 'UMAP-1', 'UMAP-2' and 'Metadata_Plate' columns

    title -- plot title

    filename -- name of the image file to write

    outpath -- directory the file is written to (created if missing)

    Returns:

    the plotnine ggplot object, so the notebook renders it as the cell output
    """
    p = (
        gg.ggplot(data, gg.aes(x='UMAP-1', y='UMAP-2', color= 'Metadata_Plate'))
        + gg.geom_point(size = 0.003)
        + gg.xlab("UMAP-1")
        + gg.ylab("UMAP-2")
        + gg.ggtitle(title)
        + gg.theme_bw()
    )

    # Create the target directory first so saving does not fail on a fresh
    # checkout where the Figures directory has not been made yet.
    os.makedirs(outpath, exist_ok=True)

    # Save through the plot's own method rather than the module-level ggsave helper.
    p.save(filename=filename, path=outpath)

    return p
    
    
    
    
    

In [None]:
# Render and save the plate-coloured UMAP of the whitened DMSO profiles.
plotting_function(
    data=df_whitened_umap,
    title="UMAP of DMSO-whitened cellprofiles",
    filename="UMAP_DMSO-whitened_cellprofiles.png",
    outpath=outpath,
)