In [55]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
import plotnine as gg
import umap
from pathlib import Path
import scipy.linalg
from sklearn.feature_selection import VarianceThreshold
from functions_utils import *

## Loading data

In [50]:
top_dir = os.path.dirname(os.getcwd())
proj_dir = 'data'


class load_data:
    
    
    def __init__(self,top_dir,proj_dir):
        
        self.top_dir = top_dir
        self.proj_dir = proj_dir
    
    
    def csvpath(self):
        path = os.path.join(self.top_dir, self.proj_dir, "backend")
        plates = [pl for pl in os.listdir(path)]
        csvpath = [os.path.join(path, pl, pl + "_dmso.csv") for pl in plates]  
        
        return csvpath

    
    def featlist(self):
        path = os.path.join(self.top_dir, self.proj_dir,"metadata","input", "feature_list.txt")
        featlist = np.loadtxt(str(path), dtype=str).tolist()
        return featlist

    
    
    
    
subclass = load_data(top_dir, proj_dir)

csvlist = subclass.csvpath()

featlist = subclass.featlist()




csvlist = csvlist[0:2]
csvlist

['/Users/habbasi/Documents/Github/broadinstitute/2020-06-01-Evidence-of-state-switching-in-single-cell-drug-response-Broad/data/backend/SQ00015142/SQ00015142_dmso.csv',
 '/Users/habbasi/Documents/Github/broadinstitute/2020-06-01-Evidence-of-state-switching-in-single-cell-drug-response-Broad/data/backend/SQ00015145/SQ00015145_dmso.csv']

## Data Preprocessing

In [52]:

def combined_csv(csvlist):
    
    combined = []

    for csv in csvlist:

        dmso = pd.read_csv(csv)

        dmso =znormalization(dmso)

        combined.append(dmso)
        
    
    return pd.concat(combined)
    
    
        
# Load and normalize every CSV listed in csvlist, stacked into one DataFrame.
combined = combined_csv(csvlist)
    
    

In [54]:

# Preview the first rows of the combined frame as a quick sanity check.
combined.head()

Unnamed: 0,Image_Metadata_Well,Metadata_Plate,Cells_AreaShape_Area,Cells_AreaShape_Center_X,Cells_AreaShape_Center_Y,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,Cells_AreaShape_EulerNumber,Cells_AreaShape_Extent,Cells_AreaShape_FormFactor,...,Nuclei_Texture_Variance_DNA_5_0,Nuclei_Texture_Variance_ER_10_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_ER_5_0,Nuclei_Texture_Variance_Mito_10_0,Nuclei_Texture_Variance_Mito_20_0,Nuclei_Texture_Variance_Mito_5_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_20_0,Nuclei_Texture_Variance_RNA_5_0
0,A01,SQ00015142,-0.992988,-0.004444,-1.686237,-0.169274,0.452708,0.151809,0.137827,-0.090261,...,2.425291,-1.215894,-1.154453,-0.778557,-1.536069,-1.535926,-0.952211,1.089463,-1.194015,1.837023
1,A01,SQ00015142,0.320931,0.90137,-1.727213,-0.094177,0.330674,0.151809,0.097741,-0.448981,...,-0.034576,-1.106966,-0.812636,-1.204521,-1.532026,-1.264949,-1.762282,-0.251114,0.581406,-0.658795
2,A01,SQ00015142,-0.076284,0.53575,-1.730491,-0.090749,0.722522,0.151809,1.23234,-0.83263,...,0.92998,-0.586733,-0.913551,-0.284658,-1.456381,-1.235929,-1.413553,0.145065,0.062055,0.075946
3,A01,SQ00015142,1.189914,-0.267954,-1.648539,-0.439036,0.15613,0.151809,0.67457,-0.806424,...,0.294527,0.253564,0.688204,-0.002272,0.581256,0.991929,0.409337,-0.559672,0.029269,-0.65256
4,A01,SQ00015142,-0.148629,-0.482056,-1.632149,-0.267161,0.44204,0.151809,0.823209,-1.568563,...,0.658042,0.949889,1.925098,0.550128,0.401861,0.634207,0.21796,0.454802,0.015853,-0.011019


In [None]:
# Split the combined frame into one subset per plate ID.
# NOTE(review): an earlier cell keeps only the first two CSV paths, so some of
# these plate IDs may be absent from `combined` and yield empty frames --
# confirm which plates are meant to be loaded.
data1, data2, data3, data4, data5 = (
    combined.query(f"Metadata_Plate == '{plate_id}'")
    for plate_id in (
        "SQ00015142",
        "SQ00015143",
        "SQ00015144",
        "SQ00015145",
        "SQ00015201",
    )
)

# Run feature selection on each plate subset, in plate order.
# feature_selection is not defined in the visible cells (presumably it comes
# from functions_utils -- confirm).
data1_feat, data2_feat, data3_feat, data4_feat, data5_feat = map(
    feature_selection, (data1, data2, data3, data4, data5)
)

In [25]:
# Report the size of each plate's feature-selection result, one per line.
print(
    *map(len, (data1_feat, data2_feat, data3_feat, data4_feat, data5_feat)),
    sep="\n",
)
    

    
    



In [29]:
    
# Combine the five per-plate feature-selection results.
# IntersecOfSets is not defined in the visible cells; presumably it returns
# the features common to all five inputs -- confirm.
final_list = IntersecOfSets(data1_feat, data2_feat, data3_feat, data4_feat, data5_feat)

# Write one feature name per line. The context manager guarantees the file is
# flushed and closed (the handle was previously left open), and the separator
# is a newline rather than a vertical tab ('\v') so the file is line-oriented.
with open("featureslist.txt", "w") as f:
    f.writelines(v + '\n' for v in final_list)






815
811
808
816
783
