In [118]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import seaborn as sns
import plotnine as gg
from mpl_toolkits.mplot3d import Axes3D
from pylab import *
from sklearn.manifold import TSNE
from palettable.colorbrewer.sequential import *
from pathlib import Path
import umap.umap_ as umap
from scipy import stats
from scipy.stats import ttest_ind


In [119]:
# Figures go to a "Figures" folder located in the parent of the current
# working directory.
top_dir = os.path.split(os.getcwd())[0]
outpath = os.path.join(top_dir, "Figures")
outpath

'/Users/habbasi/Documents/Github/broadinstitute/ProteinKinase_NathianelGray/Figures'

## Data Paths

In [157]:
# Read the profile table from the "collated" folder one level up.
path = Path("../collated/")
#outpath = Path("/Welch_ttest/")
fname = "C-7210-01-CMP-008-gray_collapsed_sampled.csv"
fpath = os.path.join(path, fname)
df = pd.read_csv(fpath)

# Columns whose name contains "Metadata" are annotations; everything else is
# treated as a measured variable.
metadata = []
var = []
for col in df.columns:
    if "Metadata" in col:
        metadata.append(col)
    else:
        var.append(col)

# Variables left out of every downstream analysis.
exclude_var = [
    'Cells_Correlation_Costes_DNA_Mito',
    'Cytoplasm_Correlation_Costes_DNA_Mito',
    'Cytoplasm_Correlation_Costes_DNA_RNA',
]

variables = list(filter(lambda name: name not in exclude_var, var))




## Welch's t-test

In [4]:
# Clean the annotation columns by assignment. The previous form,
# `df[col].replace(..., inplace=True)`, mutates a temporary column object:
# pandas warns about it and it has no effect under copy-on-write.
df['Metadata_Target'] = df['Metadata_Target'].replace(np.nan, 'NA')
df['Metadata_Annotation'] = df['Metadata_Annotation'].replace('active ', 'active')

p = ['NA', 'SECRET']

excluded = ["C1", "C2", "C3", "C4"]

# One small frame per (cell line, variable); concatenated after the loops.
tt = []

for c in df.Metadata_cell_line.unique():
    d = df.query('Metadata_cell_line in @ c and Metadata_compound_name not in @ excluded and Metadata_Target in @p')

    # Compound names in order of first appearance. Computed once per cell line
    # instead of once per variable.
    test = d.Metadata_compound_name.unique().tolist()

    # Every unordered pair, ordered (0,1), (0,2), ..., (1,2), ... -- for three
    # compounds this is exactly the three comparisons that were hard-coded.
    pairs = [(first, second)
             for pos, first in enumerate(test)
             for second in test[pos + 1:]]

    # Rows of each compound, extracted once and reused for every variable.
    groups = {name: d.loc[d.Metadata_compound_name == name] for name in test}

    for i in variables:
        # Welch's t-test (unequal variances) for each pair of compounds.
        results = [ttest_ind(groups[first][i].values,
                             groups[second][i].values,
                             equal_var=False)
                   for first, second in pairs]

        d1 = {"Paired": [first + ', ' + second for first, second in pairs],
              "Statistics": [res[0] for res in results],
              "P-value": [res[1] for res in results],
              "Variables": str(i),
              "Cellline": str(c)}

        prf = pd.DataFrame(d1, columns=['Paired', 'Statistics', 'P-value', 'Variables', "Cellline"])

        tt.append(prf)

# The row index is deliberately left as produced by concat (repeating 0..k-1):
# later cells call reset_index() and drop the resulting "level_0" column.
final_tt = pd.concat(tt)

# np.log10 is used explicitly rather than the name leaked by `from pylab import *`.
final_tt['logp'] = -np.log10(final_tt['P-value'])

# 1-based running row number.
final_tt['index'] = np.arange(1, len(final_tt) + 1)
 
    


## Top 15 features that differ most between the active and inactive compounds

In [124]:
c = "A549"

# -log10(p) of the BSJ-04-030 vs BSJ-03-136 comparison for cell line `c`,
# sorted so the largest values come first.
tmp = final_tt.query("Paired == 'BSJ-04-030, BSJ-03-136' and Cellline in @ c")
tmp = tmp.rename(columns={"logp": "Active_Inactive"})
tmp = tmp.reset_index().drop(columns=["level_0", "index"])
tmp = tmp.sort_values(by=["Active_Inactive"], ascending=False)

# First and last 15 variables of that ranking.
top_var = tmp['Variables'].head(15).tolist()
bottom_var = tmp['Variables'].tail(15).tolist()


## Plotting Density plots for those top and bottom features


p = ["NA", 'SECRET']

excluded = ["C1", "C2", "C3", "C4"]


d = df.query('Metadata_cell_line in @ c and Metadata_compound_name not in @ excluded and Metadata_Target in @p')


def save_density_plots(data, features, out_dir):
    """Save one density plot per feature, coloured by compound, as a PNG.

    Parameters
    ----------
    data : DataFrame holding the feature columns and 'Metadata_compound_name'.
    features : iterable of column names to plot.
    out_dir : directory that receives '<feature>.png'; created if missing,
        because saving into a non-existent directory fails.
    """
    os.makedirs(out_dir, exist_ok=True)

    for feature in features:
        g = (
            gg.ggplot()
            + gg.geom_density(
                gg.aes(x=str(feature), y='stat(density)',
                       color='Metadata_compound_name', fill='Metadata_compound_name'),
                data=data, alpha=.1)
            + gg.xlab(str(feature))
            + gg.ylab("Density")
            + gg.labs(title='Density plot')
            + gg.theme_classic()
        )

        gg.ggsave(filename=str(feature) + ".png", plot=g, path=out_dir)


# The same plotting routine serves both ends of the ranking. The loop variable
# is local to the helper, so it no longer overwrites the global `var` list.
save_density_plots(d, top_var, os.path.join(outpath, "Densityplot", c, "top_variables"))
save_density_plots(d, bottom_var, os.path.join(outpath, "Densityplot", c, "bottom_variables"))





In [127]:
# Narrow the selection to the first and last 10 variables of the ranking.
top_var = tmp['Variables'].head(10).tolist()
bottom_var = tmp['Variables'].tail(10).tolist()
bottom_var

['Cells_Granularity_3_Mito',
 'Cytoplasm_Correlation_K_AGP_DNA',
 'Nuclei_Correlation_Correlation_DNA_RNA',
 'Nuclei_Correlation_Overlap_RNA_AGP',
 'Nuclei_Texture_DifferenceVariance_Mito_20_0',
 'Cells_Correlation_Overlap_DNA_ER',
 'Nuclei_Texture_DifferenceVariance_ER_5_0',
 'Nuclei_Intensity_MeanIntensity_DNA',
 'Cytoplasm_Correlation_Overlap_DNA_AGP',
 'Nuclei_Texture_Gabor_AGP_10']

In [94]:
c = "A549"

# -log10(p) per variable for DMSO vs BSJ-04-030 (annotated inactive in the
# metadata), across all cell lines.
d1 = final_tt.query("Paired == 'DMSO, BSJ-04-030'")
d1 = d1.rename(columns={"logp": "Control_Inactive"})
d1 = d1.reset_index().drop(columns=["level_0", "index"])

# Same for DMSO vs BSJ-03-136 (annotated active). The identifying columns are
# dropped here because d1 already carries them.
d2 = final_tt.query("Paired == 'DMSO, BSJ-03-136'")
d2 = d2.rename(columns={"logp": "Control_Active"})
d2 = d2.reset_index().drop(columns=["level_0", "index", "Cellline", "Variables"])

# Side-by-side join on the fresh 0..n-1 index: row k of d1 is paired with
# row k of d2.
result = pd.concat([d1, d2], axis=1)

col_var = ["Control_Inactive", "Control_Active", "Variables", "Cellline"]
prf = result[col_var]

pf = prf.query("Cellline in @ c")

# 95th percentile of each axis, used as the labelling threshold.
control_active = pf["Control_Active"].quantile(0.95)
control_inactive = pf["Control_Inactive"].quantile(0.95)

# only label points which are greater than threshold
selected_var = pf.query("Control_Active > @ control_active and Control_Inactive > @ control_inactive")

g = (
    gg.ggplot(pf, gg.aes(x='Control_Inactive', y='Control_Active', label="Variables"))
    + gg.geom_point(size=1.5, color="#de2d26")
    + gg.geom_vline(xintercept=control_inactive, color="blue", linetype='dashed')
    + gg.theme_classic()
    + gg.geom_hline(yintercept=control_active, color="black")
    + gg.geom_label(
        data=selected_var,
        size=8,
        label_size=0,
        label_padding=1,
        show_legend=False,
    )
    + gg.labs(
        title='SECRET ' + "[" + str(c) + "]",
        x="-log10 [p-value] \n\n Control_Inactive",
        y="Control_Active \n\n -log10 [p-value] ",
    )
)

gg.ggsave(filename='Welch_test_SECRET_95percentile_labeltext' + str(c) + ".png", plot=g, path=outpath)

g




Unnamed: 0,Control_Inactive,Control_Active,Variables,Cellline
0,6.293564,11.088178,Cells_AreaShape_Area,A549
1,2.689233,6.07316,Cells_AreaShape_Compactness,A549
2,1.022069,7.056555,Cells_AreaShape_Eccentricity,A549
3,1.973137,2.417587,Cells_AreaShape_Extent,A549
4,7.172603,5.444402,Cells_AreaShape_FormFactor,A549


In [93]:
# Names of the variables that passed both 95th-percentile thresholds.
d3_var = selected_var['Variables'].tolist()
d3_var

['Cells_Intensity_MADIntensity_DNA',
 'Cells_Intensity_MedianIntensity_DNA',
 'Cells_Intensity_MinIntensityEdge_RNA',
 'Cytoplasm_Intensity_MADIntensity_DNA',
 'Cytoplasm_Intensity_UpperQuartileIntensity_DNA',
 'Cytoplasm_Texture_Gabor_AGP_5']

In [81]:
p = ['NA', 'SECRET']

excluded = ["C1", "C2", "C3", "C4"]


d = df.query('Metadata_cell_line in @ c and Metadata_compound_name not in @ excluded and Metadata_Target in @p')
d3_var = selected_var['Variables'].tolist()

# Create the target folder up front: saving a figure into a directory that does
# not exist fails.
density_dir = os.path.join(outpath, "Densityplot", c)
os.makedirs(density_dir, exist_ok=True)

# One density plot per selected variable, coloured by compound. The loop
# variable is not called `var`, so the global `var` list is left intact.
for feature in d3_var:
    g = (
        gg.ggplot()
        + gg.geom_density(
            gg.aes(x=str(feature), y='stat(density)',
                   color='Metadata_compound_name', fill='Metadata_compound_name'),
            data=d, alpha=.1)
        + gg.xlab(str(feature))
        + gg.ylab("Density")
        + gg.labs(title='Density plot')
        + gg.theme_classic()
    )

    gg.ggsave(filename=str(feature) + ".png", plot=g, path=density_dir)






In [291]:
## c = "A549"
# NOTE: `c` is not set here; the value left by an earlier cell is used.

p = ["NA", 'SECRET']

excluded = ["C1", "C2", "C3", "C4"]

d = df.query('Metadata_cell_line in @ c and Metadata_compound_name not in @ excluded and Metadata_Target in @p')

test = d.copy()

feat = [col for col in test.columns if "Metadata" not in col]

featlist = ['Metadata_Well', 'Metadata_compound_name'] + feat

test = test[test.columns[test.columns.isin(featlist)]]

# Median of every feature per compound, then transposed so that each row is a
# feature and each column a compound. Only `feat` is aggregated: taking the
# median over 'Metadata_Well' (text) or over the grouping key itself raises in
# current pandas. numeric_only=True keeps the old "skip non-numeric" behaviour.
tmp2 = (test.groupby('Metadata_compound_name')[feat]
        .median(numeric_only=True)
        .T
        .reset_index()
        .rename(columns={'index': "variables"})
        )


def _compartment(name):
    """Return the first of Cells/Cytoplasm/Nuclei found in `name`, else 'Metadata'."""
    return next((part for part in ("Cells", "Cytoplasm", "Nuclei") if part in name),
                "Metadata")


tmp2["compartment"] = tmp2["variables"].map(_compartment)

# One flag column per keyword: the keyword itself when it occurs in the
# variable name, otherwise an empty string.
for keyword in ["RNA", "DNA", "ER", "AGP", "Mito", "AreaShape"]:
    tmp2[keyword] = tmp2["variables"].map(lambda name, kw=keyword: kw if kw in name else "")

# t1 = tmp2.query("ER == 'ER' and compartment == 'Cells'")
# t2 = tmp2.query("ER == 'ER' and compartment == 'Cytoplasm'")
# t3 = tmp2.query("ER == 'ER' and compartment == 'Cells'")

# .copy() makes t1 an independent frame, so adding a column below no longer
# triggers SettingWithCopyWarning.
t1 = tmp2.query("AreaShape == 'AreaShape'").copy()

# Absolute difference between the two compounds' medians, per variable.
t1['Diff'] = (t1['BSJ-03-136'] - t1['BSJ-04-030']).abs()

# Median of that difference within each compartment, shown transposed.
tmp3 = t1.groupby('compartment')['Diff'].median().reset_index()
t4 = tmp3.T
t4.reset_index()

# features = tmp3.compartment.tolist()
# values = tmp3.Diff.tolist().T

# pd.DataFrame(values)
#tmp3.style.background_gradient(cmap='Blues')

# t1.groupby("compartment")['BSJ-03-136'].median()


# t1       
         
# fig = plt.figure()

# ax = fig.add_subplot(111)

# xlabels = t1['variables'].tolist()
# plt.plot(t1['variables'], t1['BSJ-03-136'],'r-',linewidth=2)
# plt.plot(t1['variables'], t1['BSJ-04-030'],'c-',linewidth=2)
# plt.plot(t1['variables'], t1['DMSO'],'b-',linewidth=2)
# plt.xticks(xlabels, rotation='vertical')
# plt.legend(('BSJ-03-136', 'BSJ-04-030', 'DMSO'),
#            loc='upper right')
# plt.title('plotting median [RNA] featues z-scores ----{Cells}')
# plt.ylabel('Median Z-scores')

# ax = fig.add_subplot(122)

# xlabels = t2['variables'].tolist()
# plt.plot(t2['variables'], t2['BSJ-03-136'],'r-',linewidth=2)
# plt.plot(t2['variables'], t2['BSJ-04-030'],'c-',linewidth=2)
# plt.plot(t2['variables'], t2['DMSO'],'b-',linewidth=2)
# plt.xticks(xlabels, rotation='vertical')
# plt.legend(('BSJ-03-136', 'BSJ-04-030', 'DMSO'),
#            loc='upper right')
# plt.title('plotting median [RNA] featues z-scores ----{Cells}')
# plt.ylabel('Median Z-scores')

# fig.subplots_adjust(wspace=2)

# plt.show()



# ax = fig.add_subplot(132)
# plt.plot(x,z)
# ...

# plt.show()
    




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,index,0,1,2
0,compartment,Cells,Cytoplasm,Nuclei
1,Diff,4.15597,3.34811,2.2784


In [184]:
from sklearn.decomposition import PCA


In [197]:
c = "A549"
p = ['SECRET']
excluded = ["C1", "C2", "C3", "C4", "DMSO"]

# Keep rows of cell line `c` whose target is listed in `p` and whose compound
# is not one of the excluded names.
keep_compound = ~df["Metadata_compound_name"].isin(excluded)
keep_target = df["Metadata_Target"].isin(p)
keep_cell_line = df["Metadata_cell_line"].isin([c])
d = df[keep_compound & keep_target & keep_cell_line]

d["Metadata_cell_line"].unique()

array(['A549'], dtype=object)

In [198]:
# Annotation columns (old row labels kept in an "index" column) and the
# feature matrix of the current subset `d`.
pc_meta = d[metadata].reset_index()
pc_var = d[variables].to_numpy()

# Two-component PCA of the feature matrix.
model = PCA(n_components=2)
pca = model.fit_transform(pc_var)

# Explained variance of each component, as a percentage.
PC1, PC2 = (round(ratio, 3) * 100 for ratio in model.explained_variance_ratio_[:2])

test = pd.DataFrame(pca, columns=['PC1', 'PC2'])

# Setup plotting logic
# Both frames carry a 0..n-1 index, so the merge pairs rows by position.
combined = pd.merge(test, pc_meta, left_index=True, right_index=True)
combined.head()


# g = (ggplot(combined, gg.aes(x='PC1', y='PC2', color="Metadata_compound_name", shape='Metadata_cell_line')) 
#      + geom_point(size = 1.5)
#      + scale_colour_manual(name= 'Compound_name', values=["#de2d26", "#984ea3", "#01665e"])
#      + scale_shape_manual(name="Celllines", labels=['A549', 'U2oS'], values=['*', 'o'])
#      + theme_classic()
#      + labs(title='SECRET', x="PC1 "+ str([PC1])+"%", y="PC2 "+ str([PC2])+"%")
    

#     )

# ggsave(filename='SECRET_PCA.pdf', plot = g, path = outpath)
# g


Unnamed: 0,PC1,PC2,index,Metadata_Plate,Metadata_Well,Metadata_broad_sample,Metadata_Assay_Plate_Barcode,Metadata_Plate_Map_Name,Metadata_cell_line,Metadata_well_position,...,Metadata_pert_id_vendor,Metadata_broad_sample_type,Metadata_pert_vehicle,Metadata_pert_type,Metadata_label,Metadata_chemist,Metadata_compound_name,Metadata_Annotation,Metadata_Target,Metadata_Broad_barcode
0,-67.486211,0.384005,148,BR00100032,G05,BRD-U68785813-000-01-5,BR00100032,C-7210-01-CMP-008-gray,A549,G05,...,,trt,DMSO,trt,6,Baishan Jiang,BSJ-04-030,inactive (protac),SECRET,1136944000.0
1,-68.79709,7.458148,152,BR00100032,G09,BRD-U68785813-000-01-5,BR00100032,C-7210-01-CMP-008-gray,A549,G09,...,,trt,DMSO,trt,6,Baishan Jiang,BSJ-04-030,inactive (protac),SECRET,1136944000.0
2,-60.057484,18.089509,161,BR00100032,G18,BRD-U68785813-000-01-5,BR00100032,C-7210-01-CMP-008-gray,A549,G18,...,,trt,DMSO,trt,6,Baishan Jiang,BSJ-04-030,inactive (protac),SECRET,1136944000.0
3,-35.978915,23.952113,165,BR00100032,G22,BRD-U68785813-000-01-5,BR00100032,C-7210-01-CMP-008-gray,A549,G22,...,,trt,DMSO,trt,6,Baishan Jiang,BSJ-04-030,inactive (protac),SECRET,1136944000.0
4,61.290721,-13.297887,172,BR00100032,H05,BRD-U43181200-000-01-8,BR00100032,C-7210-01-CMP-008-gray,A549,H05,...,,trt,DMSO,trt,5,Baishan Jiang,BSJ-03-136,active (protac),SECRET,1136944000.0


In [199]:
n_pcs = model.components_.shape[0]

initial_feature_names = variables

# Number of features reported per component.
N_TOP_LOADINGS = 11

# For each component: indices of the N_TOP_LOADINGS largest absolute loadings,
# largest first.
most_important = [np.abs(model.components_[k]).argsort()[-N_TOP_LOADINGS:][::-1]
                  for k in range(n_pcs)]


def _names_in_feature_order(indices):
    """Return the feature names at `indices`, listed in feature order.

    The result follows the order of `initial_feature_names`, not the ranking by
    loading magnitude -- this matches what the nested loops used to produce.
    """
    wanted = set(indices.tolist())
    return [name for pos, name in enumerate(initial_feature_names) if pos in wanted]


# A set lookup replaces the feature-by-index nested scan, and no loop variable
# named `p` is left behind to overwrite the target filter list `p`.
pc1 = _names_in_feature_order(most_important[0])
pc2 = _names_in_feature_order(most_important[1])

dic1 = {'PC1': pc1}
dic2 = {'PC2': pc2}
print(dic1)
#print(dic2)



# # using LIST COMPREHENSION HERE AGAIN
# dic = {'PC{}'.format(i+1): most_important_names[i] for i in range(n_pcs)}




{'PC1': ['Cells_Intensity_MassDisplacement_ER', 'Cells_Intensity_MassDisplacement_Mito', 'Cells_RadialDistribution_RadialCV_ER_4of4', 'Cells_RadialDistribution_RadialCV_Mito_4of4', 'Cells_RadialDistribution_RadialCV_RNA_3of4', 'Cells_Texture_InfoMeas1_Mito_20_0', 'Cytoplasm_Intensity_MassDisplacement_AGP', 'Cytoplasm_Texture_SumEntropy_ER_20_0', 'Nuclei_Granularity_13_AGP', 'Nuclei_Intensity_MassDisplacement_DNA', 'Nuclei_RadialDistribution_RadialCV_ER_2of4']}


In [150]:
# Feature names selected for the second principal component.
print(dic2)

{'PC2': ['Cells_AreaShape_Compactness', 'Cells_AreaShape_Extent', 'Nuclei_AreaShape_Compactness', 'Nuclei_AreaShape_FormFactor', 'Nuclei_AreaShape_Zernike_0_0', 'Nuclei_AreaShape_Zernike_4_0', 'Nuclei_AreaShape_Zernike_4_4', 'Nuclei_AreaShape_Zernike_6_6', 'Nuclei_AreaShape_Zernike_8_8', 'Nuclei_Intensity_MassDisplacement_DNA', 'Nuclei_RadialDistribution_FracAtD_Mito_1of4']}


In [None]:
# NOTE(review): log10() is called without an argument, so this line cannot run
# as written; the intended operand is not recoverable from this notebook.
x = 11.72866053483449/log10()

In [None]:
import numpy as np
# NOTE(review): `finalvalues` is not defined in any visible cell; it is assumed
# to be a dict whose values are iterables of number-like items -- confirm.
for i in finalvalues.values():
    # print is a function in Python 3, and map() returns a lazy iterator that
    # np.percentile cannot consume, so it is materialised into a list.
    print(np.percentile(list(map(int, i)), 95))

## Welch's t-test for ERK5

In [None]:
# NOTE: `d` is not built in this cell; whatever subset an earlier cell left in
# `d` is tested.
tt = []

# Compound names in order of first appearance, computed once rather than once
# per variable.
test = d.Metadata_compound_name.unique().tolist()

# Every unordered pair, ordered (0,1), (0,2), (0,3), (1,2), (1,3), (2,3), ...
# For four compounds these are the six comparisons that were hard-coded.
pairs = [(first, second)
         for pos, first in enumerate(test)
         for second in test[pos + 1:]]

# Rows of each compound, extracted once and reused for every variable.
groups = {name: d.loc[d.Metadata_compound_name == name] for name in test}

for i in variables:
    # Welch's t-test (unequal variances) for each pair of compounds.
    results = [ttest_ind(groups[first][i].values,
                         groups[second][i].values,
                         equal_var=False)
               for first, second in pairs]

    d1 = {"Paired": [first + ', ' + second for first, second in pairs],
          "Statistics": [res[0] for res in results],
          "P-value": [res[1] for res in results],
          "Variables": str(i)}

    prf = pd.DataFrame(d1, columns=['Paired', 'Statistics', 'P-value', 'Variables'])

    tt.append(prf)
final_tt = pd.concat(tt)

# np.log10 is used explicitly rather than the name leaked by `from pylab import *`.
final_tt['logp'] = -np.log10(final_tt['P-value'])

# 1-based running row number.
final_tt['index'] = np.arange(1, len(final_tt) + 1)
final_tt.head()

In [None]:
# Write the current t-test table to the working directory, without the row index.
final_tt.to_csv("ERK5_ttest_U2OS.csv",index=False)

### Welch's t-test

In [None]:
# NOTE: `d` is not built in this cell; whatever subset an earlier cell left in
# `d` is tested.
tt = []

# Compound names in order of first appearance, computed once rather than once
# per variable.
test = d.Metadata_compound_name.unique().tolist()

# Every unordered pair, ordered (0,1), (0,2), (1,2), ... For three compounds
# these are the three comparisons that were hard-coded.
pairs = [(first, second)
         for pos, first in enumerate(test)
         for second in test[pos + 1:]]

# Rows of each compound, extracted once and reused for every variable.
groups = {name: d.loc[d.Metadata_compound_name == name] for name in test}

for i in variables:
    # Welch's t-test (unequal variances) for each pair of compounds.
    results = [ttest_ind(groups[first][i].values,
                         groups[second][i].values,
                         equal_var=False)
               for first, second in pairs]

    d1 = {"Paired": [first + ', ' + second for first, second in pairs],
          "Statistics": [res[0] for res in results],
          "P-value": [res[1] for res in results],
          "Variables": str(i)}

    prf = pd.DataFrame(d1, columns=['Paired', 'Statistics', 'P-value', 'Variables'])

    tt.append(prf)
final_tt = pd.concat(tt)

# np.log10 is used explicitly rather than the name leaked by `from pylab import *`.
final_tt['logp'] = -np.log10(final_tt['P-value'])

# 1-based running row number.
final_tt['index'] = np.arange(1, len(final_tt) + 1)
final_tt.head()

In [None]:
# Write the current t-test table to the working directory, without the row index.
final_tt.to_csv("PIN1_ttest_U2oS.csv",index=False)

# Kruskal-Wallis test

In [None]:


p = ['NA', 'DCLK1']

# NOTE: `d` is not built in this cell; the subset left by an earlier cell is
# filtered further here.
tmp = d.query("Metadata_Target in @p")

kw = []

# Compound names in order of first appearance, computed once rather than once
# per variable.
test = tmp.Metadata_compound_name.unique().tolist()

# Every unordered pair, ordered (0,1), (0,2), (1,2), ... For three compounds
# these are the three comparisons that were hard-coded.
pairs = [(first, second)
         for pos, first in enumerate(test)
         for second in test[pos + 1:]]

# Rows of each compound, extracted once and reused for every variable.
groups = {name: tmp.loc[tmp.Metadata_compound_name == name] for name in test}

for i in variables:
    # Kruskal-Wallis H-test applied to each pair of compounds.
    results = [stats.kruskal(groups[first][i].tolist(), groups[second][i].tolist())
               for first, second in pairs]

    d1 = {"Paired": [first + ', ' + second for first, second in pairs],
          "Statistics": [res[0] for res in results],
          "P-value": [res[1] for res in results],
          "Variables": str(i)}

    prf = pd.DataFrame(d1, columns=['Paired', 'Statistics', 'P-value', 'Variables'])

    kw.append(prf)
final_kw = pd.concat(kw)

# np.log10 is used explicitly rather than the name leaked by `from pylab import *`.
final_kw['logp'] = -np.log10(final_kw['P-value'])

# 1-based running row number.
final_kw['index'] = np.arange(1, len(final_kw) + 1)
final_kw.head()

In [None]:
# Preview the first rows of the Kruskal-Wallis result table.
final_kw.head()

In [None]:
# Write the Kruskal-Wallis table to the working directory, without the row index.
final_kw.to_csv("DCLK_kw_U2oS.csv", index=False)

In [None]:
# tt = []
# for v in final_kw.Variables.unique():
    
#     tv = final_kw.query('Variables in @ v')
    
#     if (tv.iloc[0]["P-value"] < 0.05) & (tv.iloc[1]["P-value"] < 0.05) & (tv.iloc[2]["P-value"] < 0.05):
        
#         tt.append(tv)
    
# tmp2 = pd.concat(tt)




In [None]:
# Ten rows with the highest logp within each compared pair.
ranked_kw = final_kw.sort_values(by="logp", ascending=False)
topdifvar = ranked_kw.groupby("Paired").head(10)

# Keep only variables from pairs whose label does not mention DMSO.
involves_dmso = topdifvar['Paired'].str.contains("DMSO")
finallist = topdifvar.loc[~involves_dmso, 'Variables'].tolist()
finallist

#geom_hline(aes(yintercept = med, group = gr), colour = 'red')

In [None]:
# The plotnine names are only available through the `gg` alias, so every call
# is qualified. The original reference line used `med` and `gr`, which are not
# defined anywhere visible (the expression was copied from an R example).
# NOTE(review): the line is implemented here as the median logp of each
# comparison, one per facet -- confirm this is the intended statistic.
med_by_pair = (final_kw.groupby("Paired", as_index=False)["logp"]
               .median()
               .rename(columns={"logp": "med"}))

g = (gg.ggplot(final_kw, gg.aes(x='index', y='logp', color="Paired"))
     + gg.geom_point(size=1.5)
     + gg.facet_wrap('~Paired')
     + gg.theme_bw()
     + gg.geom_hline(gg.aes(yintercept='med'), data=med_by_pair, color='red')
     #+ geom_text(aes(label="Variables"), data=best_in_class)
     + gg.labs(title='Drugs targetting DCLK1', x="Variables ID", y="-log10 [p-value] "))


g

In [None]:
# NOTE(review): pasted R/ggplot2 fragment (ifelse, as.character); it is not
# valid Python and cannot be executed in this notebook.
+
  geom_text(aes(label=ifelse(PTS>24,as.character(Name),'')),hjust=0,vjust=0)

In [None]:

# NOTE(review): looks like a pasted documentation example. `ggplot`, `aes`,
# `geom_point`, `geom_text`, `mpg` and `best_in_class` are not defined in any
# visible cell (plotnine is only imported as `gg`) -- confirm before running.
ggplot(mpg, aes("displ", "hwy")) +\
geom_point(aes(colour="class")) +\
geom_text(aes(label="model"), data=best_in_class)

In [None]:
#final_kw.loc[(final_kw['P-value'] < 0.05) & (final_kw['Paired'] < 0.05)]


In [None]:
# NOTE(review): `ttt` is not defined in any visible cell -- confirm its origin.
ttt.head()




In [None]:
def log_p(d):
    """Return a copy of `d` with its 'P-value' column replaced by -log10(p).

    The input frame is not modified. The previous version overwrote the column
    in place, so running the calling cell twice transformed the values twice.

    Parameters
    ----------
    d : DataFrame with a numeric 'P-value' column.

    Returns
    -------
    DataFrame : a new frame; all other columns are unchanged.
    """
    transformed = d.copy()
    # np.log10 is used explicitly rather than the name leaked by `from pylab import *`.
    transformed['P-value'] = -np.log10(transformed['P-value'])
    return transformed

# Apply the -log10 transform to the 'P-value' column of `ttt`.
# NOTE(review): `ttt` is not defined in any visible cell -- confirm its origin.
tp = log_p(ttt)
tp.head()

ttt

In [None]:


from scipy import stats

# Spot check on a single feature: U2OS values for two compounds and for DMSO.
feature = 'Cells_AreaShape_Area'

gp1 = df.query("Metadata_cell_line == 'U2OS' and Metadata_compound_name == 'FMF-04-159-2'")[feature].tolist()
gp2 = df.query("Metadata_cell_line == 'U2OS' and Metadata_compound_name == 'FMF-05-176-1'")[feature].tolist()
gp3 = df.query("Metadata_cell_line == 'U2OS' and Metadata_compound_name == 'DMSO'")[feature].tolist()

# Only the two compounds are compared; gp3 is collected but not tested.
stats.kruskal(gp1, gp2)


In [None]:
# Seed numpy's global random generator before fitting (presumably so the
# embedding is repeatable -- confirm).
np.random.seed(123)
reducer = umap.UMAP()

# Embed every row of the full table using the selected feature columns.
embedding = reducer.fit_transform(df[variables].to_numpy())

test = pd.DataFrame(embedding, columns=['UMAP-1', 'UMAP-2'])

# Attach the annotation columns by matching row index.
combined = pd.merge(test, df[metadata], left_index=True, right_index=True)




In [None]:
# List the columns of the embedding + annotation table.
combined.columns

In [None]:
# Columns are cleaned by assignment. `combined[col].replace(..., inplace=True)`
# mutates a temporary column object: pandas warns about it and it has no effect
# under copy-on-write.
combined['Metadata_Target'] = combined['Metadata_Target'].replace(np.nan, 'NA')
combined['Metadata_Annotation'] = combined['Metadata_Annotation'].replace('active ', 'active')

# Compound name, with DMSO and C1-C4 collapsed into a single 'controls' label.
combined['test'] = combined['Metadata_compound_name'].replace(
    ['DMSO', 'C1', 'C2', 'C3', 'C4'], 'controls')

# Composite labels built from compound, annotation and target.
combined['Metadata_test1'] = combined['test'] + "_" + combined['Metadata_Annotation'] + "_" + combined['Metadata_Target']
combined['Metadata_test2'] = combined['test'] + "[" + combined['Metadata_Annotation'] + "]"
# combined['Metadata_Target4'].unique()
combined.Metadata_test1.unique()

In [None]:
# Distinct "compound[annotation]" labels.
combined.Metadata_test2.unique()

In [None]:
# options = ['controls[control]', 'FMF-03-146-1[active]', 'FMF-04-112-1[inactive]']


# d = combined.loc[combined['Metadata_test2'].isin(options)]

# d.head()
# g = gg.ggplot(d, gg.aes(x='UMAP-1', y='UMAP-2', color= 'Metadata_test2', shape="Metadata_cell_line")) + \
#     gg.geom_point(size = 1.5) + \
#     gg.xlab("UMAP-1") + \
#     gg.ylab("UMAP-2") + \
#     gg.ggtitle("DCLK1") + \
#     gg.xlim(-15,20) + \
#     gg.ylim(-10,25) + \
#     gg.theme_classic() + \
#     gg.scale_colour_manual(name= 'Compound_name', values=["#ca0020","#2b83ba","#bababa"]) + \
#     gg.scale_shape_manual(name="Celllines", labels=['A549', 'U2oS'], values=['*', 'o'])
# ggsave(filename='umap_compound_treatment_DCLK.pdf', plot = g, path = outpath)

# g 


# options = ['controls[control]', 'BJP-06-005-3[active]', 'BJP-06-115-3[inactive]']

# d = combined.loc[combined['Metadata_test2'].isin(options)]

# d.head()
# g = gg.ggplot(d, gg.aes(x='UMAP-1', y='UMAP-2', color= 'Metadata_test2', shape="Metadata_cell_line")) + \
#     gg.geom_point(size = 1.5) + \
#     gg.xlab("UMAP-1") + \
#     gg.ylab("UMAP-2") + \
#     gg.ggtitle("PIN1") + \
#     gg.xlim(-15,20) + \
#     gg.ylim(-10,25) + \
#     gg.theme_classic() + \
#     gg.scale_colour_manual(name= 'Compound_name', values=["#ca0020","#2b83ba","#bababa"]) + \
#     gg.scale_shape_manual(name="Celllines", labels=['A549', 'U2oS'], values=['*', 'o'])
# ggsave(filename='umap_compound_treatment_PIN1.pdf', plot = g, path = outpath)

# g 

# options = ['controls[control]', 'BSJ-03-136[active (protac)]', 'BSJ-04-030[inactive (protac)]']

# d = combined.loc[combined['Metadata_test2'].isin(options)]

# g = gg.ggplot(d, gg.aes(x='UMAP-1', y='UMAP-2', color= 'Metadata_test2', shape="Metadata_cell_line")) + \
#     gg.geom_point(size = 1.5) + \
#     gg.xlab("UMAP-1") + \
#     gg.ylab("UMAP-2") + \
#     gg.ggtitle("SECRET") + \
#     gg.xlim(-15,20) + \
#     gg.ylim(-10,25) + \
#     gg.theme_classic() + \
#     gg.scale_colour_manual(name= 'Compound_name', values=["#ca0020","#2b83ba","#bababa"]) + \
#     gg.scale_shape_manual(name="Celllines", labels=['A549', 'U2oS'], values=['*', 'o'])
# ggsave(filename='umap_compound_treatment_SECRET.pdf', plot = g, path = outpath)

# g


# options = ['controls[control]', 'FMF-04-159-2[covalent]', 'FMF-05-176-1[reversible]']

# d = combined.loc[combined['Metadata_test2'].isin(options)]

# g = gg.ggplot(d, gg.aes(x='UMAP-1', y='UMAP-2', color= 'Metadata_test2', shape="Metadata_cell_line")) + \
#     gg.geom_point(size = 1.5) + \
#     gg.xlab("UMAP-1") + \
#     gg.ylab("UMAP-2") + \
#     gg.ggtitle("CDK14 / pan-TAIRE") + \
#     gg.xlim(-15,20) + \
#     gg.ylim(-10,25) + \
#     gg.theme_classic() + \
#     gg.scale_colour_manual(name= 'Compound_name', values=["#ca0020","#2b83ba","#bababa"]) + \
#     gg.scale_shape_manual(name="Celllines", labels=['A549', 'U2oS'], values=['*', 'o'])
# ggsave(filename='umap_compound_treatment_CDK14pan-TAIRE.pdf', plot = g, path = outpath)

# g

# options = ['controls[control]', 'JWG-071[active]', 'AX15836[active]', 'JWG-119[inactive]']

# d = combined.loc[combined['Metadata_test2'].isin(options)]

# g = gg.ggplot(d, gg.aes(x='UMAP-1', y='UMAP-2', color= 'Metadata_test2', shape="Metadata_cell_line")) + \
#     gg.geom_point(size = 1.5) + \
#     gg.xlab("UMAP-1") + \
#     gg.ylab("UMAP-2") + \
#     gg.ggtitle("ERK5") + \
#     gg.xlim(-15,20) + \
#     gg.ylim(-10,25) + \
#     gg.theme_classic() + \
#     gg.scale_colour_manual(name= 'Compound_name', values=["#ca0020","#2b83ba","#7b3294", "#bababa"]) + \
#     gg.scale_shape_manual(name="Celllines", labels=['A549', 'U2oS'], values=['*', 'o'])
# ggsave(filename='umap_compound_treatment_ERK5.pdf', plot = g, path = outpath)

# g


# options = ['controls[control]', 'PND-1186[active]']

# d = combined.loc[combined['Metadata_test2'].isin(options)]

# g = gg.ggplot(d, gg.aes(x='UMAP-1', y='UMAP-2', color= 'Metadata_test2', shape="Metadata_cell_line")) + \
#     gg.geom_point(size = 1.5) + \
#     gg.xlab("UMAP-1") + \
#     gg.ylab("UMAP-2") + \
#     gg.ggtitle("FAK") + \
#     gg.xlim(-15,20) + \
#     gg.ylim(-10,25) + \
#     gg.theme_classic() + \
#     gg.scale_colour_manual(name= 'Compound_name', values=["#7b3294", "#bababa"]) + \
#     gg.scale_shape_manual(name="Celllines", labels=['A549', 'U2oS'], values=['*', 'o'])
# ggsave(filename='umap_compound_treatment_FAK.pdf', plot = g, path = outpath)

# g

# options = ['C1', 'C2', 'C3', 'C4']

# d = combined.loc[combined['Metadata_compound_name'].isin(options)]

# g = gg.ggplot(d, gg.aes(x='UMAP-1', y='UMAP-2', color= 'Metadata_compound_name', shape="Metadata_cell_line")) + \
#     gg.geom_point(size = 1.5) + \
#     gg.xlab("UMAP-1") + \
#     gg.ylab("UMAP-2") + \
#     gg.xlim(-15,20) + \
#     gg.ylim(-10,25) + \
#     gg.theme_classic() + \
#     gg.scale_colour_manual(name= 'Compound_name', values=["#ca0020","#2b83ba","#7b3294", "#fdae61"]) + \
#     gg.scale_shape_manual(name="Celllines", labels=['A549', 'U2oS'], values=['*', 'o'])
# ggsave(filename='umap_compound_treatment_controls.pdf', plot = g, path = outpath)

# g

options = ['DMSO', 'C1', 'C2', 'C3', 'C4']

# Rows of the control compounds only.
d = combined.loc[combined['Metadata_compound_name'].isin(options)]

g = (
    gg.ggplot(d, gg.aes(x='UMAP-1', y='UMAP-2', color='Metadata_compound_name', shape="Metadata_cell_line"))
    + gg.geom_point(size=1.5)
    + gg.xlab("UMAP-1")
    + gg.ylab("UMAP-2")
    + gg.xlim(-15, 20)
    + gg.ylim(-10, 25)
    + gg.theme_classic()
    + gg.scale_colour_manual(name='Compound_name', values=["#ca0020", "#2b83ba", "#7b3294", "#fdae61", "#bababa"])
    + gg.scale_shape_manual(name="Celllines", labels=['A549', 'U2oS'], values=['*', 'o'])
)

# plotnine is imported only as `gg`; a bare `ggsave` is an undefined name.
gg.ggsave(filename='umap_compound_treatment_controls_DMSO.pdf', plot=g, path=outpath)

g



In [None]:
# Target label in which the control compounds get their own categories:
# DMSO becomes 'controls', C1-C4 keep their compound name.
compound_name = combined.Metadata_compound_name
target_new = combined['Metadata_Target']
for compound, label in [('DMSO', 'controls'), ('C1', 'C1'), ('C2', 'C2'), ('C3', 'C3'), ('C4', 'C4')]:
    target_new = np.where(compound_name == compound, label, target_new)
combined['Metadata_Target_new'] = target_new

# Target with missing values shown as 'controls'.
combined['Metadata_Target2'] = combined['Metadata_Target'].replace(np.nan, 'controls')
combined['Metadata_Target2'].unique()

# "target[annotation]" and "compound[annotation]" labels.
annotation_tag = "[" + combined['Metadata_Annotation'] + "]"
combined['Metadata_Target3'] = combined['Metadata_Target2'] + annotation_tag
combined['Metadata_Target4'] = combined['Metadata_compound_name'] + annotation_tag

combined['Metadata_Target4'].unique()




In [None]:
# "compound[annotation]_target" label, followed by its distinct values.
combined['Metadata_Target5'] = combined['Metadata_compound_name'] + "[" + combined['Metadata_Annotation'] + "]" + "_" +  combined['Metadata_Target']
combined.Metadata_Target5.unique()                                                                                                                                   
                                                                                                                                   

In [None]:
# options = ['controls[control]', 'DCLK1[active]', 'DCLK1[inactive]']


# d = combined.loc[combined['Metadata_compound_name'].isin(options)]

# d.head()
# g = gg.ggplot(d, gg.aes(x='UMAP-1', y='UMAP-2', color= 'Metadata_Target3', shape="Metadata_cell_line")) + \
#     gg.geom_point(size = 1.5) + \
#     gg.xlab("UMAP-1") + \
#     gg.ylab("UMAP-2") + \
#     gg.ggtitle("DCLK1") + \
#     gg.xlim(-15,20) + \
#     gg.ylim(-10,25) + \
#     gg.theme_classic() + \
#     gg.scale_colour_manual(name= 'Compound_name', values=["#ca0020","#2b83ba","#bababa"]) + \
#     gg.scale_shape_manual(name="Celllines", labels=['A549', 'U2oS'], values=['*', 'o'])
# ggsave(filename='umap_compound_treatment_DCLK.pdf', plot = g, path = outpath)

# g 

# options = ['controls[control]', 'PIN1[active]', 'PIN1[inactive]']


# d = combined.loc[combined['Metadata_Target3'].isin(options)]

# d.head()
# g = gg.ggplot(d, gg.aes(x='UMAP-1', y='UMAP-2', color= 'Metadata_Target3', shape="Metadata_cell_line")) + \
#     gg.geom_point(size = 1.5) + \
#     gg.xlab("UMAP-1") + \
#     gg.ylab("UMAP-2") + \
#     gg.ggtitle("PIN1") + \
#     gg.xlim(-15,20) + \
#     gg.ylim(-10,25) + \
#     gg.theme_classic() + \
#     gg.scale_colour_manual(name= 'Compound_name', values=["#ca0020","#2b83ba","#bababa"]) + \
#     gg.scale_shape_manual(name="Celllines", labels=['A549', 'U2oS'], values=['*', 'o'])
# ggsave(filename='umap_compound_treatment_PIN1.pdf', plot = g, path = outpath)

# g 


# options = ['controls[control]', 'SECRET[active (protac)]', 'SECRET[inactive (protac)]']


# d = combined.loc[combined['Metadata_Target3'].isin(options)]

# d.head()
# g = gg.ggplot(d, gg.aes(x='UMAP-1', y='UMAP-2', color= 'Metadata_Target3', shape="Metadata_cell_line")) + \
#     gg.geom_point(size = 1.5) + \
#     gg.xlab("UMAP-1") + \
#     gg.ylab("UMAP-2") + \
#     gg.ggtitle("SECRET") + \
#     gg.xlim(-15,20) + \
#     gg.ylim(-10,25) + \
#     gg.theme_classic() + \
#     gg.scale_colour_manual(name= 'Compound_name', values=["#ca0020","#2b83ba","#bababa"]) + \
#     gg.scale_shape_manual(name="Celllines", labels=['A549', 'U2oS'], values=['*', 'o'])
# ggsave(filename='umap_compound_treatment_SECRET.pdf', plot = g, path = outpath)

# g 

# NOTE(review): this value is never used -- the plot that consumed it is
# commented out and `options` is reassigned further down in this cell.
options = ['controls[control]', 'CDK14 / pan-TAIRE[covalent]', 'CDK14 / pan-TAIRE[reversible]']


# d = combined.loc[combined['Metadata_Target3'].isin(options)]


# g = gg.ggplot(d, gg.aes(x='UMAP-1', y='UMAP-2', color= 'Metadata_Target3', shape="Metadata_cell_line")) + \
#     gg.geom_point(size = 1.5) + \
#     gg.xlab("UMAP-1") + \
#     gg.ylab("UMAP-2") + \
#     gg.ggtitle("CDK14 / pan-TAIRE") + \
#     gg.xlim(-15,20) + \
#     gg.ylim(-10,25) + \
#     gg.theme_classic() + \
#     gg.scale_colour_manual(name= 'Compound_name', values=["#ca0020","#2b83ba","#bababa"]) + \
#     gg.scale_shape_manual(name="Celllines", labels=['A549', 'U2oS'], values=['*', 'o'])
# ggsave(filename='umap_compound_treatment_CDK14_pan-TAIRE.pdf', plot = g, path = outpath)

# g 

options = ['controls[control]', 'ERK5[active]', 'ERK5[inactive]']

# Rows whose "target[annotation]" label is one of the selected options.
d = combined.loc[combined['Metadata_Target3'].isin(options)]

g = (
    gg.ggplot(d, gg.aes(x='UMAP-1', y='UMAP-2', color='Metadata_Target3', shape="Metadata_cell_line"))
    + gg.geom_point(size=1.5)
    + gg.xlab("UMAP-1")
    + gg.ylab("UMAP-2")
    + gg.ggtitle("ERK5")
    + gg.xlim(-15, 20)
    + gg.ylim(-10, 25)
    + gg.theme_classic()
    + gg.scale_colour_manual(name='Compound_name', values=["#ca0020", "#2b83ba", "#bababa"])
    + gg.scale_shape_manual(name="Celllines", labels=['A549', 'U2oS'], values=['*', 'o'])
)

# plotnine is imported only as `gg`; a bare `ggsave` is an undefined name.
gg.ggsave(filename='umap_compound_treatment_ERK5.pdf', plot=g, path=outpath)

g






In [None]:
# options = ['C1', 'C2', 'C3', 'C4'] 

# d = combined.loc[combined['Metadata_Target'].isin(options)]


# g = gg.ggplot(d, gg.aes(x='UMAP-1', y='UMAP-2', color= 'Metadata_Target_new', shape="Metadata_cell_line")) + \
#     gg.geom_point(size = 2.5) + \
#     gg.xlab("UMAP-1") + \
#     gg.ylab("UMAP-2") + \
#     gg.xlim(-15,20) + \
#     gg.ylim(-10,25) + \
#     gg.theme_classic() + \
#     gg.scale_colour_manual(name= 'Compound_name', values=["#ca0020","#f4a582","#2b83ba", "#404040"]) + \
#     gg.scale_shape_manual(name="Celllines", labels=['A549', 'U2oS'], values=['*', 'o'])
# ggsave(filename='umap_compound_treatment_controls.pdf', plot = g, path = outpath)

# g 

# options = ['controls', 'C1', 'C2', 'C3', 'C4'] 

# d = combined.loc[combined['Metadata_Target'].isin(options)]


# g = gg.ggplot(d, gg.aes(x='UMAP-1', y='UMAP-2', color= 'Metadata_Target_new', shape="Metadata_cell_line")) + \
#     gg.geom_point(size = 1) + \
#     gg.xlab("UMAP-1") + \
#     gg.ylab("UMAP-2") + \
#     gg.xlim(-15,20) + \
#     gg.ylim(-10,25) + \
#     gg.theme_classic() + \
#     gg.scale_colour_manual(name= 'Compound_name', values=["#ca0020","#f4a582","#2b83ba", "#404040", "#bababa"]) + \
#     gg.scale_shape_manual(name="Celllines", labels=['A549', 'U2oS'], values=['*', 'o'])
# ggsave(filename='umap_compound_treatment_controls_DMSO.pdf', plot = g, path = outpath)

# g 

options = ['controls', 'C1', 'C2', 'C3', 'C4', 'DCLK1']

# Rows whose target is one of the selected options.
d = combined.loc[combined['Metadata_Target'].isin(options)]

# NOTE(review): shape is mapped to Metadata_Annotation, yet the manual shape
# scale is titled "Celllines" and supplies labels/values for two cell lines --
# confirm the intended mapping and the number of annotation levels.
g = (
    gg.ggplot(d, gg.aes(x='UMAP-1', y='UMAP-2', color='Metadata_Target', shape="Metadata_Annotation"))
    + gg.geom_point(size=3)
    + gg.xlab("UMAP-1")
    + gg.ylab("UMAP-2")
    + gg.xlim(-15, 20)
    + gg.ylim(-10, 25)
    + gg.theme_classic()
    + gg.scale_colour_manual(name='Compound_name', values=["#ca0020", "#f4a582", "#2b83ba", "#404040", "#bababa", "#abdda4"])
    + gg.scale_shape_manual(name="Celllines", labels=['A549', 'U2oS'], values=['*', 'o'])
)

# plotnine is imported only as `gg`; a bare `ggsave` is an undefined name.
gg.ggsave(filename='umap_compound_treatment_DCLK.pdf', plot=g, path=outpath)

g



In [None]:
#options = ['controls', 'DCLK1', 'C1', 'C2', 'C3', 'C4'] 
# Keep only the control wells and compounds C1-C4.
# The control label is spelled 'controls', matching the other UMAP cells; the
# previous spelling 'conrols' matched no rows, silently dropping the controls.
options = ['controls', 'C1', 'C2', 'C3', 'C4']

d = combined.loc[combined['Metadata_Target'].isin(options)]
# Sanity check: list the targets that survived the filter.
d.Metadata_Target.unique()
# g = gg.ggplot(d, gg.aes(x='UMAP-1', y='UMAP-2', color= 'Metadata_Target')) + \
#     gg.geom_point(size = 1) + \
#     gg.xlab("UMAP-1") + \
#     gg.ylab("UMAP-2") + \
#     gg.xlim(-15,20) + \
#     gg.ylim(-10,25) + \
#     gg.theme_classic() 
# #+ \
# #     gg.scale_colour_manual(name= 'Compound_name', values=["#bdbdbd","#cab2d6","#ff7f00", "#fdbf6f", "#e31a1c"]) 
# g   


In [None]:
# Write one UMAP panel per target, each showing that target together with the
# control wells. Files are named by the target's position in unique().
# NOTE(review): `figpath` is not defined in the visible part of this notebook;
# confirm it is set in an earlier cell (other cells save to `outpath`).
os.makedirs(figpath, exist_ok=True)

for i, target in enumerate(combined.Metadata_Target.unique()):

    # `or`, not `and`: a row can never equal two different targets at once, so
    # the conjunction returned an empty frame for every non-control target.
    # NOTE(review): the control label is taken to be 'controls', as in the other
    # UMAP cells of this notebook; confirm against the data.
    d = combined.query("Metadata_Target == 'controls' or Metadata_Target == @target")

    g = (
        gg.ggplot(d, gg.aes(x='UMAP-1', y='UMAP-2', color='Metadata_Target'))
        + gg.geom_point(size=1)
        + gg.xlab("UMAP-1")
        + gg.ylab("UMAP-2")
        + gg.xlim(-15, 20)
        + gg.ylim(-10, 25)
        + gg.theme_classic()
        + gg.labs(title='UMAP' + str(i))
    )
    # Save through the plot's own method; a bare `ggsave` name is not provided
    # by `import plotnine as gg`.
    g.save(filename=str(i) + '.png', path=figpath)

In [None]:
# Overview UMAP of every row in `combined`, coloured by target with plotnine's
# default colour scale. No manual colour/shape scale is applied and nothing is
# written to disk; the figure is only displayed as the cell output.
umap_by_target = gg.aes(x='UMAP-1', y='UMAP-2', color='Metadata_Target')

p = (
    gg.ggplot(combined, umap_by_target)
    + gg.geom_point(size=1)
    + gg.xlab("UMAP-1")
    + gg.ylab("UMAP-2")
    + gg.xlim(-15, 20)
    + gg.ylim(-10, 25)
    + gg.theme_classic()
)

p

# Principal Component Analysis

In [None]:
# Fit and transform with t-SNE.
# TSNE is already imported in the notebook's import cell, so it is not
# re-imported here. The iteration count is left at the library default (1000,
# the value previously passed explicitly) because the keyword for it was renamed
# from `n_iter` to `max_iter` in newer scikit-learn releases.
model = TSNE(n_components=2, random_state=0, learning_rate=100, perplexity=30)

# Project the feature columns to 2D.
x_2d = model.fit_transform(df.loc[:, variables].values)

# Reuse df's index so the index-on-index join below pairs every embedded point
# with its own metadata row even when df does not have a default RangeIndex.
test = pd.DataFrame(x_2d, columns=['t-SNE-1', 't-SNE-2'], index=df.index)

# NOTE: this rebinds `combined`, which the UMAP cells above also use; those
# cells need their own `combined` rebuilt before they are re-run.
combined = test.merge(df.loc[:, metadata], left_index=True, right_index=True)
combined.head()


In [None]:
# t-SNE scatter: colour encodes the compound, marker shape encodes the cell line.
# The figure is only displayed here; it is not saved to disk.

# Manual colours handed to the compound colour scale.
compound_colours = [
    "#cab2d6", "#ff7f00", "#fdbf6f", "#e31a1c",
    "#fb9a99", "#33a02c", "#b2df8a", "#1f78b4",
    "#a6cee3", "#01665e", "#999999", "#a65628",
    "#984ea3", "#f781bf", "#ffff33", "#8dd3c7",
    "#000000",
]

tsne_mapping = gg.aes(
    x='t-SNE-1',
    y='t-SNE-2',
    color='Metadata_compound_name',
    label='Metadata_compound_name',
    shape='Metadata_cell_line',
)

p = (
    gg.ggplot(combined, tsne_mapping)
    + gg.geom_point()
    + gg.theme_bw()
    + gg.xlab("t-SNE-1")
    + gg.ylab("t-SNE-2")
    + gg.ggtitle("t-distributed stochastic neighbour embedding")
    + gg.scale_colour_manual(name='Compound_name', values=compound_colours)
    + gg.scale_shape_manual(name="Celllines", labels=['A549', 'U2oS'], values=['*', 'o'])
)

p
