In [1]:
import uuid
import pandas
import numpy
from random import shuffle

import sklearn
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import sklearn.manifold

import matplotlib
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cmx

import os

In [2]:
def load_data():
    
    filename = "/Users/simondi/PHD/data/data/target_infect_x/screening_data/infectx-bartonella-d-p-g-1-dz14-2a_cells_data.tsv"

    data = pandas.read_csv(filename, sep="\t", header=0)

    feature_cols = [(i, x) for i, x in enumerate(data.columns.values) if
                    x.startswith("cells")]
    for i, c in feature_cols:
        data.loc[:, c] = data.loc[:, c].astype('float64')

    X = data.dropna(subset=["cells.children_invasomes_count"], how='all')
    X["cells.children_invasomes_count"] =  X["cells.children_invasomes_count"].fillna(0)
    X = X.dropna()
    X["infection"] = X["cells.children_invasomes_count"]
    del X["cells.children_invasomes_count"]
    
    return X

In [3]:
def plot(data, gene_list, cnt):

    shuffle(gene_list)
    used_genes = gene_list[1:cnt]
    X = data[ data["gene"].isin(used_genes)]
     
    uniq = list(set(X["gene"]))

    hot = plt.get_cmap('hot')
    cNorm = colors.Normalize(vmin=0, vmax=len(uniq))
    scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=hot)
    alpha = 0.5

    plt.figure()
    for i in range(len(uniq)):
        indx = X["gene"] == uniq[i]
        plt.scatter(X.loc[indx, "cells.location_center_x"], X.loc[indx, "cells.location_center_x"],
                    color=scalarMap.to_rgba(i),
                    label=uniq[i], marker=".", alpha=alpha)
    if len(uniq) < 20:     
        plt.legend()    
    
    uid = str(uuid.uuid1())
    plt.show()

In [4]:
X = load_data()

In [5]:
feature_cols = [x for x in X.columns if x.startswith("cells")]
for c in feature_cols:
    X.loc[:, c] = (X.loc[:, c] - numpy.mean(X.loc[:, c])) / numpy.std(X.loc[:, c])

In [6]:
gene_list = list(set(X["gene"]))
len(gene_list)

304

In [7]:
well_list = list(set(X["well"]))
len(well_list)

384

In [8]:
cnt  = 10

In [9]:
fl = "/Users/simondi/PROJECTS/target_infect_x_project/src/tix_util/tix_analysis/plots/scatter_plate_bartonella-d-p-g-1-dz14-2a_"

In [10]:
shuffle(well_list)
used_genes = well_list[0:cnt]

# take mid images
X_5well = X.loc[X.image_idx % 5 == 0,]

X_5well_wells = list(set(X_5well.well))
shuffle(X_5well_wells)

# take random 5 genes/weels
Xw = X_5well.loc[X_5well["well"].isin(X_5well_wells[0:5]), :]

In [15]:
X.columns.values

array(['well', 'gene', 'sirna', 'well_type', 'image_idx', 'object_idx',
       'cells.areashape_area', 'cells.areashape_eccentricity',
       'cells.areashape_extent', 'cells.areashape_formfactor',
       'cells.areashape_majoraxislength',
       'cells.areashape_minoraxislength', 'cells.areashape_orientation',
       'cells.areashape_perimeter',
       'cells.intensity_integratedintensity_corractin',
       'cells.intensity_integratedintensity_corrdna',
       'cells.intensity_integratedintensity_corrpathogen',
       'cells.intensity_integratedintensityedge_corractin',
       'cells.intensity_integratedintensityedge_corrdna',
       'cells.intensity_integratedintensityedge_corrpathogen',
       'cells.intensity_lowerquartileintensity_corractin',
       'cells.intensity_lowerquartileintensity_corrdna',
       'cells.intensity_lowerquartileintensity_corrpathogen',
       'cells.intensity_massdisplacement_corractin',
       'cells.intensity_massdisplacement_corrdna',
       'cells.inten

In [36]:
Xw.loc[:,'sort'] = Xw[c2].abs()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [42]:
Xw["sort"].values.sort()
Xw.

Unnamed: 0,well,gene,sirna,well_type,image_idx,object_idx,cells.areashape_area,cells.areashape_eccentricity,cells.areashape_extent,cells.areashape_formfactor,...,cells.texture_sumentropy_corrdna_3,cells.texture_sumentropy_corrpathogen_3,cells.texture_sumvariance_corractin_3,cells.texture_sumvariance_corrdna_3,cells.texture_sumvariance_corrpathogen_3,cells.texture_variance_corractin_3,cells.texture_variance_corrdna_3,cells.texture_variance_corrpathogen_3,infection,sort
93206,e05,lcmt2,l-008815-01,pooled_sirna,905,1,0.332752,1.147171,-0.333477,-0.612764,...,0.669795,-0.894089,-0.616359,-0.402158,-0.743063,-0.589136,-0.292917,-0.726447,1.0,0.000114
93207,e05,lcmt2,l-008815-01,pooled_sirna,905,2,1.830534,1.019159,-0.007003,-0.549592,...,-1.392440,-0.622712,-0.429925,-1.377374,-0.496350,-0.319922,-1.380743,-0.537022,4.0,0.000198
93208,e05,lcmt2,l-008815-01,pooled_sirna,905,3,3.780036,0.864014,-0.495485,-1.756033,...,-1.856313,-1.119028,-0.683336,-1.887548,-0.705923,-0.725878,-1.843158,-0.779068,3.0,0.000260
93209,e05,lcmt2,l-008815-01,pooled_sirna,905,4,-0.373686,0.651027,-1.649701,-0.632591,...,-1.028882,-0.390078,-0.070936,-1.198070,-0.194783,0.201649,-1.198541,-0.126339,2.0,0.000334
93210,e05,lcmt2,l-008815-01,pooled_sirna,905,5,0.790195,1.247784,-1.473598,-0.747144,...,-1.444510,-1.085202,-0.654596,-0.832965,-0.489426,-0.757542,-0.859495,-0.501783,2.0,0.000393
93211,e05,lcmt2,l-008815-01,pooled_sirna,905,6,-0.478627,-1.057480,2.054802,1.292897,...,0.376745,0.873207,1.295605,1.966204,0.755167,1.301460,1.887767,0.707685,2.0,0.000581
93212,e05,lcmt2,l-008815-01,pooled_sirna,905,7,0.236636,-0.024134,0.504392,0.172736,...,-1.063386,-0.836522,-0.356415,-0.966260,-0.773374,-0.429824,-0.978196,-0.816524,1.0,0.000620
93213,e05,lcmt2,l-008815-01,pooled_sirna,905,8,2.070227,-1.327671,1.415300,-0.256841,...,-1.250846,-0.875783,0.223911,-0.805594,-0.407864,0.073707,-0.645848,-0.474250,4.0,0.001196
93214,e05,lcmt2,l-008815-01,pooled_sirna,905,9,-0.473856,0.217616,-0.795422,-1.251978,...,-0.751995,0.663079,1.566525,0.052070,0.453461,1.544244,0.021631,0.462863,1.0,0.001427
93215,e05,lcmt2,l-008815-01,pooled_sirna,905,10,-0.248951,0.303017,0.796970,-0.221231,...,-0.650086,-1.005929,-0.550123,-0.397830,-0.818122,-0.480633,-0.435742,-0.888437,1.0,0.001487


In [60]:
uid =str(uuid.uuid1())
uniq = list(set(Xw.gene))

hot = plt.get_cmap('hot')
cNorm = colors.Normalize(vmin=0, vmax=len(uniq))
scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=hot)
alpha = 0.5

# # plot features pairwisely
# for c1 in feature_cols:
#     for c2 in feature_cols:
c1 = "cells.areashape_extent"
c2 = "cells.intensity_integratedintensityedge_corrdna"
fig = plt.figure(0)
ax = fig.add_subplot(111)
for i in range(len(uniq)):
    indx = Xw["gene"] == uniq[i]

    plt.scatter(Xw.loc[indx, c1],
                Xw.loc[indx, c2],
                color=scalarMap.to_rgba(i),
                label=uniq[i], marker=".", alpha=alpha)
    if len(uniq) < 20:     
        plt.legend()    

plt.xlabel(c1)
plt.ylabel(c2)
#plt.show()
plt.savefig(  fl + c1 + "_"+c2 +"_" +uid+ ".png",
  dpi=720)
plt.close(fig)

In [56]:
plt.close(fig)

In [15]:
def plot_dimred(X2, X, method):
    uniq = list(set(X["gene"]))
    
    hot = plt.get_cmap('hot')
    cNorm = colors.Normalize(vmin=0, vmax=len(uniq))
    scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=hot)
    alpha = 0.5
    
    fig = plt.figure()
    for i in range(len(uniq)):
        indx = X["gene"] == uniq[i]
        plt.scatter(X2[indx, 1], X2[indx, 0], color=scalarMap.to_rgba(i),
                    label=uniq[i], marker=".", alpha=alpha)
    if len(uniq) < 20:     
        plt.legend()
    plt.savefig(fl + "_" + method + "_2d" +uid+ ".png", dpi=720)
    plt.close(fig)

In [13]:
def plot_pca(Xtrans, X, method):
    uniq = list(set(X["gene"]))
    
    hot = plt.get_cmap('hot')
    cNorm = colors.Normalize(vmin=0, vmax=len(uniq))
    scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=hot)
    alpha = 0.5
    
    X2 = Xtrans.transform(X.loc[:, feature_cols_idx])
    
    fig = plt.figure()
    for i in range(len(uniq)):
        indx = X["gene"] == uniq[i]
        plt.scatter(X2[indx, 0], X2[indx, 1], color=scalarMap.to_rgba(i),
                    label=uniq[i], marker=".", alpha=alpha)
    if len(uniq) < 20:     
        plt.legend()
    plt.xlabel("Variance ratio:" + "{0:.2f}".format(Xtrans.explained_variance_ratio_[0]))
    plt.ylabel("Variance ratio:" + "{0:.2f}".format(Xtrans.explained_variance_ratio_[1]))
    plt.savefig(fl + "_" + method + "_2d_pca"+uid+".png", dpi=720)
    plt.close(fig)
    
    fig, ax = plt.subplots()
    ax.bar(list(numpy.array(range(10)) + 0.5),  Xtrans.explained_variance_ratio_[:10], width=0.5)

    ax.set_ylabel('Variance ratio')
    ax.set_xticks(list(numpy.array(range(10)) + 0.5))
    ax.set_xticklabels(["PCA" + str(i+1) for i in range(10)])
    plt.savefig(fl + "_" + method + "_2d_scree"+uid+".png", dpi=720)
    plt.close(fig)

In [None]:
for i in range(20):
    uid =str(uuid.uuid1())

    shuffle(well_list)
    used_genes = well_list[0:cnt]

    # take mid images
    X_5well = X.loc[X.image_idx % 5 == 0,]

    X_5well_wells = list(set(X_5well.well))
    shuffle(X_5well_wells)

    # take random 5 genes/weels
    Xw = X_5well.loc[X_5well["well"].isin(X_5well_wells[0:5]), :]

    feature_cols_idx = [x for x in (Xw.columns.values) if x.startswith("cells")]
    perp = [2,5,10,30,50]
    for x in perp:
        ts2 = sklearn.manifold.TSNE(n_components=2, perplexity=x, n_iter=5000)
        X2 = ts2.fit_transform(Xw.loc[:, feature_cols_idx])
        plot_dimred(X2, Xw, "tsne_" + str(x))

    for x in ["rbf", "poly", "linear"]:
        ts2 = sklearn.decomposition.KernelPCA(n_components=2, kernel=x)
        X2 = ts2.fit_transform(Xw.loc[:, feature_cols_idx])
        plot_dimred(X2, Xw, "kpca_" + x)    

    ts2 = sklearn.decomposition.PCA(n_components=20)
    Xtrans = ts2.fit(Xw.loc[:, feature_cols_idx])
    plot_pca(Xtrans, Xw, "pca_")