In [55]:
import uuid
import pandas
import numpy
from random import shuffle

import sklearn
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import sklearn.manifold

import matplotlib
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cmx

import os

In [134]:
def load_data():
    if os.path.isdir("/cluster/home/simondi/spark/"):
        filename = "/cluster/home/simondi/simondi/tix/data/screening_data/cells_sample_10_normalized.tsv"
    else:
        filename = "/Users/simondi/PHD/data/data/target_infect_x/screening_data_subset/cells_sample_10_normalized.tsv"

    data = pandas.read_csv(filename, sep="\t", header=0)

    feature_cols = [(i, x) for i, x in enumerate(data.columns.values) if
                    x.startswith("cells")]
    for i, c in feature_cols:
        data.loc[:, c] = data.loc[:, c].astype('float64')

    data_new = data.query(
      "library=='d' and design=='p' and replicate==1 and (pathogen=='brucella' or pathogen=='listeria' or pathogen=='adeno' or pathogen=='bartonella')").groupby(
      ["gene"]).filter(lambda x: len(x) == 40)

    X = data_new.dropna(subset=["cells.children_invasomes_count", "cells.children_bacteria_count"], how='all')
    X["cells.children_invasomes_count"] =  X["cells.children_invasomes_count"].fillna(0)
    X["cells.children_bacteria_count"] =  X["cells.children_bacteria_count"].fillna(0)
    X = X.dropna()
    X["infection"] = X["cells.children_invasomes_count"] + X["cells.children_bacteria_count"]
    del X["cells.children_invasomes_count"]
    del X["cells.children_bacteria_count"]

    
    return X

In [135]:
data = load_data()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [136]:
data

Unnamed: 0.1,Unnamed: 0,study,pathogen,library,design,replicate,plate,well,gene,sirna,...,cells.neighbors_anglebetweenneighbors_2,cells.neighbors_firstclosestobjectnumber_2,cells.neighbors_firstclosestxvector_2,cells.neighbors_firstclosestyvector_2,cells.neighbors_numberofneighbors_2,cells.neighbors_secondclosestobjectnumber_2,cells.neighbors_secondclosestxvector_2,cells.neighbors_secondclosestyvector_2,cells.parent_nuclei,infection
19189,19189,group_cossart,listeria,d,p,1,dz10-1k,a03,ptdss1,l-008568-00,...,-1.587547,1.810329,0.499172,0.673442,0.320972,1.652265,-0.122069,1.142655,1.624893,-0.601384
19190,19190,group_cossart,listeria,d,p,1,dz10-1k,a03,ptdss1,l-008568-00,...,0.784279,2.040954,-0.169644,-1.082033,-0.683946,1.990027,-0.366199,0.801289,2.083792,-0.601384
19191,19191,group_cossart,listeria,d,p,1,dz10-1k,a03,ptdss1,l-008568-00,...,0.350490,-1.453904,-0.416235,-1.104777,-0.181487,-1.263154,0.974851,0.220525,-1.375597,2.554173
19192,19192,group_cossart,listeria,d,p,1,dz10-1k,a03,ptdss1,l-008568-00,...,-0.868585,1.366819,-0.374024,-0.895257,0.823431,1.474496,0.791894,-0.699153,1.395444,0.503061
19193,19193,group_cossart,listeria,d,p,1,dz10-1k,a03,ptdss1,l-008568-00,...,-0.155559,0.639463,-0.329358,-1.296968,0.823431,0.514541,-0.927066,0.520524,0.689446,-0.285829
19194,19194,group_cossart,listeria,d,p,1,dz10-1k,a03,ptdss1,l-008568-00,...,1.431668,-0.992654,-1.288298,-1.197770,0.823431,-0.694292,1.010994,1.156246,-0.828449,1.134172
19195,19195,group_cossart,listeria,d,p,1,dz10-1k,a03,ptdss1,l-008568-00,...,0.292664,0.994271,-0.496163,1.014964,-0.181487,1.296727,1.258625,-0.149059,1.060095,-0.522495
19196,19196,group_cossart,listeria,d,p,1,dz10-1k,a03,ptdss1,l-008568-00,...,1.153805,1.047492,-1.564150,0.553712,2.330808,1.492273,1.163072,-0.942612,1.218945,3.106396
19197,19197,group_cossart,listeria,d,p,1,dz10-1k,a03,ptdss1,l-008568-00,...,0.944995,-1.099096,1.178522,-0.483206,1.325890,-1.316485,-0.732244,0.905257,-1.216748,1.134172
19198,19198,group_cossart,listeria,d,p,1,dz10-1k,a03,ptdss1,l-008568-00,...,1.332182,0.054030,1.093609,0.297378,0.320972,-0.196537,-0.879636,-0.484085,-0.034201,-0.364718


In [None]:
# VARIABLE IMPORTANCE with random forst or so
# plot cells with coordinates and then encolor with different features or interactions of features

In [137]:
def plot(X2, X, method):
    uniq = list(set(X["gene"]))
    
    hot = plt.get_cmap('hot')
    cNorm = colors.Normalize(vmin=0, vmax=len(uniq))
    scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=hot)
    alpha = 0.5
    
    plt.figure()
    for i in range(len(uniq)):
        indx = X["gene"] == uniq[i]
        plt.scatter(X2[indx, 1], X2[indx, 0], color=scalarMap.to_rgba(i),
                    label=uniq[i], marker=".", alpha=alpha)
    if len(uniq) < 20:     
        plt.legend()    
    uid = str(uuid.uuid1())
    plt.savefig(
      "/Users/simondi/PROJECTS/target_infect_x_project/src/tix_util/tix_analysis/plots/scatter_" + method + "_2d" +uid+ ".png",
      dpi=720)
    plt.close()

In [138]:
gene_list = list(set(data["gene"]))

In [139]:
feature_cols_idxs = [x for x in data.columns.values if x.startswith("cells")]

In [140]:
def tsne(data, genes_list, cnt):
    uid = str(uuid.uuid1())
    shuffle(gene_list)
    used_genes = gene_list[1:cnt]
    X = data[ data["gene"].isin(used_genes)]
    for x in [1, 5, 10, 30, 50, 100]:
        ts2 = sklearn.manifold.TSNE(n_components=2, perplexity=x, n_iter=5000)
        X2 = ts2.fit_transform(X.loc[:, feature_cols_idxs])
        plot(X2, X, "tsne_" + str(x))

In [141]:
def lle(data, genes_list, cnt):
    uid = str(uuid.uuid1())
    shuffle(gene_list)
    used_genes = gene_list[1:cnt]
    X = data[ data["gene"].isin(used_genes)]
    for x in [5, 10, 30]:
        ts2 = sklearn.manifold.LocallyLinearEmbedding(n_components=2,method="standard",n_neighbors=x)
        X2 = ts2.fit_transform(X.loc[:, feature_cols_idxs])
        plot(X2, X, "lle_"+ str(x))

In [142]:
def kpca(data, genes_list, cnt):
    uid = str(uuid.uuid1())
    shuffle(gene_list)
    used_genes = gene_list[1:cnt]
    X = data[ data["gene"].isin(used_genes)]
    for x in ["rbf", "poly", "rbf"]:
        ts2 = sklearn.decomposition.KernelPCA(n_components=2, kernel=x)
        X2 = ts2.fit_transform(X.loc[:, feature_cols_idxs])
        plot(X2, X, "kpca_" + x)

In [143]:
def pca(data, genes_list, cnt):
    uid = str(uuid.uuid1())
    shuffle(gene_list)
    used_genes = gene_list[1:cnt]
    X = data[ data["gene"].isin(used_genes)]
    ts2 = sklearn.decomposition.PCA(n_components=2, )
    X2 = ts2.fit_transform(X.loc[:, feature_cols_idxs])
    plot(X2, X, "pca")

In [None]:
pca(data, gene_list, 10)
tsne(data, gene_list, 10)
lle(data, gene_list, 10)
kpca(data, gene_list, 10)

pca(data, gene_list, 100)
tsne(data, gene_list, 100)
lle(data, gene_list, 100)
kpca(data, gene_list, 100)