In [1]:
import uuid
import pandas
import numpy
from random import shuffle

import sklearn
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import sklearn.manifold

import matplotlib
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cmx

import os

In [2]:
def load_data(filename=None):
    """Load the per-cell screening table and derive the ``infection`` column.

    Parameters
    ----------
    filename : str, optional
        Path of a tab-separated file with a header row. When omitted, the
        path that was originally hard-coded in this notebook is used.

    Returns
    -------
    pandas.DataFrame
        The table with every column whose name starts with ``"cells"`` cast
        to ``float64``, every row containing a missing value removed, and
        ``cells.children_invasomes_count`` moved to a trailing column named
        ``infection``.

    Raises
    ------
    KeyError
        If the file has no ``cells.children_invasomes_count`` column.
    """
    if filename is None:
        filename = "/Users/simondi/PHD/data/data/target_infect_x/screening_data/infectx-bartonella-d-p-g-1-dz14-2a_cells_data.tsv"

    data = pandas.read_csv(filename, sep="\t", header=0)

    # Cast through astype() rather than `data.loc[:, c] = ...`: assigning via
    # .loc writes into the existing column and, in recent pandas versions, can
    # leave an integer dtype unchanged.
    feature_cols = [c for c in data.columns if c.startswith("cells")]
    data = data.astype({c: "float64" for c in feature_cols})

    # Dropping every row with a missing value also removes rows lacking an
    # invasome count, so no separate subset-dropna / fillna step is needed.
    # copy() makes the result independent before columns are modified.
    X = data.dropna().copy()

    # pop() removes the column and the assignment appends it at the end.
    X["infection"] = X.pop("cells.children_invasomes_count")

    return X

In [3]:
def plot(data, gene_list, cnt):
    """Scatter-plot cell centre coordinates for a random selection of genes.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with a ``gene`` column and the ``cells.location_center_x`` /
        ``cells.location_center_y`` columns.
    gene_list : list
        Candidate gene labels. The list is copied before shuffling, so the
        caller's list keeps its order.
    cnt : int
        Number of genes to draw from the shuffled candidates.
    """
    candidates = list(gene_list)
    shuffle(candidates)
    # Slice from 0 so exactly `cnt` genes are used; the previous `[1:cnt]`
    # skipped the first shuffled gene and selected only cnt - 1.
    used_genes = candidates[:cnt]
    X = data[data["gene"].isin(used_genes)]

    uniq = list(set(X["gene"]))

    # One colour per gene, spread over the 'hot' colormap.
    hot = plt.get_cmap('hot')
    cNorm = colors.Normalize(vmin=0, vmax=len(uniq))
    scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=hot)
    alpha = 0.5

    fig, ax = plt.subplots()
    for i, gene in enumerate(uniq):
        indx = X["gene"] == gene
        # The y axis previously repeated the x column, which put every point
        # on the diagonal.
        # NOTE(review): assumes the table has a matching
        # "cells.location_center_y" column -- confirm against the data.
        ax.scatter(X.loc[indx, "cells.location_center_x"],
                   X.loc[indx, "cells.location_center_y"],
                   color=scalarMap.to_rgba(i),
                   label=gene, marker=".", alpha=alpha)
    # A legend with 20 or more entries is not drawn.
    if len(uniq) < 20:
        ax.legend()

    plt.show()

In [4]:
# Load the table once; the cells below read and modify X.
X = load_data()

In [5]:
# Z-score every "cells*" feature column in X (mean 0, population std 1).
feature_cols = [x for x in X.columns if x.startswith("cells")]
if feature_cols:
    feature_means = X[feature_cols].mean()
    # ddof=0 matches what numpy.std computes.
    feature_stds = X[feature_cols].std(ddof=0)
    # A constant feature has std 0; dividing by it would turn the whole column
    # into NaN. Dividing by 1 instead leaves such a column at 0.
    feature_stds = feature_stds.replace(0, 1)
    X[feature_cols] = (X[feature_cols] - feature_means) / feature_stds

In [6]:
# Distinct gene labels found in X; the bare expression displays how many there are.
gene_list = [*set(X["gene"])]
len(gene_list)

304

In [7]:
# Distinct well labels found in X; the bare expression displays how many there are.
well_list = [*set(X["well"])]
len(well_list)

384

In [8]:
# Upper bound used when slicing the shuffled lists (e.g. well_list[0:cnt]).
cnt  = 10

In [15]:
# Path prefix for saved figures: the plotting code appends feature/method names,
# a UUID and ".png" to it.
fl = "/Users/simondi/PROJECTS/target_infect_x_project/plots/scatter_plate_bartonella-d-p-g-1-dz14-2a_"

In [10]:
# In-place shuffle of the module-level well list; its order is not read again
# in this cell.
shuffle(well_list)
# The former `used_genes = well_list[0:cnt]` was overwritten on the very next
# line, so only the explicit gene list is kept.
used_genes = ["slc43a1", "mock", "lcmt2", "rnmt", "pias3"]
# take mid images (rows whose image_idx is a multiple of 5)
X_5well = X.loc[X.image_idx % 5 == 0]

X_5well_wells = list(set(X_5well.well))
shuffle(X_5well_wells)

# take 5 random wells and keep only their rows
Xw = X_5well.loc[X_5well["well"].isin(X_5well_wells[0:5]), :]

In [60]:
# Scatter one pair of features for the sampled wells, coloured by gene, and
# save the figure under the `fl` prefix with a fresh UUID in the file name.
uid = str(uuid.uuid1())
uniq = list(set(Xw.gene))

# One colour per gene, spread over the 'hot' colormap.
hot = plt.get_cmap('hot')
cNorm = colors.Normalize(vmin=0, vmax=len(uniq))
scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=hot)
alpha = 0.5

# Only this one feature pair is plotted; iterate over feature_cols x feature_cols
# to plot all pairs.
c1 = "cells.areashape_extent"
c2 = "cells.intensity_integratedintensityedge_corrdna"
fig = plt.figure(0)
ax = fig.add_subplot(111)
for i, gene in enumerate(uniq):
    indx = Xw["gene"] == gene
    ax.scatter(Xw.loc[indx, c1],
               Xw.loc[indx, c2],
               color=scalarMap.to_rgba(i),
               label=gene, marker=".", alpha=alpha)

# Build the legend once after all series are drawn (it used to be rebuilt on
# every loop iteration). A legend with 20 or more entries is not drawn.
if len(uniq) < 20:
    ax.legend()

ax.set_xlabel(c1)
ax.set_ylabel(c2)
fig.savefig(fl + c1 + "_" + c2 + "_" + uid + ".png", dpi=720)
plt.close(fig)

In [12]:
def plot_dimred(X2, X, method):
    """Save a 2-D scatter of an embedding, coloured by gene.

    Parameters
    ----------
    X2 : numpy.ndarray
        Embedding indexed as ``X2[row_mask, column]``; columns 0 and 1 are
        plotted. Assumed to hold one row per row of ``X``, in the same order.
    X : pandas.DataFrame
        Table providing the ``gene`` label of each row.
    method : str
        Tag inserted into the output file name.

    Notes
    -----
    Reads the module-level names ``fl`` (path prefix) and ``uid`` (run id) to
    build the file name, so both must be defined before this is called.
    """
    uniq = list(set(X["gene"]))

    # One colour per gene, spread over the 'hot' colormap.
    hot = plt.get_cmap('hot')
    cNorm = colors.Normalize(vmin=0, vmax=len(uniq))
    scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=hot)
    alpha = 0.5

    genes = X["gene"].to_numpy()

    fig, ax = plt.subplots()
    for i, gene in enumerate(uniq):
        # Plain numpy boolean mask: X2 is positional, so the DataFrame index
        # must not take part in the selection.
        mask = genes == gene
        # Column 0 on x and column 1 on y, the same order plot_pca uses.
        ax.scatter(X2[mask, 0], X2[mask, 1], color=scalarMap.to_rgba(i),
                   label=gene, marker=".", alpha=alpha)
    # A legend with 20 or more entries is not drawn.
    if len(uniq) < 20:
        ax.legend()
    ax.set_xlabel("Component 1")
    ax.set_ylabel("Component 2")
    fig.savefig(fl + "_" + method + "_2d" + uid + ".png", dpi=720)
    plt.close(fig)

In [13]:
def plot_pca(Xtrans, X, method):
    """Save a PCA scatter (first two components) and a scree plot.

    Parameters
    ----------
    Xtrans : fitted decomposition object
        Must provide ``transform`` and ``explained_variance_ratio_`` with at
        least two components.
    X : pandas.DataFrame
        Table providing the ``gene`` labels and the feature columns.
    method : str
        Tag inserted into the output file names.

    Notes
    -----
    Reads the module-level names ``feature_cols_idx`` (columns passed to
    ``transform``), ``fl`` (path prefix) and ``uid`` (run id), so all three
    must be defined before this is called.
    """
    uniq = list(set(X["gene"]))

    # One colour per gene, spread over the 'hot' colormap.
    hot = plt.get_cmap('hot')
    cNorm = colors.Normalize(vmin=0, vmax=len(uniq))
    scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=hot)
    alpha = 0.5

    X2 = Xtrans.transform(X.loc[:, feature_cols_idx])
    genes = X["gene"].to_numpy()
    ratios = Xtrans.explained_variance_ratio_

    fig, ax = plt.subplots()
    for i, gene in enumerate(uniq):
        mask = genes == gene
        ax.scatter(X2[mask, 0], X2[mask, 1], color=scalarMap.to_rgba(i),
                   label=gene, marker=".", alpha=alpha)
    # A legend with 20 or more entries is not drawn.
    if len(uniq) < 20:
        ax.legend()
    ax.set_xlabel("Variance ratio:" + "{0:.2f}".format(ratios[0]))
    ax.set_ylabel("Variance ratio:" + "{0:.2f}".format(ratios[1]))
    fig.savefig(fl + "_" + method + "_2d_pca" + uid + ".png", dpi=720)
    plt.close(fig)

    # Scree plot of at most the first 10 components. The bar count follows the
    # fitted model, so a model with fewer than 10 components no longer raises
    # a shape mismatch.
    n_bars = min(10, len(ratios))
    positions = list(numpy.arange(n_bars) + 0.5)

    fig, ax = plt.subplots()
    ax.bar(positions, ratios[:n_bars], width=0.5)

    ax.set_ylabel('Variance ratio')
    ax.set_xticks(positions)
    ax.set_xticklabels(["PCA" + str(i + 1) for i in range(n_bars)])
    fig.savefig(fl + "_" + method + "_2d_scree" + uid + ".png", dpi=720)
    plt.close(fig)

In [16]:
def _make_tsne(perplexity, n_steps=5000):
    """Build a 2-component TSNE, using whichever iteration keyword is accepted."""
    try:
        # Newer scikit-learn names the iteration count `max_iter`.
        return sklearn.manifold.TSNE(n_components=2, perplexity=perplexity,
                                     max_iter=n_steps)
    except TypeError:
        # Older releases only know `n_iter`.
        return sklearn.manifold.TSNE(n_components=2, perplexity=perplexity,
                                     n_iter=n_steps)


# The gene list and the image selection do not change between runs, so they
# are set once. The former `used_genes = well_list[0:cnt]` was overwritten
# immediately and is dropped.
used_genes = ["slc43a1", "mock", "lcmt2", "rnmt", "pias3"]
# take mid images (rows whose image_idx is a multiple of 5)
X_5well = X.loc[X.image_idx % 5 == 0]
perp = [2, 5, 10, 30, 50]

for i in range(2):
    # plot_dimred / plot_pca read `uid` as a global to name their output files.
    uid = str(uuid.uuid1())

    # In-place shuffle of the module-level well list; its order is not read
    # again in this cell.
    shuffle(well_list)

    X_5well_wells = list(set(X_5well.well))
    shuffle(X_5well_wells)

    # take 5 random wells and keep only their rows
    Xw = X_5well.loc[X_5well["well"].isin(X_5well_wells[0:5]), :]

    # plot_pca reads `feature_cols_idx` as a global.
    feature_cols_idx = [x for x in (Xw.columns.values) if x.startswith("cells")]
    features = Xw.loc[:, feature_cols_idx]

    for x in perp:
        # t-SNE needs the perplexity to be smaller than the number of samples.
        if x >= len(features):
            print("skipping perplexity", x, "- only", len(features), "samples")
            continue
        ts2 = _make_tsne(x)
        X2 = ts2.fit_transform(features)
        plot_dimred(X2, Xw, "tsne_" + str(x))

    # PCA cannot extract more components than there are samples or features.
    ts2 = PCA(n_components=min(20, *features.shape))
    Xtrans = ts2.fit(features)
    plot_pca(Xtrans, Xw, "pca_")