In [151]:
import uuid
import pandas
import numpy
from random import shuffle

import sklearn
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import sklearn.manifold

import matplotlib
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cmx

import os

In [177]:
def load_data():
    if os.path.isdir("/cluster/home/simondi/spark/"):
        filename = "/cluster/home/simondi/simondi/tix/data/screening_data/cells_sample_10_normalized.tsv"
    else:
        filename = "/Users/simondi/PHD/data/data/target_infect_x/screening_data_subset/cells_sample_10_normalized.tsv"

    data = pandas.read_csv(filename, sep="\t", header=0)

    feature_cols = [(i, x) for i, x in enumerate(data.columns.values) if
                    x.startswith("cells")]
    for i, c in feature_cols:
        data.loc[:, c] = data.loc[:, c].astype('float64')

    data_new = data.query(
      "library=='d' and design=='p' and replicate==1 and (pathogen=='brucella' or pathogen=='listeria' or pathogen=='adeno' or pathogen=='bartonella')").groupby(
      ["gene"]).filter(lambda x: len(x) == 40)

    del data_new["cells.children_invasomes_count"]
    del data_new["cells.children_bacteria_count"]

    X = data_new.dropna()
    return X

In [178]:
data = load_data()

In [179]:
data.groupby("pathogen").size()

pathogen
adeno         140850
bartonella    155620
brucella      155620
listeria      155620
dtype: int64

In [197]:
def plot(X2, X, method):
    markers = [    'o', 'v','8',   's', 'p', '*', 'h', 'H', 'D', 'd', 'P', 'X' ]
    pathogens = list(set(X["pathogen"]))
    path_map = {}
    print(pathogens)
    for i, p in enumerate(pathogens):
        path_map[p] = markers[i]
    
    uniq = list(set(X["gene"]))
    
    hot = plt.get_cmap('hot')
    cNorm = colors.Normalize(vmin=0, vmax=len(uniq))
    scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=hot)
    alpha = 0.5
    
    plt.figure()
    for i in range(len(uniq)):
        indx = X["gene"] == uniq[i]
        XG = X.loc[indx,:]
        XP = X2[indx,:]
        unip = list(set(XG["pathogen"]))
        for j in range(len(unip)):
            indx = XG["pathogen"] == unip[j]
            plt.scatter(XP[indx, 1], XP[indx, 0], 
                        color=scalarMap.to_rgba(i),
                        label=unip[j].capitalize() + ": "+ uniq[i].upper(),
                        marker=path_map[ unip[j]  ], alpha=alpha)
    if len(uniq) <= 5:     
        plt.legend( fontsize="xx-small")    
    uid = str(uuid.uuid1())
    
    plt.savefig(
      "/Users/simondi/PROJECTS/target_infect_x_project/src/tix_util/tix_analysis/plots/scatter_" + method + "_2d" +uid+ ".png",
      dpi=720)
    plt.close()

In [182]:
gene_list = list(set(data["gene"]))

In [183]:
feature_cols_idxs = [x for x in data.columns.values if x.startswith("cells")]

In [188]:
def tsne(data, genes_list, cnt):
    uid = str(uuid.uuid1())
    shuffle(gene_list)
    used_genes = gene_list[1:cnt]
    X = data[ data["gene"].isin(used_genes)]
    for x in [1, 5, 10, 30, 50, 100]:
        ts2 = sklearn.manifold.TSNE(n_components=2, perplexity=x, n_iter=5000)
        X2 = ts2.fit_transform(X.loc[:, feature_cols_idxs])
        plot(X2, X, "tsne_" + str(x))

In [189]:
def lle(data, genes_list, cnt):
    uid = str(uuid.uuid1())
    shuffle(gene_list)
    used_genes = gene_list[1:cnt]
    X = data[ data["gene"].isin(used_genes)]
    for x in [5, 10, 30]:
        ts2 = sklearn.manifold.LocallyLinearEmbedding(n_components=2,method="standard",n_neighbors=x)
        X2 = ts2.fit_transform(X.loc[:, feature_cols_idxs])
        plot(X2, X, "lle_"+ str(x))

In [190]:
def kpca(data, genes_list, cnt):
    uid = str(uuid.uuid1())
    shuffle(gene_list)
    used_genes = gene_list[1:cnt]
    X = data[ data["gene"].isin(used_genes)]
    for x in ["rbf", "poly", "rbf"]:
        ts2 = sklearn.decomposition.KernelPCA(n_components=2, kernel=x)
        X2 = ts2.fit_transform(X.loc[:, feature_cols_idxs])
        plot(X2, X, "kpca_" + x)

In [191]:
def pca(data, genes_list, cnt):
    uid = str(uuid.uuid1())
    shuffle(gene_list)
    used_genes = gene_list[1:cnt]
    X = data[ data["gene"].isin(used_genes)]
    ts2 = sklearn.decomposition.PCA(n_components=2, )
    X2 = ts2.fit_transform(X.loc[:, feature_cols_idxs])
    plot(X2, X, "pca")

In [None]:
for i in range(10):
    pca(data, gene_list, 5)
    tsne(data, gene_list, 5)
    lle(data, gene_list, 5)
    kpca(data, gene_list, 5)
    pca(data, gene_list, 10)
    tsne(data, gene_list, 10)
    lle(data, gene_list, 10)
    kpca(data, gene_list, 10)
    
pca(data, gene_list, 100)
tsne(data, gene_list, 100)
lle(data, gene_list, 100)
kpca(data, gene_list, 100)

['adeno', 'brucella', 'bartonella', 'listeria']
['bartonella', 'listeria', 'brucella', 'adeno']
['bartonella', 'listeria', 'brucella', 'adeno']
['bartonella', 'listeria', 'brucella', 'adeno']
['bartonella', 'listeria', 'brucella', 'adeno']
['bartonella', 'listeria', 'brucella', 'adeno']
['bartonella', 'listeria', 'brucella', 'adeno']
['brucella', 'listeria', 'bartonella', 'adeno']
['brucella', 'listeria', 'bartonella', 'adeno']
['brucella', 'listeria', 'bartonella', 'adeno']
['adeno', 'brucella', 'bartonella', 'listeria']
['adeno', 'brucella', 'bartonella', 'listeria']
['adeno', 'brucella', 'bartonella', 'listeria']
['brucella', 'listeria', 'bartonella', 'adeno']
['adeno', 'brucella', 'bartonella', 'listeria']
['adeno', 'brucella', 'bartonella', 'listeria']
['adeno', 'brucella', 'bartonella', 'listeria']
['adeno', 'brucella', 'bartonella', 'listeria']
['adeno', 'brucella', 'bartonella', 'listeria']
['adeno', 'brucella', 'bartonella', 'listeria']
['adeno', 'brucella', 'bartonella', 'lis