In [1]:
import sys
import matplotlib
import sklearn
import pandas
import matplotlib.pyplot
import numpy as np

from mpl_toolkits.mplot3d import Axes3D
from matplotlib.ticker import NullFormatter
from matplotlib.backends.backend_pdf import PdfPages
from sklearn import manifold
from sklearn.utils import check_random_state

In [2]:
import os
import findspark

if os.path.isdir("/cluster/home/simondi/spark/"):
    is_cluster = True
    import sparkhpc
    from sparkhpc import sparkjob
    spark_path = "/cluster/home/simondi/spark/"
else:
    is_cluster = False
    spark_path = "/usr/local/spark/spark/"

findspark.init(spark_path)
import pyspark

In [3]:
from pyspark.sql.window import Window
import pyspark.sql.functions as func
from pyspark.rdd import reduce
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import VectorAssembler, PCA
from pyspark.ml.clustering import KMeans
from pyspark.ml.linalg import SparseVector, VectorUDT, Vector, Vectors

In [4]:
if is_cluster:
    print("Doing cluster stuff")
    sj = sparkjob.sparkjob(ncores=4)
    sj.wait_to_start()
    sc = sj.start_spark()
else:
    print("Doing local stuff")
    conf = pyspark.SparkConf().setMaster("local[2]").setAppName("test").set("spark.driver.memory", "2G").set("spark.executor.memory", "1G")
    sc = pyspark.SparkContext(conf=conf)

Doing local stuff


In [5]:
spark = pyspark.sql.SparkSession(sc)

In [6]:
if is_cluster:
    file_name = "/cluster/home/simondi/simondi/tix/data/screening_data/cells_sample_10.tsv"
else:
    file_name = "/Users/simondi/PHD/data/data/target_infect_x/screening_data_subset/cells_sample_10.tsv"

In [7]:
df = spark.read.csv(path=file_name, sep="\t", header='true')

In [8]:
df.cache()

DataFrame[study: string, pathogen: string, library: string, design: string, replicate: string, plate: string, well: string, gene: string, sirna: string, well_type: string, image_idx: string, object_idx: string, cells.areashape_area: string, cells.areashape_eccentricity: string, cells.areashape_extent: string, cells.areashape_formfactor: string, cells.areashape_majoraxislength: string, cells.areashape_minoraxislength: string, cells.areashape_perimeter: string, cells.children_bacteria_count: string, cells.children_invasomes_count: string, cells.location_center_x: string, cells.location_center_y: string, cells.neighbors_anglebetweenneighbors_2: string, cells.neighbors_firstclosestobjectnumber_2: string, cells.neighbors_firstclosestxvector_2: string, cells.neighbors_firstclosestyvector_2: string, cells.neighbors_numberofneighbors_2: string, cells.neighbors_secondclosestobjectnumber_2: string, cells.neighbors_secondclosestxvector_2: string, cells.neighbors_secondclosestyvector_2: string, ce

In [9]:
old_cols = df.schema.names
new_cols = list(map(lambda x: x.replace(".", "_"),  old_cols))
df = reduce(lambda data, idx: data.withColumnRenamed(old_cols[idx], new_cols[idx]), range(len(new_cols)), df)
for i, x in enumerate(new_cols):
    if x.startswith("cells"):
        df= df.withColumn(x, df[x].cast("double"))

In [10]:
def z_score_w(col, w):
    avg = func.avg(col).over(w)
    sd = func.stddev(col).over(w)
    return (col - avg) / sd

DataFrame[study: string, pathogen: string, library: string, design: string, replicate: string, plate: string, well: string, gene: string, sirna: string, well_type: string, image_idx: string, object_idx: string, cells_areashape_area: double, cells_areashape_eccentricity: double, cells_areashape_extent: double, cells_areashape_formfactor: double, cells_areashape_majoraxislength: double, cells_areashape_minoraxislength: double, cells_areashape_perimeter: double, cells_children_bacteria_count: double, cells_children_invasomes_count: double, cells_location_center_x: double, cells_location_center_y: double, cells_neighbors_anglebetweenneighbors_2: double, cells_neighbors_firstclosestobjectnumber_2: double, cells_neighbors_firstclosestxvector_2: double, cells_neighbors_firstclosestyvector_2: double, cells_neighbors_numberofneighbors_2: double, cells_neighbors_secondclosestobjectnumber_2: double, cells_neighbors_secondclosestxvector_2: double, cells_neighbors_secondclosestyvector_2: double, ce

In [11]:
w = Window().partitionBy(["study", "pathogen"]).rowsBetween(-sys.maxsize, sys.maxsize)
for x in X.columns:
    if x.startswith("cells"):
        X = X.withColumn(x, z_score_w(df[x], w))

In [12]:
X = df.sample(fraction=.0001, withReplacement=False)
X.cache()
X = X.na.fill(value=0)

In [13]:
X.count()

559

In [14]:
assembler = VectorAssembler(inputCols=[x for x in X.columns if x.startswith("cells")], outputCol='features')
X = assembler.transform(X)

In [15]:
X.cache()

DataFrame[study: string, pathogen: string, library: string, design: string, replicate: string, plate: string, well: string, gene: string, sirna: string, well_type: string, image_idx: string, object_idx: string, cells_areashape_area: double, cells_areashape_eccentricity: double, cells_areashape_extent: double, cells_areashape_formfactor: double, cells_areashape_majoraxislength: double, cells_areashape_minoraxislength: double, cells_areashape_perimeter: double, cells_children_bacteria_count: double, cells_children_invasomes_count: double, cells_location_center_x: double, cells_location_center_y: double, cells_neighbors_anglebetweenneighbors_2: double, cells_neighbors_firstclosestobjectnumber_2: double, cells_neighbors_firstclosestxvector_2: double, cells_neighbors_firstclosestyvector_2: double, cells_neighbors_numberofneighbors_2: double, cells_neighbors_secondclosestobjectnumber_2: double, cells_neighbors_secondclosestxvector_2: double, cells_neighbors_secondclosestyvector_2: double, ce

In [40]:
kmeans = KMeans(k=5, seed=23)
model = kmeans.fit(X)

In [17]:
#pca = PCA(k=2, inputCol="features", outputCol="pcaFeatures")
#pca_model = pca.fit(Y)

In [18]:
#result = pca_model.transform(Y).select("pcaFeatures")

In [19]:
#df2 = result.rdd.map(lambda x: tuple(float(x)  for x in x.pcaFeatures.values )).toDF()
#df2  = df2.withColumnRenamed("_1", "PCA1")
#df2  = df2.withColumnRenamed("_2", "PCA2")

In [20]:
#df2 = df2.toPandas()

In [21]:
#df2[1:5]

Unnamed: 0,PCA1,PCA2
1,1.233312,-0.708045
2,4.778434,2.410706
3,-0.079723,2.027226
4,1.854294,3.359678


In [22]:
#df2["pathogen"] = Y.select("pathogen").toPandas()
#df2["design"] = Y.select("design").toPandas()
#df2["library"] = Y.select("library").toPandas()

In [23]:
#df2[1:5]

Unnamed: 0,PCA1,PCA2,pathogen,design,library
1,1.233312,-0.708045,brucella,u,q
2,4.778434,2.410706,brucella,p,d
3,-0.079723,2.027226,brucella,p,d
4,1.854294,3.359678,brucella,p,d


In [25]:
# from cycler import cycler
# import matplotlib
# import pylab
# import matplotlib.pyplot

# fig, ax = matplotlib.pyplot.subplots()
# matplotlib.pyplot.rc('axes', prop_cycle=(cycler('color', ['r', 'g', 'b', 'y'])))
# ax.margins(0.05)

# groups = df2.groupby("pathogen")
# for name, group in groups:
#     design_groups = group.groupby(['design'])
#     for design, dgr in design_groups:
#         ax.plot(dgr.PCA1, dgr.PCA2, marker='o', linestyle='', ms=5, label=name +", "+ design)
# ax.legend()
# pylab.savefig('foo.png')

In [43]:
X = X.toPandas()

In [51]:
X_index = [i for i, x in enumerate(X.columns) if x.startswith("cells")]

In [52]:
tsne = manifold.TSNE(n_components=2, init='pca', random_state=23)
trans_data = tsne.fit_transform(X.values[:, X_index])

In [56]:
transe = pandas.DataFrame(trans_data)

In [68]:
model.summary.cluster.toPandas()["prediction"]

0      0
1      4
2      2
3      3
4      2
5      1
6      3
7      4
8      2
9      1
10     3
11     2
12     4
13     4
14     3
15     2
16     3
17     3
18     3
19     2
20     2
21     3
22     2
23     2
24     4
25     0
26     3
27     4
28     4
29     2
      ..
529    2
530    2
531    2
532    2
533    2
534    3
535    1
536    4
537    4
538    2
539    2
540    2
541    3
542    1
543    2
544    0
545    4
546    2
547    3
548    2
549    2
550    2
551    4
552    0
553    4
554    4
555    0
556    2
557    2
558    2
Name: prediction, Length: 559, dtype: int64

In [70]:
transe

Unnamed: 0,a,b,pathogen,design,cluster
0,-5.699305,-0.105661,brucella,p,0
1,-17.061263,-7.018676,brucella,u,4
2,-20.689502,8.859087,brucella,p,2
3,-6.785939,2.362020,brucella,p,3
4,-19.279591,9.653475,brucella,p,2
5,10.750934,9.813220,brucella,p,1
6,2.474082,-7.891385,brucella,u,3
7,-4.348266,-11.426652,brucella,u,4
8,-19.508275,-4.234960,brucella,u,2
9,8.683016,10.236466,brucella,u,1


In [71]:
transe["pathogen"] = X["pathogen"]
transe["design"] = X["design"]
transe["cluster"] = model.summary.cluster.toPandas()["prediction"]
transe.columns = ['a', 'b', 'pathogen', 'design', "kmeans"]

Unnamed: 0,a,b,pathogen,design,kmeans
0,-5.699305,-0.105661,brucella,p,0
1,-17.061263,-7.018676,brucella,u,4
2,-20.689502,8.859087,brucella,p,2
3,-6.785939,2.362020,brucella,p,3
4,-19.279591,9.653475,brucella,p,2
5,10.750934,9.813220,brucella,p,1
6,2.474082,-7.891385,brucella,u,3
7,-4.348266,-11.426652,brucella,u,4
8,-19.508275,-4.234960,brucella,u,2
9,8.683016,10.236466,brucella,u,1


In [77]:
fig, ax = matplotlib.pyplot.subplots()
matplotlib.pyplot.rc('axes', prop_cycle=(cycler('color', ['r', 'g', 'b', 'y', 'm'])))
ax.margins(0.05)

groups = transe.groupby("pathogen")
for name, group in groups:
        ax.plot(group.a, group.b, marker='o', linestyle='', ms=3, label=name)
ax.legend()

pp = PdfPages("clustering_by_pathogen.pdf")
plt.savefig(pp, format='pdf')
pp.close()

In [79]:
fig, ax = matplotlib.pyplot.subplots()
matplotlib.pyplot.rc('axes', prop_cycle=(cycler('color', ['r', 'g', 'b', 'y', 'm'])))
ax.margins(0.05)

groups = transe.groupby("kmeans")
for name, group in groups:
        ax.plot(group.a, group.b, marker='o', linestyle='', ms=3, label=name)
ax.legend()

pp = PdfPages("clustering_by_kmeans.pdf")
plt.savefig(pp, format='pdf')
pp.close()

In [None]:
sc.stop()