In [1]:
import os
import sys
import pandas
import numpy

import findspark
findspark.init("/usr/local/spark/spark")

import pyspark

from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vector, Vectors

In [92]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm

In [3]:
file_name = "/Users/simondi/Desktop/test_ba/kmeans_transform-cells_sample_10_normalized_cut_100_K005"

In [4]:
conf = pyspark.SparkConf().setMaster("local[*]").set("spark.driver.memory", "4G").set("spark.executor.memory", "4G")
sc = pyspark.SparkContext(conf=conf)
spark = pyspark.sql.SparkSession(sc)

In [5]:
data = spark.read.parquet(file_name)

In [6]:
data = data.limit(30)

In [7]:
data.take(5)

[Row(study='infectx_published', pathogen='listeria', library='a', design='u', replicate='1', plate='kb2-02-1x', well='a01', gene='chka', sirna='s3008', well_type='sirna', image_idx='4', object_idx='144', prediction=0, features=DenseVector([-0.8044, 0.0121, 1.1159, 1.4749, -0.9369, -0.7485, -1.0209, -0.703, 0.0, 0.4476, 1.2809, 1.3916, 0.1489, 0.6694, -1.2335, -0.0825, 0.0106, -0.5078, 1.2455, 0.1357])),
 Row(study='infectx_published', pathogen='listeria', library='a', design='u', replicate='1', plate='kb2-02-1x', well='a01', gene='chka', sirna='s3008', well_type='sirna', image_idx='5', object_idx='168', prediction=0, features=DenseVector([-0.9474, -0.0399, 0.1262, 0.6198, -1.0081, -0.7959, -0.9485, -0.6324, 0.0, 0.4263, -0.2104, -0.3939, 0.3941, -0.3786, 1.0053, -0.6748, 0.268, -1.0621, -0.5899, 0.429])),
 Row(study='infectx_published', pathogen='listeria', library='a', design='u', replicate='1', plate='kb2-02-1x', well='a01', gene='chka', sirna='s3008', well_type='sirna', image_idx='2

In [8]:
pca = PCA(k=2, inputCol="features", outputCol="pcs")

In [9]:
model = pca.fit(data)

In [10]:
data = model.transform(data)

In [11]:
data.take(1)

[Row(study='infectx_published', pathogen='listeria', library='a', design='u', replicate='1', plate='kb2-02-1x', well='a01', gene='chka', sirna='s3008', well_type='sirna', image_idx='4', object_idx='144', prediction=0, features=DenseVector([-0.8044, 0.0121, 1.1159, 1.4749, -0.9369, -0.7485, -1.0209, -0.703, 0.0, 0.4476, 1.2809, 1.3916, 0.1489, 0.6694, -1.2335, -0.0825, 0.0106, -0.5078, 1.2455, 0.1357]), pcs=DenseVector([1.7649, 1.6109]))]

In [16]:
counts = data.groupBy(["pathogen", "gene", "sirna"]).count()

+--------+----+-----+-----+
|pathogen|gene|sirna|count|
+--------+----+-----+-----+
|listeria|chka|s3008|   10|
|listeria|grk5|s6088|   10|
|listeria|chuk|s3077|   10|
+--------+----+-----+-----+



In [19]:
from pyspark.sql.window import Window

In [41]:
window = Window.partitionBy(["pathogen", "gene", "sirna"]).rowsBetween(0, 10)

In [109]:
from pyspark.sql.functions import row_number

In [53]:
data = data.withColumn("row_num", row_number().over(Window.partitionBy(["pathogen", "gene", "sirna"]).orderBy("pathogen")))

In [62]:
data = data.filter("row_num <= 10")

In [88]:
data_p = data.select(["pathogen", "gene", "sirna", "prediction", "pcs"]).toPandas()
data_p[['pc1','pc2']] = pandas.DataFrame(data_p.pcs.values.tolist())

In [75]:
uniq_genes = list(set(data_p["gene"]))
uniq_pathogen = list(set(data_p["pathogen"]))
uniq_sirnas = list(set(data_p["sirna"]))
uniq_clusts = list(set(data_p["prediction"]))

In [79]:
uniq_clusts

[0, 1, 2, 3, 4]

In [103]:
colors = plt.cm.rainbow(numpy.linspace(0, 1, len(uniq_clusts)))
colors

array([[  5.00000000e-01,   0.00000000e+00,   1.00000000e+00,
          1.00000000e+00],
       [  1.96078431e-03,   7.09281308e-01,   9.23289106e-01,
          1.00000000e+00],
       [  5.03921569e-01,   9.99981027e-01,   7.04925547e-01,
          1.00000000e+00],
       [  1.00000000e+00,   7.00543038e-01,   3.78411050e-01,
          1.00000000e+00],
       [  1.00000000e+00,   1.22464680e-16,   6.12323400e-17,
          1.00000000e+00]])

In [108]:
data_p.loc[:, "color"] = colors[data_p.prediction]

ValueError: Must have equal len keys and value when setting with an ndarray

In [106]:
data_p

Unnamed: 0,pathogen,gene,sirna,prediction,pcs,pc1,pc2
0,listeria,chka,s3008,0,"[1.76487012551, 1.61088575943]",1.76487,1.610886
1,listeria,chka,s3008,0,"[1.63468301131, 0.594261298524]",1.634683,0.594261
2,listeria,chka,s3008,2,"[1.75772043796, -1.00642121456]",1.75772,-1.006421
3,listeria,chka,s3008,4,"[2.02350481383, 0.100451937143]",2.023505,0.100452
4,listeria,chka,s3008,2,"[2.25783110217, -2.77525388643]",2.257831,-2.775254
5,listeria,chka,s3008,4,"[1.53234179511, -1.49438655759]",1.532342,-1.494387
6,listeria,chka,s3008,1,"[-3.37821265856, -4.81525978572]",-3.378213,-4.81526
7,listeria,chka,s3008,4,"[2.81887922754, 0.382042590338]",2.818879,0.382043
8,listeria,chka,s3008,3,"[-2.03657176404, 2.7128186201]",-2.036572,2.712819
9,listeria,chka,s3008,4,"[2.16869401353, 1.386068262]",2.168694,1.386068


In [100]:
hot = plt.get_cmap('hot')
colors = plt.cm.rainbow(np.linspace(0, 1, len(uniq_clusts)))

plt.figure()

plt.scatter(data_p.loc[:, "pc1"], data_p.loc[:, "pc2"], color=colors)
    
#uid = str(uuid.uuid1())
plt.show()

NameError: name 'np' is not defined

In [66]:
font = {'weight': 'normal',
            'family': 'sans-serif',
            'size': 14}
plt.rc('font', **font)
plt.figure()
ax = plt.subplot(111)
plt.tick_params(axis="both", which="both", bottom="off", top="off",
                labelbottom="on", left="off", right="off", labelleft="on")
ax.spines["top"].set_visible(False)
ax.spines["bottom"].set_visible(True)
ax.spines["right"].set_visible(False)
ax.spines["left"].set_visible(True)

In [None]:
data

In [None]:
ax.plot(ks, score, "black")
    ax.plot(ks, score, "or")
    plt.xlabel('K', fontsize=15)
    plt.ylabel(axis_label, fontsize=15)
    plt.title('')
    ax.grid(True)
    logger.info("Saving plot to: {}".format(plotfile))
    plt.savefig(plotfile, bbox_inches="tight")