In [1]:
import os
import sys
import pandas
import numpy
import scipy
from scipy import linalg

import findspark
findspark.init("/usr/local/spark/spark-2.2.0-bin-hadoop2.7/")

import pyspark
from pyspark.sql.window import Window
import pyspark.sql.functions as func

from pyspark.rdd import reduce
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import VectorAssembler
import pyspark.mllib.linalg.distributed
from pyspark.mllib.linalg.distributed import RowMatrix, DenseMatrix
from pyspark.ml.linalg import SparseVector, VectorUDT, Vector, Vectors
from pyspark.mllib.stat import Statistics

In [2]:
# Local Spark configuration: use every core and give driver and executors 4G each.
conf = (
    pyspark.SparkConf()
    .setMaster("local[*]")
    .set("spark.driver.memory", "4G")
    .set("spark.executor.memory", "4G")
)
# getOrCreate() reuses an already running session/context, so re-running this
# cell in a live kernel no longer fails with "Cannot run multiple
# SparkContexts at once" as a direct SparkContext(conf=conf) call would.
spark = pyspark.sql.SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext

In [3]:
# Show which Spark release the session is running.
spark.version

'2.2.0'

In [4]:
# Tab-separated input whose first line carries the column names.
file_name = "/Users/simondi/PHD/data/data/target_infect_x/query_data/cells_sample_10_normalized_cut_100.tsv"
# No schema is supplied, so every column is loaded as a string.
df = (
    spark.read
    .option("sep", "\t")
    .option("header", 'true')
    .csv(file_name)
)

In [6]:
# Inspect the first row of the freshly loaded frame.
df.take(1)

[Row(study='infectx_published', pathogen='listeria', library='a', design='u', replicate='1', plate='kb2-02-1x', well='a01', gene='chka', sirna='s3008', well_type='sirna', image_idx='4', object_idx='144', cells.areashape_area='-0.8044235519947214', cells.areashape_eccentricity='0.01210726352227413', cells.areashape_extent='1.1159023326616404', cells.areashape_formfactor='1.4749286136850854', cells.areashape_majoraxislength='-0.9368530225439791', cells.areashape_minoraxislength='-0.7485320762193192', cells.areashape_perimeter='-1.0209269921587854', cells.children_bacteria_count='-0.7030171063989196', cells.children_invasomes_count=None, cells.location_center_x='0.44758084788204255', cells.location_center_y='1.2808903222918113', cells.neighbors_anglebetweenneighbors_2='1.3915760955745562', cells.neighbors_firstclosestobjectnumber_2='0.14886791955020456', cells.neighbors_firstclosestxvector_2='0.6694456185473984', cells.neighbors_firstclosestyvector_2='-1.233468444008488', cells.neighbors_

In [7]:
# Replace "." with "_" in every column name so the columns can be referenced
# by plain name in the cells below.
old_cols = df.columns
new_cols = [name.replace(".", "_") for name in old_cols]

# toDF renames all columns positionally in a single projection instead of
# chaining one withColumnRenamed call per column.
df = df.toDF(*new_cols)

# The file was read without a schema, so the measurement columns (identified
# by their name prefix) are cast from string to double; all other columns are
# passed through unchanged and the column order is preserved.
df = df.select([
    func.col(name).cast("double").alias(name)
    if name.startswith(("cells", "perin", "nucl"))
    else func.col(name)
    for name in new_cols
])

# Replace missing numeric values with 0.
df = df.fillna(0)

In [8]:
# Feature set: every column with a measurement prefix, except those starting
# with "cells_children_invasomes_count".
feature_columns = [
    name
    for name in df.columns
    if name.startswith(("cells", "perin", "nucl"))
    and not name.startswith("cells_children_invasomes_count")
]
# Pack the selected columns into a single vector column called 'features'.
assembler = VectorAssembler(outputCol='features', inputCols=feature_columns)
data = assembler.transform(df)

In [9]:
# Display the DataFrame repr (column names and types) after assembling.
data

DataFrame[study: string, pathogen: string, library: string, design: string, replicate: string, plate: string, well: string, gene: string, sirna: string, well_type: string, image_idx: string, object_idx: string, cells_areashape_area: double, cells_areashape_eccentricity: double, cells_areashape_extent: double, cells_areashape_formfactor: double, cells_areashape_majoraxislength: double, cells_areashape_minoraxislength: double, cells_areashape_perimeter: double, cells_children_bacteria_count: double, cells_children_invasomes_count: double, cells_location_center_x: double, cells_location_center_y: double, cells_neighbors_anglebetweenneighbors_2: double, cells_neighbors_firstclosestobjectnumber_2: double, cells_neighbors_firstclosestxvector_2: double, cells_neighbors_firstclosestyvector_2: double, cells_neighbors_numberofneighbors_2: double, cells_neighbors_secondclosestobjectnumber_2: double, cells_neighbors_secondclosestxvector_2: double, cells_neighbors_secondclosestyvector_2: double, ce

In [39]:
# List the names of the columns that were selected as features.
data.select(feature_columns).columns

['cells_areashape_area',
 'cells_areashape_eccentricity',
 'cells_areashape_extent',
 'cells_areashape_formfactor',
 'cells_areashape_majoraxislength',
 'cells_areashape_minoraxislength',
 'cells_areashape_perimeter',
 'cells_children_bacteria_count',
 'cells_location_center_x',
 'cells_location_center_y',
 'cells_neighbors_anglebetweenneighbors_2',
 'cells_neighbors_firstclosestobjectnumber_2',
 'cells_neighbors_firstclosestxvector_2',
 'cells_neighbors_firstclosestyvector_2',
 'cells_neighbors_numberofneighbors_2',
 'cells_neighbors_secondclosestobjectnumber_2',
 'cells_neighbors_secondclosestxvector_2',
 'cells_neighbors_secondclosestyvector_2',
 'cells_parent_nuclei']

In [83]:
# One plain Python list of feature values per row of the DataFrame.
rdd = data.select(*feature_columns).rdd.map(lambda row: list(row))
# Distributed row-oriented matrix built from those rows.
X = RowMatrix(rdd)

In [None]:
# Column-wise summary statistics of the raw feature rows.
summary = Statistics.colStats(rdd)
# Per-column mean and variance, used below to standardize the matrix.
means = summary.mean()
var = summary.variance()

In [84]:
# Standardize every feature column: subtract the column mean and divide by the
# column standard deviation.
# The matrix is rebuilt from the raw `rdd` instead of from `X` itself, so
# re-running this cell cannot standardize already standardized rows again.
std = numpy.sqrt(var)  # computed once instead of once per row
# A constant column has zero variance; dividing by 1 keeps it at 0 rather than
# producing NaN/inf values that would poison the SVD.
std[std == 0] = 1.0
X = RowMatrix(rdd.map(lambda row: (numpy.asarray(row) - means) / std))

In [85]:
# Look at the first five rows of the standardized matrix.
X.rows.take(5)

[DenseVector([-0.8334, 0.1165, 1.0605, 1.3942, -0.9828, -0.7089, -1.0943, -0.8341, 0.4385, 1.3451, 1.3012, 0.089, 0.7667, -1.357, -0.2354, -0.0203, -0.4495, 1.409, 0.085]),
 DenseVector([-0.9948, 0.0686, 0.0135, 0.5573, -1.0667, -0.7542, -1.0105, -0.7318, 0.4187, -0.1479, -0.5329, 0.3132, -0.449, 0.9905, -0.8389, 0.215, -1.0324, -0.6781, 0.3534]),
 DenseVector([-0.0198, 1.1411, 1.7431, 0.7815, 0.7757, -0.6904, -0.4619, 0.9058, 0.7684, -1.0541, 0.5905, 0.84, -0.2169, 1.055, 1.5752, 0.9771, 0.7725, -0.7497, 0.8231]),
 DenseVector([-0.2795, 0.7564, -0.2166, 0.7758, -0.0042, -0.5275, -0.6045, 0.087, 0.8081, 0.2262, 0.4252, 0.7727, -0.3396, -1.4178, 0.3682, 0.8762, 1.1998, 0.8403, 0.7896]),
 DenseVector([-0.2976, -1.7301, 1.6591, 0.5017, -0.8565, 0.3434, -0.5239, 0.4964, 1.109, -0.2848, 1.2879, 1.165, -0.3789, 1.4738, 1.5752, 1.3132, 0.53, -1.3521, 1.1922])]

In [55]:
def svd(X, comps):
    """Return the leading singular values and singular vectors of ``X``.

    Parameters
    ----------
    X : object exposing ``numCols()`` and ``computeSVD(k, computeU=...)``
        In this notebook a ``RowMatrix``.
    comps : int
        Number of leading components to keep.

    Returns
    -------
    tuple
        ``(s, V)`` where ``s`` holds the first ``comps`` singular values and
        ``V`` the matching singular vectors, one per row (i.e. the transpose
        of the ``V`` factor returned by ``computeSVD``, truncated to ``comps``
        rows).
    """
    # Ask for the full decomposition (k = number of columns) without the U
    # factor, then truncate locally.
    decomposition = X.computeSVD(X.numCols(), computeU=False)
    singular_values = decomposition.s.toArray()
    # Transpose so that components are rows and can be sliced with [:comps].
    components = decomposition.V.toArray().T
    return singular_values[:comps], components[:comps]

In [71]:
# Keep the five leading singular values / vectors of the standardized matrix.
s, v = svd(X, 5)

In [73]:
# Shape check: one row per kept component, one column per feature.
v.shape

(5, 19)

In [62]:
# Scale the singular values by sqrt(n - 1), where n is the number of rows in
# the standardized matrix, to obtain per-component standard deviations.
# n is read from the matrix itself instead of being hard-coded as 100;
# max(1, ...) guards against a single-row matrix.
s / numpy.sqrt(max(1, X.numRows() - 1))

array([2.00727217, 1.98343091, 1.49762608, 1.32266576, 1.229918  ,
       1.0325376 , 0.97206181, 0.91251183, 0.83006183, 0.79666918,
       0.71880855, 0.65128727, 0.50751047, 0.2469179 , 0.2232005 ,
       0.1984104 , 0.11875283, 0.03354878, 0.02497995])

In [87]:
# Project the standardized rows onto the kept components.
# `v` stores one component per row (n_components x n_features). DenseMatrix
# reads its values in column-major order, so the row-major v.flatten() is
# exactly the (n_features x n_components) matrix v.T. The dimensions are taken
# from v.shape rather than hard-coded as 19 and 5.
X.multiply(DenseMatrix(v.shape[1], v.shape[0], v.flatten())).rows.take(5)

[DenseVector([-1.4448, 0.8724, 0.9894, 0.2418, 0.9197]),
 DenseVector([1.0509, -0.3239, -0.138, 0.2582, -0.7921]),
 DenseVector([1.2966, 1.1084, 0.7674, -0.0261, 1.4224]),
 DenseVector([-0.2841, 0.0118, 0.223, 0.4749, 1.1753]),
 DenseVector([0.1274, 0.7767, 0.3588, -0.1662, 1.3104])]

In [77]:
# Shape of the transposed component matrix (features x components).
v.T.shape

(19, 5)