In [184]:
import os
import sys
import pandas
import numpy

import findspark
findspark.init("/usr/local/spark/spark/")

import pyspark
from pyspark.sql.window import Window
import pyspark.sql.functions as func

from pyspark.rdd import reduce
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import VectorAssembler
import pyspark.mllib.linalg.distributed
from pyspark.mllib.linalg.distributed import RowMatrix, CoordinateMatrix
from pyspark.ml.linalg import SparseVector, VectorUDT, Vector, Vectors
from pyspark.mllib.stat import Statistics

In [4]:
# Build the Spark configuration step by step: local master, 4G each for
# the driver and the executor.
conf = pyspark.SparkConf()
conf = conf.setMaster("local[*]")
conf = conf.set("spark.driver.memory", "4G")
conf = conf.set("spark.executor.memory", "4G")

# Start the context from that configuration and wrap it in a SQL session.
sc = pyspark.SparkContext(conf=conf)
spark = pyspark.sql.SparkSession(sc)

In [5]:
# Display the version of the Spark session created above.
spark.version

'2.2.0'

In [44]:
# Tab-separated input file whose first line holds the column names.
file_name = "/Users/simondi/PHD/data/data/target_infect_x/query_data/cells_sample_10_normalized_cut_100.tsv"
df = (
    spark.read
    .option("sep", "\t")
    .option("header", "true")
    .csv(file_name)
)

In [45]:
# Replace every "." in the column names with "_".
old_cols = df.columns
new_cols = [name.replace(".", "_") for name in old_cols]

# Rename column by column, pairing each old name with its new name.
for old_name, new_name in zip(old_cols, new_cols):
    df = df.withColumnRenamed(old_name, new_name)

# Cast the columns whose name starts with one of the feature prefixes
# to double.
for name in new_cols:
    if name.startswith(("cells", "perin", "nucl")):
        df = df.withColumn(name, df[name].cast("double"))

# Fill nulls with 0.
df = df.fillna(0)

In [46]:
# Columns whose name starts with one of the feature prefixes, in the
# order they appear in the frame.
feature_columns = [
    name
    for name in df.columns
    if name.startswith(("cells", "perin", "nucl"))
]

# Pack those columns into a single vector column named "features".
assembler = VectorAssembler(outputCol='features', inputCols=feature_columns)
data = assembler.transform(df)

In [186]:
# Turn every selected row into a plain Python list and wrap the resulting
# RDD as a distributed row matrix.
rdd = (
    data
    .select(*feature_columns)
    .rdd
    .map(lambda row: list(row))
)
X = RowMatrix(rdd)

In [208]:
# Integer vector holding the value 2 twenty times.
s = numpy.array([2 for _ in range(20)])

[DenseVector([-0.4022, 0.0061, 0.558, 0.7375, -0.4684, -0.3743, -0.5105, -0.3515, 0.0, 0.2238, 0.6404, 0.6958, 0.0744, 0.3347, -0.6167, -0.0413, 0.0053, -0.2539, 0.6228, 0.0679])]

In [192]:
# Fetch the first row of the row matrix to the driver for inspection.
X.rows.take(1)

[DenseVector([-0.8044, 0.0121, 1.1159, 1.4749, -0.9369, -0.7485, -1.0209, -0.703, 0.0, 0.4476, 1.2809, 1.3916, 0.1489, 0.6694, -1.2335, -0.0825, 0.0106, -0.5078, 1.2455, 0.1357])]

In [165]:
# Column-wise summary statistics over the rows of `rdd`.
summary = Statistics.colStats(rdd)

In [166]:
# Display the per-column variance held by the summary object.
summary.variance()

array([ 0.78434512,  1.17739852,  0.89355321,  1.04392896,  0.7218773 ,
        1.09173459,  0.74492437,  0.4759858 ,  0.        ,  1.15081103,
        0.99763152,  0.94775656,  1.19722718,  0.74309616,  0.90959581,
        0.96306527,  1.19621207,  0.90429383,  0.77339588,  1.19382168])

In [154]:
def svd(X, comps):
    """Decompose ``X`` and split the result at ``comps`` components.

    ``X`` must offer ``numCols()`` and ``computeSVD(k, computeU=...)``; the
    decomposition is requested for all ``X.numCols()`` components without
    computing ``U``.

    Returns a 3-tuple:
      * the first ``comps`` singular values,
      * the first ``comps`` rows of the transposed ``V`` matrix,
      * the sum of squares of the remaining singular values.
    """
    decomposition = X.computeSVD(X.numCols(), computeU=False)
    singular_values = decomposition.s.toArray()
    v_transposed = decomposition.V.toArray().T

    # Squared norm of everything past the leading `comps` singular values.
    tail = singular_values[comps:]
    leftover = numpy.dot(tail, tail)

    return singular_values[:comps], v_transposed[:comps], leftover

In [199]:
# Set-up for the iterative fit further down.
# NOTE(review): the scheme looks like the SVD-based EM used for factor
# analysis (cf. scikit-learn's FactorAnalysis) -- confirm.
#
# The former `iter = 0` was dropped: it was never read and it shadowed the
# builtin `iter` for the rest of the kernel session.

DELTA = 1e-12      # added to sqrt(psi) and used as the lower bound for psi
MAX_ITER = 100     # upper bound on the number of loop iterations

# Dimensions of the row matrix: N rows, P columns.
N, P = X.numRows(), X.numCols()

# Current and previous log-likelihood; -inf so the first comparison
# can never signal convergence.
ll = old_ll = -numpy.inf

# One psi entry per column, initialised to 1.
psi = numpy.ones(P, dtype=numpy.float64)

# Per-column variance taken from the column statistics.
var = summary.variance()
nsqrt = numpy.sqrt(N)

# Number of components and the constant term of the log-likelihood.
n_components = 10
llconst = P * numpy.log(2. * numpy.pi) + n_components

In [210]:
def tilde(X, psi_sqrt, n_sqrt):
    """Return a new row matrix with every row of ``X`` divided by ``psi_sqrt * n_sqrt``.

    Parameters
    ----------
    X : object exposing ``rows`` with a ``map`` method (a ``RowMatrix``
        at the call site in this notebook).
    psi_sqrt : divisor factor; at the call site a NumPy array.
    n_sqrt : second divisor factor; at the call site a scalar.

    Returns
    -------
    RowMatrix built from the rescaled rows. ``X`` itself is not modified.
    """
    # Compute the divisor once; the lambda only closes over the product.
    norm = psi_sqrt * n_sqrt
    # The original was missing the closing parenthesis of RowMatrix(...)
    # here, so the cell could not even be parsed.
    Xtilde = RowMatrix(X.rows.map(lambda row: row / norm))
    return Xtilde

In [214]:
# Iterative fit: alternate between an SVD of the rescaled data and an
# update of psi until the log-likelihood stops changing.
LL_TOL = 0.001  # stop once |ll - old_ll| falls below this

for _ in range(MAX_ITER):
    # DELTA keeps the divisor strictly positive.
    sqrt_psi = numpy.sqrt(psi) + DELTA

    # Use n_components here: llconst was built from n_components, so the
    # previously hard-coded 2 made the two disagree.
    s, V, unexp_var = svd(tilde(X, sqrt_psi, nsqrt), n_components)
    s = s ** 2

    # Loadings: scale the leading rows of V by sqrt(max(s - 1, 0)), then
    # undo the per-column rescaling.
    W = numpy.sqrt(numpy.maximum(s - 1., 0.))[:, numpy.newaxis] * V
    W *= sqrt_psi

    # Log-likelihood of the current fit.
    ll = llconst + numpy.sum(numpy.log(s))
    ll += unexp_var + numpy.sum(numpy.log(psi))
    ll *= -N / 2.

    # New psi: column variance minus the part explained by W, floored
    # at DELTA.
    psi = numpy.maximum(var - numpy.sum(W ** 2, axis=0), DELTA)

    if numpy.abs(ll - old_ll) < LL_TOL:
        break
    old_ll = ll

In [None]:
# Project the rows of X onto the fitted components:
#     X_transformed = X . (W / psi)^T . (I + (W / psi) W^T)^-1
#
# X is a distributed RowMatrix, so it cannot be passed to numpy.dot or
# sliced with [:5, :]. The two small factors are multiplied locally and
# the product is applied to X with RowMatrix.multiply (Spark >= 2.2.0).
# NOTE(review): X is used as-is; no mean-centering is visible in this
# notebook -- confirm the input is already centred.
Ih = numpy.eye(len(W))
Wpsi = W / psi

# numpy.linalg.inv replaces scipy.linalg.inv: scipy was never imported,
# so the original line raised a NameError.
cov_z = numpy.linalg.inv(Ih + numpy.dot(Wpsi, W.T))

# Local projection matrix, one row per column of X.
projection = numpy.dot(Wpsi.T, cov_z)

# mllib's DenseMatrix takes its values in column-major order.
projection_matrix = pyspark.mllib.linalg.DenseMatrix(
    projection.shape[0],
    projection.shape[1],
    projection.ravel(order="F"),
)

X_transformed = X.multiply(projection_matrix)
X_transformed.rows.take(5)