In [163]:
import os
import sys
import pandas
import numpy

import findspark
findspark.init("/usr/local/spark/spark/")

import pyspark
from pyspark.sql.window import Window
import pyspark.sql.functions as func

from pyspark.rdd import reduce
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import VectorAssembler
import pyspark.mllib.linalg.distributed
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.ml.linalg import SparseVector, VectorUDT, Vector, Vectors
from pyspark.mllib.stat import Statistics

In [4]:
# Local Spark configuration: use every local core, 4G for driver and executor.
conf = (
    pyspark.SparkConf()
    .setMaster("local[*]")
    .set("spark.driver.memory", "4G")
    .set("spark.executor.memory", "4G")
)
# getOrCreate() hands back an already-running context instead of raising
# "Cannot run multiple SparkContexts at once" when this cell is re-executed.
# Note: when a context already exists, `conf` is not re-applied to it.
sc = pyspark.SparkContext.getOrCreate(conf=conf)
spark = pyspark.sql.SparkSession(sc)

In [5]:
# Display the version string of the Spark session created above.
spark.version

'2.2.0'

In [44]:
# Tab-separated input file whose first line holds the column names.
file_name = "/Users/simondi/PHD/data/data/target_infect_x/query_data/cells_sample_10_normalized_cut_100.tsv"
df = spark.read.options(sep="\t", header='true').csv(file_name)

In [45]:
# Replace "." in every column name with "_" (presumably because dots in
# column names interfere with Spark's column resolution -- confirm).
old_cols = df.columns
new_cols = [name.replace(".", "_") for name in old_cols]

# toDF renames all columns positionally in a single projection, instead of
# stacking one withColumnRenamed projection per column.
df = df.toDF(*new_cols)

# Cast the feature columns (identified by name prefix) to double in one
# select; all other columns pass through unchanged and column order is kept.
feature_prefixes = ("cells", "perin", "nucl")
df = df.select([
    df[name].cast("double").alias(name)
    if name.startswith(feature_prefixes)
    else df[name]
    for name in new_cols
])

# Replace nulls with 0 (e.g. values that could not be cast to double).
df = df.fillna(0)

In [46]:
# Feature columns are the ones whose name starts with one of these prefixes.
feature_columns = [
    name
    for name in df.columns
    if name.startswith(("cells", "perin", "nucl"))
]
# Pack the feature columns into a single vector column called 'features'.
assembler = VectorAssembler(outputCol='features', inputCols=feature_columns)
data = assembler.transform(df)

In [47]:
# Turn every Row of the selected feature columns into a plain Python list,
# then wrap the resulting RDD as a distributed RowMatrix.
rdd = data.select(*feature_columns).rdd.map(lambda row: list(row))
X = RowMatrix(rdd)

In [165]:
# Column-wise summary statistics over the feature rows in `rdd`.
summary = Statistics.colStats(rdd)

In [166]:
# Per-column variance from the colStats summary. In the recorded output one
# column has variance 0, i.e. it is constant in this sample.
summary.variance()

array([ 0.78434512,  1.17739852,  0.89355321,  1.04392896,  0.7218773 ,
        1.09173459,  0.74492437,  0.4759858 ,  0.        ,  1.15081103,
        0.99763152,  0.94775656,  1.19722718,  0.74309616,  0.90959581,
        0.96306527,  1.19621207,  0.90429383,  0.77339588,  1.19382168])

In [154]:
def svd(X, comps):
    """Run a full SVD on ``X`` and split the result at ``comps`` components.

    Parameters
    ----------
    X : object exposing ``numCols()`` and ``computeSVD(k, computeU=...)``
        (a ``RowMatrix`` in this notebook). The decomposition is requested
        with ``k = X.numCols()`` and without computing ``U``.
    comps : int
        Number of leading components to keep.

    Returns
    -------
    tuple
        ``(s, V, var)`` where ``s`` holds the first ``comps`` singular values,
        ``V`` the first ``comps`` rows of the transposed ``V`` matrix, and
        ``var`` the sum of squares of the remaining singular values.
    """
    decomposition = X.computeSVD(X.numCols(), computeU=False)
    singular_values = decomposition.s.toArray()
    # Transpose so that slicing rows selects the leading components.
    right_vectors = decomposition.V.toArray().T
    discarded = singular_values[comps:]
    unexplained = numpy.dot(discarded, discarded)
    return singular_values[:comps], right_vectors[:comps], unexplained

In [168]:
# Initial state for an iterative fit (the names suggest a factor-analysis
# style EM loop -- confirm against the cells that consume these values).
iter = 0  # NOTE(review): shadows the builtin iter(); name kept for later cells
delta = 1e-12
ll = old_ll = -numpy.inf
N, P = X.numRows(), X.numCols()  # N rows (samples), P columns (features)
# psi is a per-feature quantity, like `var` below, so it needs one entry per
# column (P). It was previously allocated with one entry per row (N).
psi = numpy.ones(P, dtype=numpy.float64)
# NOTE(review): Spark's colStats variance is presumably the unbiased sample
# variance (n - 1 denominator), unlike numpy.var's default -- confirm.
var = summary.variance()
nsqrt = numpy.sqrt(N)

n_components = 10
llconst = P * numpy.log(2. * numpy.pi) + n_components

In [120]:
# Wrap the first two rows of V in a RowMatrix and show them.
# NOTE(review): `V` is not defined at top level in any visible cell (it only
# exists as a local inside svd()); this cell relies on leftover kernel state
# and will fail with NameError on a fresh Restart & Run All.
RowMatrix(sc.parallelize(V.toArray()[:2])).rows.take(2)

[DenseVector([-0.034, -0.4417, -0.0843, 0.159, 0.0063, 0.1022, 0.0047, -0.0445, 0.1947, 0.1807, -0.0262, 0.0523, 0.1423, 0.1067, -0.1971, 0.1621, 0.765, 0.0468, -0.0285, 0.0]),
 DenseVector([-0.0227, 0.1068, 0.6125, 0.1978, 0.0993, 0.16, -0.3683, -0.1608, 0.2489, 0.1421, 0.1679, -0.1084, -0.0013, 0.1567, -0.3359, -0.3368, -0.0621, -0.0135, -0.0048, -0.0])]

In [91]:
# Number of columns (features) in the RowMatrix X.
X.numCols()

20