In [1]:
import os
import sys
import pandas
import numpy, scipy, sklearn

import findspark
findspark.init("/usr/local/spark/spark")

import pyspark
from pyspark.sql.window import Window
import pyspark.sql.functions as func

from pyspark.rdd import reduce
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeansModel, KMeans
from pyspark.ml.linalg import SparseVector, VectorUDT, Vector, Vectors

In [5]:
# Location of the input data that is later loaded with spark.read.csv (tab-separated, with header).
# NOTE(review): absolute, user-specific path -- will not resolve on another machine.
file_name = "/Users/simondi/PROJECTS/target_infect_x_project/results/2-analysis/1-fa/kmeans_fa/all_optimal_from_file_feature_dbq_250_cells_100"

In [3]:
# Local-mode Spark configuration with 4G of memory for both driver and executor.
conf = (
    pyspark.SparkConf()
    .setMaster("local[*]")
    .set("spark.driver.memory", "4G")
    .set("spark.executor.memory", "4G")
)
# Create the context from that configuration and wrap it in a SQL session.
sc = pyspark.SparkContext(conf=conf)
spark = pyspark.sql.SparkSession(sc)

In [4]:
# Display the Spark version of the running session.
spark.version

'2.2.0'

In [13]:
# Load the tab-separated input (first line is the header) into a Spark DataFrame.
# No schema / inferSchema option is passed here; the values are cast to float64
# later, after the conversion to pandas.
df = spark.read.csv(path=file_name, sep="\t", header='true')

[Row(study='group_cossart', pathogen='listeria', library='d', design='p', replicate='1', plate='dz01-1k', well='a01', gene='atp6v1a', sirna='l-017590-01', well_type='control', image_idx='3', object_idx='196', cells_children_bacteria_count=-0.6348524588490496, cells_children_invasomes_count=0.0, cells_parent_nuclei=1.637146964522227, nuclei_areashape_area=1.331986032373456, nuclei_areashape_eccentricity=0.6991364038587189, nuclei_areashape_extent=1.0779592159992262, nuclei_areashape_formfactor=0.5356999999952932, nuclei_areashape_majoraxislength=1.2761067493510043, nuclei_areashape_minoraxislength=0.6184093315089785, nuclei_areashape_orientation=1.5058170723130202, nuclei_areashape_perimeter=0.849385606533167, nuclei_children_perinuclei_count=0.0, nuclei_location_center_x=1.3666459116451455, nuclei_location_center_y=0.39182218594382734, perinuclei_areashape_area=1.2084589080433021, perinuclei_areashape_eccentricity=0.3005343851706547, perinuclei_areashape_formfactor=-0.16934123324327918

In [6]:
# Collect the Spark DataFrame on the driver as a pandas frame and discard one column.
# NOTE(review): the dropped column name contains a dot ("cells.children_...");
# confirm that this matches the header of the loaded file.
data = df.toPandas().drop(columns="cells.children_invasomes_count")

In [7]:
# Restrict the frame to the columns whose name begins with "cells".
fc = [column for column in data.columns if column.startswith("cells")]
data = data[fc]

In [8]:
# Export the selected columns as tab-separated values, without header row or index column.
# NOTE(review): absolute, user-specific output path.
data.to_csv("/Users/simondi/PHD/data/data/target_infect_x/query_data/cells_sample_10_normalized_cut_100_X.tsv", sep="\t", header=False, index=False)

In [9]:
# Convert the pandas frame into a float64 NumPy array.
# numpy.asarray replaces DataFrame.as_matrix(), which is deprecated and no longer
# exists in pandas >= 1.0. It also accepts an ndarray, so re-running this cell
# after `data` has already been converted does not fail.
data = numpy.asarray(data, dtype=numpy.float64)

In [18]:
# factor analysis
# Column-wise mean and NaN-ignoring standard deviation; neither is applied to X below.
means = data.mean(axis=0)
std = numpy.nanstd(data, axis=0)
# The matrix used for the factor analysis is the data exactly as loaded.
X = data

In [11]:
from sklearn import decomposition

In [24]:
# Reference model: scikit-learn factor analysis with two components and the
# "lapack" SVD method, fitted on X.
fa = sklearn.decomposition.FactorAnalysis(n_components=2, svd_method="lapack")
Fact = fa.fit(X)

In [19]:
# Inspect the input matrix handed to the factor analysis.
X

array([[-0.80442355,  0.01210726,  1.11590233, ..., -0.50782963,
         1.24553725,  0.13572336],
       [-0.94741708, -0.03987198,  0.12623638, ..., -1.06214707,
        -0.58986888,  0.42900039],
       [-0.08386401,  1.12391128,  1.76111788, ...,  0.65419376,
        -0.65290985,  0.94223519],
       ..., 
       [-0.22605869, -0.02048565, -1.18489627, ...,  0.95339164,
        -0.90274757,  1.69986752],
       [-0.12180922,  0.98090367, -1.03566249, ..., -1.15072905,
        -0.38057235, -1.35510153],
       [-1.07563194,  1.32405374,  0.52578998, ..., -1.02782875,
         0.09378172,  1.32105136]])

In [45]:
# First five rows of X transformed by the fitted scikit-learn model (reference output).
Fact.transform(X)[:5, :]

array([[-0.12770892, -0.9039069 ],
       [-0.39663874, -1.03977064],
       [-0.86838462, -0.01130043],
       [-0.83457665, -0.27794946],
       [-1.23926041, -0.24232402]])

In [34]:
def my_svd(X, n_components):
    """Return the leading part of the SVD of ``X`` plus the leftover squared norm.

    Parameters
    ----------
    X : 2-D array
        Matrix to decompose.
    n_components : int
        Number of leading singular values / right singular vectors to keep.

    Returns
    -------
    tuple
        ``(s, V, squared_norm)`` where ``s`` holds the first ``n_components``
        singular values, ``V`` the matching rows of the right singular vector
        matrix, and ``squared_norm`` the sum of squares of the remaining
        singular values.
    """
    # Economy-size SVD: the left singular vectors are discarded anyway, so there
    # is no reason to build the full (n_rows x n_rows) basis. The singular
    # values and the leading rows of V are the same as with full_matrices=True.
    _, s, V = scipy.linalg.svd(X, full_matrices=False)
    remainder = s[n_components:]
    squared_norm = numpy.dot(remainder, remainder)
    return (s[:n_components], V[:n_components], squared_norm)

In [33]:
# Top-2 singular values, matching right singular vectors and leftover squared
# norm of the raw input matrix, shown for inspection.
s, V, unexp_var = my_svd(X, n_components=2)
(s, V, unexp_var)

(array([ 21.65266028,  18.2892159 ]),
 array([[ 0.0340333 ,  0.02268537,  0.04267166,  0.11936017,  0.03393101,
          0.01474167, -0.01928034, -0.03671472, -0.47661886,  0.0439692 ,
          0.00712674, -0.49919559,  0.01784548, -0.03531952, -0.03146012,
         -0.49798872, -0.00225486,  0.01623744, -0.50000802],
        [-0.44170891,  0.1068268 ,  0.12974732,  0.2699105 , -0.36552517,
         -0.47879096, -0.42314363, -0.0395485 , -0.00887609,  0.05972258,
          0.01474232,  0.01129729, -0.07094486, -0.08024128, -0.33947188,
          0.03247905,  0.13474579, -0.09976371,  0.02152992]]),
 982.15165778690869)

In [107]:
# Initial state for the iterative fit in the next cell.
# (The former `iter = 0` was removed: it was never read and shadowed the builtin iter().)
SMALL = 1e-12  # floor that keeps psi and sqrt(psi) strictly positive
ll = old_ll = -numpy.inf  # current / previous objective value for the stopping test
n_samples, n_features = X.shape
psi = numpy.ones(n_features, dtype=X.dtype)  # per-feature values refined by the loop
var = numpy.var(X, axis=0)  # per-feature variance of the input
n_components = 2
nsqrt = numpy.sqrt(n_samples)
llconst = n_features * numpy.log(2. * numpy.pi) + n_components
print(nsqrt)

10.0


In [108]:
# Iteratively refine W (n_components x n_features) and psi, at most five rounds.
for i in range(5):
    print(i)
    # Scale each feature by sqrt(psi) (plus a tiny offset to avoid dividing by zero)
    # and by sqrt(n_samples) before decomposing.
    sqrt_psi = numpy.sqrt(psi) + SMALL
    # Use n_components rather than a literal 2 so the decomposition stays
    # consistent with llconst, which is also derived from n_components.
    s, V, unexp_var = my_svd(X / (sqrt_psi * nsqrt), n_components)
    s **= 2
    # Singular values whose square is below 1 contribute nothing to W.
    W = numpy.sqrt(numpy.maximum(s - 1., 0.))[:, numpy.newaxis] * V
    W *= sqrt_psi
    # Objective value for this round.
    ll = llconst + numpy.sum(numpy.log(s))
    ll += unexp_var + numpy.sum(numpy.log(psi))
    ll *= -n_samples / 2.
    # New psi: input variance minus what W accounts for, floored at SMALL.
    psi = numpy.maximum(var - numpy.sum(W ** 2, axis=0), SMALL)
    # Stop early once the objective changes by less than 0.001.
    if numpy.abs(ll - old_ll) < 0.001:
        break
    old_ll = ll

0
1
2
3
4


In [109]:
# Project X onto the components using W and psi produced by the fit loop.
# NOTE(review): X is used without subtracting the column means, and the first
# rows differ slightly from the Fact.transform(X) output shown earlier --
# confirm whether centering (or more iterations) was intended.
Ih = numpy.eye(W.shape[0])
X_transformed = X
Wpsi = W / psi
cov_z = scipy.linalg.inv(Ih + Wpsi.dot(W.T))
tmp = X_transformed.dot(Wpsi.T)
X_transformed = tmp.dot(cov_z)
# Show the first five transformed rows.
X_transformed[:5, :]

array([[-0.08085689, -0.88416287],
       [-0.32554325, -0.9936295 ],
       [-0.85995564,  0.00164118],
       [-0.80788002, -0.29778315],
       [-1.21530389, -0.14961542]])

In [14]:
# Stop the Spark session once the data has been collected into pandas.
spark.stop()