In [1]:
import os
import sys
import pandas
import numpy, scipy, sklearn

import findspark
findspark.init("/usr/local/spark/spark")

import pyspark
from pyspark.sql.window import Window
import pyspark.sql.functions as func

from pyspark.rdd import reduce
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeansModel, KMeans
from pyspark.ml.linalg import SparseVector, VectorUDT, Vector, Vectors

In [2]:
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

In [3]:
file_name = "/Users/simondi/PHD/data/data/target_infect_x/query_data/cells_sample_10_normalized_cut_100.tsv"

In [4]:
conf = pyspark.SparkConf().setMaster("local[*]").set("spark.driver.memory", "4G").set("spark.executor.memory", "4G")
sc = pyspark.SparkContext(conf=conf)
spark = pyspark.sql.SparkSession(sc)

In [5]:
df = spark.read.csv(path=file_name, sep="\t", header='true')

In [167]:
data = df.toPandas()
data = data.drop(columns="cells.children_invasomes_count")

In [168]:
fc = list(filter(lambda x: x.startswith("cells"), data.columns))
data = data[fc]

In [169]:
data.to_csv("/Users/simondi/PHD/data/data/target_infect_x/query_data/cells_sample_10_normalized_cut_100_X.tsv", sep="\t", header=False, index=False)

In [8]:
data = data.as_matrix()
data = data.astype(numpy.float64)


In [161]:
# factor analysis
means = numpy.nanmean(data, axis=0)
std = numpy.nanstd(data, axis=0)
X = (data - means)

In [12]:
from sklearn import decomposition

In [162]:
fa = sklearn.decomposition.FactorAnalysis(2, svd_method="lapack")
Fact = fa.fit(X)

In [163]:
Fact.transform(X)[:5, :]

array([[-0.08807901, -0.8333121 ],
       [-0.35700883, -0.96917584],
       [-0.82875471,  0.05929437],
       [-0.79494674, -0.20735465],
       [-1.1996305 , -0.17172922]])

In [146]:
def my_svd(X, n_components):
    _ , s, V = scipy.linalg.svd(X, full_matrices=False)
    squared_norm = numpy.dot(s[n_components:], s[n_components:])
    return (s[:n_components], V[:n_components], squared_norm)

In [150]:
iter = 0
SMALL = 1e-12
loglik = old_ll = 0
n_samples, n_features = X.shape
psi = numpy.ones(P, dtype=X.dtype)
var = numpy.var(X, axis=0)
n_components = 2
nsqrt = numpy.sqrt(n_samples)

In [152]:
for i in range(1000):
    sqrt_psi = numpy.sqrt(psi) + SMALL
    s, V, unexp_var = my_svd(X / (sqrt_psi * nsqrt), 2)
    s **= 2
    W = numpy.sqrt(numpy.maximum(s - 1., 0.))[:, numpy.newaxis] * V
    del V
    W *= sqrt_psi
    ll = numpy.sum(numpy.log(s))
    ll += unexp_var + numpy.sum(numpy.log(psi))
    ll *= -n_samples / 2.
    if (ll - old_ll) < 1e-2:
        break
    old_ll = ll
    psi = numpy.maximum(var - numpy.sum(W ** 2, axis=0), SMALL)

In [157]:
Ih = numpy.eye(len(W))
X_transformed = X
Wpsi = W / psi
cov_z = scipy.linalg.inv(Ih + numpy.dot(Wpsi, W.T))
tmp = numpy.dot(X_transformed, Wpsi.T)
X_transformed = numpy.dot(tmp, cov_z)
X_transformed[:5, :]

array([[ 0.01905461,  0.90476007],
       [-0.28180853,  0.82604855],
       [-0.72927048,  0.17711718],
       [-0.65352372,  0.47391855],
       [-1.07675214,  0.06966787]])

In [166]:
Fact.components_

array([[  8.84579042e-02,   4.48903753e-02,   6.29599638e-02,
          2.00818213e-01,   9.20037571e-02,   5.13256633e-02,
         -1.02210910e-03,  -7.82051788e-02,  -1.02059728e+00,
          6.41297047e-02,  -2.46473092e-03,  -1.08409408e+00,
          3.66296997e-02,  -5.43790774e-02,  -4.80653186e-02,
         -1.08053242e+00,  -1.40764089e-03,   3.24452867e-02,
         -1.08706616e+00],
       [  8.75180387e-01,  -1.97812793e-01,  -9.59560052e-03,
         -2.09715735e-01,   6.64981413e-01,   9.24435361e-01,
          6.29650429e-01,   2.47479888e-02,   7.90739468e-02,
         -3.46580964e-02,   5.83038030e-02,   1.32297256e-02,
          5.53447857e-02,   8.29072948e-02,   6.11041395e-01,
         -4.70341367e-03,  -1.58692044e-01,   1.61244315e-01,
          3.81466198e-03]])

In [160]:
W

array([[ 0.06299374,  0.03947032,  0.08647512,  0.23274195,  0.06115416,
         0.02775261, -0.04053818, -0.07396935, -0.9158791 ,  0.08313671,
         0.01675355, -0.95766478,  0.0345847 , -0.06612737, -0.05731117,
        -0.95576302, -0.00594052,  0.03080484, -0.95941779],
       [-0.67587988,  0.17186437,  0.19117327,  0.40740905, -0.55531097,
        -0.7359917 , -0.64441216, -0.05558636, -0.01129259,  0.09297085,
         0.01889788,  0.01795876, -0.10965364, -0.12610757, -0.52589439,
         0.05104842,  0.20904419, -0.15223245,  0.03394243]])