In [1]:
import os
import sys
import pandas
import numpy, scipy, sklearn

import findspark
findspark.init("/usr/local/spark/spark")

import pyspark
from pyspark.sql.window import Window
import pyspark.sql.functions as func

from pyspark.rdd import reduce
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeansModel, KMeans
from pyspark.ml.linalg import SparseVector, VectorUDT, Vector, Vectors

In [2]:
from pyspark.sql.functions import udf, col, struct
from pyspark.sql.types import ArrayType, DoubleType, StringType
from pyspark.mllib.linalg.distributed import RowMatrix, DenseMatrix

from pyspark.mllib.stat import Statistics

In [3]:
from numpy import linalg
from scipy import stats

In [4]:
# Local Spark on all available cores; driver and executors are each given 4G.
conf = (
    pyspark.SparkConf()
    .setMaster("local[*]")
    .set("spark.driver.memory", "4G")
    .set("spark.executor.memory", "4G")
)
# getOrCreate() reuses a context that is already running, so re-executing this
# cell no longer fails with "Cannot run multiple SparkContexts at once".
# Note that an already running context keeps its own configuration.
sc = pyspark.SparkContext.getOrCreate(conf=conf)
spark = pyspark.sql.SparkSession(sc)

In [5]:
# Directory of the full data set. The root folder can be overridden through the
# TIX_QUERY_DATA_DIR environment variable so the notebook is not tied to one
# user's home directory; without it the original absolute path is used.
file_name = os.path.join(
    os.environ.get(
        "TIX_QUERY_DATA_DIR",
        "/Users/simondi/PROJECTS/target_infect_x_project/data/target_infect_x/query_data",
    ),
    "cells_sample_10_normalized_cut_100_factors",
    "",  # empty last component keeps the trailing separator of the original path
)

In [7]:
# Directory of the "_red" variant of the data set. The root folder can be
# overridden through the TIX_QUERY_DATA_DIR environment variable; without it the
# original absolute path is used.
file_name_red = os.path.join(
    os.environ.get(
        "TIX_QUERY_DATA_DIR",
        "/Users/simondi/PROJECTS/target_infect_x_project/data/target_infect_x/query_data",
    ),
    "cells_sample_10_normalized_cut_100_factors_red",
    "",  # empty last component keeps the trailing separator of the original path
)

In [9]:
# Read both parquet directories into Spark DataFrames.
df = spark.read.load(file_name, format="parquet")
df2 = spark.read.load(file_name_red, format="parquet")

In [11]:
# Fetch a single row of `df` to inspect its columns and values.
df.take(1)

[Row(study='infectx_published', pathogen='listeria', library='a', design='u', replicate='1', plate='kb2-02-1x', well='a01', gene='chka', sirna='s3008', well_type='sirna', image_idx='4', object_idx='144', cells_areashape_area=-0.8044235519947214, cells_areashape_eccentricity=0.01210726352227413, cells_areashape_extent=1.1159023326616404, cells_areashape_formfactor=1.4749286136850854, cells_areashape_majoraxislength=-0.9368530225439791, cells_areashape_minoraxislength=-0.7485320762193192, cells_areashape_perimeter=-1.0209269921587854, cells_children_bacteria_count=-0.7030171063989196, cells_children_invasomes_count=0.0, cells_location_center_x=0.44758084788204255, cells_location_center_y=1.2808903222918113, cells_neighbors_anglebetweenneighbors_2=1.3915760955745562, cells_neighbors_firstclosestobjectnumber_2=0.14886791955020456, cells_neighbors_firstclosestxvector_2=0.6694456185473984, cells_neighbors_firstclosestyvector_2=-1.233468444008488, cells_neighbors_numberofneighbors_2=-0.082518

In [12]:
# Fetch a single row of `df2` to inspect its columns and values.
df2.take(1)

[Row(study='infectx_published', pathogen='listeria', library='a', design='u', replicate='1', plate='kb2-02-1x', well='a01', gene='chka', sirna='s3008', well_type='sirna', image_idx='4', object_idx='144', cells_areashape_area=-0.8044235519947214, cells_areashape_eccentricity=0.01210726352227413, cells_areashape_extent=1.1159023326616404, cells_areashape_formfactor=1.4749286136850854, cells_areashape_majoraxislength=-0.9368530225439791, cells_areashape_minoraxislength=-0.7485320762193192, cells_areashape_perimeter=-1.0209269921587854, cells_children_bacteria_count=-0.7030171063989196, cells_children_invasomes_count=0.0, cells_location_center_x=0.44758084788204255, cells_location_center_y=1.2808903222918113, cells_neighbors_anglebetweenneighbors_2=1.3915760955745562, cells_neighbors_firstclosestobjectnumber_2=0.14886791955020456, cells_neighbors_firstclosestxvector_2=0.6694456185473984, cells_neighbors_firstclosestyvector_2=-1.233468444008488, cells_neighbors_numberofneighbors_2=-0.082518

In [7]:
# Keep only the columns that are not measurements or the feature vector, i.e.
# whose name starts with none of the prefixes below. The prefix "cells" already
# covers "cells_children_invasomes_count", so it needs no entry of its own.
desc_columns = [
    name
    for name in df.columns
    if not name.startswith(("cells", "perin", "nucl", "features"))
]
desc_columns

['study',
 'pathogen',
 'library',
 'design',
 'replicate',
 'plate',
 'well',
 'gene',
 'sirna',
 'well_type',
 'image_idx',
 'object_idx']

In [59]:
def to_array(col):
    """Return a column holding the entries of vector column ``col`` as an array of doubles."""
    element_type = ArrayType(DoubleType())

    def to_array_(vec):
        # Vector -> numpy array -> plain Python list of floats.
        dense = vec.toArray()
        return dense.tolist()

    as_list = udf(to_array_, element_type)
    return as_list(col)

# Number of entries in the "features" vector, read off the first row.
len_vec = len(df.select("features").take(1)[0][0])

# Expand the vector into one scalar column per entry, named f_0 ... f_<len_vec-1>,
# appended after the original columns. Aliasing inside the select produces the
# final names directly, so no rename pass (one extra projection per column) is
# needed afterwards.
df2 = (
    df.withColumn("f", to_array(col("features")))
    .select(df.columns + [col("f")[i].alias("f_{}".format(i)) for i in range(len_vec)])
)


In [60]:
# Show the names of the expanded feature columns.
[
    name
    for name in df2.columns
    if name.startswith("f_")
]

['f_0', 'f_1', 'f_2', 'f_3', 'f_4', 'f_5', 'f_6', 'f_7', 'f_8', 'f_9']

In [61]:
# Names of the expanded feature columns, in the order they appear in df2.
f_cols = [name for name in df2.columns if name.startswith("f_")]

In [62]:
# One plain list of feature values per row.
rdd = df2.select(*f_cols).rdd.map(lambda row: list(row))
# Column-wise summary statistics of those rows.
summary = Statistics.colStats(rdd)
# Per-column means of the feature columns.
means = summary.mean()

In [63]:
# Subtract the column means from every row and wrap the centred rows in a
# distributed RowMatrix.
X = RowMatrix(
    RowMatrix(rdd).rows.map(lambda row: row - means)
)

In [64]:
# Covariance matrix of the columns of X, as computed by RowMatrix.
Cov = X.computeCovariance() 

In [65]:
# Precision matrix = inverse of the covariance matrix, as a local numpy array.
# `linalg` is numpy.linalg (bound by `from numpy import linalg`).
Precision = linalg.inv(Cov.toArray())
Precision

array([[  4.61483837,  -1.21970939,  -2.03988054,   1.57684704,
         -8.78183355,   2.48015193,  -0.33176774,   1.42871069,
          0.29384616,   3.26571233],
       [ -1.21970939,   4.66383626,   1.26470071,  -1.63242464,
         -3.49082448,  -0.66449692,  -1.68349341,  -4.21262924,
         -0.79365823,  -0.88271342],
       [ -2.03988054,   1.26470071,   3.41135463,  -0.80188545,
          6.57393891,  -6.20878519,   0.18442818,  -1.7231523 ,
         -0.4211366 ,  -2.9122063 ],
       [  1.57684704,  -1.63242464,  -0.80188545,   5.83111469,
         -5.46323645,   7.88573214,  -0.07247254,  -0.41022297,
          0.14981177,   2.13959539],
       [ -8.78183355,  -3.49082448,   6.57393891,  -5.46323645,
         54.82855389, -19.59359531,   6.49964727,  10.98076842,
          1.83822807, -10.1653187 ],
       [  2.48015193,  -0.66449692,  -6.20878519,   7.88573214,
        -19.59359531,  33.33047657,  -0.06396646,  -1.96622908,
         -0.48827882,   9.63678612],
       [ -

In [67]:
# Dimensions of the precision matrix.
Precision.shape

(10, 10)

In [14]:
def to_array(col, precision=None, center=None):
    """Return a double column with the quadratic form ``x' P x`` of each vector in ``col``.

    Despite its name this function does not build an array: it re-uses (and
    shadows) the name of the earlier vector-to-list helper and yields one
    scalar per row.

    Parameters
    ----------
    col : Column
        Column of vectors (objects exposing ``toArray()``).
    precision : array-like, optional
        Square weight matrix ``P``. Defaults to the module-level ``Precision``.
    center : array-like, optional
        Vector subtracted from each row before the quadratic form is taken.
        The default ``None`` uses the raw vector, which is what the original
        implementation did.
        NOTE(review): ``Precision`` is derived from mean-centred rows, so a
        squared Mahalanobis distance in the strict sense needs the feature
        means passed here - confirm whether the features are already centred.

    Returns
    -------
    Column
        Double column; null where the input vector is null.
    """
    weights = numpy.asarray(Precision if precision is None else precision, dtype=float)
    offset = None if center is None else numpy.asarray(center, dtype=float)

    def to_array_(v):
        # Null vectors yield null instead of crashing the whole job.
        if v is None:
            return None
        arr = v.toArray()
        if offset is not None:
            # Guard against silent numpy broadcasting with a wrongly sized centre.
            if offset.shape != arr.shape:
                raise ValueError(
                    "center has shape {} but the vector has shape {}".format(offset.shape, arr.shape)
                )
            arr = arr - offset
        return float(arr.dot(weights).dot(arr))
    return udf(to_array_, DoubleType())(col)

In [15]:
# Add a "maha" column computed by to_array() from the "features" column.
# NOTE(review): this relies on the most recently executed definition of
# to_array (presumably the quadratic-form version) and rebinds `df` in place.
df = df.withColumn("maha", to_array(col("features")))

In [16]:
# Summary statistics of the single "maha" column.
# Note: this rebinds rdd, summary and means to the "maha" values.
rdd = df.select("maha").rdd.map(lambda row: list(row))
summary = Statistics.colStats(rdd)
means = summary.mean()
var = summary.variance()

In [68]:
# 97.5% quantile of the chi-squared distribution whose degrees of freedom equal
# the dimension of the precision matrix. `stats` is scipy.stats.
quant = stats.chi2.ppf(0.975, df=Precision.shape[0])

In [69]:
# Display the chi-squared cut-off value.
quant

20.48317735080739

In [23]:
# Number of rows whose "maha" value lies strictly below the chi-squared cut-off.
df.where(col("maha") < quant).count()

97

In [24]:
# Shut down the Spark session once the analysis is finished.
spark.stop()
