In [2]:
import os
import sys
import pandas
import numpy, scipy, sklearn

import findspark
findspark.init("/usr/local/spark/spark")

import pyspark
from pyspark.sql.window import Window
import pyspark.sql.functions as func

from pyspark.rdd import reduce
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeansModel, KMeans
from pyspark.ml.linalg import SparseVector, VectorUDT, Vector, Vectors

In [3]:
from pyspark.ml.regression import GeneralizedLinearRegression, LinearRegression

In [6]:
# Location of the Parquet data that is loaded into `df` further down.
file_name = (
    "/Users/simondi/PROJECTS/target_infect_x_project/data/target_infect_x/"
    "query_data/cells_sample_10_normalized_cut_100_factors/"
)

In [4]:
# Build a local-mode Spark configuration with 4G each for driver and executor,
# then create the SparkContext and wrap it in a SparkSession.
conf = (
    pyspark.SparkConf()
    .setMaster("local[*]")
    .set("spark.driver.memory", "4G")
    .set("spark.executor.memory", "4G")
)
sc = pyspark.SparkContext(conf=conf)
spark = pyspark.sql.SparkSession(sc)

In [5]:
# Display the version string reported by the running Spark session.
spark.version

'2.3.0'

In [7]:
# Load the Parquet data found at `file_name` into a Spark DataFrame.
df = spark.read.parquet(file_name)

In [8]:
# Peek at the first row to inspect the loaded columns and their values.
df.take(1)

[Row(study='infectx_published', pathogen='listeria', library='a', design='u', replicate='1', plate='kb2-02-1x', well='a01', gene='chka', sirna='s3008', well_type='sirna', image_idx='4', object_idx='144', cells_areashape_area=-0.8044235519947214, cells_areashape_eccentricity=0.01210726352227413, cells_areashape_extent=1.1159023326616404, cells_areashape_formfactor=1.4749286136850854, cells_areashape_majoraxislength=-0.9368530225439791, cells_areashape_minoraxislength=-0.7485320762193192, cells_areashape_perimeter=-1.0209269921587854, cells_children_bacteria_count=-0.7030171063989196, cells_children_invasomes_count=0.0, cells_location_center_x=0.44758084788204255, cells_location_center_y=1.2808903222918113, cells_neighbors_anglebetweenneighbors_2=1.3915760955745562, cells_neighbors_firstclosestobjectnumber_2=0.14886791955020456, cells_neighbors_firstclosestxvector_2=0.6694456185473984, cells_neighbors_firstclosestyvector_2=-1.233468444008488, cells_neighbors_numberofneighbors_2=-0.082518

In [9]:
# Copy every column whose name starts with "cells_children_bacteria_count"
# into a double-typed "label" column (if several match, the last one wins),
# then replace nulls with 0.
for column_name in df.columns:
    if column_name.startswith("cells_children_bacteria_count"):
        df = df.withColumn("label", df[column_name].cast("double"))
df = df.fillna(0)

In [10]:
# Linear regression estimator with regularisation switched off
# (regParam=0, elasticNetParam=0) and at most 100 iterations.
lr = LinearRegression(
    maxIter=100,
    regParam=0,
    elasticNetParam=0,
)

In [11]:
# Fit the linear regression on the full DataFrame (no train/test split here).
# NOTE(review): assumes `df` already carries a "features" vector column; it is
# not visible in the truncated `df.take(1)` output -- TODO confirm.
f = lr.fit(df)

In [38]:
# Bind the fitted model to `model`, the name the following cells use.
model = f

In [42]:
# Show the residuals from the model's training summary as a pandas DataFrame.
# NOTE(review): toPandas() collects every residual onto the driver; consider
# limiting the rows first if the training set is large.
model.summary.residuals.toPandas()

Unnamed: 0,residuals
0,0.001736
1,-0.206012
2,0.377601
3,-0.144873
4,0.262813
5,-0.524857
6,-0.045906
7,0.016271
8,-0.134229
9,-0.347510


In [23]:
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

In [24]:
# Seeded 90/10 random split into training and test sets.
train, test = df.randomSplit(weights=[0.9, 0.1], seed=23)

In [25]:
# Refit the linear regression on the training split only; this rebinds `model`.
model = lr.fit(train)

In [30]:
# Score the test split and show the first ten label/prediction pairs.
(
    model
    .transform(test)
    .select(["label", "prediction"])
    .take(10)
)

[Row(label=-0.6324046272963104, prediction=-0.6490031713160578),
 Row(label=1.1329073502689189, prediction=0.9971040599152651),
 Row(label=-0.6324046272963104, prediction=-0.9035112926007549),
 Row(label=-0.7030171063989196, prediction=-0.5832485615603501),
 Row(label=1.3447447875767464, prediction=0.8272917641961595),
 Row(label=0.4973950383454364, prediction=0.9133392481923913),
 Row(label=-0.7030171063989196, prediction=-0.9633648063317238),
 Row(label=0.7798449547558731, prediction=0.5998874398452572),
 Row(label=-0.7030171063989196, prediction=-0.706635072049074),
 Row(label=0.14433264283239053, prediction=-0.32669901976554777)]

In [45]:
# Generalized linear model with a Gaussian family and identity link,
# no regularisation, and at most 100 iterations.
glr = GeneralizedLinearRegression(
    maxIter=100,
    regParam=0,
    family="gaussian",
    link="identity",
)

In [47]:
# Fit the GLM on the training split; this rebinds `model` to the GLM fit.
model = glr.fit(train)

In [48]:
# Score the test split with the current `model` and print label vs. prediction.
(
    model
    .transform(test)
    .select(["label", "prediction"])
    .show()
)

+--------------------+--------------------+
|               label|          prediction|
+--------------------+--------------------+
| -0.6324046272963104| -0.6490031713160578|
|  1.1329073502689189|  0.9971040599152651|
| -0.6324046272963104| -0.9035112926007549|
| -0.7030171063989196| -0.5832485615603501|
|  1.3447447875767464|  0.8272917641961595|
|  0.4973950383454364|  0.9133392481923913|
| -0.7030171063989196| -0.9633648063317238|
|  0.7798449547558731|  0.5998874398452572|
| -0.7030171063989196|  -0.706635072049074|
| 0.14433264283239053|-0.32669901976554777|
|-0.27934223178326456|-0.22168777559232367|
|  0.7798449547558731|  0.7415063196478057|
|  0.2855576010376089|  1.1152499761266965|
| 0.07372016372978137| -0.2530712824875086|
| -0.5617921481937013|   -0.51768899484668|
|  0.9916823920637007|  0.9348901148852727|
| -0.6324046272963104|-0.39129522158481844|
+--------------------+--------------------+



In [50]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.regression import RandomForestRegressor

In [66]:
# Random-forest regressor reading the "features" column and predicting "label":
# 100 trees, depth up to 15, each tree trained on half of the rows.
# The seed is fixed so the forest is reproducible across kernel restarts; it
# reuses the value passed to `randomSplit` for the train/test split.
rf = RandomForestRegressor(
    labelCol="label",
    featuresCol="features",
    numTrees=100,
    maxDepth=15,
    subsamplingRate=.5,
    seed=23,
)

In [67]:
# Train the random forest on the training split.
rft = rf.fit(train)

In [68]:
# Score the test split with the fitted forest and print label vs. prediction.
(
    rft
    .transform(test)
    .select(["label", "prediction"])
    .show()
)

+--------------------+--------------------+
|               label|          prediction|
+--------------------+--------------------+
| -0.6324046272963104|-0.32100359445380405|
|  1.1329073502689189|   0.271435105217087|
| -0.6324046272963104| -0.3718445794076824|
| -0.7030171063989196|-0.48411842118083087|
|  1.3447447875767464|  0.4465540533915579|
|  0.4973950383454364|-0.00254131370103...|
| -0.7030171063989196|-0.44245705851029166|
|  0.7798449547558731| -0.2758116078281342|
| -0.7030171063989196|-0.35207308525895187|
| 0.14433264283239053|-0.36972620503460435|
|-0.27934223178326456|-0.24897886576914266|
|  0.7798449547558731| 0.26931673084400887|
|  0.2855576010376089|  0.4048926907210186|
| 0.07372016372978137| -0.2659258607537688|
| -0.5617921481937013| -0.4954164178372482|
|  0.9916823920637007| 0.28838210020171323|
| -0.6324046272963104| -0.3767874529448651|
+--------------------+--------------------+



In [69]:
# Defer the Spark shutdown to interpreter exit instead of stopping here:
# the cells that follow still read data through `spark`, so stopping the
# session at this point makes them fail when the notebook is run top to bottom.
import atexit

atexit.register(spark.stop)

In [45]:
# Path of the tab-separated sample file read into `d` below.
fd = (
    "/Users/simondi/PROJECTS/target_infect_x_project/src/tix-analysis/"
    "data/single_cell_samples.tsv"
)

In [46]:
# Read the tab-separated file at `fd`, taking column names from its header row.
d = spark.read.csv(
    path=fd,
    sep="\t",
    header=True,
)

In [49]:
# Peek at the first row of the CSV-backed DataFrame.
d.take(1)

[Row(study='infectx_published', pathogen='listeria', library='a', design='u', replicate='1', plate='kb2-02-1x', well='a01', gene='chka', sirna='s3008', well_type='sirna', image_idx='4', object_idx='144', cells_areashape_area='-0.8044235519947214', cells_areashape_eccentricity='0.01210726352227413', cells_areashape_extent='1.1159023326616404', cells_areashape_formfactor='1.4749286136850854', cells_areashape_majoraxislength='-0.9368530225439791', cells_areashape_minoraxislength='-0.7485320762193192', cells_areashape_perimeter='-1.0209269921587854', cells_children_bacteria_count='-0.7030171063989196', cells_children_invasomes_count=None, cells_location_center_x='0.44758084788204255', cells_location_center_y='1.2808903222918113', cells_neighbors_anglebetweenneighbors_2='1.3915760955745562', cells_neighbors_firstclosestobjectnumber_2='0.14886791955020456', cells_neighbors_firstclosestxvector_2='0.6694456185473984', cells_neighbors_firstclosestyvector_2='-1.233468444008488', cells_neighbors_

In [52]:
# Turn each Row of the two selected columns into a NumPy array and show the first.
(
    d.select(["cells_neighbors_secondclosestyvector_2", "cells_parent_nuclei"])
    .rdd
    .map(numpy.array)
    .take(1)
)

[array(['1.2455372476249023', '0.13572336470424068'], dtype='<U19')]

In [53]:
# List the column names of the CSV-backed DataFrame.
d.columns

['study',
 'pathogen',
 'library',
 'design',
 'replicate',
 'plate',
 'well',
 'gene',
 'sirna',
 'well_type',
 'image_idx',
 'object_idx',
 'cells_areashape_area',
 'cells_areashape_eccentricity',
 'cells_areashape_extent',
 'cells_areashape_formfactor',
 'cells_areashape_majoraxislength',
 'cells_areashape_minoraxislength',
 'cells_areashape_perimeter',
 'cells_children_bacteria_count',
 'cells_children_invasomes_count',
 'cells_location_center_x',
 'cells_location_center_y',
 'cells_neighbors_anglebetweenneighbors_2',
 'cells_neighbors_firstclosestobjectnumber_2',
 'cells_neighbors_firstclosestxvector_2',
 'cells_neighbors_firstclosestyvector_2',
 'cells_neighbors_numberofneighbors_2',
 'cells_neighbors_secondclosestobjectnumber_2',
 'cells_neighbors_secondclosestxvector_2',
 'cells_neighbors_secondclosestyvector_2',
 'cells_parent_nuclei']

In [94]:
# Cast the two cell-centre coordinate columns from string to float, in place.
for coord_col in ("cells_location_center_x", "cells_location_center_y"):
    d = d.withColumn(coord_col, d[coord_col].cast("float"))

In [95]:
# Show (column, type) pairs to confirm the casts on the two coordinate columns.
d.dtypes

[('study', 'string'),
 ('pathogen', 'string'),
 ('library', 'string'),
 ('design', 'string'),
 ('replicate', 'string'),
 ('plate', 'string'),
 ('well', 'string'),
 ('gene', 'string'),
 ('sirna', 'string'),
 ('well_type', 'string'),
 ('image_idx', 'string'),
 ('object_idx', 'string'),
 ('cells_areashape_area', 'string'),
 ('cells_areashape_eccentricity', 'string'),
 ('cells_areashape_extent', 'string'),
 ('cells_areashape_formfactor', 'string'),
 ('cells_areashape_majoraxislength', 'string'),
 ('cells_areashape_minoraxislength', 'string'),
 ('cells_areashape_perimeter', 'string'),
 ('cells_children_bacteria_count', 'string'),
 ('cells_children_invasomes_count', 'string'),
 ('cells_location_center_x', 'float'),
 ('cells_location_center_y', 'float'),
 ('cells_neighbors_anglebetweenneighbors_2', 'string'),
 ('cells_neighbors_firstclosestobjectnumber_2', 'string'),
 ('cells_neighbors_firstclosestxvector_2', 'string'),
 ('cells_neighbors_firstclosestyvector_2', 'string'),
 ('cells_neighbors_

In [104]:
"cells_parent_nuclei" in column_types

True

In [98]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# Pack the single x-coordinate column into a "features" vector column and
# apply the assembler to `d`, keeping the result as `dn`.
assembler = VectorAssembler(inputCols=["cells_location_center_x"], outputCol="features")
dn = assembler.transform(d)

In [106]:
# Check that the first row's assembled "features" vector converts to a
# float64 NumPy array.
numpy.array(dn.select("features").take(1)[0][0]).dtype == "float64"

True