In [2]:
import os
import sys
import pandas
import numpy, scipy

import findspark
findspark.init("/opt/local/spark/spark")

import pyspark
from pyspark.sql.window import Window
import pyspark.sql.functions as func

from pyspark.rdd import reduce
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeansModel, KMeans
from pyspark.ml.linalg import SparseVector, VectorUDT, Vector, Vectors

In [3]:
from  pyspark.mllib.classification import LogisticRegressionWithLBFGS

In [4]:
from pyspark.mllib.linalg.distributed import RowMatrix, DenseMatrix

In [5]:
from pyspark.ml.regression import GeneralizedLinearRegression, LinearRegression
from pyspark.ml.classification import LogisticRegression

In [5]:
file_name = "/Users/simondi/PROJECTS/target_infect_x_project/src/tix-analysis/data/single_cell_imaging_features.tsv"

In [6]:
file_name = "/home/simon/PROJECTS/koios/data/single_cell_imaging_data.tsv"

In [7]:
conf = pyspark.SparkConf().setMaster("local[*]").set("spark.driver.memory", "4G").set("spark.executor.memory", "4G")
sc = pyspark.SparkContext(conf=conf)
spark = pyspark.sql.SparkSession(sc)

In [8]:
spark.version

'2.4.0'

In [9]:
df = spark.read.csv(file_name, sep="\t", header=True)

In [10]:
for i, x in enumerate(df.columns):
    if x.startswith("cells") or x.startswith("is_infected"):
        df = df.withColumn(x, df[x].cast("double"))
df = df.fillna(0)

In [11]:
cols = [x for x in df.columns if x.startswith("cell")]

In [12]:
assembler = VectorAssembler(
          inputCols=cols,
          outputCol="features")
df = assembler.transform(df)

In [13]:
df.take(1)

[Row(well='a06', gene='none', sirna='si00577689', image_idx='53', object_idx='32', is_infected=0.0, cells_areashape_area=-0.985555580708712, cells_areashape_eccentricity=1.21482809986764, cells_areashape_extent=-2.42399612026336, cells_areashape_formfactor=-1.82590787955417, cells_areashape_majoraxislength=-0.255003035999353, cells_areashape_minoraxislength=-1.47357246412869, cells_areashape_orientation=1.37596478303327, cells_areashape_perimeter=-0.0286053975769842, cells_children_bacteria_count=0.0, cells_children_invasomes_count=0.0, cells_location_center_x=-1.37954655665396, cells_location_center_y=-1.04610227770665, cells_neighbors_anglebetweenneighbors_2=1.14640446403333, cells_neighbors_firstclosestobjectnumber_2=-0.928043517811783, cells_neighbors_firstclosestxvector_2=0.38892244146247, cells_neighbors_firstclosestyvector_2=0.494825886590408, cells_neighbors_numberofneighbors_2=-1.01951908543517, cells_neighbors_percenttouching_2=-0.00303122589479362, cells_neighbors_secondclos

In [14]:
k = df.groupby("is_infected").count().toPandas()

In [27]:
min_cnt = int(k["count"].values.min())

In [28]:
min_cnt

26

In [29]:
res = "is_infected"

In [30]:
df.filter("{} == 0".format(res)).limit(min_cnt)

DataFrame[study: string, pathogen: string, library: string, design: string, replicate: string, plate: string, well: string, gene: string, sirna: string, well_type: string, image_idx: string, object_idx: string, cells_areashape_area: double, cells_areashape_eccentricity: double, cells_areashape_extent: double, cells_areashape_formfactor: double, cells_areashape_majoraxislength: double, cells_areashape_minoraxislength: double, cells_areashape_perimeter: double, cells_children_bacteria_count: double, cells_children_invasomes_count: double, cells_location_center_x: double, cells_location_center_y: double, cells_neighbors_anglebetweenneighbors_2: double, cells_neighbors_firstclosestobjectnumber_2: double, cells_neighbors_firstclosestxvector_2: double, cells_neighbors_firstclosestyvector_2: double, cells_neighbors_numberofneighbors_2: double, cells_neighbors_secondclosestobjectnumber_2: double, cells_neighbors_secondclosestxvector_2: double, cells_neighbors_secondclosestyvector_2: double, ce

In [179]:
glr = GeneralizedLinearRegression(maxIter=100, regParam=1, labelCol="is_infected", family="binomial", link="logit")

In [180]:
model = glr.fit(df)

In [15]:
from pyspark.ml.regression import RandomForestRegressor

In [16]:
forest = RandomForestRegressor(labelCol="is_infected", maxDepth=10, numTrees=100, subsamplingRate=.75)

In [17]:
fit = forest.fit(df)

In [24]:
predictions = fit.transform(df)

In [29]:
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(
    labelCol="is_infected", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)

In [27]:
rmse ** 2

2.1052063100712953e-05

In [30]:
from pyspark.ml.classification import RandomForestClassifier

In [31]:
forest = RandomForestClassifier(labelCol="is_infected", maxDepth=10, numTrees=100, subsamplingRate=.75, seed=23)

In [32]:
fit = forest.fit(df)

In [34]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [44]:
evaluator = MulticlassClassificationEvaluator(
    labelCol="is_infected", predictionCol="prediction", metricName=["precision", "f1"])
accuracy = evaluator.evaluate(predictions)

TypeError: Invalid param value given for param "metricName". Could not convert <class 'list'> to string type

In [43]:
accuracy

pyspark.ml.evaluation.MulticlassClassificationEvaluator

In [36]:
accuracy

0.56

In [51]:
spark.stop()