In [1]:
from functools import reduce

import numpy as np
from scipy.stats import chi2, norm
from pyspark.mllib.clustering import GaussianMixture
from pyspark.mllib.linalg import Vectors

In [2]:
import pyspark
# Reuse the already-running SparkContext when this cell is executed again;
# constructing a second context in the same kernel raises
# "ValueError: Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))

## Multivariate Gaussian in Eigenbasis

In [3]:
def svd_pinv(m, rcond=None):
  """Return the Moore-Penrose pseudo-inverse of a 2-D matrix via its SVD.

  Singular values at or below ``rcond * largest_singular_value`` are treated
  as zero, so rank-deficient input yields a finite pseudo-inverse instead of
  inf/NaN entries. A reduced SVD is used, so non-square matrices work too.

  Parameters
  ----------
  m : array_like, shape (rows, cols)
      Matrix to pseudo-invert.
  rcond : float, optional
      Relative cutoff for small singular values. Defaults to
      ``max(rows, cols) * machine_epsilon``.

  Returns
  -------
  numpy.ndarray, shape (cols, rows)
      The pseudo-inverse of ``m``.
  """
  m = np.asarray(m)
  # full_matrices=False keeps U and Vh conformable with s for non-square input.
  U, s, Vh = np.linalg.svd(m, full_matrices=False)
  if rcond is None:
    rcond = max(m.shape) * np.finfo(s.dtype).eps
  cutoff = rcond * (s.max() if s.size else 0.0)
  # Invert only the singular values that are safely above the cutoff.
  significant = s > cutoff
  s_inv = np.zeros_like(s)
  s_inv[significant] = 1.0 / s[significant]
  # V * diag(s_inv) * U^H; scaling the columns of V avoids building diag(s_inv).
  m_inv = np.matmul(Vh.conj().T * s_inv, U.conj().T)
  return m_inv

def shuffle(*dfs):
  """Stack the given arrays along axis 0 and return the rows in random order.

  The row order comes from a single ``np.random.permutation`` call on the
  global NumPy RNG; the input arrays are not modified.
  """
  def _stack(top, bottom):
    return np.append(top, bottom, axis=0)

  # Fold all inputs into one array, one pair at a time.
  stacked = reduce(_stack, dfs)
  # One random ordering of the row indices, applied via fancy indexing.
  row_order = np.random.permutation(stacked.shape[0])
  return stacked[row_order]

In [4]:
# Seed NumPy's global RNG so the synthetic data set (and everything that draws
# from np.random further down) is reproducible under Restart Kernel -> Run All.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)


def _cov3(variances, covariances):
  """Assemble a symmetric 3x3 covariance matrix.

  ``variances`` holds the diagonal entries (var_1, var_2, var_3) and
  ``covariances`` the off-diagonal entries (cov_1_2, cov_1_3, cov_2_3).
  """
  var_1, var_2, var_3 = variances
  cov_1_2, cov_1_3, cov_2_3 = covariances
  return np.array([
    [var_1, cov_1_2, cov_1_3],
    [cov_1_2, var_2, cov_2_3],
    [cov_1_3, cov_2_3, var_3]
  ])


# Four 3-dimensional Gaussian components with different means, covariances and
# sample counts (25,000 / 50,000 / 50,000 / 100,000 rows).
cov1 = _cov3(variances=(0.5, 1.0, 2.0), covariances=(0.5, -0.5, 0.25))
mu1 = np.array([2,0,0])
rv1 = np.random.multivariate_normal(mu1, cov1, 25000)

cov2 = _cov3(variances=(1.0, 0.5, 2.0), covariances=(0.5, 0.25, -0.5))
mu2 = np.array([0,0,2])
rv2 = np.random.multivariate_normal(mu2, cov2, 50000)

cov3 = _cov3(variances=(2.0, 0.5, 1.0), covariances=(-0.25, 0.25, 0.5))
mu3 = np.array([0,2,0])
rv3 = np.random.multivariate_normal(mu3, cov3, 50000)

cov4 = _cov3(variances=(1.0, 1.0, 1.0), covariances=(-0.5, -0.5, -0.25))
mu4 = np.array([-1,-1,-1])
rv4 = np.random.multivariate_normal(mu4, cov4, 100000)

In [5]:
# Four frozen univariate normal distributions, given as (loc, scale) pairs.
independent_rvs = [
  norm(loc=center, scale=spread)
  for center, spread in [(-1.0, 2.0), (0.0, 0.5), (-1.0, 1.0), (2.0, 0.5)]
]

# Zero-centred normal with scale 0.05, sampled by gen_error below.
error = norm(loc=0, scale=0.05)

def gen_error():
  """Return a length-3 array of independent draws from ``error``."""
  draws = [error.rvs() for _ in range(3)]
  return np.array(draws)

def _with_independent_feature(point):
  """Append one draw from a randomly chosen ``independent_rvs`` entry to ``point``."""
  component = independent_rvs[np.random.randint(0,4)]
  return Vectors.dense(np.append(point, component.rvs()))

# Pool the four Gaussian samples, shuffle the rows, extend every 3-d point with
# a 4th feature, and ship the resulting dense vectors to Spark as an RDD.
data = sc.parallelize(list(map(_with_independent_feature, shuffle(rv1, rv2, rv3, rv4))))

In [6]:
import gmm.gaussian_mixture_model

In [7]:
import importlib
# Re-execute the already-imported project module so that edits made to it since
# the first import are picked up without restarting the kernel.
importlib.reload(gmm.gaussian_mixture_model)

<module 'gmm.gaussian_mixture_model' from '/usr/local/spark/python/gmm/gaussian_mixture_model.py'>

In [8]:
# Wrap the 4-column RDD `data` in the project's ModelingService; every model
# below is requested from, and scored against, this service object.
modeling_service = gmm.gaussian_mixture_model.ModelingService(data)

In [9]:
# Request a model over columns 0..3 and print its summary statistics.
# NOTE(review): the first argument (4) is presumably the number of mixture
# components and the second the column indices — confirm against
# gmm.gaussian_mixture_model.ModelingService.get_gmm.
full_model = modeling_service.get_gmm(4, range(4))
print(full_model.stats(modeling_service))

GMMStats(log_likelihood=-1484127.3212903528, null_log_likelihoods=array([-2774256.19331429, -2722749.37599251, -2828073.66302987,
       -3222363.78883318]), p_values=[0.0, 0.0, 0.0, 0.0])


In [14]:
# Log-likelihood of the full model evaluated with all four columns listed
# explicitly; the saved output matches the log_likelihood reported by stats().
full_model.log_likelihood(modeling_service, [0,1,2,3])

-1484127.3212903528

In [16]:
# Sum of the full model's log-likelihoods over columns [0,1,2] and over column
# [3], for comparison with the joint value over [0,1,2,3] computed above.
# NOTE(review): presumably a check that column 3 is independent of the rest
# (the two saved numbers differ by about 18) — confirm the intent.
full_model.log_likelihood(modeling_service, [0,1,2]) + full_model.log_likelihood(modeling_service, [3])

-1484144.9308496914

In [11]:
# Request a model restricted to column 3 only; the cell just displays the
# returned object's repr (the next cell requests the same model and scores it).
modeling_service.get_gmm(4,[3])

<gmm.gaussian_mixture_model.GaussianMixtureModel at 0x7f6001d50ef0>

In [12]:
# Model restricted to column 3, scored on column 3.
m = modeling_service.get_gmm(4,[3])
m.log_likelihood(modeling_service, [3])

-2176556.4878573888

In [36]:
# NOTE(review): in the saved run this call failed with
# "ValueError: operands could not be broadcast together with shapes (3,) (4,)"
# raised from gmm/gaussian_mixture_model.py, line 116. The model is requested
# for columns [0,1,2] but log_likelihood is called without a column list, so it
# presumably scores the full 4-column rows — confirm against the module.
modeling_service.get_gmm(4,[0,1,2]).log_likelihood(modeling_service)

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 496.0 failed 1 times, most recent failure: Lost task 0.0 in stage 496.0 (TID 1945, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 229, in main
    process()
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 224, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/usr/local/spark/python/pyspark/rdd.py", line 2438, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/local/spark/python/pyspark/rdd.py", line 2438, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/local/spark/python/pyspark/rdd.py", line 362, in func
    return f(iterator)
  File "/usr/local/spark/python/pyspark/rdd.py", line 1047, in <lambda>
    return self.mapPartitions(lambda x: [sum(x)]).fold(0, operator.add)
  File "/usr/local/spark/python/gmm/gaussian_mixture_model.py", line 116, in f
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/mllib/linalg/__init__.py", line 487, in func
    return DenseVector(getattr(self.array, op)(other))
ValueError: operands could not be broadcast together with shapes (3,) (4,) 

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:298)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:438)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:421)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:252)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:939)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:939)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2067)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2067)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1599)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1587)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1586)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1586)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1820)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1769)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1758)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2027)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2048)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2067)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2092)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:939)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:938)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:153)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 229, in main
    process()
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 224, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/usr/local/spark/python/pyspark/rdd.py", line 2438, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/local/spark/python/pyspark/rdd.py", line 2438, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/local/spark/python/pyspark/rdd.py", line 362, in func
    return f(iterator)
  File "/usr/local/spark/python/pyspark/rdd.py", line 1047, in <lambda>
    return self.mapPartitions(lambda x: [sum(x)]).fold(0, operator.add)
  File "/usr/local/spark/python/gmm/gaussian_mixture_model.py", line 116, in f
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/mllib/linalg/__init__.py", line 487, in func
    return DenseVector(getattr(self.array, op)(other))
ValueError: operands could not be broadcast together with shapes (3,) (4,) 

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:298)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:438)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:421)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:252)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:939)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:939)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2067)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2067)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [20]:
# NOTE(review): passing the column list [0,1,2] explicitly failed in the saved
# run with the same (3,) vs (4,) broadcast error from
# gmm/gaussian_mixture_model.py, line 116. The cause is not visible from this
# notebook; it needs to be fixed in the module before this cell can run.
modeling_service.get_gmm(4,[0,1,2]).log_likelihood(modeling_service, [0,1,2]) 
# modeling_service.get_gmm(4,[3]).log_likelihood(modeling_service, [3])

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 487.0 failed 1 times, most recent failure: Lost task 2.0 in stage 487.0 (TID 1932, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 229, in main
    process()
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 224, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/usr/local/spark/python/pyspark/rdd.py", line 2438, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/local/spark/python/pyspark/rdd.py", line 2438, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/local/spark/python/pyspark/rdd.py", line 362, in func
    return f(iterator)
  File "/usr/local/spark/python/pyspark/rdd.py", line 1047, in <lambda>
    return self.mapPartitions(lambda x: [sum(x)]).fold(0, operator.add)
  File "/usr/local/spark/python/gmm/gaussian_mixture_model.py", line 116, in f
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/mllib/linalg/__init__.py", line 487, in func
    return DenseVector(getattr(self.array, op)(other))
ValueError: operands could not be broadcast together with shapes (3,) (4,) 

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:298)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:438)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:421)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:252)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:939)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:939)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2067)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2067)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1599)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1587)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1586)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1586)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1820)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1769)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1758)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2027)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2048)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2067)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2092)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:939)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:938)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:153)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 229, in main
    process()
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 224, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/usr/local/spark/python/pyspark/rdd.py", line 2438, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/local/spark/python/pyspark/rdd.py", line 2438, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/local/spark/python/pyspark/rdd.py", line 362, in func
    return f(iterator)
  File "/usr/local/spark/python/pyspark/rdd.py", line 1047, in <lambda>
    return self.mapPartitions(lambda x: [sum(x)]).fold(0, operator.add)
  File "/usr/local/spark/python/gmm/gaussian_mixture_model.py", line 116, in f
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/mllib/linalg/__init__.py", line 487, in func
    return DenseVector(getattr(self.array, op)(other))
ValueError: operands could not be broadcast together with shapes (3,) (4,) 

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:298)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:438)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:421)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:252)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:939)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:939)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2067)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2067)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


## Tuning K (the number of mixture components) using the gain in log-likelihood

In [9]:
from pyspark.sql import Row

# Fit mixtures with k = 1..9 components and record log-likelihood, BIC and AIC
# for each, so the gain from adding components can be compared.
models = []
results = []
n = data.count()
d = 4
for i in range(1,10):
  model = GaussianMixture.train(data, i)
  models.append(model)
  # NOTE(review): log_likelihood_2 is not defined in any visible cell of this
  # notebook; it must be defined before this cell runs on a fresh kernel.
  ll, p_values = log_likelihood_2(model, data)
  # Free parameters of an i-component, full-covariance GMM in d dimensions:
  # i*d means + i*d*(d+1)/2 covariance entries + (i-1) independent weights.
  p = float(i*d + i*(d*(d+1)/2.0) + (i - 1))
  bic = float(np.log(n)*p - 2*ll)
  aic = float(2*p - 2*ll)
  results.append(Row(k=i, ll=float(ll), bic=bic, aic=aic, p_values=[float(pv) for pv in p_values]))

[-1723816.10335661 -1747573.61591564 -1771145.70119122 -1836063.98274143
 -1732033.37785021]


[-1717323.26246946 -1736159.09863704 -1760731.6902329  -1796214.69778217
 -1725543.57114666]


[-1708273.86395669 -1728519.89911291 -1752688.87935763 -1787765.25047603
 -1719713.65675741]


[-1706533.35247959 -1726828.06132709 -1750113.70107617 -1786175.64347773
 -1725114.70673793]


[-1706011.7220369  -1720825.28024652 -1741035.35396039 -1785879.88761325
 -1725339.53707599]


[-1706082.68663437 -1715871.1711803  -1741389.99941415 -1785697.86229021
 -1721345.32757171]


[-1705940.54106405 -1724328.70622291 -1737042.71467506 -1776218.1951313
 -1722193.62831841]


[-1705682.07881474 -1721515.1642646  -1740796.71936625 -1779473.67427251
 -1721668.05907534]


[-1705674.24507534 -1722362.78743711 -1737962.38261963 -1766601.10673732
 -1722486.88601074]


In [10]:
# Likelihood-ratio test between consecutive fits in `results` (each pair
# differs by one component). chi2 comes from scipy.stats and is imported in the
# notebook's first cell.
# The statistic is -2 * (ll_smaller - ll_larger); df = d*(d+3)/2 counts the
# d means plus d*(d+1)/2 covariance entries of the added component.
# NOTE(review): the added component's mixing weight is not counted in df —
# confirm this is intended.
p_values = [chi2.sf(-2*(smaller.ll - larger.ll), df=d*(d+3)/2.0)
            for smaller, larger in zip(results, results[1:])]
print(p_values)

[0.0, 0.0, 0.0, 8.1402331507547386e-214, 1.0, 2.211169486216714e-52, 2.3904611078300201e-101, 0.33409984487413497]


In [11]:
# Log-likelihood of each fit minus that of the next entry in `results`;
# a negative value means the next fit has the higher log-likelihood.
[current.ll - following.ll for current, following in zip(results, results[1:])]

[-6492.840887148166,
 -9049.398512767628,
 -1740.511477104621,
 -521.6304426859133,
 70.9645974682644,
 -142.1455703151878,
 -258.462249313714,
 -7.833739404100925]

In [12]:
# Render the per-k summary rows (k, ll, bic, aic, p_values) collected above.
display(results)

## P-value estimate using Likelihood Ratio Test

In [15]:
# Fit a single Spark MLlib Gaussian mixture with k = 4 on the full RDD.
model = GaussianMixture.train(data, 4)

In [16]:
# NOTE(review): log_likelihood_1 is not defined in any visible cell of this
# notebook; on a fresh kernel this raises NameError unless it is defined first.
log_likelihood_1(model, data)

In [17]:
# NOTE(review): log_likelihood_2 is not defined in any visible cell of this
# notebook; on a fresh kernel this raises NameError unless it is defined first.
log_likelihood_2(model, data)

In [18]:
from pyspark.sql import Row

# Fit mixtures with k = 2..9 components and record log-likelihood, BIC and AIC.
results = []
d = 4
# Use the real number of observations instead of a hard-coded sample size, so
# BIC stays correct when the data set changes.
n = data.count()
for i in range(2,10):
  # Train with i components so the fitted model matches the k recorded below.
  model = GaussianMixture.train(data, i)
  # NOTE(review): log_likelihood is not defined in any visible cell of this
  # notebook; it must be defined before this cell runs on a fresh kernel.
  ll, p_values = log_likelihood(model, data)
  # Free parameters of an i-component, full-covariance GMM in d dimensions:
  # i*d means + i*d*(d+1)/2 covariance entries + (i-1) independent weights.
  p = float(i*d + i*(d*(d+1)/2.0) + (i - 1))
  bic = float(np.log(n)*p - 2*ll)
  aic = float(2*p - 2*ll)
  results.append(Row(k=i, ll=float(ll), bic=bic, aic=aic, p_values=[float(pv) for pv in p_values]))

In [19]:
# Show the collected Row(k, ll, bic, aic, p_values) entries.
results

In [20]:
d = 4
i, j = 0, 2

# Likelihood-ratio test between the fits stored at results[i] and results[j],
# which differ by (j - i) components. chi2 comes from scipy.stats and is
# imported in the notebook's first cell.
# Each extra component contributes d*(d+3)/2 degrees of freedom (d means plus
# d*(d+1)/2 covariance entries).
lr_statistic = -2 * (results[i].ll - results[j].ll)
chi2.sf(lr_statistic, df=(j-i)*(d*(d+3)/2))

In [21]:
# Render the Row(k, ll, bic, aic, p_values) entries from the sweep above.
display(results)