In [1]:
import os
import sys
import pandas
import numpy

import findspark
findspark.init("/usr/local/spark/spark")

import pyspark
from pyspark.sql.window import Window
import pyspark.sql.functions as func

from pyspark.rdd import reduce
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import GaussianMixture, KMeansModel, KMeans
from pyspark.ml.linalg import SparseVector, VectorUDT, Vector, Vectors

In [2]:
from pyspark.sql.functions import udf, col, struct
from pyspark.sql.types import ArrayType, DoubleType, StringType
from pyspark.mllib.linalg.distributed import RowMatrix, DenseMatrix

In [3]:
conf = pyspark.SparkConf().setMaster("local[*]").set("spark.driver.memory", "4G").set("spark.executor.memory", "4G")
sc = pyspark.SparkContext(conf=conf)
spark = pyspark.sql.SparkSession(sc)

In [4]:
cluster_files =  "/Users/simondi/PROJECTS/target_infect_x_project/src/tix-analysis/data/outlier-removal"

In [33]:
df = spark.read.parquet(cluster_files)

In [19]:
n = df.count()
p = len(df.select("features").take(1)[0][0])

In [22]:
import scipy
from scipy import stats

In [14]:
def n_param(k, p, n):
    n_mean = k * p
    n_var = k * p*(p + 1) / 2
    n_mix = k - 1
    return n_mean + n_var + n_mix

In [23]:
m_max = GaussianMixture(k=50, seed=23).fit(df)
n_par_k50 = n_param(50, p, n)

In [35]:
p_vals = []

In [40]:
for i in range(2, 20 + 1):
    km = GaussianMixture(k=i, seed=3)
    model = km.fit(df)
    t = 2 * (m_max.summary.logLikelihood - model.summary.logLikelihood)
    d = n_par_k10 - n_param(i, p, n)
    p_val = 1 - scipy.stats.chi2.cdf(t, df=d)
    p_vals.append(p_val)
    print(i, p_val, t, m_max.summary.logLikelihood, model.summary.logLikelihood)

2 0.0 9752.082137066103 4660.878851065225 -215.16221746782648
3 0.0 9598.132896988493 4660.878851065225 -138.18759742902137
4 0.0 9245.63946074466 4660.878851065225 38.05912069289517
5 0.0 8702.272638826007 4660.878851065225 309.7425316522213
6 0.0 8679.253963897298 4660.878851065225 321.2518691165761
7 0.0 10472.683401106478 4660.878851065225 -575.4628494880142
8 0.0 10334.049124003232 4660.878851065225 -506.1457109363907
9 0.0 10758.786732964702 4660.878851065225 -718.5145154171264
10 nan 10633.886047374566 4660.878851065225 -656.0641726220582
11 nan 11383.616124921653 4660.878851065225 -1030.929211395602
12 nan 9791.135239570185 4660.878851065225 -234.6887687198678
13 nan -1131.7343193595243 4660.878851065225 5226.746010744987
14 nan 10193.354102886826 4660.878851065225 -435.7982003781883
15 nan 85.56076537355875 4660.878851065225 4618.098468378445
16 nan 11283.957620350597 4660.878851065225 -981.0999591100733
17 nan 10801.039434503511 4660.878851065225 -739.6408661865306
18 nan 110

In [209]:
def split_features(data):
    def to_array(col):
        def to_array_(v):
            return v.toArray().tolist()

        return udf(to_array_, ArrayType(DoubleType()))(col)

    len_vec = len(data.select("features").take(1)[0][0])
    data = (data.withColumn("f", to_array(col("features")))
            .select(["prediction"]+  [col("f")[i] for i in range(len_vec)]))

    for i, x in enumerate(data.columns):
        if x.startswith("f["):
            data = data.withColumnRenamed(
                x, x.replace("[", "_").replace("]", ""))

    return data

In [211]:
K = 5
gm_k = KMeans(k=K, seed=23).fit(df)
df = gm_k.transform(df)

In [212]:
d_k = split_features(df)

In [213]:
d_k.take(1)

[Row(prediction=3, f_0=0.983076443174413, f_1=0.40052619964395375, f_2=0.059322697068162594, f_3=0.8912970547151504, f_4=0.6459296915260802, f_5=-1.6407744126709187, f_6=0.0779898194213411, f_7=0.6963730444812934, f_8=0.20419738806418344, f_9=-0.6405384795826222, f_10=-1.2687766832708212, f_11=-0.5029967514166027, f_12=0.4632598970025907, f_13=-1.3307274422272615, f_14=-0.44745668769115393)]

In [248]:
from scipy.spatial import distance

In [236]:
d_k_p.values.var()

0.986145035240654

In [257]:
cvar = sum(distance.cdist(d_k_p, [means], "sqeuclidean")) / 
cvar

array([ 55.4781129])

In [274]:
total_var = 0
for i in range(K):
    d_k_c = d_k.filter("prediction==" + str(i)).drop("prediction")
    rdd = d_k_c.rdd.map(list)
    means = gm_k.clusterCenters()[i]
    var = (RowMatrix(rdd).rows
               .map(lambda x: (x - means))
               .map(lambda x: x.T.dot(x))
               .reduce(lambda x, y: x + y))
    total_var += var
total_var /= (d_k.count() - K)

In [275]:
ll =  0
for i in range(K):
    ll += 

5.7776826435575863

In [55]:
data = split_features(df)

In [57]:
data = data.toPandas().values

In [41]:
import sklearn.mixturea

In [197]:
df2 = split_features(df)
df2.count()

94

In [198]:
df2.take(1)

[Row(f_0=0.983076443174413, f_1=0.40052619964395375, f_2=0.059322697068162594, f_3=0.8912970547151504, f_4=0.6459296915260802, f_5=-1.6407744126709187, f_6=0.0779898194213411, f_7=0.6963730444812934, f_8=0.20419738806418344, f_9=-0.6405384795826222, f_10=-1.2687766832708212, f_11=-0.5029967514166027, f_12=0.4632598970025907, f_13=-1.3307274422272615, f_14=-0.44745668769115393)]

In [155]:
from pyspark.mllib.stat import Statistics

In [199]:
rdd = df2.rdd.map(list)
means = Statistics.colStats(rdd).mean()
X = RowMatrix(rdd).rows.map(lambda x: (x - means))

In [225]:
X = RowMatrix(X).rows.map(lambda x: x.T.dot(x)).reduce(lambda x, y: x + y)

In [203]:
1 / df2.count()  * X

DenseVector([0.7589, 1.2618, 0.2354, 0.5254, 0.6426, 0.8631, 0.0176, 1.3929, 0.4221, 1.1192, 0.962, 0.5552, 0.5694, 1.1023, 0.6413])

In [204]:
data.var(axis=0)

array([ 0.75889718,  1.26182452,  0.23535596,  0.52543821,  0.64257824,
        0.86314728,  0.017615  ,  1.39287155,  0.42214697,  1.11919899,
        0.96200105,  0.55520918,  0.56942264,  1.10234209,  0.64131112])

In [None]:
spark.stop()