In [1]:
import os
import sys
import pandas
import numpy

import findspark
findspark.init("/usr/local/spark/spark")

import pyspark
from pyspark.sql.window import Window
import pyspark.sql.functions as func

from pyspark.rdd import reduce
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import GaussianMixture, KMeansModel, KMeans
from pyspark.ml.linalg import SparseVector, VectorUDT, Vector, Vectors

In [2]:
from pyspark.sql.functions import udf, col, struct
from pyspark.sql.types import ArrayType, DoubleType, StringType
from pyspark.mllib.linalg.distributed import RowMatrix, DenseMatrix

In [3]:
# Local Spark session: all local cores, 4G each for driver and executor memory.
conf = (
    pyspark.SparkConf()
    .setMaster("local[*]")
    .set("spark.driver.memory", "4G")
    .set("spark.executor.memory", "4G")
)
sc = pyspark.SparkContext(conf=conf)
spark = pyspark.sql.SparkSession(sc)

In [4]:
# Location of the parquet data read by the next cell. The default is the original
# author's absolute path; set the TIX_CLUSTER_FILES environment variable to point
# the notebook at the data on another machine without editing this cell.
cluster_files = os.environ.get(
    "TIX_CLUSTER_FILES",
    "/Users/simondi/PROJECTS/target_infect_x_project/src/tix-analysis/data/outlier-removal",
)

In [258]:
# Read the parquet data found at `cluster_files` through the Spark session.
# `df` is what the k-means model below is fitted on and applied to.
df = spark.read.parquet(cluster_files)

In [259]:
def n_param_gmm(k, p, n):
    """Count parameters as ``k*p + k*p*(p+1)/2 + (k-1)``.

    The three terms are labelled mean, variance and mixture counts; this looks
    like the free-parameter count of a ``k``-component, ``p``-dimensional
    Gaussian mixture with full covariance matrices (interpretation inferred
    from the formula -- confirm).

    ``n`` is accepted but not used. Because of the true division the result
    is a ``float`` even for integer inputs.
    """
    mean_terms = k * p
    covariance_terms = k * p * (p + 1) / 2
    mixture_terms = k - 1
    return mean_terms + covariance_terms + mixture_terms

In [260]:
def split_features(data):
    """Expand the ``features`` vector column into one scalar column per entry.

    The input must provide a ``prediction`` column and a ``features`` column
    whose values expose ``toArray()``. The vector length is read from the first
    row, so the input must not be empty. The result keeps ``prediction`` and
    adds the columns ``f_0 .. f_{len-1}``.
    """
    def _vector_to_list(v):
        return v.toArray().tolist()

    as_double_array = udf(_vector_to_list, ArrayType(DoubleType()))

    # Width of the feature vector, taken from the first row only.
    first_row = data.select("features").take(1)[0]
    n_features = len(first_row[0])

    element_columns = [col("f")[idx] for idx in range(n_features)]
    expanded = data.withColumn("f", as_double_array(col("features")))
    expanded = expanded.select(["prediction"] + element_columns)

    # Element columns come out named "f[i]"; rename them to "f_i".
    for name in expanded.columns:
        if not name.startswith("f["):
            continue
        cleaned = name.replace("[", "_").replace("]", "")
        expanded = expanded.withColumnRenamed(name, cleaned)

    return expanded

In [261]:
def n_param_kmm(k, p):
    """Count parameters as ``k * p`` mean terms plus one variance term.

    Parameters
    ----------
    k : int
        Number of clusters.
    p : int
        Number of dimensions per cluster mean.

    Returns
    -------
    int
        ``k * p + 1``.
    """
    n_mean = k * p
    n_var = 1
    # Fixed: the original returned ``n_mean + v_var``, where ``v_var`` is an
    # undefined name, so every call raised NameError.
    return n_mean + n_var

In [273]:
# NOTE(review): `d_k` is not defined in any visible cell -- presumably a
# DataFrame shaped like the output of split_features (prediction + f_* columns);
# confirm and define it before this cell so Restart & Run All works.
n = d_k.count()  # number of rows in d_k
p = len(d_k.take(1)[0]) - 1  # width of the first row minus one column

In [275]:
# Fit k-means with k=10 and a fixed seed on `df`, then attach the cluster
# assignments and expand the feature vector into scalar columns.
max_k_model = KMeans(k=10, seed=1).fit(df)
transformed_data = split_features(max_k_model.transform(df))

In [293]:
def compute_variance(K, data, model, p):
    """Pooled within-cluster variance and per-cluster sizes.

    For each cluster id ``i`` in ``range(K)`` the rows with ``prediction == i``
    are selected, the squared Euclidean distance of every row to
    ``model.clusterCenters()[i]`` is summed, and the grand total is divided by
    ``(data.count() - K) * p``.

    Parameters
    ----------
    K : int
        Number of cluster ids to visit (``0 .. K-1``).
    data : DataFrame
        Needs a ``prediction`` column; every other column is used as a
        coordinate (assumed numeric and aligned with the cluster centres).
    model
        Fitted model exposing ``clusterCenters()``.
    p : int
        Ignored. Kept so existing callers keep working; the dimensionality is
        always re-derived as the width of the first row of ``data`` minus one.

    Returns
    -------
    tuple
        ``(total_var, ni)`` where ``ni`` is a length-``K`` float array holding
        the row count of each cluster.

    Notes
    -----
    A cluster without rows makes the RDD ``reduce`` fail, exactly as in the
    original implementation.
    """
    total_var = 0
    # numpy.zeros replaces scipy.zeros: no visible cell imports scipy, and the
    # NumPy aliases in the scipy root namespace are deprecated.
    ni = numpy.zeros(K)
    p = len(data.take(1)[0]) - 1
    # Loop-invariant, so fetch the centres once instead of once per cluster.
    centers = model.clusterCenters()

    for i in range(K):
        data_cluster = data.filter("prediction==" + str(i)).drop("prediction")
        ni[i] = data_cluster.count()
        rdd = data_cluster.rdd.map(list)
        means = centers[i]
        # Sum over the cluster's rows of (x - mean)^T (x - mean).
        var = (RowMatrix(rdd).rows
                   .map(lambda x: (x - means).T.dot(x - means))
                   .reduce(lambda x, y: x + y))
        total_var += var
    total_var /= ((data.count() - K) * p)

    return total_var, ni

In [294]:
# NOTE(review): `K` is not assigned in any visible cell -- presumably 10, to
# match the KMeans(k=10) model above; confirm and define it explicitly.
# The `p` passed here is ignored by compute_variance, which re-derives it from the data.
max_k_var, ni = compute_variance(K, transformed_data, max_k_model, p)

In [295]:
def loglik(data, K, n, model):
    """Penalised log-likelihood of a clustering with one pooled variance.

    Each cluster ``i`` with ``ni[i]`` members contributes::

        ni*log(ni) - ni*log(n) - 0.5*ni*p*log(2*pi*total_var) - 0.5*(ni - 1)*p

    and ``0.5 * log(n) * (p + 1) * K`` is subtracted from the sum.

    Parameters
    ----------
    data : DataFrame
        Passed on to ``compute_variance``; ``p`` is the width of its first row
        minus one.
    K : int
        Number of clusters.
    n : int
        Total number of observations used in the ``log(n)`` terms.
    model
        Fitted model, passed on to ``compute_variance``.

    Returns
    -------
    float
        The penalised log-likelihood.
    """
    p = len(data.take(1)[0]) - 1
    total_var, ni = compute_variance(K, data, model, p)
    ll = 0

    # numpy.log / numpy.pi replace scipy.log / scipy.pi: no visible cell
    # imports scipy, and those root-namespace NumPy aliases are deprecated.
    for i in range(K):
        l = ni[i] * numpy.log(ni[i])
        l -= ni[i] * numpy.log(n)
        l -= .5 * ni[i] * p * numpy.log(2 * numpy.pi * total_var)
        l -= .5 * (ni[i] - 1) * p
        ll += l
    # Penalty term that grows with the number of clusters.
    ll -= .5 * numpy.log(n) * (p + 1) * K

    return ll

In [292]:
# Penalised log-likelihood of the 10-cluster model; `n` comes from the cell that
# counts the rows of `d_k`.
max_loglik = loglik(transformed_data, 10, n, max_k_model)

P 15


In [272]:
def lrt(max_loglik, loglik, max_params, params):
    """Likelihood-ratio test p-value.

    The statistic ``t = 2 * (max_loglik - loglik)`` is compared against a
    chi-square distribution with ``max_params - params`` degrees of freedom.

    Parameters
    ----------
    max_loglik : float
        Log-likelihood of the larger model.
    loglik : float
        Log-likelihood of the smaller model.
    max_params : int or float
        Parameter count of the larger model.
    params : int or float
        Parameter count of the smaller model.

    Returns
    -------
    float
        Upper-tail probability of ``t``. The inputs, statistic, degrees of
        freedom and p-value are also printed.
    """
    # Imported here because no visible cell imports scipy, so the original
    # ``scipy.stats`` lookup raised NameError on a fresh kernel.
    from scipy.stats import chi2

    t = 2 * (max_loglik - loglik)
    df = max_params - params
    # sf(t) is 1 - cdf(t) computed directly; the subtraction form rounds to
    # exactly 0.0 once the p-value drops below float precision.
    p_val = chi2.sf(t, df=df)
    print(max_loglik, loglik, t, df, p_val)
    return p_val

In [157]:
def bs(arr, p):
    """Bisect ``arr`` for the threshold ``p`` and return the final right bound.

    At each step the probed value is compared with ``p``: a value below ``p``
    moves the right bound onto the probe, a value above ``p`` moves the left
    bound onto it. The search stops as soon as a step leaves both bounds
    unchanged (which includes an exact hit) and the right bound is returned.
    The bounds, probe index and probed value are printed on every step.

    The moves only make sense for values sorted in decreasing order, as in the
    recorded output. ``arr`` must not be empty.
    """
    left, right = 0, len(arr)
    mid = (left + right) // 2

    while True:
        prev_left, prev_right = left, right
        value = arr[mid]
        print(left, mid, right, value)
        if value < p:
            # Probe is already below the threshold: shrink from the right.
            right = mid
            mid = (left + right) // 2
        elif value > p:
            # Probe is still above the threshold: shrink from the left.
            left = mid
            mid = (right + left) // 2
        if (left, right) == (prev_left, prev_right):
            return right

# NOTE(review): `li` and `li2` are not defined in any visible cell -- presumably
# decreasing sequences of p-values; confirm and define them before this cell.
# Find the index returned by the bisection for the 0.05 threshold and show the
# value stored there.
i = bs(li, .05)
print(i, li[i])

i = bs(li2, .05)
print(i, li2[i])

0 22 45 0.175
22 33 45 0.0583157894737
33 39 45 0.0270526315789
33 36 39 0.0426842105263
33 34 36 0.0531052631579
34 35 36 0.0478947368421
34 34 35 0.0531052631579
35 0.0478947368421
0 17 35 0.071125
17 26 35 0.034
17 21 26 0.054625
21 23 26 0.046375
21 22 23 0.0505
22 22 23 0.0505
23 0.046375
