In [1]:
import os
import sys
import pandas
import numpy

import findspark
findspark.init("/usr/local/spark/spark")

import pyspark
from pyspark.sql.window import Window
import pyspark.sql.functions as func

from pyspark.rdd import reduce
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import GaussianMixture, KMeansModel, KMeans
from pyspark.ml.linalg import SparseVector, VectorUDT, Vector, Vectors

In [2]:
from pyspark.sql.functions import udf, col, struct
from pyspark.sql.types import ArrayType, DoubleType, StringType
from pyspark.mllib.linalg.distributed import RowMatrix, DenseMatrix

In [3]:
conf = pyspark.SparkConf().setMaster("local[*]").set("spark.driver.memory", "4G").set("spark.executor.memory", "4G")
sc = pyspark.SparkContext(conf=conf)
spark = pyspark.sql.SparkSession(sc)

In [4]:
cluster_files =  "/Users/simondi/PROJECTS/target_infect_x_project/src/tix-analysis/data/outlier-removal"

In [5]:
df = spark.read.parquet(cluster_files)

In [232]:
n = df.count()
p = len(df.select("features").take(1)[0][0])

In [233]:
import scipy
from scipy import stats

In [234]:
def n_param_gmm(k, p, n):
    n_mean = k * p
    n_var = k * p*(p + 1) / 2
    n_mix = k - 1
    return n_mean + n_var + n_mix

In [9]:
def split_features(data):
    def to_array(col):
        def to_array_(v):
            return v.toArray().tolist()

        return udf(to_array_, ArrayType(DoubleType()))(col)

    len_vec = len(data.select("features").take(1)[0][0])
    data = (data.withColumn("f", to_array(col("features")))
            .select(["prediction"]+  [col("f")[i] for i in range(len_vec)]))

    for i, x in enumerate(data.columns):
        if x.startswith("f["):
            data = data.withColumnRenamed(
                x, x.replace("[", "_").replace("]", ""))

    return data

In [236]:
def n_param_kmm(k, p):
    n_mean = k * p
    n_var = 1
    return n_mean + v_var

In [11]:
d_k = split_features(df)

In [14]:
d_k.take(1)

[Row(prediction=3, f_0=0.983076443174413, f_1=0.40052619964395375, f_2=0.059322697068162594, f_3=0.8912970547151504, f_4=0.6459296915260802, f_5=-1.6407744126709187, f_6=0.0779898194213411, f_7=0.6963730444812934, f_8=0.20419738806418344, f_9=-0.6405384795826222, f_10=-1.2687766832708212, f_11=-0.5029967514166027, f_12=0.4632598970025907, f_13=-1.3307274422272615, f_14=-0.44745668769115393)]

In [17]:
n = d_k.count()
p = 15

In [25]:
def compute_variance(K, d_k, gm_k):
    total_var = 0
    ni = scipy.zeros(K)
    for i in range(K):
        d_k_c = d_k.filter("prediction==" + str(i)).drop("prediction")
        ni[i] =  d_k_c.count()
        rdd = d_k_c.rdd.map(list)
        means = gm_k.clusterCenters()[i]
        var = (RowMatrix(rdd).rows
                   .map(lambda x: (x - means).T.dot(x - means))
                   .reduce(lambda x, y: x + y))
        total_var += var
    total_var /= ((d_k.count() - K) * p)

    return total_var, ni

In [30]:
def loglik(K, n, p):
    total_var, ni = compute_variance(K, d_k, gm_k)
    ll = 0
    for i in range(K):
        l = ni[i] * scipy.log(ni[i])
        l -= ni[i] * scipy.log(n) 
        l -= .5 * ni[i] * p * scipy.log(2 * scipy.pi * total_var) 
        l -= .5 * (ni[i] - 1) * p
        ll += l
    l -= .5 * scipy.log(n) * (p + 1) * K
    return l

In [163]:
sk_data = split_features(df).drop("prediction").toPandas().values

In [218]:
def n_param(k, p):
    n_mean = k * p
    n_var = k * p *(p + 1) / 2
    n_mix = k - 1
    n = n_mean + n_var + n_mix
    return n

In [230]:
def lrt(max_loglik, loglik, max_params, params):
    t = 2 * (max_loglik - loglik)
    df = max_params - params    
    p_val = 1 - scipy.stats.chi2.cdf(t, df=df)
    print(max_loglik, loglik, t, df, p_val)
    return p_val

In [None]:
max_loglik = KMeans(n_components=10, random_state=1).fit(sk_data).score(sk_data)
max_loglik

In [231]:
for i in range(2, 10 + 1):
    gmm = KMeans(k=i, seed=1)
    pred = gmm.fit(sk_data)
    p_val = lrt(max_loglik, pred.score(sk_data), max_par, n_param(i, 15))

30.7422376887 -2.70322219207 66.8909197615 1088.0 1.0
30.7422376887 -1.08740728374 63.6592899449 952.0 1.0
30.7422376887 1.85300687149 57.7784616344 816.0 1.0
30.7422376887 6.18142342046 49.1216285365 680.0 1.0
30.7422376887 7.4806207988 46.5232337798 544.0 1.0
30.7422376887 11.3468823079 38.7907107616 408.0 1.0
30.7422376887 17.5078401205 26.4687951364 272.0 1.0
30.7422376887 23.9919068681 13.5006616413 136.0 1.0
30.7422376887 30.7422376887 0.0 0.0 nan


In [31]:
loglik(K, n, p)

-368.47710755966489

In [136]:
li = scipy.concatenate([scipy.linspace(1, 0.1, 25), scipy.linspace(.1, 0.001, 20)])
li2 = scipy.concatenate([scipy.linspace(1, 0.1, 10), scipy.linspace(.1, 0.001, 25)])

In [153]:
scipy.argmax(li < .05)

35

In [154]:
scipy.argmax(li2 < .05)

23

In [155]:
print(li)
print(li2)

[ 1.          0.9625      0.925       0.8875      0.85        0.8125      0.775
  0.7375      0.7         0.6625      0.625       0.5875      0.55        0.5125
  0.475       0.4375      0.4         0.3625      0.325       0.2875      0.25
  0.2125      0.175       0.1375      0.1         0.1         0.09478947
  0.08957895  0.08436842  0.07915789  0.07394737  0.06873684  0.06352632
  0.05831579  0.05310526  0.04789474  0.04268421  0.03747368  0.03226316
  0.02705263  0.02184211  0.01663158  0.01142105  0.00621053  0.001     ]
[ 1.        0.9       0.8       0.7       0.6       0.5       0.4       0.3
  0.2       0.1       0.1       0.095875  0.09175   0.087625  0.0835
  0.079375  0.07525   0.071125  0.067     0.062875  0.05875   0.054625
  0.0505    0.046375  0.04225   0.038125  0.034     0.029875  0.02575
  0.021625  0.0175    0.013375  0.00925   0.005125  0.001   ]


In [156]:
print(li[35], li2[23])

0.0478947368421 0.046375


In [157]:
def bs(arr, p):
    lefts, rights, mids = [], [], []
    left, right = 0, len(arr)
    mid = int((left + right) / 2)
    
    while True:
        mids.append(mid)
        lefts.append(left)
        rights.append(right)
        print(left, mid, right, arr[mid])
        if arr[mid] < p:
            mid, right= int((left + mid) / 2), mid
        elif arr[mid] > p:
            mid, left = int((right + mid) / 2), mid
        if left == lefts[-1] and right == rights[-1]:
            break
    return right

i = bs(li, .05)
print(i, li[i])

i = bs(li2, .05)
print(i, li2[i])

0 22 45 0.175
22 33 45 0.0583157894737
33 39 45 0.0270526315789
33 36 39 0.0426842105263
33 34 36 0.0531052631579
34 35 36 0.0478947368421
34 34 35 0.0531052631579
35 0.0478947368421
0 17 35 0.071125
17 26 35 0.034
17 21 26 0.054625
21 23 26 0.046375
21 22 23 0.0505
22 22 23 0.0505
23 0.046375
