In [55]:
from __future__ import annotations

import json
import math
import os
import random
import uuid
from getpass import getpass
from typing import Any

import altair as alt
import numpy as np
import pandas as pd
import snowflake.snowpark as snowpark
import snowflake.snowpark.functions as F
import snowflake.snowpark.types as T
import snowflake.snowpark.window as W
from snowflake.snowpark import Session
pd.set_option('display.max_columns', 500)

# Connection settings for Snowflake.
# The password is never stored in the notebook: it is taken from the
# SNOWFLAKE_PASSWORD environment variable, or prompted for when that is unset.
USER = os.environ.get("SNOWFLAKE_USER", "BAMENDOLA") #for snowflake
PASS = os.environ.get("SNOWFLAKE_PASSWORD") or getpass("Snowflake password: ") #for snowflake
snowflake_credentials = {
  "account" : "KGA92485",
  "user" : USER,
  "password" : PASS,
  "role" : "SYSADMIN",
  "database" : "DATA_SCIENCE",
  "schema" : "PUBLIC",
  "warehouse" : "DATASCIENCE_XLARGE_WH"
}
# Module-level session used by every model class and cell below.
session = Session.builder.configs(snowflake_credentials).create()



In [56]:
def BinaryEncoder(df,inputCol,outputCol):
    """Append an integer code column for ``inputCol``.

    Every distinct value of ``inputCol`` receives a row number computed over a
    window ordered by that column; the number is joined back onto ``df`` and
    exposed as ``outputCol``.

    Returns a tuple ``(encoded DataFrame, outputCol)``.
    """
    source = F.col(inputCol)
    rank_window = W.Window.orderBy(F.col(inputCol))
    codes = df.groupBy(source).agg(F.row_number().over(rank_window).alias('r'))
    joined = df.join(codes, inputCol)
    # 'r' is only a temporary carrier for the code, so it is dropped again.
    encoded = joined.withColumn(outputCol, F.col('r')).drop(F.col('r'))
    return encoded, outputCol

def OneHotEncoder(df,inputCol,outputCol):
    """Add one 0/1 indicator column per distinct value of ``inputCol``.

    The distinct values are pulled to the client first; each indicator column
    is named ``outputCol + '_' + str(value)``.

    Returns a tuple ``(extended DataFrame, list of indicator column names)``.
    """
    distinct_rows = df.select(F.col(inputCol)).distinct().toPandas()
    categories = distinct_rows.values[:, 0].tolist()
    indicator_names = [outputCol + '_' + str(value) for value in categories]
    for value, name in zip(categories, indicator_names):
        is_match = F.col(inputCol) == F.lit(value)
        df = df.withColumn(name, F.when(is_match, 1).otherwise(0))
    return df, indicator_names

def MinMaxScaler(df,inputCol,outputCol):
    """Rescale ``inputCol`` into ``outputCol`` using the column-wide min and max.

    The result is ``(value - min) / (max - min)``; when the range is not
    strictly positive the output is the literal 0.

    Returns a tuple ``(extended DataFrame, outputCol)``.
    """
    raw = F.col(inputCol)
    as_float = raw.cast('float')
    # Empty OVER() clauses make min/max span the whole DataFrame.
    value_range = F.max(as_float).over() - F.min(as_float).over()
    offset = (raw - F.min(raw).over()).cast('float')
    scaled = F.when(value_range > F.lit(0), offset / value_range).otherwise(F.lit(0))
    return df.withColumn(outputCol, scaled), outputCol

In [57]:
def join(s1: str, s2: str) -> str:
    """Append ``s2`` to the identifier ``s1``, keeping a closing double quote last.

    When ``s1`` ends with ``"`` (a quoted identifier) the suffix is inserted
    before that closing quote; otherwise the two strings are concatenated.
    An empty ``s1`` is handled and simply yields ``s2``.
    """
    if s1.endswith('"'):
        return s1[:-1] + s2 + '"'
    return s1 + s2
    
class Kmeans(object):
    """K-means clustering expressed as Snowpark DataFrame operations.

    ``self.cluster`` holds one row per centroid (``CLUSTER_ID`` plus one column
    per feature) and ``self.prediction`` holds the last id -> ``CLUSTER_ID``
    assignment produced by :meth:`fit`.  The class uses the module-level
    ``session``.
    """
    def __init__(self, k: int,*args):
        super(Kmeans, self).__init__(*args)
        self.k = k
        self.cluster_id = 'CLUSTER_ID'
        # Placeholder frame with ids 0..k-1; replaced by fit() or by the caller.
        self.cluster = session.createDataFrame(data=[ v for v in range(self.k)],schema=T.StructType([T.StructField(self.cluster_id, T.IntegerType())]))
        self.prediction = None
        self.run_id = None

    def initializeCluster(self,df: snowpark.DataFrame,inputCols: list[str]) -> snowpark.DataFrame:
        """Draw each centroid coordinate uniformly between the feature's min and max."""
        bounds = [F.min(F.col(c)).alias(c+'_min') for c in inputCols]+[F.max(F.col(c)).alias(c+'_max') for c in inputCols]
        draws = [(F.uniform(F.lit(0.0),F.lit(1.0), F.random())*(F.col(c+'_max')-F.col(c+'_min'))+F.col(c+'_min')).alias(c) for c in inputCols]
        return df.crossJoin(self.cluster).groupBy(F.col(self.cluster_id)) \
            .agg(bounds) \
            .select([self.cluster_id]+draws)

    def initializeCluster2(self,df: snowpark.DataFrame, inputCols: list[str]) -> snowpark.DataFrame:
        """Use ``k`` sampled rows of ``df`` as centroids, numbered from 1.

        The returned columns are the features followed by ``CLUSTER_ID``.
        """
        return df.sample(n=self.k).select([F.col(c) for c in inputCols]+[F.row_number().over(W.Window.orderBy(F.lit(None))).alias(self.cluster_id)])

    def distance(self,df: snowpark.DataFrame,id: str,inputCols: list[str]) -> snowpark.DataFrame:
        """Euclidean distance from every row of ``df`` to every centroid.

        Returns the columns id, ``CLUSTER_ID``, ``distance`` and ``r``, where
        ``r`` ranks the centroids of each id by ascending distance (1 = nearest).
        """
        # When the id column shares the centroid id's name, the cross join
        # suffixes both sides and the suffixed names have to be used instead.
        same_name = id == self.cluster_id
        id_col = F.col(join(id,'_L')) if same_name else F.col(id)
        cluster_col = F.col(join(self.cluster_id,'_R')) if same_name else F.col(self.cluster_id)
        euclidean = F.sqrt(sum([F.pow(F.col(join(c,'_L'))-F.col(join(c,'_R')),F.lit(2)) for c in inputCols]))
        rank = F.row_number().over(W.Window.partitionBy(id_col).orderBy(F.col('distance').asc()))
        return df.crossJoin(self.cluster,lsuffix='_L',rsuffix='_R') \
            .select(id_col,cluster_col,euclidean.alias('distance'),rank.alias('r'))\
            .select(id_col,cluster_col,F.col('distance'),F.col('r'))

    def fit(self,df: snowpark.DataFrame,id: str,inputCols: list[str],maxIter=None,initialize=True) -> Kmeans:
        """Iterate assignment and centroid update until assignments stop changing.

        Stops early after ``maxIter`` iterations when ``maxIter`` is given.
        Each iteration prints the cluster sizes and appends the centroids
        (tagged with ``t`` and ``run_id``) to the table ``kmeans_t2``.
        Returns ``self``.
        """
        if initialize:
            self.cluster = self.initializeCluster2(df,inputCols).cache_result()
        prediction = self.transform(df,id,inputCols).cache_result()
        iteration = 0
        self.run_id = str(uuid.uuid4())
        centroid_cols = [F.col(self.cluster_id)]+[F.col(c) for c in inputCols]
        while (self.prediction is None or (self.prediction.except_(prediction).count() > 0)) and (maxIter is None or iteration < maxIter):
            print('run_id:',self.run_id,',iteration:',iteration) 
            self.prediction = prediction
            print(self.prediction.groupBy(F.col(self.cluster_id)).agg(F.count('*')).orderBy(F.col(self.cluster_id)).toPandas())
            updated = df.join(self.prediction,[id]).groupBy(F.col(self.cluster_id)).agg([ F.mean(c).alias(c) for c in inputCols])
            # Centroids that attracted no rows keep their old position.  union()
            # matches columns by position, so they are re-selected in the same
            # order as `updated` (initializeCluster2 puts CLUSTER_ID last).
            unassigned = self.cluster.where(F.col(self.cluster_id).isin(self.prediction.select(F.col(self.cluster_id))) == F.lit(False)).select(centroid_cols)
            self.cluster = updated.union(unassigned).cache_result()
            prediction = self.transform(df,id,inputCols).cache_result()
            iteration += 1
            self.cluster.select(F.col('*'),F.lit(iteration).alias('t'),F.lit(self.run_id).alias('run_id')).write.mode('append').save_as_table('kmeans_t2')
        return self
    
    def transform(self,df: snowpark.DataFrame,id: str,inputCols: list[str]) ->snowpark.DataFrame:
        """Assign every row of ``df`` to its nearest centroid (id, ``CLUSTER_ID``)."""
        return self.distance(df,id,inputCols).where(F.col('r') == F.lit(1)).select(F.col(id),F.col(self.cluster_id))
    
    
class SOM(object):
    """Self-organising map trained with Snowpark DataFrame operations.

    The map has ``k * k`` nodes on a square grid: node ``n`` sits at column
    ``n % k`` and row ``floor(n / k)``.  ``self.cluster`` stores one row per
    node (``CLUSTER_ID`` plus one weight per feature); ``self.neighbor`` stores
    the squared grid distance between every pair of nodes.  The class uses the
    module-level ``session``.
    """
    # https://www.osti.gov/servlets/purl/1566795
    def __init__(self,k: int,sigma: float,lr: float,*args):
        super(SOM, self).__init__(*args)
        self.k = k**2  # total number of nodes, not the grid side length
        self.lr = lr
        self.sigma = sigma
        self.cluster_id = 'CLUSTER_ID'
        # Placeholder frame with ids 0..k*k-1; replaced by fit() or by the caller.
        self.cluster = session.createDataFrame(data=[ v for v in range(self.k)],schema=T.StructType([T.StructField(self.cluster_id, T.IntegerType())]))
        side = F.lit(self.k**0.5)
        left = F.col(join(self.cluster_id,'_L'))
        right = F.col(join(self.cluster_id,'_R'))
        # Squared Euclidean distance between grid positions (no square root).
        squared_grid_distance = F.pow(left%side-right%side,F.lit(2))+F.pow(F.floor(left/side)-F.floor(right/side),F.lit(2))
        self.neighbor = self.cluster.crossJoin(self.cluster,lsuffix='_L',rsuffix='_R')\
            .select(left.alias('BMI_ID'),right.alias(self.cluster_id),squared_grid_distance.alias('distance'))
        self.run_id = None

    def initializeCluster(self,df: snowpark.DataFrame, inputCols: list[str]) -> snowpark.DataFrame:
        """Draw each node weight uniformly between the feature's min and max."""
        bounds = [F.min(F.col(c)).alias(c+'_min') for c in inputCols]+[F.max(F.col(c)).alias(c+'_max') for c in inputCols]
        draws = [(F.uniform(F.lit(0.0),F.lit(1.0), F.random())*(F.col(c+'_max')-F.col(c+'_min'))+F.col(c+'_min')).alias(c) for c in inputCols]
        return df.crossJoin(self.cluster).groupBy(F.col(self.cluster_id)) \
            .agg(bounds) \
            .select([self.cluster_id]+draws)

    def initializeCluster2(self,df: snowpark.DataFrame, inputCols: list[str]) -> snowpark.DataFrame:
        """Use ``k * k`` sampled rows of ``df`` as node weights, numbered from 0."""
        return df.sample(n=self.k).select([F.col(c) for c in inputCols]+[(F.row_number().over(W.Window.orderBy(F.lit(None)))-F.lit(1)).alias(self.cluster_id)])

    def distance(self,df: snowpark.DataFrame,id: str,inputCols: list[str]) -> snowpark.DataFrame:
        """Euclidean distance from every row of ``df`` to every node.

        Returns the columns id, ``CLUSTER_ID``, ``distance`` and ``r``, where
        ``r`` ranks the nodes of each id by ascending distance (1 = nearest).
        """
        # When the id column shares the node id's name, the cross join suffixes
        # both sides and the suffixed names have to be used instead.
        same_name = id == self.cluster_id
        id_col = F.col(join(id,'_L')) if same_name else F.col(id)
        cluster_col = F.col(join(self.cluster_id,'_R')) if same_name else F.col(self.cluster_id)
        euclidean = F.sqrt(sum([F.pow(F.col(join(c,'_L'))-F.col(join(c,'_R')),F.lit(2)) for c in inputCols]))
        rank = F.row_number().over(W.Window.partitionBy(id_col).orderBy(F.col('distance').asc()))
        return df.crossJoin(self.cluster,lsuffix='_L',rsuffix='_R') \
            .select(id_col,cluster_col,euclidean.alias('distance'),rank.alias('r'))\
            .select(id_col,cluster_col,F.col('distance'),F.col('r'))

    def getNeighborHood(self,inputCols: list[str], sigma: float) -> snowpark.DataFrame:
        """Neighbourhood measured in feature space (distance between node weights).

        For every pair of nodes returns ``BMI_ID``, the other node's columns,
        their weight-space ``distance`` and a Gaussian ``influence_rate``.  With
        ``sigma <= 0`` only a distance of exactly 0 gets influence 1.
        """
        weight_distance = F.sqrt(sum([F.pow(F.col(join(c,'_L'))-F.col(join(c,'_R')),F.lit(2)) for c in inputCols]))
        only_self = F.when(F.col('distance')==F.lit(0.0),F.lit(1.0)).otherwise(F.lit(0.0))
        influence = F.when(F.lit(sigma)>F.lit(0.0),F.exp(-F.pow(F.col('distance'),2)/(F.lit(2)*F.pow(sigma,2)))).otherwise(only_self)
        return self.cluster.crossJoin(self.cluster,lsuffix='_L',rsuffix='_R')\
            .select([F.col(join(self.cluster_id,'_L')).alias('BMI_ID')]+[F.col(join(c,'_R')).alias(c) for c in self.cluster.columns]+[weight_distance.alias('distance'),influence.alias('influence_rate')])
                    
    def getNeighborHood2(self,inputCols: list[str], sigma: float) -> snowpark.DataFrame:
        """Neighbourhood measured on the node grid (``self.neighbor``).

        Returns ``BMI_ID``, ``CLUSTER_ID``, the node weights and a Gaussian
        ``influence_rate``.  ``distance`` in ``self.neighbor`` is already
        squared, so it is not squared again here.  ``inputCols`` is unused.
        """
        only_self = F.when(F.col('distance')==F.lit(0.0),F.lit(1.0)).otherwise(F.lit(0.0))
        influence = F.when(F.lit(sigma)>F.lit(0.0),F.exp(-F.col('distance')/(F.lit(2)*F.pow(sigma,2)))).otherwise(only_self)
        return self.cluster.join(self.neighbor,[self.cluster_id],lsuffix='_L',rsuffix='_R')\
            .select([F.col('BMI_ID')]+[F.col(self.cluster_id).alias(self.cluster_id)]+[F.col(c) for c in self.cluster.columns if c != self.cluster_id]+[influence.alias('influence_rate')])
                    
    def fit(self,df: snowpark.DataFrame,id: str,inputCols: list[str],maxIter=5, batchSize=None,initialize=True) -> SOM:
        """Train the map with batch updates and return ``self``.

        With ``batchSize`` set, each iteration uses a fresh sample of that size
        and ``maxIter`` is scaled by ``df.count() / batchSize``.  The learning
        rate decays exponentially and sigma decays linearly over the run.
        """
        if initialize:
            self.cluster = self.initializeCluster2(df,inputCols).cache_result()
        iteration = 0
        lr=self.lr
        sigma=self.sigma
        maxIter = maxIter if batchSize is None else maxIter*(df.count()/batchSize)
        time_constant = maxIter
        self.run_id = str(uuid.uuid4())
        while iteration < maxIter:
            print('run_id:',self.run_id,',iteration:',iteration,',lr:',lr,',sigma:',sigma) 
            sample = df if batchSize is None else df.sample(n=batchSize).cache_result()
            # Best matching unit per sample row: the nearest node, renamed BMI_ID.
            bmu = self.distance(sample,id,inputCols).where(F.col('r') == F.lit(1)).drop(F.col('r')).rename(self.cluster_id,'BMI_ID').cache_result()
            neighborhood = self.getNeighborHood2(inputCols,sigma).cache_result()
            # '_L' columns come from the sample row, '_R' columns from the node.
            pulled = [(F.col(join(c,'_R'))+F.lit(lr)*(F.col('influence_rate')*(F.col(join(c,'_L'))-F.col(join(c,'_R'))))).alias(c) for c in inputCols]
            # No union with the previous nodes is needed: every node is joined to
            # every BMU, and out-of-range neighbours simply get influence 0.
            self.cluster = sample.join(bmu,id).join(neighborhood,on='BMI_ID',lsuffix='_L',rsuffix='_R').select([F.col(self.cluster_id)]+pulled) \
                .groupBy(F.col(self.cluster_id)).agg([F.mean(c).alias(c) for c in inputCols]).cache_result()
            iteration += 1
            lr=self.lr*math.exp(-iteration/time_constant)
            coefficient = 1.0 - (float(iteration)/time_constant)
            sigma=self.sigma*coefficient
        return self
    
    def transform(self,df: snowpark.DataFrame,id: str,inputCols: list[str]) -> snowpark.DataFrame:
        """Assign every row of ``df`` to its nearest node (id, ``CLUSTER_ID``)."""
        return self.distance(df,id,inputCols).where(F.col('r') == F.lit(1)).select(F.col(id),F.col(self.cluster_id))

In [58]:
def silhouette(model,df:snowpark.DataFrame,id: str,inputCols: list[str])-> tuple[snowpark.DataFrame, snowpark.DataFrame, float]:
    '''
    Simplified silhouette score that is O(NK) instead of O(N^2).
    https://arrow.tudublin.ie/cgi/viewcontent.cgi?article=1214&context=scschcomcon

    For every row, ``a`` is the distance to its nearest centroid (rank 1) and
    ``b`` the distance to the second nearest (rank 2), both taken from
    ``model.distance``.  The score is the mean of ``(b - a) / max(a, b)``.

    Returns ``(a, b, score)`` where ``a`` and ``b`` are the rank-1 and rank-2
    DataFrames; ``score`` is NaN when the mean comes back as None.
    '''
    data = model.distance(df,id,inputCols).cache_result()
    a = data.where(F.col('r') == 1)
    b = data.where(F.col('r') == 2)
    # b is the left side of the join, so '_L' is the second-nearest distance.
    second = F.col('distance'+'_L')
    nearest = F.col('distance'+'_R')
    per_row = b.join(a,[id],lsuffix='_L',rsuffix='_R').select(((second - nearest) / F.when(second >= nearest,second).otherwise(nearest)).alias('distance'))
    # Index the single cell directly: float() on a 1-element array is deprecated.
    mean_score = per_row.agg(F.mean(F.col('distance')).alias('distance')).toPandas().values[0,0]
    if mean_score is None:
        return a,b,float('nan')
    return a,b,float(mean_score)
#a,b,v = silhouette(model,df,'UNIVERSAL_ID',[c for c in df.columns if c != 'UNIVERSAL_ID'])
#print(v)

In [59]:
# Load and cache the three source tables in one pass.
data_cluster, data_meta, df = (
    session.read.table(table_name).cache_result()
    for table_name in ('cluster', 'cluster_meta', 'EXTENDED_ATTRIBUTES_DEMOGRAPHICS_CONFIDENCE_SAMPLE_2')
)

In [13]:
# Unfiltered, uncached view of 'som_t2'; narrow it by run_id / t where needed.
som_history_reader = session.read
som_t = som_history_reader.table('som_t2')

In [None]:
# Silhouette score of the last five saved training steps of one SOM run.
som_t = session.read.table('som_t').where(F.col('run_id')=='ce3edbf1-f66b-42ad-8ad6-80bc9c900b40').cache_result()
s = SOM(k=2,sigma=1,lr=1)
feature_cols = [c for c in df.columns if c != 'UNIVERSAL_ID']
# The score is a float, so the column must not be typed as an integer.
score_schema = T.StructType([T.StructField("t", T.IntegerType()), T.StructField("score", T.FloatType())])
steps = som_t.select(F.col('t')).distinct().orderBy(F.col('t')).toPandas().to_numpy()[:,0].tolist()
score_rows = []
for step in steps[-5:]:
    # Swap in the node weights saved at this step.
    s.cluster = som_t.where(F.col('t')==F.lit(step)).drop(F.col('t'),F.col('run_id')).cache_result()
    _,_,score = silhouette(s,df,'UNIVERSAL_ID',feature_cols)
    print(step,score)
    # DataFrames are immutable, so rows are collected here and the frame is
    # built once; a discarded unionAll() result would leave `scores` empty.
    score_rows.append([step,score])
scores = session.createDataFrame(data=score_rows,schema=score_schema)
scores.orderBy('t').toPandas()

In [60]:
def getModel(meta,df,run_id):
    """Rebuild a fitted model from stored metadata and centroids.

    ``meta`` supplies a JSON ``PARAM`` string for ``run_id`` (keys
    ``model_name`` and ``k``, plus ``sigma`` and ``lr`` for a SOM); ``df``
    supplies the centroid rows of that run.

    Raises IndexError when ``meta`` has no row for ``run_id``.
    """
    params = meta.where(F.col('run_id') == F.lit(run_id)).select(F.col('PARAM')).toPandas().values
    if len(params) == 0:
        raise IndexError(f"no metadata row found for run_id {run_id!r}")
    p = json.loads(params[0,0])
    m = Kmeans(k=p['k']) if p['model_name'] == 'kmeans' else SOM(k=p['k'],sigma=p['sigma'],lr=p['lr'])
    # Compare by equality: `c not in ('RUN_ID')` tests substrings of a string,
    # which would also drop columns such as 'ID' or 'RUN'.
    m.cluster = df.where(F.col('run_id') == F.lit(run_id)).select([F.col(c) for c in df.columns if c != 'RUN_ID']).cache_result()
    return m
model = getModel(data_meta,data_cluster,'9d4c28de-fc58-45df-809f-265017d296d2')

In [61]:
# Row identifier column; every other column of `df` is treated as a feature.
id_ = 'UNIVERSAL_ID'
inputCols = list(filter(lambda column: column != id_, df.columns))


In [None]:
# Nearest-cluster assignment for every row of `df`, cached for reuse below.
prediction = model.transform(df=df, id=id_, inputCols=inputCols).cache_result()

In [79]:
# Fixed-size random sample joined with its cluster assignment.
SAMPLE_ROWS = 1000
sample = df.sample(n=SAMPLE_ROWS).cache_result()
t = prediction.join(sample, on=[id_]).cache_result()

In [43]:
# Peek at the first ten assignments.
prediction_preview = prediction.limit(10)
prediction_preview.toPandas()

Unnamed: 0,UNIVERSAL_ID,CLUSTER_ID
0,a1912268c7b054d92f1affd0392f89f5f4b26c99ac99a7...,12.0
1,db777083652b4caae5d5c763f34068b4eb15e80fdefc7e...,10.0
2,6ca118f6582965a17ae8fa2da89ff1e31928e565c11de6...,17.0
3,147e0eb5e6225bfaf47c46f250ebea8bb31ceeec3db737...,7.0
4,8e9f2c3f4e2b96a5e1387be338ac01c82049d19a3f82f4...,14.0
5,057cc6e2cb332ab11983b379280f09bc6aebbc1d2d35d1...,6.0
6,edc0010fc520c3e716d6d948091f1317b11af25952cefa...,18.0
7,85ed7ad039d1df6d39ad556f252c9c5cbcc542853006a2...,8.0
8,35effc3f4278a192f251c4051d2740455a30d635b3db05...,0.0
9,044c9fe940a70d0d208e0502ef19c657cd0bb2c8cb3a52...,20.0


In [73]:
# Per-cluster feature means compared with the population mean / std of `df`.
cluster_means = prediction.join(df, id_).groupBy(F.col(model.cluster_id)).agg(
    [F.mean(F.col(c)).alias(c) for c in inputCols]
)
population_stats = df.select(
    [F.mean(F.col(c)).alias(c + '_mean') for c in inputCols]
    + [F.stddev(F.col(c)).alias(c + '_std') for c in inputCols]
)
# Deviation from the population mean, kept only when it is at least one
# standard deviation in magnitude (null otherwise).
notable_deviation = [
    F.when((F.abs(F.col(c) - F.col(c + '_mean')) >= F.col(c + '_std')), (F.col(c) - F.col(c + '_mean'))).alias(c)
    for c in inputCols
]
# Despite the '_abs' suffix these columns hold the signed deviation.
signed_deviation = [((F.col(c) - F.col(c + '_mean'))).alias(c + '_abs') for c in inputCols]
temp = (
    cluster_means
    .crossJoin(population_stats)
    .select([F.col(model.cluster_id)] + notable_deviation + signed_deviation)
    .orderBy(F.col(model.cluster_id).asc())
    .toPandas()
)
temp



Unnamed: 0,CLUSTER_ID,GENDER_CONFIDENCE,AGE_CONFIDENCE,ESTIMATED_HOUSEHOLD_INCOME_CONFIDENCE,MARITAL_STATUS_CONFIDENCE,SINGLE_PARENT_CONFIDENCE,HOUSEHOLD_COMPOSITION_CONFIDENCE,USES_CREDIT_CARD_CONFIDENCE,SPANISH_SPEAKING_CONFIDENCE,LENGTH_OF_RESIDENCE_CONFIDENCE,HOME_BUILT_YEAR_CONFIDENCE,HOME_MKT_VAL_CONFIDENCE,HOME_OWNERSHIP_CONFIDENCE,DWELLING_TYPE_CONFIDENCE,HOME_OFFICE_CONFIDENCE,HOME_POOL_CONFIDENCE,SENIOR_ADULT_IN_HH_CONFIDENCE,CHILD_PRESENT_CONFIDENCE,CHILD_UNDER_6_PRESENT_CONFIDENCE,CHILD_6_10_PRESENT_CONFIDENCE,CHILD_11_15_PRESENT_CONFIDENCE,CHILD_16_17_PRESENT_CONFIDENCE,NUMBER_OF_CHILDREN_CONFIDENCE,EDUCATION_LEVEL_CONFIDENCE,OCCUPATION_CONFIDENCE,GENDER_CONFIDENCE_ABS,AGE_CONFIDENCE_ABS,ESTIMATED_HOUSEHOLD_INCOME_CONFIDENCE_ABS,MARITAL_STATUS_CONFIDENCE_ABS,SINGLE_PARENT_CONFIDENCE_ABS,HOUSEHOLD_COMPOSITION_CONFIDENCE_ABS,USES_CREDIT_CARD_CONFIDENCE_ABS,SPANISH_SPEAKING_CONFIDENCE_ABS,LENGTH_OF_RESIDENCE_CONFIDENCE_ABS,HOME_BUILT_YEAR_CONFIDENCE_ABS,HOME_MKT_VAL_CONFIDENCE_ABS,HOME_OWNERSHIP_CONFIDENCE_ABS,DWELLING_TYPE_CONFIDENCE_ABS,HOME_OFFICE_CONFIDENCE_ABS,HOME_POOL_CONFIDENCE_ABS,SENIOR_ADULT_IN_HH_CONFIDENCE_ABS,CHILD_PRESENT_CONFIDENCE_ABS,CHILD_UNDER_6_PRESENT_CONFIDENCE_ABS,CHILD_6_10_PRESENT_CONFIDENCE_ABS,CHILD_11_15_PRESENT_CONFIDENCE_ABS,CHILD_16_17_PRESENT_CONFIDENCE_ABS,NUMBER_OF_CHILDREN_CONFIDENCE_ABS,EDUCATION_LEVEL_CONFIDENCE_ABS,OCCUPATION_CONFIDENCE_ABS
0,0.0,0.291838,0.249529,0.232716,,,,0.411993,,,,,0.376542,0.3614,,,,,,,,,,,,0.291838,0.249529,0.232716,0.24715,0.185227,0.210878,0.411993,0.220762,0.168632,0.294659,0.11343,0.376542,0.3614,0.084645,0.061462,0.122897,0.174607,0.075406,0.090628,0.071909,0.035372,0.052137,0.272992,0.169363
1,1.0,,,,,-0.271148,-0.378451,,,,,,,,,,,-0.540042,,,,,,,,-0.037096,-0.098999,-0.087591,-0.216861,-0.271148,-0.378451,-0.207139,-0.241508,-0.036449,-0.094046,-0.13515,-0.197676,-0.082061,-0.039475,-0.03606,0.008555,-0.540042,-0.073575,-0.085545,-0.06914,-0.03689,-0.2449,-0.203563,-0.112994
2,2.0,,,,,,,,,,,,,,,,,,,,,,,,,-0.04575,-0.036854,-0.055623,-0.032382,-0.211446,0.084726,-0.055883,-0.253483,-0.037722,-0.003561,-0.057022,-0.049639,-0.089189,0.051638,-0.021738,0.061721,0.201835,0.041508,0.014293,0.004697,0.014424,0.05037,-0.016562,0.040634
3,3.0,,,,,,,,,,,,,,,,,,,,,,,,,-0.099643,-0.049701,-0.034518,0.013368,0.197007,0.041517,-0.06139,0.190854,-0.041643,-0.098426,0.054393,-0.052482,-0.081401,-0.057203,0.002788,-0.110305,0.069666,-0.026764,-0.009565,-0.002379,-0.007817,0.073569,-0.018086,-0.053135


In [70]:
# Number of rows assigned to each cluster.
rows_per_cluster = prediction.groupBy(F.col(model.cluster_id)).count()
rows_per_cluster.toPandas()

Unnamed: 0,CLUSTER_ID,COUNT
0,0.0,18848
1,3.0,36391
2,2.0,24730
3,1.0,20031


In [67]:
# One row per feature, ordered by the values in cluster columns 0..k-1.
temp.transpose().sort_values(by=list(range(model.k)))

Unnamed: 0,0,1,2,3
CHILD_PRESENT_CONFIDENCE,-0.540042,0.174607,0.201835,0.069666
HOUSEHOLD_COMPOSITION_CONFIDENCE,-0.378451,0.210878,0.084726,0.041517
SINGLE_PARENT_CONFIDENCE,-0.271148,0.185227,-0.211446,0.197007
NUMBER_OF_CHILDREN_CONFIDENCE,-0.2449,0.052137,0.05037,0.073569
SPANISH_SPEAKING_CONFIDENCE,-0.241508,0.220762,-0.253483,0.190854
...,...,...,...,...
CHILD_PRESENT_CONFIDENCE_ABS,0.540042,0.174607,0.201835,0.069666
SINGLE_PARENT_CONFIDENCE_BOOL,True,False,False,False
HOUSEHOLD_COMPOSITION_CONFIDENCE_BOOL,True,False,False,False
CHILD_PRESENT_CONFIDENCE_BOOL,True,False,False,False


In [205]:
def plot_2d(df:snowpark.DataFrame,id:str,fit=None,title='',mark='circle'):
    """Scatter plot of ``df`` projected onto its first two principal components.

    Parameters
    ----------
    df : Snowpark DataFrame holding the label column ``id`` plus the features.
    id : name of the label column; it colours the points and is never fed to PCA.
    fit : optional Snowpark DataFrame of features used to fit the PCA.  When
        omitted, the PCA is fitted on the features of ``df`` itself.
        NOTE(review): its columns are assumed to match the features of ``df``
        in name and order - confirm at the call site.
    title : chart title.
    mark : the single point shape used for every label.

    Returns an Altair chart.
    """
    from sklearn.decomposition import PCA
    points = df.toPandas().sort_values(by=id, ascending=True)
    labels = points[id].values
    features = points.drop(id,axis=1)
    # The default path must exclude the label column; otherwise the PCA would
    # be fitted on one more column than it is later asked to transform.
    training = features if fit is None else fit.toPandas()
    projector = PCA(2).fit(X=training)
    coords = projector.transform(X=features)
    frame = pd.DataFrame({'pa_x':coords[:,0],'pa_y':coords[:,1],id:labels})
    return alt.Chart(frame,title=title).mark_point().encode(
        x='pa_x:Q',
        y='pa_y:Q',
        color=alt.Color(id+':N'),
        shape=alt.Shape( id+":N",scale = alt.Scale(range=[mark],zero=True))
    )
plot_2d(t.select(['CLUSTER_ID']+[c for c in model.cluster.columns if c != 'CLUSTER_ID']),'CLUSTER_ID',fit=sample.drop(id_),title='title')



In [60]:
# One chart per saved training step: centroids (crosses) over the sampled
# points coloured by their nearest centroid (circles).
results=[]
# These lookups do not depend on the step, so they are issued once rather
# than once per loop iteration.
max_t = som_t.agg(F.max('t')).collect()[0][0]
k = int(math.sqrt(som_t.agg(F.max('CLUSTER_ID')).collect()[0][0]+1))
centroid_cols = [F.col(c) for c in som_t.columns if c not in ("T", "RUN_ID")]
pca_fit = sample.drop(id_)
# NOTE(review): range() stops before max_t, so the last saved step is not
# plotted - confirm whether that is intended.
# NOTE(review): this loop rebinds the notebook-level `model` and `t`.
for time in range(1,max_t):
    # Kmeans is used only for its nearest-centroid transform; its centroids
    # are replaced with the ones saved at this step.
    model = Kmeans(k)
    model.cluster = som_t.where(F.col('t')==F.lit(time)).select(centroid_cols)
    t= model.transform(sample,id_,inputCols).join(sample,[id_]).cache_result()
    centroid_chart = plot_2d(model.cluster,'CLUSTER_ID',fit=pca_fit,title='t:'+str(time),mark='cross')
    point_chart = plot_2d(t.select(['CLUSTER_ID']+[c for c in model.cluster.columns if c != 'CLUSTER_ID']),'CLUSTER_ID',fit=pca_fit,title='t:'+str(time),mark='circle')
    results.append(centroid_chart+point_chart)



In [62]:
from functools import reduce


def _side_by_side(left, right):
    """Combine two charts with the ``|`` operator."""
    return left | right


# Fold every per-step chart into a single horizontally concatenated chart.
reduce(_side_by_side, results)

In [178]:
# Score together with two hyper-parameters for every SOM run in the metadata.
c1, c2, c3 = 'sigma', 'k', 'SCORE'
param_json = F.col('param')
is_som_run = F.json_extract_path_text(param_json, F.lit('model_name')) == F.lit('som')
selected_params = [
    F.json_extract_path_text(param_json, F.lit(name)).alias(name)
    for name in (c1, c2)
]
temp = data_meta.where(is_som_run).select(F.col('score'), *selected_params).toPandas()

In [69]:
#https://stackoverflow.com/questions/6485699/need-a-specific-example-of-u-matrix-in-self-organizing-map
w,h = int(model.k**0.5),int(model.k**0.5)
weights =model.cluster.orderBy(model.cluster_id).drop(model.cluster_id).toPandas().to_numpy().reshape(w,h,-1).transpose(1,0,2)
u_matrix = np.zeros((2*h-1,2*w-1))
for i in range(2*h-1):
      for j in range(2*w-1):
        nb = (0,0)
        if not (i % 2) and (j % 2):
            nb = (0,1)

        elif (i % 2) and not (j % 2):
            nb = (1,0)
        u_matrix[(i,j)] = np.linalg.norm(
            weights[i //2, j //2] - weights[i //2 +nb[0], j // 2 + nb[1]],
            axis = 0
        )
for i in range(2*h-1):
      for j in range(2*w-1):
        if not (i % 2) and not (j % 2):
            #print('if:',i,',',j)
            nodelist = []
            if i > 0:
                nodelist.append((i-1,j))
            if i < u_matrix.shape[0]-1:
                nodelist.append((i+1, j))
            if j > 0:
                nodelist.append((i,j -1))
            if j < u_matrix.shape[1]-1:
                nodelist.append((i,j+1))

            meanlist = [u_matrix[u_node] for u_node in nodelist]
            u_matrix[(i,j)] = np.mean(meanlist)
        elif (i % 2) and (j % 2):
            #print('elif:',i,',',j)
            nodelist = [
                            (i - 1, j),
                            (i + 1, j),
                            (i, j - 1),
                            (i, j + 1)]
            meanlist = [u_matrix[u_node] for u_node in nodelist]
            u_matrix[(i,j)] = np.mean(meanlist)
u_matrix = np.vectorize(lambda x:(x-u_matrix.min())/(u_matrix.max()-u_matrix.min()))(u_matrix)
u_matrix = pd.DataFrame(u_matrix).stack().rename_axis(['Y', 'X']).reset_index(name='DISTANCE')
print(u_matrix.columns)



Index(['Y', 'X', 'DISTANCE'], dtype='object')


In [80]:
from minisom import MiniSom
# Cross-check of the U-matrix using MiniSom's own distance map.
# The grid is sized from `weights` itself: model.k is the total node count,
# not the side length, so it would not match the injected weights.
grid_rows, grid_cols, n_features = weights.shape
msom = MiniSom(grid_rows, grid_cols, n_features, sigma=1, learning_rate=1,random_seed=0)
# Inject the trained weights; no MiniSom training takes place in this cell.
msom._weights = weights
u_matrix2 = msom.distance_map('mean')
u_matrix2 = pd.DataFrame(u_matrix2).stack().rename_axis(['Y', 'X']).reset_index(name='DISTANCE')

In [89]:
# Heat map of the U-matrix, one rectangle per cell.
alt.data_transformers.disable_max_rows()
distance_color = alt.Color('DISTANCE:Q', scale=alt.Scale(scheme='inferno'))
u_matrix_encoding = dict(x='X:O', y='Y:O', color=distance_color)
alt.Chart(u_matrix).mark_rect().encode(**u_matrix_encoding)

In [None]:
#Validation

In [98]:
# Validation: a single k-means update from fixed centroids must reproduce
# centroids worked out by hand.
m = Kmeans(2)
m.cluster = session.createDataFrame([[0,1,1],[1,0,2]],schema=T.StructType([T.StructField("cluster_id", T.IntegerType()),T.StructField("a", T.IntegerType()), T.StructField("b", T.IntegerType())]))
d = session.createDataFrame([[0,1,1],[1,1,0],[2,0,2],[3,2,4],[4,3,5]],schema=T.StructType([T.StructField("id", T.IntegerType()),T.StructField("a", T.IntegerType()), T.StructField("b", T.IntegerType())]))
m = m.fit(d,'id',['a','b'],maxIter=1,initialize=False)
expected = pd.DataFrame({'CLUSTER_ID': [0, 1], 'A': [1.000000, 1.666667], 'B': [0.500000, 3.666667]}).to_numpy()
actual = m.cluster.orderBy(F.col('CLUSTER_ID').asc()).toPandas().to_numpy().astype(float)
# Compare with a tolerance: the expected means are rounded to six decimals.
np.allclose(actual, expected, atol=1e-6)

run_id: edd7bc7d-51eb-4741-943d-a2a19823a89b ,iteration: 0
   CLUSTER_ID  COUNT(LITERAL())
0           0                 2
1           1                 3


True

In [12]:
from minisom import MiniSom
# Validation: one SOM training step must match MiniSom when both start from
# the same weights.  A single sample is used here.
data = [
        [ 0.82,  0.50,  0.23,  0.03]
    ]
sigma = 0.7
lr = 0.5
k = 6
feature_schema = T.StructType([T.StructField("a", T.FloatType()), T.StructField("b", T.FloatType()), T.StructField("c", T.FloatType()), T.StructField("d", T.FloatType())])
# Reference implementation: k x k map over 4 features.
msom = MiniSom(k, k, 4, sigma=sigma, learning_rate=lr,random_seed=0)
# Copy the initial weights before training so both models start identically.
pre_w = msom.get_weights().copy().reshape([k**2,-1])
msom.train(data, 1) # trains the SOM for a single iteration
# Snowpark implementation fed with the same data and initial weights.
d = session.createDataFrame(data,schema=feature_schema)\
        .select(F.col('*'),(F.row_number().over(W.Window.orderBy(F.lit(None)))-F.lit(1)).alias('id'))
m =SOM(k=k,sigma=sigma, lr=lr)
# NOTE(review): ids come from a row_number over an unordered window, so this
# assumes the rows keep the order of `pre_w` - confirm.
m.cluster = session.createDataFrame(pd.DataFrame(pre_w,columns=['A','B','C','D']),schema=feature_schema) \
    .select(F.col('*'),(F.row_number().over(W.Window.orderBy(F.lit(None)))-F.lit(1)).alias('cluster_id'))
m.fit(d,'id',['a','b','c','d'],maxIter=1,initialize=False)
# Drop the CLUSTER_ID column and compare with a tolerance instead of exact
# equality, because the two engines compute the floats independently.
actual = m.cluster.orderBy(F.col('CLUSTER_ID').asc()).toPandas().to_numpy()[:,1:].astype(float)
np.allclose(actual, msom._weights.reshape(k**2,-1))




run_id: 47e1e66d-791e-463c-9fb5-63e1e02f688f ,iteration: 0 ,lr: 0.5 ,sigma: 0.7


True

In [13]:
from quicksom.som import SOM as QSOM
# Validation: one full-batch SOM epoch must match quicksom when both start
# from the same centroids.
data = [[ 0.80,  0.55,  0.22,  0.03],
        [ 0.82,  0.50,  0.23,  0.03],
        [ 0.80,  0.54,  0.22,  0.03],
        [ 0.80,  0.53,  0.26,  0.03],
        [ 0.79,  0.56,  0.22,  0.03],
        [ 0.75,  0.60,  0.25,  0.03],
        [ 0.77,  0.59,  0.22,  0.03]
    ]
sigma = 0.7
lr = 0.5
k = 6
batch_size = len(data)
feature_schema = T.StructType([T.StructField("a", T.FloatType()), T.StructField("b", T.FloatType()), T.StructField("c", T.FloatType()), T.StructField("d", T.FloatType())])
# Reference implementation (quicksom): k x k map over 4 features.
msom = QSOM(k, k, 4, sigma=sigma, alpha=lr)
# Copy the initial centroids before training so both models start identically.
pre_w = msom.centroids.numpy().copy()
msom.fit(np.array(data), batch_size=batch_size,n_epoch=1,num_workers=0)
# Snowpark implementation fed with the same data and initial centroids.
d = session.createDataFrame(data,schema=feature_schema)\
        .select(F.col('*'),(F.row_number().over(W.Window.orderBy(F.lit(None)))-F.lit(1)).alias('id'))
# quicksom offsets the sigma denominator by 1e-5 to guard against sigma == 0,
# so the same offset is folded into sigma here.
m =SOM(k=k,sigma=(sigma**2+1e-5)**(1/2), lr=lr)
# NOTE(review): ids come from a row_number over an unordered window, so this
# assumes the rows keep the order of `pre_w` - confirm.
m.cluster = session.createDataFrame(pd.DataFrame(pre_w,columns=['A','B','C','D']),schema=feature_schema) \
    .select(F.col('*'),(F.row_number().over(W.Window.orderBy(F.lit(None)))-F.lit(1)).alias('cluster_id'))
m.fit(d,'id',['a','b','c','d'],maxIter=1,initialize=False)
# Compare within 1e-4 instead of rounding both sides to four decimals, which
# can disagree for values that sit on a rounding boundary.
actual = m.cluster.orderBy(F.col('CLUSTER_ID').asc()).toPandas().to_numpy()[:,1:].astype(np.float32)
np.allclose(actual, msom.centroids.numpy(), atol=1e-4)

1/1: 0/7 | alpha: 0.500000 | sigma: 0.700000 | error: 0.594401 | time: 0:00:00.029416 | eta: 0:00:00




run_id: 5c0476ba-36f9-4271-b0b3-38947618d742 ,iteration: 0 ,lr: 0.5 ,sigma: 0.7000071428207001


True