In [30]:
# Spark entry points and SQL helpers used by the rest of the notebook.
from pyspark import SparkConf, SparkContext
# NOTE: this wildcard import can shadow Python builtins (e.g. sum, min, max).
# It is kept because later cells use bare names from it, such as `col`.
from pyspark.sql.functions import *
from pyspark.sql import SQLContext
import pyspark.sql.functions as func
import sys
from pyspark.sql.functions import countDistinct

# Stop a context left over from an earlier run (or pre-created by the kernel)
# before building a fresh one. On a brand-new kernel `sc` may not exist yet,
# in which case there is nothing to stop.
try:
    sc.stop()
except NameError:
    pass

sc = SparkContext()
sqlContext = SQLContext(sc)

In [37]:
#other imports
from pyspark.mllib.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [31]:
# Load the sticker feature data from its Parquet location.
feature_data = sqlContext.read.parquet(
    "gs://ds-url-catag/stick_statistics/derived_features/"
)

In [32]:
# Display the column names of the loaded feature table.
feature_data.columns

['user_id_n',
 'numofdays',
 'avg_con_days',
 'sticker_packs_sent',
 'distinct_sticker_packs_sent',
 'sum_paid',
 'sum_free',
 'sum_subs',
 'sum_discont']

In [35]:
# Basic summary statistics for a feature column (the call below is left disabled)
# feature_data.describe('sticker_packs_sent').show()

In [62]:
# Pack the numeric feature columns into a single vector column named "features".
from pyspark.mllib.linalg import Vectors

assembler = VectorAssembler(
    inputCols=[
        "numofdays",
        "avg_con_days",
        "sticker_packs_sent",
        "distinct_sticker_packs_sent",
        "sum_paid",
        "sum_free",
        "sum_subs",
        "sum_discont"
    ],
    outputCol="features"
)
output = assembler.transform(feature_data)
# To inspect the assembled vectors: output.select("features").show()


In [63]:
# Rescale every assembled feature vector with the $L^1$ norm (p = 1.0).
from pyspark.mllib.util import MLUtils
from pyspark.ml.feature import Normalizer

normalizer = Normalizer(
    inputCol="features",
    outputCol="normFeatures",
    p=1.0
)
l1NormData = normalizer.transform(output)


In [76]:
# Keep only the normalized vectors, exposed under the column name 'features'.
dataset = l1NormData.select(col('normFeatures').alias('features'))

In [89]:
# Preview the first 10 feature vectors without truncating the cell text.
dataset.select('features').show(n=10, truncate=False)

+-----------------------------------------------------------------------------------------------------------------+
|features                                                                                                         |
+-----------------------------------------------------------------------------------------------------------------+
|[0.2857142857142857,0.02857142857142857,0.34285714285714286,0.17142857142857143,0.0,0.17142857142857143,0.0,0.0] |
|[0.19298245614035087,0.08771929824561403,0.5964912280701754,0.07017543859649122,0.0,0.05263157894736842,0.0,0.0] |
|(8,[0,2,3,7],[0.38461538461538464,0.46153846153846156,0.07692307692307693,0.07692307692307693])                  |
|(8,[0,2,3,5],[0.25,0.25,0.25,0.25])                                                                              |
|(8,[0,2,3,5],[0.3333333333333333,0.3333333333333333,0.16666666666666666,0.16666666666666666])                    |
|[0.3333333333333333,0.0,0.38095238095238093,0.14285714285714285,0.09523

In [86]:
# Fit k-means with 40 clusters and a fixed seed on `dataset`.
from pyspark.ml.clustering import KMeans
kmeans = KMeans(k=40, seed=1)
model = kmeans.fit(dataset)

In [87]:
# Within-set sum of squared errors (WSSSE) of the fitted model on `dataset`.
wssse = model.computeCost(dataset)
wssse_report = "Within Set Sum of Squared Errors = " + str(wssse)
print(wssse_report)

Within Set Sum of Squared Errors = 13548.6775627


In [81]:
# Print each fitted cluster center, one vector per entry.
centers = model.clusterCenters()
print("Cluster Centers: ")
for centroid in centers:
    print(centroid)

Cluster Centers: 
[  2.37308739e-01   8.87798341e-03   3.00327731e-01   2.34128945e-01
   2.31781485e-04   2.16993991e-01   7.76098387e-07   2.13005362e-03]
[  9.87584114e-02   1.69996681e-02   7.91714838e-01   5.14296350e-02
   5.50224760e-05   4.07093073e-02   7.92848419e-07   3.32324571e-04]
[  2.40603781e-01   3.28825567e-02   5.00046621e-01   1.23539377e-01
   2.15735731e-04   1.01385233e-01   1.77973813e-06   1.32491479e-03]
