In [1]:
# Locate the local Spark installation so that pyspark can be imported
import findspark
findspark.init()

from pyspark.ml.feature import HashingTF, IDF, Tokenizer, CountVectorizer
from pyspark.mllib.clustering import KMeans, KMeansModel
from pyspark.mllib.linalg import Vectors
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from numpy import array
from math import sqrt


# Create (or reuse) the Spark session used by the rest of the notebook.
spark = SparkSession.builder.appName("pySparkIDF").getOrCreate()

# Location of the signatures CSV data; change this to a path on your system.
signaturesData = "work/signatures"

# Read the CSV with a header row and let Spark infer the column types.
# The path variable is used here so that it is the single place to edit.
sigs_raw_df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(signaturesData)
)
sigs_raw_df.cache()  # Cache data for faster reuse

# Drop rows with missing values.
sigs_clean_df = sigs_raw_df.dropna()

# printSchema() writes the schema to stdout itself and returns None, so it
# must not be passed to print() (that would print the literal "None").
print("Schema from Signatures:")
sigs_clean_df.printSchema()
print("Sigs: ", sigs_clean_df)

# Keep only the rows whose "extra" column mentions 'exp_string'.
sigs_df = sigs_clean_df.filter(sigs_clean_df.extra.contains('exp_string'))
sigs_df_count = sigs_df.count()

print("Lines with exp: %i " % (sigs_df_count))

# Turn the "extra" text column into a "words" token column.
tokenizer = Tokenizer(outputCol="words", inputCol="extra")
wordsData = tokenizer.transform(sigs_df)

# Fit a CountVectorizerModel on the tokenized corpus (vocabSize=2000,
# minDF=2.0), then show the resulting count vectors next to their tokens.
cv = CountVectorizer(
    inputCol="words",
    outputCol="count_features",
    vocabSize=2000,
    minDF=2.0,
)
model = cv.fit(wordsData)
words_only = wordsData.select("words")
result = model.transform(words_only)
result.show()

# Build term-frequency vectors with the hashing technique (200 features).
# CountVectorizer could be used instead to obtain term-frequency vectors.
hashingTF = HashingTF(
    numFeatures=200,
    inputCol="words",
    outputCol="rawFeatures",
)
featurizedData = hashingTF.transform(wordsData)

# Fit IDF on the raw term frequencies and write the rescaled vectors
# to the "features" column.
idf = IDF(outputCol="features", inputCol="rawFeatures")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

# Display each signature name alongside its feature vector.
name_and_features = rescaledData.select("name", "features")
name_and_features.show()

root
 |-- uuid: string (nullable = true)
 |-- name: string (nullable = true)
 |-- identity: string (nullable = true)
 |-- source: string (nullable = true)
 |-- source_uri: string (nullable = true)
 |-- extra: string (nullable = true)

Schema from Signatures: None
Sigs:  DataFrame[uuid: string, name: string, identity: string, source: string, source_uri: string, extra: string]
Lines with exp: 10948 
+--------------------+--------------------+
|               words|      count_features|
+--------------------+--------------------+
|["{'"exp_string'"...|(2000,[0,1,4,120,...|
|["{'"exp_string'"...|(2000,[0,1,2,4,50...|
|["{'"exp_string'"...|(2000,[0,1,4,1007...|
|["{'"exp_string'"...|    (2000,[1],[1.0])|
|["{'"exp_string'"...|(2000,[0,1,61,235...|
|["{'"exp_string'"...|(2000,[0,1,2,22,6...|
|["{'"exp_string'"...|(2000,[0,1,4,6,61...|
|["{'"exp_string'"...|(2000,[0,1,2,4,61...|
|["{'"exp_string'"...|(2000,[22,476],[1...|
|["{'"exp_string'"...|(2000,[4,234],[1....|
|["{'"exp_string'"...|(2000

In [3]:

# The RDD-based (mllib) KMeans expects an RDD of mllib vectors, so convert
# the DataFrame's "features" column into such an RDD.
#
# Each element of `.rdd` is a Row; pull the vector out of the Row and densify
# it explicitly rather than handing the whole Row to Vectors.dense and relying
# on implicit array coercion.
# The RDD is cached because it is traversed repeatedly (training iterations,
# cost computation, and the later clustering cells).
inputRddForKMeans = (
    rescaledData.select("features")
    .rdd.map(lambda row: Vectors.dense(row["features"].toArray()))
    .cache()
)

# Fixed seed so the random initialization (and therefore the centers and the
# cost printed below) is reproducible across runs.
kmmodel = KMeans.train(
    inputRddForKMeans,
    5,
    maxIterations=10,
    initializationMode="random",
    seed=42,
)

print("Final centers: " + str(kmmodel.clusterCenters))
print("Total Cost: " + str(kmmodel.computeCost(inputRddForKMeans)))


Final centers: [array([6.98043718e+01, 4.36322473e+00, 3.33176558e+01, 0.00000000e+00,
       0.00000000e+00, 4.92624681e+00, 0.00000000e+00, 5.74144575e+01,
       0.00000000e+00, 2.35662512e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 3.88046841e+00, 0.00000000e+00, 1.60372726e+01,
       0.00000000e+00, 0.00000000e+00, 2.71929121e+02, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 1.44162792e+00, 0.00000000e+00,
       3.95511480e+00, 0.00000000e+00, 1.56135268e-01, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 3.77758906e+00, 0.00000000e+00, 0.00000000e+00,
       1.79315088e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       7.53515327e+00, 2.38060816e+01, 0.00000000e+00, 0.00000000e+00,
       4.22292095e+00, 0.00000000e+00, 9.77303580e-01, 9.60920018e+00,
       1.83286601e+02, 0.00000000e+00, 4.49262204e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.0000

Total Cost: 99813732.62686466


In [None]:
from pyspark.mllib.clustering import BisectingKMeans, BisectingKMeansModel
# Hierarchical clustering with bisecting k-means.

# Train the model on the same vectors used for plain k-means (5 clusters,
# at most 10 iterations).
biHmodel = BisectingKMeans.train(
    inputRddForKMeans,
    5,
    maxIterations=10,
)

# Evaluate the clustering by its cost over the training vectors.
cost = biHmodel.computeCost(inputRddForKMeans)
cost_report = "Bisecting K-means Cost = " + str(cost)
print(cost_report)



In [None]:
from pyspark.mllib.clustering import LDA, LDAModel

# Number of topics to learn; used for training and for the report loop so
# the two cannot drift apart.
num_topics = 3

# Index documents with unique IDs: LDA.train takes [id, vector] pairs.
# NOTE(review): these vectors come from inputRddForKMeans (the IDF-rescaled
# features), not raw term counts -- confirm that is the intended LDA input.
corpus = inputRddForKMeans.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()

# Cluster the documents into topics using LDA (seeded for reproducibility).
ldaModel = LDA.train(corpus, k=num_topics, seed=42)

# Output topics. Each is a distribution over words (matching the input vectors).
vocab_size = ldaModel.vocabSize()
print("Learned topics (as distributions over vocab of " + str(vocab_size)
      + " words):")
topics = ldaModel.topicsMatrix()
for topic in range(num_topics):
    print("Topic " + str(topic) + ":")
    for word in range(0, vocab_size):
        print(" " + str(topics[word][topic]))

# Save and load model.
# The mllib save/load API takes a SparkContext, not a SparkSession, so pass
# the session's underlying context.
# NOTE(review): saving presumably fails if "work/LDAModel" already exists --
# remove it before re-running this cell.
sc = spark.sparkContext
ldaModel.save(sc, "work/LDAModel")
sameLDAModel = LDAModel.load(sc, "work/LDAModel")