In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql import *
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType
from pyspark.sql.functions import col
import pyspark.sql.functions as fn


#from pyspark.mllib.util import MLUtils
from pyspark.sql.types import *
from pyspark.ml.feature import CountVectorizer, CountVectorizerModel, Tokenizer, RegexTokenizer, StopWordsRemover

# Reuse the already-running SparkContext when there is one. Constructing a
# second context in the same kernel (e.g. when this cell is re-run) raises
# "ValueError: Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate()

In [4]:
# Obtain the SparkSession used for all DataFrame reads below; getOrCreate()
# returns the existing session if there is one instead of building a new one.
spark = SparkSession.builder.getOrCreate()

In [2]:
from pyspark.ml.clustering import LDA

In [8]:
# Load the input data in LIBSVM format into a DataFrame.
# NOTE(review): the path is relative, so where it resolves depends on how and
# where Spark is launched — confirm the file location before a cluster run.
dataset = spark.read.format("libsvm").load("sample_libsvm_data.txt")

In [16]:
# Inspect the estimator's class.
# NOTE(review): `lda` is only assigned in the training cell that appears further
# down in this notebook, so this cell raises NameError on a fresh
# "Restart Kernel -> Run All". Move it below the training cell or drop it.
type(lda)

pyspark.ml.clustering.LDA

In [11]:
# Train an LDA topic model with 10 topics, capped at 10 iterations.
# The seed is fixed so that repeated runs start from the same random state;
# without it every run produces different topics and bounds.
lda = LDA(k=10, maxIter=10, seed=42)
model = lda.fit(dataset)

# Corpus-level fit metrics, evaluated on the same data the model was fit on.
ll = model.logLikelihood(dataset)
lp = model.logPerplexity(dataset)
print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
print("The upper bound on perplexity: " + str(lp))

# Describe each topic by its 3 top-weighted terms (term indices and weights).
topics = model.describeTopics(3)
print("The topics described by their top-weighted terms:")
topics.show(truncate=False)

# Apply the fitted model to the dataset and show the result without truncating
# the column contents.
transformed = model.transform(dataset)
transformed.show(truncate=False)

The lower bound on the log likelihood of the entire corpus: -13029493.086630248
The upper bound on perplexity: 5.352705788898725
The topics described by their top-weighted terms:
+-----+---------------+------------------------------------------------------------------+
|topic|termIndices    |termWeights                                                       |
+-----+---------------+------------------------------------------------------------------+
|0    |[261, 437, 206]|[0.03749973183601975, 0.03384664693406134, 0.03303501557296662]   |
|1    |[183, 211, 155]|[0.04685123492152563, 0.04138597714097195, 0.03780443976328545]   |
|2    |[239, 211, 267]|[0.03175473791544668, 0.030386654250472586, 0.02899787715045062]  |
|3    |[299, 382, 354]|[0.023722615484483237, 0.015946398397527206, 0.013922041658248811]|
|4    |[343, 370, 288]|[0.07910243773662522, 0.07497035295669686, 0.0731856820906498]    |
|5    |[540, 568, 345]|[0.011322634202100442, 0.010589280994641473, 0.009794898065609554]|
|6

In [20]:
# Run the fitted LDA model over the dataset and keep the resulting DataFrame.
countTopDocs = model.transform(dataset)

In [21]:
# Sanity check: confirm what kind of object model.transform() returned.
type(countTopDocs)

pyspark.sql.dataframe.DataFrame

In [22]:
# Preview the transformed rows using show()'s default settings (long cell
# values are truncated in the printed table).
countTopDocs.show()

+-----+--------------------+--------------------+
|label|            features|   topicDistribution|
+-----+--------------------+--------------------+
|  0.0|(692,[127,128,129...|[2.58892489975378...|
|  1.0|(692,[158,159,160...|[4.69801105671366...|
|  1.0|(692,[124,125,126...|[4.56204422220622...|
|  1.0|(692,[152,153,154...|[7.40282592515071...|
|  1.0|(692,[151,152,153...|[6.92639368392286...|
|  0.0|(692,[129,130,131...|[2.27197463878127...|
|  1.0|(692,[158,159,160...|[4.8561432182702E...|
|  1.0|(692,[99,100,101,...|[0.03967283554937...|
|  0.0|(692,[154,155,156...|[0.00759025049096...|
|  0.0|(692,[127,128,129...|[2.16025863395851...|
|  1.0|(692,[154,155,156...|[5.87802426271180...|
|  0.0|(692,[153,154,155...|[0.02401011755620...|
|  0.0|(692,[151,152,153...|[0.09505048919202...|
|  1.0|(692,[129,130,131...|[4.78087417059578...|
|  0.0|(692,[154,155,156...|[1.74080398890662...|
|  1.0|(692,[150,151,152...|[0.00497986029629...|
|  0.0|(692,[124,125,126...|[0.15012310687756...|


In [23]:
# Preview the raw input rows (label + sparse feature vector) for comparison
# with the transformed output; show() prints only the top 20 rows by default.
dataset.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows

