In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql import *
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType
from pyspark.sql.functions import col
import pyspark.sql.functions as fn


from pyspark.mllib.util import MLUtils
from pyspark.sql.types import *
from pyspark.ml.feature import CountVectorizer, CountVectorizerModel, Tokenizer, RegexTokenizer, StopWordsRemover

sc = pyspark.SparkContext()


In [20]:
dataSchema = StructType([
        StructField("textID",StringType(),True),
        StructField("ID(seq)",StringType(),True),
        StructField("word",StringType(),True),
        StructField("lemma",StringType(),True),
        StructField("PoS",StringType(),True)])

In [21]:
spark = SparkSession.builder.getOrCreate()

In [22]:
DATA_DIR = '../SampleData/'

In [23]:
dataAll  = spark.read.option('delimiter', '\t').csv(path=DATA_DIR+'us_mini.txt', schema=dataSchema)

In [24]:
dataAll.show()

+--------+----------+----------+--------+------+
|  textID|   ID(seq)|      word|   lemma|   PoS|
+--------+----------+----------+--------+------+
|14637197|4739839025|@@14637197|    null|    fo|
|14637197|4739839026|       <p>|    null|  null|
|14637197|4739839027|       NEW|     new|   np1|
|14637197|4739839028|      YORK|    york|   np1|
|14637197|4739839029|         (|    null|     (|
|14637197|4739839030|        AP|      ap|   np1|
|14637197|4739839031|         )|    null|     )|
|14637197|4739839032|        --|    null|jj_nn1|
|14637197|4739839033|    Donald|  donald|   np1|
|14637197|4739839034|     Trump|   trump|   nn1|
|14637197|4739839035|        's|      's|    ge|
|14637197|4739839036|  five-day|five-day|    jj|
|14637197|4739839037|      feud|    feud|   nn1|
|14637197|4739839038|      with|    with|    iw|
|14637197|4739839039|         a|       a|   at1|
|14637197|4739839040|    former|  former|    da|
|14637197|4739839041|    beauty|  beauty|   nn1|
|14637197|4739839042

In [25]:
df.select(df.columns[['textID','lemma']])

NameError: name 'df' is not defined

In [26]:
data_all = dataAll['textID','lemma'].na.drop()

In [27]:
# import pandas as pd
# pll = data_all.toPandas()
# pll.lemma.nunique()

In [28]:
data_all_g = data_all.groupby("textID").agg(fn.collect_list("lemma"))

In [29]:
data_all_g.show()

+--------+--------------------+
|  textID| collect_list(lemma)|
+--------+--------------------+
|14637197|[new, york, ap, d...|
|14637202|[in, this, sept, ...|
|14637201|[another, hotel, ...|
|14637200|[here, be, all, t...|
+--------+--------------------+



In [30]:
cv = CountVectorizer(inputCol="collect_list(lemma)", outputCol="vectors")
cv_model = cv.fit(data_all_g)
#data_all_v = cv_model.transform(data_all_g)

In [31]:
top20 = list(cv_model.vocabulary[0:20])
more_then_3_charachters = [word for word in cv_model.vocabulary if len(word) <= 3]
contains_digits = [word for word in cv_model.vocabulary if any(char.isdigit() for char in word)]


In [32]:
stopwords = []  #Add additional stopwords in this list
default_stop = StopWordsRemover.loadDefaultStopWords('english')
#Combine the three stopwords
stopwords = stopwords + top20  + more_then_3_charachters + contains_digits + default_stop

In [33]:
#Remove stopwords from the tokenized list
remover = StopWordsRemover(inputCol="collect_list(lemma)", outputCol="filtered", stopWords = stopwords)
data_all_filtered = remover.transform(data_all_g)

In [34]:
data_all_filtered.show()

+--------+--------------------+--------------------+
|  textID| collect_list(lemma)|            filtered|
+--------+--------------------+--------------------+
|14637197|[new, york, ap, d...|[york, donald, tr...|
|14637202|[in, this, sept, ...|[sept, photo, ric...|
|14637201|[another, hotel, ...|[another, hotel, ...|
|14637200|[here, be, all, t...|[crazy, stuff, ha...|
+--------+--------------------+--------------------+



In [35]:
#Create a new CountVectorizer model without the stopwords
cv = CountVectorizer(inputCol="filtered", outputCol="vectors")
cvmodel = cv.fit(data_all_filtered)
df_vect = cvmodel.transform(data_all_filtered)

In [36]:
#transform the dataframe to a format that can be used as input for LDA.train. LDA train expects a RDD with lists,
#where the list consists of a uid and (sparse) Vector
def parseVectors(line):
    return [int(line[2]), line[0]]

In [37]:
df_vect.show()

+--------+--------------------+--------------------+--------------------+
|  textID| collect_list(lemma)|            filtered|             vectors|
+--------+--------------------+--------------------+--------------------+
|14637197|[new, york, ap, d...|[york, donald, tr...|(314,[4,5,6,11,16...|
|14637202|[in, this, sept, ...|[sept, photo, ric...|(314,[1,3,7,9,12,...|
|14637201|[another, hotel, ...|[another, hotel, ...|(314,[2,6,14,15,1...|
|14637200|[here, be, all, t...|[crazy, stuff, ha...|(314,[0,5,8,10,12...|
+--------+--------------------+--------------------+--------------------+



In [38]:
type(df_vect)

pyspark.sql.dataframe.DataFrame

In [None]:
from pyspark.mllib.clustering import LDA, LDAModel

In [None]:
sparsevector = df_vect.select('vectors', 'filtered', 'textID') #.map(parseVectors)

In [41]:
from pyspark.mllib.linalg import Vectors, SparseVector
# data = [[1, Vectors.dense([0.0, 1.0])],[2, SparseVector(2, {0: 1.0})],]
# rdd =  sc.parallelize(data)

In [4]:
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors

# Load and parse the data
data = sc.textFile("data/sample_lda_data.txt")
parsedData = data.map(lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))
# Index documents with unique IDs
corpus = parsedData.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()

# Cluster the documents into three topics using LDA
ldaModel = LDA.train(corpus, k=3)

# Output topics. Each is a distribution over words (matching word count vectors)
print("Learned topics (as distribu;tions over vocab of " + str(ldaModel.vocabSize())
      + " words):")
topics = ldaModel.topicsMatrix()
for topic in range(3):
    print("Topic " + str(topic) + ":")
    for word in range(0, ldaModel.vocabSize()):
        print(" " + str(topics[word][topic]))

# Save and load model
ldaModel.save(sc, "target/org/apache/spark/PythonLatentDirichletAllocationExample/LDAModel")
sameModel = LDAModel\
    .load(sc, "target/org/apache/spark/PythonLatentDirichletAllocationExample/LDAModel")

Learned topics (as distribu;tions over vocab of 11 words):
Topic 0:
 4.795578557451301
 6.095750212191132
 5.607507167633585
 17.744405827271557
 6.580147439528174
 5.861692264351127
 18.648060560075564
 1.4128362713016736
 2.540317606643392
 9.914053977270113
 6.128903053585665
Topic 1:
 12.066248592891844
 18.460671868543212
 3.0366434979996066
 4.981921140041983
 13.493327426200128
 13.411598690217076
 4.228664252196855
 6.883204276716702
 1.9353287268672172
 7.81580778123605
 1.9848002731641392
Topic 2:
 9.138172849656854
 4.443577919265655
 3.355849334366809
 17.27367303268646
 4.926525134271698
 2.726709045431797
 8.12327518772758
 1.7039594519816244
 3.5243536664893913
 6.270138241493836
 24.886296673250197


In [39]:
corpus.take(15)

[[0, DenseVector([1.0, 2.0, 6.0, 0.0, 2.0, 3.0, 1.0, 1.0, 0.0, 0.0, 3.0])],
 [1, DenseVector([1.0, 3.0, 0.0, 1.0, 3.0, 0.0, 0.0, 2.0, 0.0, 0.0, 1.0])],
 [2, DenseVector([1.0, 4.0, 1.0, 0.0, 0.0, 4.0, 9.0, 0.0, 1.0, 2.0, 0.0])],
 [3, DenseVector([2.0, 1.0, 0.0, 3.0, 0.0, 0.0, 5.0, 0.0, 2.0, 3.0, 9.0])],
 [4, DenseVector([3.0, 1.0, 1.0, 9.0, 3.0, 0.0, 2.0, 0.0, 0.0, 1.0, 3.0])],
 [5, DenseVector([4.0, 2.0, 0.0, 3.0, 4.0, 5.0, 1.0, 1.0, 1.0, 4.0, 0.0])],
 [6, DenseVector([2.0, 1.0, 0.0, 3.0, 0.0, 0.0, 5.0, 0.0, 2.0, 2.0, 9.0])],
 [7, DenseVector([1.0, 1.0, 1.0, 9.0, 2.0, 1.0, 2.0, 0.0, 0.0, 1.0, 3.0])],
 [8, DenseVector([4.0, 4.0, 0.0, 3.0, 4.0, 2.0, 1.0, 3.0, 0.0, 0.0, 0.0])],
 [9, DenseVector([2.0, 8.0, 2.0, 0.0, 3.0, 0.0, 2.0, 0.0, 2.0, 7.0, 2.0])],
 [10, DenseVector([1.0, 1.0, 1.0, 9.0, 0.0, 2.0, 2.0, 0.0, 0.0, 3.0, 3.0])],
 [11, DenseVector([4.0, 1.0, 0.0, 0.0, 4.0, 5.0, 1.0, 3.0, 0.0, 1.0, 0.0])]]

In [40]:
df_vect.show()

+--------+--------------------+--------------------+--------------------+
|  textID| collect_list(lemma)|            filtered|             vectors|
+--------+--------------------+--------------------+--------------------+
|14637197|[new, york, ap, d...|[york, donald, tr...|(314,[4,5,6,11,16...|
|14637202|[in, this, sept, ...|[sept, photo, ric...|(314,[1,3,7,9,12,...|
|14637201|[another, hotel, ...|[another, hotel, ...|(314,[2,6,14,15,1...|
|14637200|[here, be, all, t...|[crazy, stuff, ha...|(314,[0,5,8,10,12...|
+--------+--------------------+--------------------+--------------------+



In [43]:
type(df_vect)

pyspark.sql.dataframe.DataFrame

In [59]:
parseData = df_vect.select('textID','vectors').rdd.map(lambda x: [int(x[0]), Vectors.dense(x[1])] )

In [61]:
ldaModel = LDA.train(parseData, k=2)

In [63]:
# Output topics. Each is a distribution over words (matching word count vectors)
print("Learned topics (as distribu;tions over vocab of " + str(ldaModel.vocabSize())
      + " words):")
topics = ldaModel.topicsMatrix()
for topic in range(2):
    print("Topic " + str(topic) + ":")
    for word in range(0, ldaModel.vocabSize()):
        print(" " + str(topics[word][topic]))

Learned topics (as distribu;tions over vocab of 314 words):
Topic 0:
 0.013168885633538078
 3.945512183388855
 3.954762035637311
 3.94551220003942
 3.954440670762215
 0.44500817832066936
 2.9562360699858172
 2.946208097980677
 0.013047930138406189
 2.946207964921116
 0.013047930031177237
 2.9549845673366706
 0.37592080940814654
 0.013047930359530696
 2.95594764601399
 2.956188563149555
 2.9549868950426
 2.9563357370211216
 0.01304793030924934
 0.013047930223821358
 1.9529424536212034
 1.9475628327705456
 0.6572527231116871
 1.9572210512674195
 0.012811861696808619
 1.9570622055819717
 0.7188830513073574
 1.9570853582910992
 0.01281186178183606
 1.9570007349586394
 1.9560500809438368
 1.9475629230691176
 0.012811861695923897
 1.9560520253549467
 0.012811861775455416
 0.012811861789224298
 1.9475628463457693
 1.9475628787377504
 0.012811862024640863
 1.9475629314359713
 1.9574644609280094
 1.956051450064842
 0.012811861824705633
 0.012811861968508252
 0.01281186176938459
 0.0128118618721

In [60]:
parseData.take(4)

[[14637197,
  DenseVector([0.0, 0.0, 0.0, 0.0, 4.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 2.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0

In [None]:
sparsevector = df_vect.select('vectors', 'text', 'textID').map(parseVectors)

#Train the LDA model
model = LDA.train(sparsevector, k=5, seed=1)

#Print the topics in the model
topics = model.describeTopics(maxTermsPerTopic = 15)
for x, topic in enumerate(topics):
    print ('topic nr: ' + str(x))
    words = topic[0]
    weights = topic[1]
    for n in range(len(words)):
        print (cvmodel.vocabulary[words[n]] + ' ' + str(weights[n]))

In [65]:


#Print the topics in the model
topics = ldaModel.describeTopics(maxTermsPerTopic = 5)
for x, topic in enumerate(topics):
    print ('topic nr: ' + str(x))
    words = topic[0]
    weights = topic[1]
    for n in range(len(words)):
        print (cvmodel.vocabulary[words[n]] + ' ' + str(weights[n]))

topic nr: 0
hollywood 0.018468786515352976
need 0.018467285737501313
river 0.018425589671290012
rachel 0.01842558959353167
project 0.013806123630888198
topic nr: 1
leave 0.020997932679355753
grab 0.015731747014730096
sport 0.01573174701416534
brown 0.015731747013715477
mack 0.01573174701326554
