In [41]:
from pyspark import SparkConf, SparkContext,SQLContext  
from pyspark.sql import SparkSession   
from pyspark.ml.feature import Word2Vec,CountVectorizer, RegexTokenizer  
from pyspark.ml.clustering import LDA, LDAModel  
from pyspark.sql.functions import col, udf  
from pyspark.sql.functions import split, explode, udf, lit, size, col
from pyspark.sql.types import IntegerType,ArrayType,StringType  
import pylab as pl  

In [42]:
def to_word(termIndices):
  words = []  
  for termID in termIndices:
    words.append(vocab_broadcast.value[termID])      
  return words

def preprocess(tweet):
    tweet = re.sub(r'http\S+', '', tweet)
    tweet = re.sub(r'https\S+', '', tweet)
    tweet = re.sub(r'bit.ly/\S+', '', tweet)
    tweet = re.sub(r't.co/\S+', '', tweet) 
    tweet = tweet.strip('[link]') 
    tweet = re.sub('(RT\s@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet) 
    tweet = re.sub('(@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet)
    tweet = re.sub('[^A-Za-z0-9]+', ' ', tweet) 
    tweet = re.sub('([0-9]+)', '', tweet)
    tweet = re.sub(r"[^\x00-\x7F]+", '', tweet)
    return tweet

def remove_empty_words(tweet):
    tweet = [word for word in tweet if len(word)>0]
    return tweet
     

In [43]:
#Load your document dataframe here
#================your code here==================
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("Learning_Spark") \
    .getOrCreate()
DATA_PATH = "gs://mk4427hw2/lda.csv"

spark_df = spark.read.csv(DATA_PATH,inferSchema=True, header=True)
print(spark_df.count(),len(spark_df.columns))



#==================================================
spark_df.show()

26198 1
+-------------------------------+
|                          tweet|
+-------------------------------+
|30分だけ パスはいつもの / お...|
|           RT @OT9TRANS: 211...|
|           MM: Ohh everyone ...|
|           RT @OT9TRANS: 211...|
|           MM: Ohh everyone ...|
|           Two full pages of...|
|           RT @decepcionadam...|
|           RT @OT9TRANS: 211...|
|           MM: Ohh everyone ...|
|           RT @iJaadee: A Ge...|
|           @JHunterPearson H...|
|           RT @Rebby72979221...|
|                         MADRID|
|                        Yes pls|
|           RT @OT9TRANS: 211...|
|           MM: Ohh everyone ...|
|           Mon frère il a un...|
|           Moi j'ai du prend...|
|           RT @princejael01:...|
|                            ---|
+-------------------------------+
only showing top 20 rows



In [44]:
#CountVectorizer
#================your code here==================
preprocess_tweet = udf(preprocess)
remove_empty_words=udf(remove_empty_words)

spark_df = spark_df.withColumn('processed_tweets', preprocess_tweet(spark_df['tweet']))
spark_df = spark_df.withColumn('processed_tweets', split(spark_df['processed_tweets'], ' '))
spark_df = spark_df.withColumn('processed_tweets', remove_empty_words(spark_df['processed_tweets']))
tokenizer = RegexTokenizer().setPattern("[\\W_]+").setMinTokenLength(3).setInputCol("processed_tweets").setOutputCol("words")
t_df = tokenizer.transform(spark_df)
t_df = t_df.drop("tweets", "processed_tweets")
t_df.printSchema()
t_df.show()

cv = CountVectorizer()
cv.setInputCol("words")
cv.setOutputCol("count_vectors")
cv_model = cv.fit(transformed_dataframe)
cv_model.setInputCol("words")
cvResult = cv_model.transform(transformed_dataframe)
cvResult.printSchema()

#==================================================

root
 |-- tweet: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)

+-------------------------------+--------------------+
|                          tweet|               words|
+-------------------------------+--------------------+
|30分だけ パスはいつもの / お...|                  []|
|           RT @OT9TRANS: 211...|                  []|
|           MM: Ohh everyone ...|[ohh, everyone, s...|
|           RT @OT9TRANS: 211...|                  []|
|           MM: Ohh everyone ...|[ohh, everyone, s...|
|           Two full pages of...|[two, full, pages...|
|           RT @decepcionadam...|[trnsar, sei, mas...|
|           RT @OT9TRANS: 211...|                  []|
|           MM: Ohh everyone ...|[ohh, everyone, s...|
|           RT @iJaadee: A Ge...|[generator, can, ...|
|           @JHunterPearson H...|[happy, birthday,...|
|           RT @Rebby72979221...|[governo, spagnol...|
|                         MADRID|            [madrid]|
|   

In [45]:
#train LDA model, cluster the documents into 10 topics 
#================your code here==================

lda = LDA(featuresCol="count_vectors", k=10, maxIter=100)
ldaModel = lda.fit(cvResult)

#==================================================

In [46]:
transformed = ldaModel.transform(cvResult).select("topicDistribution")  
#show the weight of every topic Distribution 
transformed.show(truncate=False)  

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|topicDistribution                                                                                                                                                                                                    |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]                                                                                                                                                                            |
|[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]                                                                                              

In [47]:
#The higher ll is, the lower lp is, the better model is.
ll = ldaModel.logLikelihood(cvResult)  
lp = ldaModel.logPerplexity(cvResult)
print("ll: ", ll)
print("lp: ", lp)

ll:  -980135.9996346916
lp:  8.024561572879858


In [48]:
# Output topics. Each is a distribution over words (matching word count vectors)
print("Learned topics (as distributions over vocab of " + str(ldaModel.vocabSize())+ " words):")
topics = ldaModel.topicsMatrix()
print(topics)

Learned topics (as distributions over vocab of 19704 words):
DenseMatrix([[2.74651855e-01, 1.53059382e-01, 6.94984291e+02, ...,
              2.32997000e-01, 2.61283903e+03, 6.02679314e-01],
             [6.40625818e-01, 2.31417702e-01, 2.97297982e+01, ...,
              5.76406763e-01, 2.68853354e+03, 3.31350583e-01],
             [1.80457762e-01, 5.09565691e-01, 6.65599944e+02, ...,
              1.57678497e-01, 8.31996264e+01, 3.55277925e-01],
             ...,
             [1.50490028e-01, 7.45127174e-01, 1.66876113e-01, ...,
              1.57048472e-01, 1.63981799e-01, 1.51637581e-01],
             [1.45755485e-01, 1.55839531e-01, 1.44684878e-01, ...,
              1.50621294e-01, 1.21741016e+00, 1.51056743e-01],
             [1.52404359e-01, 1.51924307e-01, 5.51864175e-01, ...,
              1.52186482e-01, 1.39570878e-01, 1.48896715e-01]])
