<a href="https://colab.research.google.com/github/ankesh86/PySparkNotebooks/blob/main/Unsupervised_LDA_(1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Latent Dirichlet Allocation (LDA)**

In [None]:
# Install PySpark, pinned to release 3.4.0, into the notebook environment
!pip install pyspark==3.4.0



In [None]:
# Create (or reuse) the Spark session that the later cells refer to as `spark`
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("LDA")
    .getOrCreate()
)

In [None]:
# Report the interpreter and Spark versions used for this run
import sys
print("Python version: " + sys.version)
# Label fixed: the value printed is the Spark version string, not a session object
print("Spark version: " + spark.version)

Python version: 3.9.5 (default, Nov 23 2021, 15:27:38) 
[GCC 9.3.0]
Sparrk Session: 3.3.2


## Reading the data

In [None]:
# CSV source and reader settings (option values are handed to Spark as strings)
file_location = "/FileStore/tables/lda_data-1.csv"
file_type = "csv"
infer_schema = "false"         # no type inference
first_row_is_header = "true"   # first line supplies the column names

# Load the file into a Spark DataFrame
df = (
    spark.read
    .format(file_type)
    .options(inferSchema=infer_schema, header=first_row_is_header)
    .load(file_location)
)

In [None]:
# Show the column names and types of the loaded DataFrame
df.printSchema()

root
 |-- Pageurl: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- Review Text: string (nullable = true)
 |-- Review Color: string (nullable = true)
 |-- User Verified: string (nullable = true)
 |-- Review Date: string (nullable = true)
 |-- Review Useful Count: string (nullable = true)
 |-- Configuration Text: string (nullable = true)
 |-- Rating: string (nullable = true)
 |-- Declaration Text: string (nullable = true)



In [None]:
# Number of rows in the loaded DataFrame
df.count()

Out[8]: 6855

In [None]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[pip download progress bar omitted]

In [None]:
# Import libraries: Spark SQL types, ML feature transformers, MLlib LDA,
# plus `re` and NLTK for text cleaning.
# NOTE(review): the wildcard import below brings every public name from
# pyspark.sql.types into the namespace; no cell visible here appears to use
# one of them — confirm before narrowing it.
from pyspark.sql.types import *
from pyspark.mllib.linalg import Vector, Vectors
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.mllib.clustering import LDA, LDAModel
# Alias used later to convert ML vectors into MLlib vectors for LDA.train
from pyspark.mllib.linalg import Vectors as MLlibVectors

import re
import nltk
# Download NLTK's stop-word corpus, which the tokenization step reads
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Out[10]: True

In [None]:
from nltk.corpus import stopwords

# Pull the review text out of every row, skipping rows that have no review.
reviews = df.rdd.map(lambda x: x['Review Text']).filter(lambda x: x is not None)

StopWords = stopwords.words("english")
# Set copy for constant-time membership tests; StopWords itself stays a list.
stop_word_set = frozenset(StopWords)


def tokenize_review(document):
    """Turn one review string into a list of cleaned tokens.

    The text is stripped and lower-cased, split on whitespace and the
    characters ``;``, ``,`` and ``#``, and a piece is kept only if it is
    purely alphabetic, longer than three characters, and not an English
    stop word.

    Args:
        document: Raw review text (a non-None string).

    Returns:
        The surviving tokens, in their original order.
    """
    # Raw string: in a plain literal "\s" is an invalid escape sequence that
    # newer Python versions warn about.
    pieces = re.split(r"[\s;,#]", document.strip().lower())
    # NOTE(review): isalpha() also rejects pieces with attached punctuation
    # (e.g. "great."), because only whitespace ; , # act as separators.
    return [
        piece
        for piece in pieces
        if piece.isalpha() and len(piece) > 3 and piece not in stop_word_set
    ]


# Pair every token list with a sequential document index: (tokens, index).
tokens = reviews.map(tokenize_review).zipWithIndex()

In [None]:
# Build a DataFrame of (token list, document index) rows from the RDD
df_txts = spark.createDataFrame(tokens, schema=['list_of_words', 'index'])

# Term frequency: fit a vocabulary on the token lists and turn each document
# into a vector of term counts
cv = CountVectorizer(
    inputCol="list_of_words",
    outputCol="raw-features",
    vocabSize=5000,
    minDF=10,
)
cvmodel = cv.fit(df_txts)
result_cv = cvmodel.transform(df_txts)

# Inverse document frequency: rescale the raw counts into the "features" column
idf = IDF(
    inputCol="raw-features",
    outputCol="features",
)
idfModel = idf.fit(result_cv)
result_tfidf = idfModel.transform(result_cv)

In [None]:
# LDA hyper-parameters
num_topics = 10
max_iterations = 100
# Fixed seed so repeated runs start from the same random initialization;
# without it the topics differ from run to run.
lda_seed = 42

# LDA.train takes an RDD of [document id, MLlib vector] pairs, so the ML
# vectors in the "features" column are converted with MLlibVectors.fromML and
# every (index, vector) row is turned into a list.
lda_corpus = (
    result_tfidf.select("index", "features")
    .rdd
    .mapValues(MLlibVectors.fromML)
    .map(list)
)

lda_model = LDA.train(
    lda_corpus,
    k=num_topics,
    maxIterations=max_iterations,
    seed=lda_seed,
)


In [None]:
# How many top terms to list for each topic
wordNumbers = 5

# Vocabulary learned by the CountVectorizer model; term indices point into it
vocabArray = cvmodel.vocabulary

# Topic descriptions from the trained model, limited to wordNumbers terms each
data_topics = lda_model.describeTopics(maxTermsPerTopic=wordNumbers)

# Hand the topic descriptions to Spark as an RDD
topicIndices = spark.sparkContext.parallelize(data_topics)
def topic_render(topic, vocabulary=None, max_terms=None):
    """Translate one topic's leading term indices into vocabulary words.

    Args:
        topic: A sequence whose first element is the list of term indices
            for the topic.
        vocabulary: Sequence mapping a term index to its word. Defaults to
            the notebook-level ``vocabArray``.
        max_terms: Maximum number of words to return. Defaults to the
            notebook-level ``wordNumbers``.

    Returns:
        A list of at most ``max_terms`` words, in the order the indices
        appear in the topic.
    """
    if vocabulary is None:
        vocabulary = vocabArray
    if max_terms is None:
        max_terms = wordNumbers
    terms = topic[0]
    # Slice instead of indexing range(max_terms): a topic with fewer terms
    # than requested no longer raises IndexError.
    return [vocabulary[term_index] for term_index in terms[:max_terms]]



# Convert every topic's term indices into words, then print them topic by topic
topics_final = topicIndices.map(topic_render).collect()
for topic, topic_terms in enumerate(topics_final):
    print("Topic" + str(topic) + ":")
    for term in topic_terms:
        print(term)
    print('\n')

Topic0:
home
smart
using
house
lights


Topic1:
time
alexa
every
find
google


Topic2:
would
even
work
many
everything


Topic3:
speaker
sound
echo
better
voice


Topic4:
device
amazon
another
back
never


Topic5:
things
weather
questions
news
know


Topic6:
much
still
easy
phone
well


Topic7:
love
great
little
product
thing


Topic8:
works
bought
could
nice
gift


Topic9:
music
play
want
able
listen


