# Latent Dirichlet Allocation

TF-IDF : https://www.seoquantum.com/billet/optimisez-vos-contenus-mots-rares

LDA :
https://fr.wikipedia.org/wiki/Allocation_de_Dirichlet_latente



http://eric.univ-lyon2.fr/~ricco/cours/slides/TM.D%20-%20reduction%20de%20dimension.pdf (à partir de la page 22)

Dataset : https://www.kaggle.com/nicapotato/womens-ecommerce-clothing-reviews

This is an updated version to Spark 3 of the 1.6 version of https://medium.com/@connectwithghosh/topic-modelling-with-latent-dirichlet-allocation-lda-in-pyspark-2cb3ebd5678e


In [None]:
#!pip install nltk

In [None]:
# import nltk
# nltk.download('stopwords')

In [1]:
#!mv ../data/lda/'Womens Clothing E-Commerce Reviews.csv' ../data/lda'WomensClothingE-CommerceReviews.csv'
# !ls ../data/
# !mv ../data/ldaWomensClothingE-CommerceReviews.csv ../data/lda/
# !hdfs dfs -mkdir -p /demo/lda/
# !hdfs dfs -copyFromLocal -f ../data/lda/* /demo/lda/
# !hdfs dfs -ls /demo/lda/

In [2]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession \
    .builder \
    .appName('LDA') \
    .getOrCreate()
sc = spark.sparkContext

In [3]:
# importing some librariesimport pandas as pd
import pyspark
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
# stuff we'll need for text processing
from nltk.corpus import stopwords
import re as re
from pyspark.ml.feature import CountVectorizer , IDF # stuff we'll need for building the model
from pyspark.ml.clustering import LDA

In [4]:
data = sqlContext.read.format("csv") \
   .options(header='true', inferschema='true') \
   .load("/demo/lda/ldaWomensClothingE-CommerceReviews.csv")

In [6]:
data.show(2)

+---+-----------+---+-----+--------------------+------+---------------+-----------------------+-------------+---------------+----------+
|_c0|Clothing ID|Age|Title|         Review Text|Rating|Recommended IND|Positive Feedback Count|Division Name|Department Name|Class Name|
+---+-----------+---+-----+--------------------+------+---------------+-----------------------+-------------+---------------+----------+
|  0|        767| 33| null|Absolutely wonder...|     4|              1|                      0|    Initmates|       Intimate| Intimates|
|  1|       1080| 34| null|"Love this dress!...|     5|              1|                      4|      General|        Dresses|   Dresses|
+---+-----------+---+-----+--------------------+------+---------------+-----------------------+-------------+---------------+----------+
only showing top 2 rows



In [7]:
reviews = data.rdd.map(lambda x : x['Review Text']).filter(lambda x: x is not None)

In [8]:
reviews.take(2)

['Absolutely wonderful - silky and sexy and comfortable',
 '"Love this dress!  it\'s sooo pretty.  i happened to find it in a store, and i\'m glad i did bc i never would have ordered it online bc it\'s petite.  i bought a petite and am 5\'8"".  i love the length on me- hits just a little below the knee.  would definitely be a true midi on someone who is truly petite."']

In [10]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [14]:
StopWords = stopwords.words("english")

In [15]:
StopWords[5:10]

['our', 'ours', 'ourselves', 'you', "you're"]

In [16]:
tokens = reviews                                                \
    .map( lambda document: document.strip().lower())               \
    .map( lambda document: re.split(" ", document))          \
    .map( lambda word: [x for x in word if x.isalpha()])           \
    .map( lambda word: [x for x in word if len(x) > 3] )           \
    .map( lambda word: [x for x in word if x not in StopWords])    \
    .zipWithIndex()

In [17]:
tokens.take(1)

[(['absolutely', 'wonderful', 'silky', 'sexy', 'comfortable'], 0)]

In [18]:
df_txts = sqlContext.createDataFrame(tokens, ["list_of_words",'index'])# TF

In [19]:
df_txts.show(2)

+--------------------+-----+
|       list_of_words|index|
+--------------------+-----+
|[absolutely, wond...|    0|
|[sooo, happened, ...|    1|
+--------------------+-----+
only showing top 2 rows



In [20]:
cv = CountVectorizer(inputCol="list_of_words", outputCol="raw_features", vocabSize=5000, minDF=10.0)
cvmodel = cv.fit(df_txts)
result_cv = cvmodel.transform(df_txts)# IDF

In [21]:
result_cv.show(3)
cvmodel.vocabulary[40]
cvmodel.vocabulary[100]
cvmodel.vocabulary[374]

+--------------------+-----+--------------------+
|       list_of_words|index|        raw_features|
+--------------------+-----+--------------------+
|[absolutely, wond...|    0|(2421,[40,100,374...|
|[sooo, happened, ...|    1|(2421,[0,6,8,10,1...|
|[high, hopes, dre...|    2|(2421,[1,7,8,14,2...|
+--------------------+-----+--------------------+
only showing top 3 rows



'wonderful'

In [22]:
result_cv[['raw_features']]

DataFrame[raw_features: vector]

In [23]:

idf = IDF(inputCol="raw_features", outputCol="features")
idfModel = idf.fit(result_cv)
result_tfidf = idfModel.transform(result_cv) 

In [24]:
result_tfidf.show(3)

+--------------------+-----+--------------------+--------------------+
|       list_of_words|index|        raw_features|            features|
+--------------------+-----+--------------------+--------------------+
|[absolutely, wond...|    0|(2421,[40,100,374...|(2421,[40,100,374...|
|[sooo, happened, ...|    1|(2421,[0,6,8,10,1...|(2421,[0,6,8,10,1...|
|[high, hopes, dre...|    2|(2421,[1,7,8,14,2...|(2421,[1,7,8,14,2...|
+--------------------+-----+--------------------+--------------------+
only showing top 3 rows



In [25]:
result_tfidf.take(1)

[Row(list_of_words=['absolutely', 'wonderful', 'silky', 'sexy', 'comfortable'], index=0, raw_features=SparseVector(2421, {40: 1.0, 100: 1.0, 374: 1.0, 554: 1.0, 641: 1.0}), features=SparseVector(2421, {40: 2.8147, 100: 3.4858, 374: 4.9002, 554: 5.353, 641: 5.5826}))]

In [26]:
# dataset = result_tfidf[['index','features']].rdd.map(list)
# dataset.take(2)

In [27]:
dataset = result_tfidf[['index','features']]
dataset.show(2)

+-----+--------------------+
|index|            features|
+-----+--------------------+
|    0|(2421,[40,100,374...|
|    1|(2421,[0,6,8,10,1...|
+-----+--------------------+
only showing top 2 rows



In [28]:
num_topics = 10
max_iterations = 100
lda = LDA(k=num_topics, maxIter=max_iterations)
lda_model = lda.fit(dataset)

In [29]:
wordNumbers = 5
topics = lda_model.describeTopics(maxTermsPerTopic = wordNumbers)
topics.show()

+-----+--------------------+--------------------+
|topic|         termIndices|         termWeights|
+-----+--------------------+--------------------+
|    0|  [3, 1, 155, 14, 6]|[0.01537205140417...|
|    1| [57, 5, 0, 196, 71]|[0.01578505275315...|
|    2|  [1, 2, 132, 9, 11]|[0.01114037985121...|
|    3|    [3, 1, 0, 5, 41]|[0.01215684394674...|
|    4|[55, 266, 4, 280, 5]|[0.01006032501593...|
|    5|    [8, 6, 2, 3, 73]|[0.01022646024831...|
|    6| [37, 221, 1, 0, 34]|[0.01273654923773...|
|    7|[187, 282, 202, 9...|[0.01388437475495...|
|    8|[75, 26, 341, 294...|[0.03827748471183...|
|    9|[228, 243, 13, 24...|[0.01004399154315...|
+-----+--------------------+--------------------+



In [30]:
wordNumbers = 5

def topic_render(topic, words):
    out=''
    for i in words:
        out += cvmodel.vocabulary[i] + ' '
    print(topic, out)

for row in topics.collect():
    topic_render(row[0], row[1])

0 size dress chest small would 
1 jeans great love leggings many 
2 dress like arms fabric look 
3 size dress love great runs 
4 black cami wear spring great 
5 ordered would like size thought 
6 long highly dress love length 
7 favorite purchase denim pair never 
8 general petite suit today sweater 
9 washed wash bought lightweight wear 
