Notebook written by [Zhedong Zheng](https://github.com/zhedongzheng)

<img src="img/lda.png" width="600">

In [1]:
"""
brew install apache-spark
pip3 install findspark
"""
from nltk.corpus import stopwords

import findspark
findspark.init()
from pyspark import SparkContext
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StringType, ArrayType
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.ml.clustering import LDA

In [2]:
# Number of topics the LDA model is fitted with (passed as `k` to LDA below).
N_TOPICS = 10
# Number of terms requested per topic from `describeTopics` when printing.
MAX_TERMS = 5

In [3]:
# Import the NLTK corpus reader under an alias: the name `stopwords` is
# rebound to a plain set just below, so without the alias a second run of
# this cell would call `.words` on that set and fail.
from nltk.corpus import stopwords as nltk_stopwords

# NLTK's English stop words plus extra words to drop from the titles.
stopwords = set(nltk_stopwords.words('english')).union({
    'introduction', 'edition', 'series', 'application',
    'approach', 'card', 'access', 'package', 'plus', 'etext',
    'brief', 'vol', 'fundamental', 'guide', 'essential', 'printed',
    'third', 'second', 'fourth'})


def tokenize(line, stop_words):
    """Return the words of `line` that survive the cleaning filters.

    The line is stripped, lower-cased and split on whitespace. A word is
    kept only if it is purely alphabetic, longer than three characters,
    and not contained in `stop_words`.
    """
    return [w for w in line.strip().lower().split()
            if w.isalpha() and len(w) > 3 and w not in stop_words]


# `getOrCreate` reuses an already running session/context, so re-running the
# cell does not fail the way constructing a second SparkContext would.
sess = SparkSession.builder.master('local').appName('nlp').getOrCreate()
sc = sess.sparkContext

# One (words, idx) pair per line of the input file; a single map stage
# replaces the separate strip/split/filter passes.
lines = sc.textFile('data/all_book_titles.txt') \
    .map(lambda line: tokenize(line, stopwords)) \
    .zipWithIndex()

df = sess.createDataFrame(lines, ['words', 'idx'])

# Term counts -> TF-IDF features. `cv` ends up as the fitted model, whose
# vocabulary maps term indices back to words.
cv = CountVectorizer(inputCol='words', outputCol='tf').fit(df)
df = cv.transform(df)
df = IDF(inputCol='tf', outputCol='tfidf').fit(df).transform(df)

lda = LDA(k=N_TOPICS, featuresCol='tfidf').fit(df)

# Read the vocabulary once instead of once per printed term.
vocab = cv.vocabulary
for i, row in enumerate(lda.describeTopics(MAX_TERMS).collect(), start=1):
    print('Topic %d:' % i, ' '.join([vocab[idx] for idx in row.termIndices]))

Topic 1: marketing engineering culture reader theater
Topic 2: mechanics quantum hinduism buddhism nutrition
Topic 3: physics modern pharmacology engineers basic
Topic 4: computer theory calculus life game
Topic 5: analysis economics management real principles
Topic 6: language applications natural concepts probability
Topic 7: human data anatomy essentials java
Topic 8: science differential equations asian student
Topic 9: chemistry psychology history volume world
Topic 10: biology molecular american health critical
