In [2]:
from david.server import CommentsSQL
from david.pipeline import Pipeline

In [48]:
db = CommentsSQL(db_fname="unbox_therapy/comments.db")
db.column_names

['id', 'cid', 'text', 'author', 'video_id']

In [68]:
from david.utils.io import as_txt_file
from textacy.preprocessing import normalize_unicode, normalize_whitespace

query = f"select text from {db.table_name}"

comments = list()
for text in db.conn.execute(query).fetchall():
    text = " ".join(line.strip() for line in text)
    text = normalize_whitespace(text)
    text = normalize_unicode(text)
    comments.append(text)

as_txt_file(comments, "comments.txt")
print(f"{len(comments)} -> samples available")

792580 -> samples available


In [69]:
from pyspark.sql.functions import col
from pyspark.ml.clustering import LDA, LDAModel
from pyspark.ml.feature import CountVectorizer, HashingTF, IDF, Tokenizer
# Spark NLP
import sparknlp
from sparknlp.common import RegexRule
from sparknlp.pretrained import PretrainedPipeline
from sparknlp.annotator import *
from sparknlp.base import *

In [70]:
spark = sparknlp.start()
print("spark NLP version")
sparknlp.version()
print("apache spark version")
spark.version

spark NLP version
2.3.2
apache spark version


'2.4.4'

In [71]:
comments = spark.read.text("comments.txt")
comments = comments.selectExpr("value as text")
comments.printSchema()
comments.show()

root
 |-- text: string (nullable = true)

+---------------------------------+
|                             text|
+---------------------------------+
|             AirPods vid in Sa...|
|             hey, if you have ...|
|                   Me at 2160p 4K|
|             "wtf am i doing h...|
|             5:00 lmao “so you...|
|             Somebody likes ap...|
|             Plz review the Op...|
|             Lmao Will is the ...|
|             ear buds because ...|
|             The Pacific Ocean...|
|             Do review of MI n...|
|             Where did you get...|
|             Lew how did Kanye...|
|             Power beats 3 vs ...|
|관심이 있지만 아직 한국에는 출...|
|             I was expecting "...|
|             NURALOOPS REVIEW ...|
|             Hi when I opened ...|
|             Do a review on th...|
|             its a HUGE differ...|
+---------------------------------+
only showing top 20 rows



In [72]:
comments.count()

904530

In [73]:
# lets create a spark nlp pipeline
document_assembler = DocumentAssembler().setInputCol("text")
sentence_detector = SentenceDetector().setInputCols(["document"]).setOutputCol("sentence")
tokenizer = Tokenizer().setInputCols(["sentence"]).setOutputCol("token")
pos_tagger = PerceptronModel.pretrained().setInputCols(["sentence", "token"])
chunker = Chunker() \
    .setInputCols(["sentence", "pos"]) \
    .setOutputCol("chunk") \
    .setRegexParsers(["<NNP>+", "<DT>?<JJ>*<NN>"])
finisher = Finisher().setInputCols(["chunk"]).setIncludeMetadata(False)
# assemble the pipeline
nlp_pipeline = Pipeline(
    stages=[
        document_assembler,
        sentence_detector,
        tokenizer, pos_tagger,
        chunker, finisher
    ]
)

pos_anc download started this may take some time.
Approximate size to download 4.3 MB
[OK!]


In [74]:
nlp_pipeline_df = nlp_pipeline.fit(comments).transform(comments)
nlp_pipeline_df.select("finished_chunk").show()

+------------------------------+
|                finished_chunk|
+------------------------------+
|          [AirPods, Samsung...|
|          [DM, time, twitte...|
|                            []|
|                      [i, wtf]|
|          [lmao, “so, high ...|
|          [Somebody, apple,...|
|          [Plz, Optus, revi...|
|                   [Lmao Will]|
|                      [ear, u]|
|          [Pacific Ocean, l...|
|            [MI, review, note]|
|                     [AirPods]|
|          [Lew, Kanye West'...|
|                       [Power]|
|[관심이, 있지만, 아직, 한국...|
|          [AirPods Pros, Gr...|
|          [NURALOOPS REVIEW...|
|          [Hi, YouTube, cha...|
|           [a review, the a50]|
|          [HUGE, difference...|
+------------------------------+
only showing top 20 rows



In [75]:
# lets create spark ML pipeline
cv = CountVectorizer(inputCol="finished_chunk",
                     outputCol="features",
                     vocabSize=1000, minDF=10.0, minTF=10.0)
idf = IDF(inputCol="features", outputCol="idf")
lda = LDA(k=10, maxIter=5)
ml_pipeline = Pipeline(stages=[cv, idf, lda])

In [76]:
ml_model = ml_pipeline.fit(nlp_pipeline_df)
ml_pipeline_df = ml_model.transform(nlp_pipeline_df)
ml_pipeline_df.show()

+---------------------------------+------------------------------+------------+------------+--------------------+
|                             text|                finished_chunk|    features|         idf|   topicDistribution|
+---------------------------------+------------------------------+------------+------------+--------------------+
|             AirPods vid in Sa...|          [AirPods, Samsung...|(1000,[],[])|(1000,[],[])|[0.0,0.0,0.0,0.0,...|
|             hey, if you have ...|          [DM, time, twitte...|(1000,[],[])|(1000,[],[])|[0.0,0.0,0.0,0.0,...|
|                   Me at 2160p 4K|                            []|(1000,[],[])|(1000,[],[])|[0.0,0.0,0.0,0.0,...|
|             "wtf am i doing h...|                      [i, wtf]|(1000,[],[])|(1000,[],[])|[0.0,0.0,0.0,0.0,...|
|             5:00 lmao “so you...|          [lmao, “so, high ...|(1000,[],[])|(1000,[],[])|[0.0,0.0,0.0,0.0,...|
|             Somebody likes ap...|          [Somebody, apple,...|(1000,[],[])|(1000,[],

In [77]:
lda_model = ml_model.stages[2]
likelihood = lda_model.logLikelihood(ml_pipeline_df)
perplexity = lda_model.logPerplexity(ml_pipeline_df)
print(f"lower bound on the log <likelihood>  of the entire corpus: {likelihood}")
print(f"upper bound on <perplexity> of the corpus: {perplexity}")

lower bound on the log <likelihood>  of the entire corpus: -52806.521209049766
upper bound on <perplexity> of the corpus: 5.785747913777777


In [78]:
# descrive the topics.
lda_model.describeTopics(3).show(truncate=False)

+-----+---------------+---------------------------------------------------------------------+
|topic|termIndices    |termWeights                                                          |
+-----+---------------+---------------------------------------------------------------------+
|0    |[1, 654, 829]  |[0.014320388005265283, 0.0013644229570866453, 0.0013428754860297949] |
|1    |[2, 423, 603]  |[0.007843652525955211, 0.001334848569854842, 0.0013192949764731763]  |
|2    |[868, 319, 432]|[0.0013929601910555658, 0.0013429803234103553, 0.0013198393743944162]|
|3    |[187, 357, 994]|[0.06016702609764171, 0.026054836175562562, 0.001286357010091536]    |
|4    |[12, 4, 734]   |[0.027796407928555766, 0.0012979390975671969, 0.0012677971549013907] |
|5    |[784, 695, 776]|[0.001440830312657017, 0.0013013037393124376, 0.001298163883732273]  |
|6    |[30, 60, 45]   |[0.26964003093811106, 0.185685335216649, 0.0843751213092158]         |
|7    |[0, 21, 8]     |[0.03716086794518217, 0.0206353363780

In [79]:
# lets look at the topics.
print(f"learned topics (as distributions over vocab of) {lda_model.vocabSize()}")
topics = lda_model.describeTopics(50)
topics_rdd = topics.rdd
vocab = ml_model.stages[0].vocabulary
topic_words = topics_rdd.map(
    lambda row: row["termIndices"]).map(
    lambda idx_list: [vocab[idx] for idx in idx_list]).collect()

for i, topic in enumerate(topic_words):
    for word in topic: print(f"{i} : {word}")

learned topics (as distributions over vocab of) 1000
0 : phone
0 : yo
0 : @The
0 : a gaming
0 : the sound
0 : This phone
0 : Bluetooth
0 : cause
0 : $200
0 : year
0 : sensor
0 : APPLE
0 : Dave
0 : IOS
0 : The iPhone
0 : waterproof
0 : car
0 : the comment
0 : control
0 : scanner
0 : isn’t
0 : game
0 : xr
0 : rate
0 : innovation
0 : rog
0 : WIGGLE WIGGLE
0 : Mi
0 : bullshit
0 : screen
0 : Waiting
0 : chip
0 : Lew
0 : book
0 : hell
0 : the thumbnail
0 : the ear
0 : wireless charging
0 : it’s
0 : all day
0 : brother
0 : the middle
0 : Definitely
0 : bruh
0 : ipad
0 : Nokia
0 : This thing
0 : inch
0 : sense
0 : gen
1 : Apple
1 : feature
1 : USA
1 : difference
1 : Microsoft
1 : lol
1 : Am
1 : port
1 : cause
1 : review
1 : eee
1 : top
1 : :(
1 : Apple Watch
1 : That phone
1 : Dont
1 : pop
1 : this thing
1 : >
1 : paper
1 : Minecraft
1 : the speaker
1 : they're
1 : this product
1 : didn't
1 : view
1 : no headphone
1 : wallet
1 : mind
1 : What’s
1 : the Note
1 : Ur
1 : the screen
1 : the p30
1 

In [83]:
from collections import defaultdict

root_topics = defaultdict(list)
for i, topic in enumerate(topic_words):
    root_topics[f"topic-{i}"] = [word for word in topic]

In [84]:
root_topics.keys()

dict_keys(['topic-0', 'topic-1', 'topic-2', 'topic-3', 'topic-4', 'topic-5', 'topic-6', 'topic-7', 'topic-8', 'topic-9'])

In [86]:
sorted(root_topics["topic-0"])

['$200',
 '@The',
 'APPLE',
 'Bluetooth',
 'Dave',
 'Definitely',
 'IOS',
 'Lew',
 'Mi',
 'Nokia',
 'The iPhone',
 'This phone',
 'This thing',
 'WIGGLE WIGGLE',
 'Waiting',
 'a gaming',
 'all day',
 'book',
 'brother',
 'bruh',
 'bullshit',
 'car',
 'cause',
 'chip',
 'control',
 'game',
 'gen',
 'hell',
 'inch',
 'innovation',
 'ipad',
 'isn’t',
 'it’s',
 'phone',
 'rate',
 'rog',
 'scanner',
 'screen',
 'sense',
 'sensor',
 'the comment',
 'the ear',
 'the middle',
 'the sound',
 'the thumbnail',
 'waterproof',
 'wireless charging',
 'xr',
 'year',
 'yo']

In [87]:
# here we can see the model assinged topic-1 with product keywords.
sorted(root_topics["topic-1"])

[').',
 ':(',
 '>',
 'Am',
 'Apple',
 'Apple Watch',
 'China',
 'Dont',
 'Honestly',
 'Microsoft',
 'Minecraft',
 'That phone',
 'The iPhone',
 'USA',
 'Ur',
 'WIGGLE WIGGLE',
 'What’s',
 'a PC',
 'a thing',
 'boy',
 'case',
 'cause',
 "didn't",
 'difference',
 'edge',
 'eee',
 'feature',
 'hand',
 'lol',
 'mind',
 'no headphone',
 'paper',
 'pocket',
 'pop',
 'port',
 'ppl',
 'review',
 'software',
 'the Note',
 'the p30',
 'the screen',
 'the space',
 'the speaker',
 'the year',
 "they're",
 'this product',
 'this thing',
 'top',
 'view',
 'wallet']

In [96]:
sorted(root_topics["topic-3"])

['$150',
 '+',
 ':(',
 '>',
 'Galaxy',
 'Honestly',
 'Jesus',
 'Lou',
 'Ok',
 'Xiaomi',
 'a bunch',
 'a fan',
 'a lot',
 'a week',
 'buddy',
 'cam',
 'chip',
 'cost',
 'crap',
 'desktop',
 'didnt',
 'family',
 'gold',
 'haha',
 'hahaha',
 'hole',
 'jack',
 'lew',
 'lite',
 'n',
 'nah',
 'night',
 'pixel',
 'problem',
 'rip',
 'setup',
 'speed',
 'studio',
 'switch',
 'the company',
 'the keyboard',
 'the notch',
 'the same price',
 'the same thing',
 'the water',
 'the world',
 'this channel',
 'volume',
 'word',
 '🔥']

In [101]:
# saving the model.
lda_model.save("lda_model_save/lda_comments")

In [102]:
# reloading the trained model:
lda_model = lda_model.load("lda_model_save/lda_comments")

In [104]:
print(f"learned topics (as distributions over vocab of) {lda_model.vocabSize()}")
topics = lda_model.describeTopics(50)
topics_rdd = topics.rdd
vocab = ml_model.stages[0].vocabulary
topic_words = topics_rdd.map(
    lambda row: row["termIndices"]).map(
    lambda idx_list: [vocab[idx] for idx in idx_list]).collect()

for i, topic in enumerate(topic_words):
    for word in topic: print(f"{i} : {word}")

learned topics (as distributions over vocab of) 1000
0 : phone
0 : yo
0 : @The
0 : a gaming
0 : the sound
0 : This phone
0 : Bluetooth
0 : cause
0 : $200
0 : year
0 : sensor
0 : APPLE
0 : Dave
0 : IOS
0 : The iPhone
0 : waterproof
0 : car
0 : the comment
0 : control
0 : scanner
0 : isn’t
0 : game
0 : xr
0 : rate
0 : innovation
0 : rog
0 : WIGGLE WIGGLE
0 : Mi
0 : bullshit
0 : screen
0 : Waiting
0 : chip
0 : Lew
0 : book
0 : hell
0 : the thumbnail
0 : the ear
0 : wireless charging
0 : it’s
0 : all day
0 : brother
0 : the middle
0 : Definitely
0 : bruh
0 : ipad
0 : Nokia
0 : This thing
0 : inch
0 : sense
0 : gen
1 : Apple
1 : feature
1 : USA
1 : difference
1 : Microsoft
1 : lol
1 : Am
1 : port
1 : cause
1 : review
1 : eee
1 : top
1 : :(
1 : Apple Watch
1 : That phone
1 : Dont
1 : pop
1 : this thing
1 : >
1 : paper
1 : Minecraft
1 : the speaker
1 : they're
1 : this product
1 : didn't
1 : view
1 : no headphone
1 : wallet
1 : mind
1 : What’s
1 : the Note
1 : Ur
1 : the screen
1 : the p30
1 

In [105]:
ml_model.save("ml_model_save/ml_model_comments")