In [1]:
import sys, re
from pyspark import SparkContext, SparkConf
from pyspark.ml.feature import Tokenizer, CountVectorizer, StopWordsRemover
from pyspark.sql import *
from pyspark.sql.functions import lower, regexp_replace
from pyspark.ml.clustering import LDA
from IPython.core.display import display, HTML
from pymongo import MongoClient
import pandas as pd

# Adapted from:
# https://github.com/apache/spark/blob/master/examples/src/main/python/ml/lda_example.py

conf = SparkConf().setAppName('Steam Clustering').setMaster('spark://sparkmaster:7077')
SparkContext.setSystemProperty('spark.executor.memory', '2g') # bump the executor memory to prevent OOM issues
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

In [2]:
sc._conf.getAll()

[('spark.executor.memory', '2g'),
 ('spark.driver.host', 'jupyterlab'),
 ('spark.app.name', 'Steam Clustering'),
 ('spark.rdd.compress', 'True'),
 ('spark.app.id', 'app-20210629203431-0005'),
 ('spark.app.startTime', '1625020470548'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.driver.port', '40251'),
 ('spark.submit.pyFiles', ''),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.master', 'spark://sparkmaster:7077'),
 ('spark.ui.showConsoleProgress', 'true')]

In [3]:
client = MongoClient(host="192.168.1.171", port=27017)
db = client['steam']
collection = db['apps']

# Load and parse the data
ret = collection.find({"updated_date": {"$exists": True}, "type": {"$in": ["game", "dlc"]}},
                        {"detailed_description":1}).limit(1000)

In [4]:
# save as pandas dataframe
descriptions = pd.DataFrame(ret)

In [5]:
descriptions

Unnamed: 0,_id,detailed_description
0,5b8d041128931725e4144025,Become immersed in the fantastic world of Aris...
1,5b8d1ca328931725e41445e0,<p>Inviting you to explore a vast deserted isl...
2,5b8d40ef28931725e4144eda,<p>Enhance your Crusader Kings II Experience w...
3,5b8d515328931725e4145323,"Colonize solar systems, annihilate homeworlds,..."
4,5b8d546a28931725e41453fe,Mark of the Ninja: Special Edition expands the...
...,...,...
995,5d3a2f3c1534017272d00466,This free DLC include a collection of 5 erotic...
996,5d3a2f3c1534017272d00467,This free DLC include a collection of 5 fetish...
997,5d3a2f3c1534017272d00468,This free DLC include a collection of 5 fetish...
998,5d3a2f3c1534017272d00469,This free DLC include a collection of 5 wallpa...


In [6]:
descriptions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   _id                   1000 non-null   object
 1   detailed_description  1000 non-null   object
dtypes: object(2)
memory usage: 15.8+ KB


In [7]:
# convert to Spark dataframe
descriptions = sqlContext.createDataFrame(descriptions.astype(str))

In [8]:
descriptions

DataFrame[_id: string, detailed_description: string]

In [9]:
descriptions.show(n=10)

+--------------------+--------------------+
|                 _id|detailed_description|
+--------------------+--------------------+
|5b8d041128931725e...|Become immersed i...|
|5b8d1ca328931725e...|<p>Inviting you t...|
|5b8d40ef28931725e...|<p>Enhance your C...|
|5b8d515328931725e...|Colonize solar sy...|
|5b8d546a28931725e...|Mark of the Ninja...|
|5b8d9352852f693bc...|Adds a game field...|
|5b8d962b852f693bc...|<img src="https:/...|
|5b8d9635852f693bc...|Adds a game field...|
|5b8d9634852f693bc...|<h1>Developer's N...|
|5b8d9636852f693bc...|Your city was inf...|
+--------------------+--------------------+
only showing top 10 rows



In [10]:
# remove HTML tags and lowercase it
descriptions = descriptions.withColumn("detailed_description", lower((regexp_replace('detailed_description', '<[^<]+?>', ''))))
# remove special characters
descriptions = descriptions.withColumn("detailed_description", lower((regexp_replace('detailed_description', '[^0-9a-zA-Z ]+', ''))))
# remove specific words
descriptions = descriptions.withColumn("detailed_description", lower((regexp_replace('detailed_description', 'dlc|game|games', ''))))

In [11]:
descriptions.show(n=10)

+--------------------+--------------------+
|                 _id|detailed_description|
+--------------------+--------------------+
|5b8d041128931725e...|become immersed i...|
|5b8d1ca328931725e...|inviting you to e...|
|5b8d40ef28931725e...|enhance your crus...|
|5b8d515328931725e...|colonize solar sy...|
|5b8d546a28931725e...|mark of the ninja...|
|5b8d9352852f693bc...|adds a  field of ...|
|5b8d962b852f693bc...|be ready to face ...|
|5b8d9635852f693bc...|adds a  field of ...|
|5b8d9634852f693bc...|developers note o...|
|5b8d9636852f693bc...|your city was inf...|
+--------------------+--------------------+
only showing top 10 rows



In [12]:
# http://spark.apache.org/docs/latest/ml-features#tokenizer

tokenizer = Tokenizer(inputCol="detailed_description", outputCol="words")
tokenized = tokenizer.transform(descriptions)

In [13]:
tokenized.show(n=10)

+--------------------+--------------------+--------------------+
|                 _id|detailed_description|               words|
+--------------------+--------------------+--------------------+
|5b8d041128931725e...|become immersed i...|[become, immersed...|
|5b8d1ca328931725e...|inviting you to e...|[inviting, you, t...|
|5b8d40ef28931725e...|enhance your crus...|[enhance, your, c...|
|5b8d515328931725e...|colonize solar sy...|[colonize, solar,...|
|5b8d546a28931725e...|mark of the ninja...|[mark, of, the, n...|
|5b8d9352852f693bc...|adds a  field of ...|[adds, a, , field...|
|5b8d962b852f693bc...|be ready to face ...|[be, ready, to, f...|
|5b8d9635852f693bc...|adds a  field of ...|[adds, a, , field...|
|5b8d9634852f693bc...|developers note o...|[developers, note...|
|5b8d9636852f693bc...|your city was inf...|[your, city, was,...|
+--------------------+--------------------+--------------------+
only showing top 10 rows



In [14]:
# remove stopwords
# http://spark.apache.org/docs/latest/ml-features#stopwordsremover

remover = StopWordsRemover(inputCol="words", outputCol="filtered")
tokenized = remover.transform(tokenized)

In [15]:
tokenized.show(truncate=True)

+--------------------+--------------------+--------------------+--------------------+
|                 _id|detailed_description|               words|            filtered|
+--------------------+--------------------+--------------------+--------------------+
|5b8d041128931725e...|become immersed i...|[become, immersed...|[become, immersed...|
|5b8d1ca328931725e...|inviting you to e...|[inviting, you, t...|[inviting, explor...|
|5b8d40ef28931725e...|enhance your crus...|[enhance, your, c...|[enhance, crusade...|
|5b8d515328931725e...|colonize solar sy...|[colonize, solar,...|[colonize, solar,...|
|5b8d546a28931725e...|mark of the ninja...|[mark, of, the, n...|[mark, ninja, spe...|
|5b8d9352852f693bc...|adds a  field of ...|[adds, a, , field...|[adds, , field, i...|
|5b8d962b852f693bc...|be ready to face ...|[be, ready, to, f...|[ready, face, wor...|
|5b8d9635852f693bc...|adds a  field of ...|[adds, a, , field...|[adds, , field, i...|
|5b8d9634852f693bc...|developers note o...|[developers

In [16]:
# http://spark.apache.org/docs/latest/ml-features#countvectorizer

# TODO, tweak minDF, etc.?
# http://spark.apache.org/docs/latest/api/scala/org/apache/spark/ml/feature/CountVectorizer.html
cv = CountVectorizer(inputCol="filtered", outputCol="features", minDF=3)
vectorized = cv.fit(tokenized)

In [17]:
features = vectorized.transform(tokenized)
features.show(truncate=True)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                 _id|detailed_description|               words|            filtered|            features|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|5b8d041128931725e...|become immersed i...|[become, immersed...|[become, immersed...|(3790,[0,4,8,10,1...|
|5b8d1ca328931725e...|inviting you to e...|[inviting, you, t...|[inviting, explor...|(3790,[0,4,10,23,...|
|5b8d40ef28931725e...|enhance your crus...|[enhance, your, c...|[enhance, crusade...|(3790,[0,4,5,40,4...|
|5b8d515328931725e...|colonize solar sy...|[colonize, solar,...|[colonize, solar,...|(3790,[0,1,6,9,11...|
|5b8d546a28931725e...|mark of the ninja...|[mark, of, the, n...|[mark, ninja, spe...|(3790,[0,2,7,8,9,...|
|5b8d9352852f693bc...|adds a  field of ...|[adds, a, , field...|[adds, , field, i...|(3790,[0,121,270,...|
|5b8d962b852f693bc...|be ready to fac

In [18]:
# number of topics
k = 50

lda = LDA(k=k, maxIter=10)
model = lda.fit(features)
print(type(model))

# https://stackoverflow.com/questions/33725122/reporting-log-likelihood-perplexity-of-spark-lda-model-different-in-local-vs
ll = model.logLikelihood(features)
lp = model.logPerplexity(features)
print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
print("The upper bound on perplexity: " + str(lp))

# Describe topics.
topics = model.describeTopics(15)
print("The topics described by their top-weighted terms:")
topics.show(truncate=False)

<class 'pyspark.ml.clustering.LocalLDAModel'>
The lower bound on the log likelihood of the entire corpus: -885528.9656972471
The upper bound on perplexity: 14.520200795219347
The topics described by their top-weighted terms:
+-----+------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|topic|termIndices                                                                               |termWeights                                                                                                                                                                                                                                   

In [19]:
# TODO: is this needed?
topics_matrix = model.topicsMatrix().toArray()
topics_matrix

array([[0.73471822, 0.70473257, 0.71194883, ..., 2.91174743, 3.02105692,
        0.63022096],
       [0.69019133, 0.78298389, 0.68587308, ..., 0.69831265, 1.21935841,
        0.73414185],
       [0.76447422, 0.67364358, 0.76127377, ..., 0.85327955, 0.71821888,
        0.93140191],
       ...,
       [0.63527696, 0.85292464, 0.78234888, ..., 0.81375713, 0.76146196,
        0.85471433],
       [0.89992419, 0.80152763, 0.6344772 , ..., 0.7341108 , 0.7799453 ,
        0.70658562],
       [0.71468068, 0.89970003, 0.76454647, ..., 0.78075598, 0.9259023 ,
        0.65230046]])

In [20]:
# can't iterate over a dataframe so we collect it first
collected_topics = topics.collect()

for topic in range(k):
    print("Topic " + str(topic) + ":")
    terms = []
    for termIndex in collected_topics[topic]['termIndices']:
        terms.append(str(vectorized.vocabulary[termIndex]))
    print(', '.join(terms))
    print("-----------------------------")

Topic 0:
click, highspeed, eating, bats, casual, reduce, surround, shields, looking, villains, romantic, guardian, lesbian, president, 360
-----------------------------
Topic 1:
welltrained, poop, steps, selection, met, horror, taught, confronted, onslaught, enthusiasm, replaces, fruit, decorate, error, excellent
-----------------------------
Topic 2:
restore, done, chain, everyones, squares, obtain, quickly, protecting, offers, depend, picturefeatures20, includesrank, lower, 80sthe, guard
-----------------------------
Topic 3:
middle, appearance, investigation, gift, emitteractive, deck, army, fishing, bloody, heritage, initially, cells, achieve, finale, rival
-----------------------------
Topic 4:
grid, puzzle, , badges, nine, es, read, emoticons, guides, english, column, completed, backgrounds, objective, solution
-----------------------------
Topic 5:
planet, riding, casual, chicken, jump, avoid, system, sun, simulator, reach, collect, solar, , history, runner
---------------------

In [21]:
transformed = model.transform(features)
transformed.show(truncate=True)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                 _id|detailed_description|               words|            filtered|            features|   topicDistribution|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|5b8d041128931725e...|become immersed i...|[become, immersed...|[become, immersed...|(3790,[0,4,8,10,1...|[6.39265878730774...|
|5b8d1ca328931725e...|inviting you to e...|[inviting, you, t...|[inviting, explor...|(3790,[0,4,10,23,...|[2.95475117972630...|
|5b8d40ef28931725e...|enhance your crus...|[enhance, your, c...|[enhance, crusade...|(3790,[0,4,5,40,4...|[5.73701821562152...|
|5b8d515328931725e...|colonize solar sy...|[colonize, solar,...|[colonize, solar,...|(3790,[0,1,6,9,11...|[1.39277350979563...|
|5b8d546a28931725e...|mark of the ninja...|[mark, of, the, n...|[mark, ninja, spe...|(3790,[0,2,7,8,9,..

In [22]:
# Save and load model
#ldaModel.save(sc, "/spark-steam-model")
#sameModel = LDAModel.load(sc, "/spark-steam-model")

In [23]:
sc.stop()