In [1]:
import pandas as pd
import numpy as np
import datetime as dt

import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler, StringIndexer, IndexToString, RegexTokenizer,\
    StopWordsRemover, Word2Vec, CountVectorizer, IDF, HashingTF
from pyspark.ml.clustering import *
from pyspark.ml.evaluation import *
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.functions import vector_to_array
from pyspark.ml import Pipeline

# iplot won't work because I've not installed the extension
import chart_studio.plotly as ply
import plotly.offline as plyoff
import plotly.graph_objects as go
import plotly.subplots as plysub

plyoff.init_notebook_mode(connected=True)
init = go.Figure(data=[go.Scatter(x=[1,2], y=[42,42])], layout=go.Layout(title='Init'))
plyoff.iplot(init)

In [2]:
# initialize
sc = pyspark.SparkContext()
spark = SparkSession(sc)
spark.sparkContext.appName = 'latentdirichlet'
# show the number of cores
print('%d cores'%spark._jsc.sc().getExecutorMemoryStatus().keySet().size())
spark

1 cores


In [3]:
''' get the data '''
# load the data
fil = '../data/recipes.json'
rec = spark.read.format('json').load(fil)

# add an ID - don't actually care if it's monotonic
rec = rec.select(monotonically_increasing_id().alias('id'), '*')


# talk
cnt = rec.count()
print('%d records'%cnt)
rec.show(2, truncate=False)

1617 records
+---+------------+------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

### Data Prep

In [4]:
''' handle missing values '''
# check for missing values
nullCounts = {colm:rec.select(colm).where(col(colm).isNull()).count() for colm in rec.columns}
nullCounts = {colm:(ncnt, ncnt/cnt) for (colm, ncnt) in nullCounts.items()}
nullCountsDF = pd.DataFrame(nullCounts).T.reset_index(drop=False).sort_values(1, ascending=False)
nullCountsDF.columns = ['Column', 'Freq.', 'Rel. Freq.']
nullCountsDF = nullCountsDF.merge(pd.DataFrame([[colm.name, colm.dataType] for colm in rec.schema], columns=['Column', 'Type']),
                                how='inner', on=['Column'])

# talk
display(nullCountsDF)

# remove
rec = rec.dropna(how='any')

# talk some more
print('%d records'%rec.count())

Unnamed: 0,Column,Freq.,Rel. Freq.,Type
0,Description,188.0,0.116265,StringType
1,Author,6.0,0.003711,StringType
2,id,0.0,0.0,LongType
3,Ingredients,0.0,0.0,"ArrayType(StringType,true)"
4,Method,0.0,0.0,"ArrayType(StringType,true)"
5,Name,0.0,0.0,StringType
6,url,0.0,0.0,StringType


1423 records


In [5]:
''' get rid of
non-alphanumeric or whitespace chars
drop spaces after a number befor a word
get rid of multiplied spaces
'''
# Name + Description + Ingredients?
recML = rec.select('id', concat_ws(' ', 'Name', 'Description', 'Ingredients').alias('text'))\
    .withColumn('text', regexp_replace(col('text'), '[^A-Za-z0-9 ]', ' ').alias('text'))\
    .withColumn('text', regexp_replace(col('text'), '([0-9])\s+([A-Za-z])', '$1$2'))\
    .withColumn('text', regexp_replace(col('text'), ' +', ' '))

# talk
recML.show(2, truncate=False)

+---+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|id |text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 

In [6]:
''' all feature engineering stuff '''
# could try HashingTF or Word2Vec - but want the vocabulary which HashingTF doesn't have

# tokenizer
toker = RegexTokenizer(inputCol='text', outputCol='words', pattern='\\W', toLowercase=True)
# stopper
stop = StopWordsRemover(inputCol=toker.getOutputCol(), outputCol='fewer_words')
# hashing term frequency processor
#tf = HashingTF(inputCol=stop.getOutputCol(), outputCol='freqs', numFeatures=256)
tf = CountVectorizer(inputCol=stop.getOutputCol(), outputCol='freqs')
# tfidf
idf = IDF(inputCol=tf.getOutputCol(), outputCol='features')

# pipeline
featEngine = Pipeline(stages=[toker, stop, tf, idf]).fit(recML)
recML = featEngine.transform(recML).drop('text', 'words', 'freqs')

# talk
recML.show(2, truncate=False)

+---+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## Modeling

In [7]:
# split for cross-val
trainPerc = 0.7
randSeed = 42
tran, test = recML.select('id', 'features').randomSplit([trainPerc, 1.0 - trainPerc], seed=randSeed)

# talk
print('Training Cases')
tran.select('id').show()
print('Testing Cases')
test.select('id').show()

Training Cases
+---+
| id|
+---+
|  0|
|  1|
|  3|
|  4|
|  5|
|  7|
| 10|
| 11|
| 12|
| 16|
| 17|
| 18|
| 20|
| 22|
| 25|
| 26|
| 27|
| 31|
| 33|
| 36|
+---+
only showing top 20 rows

Testing Cases
+---+
| id|
+---+
|  2|
|  6|
|  8|
|  9|
| 13|
| 14|
| 15|
| 19|
| 21|
| 23|
| 24|
| 28|
| 29|
| 30|
| 32|
| 34|
| 35|
| 39|
| 42|
| 43|
+---+
only showing top 20 rows



In [8]:
''' evaluate different clustering cardinalities - kmeans '''
# setup range that will be tried
kMax = 21
xs = list(range(2, kMax))
kLogLike = np.ones(shape=(kMax, 2))*-1*np.inf # log likelihood maximize
kLogPerp = np.ones(shape=(kMax, 2))*np.inf    # log perplexity minimize

# iterate over k
models = [None]*kMax
for k in range(2, kMax):
    print('Trying k = %d'%k)
    # fit the LDA model on the training set
    lda = LDA(k=k, maxIter=10, featuresCol='features')
    models[k] = lda.fit(tran)
    # eval the model on the training set
    kLogLike[k, 0] = models[k].logLikelihood(tran)
    kLogPerp[k, 0] = models[k].logPerplexity(tran)   
    print('\tTraining Likelihood = %0.3f, Perplexity = %0.3f'%(kLogLike[k, 0], kLogPerp[k, 0]))
    # eval the model on the testing set 
    kLogLike[k, 1] = models[k].logLikelihood(test)
    kLogPerp[k, 1] = models[k].logPerplexity(test)
    print('\tTesting Likelihood = %0.3f, Perplexity = %0.3f'%(kLogLike[k, 1], kLogPerp[k, 1]))
    
# build and show the scree plot
fig = plysub.make_subplots(rows=2, cols=1, print_grid=False, subplot_titles=('Likelihood vs. k (maximize)', 'Perplexity vs. k (minimize)'))
# add training traces
fig.add_trace(go.Scatter(x=xs, y=kLogLike[2:, 0], mode='markers+lines',
                         marker={'color':'red'}, line={'color':'red', 'width':1},
                         name='Training Metrics', legendgroup='train', showlegend=True), 1, 1)
fig.add_trace(go.Scatter(x=xs, y=kLogPerp[2:, 0], mode='markers+lines',
                         marker={'color':'red'}, line={'color':'red', 'width':1},
                         name='Training Metrics', legendgroup='train', showlegend=False), 2, 1)
# add testing traces
fig.add_trace(go.Scatter(x=xs, y=kLogLike[2:, 1], mode='markers+lines',
                         marker={'color':'green'}, line={'color':'green', 'width':1},
                         name='Testing Metrics', legendgroup='test', showlegend=True), 1, 1)
fig.add_trace(go.Scatter(x=xs, y=kLogPerp[2:, 1], mode='markers+lines',
                         marker={'color':'green'}, line={'color':'green', 'width':1},
                         name='Testing Metrics', legendgroup='test', showlegend=False), 2, 1)

fig['layout']['title'] = 'Latent Dirichlet Allocation Results'
plyoff.plot(fig)

# find the min
bestK = np.argmax(kLogLike[:,0])
print('Best model has %d topics, with testing values of %0.3f and %0.3f'%(bestK, kLogLike[bestK, 1], kLogPerp[bestK, 1]))

Trying k = 2
	Training Likelihood = -1482383.888, Perplexity = 8.021
	Testing Likelihood = -552471.755, Perplexity = 8.304
Trying k = 3
	Training Likelihood = -1479175.572, Perplexity = 8.003
	Testing Likelihood = -553822.266, Perplexity = 8.324
Trying k = 4
	Training Likelihood = -1483075.810, Perplexity = 8.024
	Testing Likelihood = -558189.148, Perplexity = 8.390
Trying k = 5
	Training Likelihood = -1490646.355, Perplexity = 8.065
	Testing Likelihood = -565150.798, Perplexity = 8.494
Trying k = 6
	Training Likelihood = -1497750.753, Perplexity = 8.104
	Testing Likelihood = -572281.501, Perplexity = 8.602
Trying k = 7
	Training Likelihood = -1506976.706, Perplexity = 8.154
	Testing Likelihood = -581554.203, Perplexity = 8.741
Trying k = 8
	Training Likelihood = -1513467.641, Perplexity = 8.189
	Testing Likelihood = -589605.618, Perplexity = 8.862
Trying k = 9
	Training Likelihood = -1524530.095, Perplexity = 8.249
	Testing Likelihood = -600493.674, Perplexity = 9.026
Trying k = 10
	T

In [9]:
''' Evaluate best model on test set '''
# get the best
bestK = int(input('Enter the "best" k'))
bestModel = models[bestK]
ll = kLogLike[bestK, :]
lp = kLogPerp[bestK, :]
print('Best model has %d topics, with testing (training) values of %0.3f (%0.3f) and %0.3f (%0.3f)'\
      %(bestK, kLogLike[bestK, 1], kLogLike[bestK, 0], kLogPerp[bestK, 1], kLogPerp[bestK, 0]))

Enter the "best" k 5


Best model has 5 topics, with testing (training) values of -565150.798 (-1490646.355) and 8.494 (8.065)


In [10]:
''' Interpret the topics '''
# get the topics & most important words for each
topics = bestModel.describeTopics(maxTermsPerTopic=5).collect()
words = featEngine.stages[-2].vocabulary

# iterate over topics
for (indx, topic) in enumerate(topics):
    terms = ', '.join(['%s=%0.2f%%'%(words[term], 100*wait) for (term, wait) in zip(topic['termIndices'], topic['termWeights'])])
    print('Topic %d = %s'%(indx, terms))

Topic 0 = horseradish=0.53%, pies=0.36%, prawn=0.35%, mincemeat=0.32%, mince=0.32%
Topic 1 = chocolate=0.53%, ginger=0.49%, sugar=0.45%, cake=0.44%, icing=0.44%
Topic 2 = sprouts=0.52%, ham=0.44%, fra=0.41%, cr=0.41%, really=0.40%
Topic 3 = chopped=0.41%, 1tbsp=0.40%, sauce=0.38%, cheese=0.38%, finely=0.36%
Topic 4 = maple=0.55%, haddock=0.36%, chorizo=0.31%, raspberry=0.27%, pecan=0.26%


In [12]:
''' predict topics on input data '''
topicCols = ['Topic %d'%k for k in range(bestK)]
# predict, join with the raw data, and parse the topic probabilities
preds = bestModel.transform(recML).select('id', vector_to_array('topicDistribution').alias('topics'))\
    .join(rec, how='inner', on=['id']).select('id', 'Author', 'url', 'Name', 'Description', 'Ingredients', 'topics').toPandas()
# label the topic probabilities and drop the vector column
preds[topicCols] = preds['topics'].tolist()
preds.drop(columns=['topics'], inplace=True)
# get the most likely topic
preds['Predicted Topic'] = preds[['Topic %d'%k for k in range(bestK)]].idxmax(axis=1)
preds['Predicted Topic Prob.'] = preds[['Topic %d'%k for k in range(bestK)]].max(axis=1)
# talk
display(preds.head())

Unnamed: 0,id,Author,url,Name,Description,Ingredients,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4,Predicted Topic,Predicted Topic Prob.
0,0,Mary Cadogan,https://www.bbcgoodfood.com/recipes/2793/chris...,Christmas pie,Combine a few key Christmas flavours here to m...,"[2 tbsp olive oil, knob butter, 1 onion, finel...",0.000923,0.001054,0.000956,0.996156,0.000911,Topic 3,0.996156
1,1,Mary Cadogan,https://www.bbcgoodfood.com/recipes/1160/simme...,Simmer-&-stir Christmas cake,An easy-to-make alternative to traditional Chr...,"[175g butter, chopped, 200g dark muscovado sug...",0.000865,0.83918,0.000899,0.1582,0.000856,Topic 1,0.83918
2,2,Sara Buenfeld,https://www.bbcgoodfood.com/recipes/72622/chri...,Christmas cupcakes,These beautiful and classy little cakes make l...,"[200g dark muscovado sugar, 175g butter, chopp...",0.140048,0.857966,0.000605,0.000806,0.000575,Topic 1,0.857966
3,3,Paul Hollywood,https://www.bbcgoodfood.com/recipes/1803633/ch...,Christmas buns,Paul Hollywood's fruit rolls can be made ahead...,"[500g strong white flour, plus extra for dusti...",0.000971,0.819254,0.001002,0.087722,0.091051,Topic 1,0.819254
4,4,Barney Desmazery,https://www.bbcgoodfood.com/recipes/981634/chr...,Christmas cupcakes,"Made these for the second time today, and I ha...","[280g self-raising flour, 175g golden caster s...",0.000687,0.821378,0.000712,0.176543,0.00068,Topic 1,0.821378


In [None]:
sc.stop()