### Group Members: Anirav Jain, Daniel Tinoco, Joren Libunao, Theo Kim, Zemin Cai

In [0]:
import os
from urllib.parse import quote_plus

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(context='notebook', style='darkgrid')
%config InlineBackend.figure_format = 'retina'
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.feature import Tokenizer, VectorAssembler, StopWordsRemover, Word2Vec
from pyspark.ml.clustering import KMeans
from wordcloud import WordCloud

import spacy

# Analytics Goals:
 - Tokenize, remove stop words, and vectorize post titles
 - Cluster posts by their title vectors to create 'topic' clusters
 - Create word clouds for each cluster

Loading spaCy's English language class, which provides the default English stop words

In [0]:
# Look up spaCy's language class for English ('en'). Later cells read and
# extend its `Defaults.stop_words` set.
cls = spacy.util.get_lang_class('en')

Out[97]: True

## Downloading Data from MongoDB

In [0]:
# Reuse the active Spark session if one exists; otherwise create one.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [0]:
## info needed for agg_to_mongo

# Credentials are read from the environment so that no secret is stored in the
# notebook. MONGO_USERNAME and MONGO_PASSWORD must be set before this cell
# runs; a missing variable raises KeyError here rather than failing later
# inside the Spark read.
# NOTE(review): the password that used to be hardcoded in this cell has been
# exposed and should be rotated.
mongo_username = os.environ['MONGO_USERNAME']
mongo_password = os.environ['MONGO_PASSWORD']
mongo_ip_address = 'msds697-cluster.iwmrj.mongodb.net'
database_name = 'admin'
collection_name = 'posts'

# Percent-encode the credentials: characters such as '@', ':' or '/' inside a
# username or password would otherwise corrupt the URI.
connection_string = (
    f'mongodb+srv://{quote_plus(mongo_username)}:{quote_plus(mongo_password)}'
    f'@{mongo_ip_address}/{database_name}.{collection_name}'
)

In [0]:
# Read the collection addressed by `connection_string` into a Spark DataFrame
# using the 'mongo' data source.
posts_reader = spark.read.format('mongo')
posts_reader = posts_reader.option('uri', connection_string)
df_posts = posts_reader.load()

Viewing the posts dataframe to be sure the data is captured correctly

In [0]:
# Preview the loaded posts.
# NOTE(review): `.display` is presumably the notebook platform's DataFrame
# helper rather than core PySpark; confirm that the positional 5 really limits
# the preview to five rows (`df_posts.limit(5)` would make that explicit).
df_posts.display(5)

## Clustering Posts by Titles

Goal: tokenize the post titles and remove stop words, embed each title as a vector with Word2Vec, then cluster those title vectors with k-means.

In [0]:
# Split each post title into tokens.
tokenizer = Tokenizer(inputCol="post_title", outputCol="words")
tokenized = tokenizer.transform(df_posts)

# Drop stop words using spaCy's English stop-word set. StopWordsRemover expects
# a list of strings, so the set is copied into a list; words added to
# `cls.Defaults.stop_words` after this cell runs are not picked up here.
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered_words",
    stopWords=list(cls.Defaults.stop_words),
)
tokenized = remover.transform(tokenized)

# Create a feature vector: Word2Vec is fitted on the filtered tokens and writes
# one vector per title to the 'result' column.
w2v = Word2Vec(inputCol = 'filtered_words', outputCol = 'result')
model = w2v.fit(tokenized)
df_post2vec = model.transform(tokenized)

Using the elbow method, identify the best k for number of clusters.

In [0]:
# Fit k-means for k = 2, 5, ..., 32 on the 'result' vectors and record each
# fit's training cost for the elbow plot.
wss_values = [
    KMeans(k=n_clusters, seed=1, featuresCol="result")
    .fit(df_post2vec)
    .summary
    .trainingCost
    for n_clusters in range(2, 33, 3)
]

In [0]:
# Elbow plot: training cost (WSS) for each candidate number of clusters.
elbow_fig, elbow_ax = plt.subplots()
elbow_ax.plot(range(2, 33, 3), wss_values)
elbow_ax.set_title('Elbow Method')
elbow_ax.set_xlabel('Number of clusters')
elbow_ax.set_ylabel('Within-cluster sum of squares')
plt.show()

Set k to 15. Create clusters with labels.

In [0]:
# Final clustering with k = 15 on the 'result' vectors. `transform` adds the
# assigned cluster id to every post.
kmeans = KMeans(k=15, seed=1, featuresCol="result")
model = kmeans.fit(df_post2vec)
predictions = model.transform(df_post2vec)

We now have all of our posts clustered, so we can return their titles and cluster labels for grouping and word cloud creation.

In [0]:
# Show each post title next to its assigned cluster id, ordered by cluster.
(
    predictions
    .select('post_title', 'prediction')
    .orderBy('prediction')
    .display(5)
)

Removing some extra stop words that appeared in the word cloud yet do not feel relevant.

In [0]:
# Extend spaCy's English stop-word set in place with the extra words to hide
# from the word clouds.
cls.Defaults.stop_words.update(('u', 's', 'new', 'us', 'says'))

In [0]:
# Draw one word cloud per cluster on a 5 x 3 grid (15 panels, one per cluster id).
fig, ax = plt.subplots(5,3, figsize = (18,30))
ax = ax.flatten()

for i in range(15):
    # Filtered title words of every post assigned to cluster i, one row per word.
    cluster = predictions.filter(f'prediction == {i}').select('filtered_words')
    word_list_df = cluster.select(explode(cluster.filtered_words).alias("word"))

    # WordCloud works on a plain string, so the words are collected to the driver.
    words_list = [row.word for row in word_list_df.collect()]

    try:
        wordcloud = WordCloud(height = 400, width = 400, background_color ='white', stopwords = cls.Defaults.stop_words).generate(' '.join(words_list))
    except ValueError:
        # WordCloud.generate raises ValueError when no words are left to draw.
        # This happens when a cluster id has no posts (k-means may return
        # fewer than k clusters) or when every word is a stop word. The panel
        # is left blank instead of aborting the whole figure.
        pass
    else:
        ax[i].imshow(wordcloud)

    ax[i].axis("off")
    ax[i].set_title(f'Cluster {i}')

fig.tight_layout(pad = 0)
plt.show()

![wordcloud](files/shared_uploads/bkim39@dons.usfca.edu/download.png)

## Summary
 - Converted post titles into vectors with Word2Vec after removing stop words
 - Created 15 k-means clusters that group posts by the similarity of their title vectors
 - Created a word cloud for each cluster to identify keywords/phrases and general hot topics