In [None]:
import findspark
findspark.init()

from neon_demo.kmeans import KMeans
from neon_demo.neon import NeuralizedKMeans, neon
from neon_demo.utils import *
from pyspark import SparkConf, SparkContext
from pyspark.sql import functions as F
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, FloatType
from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
import torch as tr

In [None]:
# Get (or create) a SparkSession named 'explore' with master 'local[*]'.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('explore')
    .getOrCreate()
)

In [None]:
# Read the JSON dataset into a Spark DataFrame and print a preview of its rows.
deep_vectors_path = '/common/users/shared/cs543_fall22_group3/combined/deep_vectors'
vectorized_df = spark.read.json(deep_vectors_path)
vectorized_df.show()

In [None]:
# Replace the 'vector' column with the same column cast to array<float>,
# then print the schema so the resulting column types can be checked.
float_vector_col = F.col('vector').cast('array<float>')
vectorized_df = vectorized_df.withColumn('vector', float_vector_col)
vectorized_df.printSchema()

In [None]:
# UDF that turns each 'vector' value into a torch tensor, applied in place of the
# original 'vector' column.
# NOTE(review): F.udf is called without a returnType (PySpark then defaults to
# StringType), and the lambda returns a torch.Tensor, which Spark most likely cannot
# serialise back to the JVM — confirm this survives an action such as take(); if it
# does not, build the tensors on the driver after collecting the sample instead.
tensorize_udf = F.udf(lambda values: tr.tensor(values))
tensor_col = tensorize_udf(F.col('vector'))
vectorized_df = vectorized_df.withColumn('vector', tensor_col)

In [None]:
# Upper bound on the number of rows brought back to the driver for local analysis.
SAMPLE_SIZE = 100000
sample = vectorized_df.take(SAMPLE_SIZE)

In [None]:
# Build a pandas DataFrame from the collected rows and show its first rows.
# Columns are labelled positionally: the first field becomes 'text', the second 'vectors'.
# NOTE(review): this assumes each row is ordered (text, vector) — confirm against the
# schema printed earlier.
column_names = ['text', 'vectors']
sample_df = pd.DataFrame(data=sample, columns=column_names)
sample_df.head()

In [None]:
# Write the sampled frame to 'sample_df.csv' in the current working directory.
# NOTE(review): CSV stores the 'vectors' column as text; if full numeric vectors are
# needed when reading this file back, verify they round-trip or use another format.
sample_df.to_csv(path_or_buf='sample_df.csv')

In [None]:
# Number of clusters to fit; also reused by later cells.
n_clusters = 10
# A fixed random_state keeps the run reproducible.
m = KMeans(random_state=77, n_clusters=n_clusters)
m.fit(sample_df['vectors'])

In [None]:
# Display the result of size() on the model's centroids as a sanity check after fitting
# (presumably a tensor with one row per cluster — TODO confirm against neon_demo.kmeans).
m.centroids.size()

In [None]:
# Predict a cluster for every sampled vector with the model's underlying `kmeans`
# object, store the result in a new 'predictions' column, and show the first rows.
cluster_ids = m.kmeans.predict(sample_df['vectors'])
sample_df['predictions'] = cluster_ids
sample_df.head()

In [None]:
# Draw one word cloud per cluster from the texts assigned to that cluster.
stopwords = set(STOPWORDS)

for i in range(n_clusters):
    # Filter on the frame's own 'predictions' column; a bare `predictions` name is
    # not assigned by any cell above and would raise NameError on a fresh kernel.
    cluster = sample_df[sample_df['predictions'] == i]

    # No rows were assigned to this cluster, so there is nothing to draw.
    if cluster.empty:
        continue

    # Concatenate every text in the cluster into a single string for WordCloud.
    cluster_text = ' '.join(cluster['text'])
    wordcloud = WordCloud(stopwords=stopwords, background_color='white').generate(cluster_text)

    plt.figure(figsize=(15, 15))
    plt.title(f'Cluster {i}', fontsize=20)
    plt.imshow(wordcloud)
    plt.show()

In [None]:
# Wrap the KMeans model in NeuralizedKMeans and rebind `m` to the wrapper.
# NOTE(review): because `m` is overwritten, running this cell a second time would pass
# the wrapper itself back into NeuralizedKMeans — confirm that is harmless, or re-run
# the notebook from the top.
m = NeuralizedKMeans(m)