Imports and helper functions:

In [None]:
import umap
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

import sys
import os

import plotly.express as px
import plotly.offline as pyo

import numpy as np
import pandas as pd
pd.options.plotting.backend = "plotly"

import os.path


# from embedding_utils import *
from sentence_transformers import SentenceTransformer


def get_embeddings(sentences, model_name="paraphrase-MiniLM-L6-v2"):
    """Embed a collection of text samples with a SentenceTransformer model.

    Args:
        sentences: List-like collection of text samples to embed.
        model_name: Name of the model handed to ``SentenceTransformer``
            (the default, paraphrase-MiniLM-L6-v2, is a Sentence-BERT model).

    Returns:
        The result of ``model.encode(sentences)``: one embedding vector per
        input sample, in the same order as ``sentences``.
    """
    encoder = SentenceTransformer(model_name)
    return encoder.encode(sentences)


def get_transform(data, method, n_neighbors=15, min_dist=0.1, n_pca=50, n_umap=2, metric='euclidean'):
    """Reduce high-dimensional data with PCA, UMAP, or PCA followed by UMAP.

    Args:
        data: Set of vectors to be transformed (anything accepted by the
            ``fit_transform`` methods of ``PCA`` / ``umap.UMAP``).
        method (str): One of 'umap', 'pca', or 'both'. With 'both', PCA is run
            first and UMAP is then run on the PCA output.
        n_neighbors (int, optional): Parameter for UMAP. Defaults to 15.
        min_dist (float, optional): Parameter for UMAP. Defaults to 0.1.
        n_pca (int, optional): Dimensionality of vectors returned from the PCA
            transform. Defaults to 50.
        n_umap (int, optional): Dimensionality of vectors returned from the
            UMAP transform. Defaults to 2.
        metric (str, optional): Distance metric for UMAP. Defaults to 'euclidean'.

    Returns:
        tfm: array of transformed vectors.

    Raises:
        ValueError: If ``method`` is not 'umap', 'pca', or 'both'.
    """
    # Fail fast with a clear message; previously an unknown method fell through
    # every branch and crashed on `return tfm` with an UnboundLocalError.
    if method not in ('umap', 'pca', 'both'):
        raise ValueError(
            "Unknown method {!r}; expected 'umap', 'pca', or 'both'".format(method)
        )

    if method in ('pca', 'both'):
        data = PCA(n_components=n_pca).fit_transform(data)
        if method == 'pca':
            return data

    # 'umap' runs on the raw input, 'both' on the PCA output computed above.
    reducer = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_umap,
        metric=metric,
    )
    return reducer.fit_transform(data)


def sil_range(tfm, test_vals=np.arange(5,20)):
    """
    Fit k-means for each candidate k, score each fit with the silhouette score,
    show the scores as a bar chart, and return the best-scoring k.

    Args:
        tfm: array of transformed values, such as those output by get_transform
        test_vals: Iterable of integer k-values to test. The default,
            ``np.arange(5, 20)``, tests k = 5 through 19 (upper bound exclusive).

    Returns:
        (int, float): k-value with the highest silhouette score and that score

    Raises:
        ValueError: If ``test_vals`` is empty.
    """
    # Materialise once: the values are needed both for the loop and for the
    # results table, so a one-shot iterable would otherwise be exhausted.
    k_values = list(test_vals)
    if not k_values:
        raise ValueError("test_vals must contain at least one k value")

    print("Testing k values:")
    silhouette_scores = []

    for k in k_values:
        print(k)
        kmeans = KMeans(n_clusters=k, random_state=0).fit(tfm)
        silhouette_scores.append(silhouette_score(tfm, kmeans.labels_))

    df = pd.DataFrame({'k': k_values, 'silhouette': silhouette_scores})
    fig = px.bar(df, x='k', y='silhouette')
    fig.show()

    # argmax() is positional, so pair it with iloc and look the row up once.
    best = df.iloc[df['silhouette'].argmax()]
    return int(best['k']), best['silhouette']


def px_plot(data, save=False, fpath="fig.html", title="Semantic Clustering of Monkeypox Discussion on Twitter", colormap=False, fname=None):
    """
    Generate an interactive scatter plot of transformed tweet embeddings, with
    tweet text and metadata shown on hover. Depending on your data format, you
    may need to modify this to correspond to your tweet metadata and variable names.

    Args:
        data: DataFrame of tweets and embedding coordinates. Must contain the columns:
            x: transformed x-coordinate
            y: transformed y-coordinate
            hover_text: text to be displayed when hovering on plot
            retweet_count, favorite_count: Tweet metadata
            k_means_category: cluster assignment shown in the hover box
            id: tweet id shown in the hover box
            size: desired size of points on plot. For twitter data, something like ln(retweets) can be useful
            label: only required when ``colormap`` is given; used to color points
        save: Set to True to save your plot as .html file
        fpath: filepath and name to save your plot .html
        title: set title of plot
        colormap: Dict of label:color pairs for coloring points (e.g. kmeans category:color)
        fname: Alias for ``fpath``; when given it takes precedence.
    """
    # Callers in this notebook pass the output path as `fname`.
    if fname is not None:
        fpath = fname

    pyo.init_notebook_mode()

    # Attach the hover columns through `custom_data` so plotly splits them per
    # trace together with x/y. Setting `customdata` to the whole frame on every
    # trace misaligns the hover text once `color` produces several traces.
    # Order matters: the hovertemplate below addresses these by position.
    hover_columns = ['hover_text', 'favorite_count', 'retweet_count', 'k_means_category', 'id']

    scatter_kwargs = dict(
        x="x",
        y="y",
        size="size",
        title=title,
        custom_data=hover_columns,
    )
    if colormap:
        scatter_kwargs.update(color="label", color_discrete_map=colormap)

    fig = px.scatter(data, **scatter_kwargs)

    fig.update_traces(
        hovertemplate='%{customdata[0]}<br><br>'
            +'Likes: %{customdata[1]}   Retweets: %{customdata[2]}<br>'
            +'K-Means Category: %{customdata[3]}<br>'
            +'TweetID: %{customdata[4]}',
    )

    fig.update_layout(
        autosize=False,
        width=1600,
        height=900,
        showlegend=True,
        legend=dict(
            yanchor="top",
            y=0.99,
            xanchor="right",
            x=0.99,
        ),
        legend_title_text='Topic Label',
    )

    # The axes carry no interpretable units after UMAP, so hide them.
    fig.update_yaxes(title='y', visible=False)
    fig.update_xaxes(title='x', visible=False)

    fig.show()

    if save:
        print("Saving figure as: ", fpath)
        fig.write_html(fpath)
        
        

In [None]:
# Map k-means cluster numbers to human-readable cluster labels.
# NOTE: this cell needs `sampled` (with a `k_means_category` column), `km_type`,
# `n_pca`, `n_neighbors`, `k` and `min_dist` from the cells further down, and
# the mapping below only covers a 7-cluster solution (categories 0-6).
cluster_names = {
    0:'Noise - Outside', 
    1:'Noise - Middle', 
    2:'Case reports', 
    3:'Transmissibility, MSM communities', 
    4:'WHO Emergency', 
    5:'Vaccines', 
    6:'Monkeypox vs. Covid'
}

sampled['label'] = sampled['k_means_category'].apply(lambda x: cluster_names[x])

# Color by label name: build one label -> color map so cluster colors stay
# consistent across graphics.
categories = list(cluster_names.values())
col = px.colors.qualitative.Plotly[:len(categories)]
gray = '#BAB0AC'


color_map_l = dict(zip(categories,  col))
print(color_map_l)

# `px_plot` takes the output path as `fpath`; pass the color map so the points
# are actually colored by label.
px_plot(
    sampled,
    fpath="../results/mpox_{}_{}_{}_{}_{}.html".format(km_type, n_pca, n_neighbors, k, min_dist),
    colormap=color_map_l,
)


Read preprocessed 10% sample of tweets generated by "data_cleaning.ipynb"

In [None]:
import pandas as pd
import textwrap

# Load the preprocessed tweet sample.
sampled = pd.read_pickle("../data/010sample_091222.pkl")

# hover_text: tweet text wrapped at 50 characters, with HTML line breaks
# between the wrapped lines so it displays as a block in the hover box.
sampled["hover_text"] = [
    '<br>'.join(textwrap.wrap(tweet, width=50)) for tweet in sampled.full_text
]

# Marker size: log of (squared engagement + 1), where engagement is
# retweets plus favorites.
engagement = sampled['retweet_count'] + sampled['favorite_count']
sampled['size'] = engagement.apply(lambda x: np.log(x*x + 1))

print(len(sampled))

Generate high-dimensional vectors from your text for later reduction and plotting. This can take a while.

In [None]:
from sklearn import preprocessing

print(sampled)

# Embed the preprocessed tweet text (column `text_proc`).
embeddings = get_embeddings(sampled['text_proc'], model_name="paraphrase-MiniLM-L6-v2")

# `data` is the matrix every later PCA/UMAP cell reads; it was previously never
# assigned, so those cells failed with a NameError.
data = np.array(embeddings)

# Optional alternative: normalise the vectors before reduction.
# data = preprocessing.normalize(np.array(embeddings))

print(sampled)

This cell runs through all combinations of the parameters n_pca, n_neighbors, k (for k-means), and min_dist for PCA/UMAP reduction and k-means clustering, and produces a plot for each. This is helpful for testing the effects of different values and tweaking them to find the best plot. Running too many combinations in one cell can cause the plots to lag in the notebook, so it is also set up to save the plot for each parameter set so you can inspect them separately.

In [None]:
km_type = "post"

# Set overwrite to True to always redo transformations. If False, will check for previously saved version of data
overwrite = True

for n_pca in [40]:
    for n_neighbors in [500]:
        for k in [7]:
            for min_dist in [0.4]:
                print(km_type, n_pca, n_neighbors, k, min_dist)
                
                # Cache file for the transformed data. It is written with
                # to_pickle despite the .html suffix; change this path to
                # wherever you want the cached transforms stored.
                tfm_fname = "../models/mpox_tfm_{}_{}_{}_{}_{}.html".format(km_type, n_pca, n_neighbors, k, min_dist)

                # check for saved version of transformed data; redo the transformation if it doesn't exist
                if os.path.isfile(tfm_fname) and not overwrite:
                    print("Previous file found")
                    tfm = pd.read_pickle(tfm_fname)
                else:
                    print("No file read - Computing new transformation:")
                    tfm = get_transform(data, method='both', n_pca=n_pca, n_neighbors=n_neighbors, min_dist=min_dist)
                    
                    # NOTE: this replaces the loop's k with the silhouette-optimal
                    # k, so the cache and plot filenames below use that value.
                    k, score = sil_range(tfm, test_vals=np.arange(5,20))
                    
                    print(k, score)
                    kmeans = KMeans(n_clusters=k, random_state=0).fit(tfm)
                    tfm = pd.DataFrame(tfm)
                    tfm['k_means_category'] = kmeans.labels_
                    
                    tfm_fname = "../models/mpox_tfm_{}_{}_{}_{}_{}.html".format(km_type, n_pca, n_neighbors, k, min_dist)
                    tfm.to_pickle(tfm_fname)

                # Assign by position: `tfm` has a fresh 0..n-1 index, so a plain
                # Series assignment would align on index labels and misplace
                # rows if `sampled` is indexed differently.
                sampled['y'] = tfm[1].to_numpy()
                sampled['x'] = tfm[0].to_numpy()
                sampled['k_means_category'] = tfm['k_means_category'].to_numpy()
                
                # `px_plot` takes the output path as `fpath`.
                px_plot(sampled, fpath="../results/mpox_{}_{}_{}_{}_{}.html".format(km_type, n_pca, n_neighbors, k, min_dist))

                


In [None]:
km_type = "post"
overwrite=False

for n_pca in [15]:
    for n_neighbors in [30]:
        for k in [25]:
            for min_dist in [0.1]:
                print(km_type, n_pca, n_neighbors, k, min_dist)
                
                # Cache file for the transformed data (written with to_pickle
                # despite the .html suffix).
                tfm_fname = "../models/mpox_tfm_{}_{}_{}_{}_{}.html".format(km_type, n_pca, n_neighbors, k, min_dist)

                # check for saved version of transformed data; create new if it doesn't exist
                if os.path.isfile(tfm_fname) and not overwrite:
                    print("Previous file found")
                    tfm = pd.read_pickle(tfm_fname)
                else:
                    print("No file read - Computing new transformation:")
                    tfm = get_transform(data, method='both', n_pca=n_pca, n_neighbors=n_neighbors, min_dist=min_dist)
                    
                    # k is taken from the loop here, not from a silhouette search.
                    kmeans = KMeans(n_clusters=k, random_state=0).fit(tfm)
                    tfm = pd.DataFrame(tfm)
                    tfm['k_means_category'] = kmeans.labels_
                    
                    tfm.to_pickle(tfm_fname)

                # Assign by position: `tfm` has a fresh 0..n-1 index, so a plain
                # Series assignment would align on index labels and misplace
                # rows if `sampled` is indexed differently.
                sampled['y'] = tfm[1].to_numpy()
                sampled['x'] = tfm[0].to_numpy()
                sampled['k_means_category'] = tfm['k_means_category'].to_numpy()
                
                # `px_plot` takes the output path as `fpath`.
                px_plot(sampled, save=True, fpath="../results/mpox_{}_{}_{}_{}_{}.html".format(km_type, n_pca, n_neighbors, k, min_dist))

                


More example runs:

In [None]:
n_pca = 15
n_neighbors = 30
min_dist = 0.5

# Number of k-means clusters; also used in the output filename below.
k = 7

# Clustering is done on the UMAP output in this cell.
km_type = "post"

print(n_pca, n_neighbors,  min_dist)

print("Initial data shape:", data.shape)

# apply PCA
fit = PCA(
    n_components=n_pca
)
pca = fit.fit_transform(data)

print("After PCA:", pca.shape)

# apply UMAP
# This must run before the clustering steps below: they previously ran first
# and therefore used whatever `tfm` an earlier cell had left behind.
tfm = umap.UMAP(
    n_neighbors=n_neighbors,
    min_dist=min_dist,
    n_components=2,
    metric='cosine'
).fit_transform(pca)

print("After UMAP:", tfm.shape)

# Show silhouette scores for a range of k on this embedding (diagnostic only;
# the clustering below uses the fixed k set at the top of the cell).
sil_range(tfm, test_vals=np.arange(5,20))

# apply Kmeans
kmeans = KMeans(n_clusters=k, random_state=0).fit(tfm)

sampled['y'] = tfm[:,1]
sampled['x'] = tfm[:,0]

sampled['k_means_category'] = kmeans.labels_


# `px_plot` takes the output path as `fpath`.
px_plot(sampled, save=True, fpath="../results/mpox_{}_{}_{}_{}_{}.html".format(km_type, n_pca, n_neighbors, k, min_dist))


In [None]:
n_pca = 40
n_neighbors = 30
min_dist = 0.1

print(n_pca, n_neighbors,  min_dist)

print("Initial data shape:", data.shape)

# apply PCA
fit = PCA(
    n_components=n_pca
)
pca = fit.fit_transform(data)

print("After PCA:", pca.shape)

# --- Clustering on the PCA output (pre-UMAP) ---
km_type = "pre"
k, score = sil_range(pca, test_vals=np.arange(5,20))


# apply Kmeans (pre-UMAP)
kmeans = KMeans(n_clusters=k, random_state=0).fit(pca)

# apply UMAP
tfm = umap.UMAP(
    n_neighbors=n_neighbors,
    min_dist=min_dist,
    n_components=2,
    metric='cosine'
).fit_transform(pca)

print("After UMAP:", tfm.shape)

sampled['y'] = tfm[:,1]
sampled['x'] = tfm[:,0]

sampled['k_means_category'] = kmeans.labels_

print("Pre-UMAP clustering:")
print("optimal k clusters:", k, score)


# `px_plot` takes the output path as `fpath`; the "pre"/"post" tag keeps the
# two plots of this cell from sharing a filename.
px_plot(sampled, fpath="../results/mpox_{}_{}_{}_{}_{}.html".format(km_type, n_pca, n_neighbors, k, min_dist))


# --- Clustering on the UMAP output (post-UMAP) ---
km_type = "post"
k, score = sil_range(tfm, test_vals=np.arange(5,20))

# apply Kmeans (post-UMAP)
kmeans2 = KMeans(n_clusters=k, random_state=0).fit(tfm)

sampled['k_means_category'] = kmeans2.labels_

print("Post-UMAP clustering:")
print("optimal k clusters:", k, score)


px_plot(sampled, fpath="../results/mpox_{}_{}_{}_{}_{}.html".format(km_type, n_pca, n_neighbors, k, min_dist))

# Leave `kmeans` pointing at the post-UMAP model. Refitting with the same data,
# k and random_state would reproduce `kmeans2`, so reuse it instead.
kmeans = kmeans2
