Clustering the hashtags of videos found in the metadata

In [10]:
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


### STEP 1: Clean data

In [78]:
# Root folder of the project; CSV paths are built by appending to this string,
# so the trailing slash matters.
dir_path = "/users/carolinejung/tiktok-news-exposure-1/"  # CHANGE ME!

# User IDs; each one selects a `news_relevant_videos_{user}.csv` file.
users = [
    "26301",
    "33534",
    "38129",
    "48271",
    "69117",
    "83721",
]

# for one user:
def get_hash_or_sugg(user, to_parse, base_dir=None):
    """Return one user's hashtags or suggested words as a flat, cleaned list.

    Reads ``3-filter-metadata/news_relevant_videos_{user}.csv`` and parses the
    ``to_parse`` column, whose cells hold stringified lists such as
    ``"['news', 'fyp']"``. Each cell is split on ``", "`` (so an item that
    itself contains ``", "`` is split too), then every item is stripped of
    surrounding quotes and spaces and lower-cased. Empty items are dropped.

    Parameters
    ----------
    user : str
        User ID used to build the CSV file name.
    to_parse : str
        Name of the column to parse (the notebook uses ``"hashtags"`` and
        ``"suggested_words"``).
    base_dir : str, optional
        Folder containing ``3-filter-metadata``. Defaults to the module-level
        ``dir_path``.

    Returns
    -------
    list of str or None
        The cleaned items in file order, or ``None`` when the CSV has no rows.
        Callers test for ``None`` to skip users without data.
    """
    import os  # function-scope import: keeps this cell self-contained

    root = dir_path if base_dir is None else base_dir
    # os.path.join tolerates a missing trailing slash on the root folder.
    csv_path = os.path.join(
        root, "3-filter-metadata", f"news_relevant_videos_{user}.csv"
    )
    data = pd.read_csv(csv_path)

    if data.empty:
        return None

    cleaned_items = []
    for cell in data[to_parse]:
        # read_csv yields NaN (a float) for blank cells; there is nothing to parse.
        if not isinstance(cell, str):
            continue
        for raw_item in cell.strip("][").split(", "):
            # Strip both quote styles: repr() switches to double quotes when the
            # item contains an apostrophe.
            item = raw_item.strip("'\"").strip(" ").lower()
            if item:
                cleaned_items.append(item)
    return cleaned_items
    

# Spot-check the cleaner on two users. Only the value of the last expression
# is rendered as the cell output.
get_hash_or_sugg(user="26301", to_parse="hashtags")
# get_hash_or_sugg(user="26301", to_parse="suggested_words")
get_hash_or_sugg(user="38129", to_parse="hashtags")

# To run over every user instead:
# for user in users:
#     get_hash_or_sugg(user, "hashtags")

### STEP 2: Clustering

In [56]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import json
# Load the Universal Sentence Encoder (v4) from TF Hub. `embed` is later called
# on a list of strings to obtain their embeddings.
embed = hub.load(
    "https://tfhub.dev/google/universal-sentence-encoder/4"
)

In [106]:
# Get embeddings: one tensor of hashtag embeddings per user, keyed by user ID.
embed_hash = {}
for user in users:
    # Parse the user's CSV once and reuse the result for both the check and
    # the embedding call.
    user_hashtags = get_hash_or_sugg(user, "hashtags")
    # None means the file had no rows; an empty list means no usable hashtags.
    # Either way there is nothing to embed, so the user gets no entry.
    if not user_hashtags:
        continue
    embed_hash[user] = embed(user_hashtags)
embed_hash["26301"]

<tf.Tensor: shape=(1700, 512), dtype=float32, numpy=
array([[ 0.00337258,  0.03773119, -0.00500557, ..., -0.03416413,
         0.07234713,  0.01763893],
       [-0.01852792,  0.03192428,  0.01496101, ..., -0.01370201,
         0.0625033 , -0.06378186],
       [-0.06867842,  0.02728777,  0.05366998, ...,  0.03335827,
         0.00753285, -0.02771246],
       ...,
       [-0.00011565, -0.00420487,  0.05659379, ..., -0.01594581,
         0.02513054, -0.03087805],
       [-0.0195898 ,  0.06461032,  0.0443362 , ..., -0.02980971,
         0.07319511, -0.03591765],
       [-0.01288995,  0.01143374,  0.06181577, ..., -0.06507827,
         0.07534911, -0.04261151]], dtype=float32)>

In [84]:
# cluster embeddings
from sklearn.cluster import KMeans # pip install scikit-learn

In [None]:
def create_clusters(user, to_parse, k=10):
    """Group one user's words into ``k`` KMeans clusters of their embeddings.

    The cluster labels come from ``embed_hash[user]``, which the notebook
    builds from the ``"hashtags"`` column. The words come from
    ``get_hash_or_sugg(user, to_parse)``. The two must have the same length
    and order for the labels to describe the words.

    Parameters
    ----------
    user : str
        User ID; must be a key of ``embed_hash``.
    to_parse : str
        Column whose words are assigned to the clusters.
    k : int, default 10
        Number of clusters.

    Returns
    -------
    dict
        Maps ``"cluster0"`` ... ``"cluster{k-1}"`` to the list of words given
        that label, in their original order.

    Raises
    ------
    KeyError
        If ``embed_hash`` has no entry for ``user``.
    ValueError
        If the words for ``to_parse`` do not line up one-to-one with the
        embeddings.
    """
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(embed_hash[user])

    # Read and parse the CSV once instead of once per cluster.
    words = get_hash_or_sugg(user, to_parse)
    if words is None or len(words) != len(labels):
        found = 0 if words is None else len(words)
        raise ValueError(
            f"{found} '{to_parse}' entries for user {user} do not match "
            f"the {len(labels)} embeddings in embed_hash"
        )

    # Pre-create every key so clusters that receive no word still appear.
    final_clusters = {f"cluster{i}": [] for i in range(k)}
    for word, label in zip(words, labels):
        final_clusters[f"cluster{label}"].append(word)
    return final_clusters

create_clusters("26301", "hashtags", 5)["cluster1"]

In [109]:
# Project user 26301's hashtag embeddings down to two dimensions with t-SNE.
from sklearn.manifold import TSNE

tsne = TSNE(n_components=2, random_state=42)
tsne_results = tsne.fit_transform(embed_hash["26301"])

# One row per hashtag: its 2-D coordinates plus the hashtag text as a label.
embed_hash_tsne = pd.DataFrame(
    tsne_results,
    columns=["tsne_1", "tsne_2"],
).assign(hashtag=get_hash_or_sugg("26301", "hashtags"))

In [110]:
# Scatter plot of the 2-D t-SNE coordinates, each point labelled with its hashtag.
import plotly.express as px

fig = px.scatter(
    embed_hash_tsne,
    x="tsne_1",
    y="tsne_2",
    text="hashtag",
)
fig.update_traces(
    mode="markers+text",
    textposition="top center",
    textfont={"size": 6},
)
# Last expression of the cell: its return value is what gets rendered.
fig.update_layout(title="Embeddings of TikTok News Hashtags")