In [2]:
import pandas as pd
from collections import Counter
import plotly.express as px

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [8]:
# packages for clustering
import tensorflow as tf
import tensorflow_hub as hub
from sklearn.cluster import KMeans # pip install scikit-learn
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
from sklearn.manifold import TSNE

2024-03-29 18:37:42.015126: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [71]:
dir_path = "/users/carolinejung/tiktok-news-exposure-1/" # CHANGE ME!

### STEP 1: Clean data

In [37]:
hashtag_filter_out = ['fyp', 'foryourpage', 'wow', 'news', 'fypツ', 'fypシ', 'viral'
                     'fyi', 'fypシ゚viral', 'foryouuuu', 'trendingtiktok', 'longvideos',
                     'fypppp', 'themoreyouknow', 'didyouknow', 'foryoupage', 'foryou', 'tiktok',
                     'breakingnews', 'topnews', 'viraltiktok', 'fypシ゚viral', 'catchupnews', 'viralvideo']

In [66]:
users = ["26301", "33534", "38129", "48271", "69117", "83721"]

def get_hash_or_sugg(user, to_parse): # for one user
    """Grab either the hashtag or suggested words column and clean data."""
    data = pd.read_csv(dir_path + f"3-filter-metadata/news_relevant_videos_{user}.csv")

    if data.empty == False:
        nested_hashtags = []
        for hashtag_list_as_str in data[to_parse]:
            hashtag_list_as_list = hashtag_list_as_str.strip('][').split(", ")
            cleaned_hashtag = [hashtag.strip("'") for hashtag in hashtag_list_as_list]
            nested_hashtags.append(cleaned_hashtag)
        
        flattened_hashtags = [hashtag.strip(" ").lower() for hashlist in nested_hashtags for hashtag in hashlist]
        no_empty_hashtags = list(filter(None, flattened_hashtags))
        return no_empty_hashtags

In [70]:
def filtering_hashtag(hashtags, threshold):
    """ Threshold: Filters out words that do not appear more than the threshold (exclusive)."""
    # filter out common hashtags
    output = [hashtag for hashtag in hashtags if hashtag not in hashtag_filter_out]

    # filter by frequency
    count = Counter(output)
    freq_hash = []
    for hashtag in count.items():
        if hashtag[1] > threshold:
            freq_hash.append(hashtag[0])
    return list(set(freq_hash)) # get unique values

print(filtering_hashtag(get_hash_or_sugg("26301", "hashtags"),2))

['economics', 'supertuesday', 'oscars', 'awareness', 'latina', 'parati', 'foxnews', 'bodycam', 'storytime', 'election', 'mystery', 'relatable', 'meme', 'crimejunkie', 'missing', 'medical', 'funny', 'greenscreen', 'movietok', '2024', 'wendys', 'biden', 'nfl', 'truecrimetiktok', 'truecrimecommunity', 'college', 'scary', 'stitch', 'california', 'politicaltiktok', 'law', 'history', 'republican', 'film', 'creepy', 'dark', 'truecrime', 'leftist', 'commentary', 'president', 'scotus', 'nyc', 'education', 'cnn', 'disturbing', 'football', 'usa', 'historytok', 'america', 'nyt', 'criminalminds', 'michigan', 'inflation', 'police', 'surgepricing', 'trending', 'democrat', 'millennials', 'language', 'trump', 'newyork', 'cops', '2024election', 'conservative', 'filmtok', 'learnontiktok', 'donaldtrump', 'bilingual', 'greenscreenvideo', 'liberal', 'finance', 'spanish', 'election2024', 'criminal', 'interpreter', 'florida', 'crime', 'academyawards', 'viral', 'supremecourt', 'politics', 'accessibility', 'tre

### STEP 2: Clustering

In [77]:
def get_embeddings(to_parse, threshold): #for all users
    """Parameter to_parse should either take a value of 'hashtags' or 'suggested_words'."""
    embedding = {}
    for user in users:
        if get_hash_or_sugg(user, to_parse) != None: # if empty file, don't get embeddings
            hash_or_sugg = filtering_hashtag(get_hash_or_sugg(user, to_parse), threshold)
            embedding[user] = embed(hash_or_sugg)
    return embedding
#get_embeddings("hashtags")["26301"]

In [83]:
def create_clusters(user, to_parse, k, threshold):
    kmeans = KMeans(n_clusters=k, random_state=42)
    clusters = kmeans.fit_predict(get_embeddings(to_parse, threshold)[user])
    final_clusters = {}
    for i in range(k):
        data = filtering_hashtag(get_hash_or_sugg(user, to_parse), threshold)
        cluster_words = [data[j] for j in range(len(data)) if clusters[j]==i]
        final_clusters[f"cluster{i}"] = cluster_words
    return final_clusters
#create_clusters("26301", "hashtags", 20, 1)["cluster1"] # to see the individual words

### STEP 3: Compare K-Means Clustering to Graphical Representation of Words

In [84]:
def calculate_tsne(user, to_parse, k, threshold):
    tsne = TSNE(n_components=2, random_state=42)  
    tsne_results = tsne.fit_transform(get_embeddings(to_parse, threshold)[user])
 
    embed_tsne = pd.DataFrame(tsne_results, columns=['tsne_1', 'tsne_2'])
    embed_tsne[to_parse] = filtering_hashtag(get_hash_or_sugg(user, to_parse), threshold)

    # restructure storage of cluster assignments
    cluster_dict = create_clusters(user, to_parse, k, threshold)
    cluster_assignment = {}
    for hashtag in embed_tsne[to_parse]:
        for cluster in cluster_dict.items():
            if hashtag in cluster[1]:
                cluster_assignment[hashtag] = cluster[0]
                
    # assign hashtags to their respective cluster groups
    embed_tsne[f"cluster_{to_parse}"] = "" 
    for row in range(embed_tsne.shape[0]):
        hashtag = embed_tsne[to_parse][row]
        embed_tsne.loc[row, f'cluster_{to_parse}'] = cluster_assignment[hashtag]
    return embed_tsne

calculate_tsne("26301", "hashtags", k=20, threshold=2)
#calculate_tsne("26301", "suggested_words", k=20, threshold=2)

Unnamed: 0,tsne_1,tsne_2,hashtags,cluster_hashtags
0,0.701737,-1.497256,economics,cluster14
1,-5.612346,5.950376,supertuesday,cluster3
2,0.281676,5.414684,oscars,cluster10
3,-1.244491,2.120918,awareness,cluster6
4,4.848599,5.305988,latina,cluster15
...,...,...,...,...
87,5.663321,0.653138,new,cluster1
88,-1.350715,-2.912508,twitter,cluster7
89,5.450714,3.254547,translate,cluster9
90,-5.684354,6.629288,truecrimetok,cluster3


In [85]:
def visualize_tsne(user, to_parse, k, threshold):
    data = calculate_tsne(user, to_parse, k, threshold)
    fig = px.scatter(data, x='tsne_1', y='tsne_2', text=to_parse, color="cluster_hashtags")
    fig.update_traces(textposition='top center', mode='markers+text', textfont=dict(size=8))
    fig.update_layout(title=f'User {user}: Embeddings of TikTok News {to_parse.capitalize()}',
                      width=1100,  # Set the width of the figure
                      height=700)  # Set the height of the figure
                    # slightly decrease to see it in one screen
    fig.show()

## Analysis 1: Hashtags for all Users

In [89]:
visualize_tsne("26301", "hashtags", 10, 2)

In [90]:
visualize_tsne("33534", "hashtags", 10,1)

In [92]:
visualize_tsne("48271", "hashtags", 10, 0) # hashtags only have frequency of 1

In [104]:
visualize_tsne("69117", "hashtags", 10, 1)

In [103]:
visualize_tsne("83721", "hashtags", 8, 2)

## Analysis 2: Suggested words for all Users