In [1]:
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


### STEP 1: Clean data

In [2]:
hashtag_filter_out = ['fyp', 'foryourpage', 'wow', 'news', 'fypツ', 'fypシ', 
                     'fyi', 'fypシ゚viral', 'foryouuuu', 'trendingtiktok', 'longvideos',
                     'fypppp', 'themoreyouknow', 'didyouknow', 'foryoupage', 'foryou', 'tiktok',
                     'breakingnews', 'topnews', 'viraltiktok', 'fypシ゚viral', 'catchupnews', 'viralvideo',
                     ]

In [3]:
def filtering_hashtag(hashtags):
    """
    filter the hashtags
    """
    # filter common hashtags
    output = list(set(hashtags)) # unique values - do this later
    output = [hashtag for hashtag in output if hashtag not in hashtag_filter_out]
    return output
# sandy version

In [4]:
dir_path = "/users/carolinejung/tiktok-news-exposure-1/" # CHANGE ME!
users = ["26301", "33534", "38129", "48271", "69117", "83721"]

# for one user:
def get_hash_or_sugg(user, to_parse):
    """Grab either the hashtag or suggested words column and clean data."""
    data = pd.read_csv(dir_path + f"3-filter-metadata/news_relevant_videos_{user}.csv")

    if data.empty == False:
        nested_hashtags = []
        for hashtag_list_as_str in data[to_parse]:
            hashtag_list_as_list = hashtag_list_as_str.strip('][').split(", ")
            cleaned_hashtag = [hashtag.strip("'") for hashtag in hashtag_list_as_list]
            nested_hashtags.append(cleaned_hashtag)
        
        flattened_hashtags = [hashtag.strip(" ").lower() for hashlist in nested_hashtags for hashtag in hashlist]
        no_empty_hashtags = list(filter(None, flattened_hashtags))
        #no_empty_hashtags = filtering_hashtag(no_empty_hashtags)
        return no_empty_hashtags

get_hash_or_sugg("26301", "hashtags")
#get_hash_or_sugg("26301", "suggested_words")
get_hash_or_sugg("38129", "hashtags")

In [5]:
from collections import Counter

def filtering_hashtag(hashtags):
    """
    filter the hashtags
    """
    # filter out common hashtags
    output = [hashtag for hashtag in hashtags if hashtag not in hashtag_filter_out]

    # filter by frequency
    # count how many times a word appears
    count = Counter(output)
    freq_hash = []
    for hashtag in count.items():
        # hashtag format is a tuple: (key, value)
        if hashtag[1] > 1:
            freq_hash.append(hashtag[0])

    # then, only get unique values
    return list(set(freq_hash))

print(filtering_hashtag(get_hash_or_sugg("26301", "hashtags")))



In [6]:
hashtag_26301 = filtering_hashtag(get_hash_or_sugg("26301", "hashtags"))
len(hashtag_26301)

222

### STEP 2: Clustering

In [7]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
from sklearn.cluster import KMeans # pip install scikit-learn
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

2024-03-29 18:32:02.578387: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
def get_embeddings(to_parse): #for all users
    """Parameter to_parse should either take a value of 'hashtags' or 'suggested_words'."""
    embedding = {}
    for user in users:
        if get_hash_or_sugg(user, to_parse) != None: # if empty file, don't get embeddings
            hash_or_sugg = filtering_hashtag(get_hash_or_sugg(user, to_parse))
            embedding[user] = embed(hash_or_sugg)
    return embedding
    
get_embeddings("hashtags")["26301"]

<tf.Tensor: shape=(222, 512), dtype=float32, numpy=
array([[ 0.02722869, -0.04502684, -0.00341357, ..., -0.06166474,
         0.06512767, -0.0567526 ],
       [-0.0171115 ,  0.07519299,  0.01799584, ..., -0.03785214,
         0.03043216, -0.02350079],
       [-0.02651086,  0.03063033,  0.0091166 , ..., -0.02033245,
         0.02707102, -0.04549546],
       ...,
       [-0.05089248,  0.08197766,  0.00808257, ...,  0.01064078,
        -0.03839003,  0.02104974],
       [-0.00062161,  0.001374  ,  0.06505843, ..., -0.05724625,
        -0.02909397, -0.01648107],
       [ 0.03201279,  0.03623682,  0.07640833, ...,  0.05101123,
         0.00593138, -0.02307279]], dtype=float32)>

In [9]:
def create_clusters(user, to_parse, k=20):
    kmeans = KMeans(n_clusters=k, random_state=42)
    clusters = kmeans.fit_predict(get_embeddings(to_parse)[user])
    
    final_clusters = {}
    for i in range(k):
        data = filtering_hashtag(get_hash_or_sugg(user, to_parse))
        cluster_words = [data[j] for j in range(len(data)) if clusters[j]==i]
        final_clusters[f"cluster{i}"] = cluster_words
    return final_clusters

create_clusters("26301", "hashtags", 20)["cluster1"] # to see the individual words

['medicalterminology',
 'chineseadoption',
 'floridacheck',
 'nikkihaley',
 'ivf',
 'todayshow']

In [10]:
create_clusters("26301", "hashtags", 20)["cluster2"] 

['lawtok',
 'lawschool',
 'gaza',
 'fy',
 'royalnews',
 'orlandoflorida',
 'lawyertiktok',
 'missing',
 'madelinesoto',
 'longervideos',
 'reproductiverights',
 'americantranslatorsassociation',
 'dog',
 'copsoftiktok',
 'election',
 'filmtok',
 'zerocon24',
 'disabilitytiktok',
 'reachclimbgo',
 'onechildpolicy',
 'unitednations']

In [11]:
create_clusters("26301", "hashtags", 20)["cluster3"]

['willandkate',
 'scotus',
 'movietok',
 'inflation',
 'cheese',
 'stitch',
 'royalupdate',
 'lgbtq',
 'viral',
 'stem',
 'accessibility',
 'mobility',
 'oppenheimer',
 'disability']

In [12]:
create_clusters("26301", "hashtags", 20)

{'cluster0': ['latina', 'latino', 'brie'],
 'cluster1': ['medicalterminology',
  'chineseadoption',
  'floridacheck',
  'nikkihaley',
  'ivf',
  'todayshow'],
 'cluster2': ['lawtok',
  'lawschool',
  'gaza',
  'fy',
  'royalnews',
  'orlandoflorida',
  'lawyertiktok',
  'missing',
  'madelinesoto',
  'longervideos',
  'reproductiverights',
  'americantranslatorsassociation',
  'dog',
  'copsoftiktok',
  'election',
  'filmtok',
  'zerocon24',
  'disabilitytiktok',
  'reachclimbgo',
  'onechildpolicy',
  'unitednations'],
 'cluster3': ['willandkate',
  'scotus',
  'movietok',
  'inflation',
  'cheese',
  'stitch',
  'royalupdate',
  'lgbtq',
  'viral',
  'stem',
  'accessibility',
  'mobility',
  'oppenheimer',
  'disability'],
 'cluster4': ['film', 'oscars', 'movie', 'greenscreen', 'netflix', 'comedy'],
 'cluster5': ['boston',
  'florida',
  'palestine',
  'america',
  'american',
  'california',
  'michigan',
  'alabama',
  'south',
  'texas',
  'nyc',
  'southern',
  'eu',
  'india',

### STEP 3: Compare K-Means Clustering to Graphical Representation of Words

In [14]:
from sklearn.manifold import TSNE

def calculate_tsne(user, to_parse, k):
    tsne = TSNE(n_components=2, random_state=42)  
    tsne_results = tsne.fit_transform(get_embeddings(to_parse)[user])
    #return tsne_results
    embed_tsne = pd.DataFrame(tsne_results, columns=['tsne_1', 'tsne_2'])
    embed_tsne[to_parse] = filtering_hashtag(get_hash_or_sugg(user, to_parse))

    # edit
    # add in a column for which cluster it is assigned to
    cluster_dict = create_clusters(user, to_parse, k)
    exit()

    embed_tsne["cluster"] = "" #--> rewrite this

    # list which hashtags are in which cluster first, then order
    #output = [hashtag for hashtag in hashtags if hashtag not in hashtag_filter_out]
    
    cluster_assignment = {} # make this a dictionary instead
    for hashtag in embed_tsne["hashtags"]:
        for cluster in cluster_dict.items():
            if hashtag in cluster[1]:
                cluster_assignment[hashtag] = cluster[0]
    #return cluster_assignment
    
    # replace values in the 'age' column based on condition
    #df.loc[df['age'] >= 50, 'age'] = 50 ## age greater than 50
                
    # for assignment in cluster_assignment:
    #     # for each (hashtag, cluster) pair
    #     embed_tsne.loc[embed_tsne["cluster"] != assignment[1], 'hashtags'] = assignment[1]
    
    # need a loop?
    # for row in
    # #hashtag in embed_tsne["hashtags"]:
    #     print(cluster_assignment[hashtag])
    #     embed_tsne["cluster"]
    #embed_tsne.loc[embed_tsne["cluster"] != cluster_assignment[hashtag], "cluster"] = cluster_assignment[hashtag]
   
    for row in embed_tsne:
        hashtag = embed_tsne["hashtags"][row]
        embed_tsne.iloc[row]["cluster"] = cluster_assignment[hashtag]


    #     embed_tsne.iloc[row["cluster"] != cluster_assignment[row["hashtags"]]] = cluster_assignment[row["hashtags"]]
       

  
    # for row in range(embed_tsne.shape[0]):
    #     hashtag = embed_tsne.iloc[row]["hashtags"]
    #     for cluster in cluster_dict.items():
    #         if hashtag in cluster[1]:
    #             #embed_tsne.iloc[row]["cluster"] = cluster[0]
    #             #embed_tsne.loc[embed_tsne[row], "cluster"] = cluster[0]

    return embed_tsne

calculate_tsne("26301", "hashtags", 20) # 20 clusters

KeyError: 'tsne_1'

: 

In [165]:
create_clusters("26301", "hashtags", 20)["cluster15"]

['kissimmee_florida',
 'princewilliam',
 'orangecounty',
 'bodycam',
 'kissimmeeflorida',
 'costofliving',
 'royalnews',
 'crimejunkie',
 'supertuesday',
 'policeofficer',
 'movietok',
 'chineseadoptee',
 'supremecourt',
 'abcnews',
 'willandkate',
 'fy',
 'parati',
 'medicalspanish',
 'dailyshow',
 'mvsterious',
 'voteblue',
 'lawstudent']

In [139]:
import plotly.express as px

def visualize_tsne(user, to_parse):
    data = calculate_tsne(user, to_parse)
    fig = px.scatter(data, x='tsne_1', y='tsne_2', text=to_parse)
    fig.update_traces(textposition='top center', mode='markers+text', textfont=dict(size=6))
    # Update layout with larger width and height
    fig.update_layout(title=f'User {user}: Embeddings of TikTok News {to_parse.capitalize()}',
                      width=1200,  # Set the width of the figure
                      height=800)  # Set the height of the figure
    fig.show()

## Analysis 1: Hashtags for all Users

In [140]:
visualize_tsne("26301", "hashtags")

In [136]:
visualize_tsne("33534", "hashtags")

## Analysis 2: Suggested words for all Users