**To use the hashformers package, the runtime must be a GPU**

In [1]:
!pip install hashformers



In [2]:
from hashformers import TransformerWordSegmenter as WordSegmenter
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import tensorflow as tf
import tensorflow_hub as hub
import plotly.express as px
import json
import re
import random

In [3]:
ws = WordSegmenter(
    segmenter_model_name_or_path="distilgpt2",
    segmenter_model_type="incremental",
    reranker_model_name_or_path="distilbert-base-uncased",
    reranker_model_type="masked",
    segmenter_gpu_batch_size=1,
    reranker_gpu_batch_size=2000
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
# Mount Drive
from google.colab import drive
from google.colab import files
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [45]:
df = pd.read_csv('drive/MyDrive/results.csv')

In [None]:
df = pd.concat([df, df2])

In [11]:
df = df[df['video_locationcreated'] == 'US']

In [12]:
df.head()

Unnamed: 0,video_id,video_timestamp,video_duration,video_locationcreated,suggested_words,video_diggcount,video_sharecount,video_commentcount,video_playcount,video_description,video_is_ad,video_stickers,author_username,author_name,author_followercount,author_followingcount,author_heartcount,author_videocount,author_diggcount,author_verified
0,7315561816673750318,2023-12-22T18:16:20,18.0,US,"flipnote, flipnote animation, asdf movie, flip...",3500000.0,39500.0,10800.0,18700000.0,Bonus flipnote for you,False,,zanelittlemusic,Zane Little,,,,,,False
1,7315561816673750318,2023-12-22T18:16:20,18.0,US,"flipnote, flipnote animation, asdf movie, flip...",3500000.0,39500.0,10800.0,18700000.0,Bonus flipnote for you,False,,zanelittlemusic,Zane Little,,,,,,False
2,7315561816673750318,2023-12-22T18:16:20,18.0,US,"flipnote, flipnote animation, asdf movie, flip...",3500000.0,39500.0,10800.0,18700000.0,Bonus flipnote for you,False,,zanelittlemusic,Zane Little,,,,,,False
3,7315561816673750318,2023-12-22T18:16:20,18.0,US,"flipnote, flipnote animation, asdf movie, flip...",3500000.0,39500.0,10800.0,18700000.0,Bonus flipnote for you,False,,zanelittlemusic,Zane Little,,,,,,False
4,7315561816673750318,2023-12-22T18:16:20,18.0,US,"flipnote, flipnote animation, asdf movie, flip...",3500000.0,39500.0,10800.0,18700000.0,Bonus flipnote for you,False,,zanelittlemusic,Zane Little,,,,,,False


In [13]:
import random
random.seed(42)

In [14]:
def get_hashtags(my_string):
  if isinstance(my_string, str):
    return re.findall(r'#(\S+)', my_string.lower())
  return []


In [15]:
def get_segments(my_string):
  if my_string == []:
    return []
  if isinstance(my_string, str):
    my_string = [my_string]
  return ws.segment(my_string)

In [16]:
df['hashtags'] = df.video_description.apply(get_hashtags)

In [17]:
myDF = df[["video_id", "author_username", "video_description",
                "hashtags", "suggested_words"]].copy()

In [18]:
myDF.shape

(5411, 5)

In [19]:
mask = myDF['hashtags'].apply(lambda x: len(x) > 0)  # Check if each list is not empty
filt_dta = myDF[mask]

In [20]:
filt_dta.shape

(4391, 5)

In [21]:
all_hash = filt_dta['hashtags'].tolist()
result = []
trash = [result.extend(el) for el in all_hash]

In [None]:
result = list(set(result))
print(len(result))

17568


In [23]:
hashed = []
for word in result:
  if word == "":
    continue
  word2 = word.split('#')
  if len(word2) <= 1:
    word2 = word2[0]
  else:
    word2 = ' '.join(word2)
  try:
    if word2[0] != '#':
      word2 = '#' + word
    hashed.append(word2)
  except:
    print(f"Something went wrong with {word}")
    continue


In [None]:
len(result)

17568

In [22]:
my_sample = random.sample(result, 250)

In [24]:
segmentations = ws.segment(my_sample, steps=10, use_reranker=False)

In [25]:
segmentations

['avatar the last airbender movie',
 'stitch',
 'viral',
 'fyp',
 'broskination',
 'santagato studios',
 'poetry',
 '40s',
 'vienna travel',
 'charlidamelio',
 'green screen',
 'fyp',
 'mechanical engineering',
 'weddingday',
 'yiruma',
 'ivf',
 'cap cut template',
 'reddit',
 'patrick jadams',
 'native tiktok',
 'talia is not sleepy',
 'green screen',
 'royalnews',
 'anime edit',
 'pa',
 'starwars',
 'date',
 'fyp',
 'con law',
 'jamescameron',
 'green screen',
 'stem',
 'cs',
 'fyp',
 'wendy',
 'fed is best',
 'queer wedding',
 'dress',
 'first gen',
 'for you',
 'first gen',
 'movie',
 'weeb',
 'for you',
 'bedroom decor',
 'ni_ki',
 'french',
 'walker',
 'musec reacts',
 'reddit',
 'funny',
 'spoilers',
 'urban design',
 'mit student',
 'reddit',
 'for you page',
 'engineering student',
 'guts tour',
 'dune 2',
 'an aerobic fermentation',
 'fy',
 'royal drama',
 'ambani wedding',
 'fyp',
 'hermanas',
 'imogenkeeper',
 'netflix recommendation',
 'newgirlschmidt',
 'medical school',


In [26]:
segs = []
for word in segmentations:
  if "#" in word:
    word = word.split('#')
    if len(word) > 1:
      segs.extend(word)
  else:
    segs.append(word)


In [27]:
segs = segs + ['palestine', 'wsj', 'artists 4 ceasefire','boycott','legend', 'icon','broken system', 'farsi', 'egyptian',
               'paki', 'pakistani', 'hijabi problems', 'indian tok','airforce', 'marines', 'slavic', 'tibetan']
segs = segs + ['slavic', 'tibetan']

In [35]:
segs = list(set(segs))

In [28]:
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [36]:
embeddings=embed(segs)

In [None]:
len(embeddings)

525

In [39]:
k = 25
kmeans = KMeans(n_clusters=k, random_state=42)
clusters = kmeans.fit_predict(embeddings)





In [40]:
all_clusters = []
for i in range(25):
  print(f"Cluser {i}")
  cluster_words = {segs[j] for j in range(len(segs)) if clusters[j] == i}
  all_clusters.append(cluster_words)
  print(cluster_words)

Cluser 0
{'web weaving slideshow', 'film', 'movie', 'cap cut', 'bedroom decor', 'cap cut template', 'green screen'}
Cluser 1
{'dating', 'human resources life', 'celebrity blind items', 'childfree women', 'anime edit', 'polite society', 'single dad life', 'girls under 25', 'date', 'relationship'}
Cluser 2
{'poemascortos', 'bobmenendez', 'shapershifter', 'broskination', 'ockyway', 'kyliejenner', 'nailsart', 'newgirlschmidt', 'dogcute', 'fyp', 'imogenkeeper', 'biglaw', 'blacktiktok', 'jamescameron', 'freechani', 'annieelise', 'womeninstem', 'thebarkleys', 'royalnews', 'candycrush'}
Cluser 3
{'slavic', 'weeb', 'pakistani', 'srilankan', 'paki'}
Cluser 4
{'stpattysday', 'zerocon24', 'me xico magico', 'walker', 'savings tips', 'mother of junk', 'patrick jadams', 'jogo', 'artists 4 ceasefire'}
Cluser 5
{'zacefron', 'dreamcosplayminecraft', 'memecut', 'quittok', 'hermanas', '40s', 'charlidamelio', 'anantambani', 'cs', 'wsj', 'rholsc', 'refybeauty', 'devinbooker kendalljenner', 'covergirlpartner

In [41]:
tsne = TSNE(n_components=2, random_state=42)
tsne_results = tsne.fit_transform(embeddings)

# Step 3: Create a pandas dataframe
df = pd.DataFrame(tsne_results,
                  columns=['tsne_1', 'tsne_2'])
df['hashtag'] = segs

# Step 4: Use plotly to visualize it
fig = px.scatter(df, x='tsne_1', y='tsne_2', text='hashtag')
fig.update_traces(textposition='top center', mode='markers+text', textfont=dict(size=6))
fig.update_layout(title='Embeddings of TikTok News Hashtags')


In [44]:
df.shape

(233, 3)

In [None]:
tsne = TSNE(n_components=2, random_state=42)
tsne_results = tsne.fit_transform(embeddings)

# Step 3: Create a pandas dataframe
df = pd.DataFrame(tsne_results,
                  columns=['tsne_1', 'tsne_2'])
df['hashtag'] = segmentations

# Step 4: Use plotly to visualize it
fig = px.scatter(df, x='tsne_1', y='tsne_2', text='hashtag')
fig.update_traces(textposition='top center', mode='markers+text', textfont=dict(size=6))
fig.update_layout(title='Embeddings of TikTok News Hashtags')


In [None]:
myDF = df[["video_id", "author_username", "video_description",
                "hashtags", "suggested_words"]].copy()


In [None]:
news = pd.read_csv('news.csv')

In [None]:
news.head()

Unnamed: 0,Hashtag name,Link,Initials
0,uspolitics,https://www.tiktok.com/tag/uspolitics?lang=en,EM
1,republicans,https://www.tiktok.com/tag/republicans?lang=en,EM
2,republican,https://www.tiktok.com/tag/republican?lang=en,EM
3,democrat,https://www.tiktok.com/tag/democrat?lang=en,EM
4,democrats,https://www.tiktok.com/tag/democrats?lang=en,EM


In [None]:
news_hash = news['Hashtag name'].to_list()

In [None]:
set(news_hash).intersection(set(segs))

{'blacklivesmatter',
 'palestine',
 'sports',
 'unemployment',
 'vote',
 'worldnews'}

In [None]:
headlines = pd.read_csv('output.csv')

In [None]:
headlines.head()

Unnamed: 0.1,Unnamed: 0,0
0,0,Sheriff Recorded Making Inflammatory Comments ...
1,1,Governor Vetoes Louisiana’s Ban on Transition ...
2,2,Robert Kennedy Jr. Reports Income of $7.8 Million
3,3,Coast Guard Apologizes for Covering Up Long Hi...
4,4,Certain Surprise Party?


In [None]:
headline = headlines.iloc[:,1].to_list()

In [None]:
headline = random.sample(headline, 25)

In [None]:
len(headline)

29780

In [None]:
em_all = embed(headline + segs[-300:])

In [None]:
len(em_all)

625

In [None]:
df

Unnamed: 0,tsne_1,tsne_2
0,-9.999284,-33.425861
1,-0.397730,-28.652657
2,-9.998209,-21.816757
3,-7.527874,-38.671719
4,-5.265017,-35.932163
...,...,...
620,-0.187387,-7.200427
621,-3.645554,0.518941
622,-3.913756,1.374411
623,12.512190,27.008213


In [None]:
tsne = TSNE(n_components=2, random_state=42)
tsne_results = tsne.fit_transform(em_all)

# Step 3: Create a pandas dataframe
df = pd.DataFrame(tsne_results,
                  columns=['tsne_1', 'tsne_2'])
df['hashtag'] = headline + segs[-300:]

# Step 4: Use plotly to visualize it
fig = px.scatter(df, x='tsne_1', y='tsne_2', text='hashtag')
fig.update_traces(textposition='top center', mode='markers+text', textfont=dict(size=6))
fig.update_layout(title='Embeddings of TikTok News Hashtags and Headlines')