In [None]:
import torch
import pandas as pd
# from torch.utils.data import DataLoader, SequentialSampler
from sklearn.preprocessing import MaxAbsScaler
import numpy as np
from sklearn.cluster import KMeans
from collections import Counter
from itertools import chain
from tqdm.auto import tqdm

from dataset import TwitterDataset
from autoencoders import LinearAutoEncoder

In [44]:
f"1_{pd.Timedelta('30m')}"

'1_0 days 00:30:00'

In [1]:
dset = TwitterDataset(
    'timestamp',
    'sentiment/vader/',
    # whole_text_path = 'whole_text',
    token_path = 'tokens/text',
    embedding_path = 'embeddings/all-MiniLM-L6-v2/',
)

loading from timestamp:   0%|          | 0/221 [00:00<?, ?it/s]

loading tokens...:   0%|          | 0/221 [00:00<?, ?it/s]

loading embeddings..:   0%|          | 0/221 [00:00<?, ?it/s]

loading sentiment..:   0%|          | 0/221 [00:00<?, ?it/s]

loaded dataset. took 193.87294996343553 ms


In [2]:
# torch.save(model.state_dict(),)
import torch
from autoencoders import LinearAutoEncoder

model = LinearAutoEncoder()
model.load_state_dict(torch.load('models/LinearAutoEncoder.pkl'))
model = model.cuda()

In [31]:
slice_size = '30m'
import pandas as pd
dset_start = pd.to_datetime(dset.timestamp[dset.sorted_idx][0])
dset_end = pd.to_datetime(dset.timestamp[dset.sorted_idx][-1])
slice_size = pd.Timedelta(slice_size)
time_slices = [
    [dset_start + slice_size * i, dset_start + slice_size * (i+1)]
    for i in range(int(np.ceil((dset_end - dset_start)/slice_size))-1)
]


In [40]:
n = 10
slice_beg,slice_end = time_slices[0]
beg,end = dset.get_range(slice_beg,slice_end)
for p in model.parameters():
    p.requires_grad = False


original_idxs = dset[beg:end]['original_index']
original_embs = dset[beg:end]['embedding'].cuda()
reduced_embs = model.encoder(original_embs).cpu().numpy()
scaled_embs = MaxAbsScaler().fit_transform(reduced_embs)
# del original_dims

kmeans = KMeans(n_clusters=5).fit(scaled_embs)


top_N_idxs = []
word_counts = []
for label in set(kmeans.labels_):
    centroid = kmeans.cluster_centers_[label]
    cluster = scaled_embs[kmeans.labels_ == label]
    top_N_closest = np.argsort(np.sqrt(((cluster - centroid)**2).sum(1)))[:n]
    word_count = sorted(
        Counter(
            chain(
                *[
                    dset.tokens[i] 
                    for i in original_idxs[kmeans.labels_ == label]
                ]
            )
        ).items(),
        key = lambda x: x[1],
        reverse=True
    )
    top_N_idxs.append(top_N_closest)
    word_counts.append(word_count)
top_N_idxs = np.stack(top_N_idxs)

In [43]:
for i,c in enumerate(word_counts):
    print('label:', i)
    print(c[:10])

label: 0
[('twitter', 1483), ('join', 1455), ('check', 1450), ('follow', 1442), ('crypto', 1439), ('bsc', 1432), ('telegram', 1427), ('maybe', 1420), ('tweet', 1417), ('cryptocurrencies', 1414)]
label: 1
[('$', 2633), ('far', 2623), ('uniswap', 2622), ('exploited', 2622), ('dude', 2622), ('200k', 2622), ('leaked', 2622), ('alpha', 2622), ('group', 2622), ('talking', 1996)]
label: 2
[('$', 3044), ('nft', 2358), ('token', 2013), ('crypto', 1856), ('join', 1229), ('binance', 1115), ('premium', 1059), ('apesport', 1056), ('nfts', 906), ('+', 905)]
label: 3
[('$', 2083), ('binance', 1383), ('crypto', 1089), ('pump', 1001), ('kucoin', 809), ('btc', 703), ('ftt', 685), ('bnb', 567), ('group', 500), ('ftx', 480)]
label: 4
[('security', 606), ('roll', 263), ('bridge', 260), ('social', 139), ('amp', 124), ('people', 96), ('like', 75), ('sushi', 61), ('good', 53), ('want', 52)]


In [41]:
word_counts[0]

[('twitter', 1483),
 ('join', 1455),
 ('check', 1450),
 ('follow', 1442),
 ('crypto', 1439),
 ('bsc', 1432),
 ('telegram', 1427),
 ('maybe', 1420),
 ('tweet', 1417),
 ('cryptocurrencies', 1414),
 ('bnb', 1413),
 ('month', 1411),
 ('gives', 1409),
 ('constantly', 1406),
 ('busd', 1403),
 ('spookyshiba', 1401),
 ('slice', 1401),
 ('spky', 1401),
 ('pie', 1401),
 ('blockchain', 429),
 ('security', 288),
 ('$', 214),
 ('world', 207),
 ('hack', 165),
 ('muunomics', 158),
 ('bitcoin', 151),
 ('contact', 138),
 ('platform', 136),
 ('market', 134),
 ('like', 133),
 ('got', 129),
 ('muu', 128),
 ('people', 125),
 ('instagram', 123),
 ('account', 121),
 ('cryptocurrency', 107),
 ('send', 105),
 ('day', 105),
 ('to', 98),
 ('go', 96),
 ('cyber', 93),
 ('adscoin', 92),
 ('money', 91),
 ('ftx', 88),
 ('able', 87),
 ('help', 87),
 ('network', 86),
 ('india', 86),
 ('new', 85),
 ('project', 82),
 ('k', 82),
 ('listing', 82),
 ('expert', 80),
 ('hyperonchain', 80),
 ('cmc', 79),
 ('style', 79),
 ('pro

(20853638, 20902679)

In [4]:
from torch.utils.data import DataLoader, SequentialSampler
from sklearn.preprocessing import MaxAbsScaler
import numpy as np

beg,end = dset.get_range(start,start + pd.Timedelta('30m'))
for p in model.parameters():
    p.requires_grad = False


original_idxs = dset[beg:end]['original_index']
original_embs = dset[beg:end]['embedding'].cuda()
reduced_embs = model.encoder(original_embs).cpu().numpy()
scaled_embs = MaxAbsScaler().fit_transform(reduced_embs)
# del original_dims


In [5]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=5).fit(scaled_embs)


In [6]:

label = 0


In [10]:
# [dset.tokens[i] for i in original_idxs[top_N_closest]]




Counter({'join': 1226,
         'invest': 12,
         'ico': 3,
         'miss': 82,
         'great': 110,
         'opportunity': 41,
         'supporter': 1,
         'best': 433,
         'projects': 26,
         'crypto': 1774,
         'space': 20,
         'sure': 18,
         'metaversenft': 3,
         'cryptocurrency': 110,
         'playtoearn': 14,
         'bsc': 394,
         'metasport': 4,
         'mtsp': 3,
         'cred': 21,
         'amazing': 181,
         'thanks': 50,
         'mentorship': 48,
         'program': 51,
         'insight': 21,
         'concerning': 21,
         'market': 122,
         'telegram': 634,
         'check': 90,
         'nearly': 50,
         'late': 92,
         '50': 24,
         'unique': 9,
         'manual': 1,
         'high': 49,
         'da': 2,
         'pa': 1,
         'forum': 2,
         'posting': 1,
         'dofollow': 3,
         'seo': 1,
         'backlinks': 1,
         '$': 2979,
         '10': 685,
         'l